pandas : Post 02

7 min readNov 21, 2021

import numpy as np
import pandas as pddf = pd.read_csv(r'https://raw.githubusercontent.com/shekhar270779/Learn_ML/main/datasets/Churn.csv')df.head(3)

df.columns = df.columns.str.replace(' ','_')df.head(3)

Membership filtering

# isin()

isin_mask = df.state.isin(['KS', 'OH'])
df.loc[isin_mask].head(3)

# ~ isin() 
df.loc[~ df.state.isin(['KS', 'OH'])].head(3)

# between

between_mask = df.total_day_minutes.between(200,300)
df.loc[between_mask].head(3)

# any() , all()

test_data = pd.DataFrame(np.random.randint(0,2, (2,10)),
                        index=['sample1', 'sample2'],
                        columns=['id'+str(i) for i in range(10)])
test_data

# for different ids mark test as +ve if any of the sample test is positive
test_data.any()id0     True
id1     True
id2    False
id3     True
id4     True
id5     True
id6     True
id7     True
id8     True
id9     True
dtype: bool# for different ids mark test as +ve if all of the sample test is positive
test_data.all()id0    False
id1    False
id2    False
id3    False
id4    False
id5    False
id6     True
id7    False
id8    False
id9     True
dtype: bool# Query()

df.query("state in ('KS','NJ')").head(3)

# multicondition and python variable in query
day_calls = 100
df.query("state in ('KS','NJ') and total_day_calls > @day_calls").head(3)

# eval() , it evaluates conditions and returns boolean mask of True/False
df.eval("state in ('KS', 'NJ') and total_day_calls > @day_calls").head()0     True
1    False
2     True
3    False
4    False
dtype: booldf.loc[df.eval("state in ('KS', 'NJ') and total_day_calls > @day_calls")].head(3)

# create new column using eval()
df.eval("minutes_per_call = total_day_minutes/total_day_calls", inplace=True)df.head(3)

Removing Duplicates

print(f"Original shape: {df.shape}")
print(f"Shape without duplicates: {df.drop_duplicates().shape}")Original shape: (3333, 22)
Shape without duplicates: (3333, 22)#drop dup rec. w.r.t state , keep first rec
df.drop_duplicates('state', keep='first').head(3)

df.drop_duplicates(['state', 'area_code'], keep='last').head()

Sorting

# Ascending sort, select 5 cust with lowest minutes_per_call
df.sort_values(by='minutes_per_call').head(5)

# Descending sort
df.sort_values(by='minutes_per_call', ascending=False).head()

# Sort by multiple columns

df.sort_values(by=['state','minutes_per_call'], ascending=[True, False]).head().loc[:, ['state','minutes_per_call','churn']]

map and applymap()

tips = pd.read_csv(r'https://raw.githubusercontent.com/shekhar270779/Learn_ML/main/datasets/Tips100.csv')
tips.head()

tips.dtypestotal_bill          float64
tip                 float64
sex                  object
smoker               object
day                  object
time                 object
size                  int64
price_per_person    float64
Payer Name           object
CC Number             int64
Payment ID           object
dtype: object# assign() , create multiple columns 

tips = tips.assign(tip_percent = lambda x : round((x['tip']/x['total_bill'])*100,2),
            tip_per_person = lambda x : round(x['tip']/x['size'],2)
           )
tips.head()

# map()
tips.day.unique()array(['Sat', 'Thur', 'Fri', 'Sun'], dtype=object)day_map = {'Mon':1, 'Tue':2, 'Wed':3, 'Thur':4, 'Fri':5, 'Sat':6, 'Sun':7}
tips['weekday'] = tips['day'].map(day_map)
tips.head()

# use map() to mask credit card info.
tips['CC_masked'] = tips['CC Number'].map(lambda x : "x"+str(x)[-4:] if len(str(x))==16 else "xxxx")
tips.head()

# applymap() is the DataFrame version of map()
tips.applymap(lambda x : str(x).upper()[0] if str(x).isalpha() else x).head()

# Extract day part from PaymentID column

tips.loc[:5,'Payment ID']0     Sat4056
1    Thur9501
2    Thur4780
3    Thur9593
4    Thur7276
5     Thur437
Name: Payment ID, dtype: objectdef remove_digits(s):
    return ''.join([i for i in s if not i.isdigit()])remove_digits('Sat4056')'Sat'tips['Payment ID'].map(remove_digits)[0:5]0     Sat
1    Thur
2    Thur
3    Thur
4    Thur
Name: Payment ID, dtype: objecttips['Payment ID'].str.replace('\d','', regex=True)0      Sat
1     Thur
2     Thur
3     Thur
4     Thur
      ... 
95     Sat
96     Sun
97    Thur
98    Thur
99     Sat
Name: Payment ID, Length: 100, dtype: object

apply() row wise and column wise

df = pd.read_csv(r'https://raw.githubusercontent.com/shekhar270779/Learn_ML/main/datasets/hungary_chickenpox.csv')df.head()

df['Date'] = pd.to_datetime(df.Date)df.head(3)

df.set_index('Date', inplace=True)df.head(3)

# find max no. of cases for each day
df.apply(max, axis=1)Date
2005-03-01    178
2005-10-01    200
2005-01-17    191
2005-01-24    163
2005-01-31    172
             ... 
2014-01-12    122
2014-08-12     70
2014-12-15     72
2014-12-22     83
2014-12-29    259
Length: 522, dtype: int64# find max no. of cases for each day
df.apply(min, axis=1)Date
2005-03-01     2
2005-10-01    26
2005-01-17     4
2005-01-24    14
2005-01-31     7
              ..
2014-01-12     0
2014-08-12     2
2014-12-15     0
2014-12-22     0
2014-12-29    11
Length: 522, dtype: int64# what is median number of chickenpoxes for each region
df.apply(np.median, axis=0)BUDAPEST    93.0
BARANYA     25.0
BACS        29.5
BEKES       14.0
BORSOD      46.5
CSONGRAD    20.5
FEJER       24.0
GYOR        35.0
HAJDU       37.0
HEVES       21.0
JASZ        31.0
KOMAROM     19.0
NOGRAD      15.0
PEST        81.0
SOMOGY      20.5
SZABOLCS    18.5
TOLNA       12.0
VAS         13.0
VESZPREM    32.0
ZALA        13.0
dtype: float64# find the diference between highest and lowest no. of cases for each day
df.apply(lambda x: max(x) - min(x), axis=1)Date
2005-03-01    176
2005-10-01    174
2005-01-17    187
2005-01-24    149
2005-01-31    165
             ... 
2014-01-12    122
2014-08-12     68
2014-12-15     72
2014-12-22     83
2014-12-29    248
Length: 522, dtype: int64# find date when highest and lowest cases were observed for each region

hi_indx = df.apply(lambda x : np.argmax(x),  axis=0)
lo_indx = df.apply(lambda x : np.argmin(x),  axis=0)df.iloc[hi_indx].indexDatetimeIndex(['2008-01-14', '2013-04-02', '2014-12-05', '2005-03-21',
               '2009-12-01', '2009-03-30', '2008-04-21', '2005-07-02',
               '2009-08-06', '2007-08-01', '2005-09-05', '2008-01-14',
               '2007-05-03', '2008-01-14', '2006-03-04', '2005-03-28',
               '2013-04-15', '2009-05-01', '2006-03-27', '2012-06-02'],
              dtype='datetime64[ns]', name='Date', freq=None)df.iloc[lo_indx].indexDatetimeIndex(['2012-08-27', '2005-08-22', '2005-08-22', '2005-12-09',
               '2014-04-28', '2005-08-22', '2005-09-19', '2008-08-09',
               '2008-08-25', '2007-08-20', '2006-08-28', '2006-08-28',
               '2005-08-22', '2013-09-09', '2007-10-09', '2007-08-13',
               '2006-08-14', '2005-08-29', '2005-08-29', '2005-05-09'],
              dtype='datetime64[ns]', name='Date', freq=None)

Scaling

Z Score scaling

def z_score(x):
    return (x - np.mean(x))/np.std(x)

min max scaling

def min_max_scaling(x):
    return (x - min(x))/(max(x) - min(x))

maximum absolute scaling

def max_abs_scaling(x):
    return x/max(x)

robust scaling

def robust_scaling(x):
    q2 = np.median(x)
    q3 = np.percentile(x, 75)
    q1 = np.percentile(x, 25)
    iqr = q3-q1
    return (x - q2)/iqrimport matplotlib.pyplot as plt%matplotlib inlinebudapest_zscale = df.loc[:, ['BUDAPEST']].apply(z_score)['BUDAPEST']
print(min(budapest_zscale), max(budapest_zscale))-1.327254255980001 4.952102407345022plt.boxplot(budapest_zscale)
plt.show()

budapest_minmaxscale = df.loc[:, ['BUDAPEST']].apply(min_max_scaling)['BUDAPEST']
print(min(budapest_minmaxscale), max(budapest_minmaxscale))0.0 1.0plt.boxplot(budapest_minmaxscale)
plt.show()

budapest_robustscale = df.loc[:, ['BUDAPEST']].apply(robust_scaling)['BUDAPEST']
print(min(budapest_robustscale), max(budapest_robustscale))-0.8104575163398693 3.363834422657952plt.boxplot(budapest_robustscale)
plt.show()

Binning

qcut()

pd.qcut(df.BUDAPEST, q=4, labels=['LOW', 'MEDIUM', 'HIGH', 'VERY HIGH'])Date
2005-03-01    VERY HIGH
2005-10-01    VERY HIGH
2005-01-17         HIGH
2005-01-24    VERY HIGH
2005-01-31         HIGH
                ...    
2014-01-12         HIGH
2014-08-12       MEDIUM
2014-12-15       MEDIUM
2014-12-22          LOW
2014-12-29    VERY HIGH
Name: BUDAPEST, Length: 522, dtype: category
Categories (4, object): ['LOW' < 'MEDIUM' < 'HIGH' < 'VERY HIGH']min(df.BUDAPEST), np.median(df.BUDAPEST), max(df.BUDAPEST)(0, 93.0, 479)bin_edges = [0, 10, 50, 100, 1000]
pd.cut(df.BUDAPEST, bins=bin_edges, labels=[1,2,3,4])Date
2005-03-01    4
2005-10-01    4
2005-01-17    3
2005-01-24    4
2005-01-31    4
             ..
2014-01-12    3
2014-08-12    2
2014-12-15    2
2014-12-22    2
2014-12-29    4
Name: BUDAPEST, Length: 522, dtype: category
Categories (4, int64): [1 < 2 < 3 < 4]pd.cut(df.BUDAPEST, bins=bin_edges, labels=[1,2,3,4]).value_counts()4    247
2    123
3    110
1     41
Name: BUDAPEST, dtype: int64df.apply(pd.qcut, q=4, axis=0)

Membership filtering

Removing Duplicates

Sorting

map and applymap()

apply() row wise and column wise

Scaling

Z Score scaling

min max scaling

maximum absolute scaling

robust scaling

Binning

qcut()

Written by shekhar pandey