shekhar pandey
7 min readNov 21, 2021

--

import numpy as np
import pandas as pd
df = pd.read_csv(r'https://raw.githubusercontent.com/shekhar270779/Learn_ML/main/datasets/Churn.csv')df.head(3)
png
df.columns = df.columns.str.replace(' ','_')df.head(3)
png

Membership filtering

# isin()

isin_mask = df.state.isin(['KS', 'OH'])
df.loc[isin_mask].head(3)
png
# ~ isin() 
df.loc[~ df.state.isin(['KS', 'OH'])].head(3)
png
# between

between_mask = df.total_day_minutes.between(200,300)
df.loc[between_mask].head(3)
png
# any() , all()

test_data = pd.DataFrame(np.random.randint(0,2, (2,10)),
index=['sample1', 'sample2'],
columns=['id'+str(i) for i in range(10)])
test_data
png
# for different ids mark test as +ve if any of the sample test is positive
test_data.any()
id0 True
id1 True
id2 False
id3 True
id4 True
id5 True
id6 True
id7 True
id8 True
id9 True
dtype: bool
# for different ids mark test as +ve if all of the sample test is positive
test_data.all()
id0 False
id1 False
id2 False
id3 False
id4 False
id5 False
id6 True
id7 False
id8 False
id9 True
dtype: bool
# Query()

df.query("state in ('KS','NJ')").head(3)
png
# multicondition and python variable in query
day_calls = 100
df.query("state in ('KS','NJ') and total_day_calls > @day_calls").head(3)
png
# eval() , it evaluates conditions and returns boolean mask of True/False
df.eval("state in ('KS', 'NJ') and total_day_calls > @day_calls").head()
0 True
1 False
2 True
3 False
4 False
dtype: bool
df.loc[df.eval("state in ('KS', 'NJ') and total_day_calls > @day_calls")].head(3)
png
# create new column using eval()
df.eval("minutes_per_call = total_day_minutes/total_day_calls", inplace=True)
df.head(3)
png

Removing Duplicates

print(f"Original shape: {df.shape}")
print(f"Shape without duplicates: {df.drop_duplicates().shape}")
Original shape: (3333, 22)
Shape without duplicates: (3333, 22)
#drop dup rec. w.r.t state , keep first rec
df.drop_duplicates('state', keep='first').head(3)
png
df.drop_duplicates(['state', 'area_code'], keep='last').head()
png

Sorting

# Ascending sort, select 5 cust with lowest minutes_per_call
df.sort_values(by='minutes_per_call').head(5)
png
# Descending sort
df.sort_values(by='minutes_per_call', ascending=False).head()
png
# Sort by multiple columns

df.sort_values(by=['state','minutes_per_call'], ascending=[True, False]).head().loc[:, ['state','minutes_per_call','churn']]
png

map and applymap()

tips = pd.read_csv(r'https://raw.githubusercontent.com/shekhar270779/Learn_ML/main/datasets/Tips100.csv')
tips.head()
png
tips.dtypestotal_bill          float64
tip float64
sex object
smoker object
day object
time object
size int64
price_per_person float64
Payer Name object
CC Number int64
Payment ID object
dtype: object
# assign() , create multiple columns

tips = tips.assign(tip_percent = lambda x : round((x['tip']/x['total_bill'])*100,2),
tip_per_person = lambda x : round(x['tip']/x['size'],2)
)
tips.head()
png
# map()
tips.day.unique()
array(['Sat', 'Thur', 'Fri', 'Sun'], dtype=object)day_map = {'Mon':1, 'Tue':2, 'Wed':3, 'Thur':4, 'Fri':5, 'Sat':6, 'Sun':7}
tips['weekday'] = tips['day'].map(day_map)
tips.head()
png
# use map() to mask credit card info.
tips['CC_masked'] = tips['CC Number'].map(lambda x : "x"+str(x)[-4:] if len(str(x))==16 else "xxxx")
tips.head()
png
# applymap() is the DataFrame version of map()
tips.applymap(lambda x : str(x).upper()[0] if str(x).isalpha() else x).head()
png
# Extract day part from PaymentID column

tips.loc[:5,'Payment ID']
0 Sat4056
1 Thur9501
2 Thur4780
3 Thur9593
4 Thur7276
5 Thur437
Name: Payment ID, dtype: object
def remove_digits(s):
return ''.join([i for i in s if not i.isdigit()])
remove_digits('Sat4056')'Sat'tips['Payment ID'].map(remove_digits)[0:5]0 Sat
1 Thur
2 Thur
3 Thur
4 Thur
Name: Payment ID, dtype: object
tips['Payment ID'].str.replace('\d','', regex=True)0 Sat
1 Thur
2 Thur
3 Thur
4 Thur
...
95 Sat
96 Sun
97 Thur
98 Thur
99 Sat
Name: Payment ID, Length: 100, dtype: object

apply() row wise and column wise

df = pd.read_csv(r'https://raw.githubusercontent.com/shekhar270779/Learn_ML/main/datasets/hungary_chickenpox.csv')df.head()
png
df['Date'] = pd.to_datetime(df.Date)df.head(3)
png
df.set_index('Date', inplace=True)df.head(3)
png
# find max no. of cases for each day
df.apply(max, axis=1)
Date
2005-03-01 178
2005-10-01 200
2005-01-17 191
2005-01-24 163
2005-01-31 172
...
2014-01-12 122
2014-08-12 70
2014-12-15 72
2014-12-22 83
2014-12-29 259
Length: 522, dtype: int64
# find max no. of cases for each day
df.apply(min, axis=1)
Date
2005-03-01 2
2005-10-01 26
2005-01-17 4
2005-01-24 14
2005-01-31 7
..
2014-01-12 0
2014-08-12 2
2014-12-15 0
2014-12-22 0
2014-12-29 11
Length: 522, dtype: int64
# what is median number of chickenpoxes for each region
df.apply(np.median, axis=0)
BUDAPEST 93.0
BARANYA 25.0
BACS 29.5
BEKES 14.0
BORSOD 46.5
CSONGRAD 20.5
FEJER 24.0
GYOR 35.0
HAJDU 37.0
HEVES 21.0
JASZ 31.0
KOMAROM 19.0
NOGRAD 15.0
PEST 81.0
SOMOGY 20.5
SZABOLCS 18.5
TOLNA 12.0
VAS 13.0
VESZPREM 32.0
ZALA 13.0
dtype: float64
# find the diference between highest and lowest no. of cases for each day
df.apply(lambda x: max(x) - min(x), axis=1)
Date
2005-03-01 176
2005-10-01 174
2005-01-17 187
2005-01-24 149
2005-01-31 165
...
2014-01-12 122
2014-08-12 68
2014-12-15 72
2014-12-22 83
2014-12-29 248
Length: 522, dtype: int64
# find date when highest and lowest cases were observed for each region

hi_indx = df.apply(lambda x : np.argmax(x), axis=0)
lo_indx = df.apply(lambda x : np.argmin(x), axis=0)
df.iloc[hi_indx].indexDatetimeIndex(['2008-01-14', '2013-04-02', '2014-12-05', '2005-03-21',
'2009-12-01', '2009-03-30', '2008-04-21', '2005-07-02',
'2009-08-06', '2007-08-01', '2005-09-05', '2008-01-14',
'2007-05-03', '2008-01-14', '2006-03-04', '2005-03-28',
'2013-04-15', '2009-05-01', '2006-03-27', '2012-06-02'],
dtype='datetime64[ns]', name='Date', freq=None)
df.iloc[lo_indx].indexDatetimeIndex(['2012-08-27', '2005-08-22', '2005-08-22', '2005-12-09',
'2014-04-28', '2005-08-22', '2005-09-19', '2008-08-09',
'2008-08-25', '2007-08-20', '2006-08-28', '2006-08-28',
'2005-08-22', '2013-09-09', '2007-10-09', '2007-08-13',
'2006-08-14', '2005-08-29', '2005-08-29', '2005-05-09'],
dtype='datetime64[ns]', name='Date', freq=None)

Scaling

Z Score scaling

def z_score(x):
return (x - np.mean(x))/np.std(x)

min max scaling

def min_max_scaling(x):
return (x - min(x))/(max(x) - min(x))

maximum absolute scaling

def max_abs_scaling(x):
return x/max(x)

robust scaling

def robust_scaling(x):
q2 = np.median(x)
q3 = np.percentile(x, 75)
q1 = np.percentile(x, 25)
iqr = q3-q1
return (x - q2)/iqr
import matplotlib.pyplot as plt%matplotlib inlinebudapest_zscale = df.loc[:, ['BUDAPEST']].apply(z_score)['BUDAPEST']
print(min(budapest_zscale), max(budapest_zscale))
-1.327254255980001 4.952102407345022plt.boxplot(budapest_zscale)
plt.show()
png
budapest_minmaxscale = df.loc[:, ['BUDAPEST']].apply(min_max_scaling)['BUDAPEST']
print(min(budapest_minmaxscale), max(budapest_minmaxscale))
0.0 1.0plt.boxplot(budapest_minmaxscale)
plt.show()
png
budapest_robustscale = df.loc[:, ['BUDAPEST']].apply(robust_scaling)['BUDAPEST']
print(min(budapest_robustscale), max(budapest_robustscale))
-0.8104575163398693 3.363834422657952plt.boxplot(budapest_robustscale)
plt.show()
png

Binning

qcut()

pd.qcut(df.BUDAPEST, q=4, labels=['LOW', 'MEDIUM', 'HIGH', 'VERY HIGH'])Date
2005-03-01 VERY HIGH
2005-10-01 VERY HIGH
2005-01-17 HIGH
2005-01-24 VERY HIGH
2005-01-31 HIGH
...
2014-01-12 HIGH
2014-08-12 MEDIUM
2014-12-15 MEDIUM
2014-12-22 LOW
2014-12-29 VERY HIGH
Name: BUDAPEST, Length: 522, dtype: category
Categories (4, object): ['LOW' < 'MEDIUM' < 'HIGH' < 'VERY HIGH']
min(df.BUDAPEST), np.median(df.BUDAPEST), max(df.BUDAPEST)(0, 93.0, 479)bin_edges = [0, 10, 50, 100, 1000]
pd.cut(df.BUDAPEST, bins=bin_edges, labels=[1,2,3,4])
Date
2005-03-01 4
2005-10-01 4
2005-01-17 3
2005-01-24 4
2005-01-31 4
..
2014-01-12 3
2014-08-12 2
2014-12-15 2
2014-12-22 2
2014-12-29 4
Name: BUDAPEST, Length: 522, dtype: category
Categories (4, int64): [1 < 2 < 3 < 4]
pd.cut(df.BUDAPEST, bins=bin_edges, labels=[1,2,3,4]).value_counts()4 247
2 123
3 110
1 41
Name: BUDAPEST, dtype: int64
df.apply(pd.qcut, q=4, axis=0)
png

--

--