shekhar pandey
6 min read · Nov 21, 2021


Method Chaining
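Method chaining strings several pandas calls into a single expression: most DataFrame methods return a new DataFrame, so the output of each step feeds straight into the next. A minimal sketch on the crimes dataset used below (the variable names url and crimes are illustrative; the step-by-step, non-chained version follows):

import pandas as pd

url = 'https://raw.githubusercontent.com/shekhar270779/Learn_ML/main/datasets/Property_Crimes.csv'
crimes = (pd.read_csv(url)               # load the csv
            .rename(columns=str.lower)   # lower-case the column names
            .head(3))                    # keep the first 3 rows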

import pandas as pd
import numpy as np
df = pd.read_csv(r'https://raw.githubusercontent.com/shekhar270779/Learn_ML/main/datasets/Property_Crimes.csv')
df.head(3)
# rename columns to lower case
df.columns = df.columns.str.lower()
df.head(3)
# create new subgroup
df['subgroup'] = df.group_name.str.split(r'-', expand=True).loc[:,0]
df.head(3)
# Convert fields to categorical
df['area_cat'] = df['area_name'].astype('category')
df['group'] = df.group_name.astype('category')
df['subgroup'] = df.subgroup.astype('category')
df.head(2)
df.drop(columns=['area_name', 'group_name', 'sub_group_name'], inplace=True)
df.head(2)
df.rename(columns={'area_cat': 'area'}, inplace=True)
df.head(2)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2449 entries, 0 to 2448
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   year                         2449 non-null   int64
 1   cases_property_recovered     2449 non-null   int64
 2   cases_property_stolen        2449 non-null   int64
 3   value_of_property_recovered  2449 non-null   int64
 4   value_of_property_stolen     2449 non-null   int64
 5   subgroup                     2449 non-null   category
 6   area                         2449 non-null   category
 7   group                        2449 non-null   category
dtypes: category(3), int64(5)
memory usage: 105.0 KB
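The three category columns are why memory usage stays this low: a categorical stores each distinct string once plus small integer codes per row. A quick check (deep=True counts the actual string storage):

df['area'].memory_usage(deep=True)                   # small: integer codes + unique labels
df['area'].astype('object').memory_usage(deep=True)  # much larger: one string per row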
df['cases_property_lost'] = df.cases_property_stolen - df.cases_property_recovered
df['value_property_lost'] = df.value_of_property_stolen - df.value_of_property_recovered
df.sample(3)

Pipeline

def read_data(filepath):
    df = (pd.read_csv(filepath)
            .rename(columns=str.lower)
            .pipe(to_categorical, 'area_name')
            .pipe(to_categorical, 'group_name')
            .pipe(to_categorical, 'sub_group_name')
            .assign(cases_lost=lambda x: x['cases_property_stolen'] - x['cases_property_recovered'])
            .assign(value_lost=lambda x: x['value_of_property_stolen'] - x['value_of_property_recovered'])
         )
    return df

def to_categorical(df, col_name):
    df[str(col_name) + '_cat'] = pd.Categorical(df[col_name])
    return df

df = read_data(r'../numpy_datasets/Property_Crimes.csv')
df.sample(5)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2449 entries, 0 to 2448
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   area_name                    2449 non-null   object
 1   year                         2449 non-null   int64
 2   group_name                   2449 non-null   object
 3   sub_group_name               2449 non-null   object
 4   cases_property_recovered     2449 non-null   int64
 5   cases_property_stolen        2449 non-null   int64
 6   value_of_property_recovered  2449 non-null   int64
 7   value_of_property_stolen     2449 non-null   int64
 8   area_name_cat                2449 non-null   category
 9   group_name_cat               2449 non-null   category
 10  sub_group_name_cat           2449 non-null   category
 11  cases_lost                   2449 non-null   int64
 12  value_lost                   2449 non-null   int64
dtypes: category(3), int64(7), object(3)
memory usage: 200.6+ KB
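A note on .pipe, since it is what lets an ordinary function join the chain: df.pipe(f, arg) is simply f(df, arg) applied to the running result. A small equivalence sketch using the to_categorical helper above:

a = to_categorical(df.copy(), 'area_name')
b = df.copy().pipe(to_categorical, 'area_name')
print(a.equals(b))  # True: pipe just passes the DataFrame as the first argument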

Read data from multiple files

Approach 1

import glob
csvfiles = []

csvfiles = glob.glob(r'../numpy_datasets/arem/lying/*.csv')
print(*csvfiles, sep="\n")
../numpy_datasets/arem/lying\dataset1.csv
../numpy_datasets/arem/lying\dataset2.csv
../numpy_datasets/arem/lying\dataset3.csv
../numpy_datasets/arem/lying\dataset4.csv
import os
list_df = []

for csvfile in csvfiles:
    # normalize Windows backslashes to forward slashes
    # (note: '\\' is a single backslash; the raw string r'\\' would be two and never match)
    fpath = csvfile.replace('\\', '/')
    print("Reading ", fpath.ljust(40), "Exists: ", os.path.exists(fpath))
    df = pd.read_csv(fpath, skiprows=4, header=0)

    csv_name = csvfile.split('\\')[-1].split('.')[0]
    df['file'] = csv_name

    list_df.append(df)

final_df = pd.concat(list_df)
print(final_df.shape)
Reading  ../numpy_datasets/arem/lying/dataset1.csv  Exists:  True
Reading  ../numpy_datasets/arem/lying/dataset2.csv  Exists:  True
Reading  ../numpy_datasets/arem/lying/dataset3.csv  Exists:  True
Reading  ../numpy_datasets/arem/lying/dataset4.csv  Exists:  True
(1920, 8)
final_df.sample(5)
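The backslash juggling above comes from glob returning OS-native separators on Windows. As an alternative sketch (assuming the same folder layout), pathlib handles separators and file stems itself:

from pathlib import Path

list_df = []
for p in Path('../numpy_datasets/arem/lying').glob('*.csv'):
    df = pd.read_csv(p, skiprows=4, header=0)  # read_csv accepts Path objects
    df['file'] = p.stem                        # file name without extension, e.g. 'dataset1'
    list_df.append(df)

final_df = pd.concat(list_df)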

Approach 2: Generator-based approach

data = pd.concat(pd.read_csv(fpath, skiprows=4, header=0) for fpath in csvfiles)
data.shape
(1920, 7)

def read(fpath):
    df = pd.read_csv(fpath, skiprows=4, header=0)
    csv_name = fpath.split('\\')[-1].split('.')[0]
    df['file'] = csv_name
    return df

df_1 = pd.concat(read(fpath) for fpath in csvfiles)
df_1.sample(5)
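A variation on the same idea: instead of writing the file name into a column inside read, pd.concat can label each piece itself through its keys= argument, producing a MultiIndex (names and df_2 below are illustrative):

names = [f.split('\\')[-1].split('.')[0] for f in csvfiles]
df_2 = pd.concat((pd.read_csv(f, skiprows=4, header=0) for f in csvfiles), keys=names)
df_2.index.names = ['file', 'row']  # first index level identifies the source file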

Challenge

# read only 2 columns, avg_rss12 and var_rss12, and only the first 10 observations of each file

csvfiles = []

csvfiles = glob.glob(r'../numpy_datasets/arem/lying/*.csv')
print(*csvfiles, sep="\n")
../numpy_datasets/arem/lying\dataset1.csv
../numpy_datasets/arem/lying\dataset2.csv
../numpy_datasets/arem/lying\dataset3.csv
../numpy_datasets/arem/lying\dataset4.csv
def read_subdata(fpath):
    fname = fpath.split('\\')[-1].split('.')[0]
    df = pd.read_csv(fpath, usecols=['avg_rss12', 'var_rss12'], skiprows=4, nrows=10)
    df['file'] = fname
    return df

pd_final2 = pd.concat(read_subdata(fpath) for fpath in csvfiles)

pd_final2.sample(12)

Aggregation

split -> apply -> combine

titanic = pd.read_csv(r'https://raw.githubusercontent.com/shekhar270779/Learn_ML/main/datasets/Titanic.csv')
titanic.sample(5)
titanic.groupby('Pclass').agg({'Age': np.mean})

titanic.groupby('Pclass').agg({'Survived': np.mean})

titanic.groupby('Pclass').agg({'Survived': [np.mean, np.sum]})

titanic.groupby(['Sex', 'Pclass']).agg({'Survived': [np.mean, np.sum]})

df = titanic.groupby(['Sex', 'Pclass']).agg({'Survived': [np.mean, np.sum]}).reset_index()
df
df.columns

MultiIndex([(     'Sex',     ''),
            (  'Pclass',     ''),
            ('Survived', 'mean'),
            ('Survived',  'sum')],
           )
df.loc[:,["Survived"]]
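If the two-level columns get in the way, one common sketch is to flatten them into single strings such as Survived_mean (the empty second level on Sex and Pclass is simply dropped):

df.columns = ['_'.join(level for level in col if level) for col in df.columns]
df.head()  # columns are now Sex, Pclass, Survived_mean, Survived_sum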
titanic.groupby("Pclass").agg({'Fare':lambda x : max(x) - min(x)})
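On pandas 0.25 and later, named aggregation does the same job without creating a MultiIndex at all; the keyword becomes the output column name. A sketch of the fare-range computation above:

titanic.groupby('Pclass').agg(
    fare_range=('Fare', lambda x: x.max() - x.min()),
    mean_fare=('Fare', 'mean'),
)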

Challenge

titanic.groupby("Pclass").apply(lambda x : x[["Fare", "Survived"]].corr())
titanic.head()
titanic.groupby('Sex').apply(lambda x : x[["Survived", "Age"]].corr())

Iterating over groups

titanic.groupby('Pclass').apply(lambda x : np.mean(x['Fare']))

Pclass
1    84.154687
2    20.662183
3    13.675550
dtype: float64
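The lambda above is equivalent to selecting the column and aggregating directly, which avoids apply's per-group Python overhead:

titanic.groupby('Pclass')['Fare'].mean()  # same three values as the apply version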
grouped = titanic.groupby('Pclass')
type(grouped)
pandas.core.groupby.generic.DataFrameGroupBy

for name, group in grouped:
    print(name)
    print(group.head(2))
    print("_" * 20)
1
PassengerId Survived Pclass \
1 2 1 1
3 4 1 1

Name Sex Age SibSp \
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1

Parch Ticket Fare Cabin Embarked
1 0 PC 17599 71.2833 C85 C
3 0 113803 53.1000 C123 S
____________________
2
PassengerId Survived Pclass Name \
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem)
15 16 1 2 Hewlett, Mrs. (Mary D Kingcome)

Sex Age SibSp Parch Ticket Fare Cabin Embarked
9 female 14.0 1 0 237736 30.0708 NaN C
15 female 55.0 0 0 248706 16.0000 NaN S
____________________
3
PassengerId Survived Pclass Name Sex Age \
0 1 0 3 Braund, Mr. Owen Harris male 22.0
2 3 1 3 Heikkinen, Miss. Laina female 26.0

SibSp Parch Ticket Fare Cabin Embarked
0 1 0 A/5 21171 7.250 NaN S
2 0 0 STON/O2. 3101282 7.925 NaN S
____________________
for name, group in titanic.groupby('Pclass'):
    print(f"Group name: {name}")
    if name == 2:
        print(group.loc[(group.Sex != "male") & (group.Fare != 0), "Fare"].mean())
    else:
        print(group.Fare.mean())
Group name: 1
84.15468749999992
Group name: 2
21.97012105263158
Group name: 3
13.675550101832997
# group on the basis of Pclass, find the names of the top 3 female passengers who paid the highest fare
grouped = titanic.groupby('Pclass')
for name, group in grouped:
    print(f"Pclass: {name}")
    print(group.loc[group.Sex == "female", ["Name", "Fare"]].sort_values(by='Fare', ascending=False).head(3))
    print("-" * 50)
Pclass: 1
Name Fare
258 Ward, Miss. Anna 512.3292
88 Fortune, Miss. Mabel Helen 263.0000
341 Fortune, Miss. Alice Elizabeth 263.0000
--------------------------------------------------
Pclass: 2
Name Fare
615 Herman, Miss. Alice 65.0000
754 Herman, Mrs. Samuel (Jane Laver) 65.0000
43 Laroche, Miss. Simonne Marie Anne Andree 41.5792
--------------------------------------------------
Pclass: 3
Name Fare
792 Sage, Miss. Stella Anna 69.55
863 Sage, Miss. Dorothy Edith "Dolly" 69.55
180 Sage, Miss. Constance Gladys 69.55
--------------------------------------------------
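The same question can also be answered without an explicit loop; one sketch is to filter, sort once, and take the head of each group:

(titanic.loc[titanic.Sex == 'female', ['Pclass', 'Name', 'Fare']]
        .sort_values('Fare', ascending=False)
        .groupby('Pclass')
        .head(3)  # top 3 fares within each class
        .sort_values(['Pclass', 'Fare'], ascending=[True, False]))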

transform

titanic.groupby('Pclass').agg({'Fare': np.mean})
titanic['mean_Fare'] = titanic.groupby('Pclass')['Fare'].transform('mean')
titanic.sample(5)
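The difference from agg is shape: agg collapses each group to a single row, while transform returns one value per original row, aligned to the DataFrame's index, which is why its result can be assigned straight back as a column. A quick check:

titanic.groupby('Pclass')['Fare'].agg('mean').shape        # (3,)   one row per class
titanic.groupby('Pclass')['Fare'].transform('mean').shape  # (891,) one value per passenger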
def min_max_diff(s):
    return np.max(s) - np.min(s)

titanic.loc[titanic.Fare > 0].groupby('Pclass').agg({'Fare': min_max_diff})
titanic['min_max_diff'] = titanic.loc[titanic.Fare > 0].groupby('Pclass')['Fare'].transform(min_max_diff)
titanic.sample(7)
def contribution(s):
    return round((s / np.sum(s)) * 100, 2)

contribution(titanic['Fare'])

0      0.03
1      0.25
2      0.03
3      0.19
4      0.03
       ...
886    0.05
887    0.10
888    0.08
889    0.10
890    0.03
Name: Fare, Length: 891, dtype: float64
titanic['Contribution'] = titanic.groupby('Pclass')['Fare'].transform(contribution)
titanic.sort_values(by=['Pclass', 'Contribution'], ascending=[True, False])
