shekhar pandey
6 min read · Nov 21, 2021


Method Chaining
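Method chaining strings several pandas calls into a single expression: most DataFrame methods return a new DataFrame, so the output of each step feeds straight into the next. A minimal sketch on the crimes dataset used below (the variable names url and crimes are illustrative; the step-by-step, non-chained version follows):

import pandas as pd

url = 'https://raw.githubusercontent.com/shekhar270779/Learn_ML/main/datasets/Property_Crimes.csv'
crimes = (pd.read_csv(url)               # load the csv
            .rename(columns=str.lower)   # lower-case the column names
            .head(3))                    # keep the first 3 rows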

import pandas as pd
import numpy as np
df = pd.read_csv(r'https://raw.githubusercontent.com/shekhar270779/Learn_ML/main/datasets/Property_Crimes.csv')
df.head(3)
# rename columns to lower case
df.columns = df.columns.str.lower()
df.head(3)
# create new subgroup
df['subgroup'] = df.group_name.str.split(r'-', expand=True).loc[:,0]
df.head(3)
# Convert fields to categorical
df['area_cat'] = df['area_name'].astype('category')
df['group'] = df.group_name.astype('category')
df['subgroup'] = df.subgroup.astype('category')
df.head(2)
df.drop(columns=['area_name', 'group_name', 'sub_group_name'], inplace=True)
df.head(2)
df.rename(columns={'area_cat': 'area'}, inplace=True)
df.head(2)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2449 entries, 0 to 2448
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   year                         2449 non-null   int64
 1   cases_property_recovered     2449 non-null   int64
 2   cases_property_stolen        2449 non-null   int64
 3   value_of_property_recovered  2449 non-null   int64
 4   value_of_property_stolen     2449 non-null   int64
 5   subgroup                     2449 non-null   category
 6   area                         2449 non-null   category
 7   group                        2449 non-null   category
dtypes: category(3), int64(5)
memory usage: 105.0 KB
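The three category columns are why memory usage stays this low: a categorical stores each distinct string once plus small integer codes per row. A quick check (deep=True counts the actual string storage):

df['area'].memory_usage(deep=True)                   # small: integer codes + unique labels
df['area'].astype('object').memory_usage(deep=True)  # much larger: one string per row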
df['cases_property_lost'] = df.cases_property_stolen - df.cases_property_recovered
df['value_property_lost'] = df.value_of_property_stolen - df.value_of_property_recovered
df.sample(3)

Pipeline

def read_data(filepath):
    df = (pd.read_csv(filepath)
            .rename(columns=str.lower)
            .pipe(to_categorical, 'area_name')
            .pipe(to_categorical, 'group_name')
            .pipe(to_categorical, 'sub_group_name')
            .assign(cases_lost=lambda x: x['cases_property_stolen'] - x['cases_property_recovered'])
            .assign(value_lost=lambda x: x['value_of_property_stolen'] - x['value_of_property_recovered'])
         )
    return df

def to_categorical(df, col_name):
    df[str(col_name) + '_cat'] = pd.Categorical(df[col_name])
    return df

df = read_data(r'../numpy_datasets/Property_Crimes.csv')
df.sample(5)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2449 entries, 0 to 2448
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   area_name                    2449 non-null   object
 1   year                         2449 non-null   int64
 2   group_name                   2449 non-null   object
 3   sub_group_name               2449 non-null   object
 4   cases_property_recovered     2449 non-null   int64
 5   cases_property_stolen        2449 non-null   int64
 6   value_of_property_recovered  2449 non-null   int64
 7   value_of_property_stolen     2449 non-null   int64
 8   area_name_cat                2449 non-null   category
 9   group_name_cat               2449 non-null   category
 10  sub_group_name_cat           2449 non-null   category
 11  cases_lost                   2449 non-null   int64
 12  value_lost                   2449 non-null   int64
dtypes: category(3), int64(7), object(3)
memory usage: 200.6+ KB
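A note on .pipe, since it is what lets an ordinary function join the chain: df.pipe(f, arg) is simply f(df, arg) applied to the running result. A small equivalence sketch using the to_categorical helper above:

a = to_categorical(df.copy(), 'area_name')
b = df.copy().pipe(to_categorical, 'area_name')
print(a.equals(b))  # True: pipe just passes the DataFrame as the first argument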

Read data from multiple files

Approach 1

import glob
csvfiles = []

csvfiles = glob.glob(r'../numpy_datasets/arem/lying/*.csv')
print(*csvfiles, sep="\n")
../numpy_datasets/arem/lying\dataset1.csv
../numpy_datasets/arem/lying\dataset2.csv
../numpy_datasets/arem/lying\dataset3.csv
../numpy_datasets/arem/lying\dataset4.csv
import os
list_df = []

for csvfile in csvfiles:
    # normalize Windows backslashes to forward slashes
    # (note: '\\' is a single backslash; the raw string r'\\' would be two and never match)
    fpath = csvfile.replace('\\', '/')
    print("Reading ", fpath.ljust(40), "Exists: ", os.path.exists(fpath))
    df = pd.read_csv(fpath, skiprows=4, header=0)

    csv_name = csvfile.split('\\')[-1].split('.')[0]
    df['file'] = csv_name

    list_df.append(df)

final_df = pd.concat(list_df)
print(final_df.shape)
Reading  ../numpy_datasets/arem/lying/dataset1.csv  Exists:  True
Reading  ../numpy_datasets/arem/lying/dataset2.csv  Exists:  True
Reading  ../numpy_datasets/arem/lying/dataset3.csv  Exists:  True
Reading  ../numpy_datasets/arem/lying/dataset4.csv  Exists:  True
(1920, 8)
final_df.sample(5)
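The backslash juggling above comes from glob returning OS-native separators on Windows. As an alternative sketch (assuming the same folder layout), pathlib handles separators and file stems itself:

from pathlib import Path

list_df = []
for p in Path('../numpy_datasets/arem/lying').glob('*.csv'):
    df = pd.read_csv(p, skiprows=4, header=0)  # read_csv accepts Path objects
    df['file'] = p.stem                        # file name without extension, e.g. 'dataset1'
    list_df.append(df)

final_df = pd.concat(list_df)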

Approach 2: Generator-based approach

data = pd.concat(pd.read_csv(fpath, skiprows=4, header=0) for fpath in csvfiles)
data.shape
(1920, 7)

def read(fpath):
    df = pd.read_csv(fpath, skiprows=4, header=0)
    csv_name = fpath.split('\\')[-1].split('.')[0]
    df['file'] = csv_name
    return df

df_1 = pd.concat(read(fpath) for fpath in csvfiles)
df_1.sample(5)
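A variation on the same idea: instead of writing the file name into a column inside read, pd.concat can label each piece itself through its keys= argument, producing a MultiIndex (names and df_2 below are illustrative):

names = [f.split('\\')[-1].split('.')[0] for f in csvfiles]
df_2 = pd.concat((pd.read_csv(f, skiprows=4, header=0) for f in csvfiles), keys=names)
df_2.index.names = ['file', 'row']  # first index level identifies the source file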

Challenge

# read only 2 columns, avg_rss12 and var_rss12, and only the first 10 observations of each file

csvfiles = []

csvfiles = glob.glob(r'../numpy_datasets/arem/lying/*.csv')
print(*csvfiles, sep="\n")
../numpy_datasets/arem/lying\dataset1.csv
../numpy_datasets/arem/lying\dataset2.csv
../numpy_datasets/arem/lying\dataset3.csv
../numpy_datasets/arem/lying\dataset4.csv
def read_subdata(fpath):
    fname = fpath.split('\\')[-1].split('.')[0]
    df = pd.read_csv(fpath, usecols=['avg_rss12', 'var_rss12'], skiprows=4, nrows=10)
    df['file'] = fname
    return df

pd_final2 = pd.concat(read_subdata(fpath) for fpath in csvfiles)

pd_final2.sample(12)

Aggregation

split -> apply -> combine

titanic = pd.read_csv(r'https://raw.githubusercontent.com/shekhar270779/Learn_ML/main/datasets/Titanic.csv')
titanic.sample(5)
titanic.groupby('Pclass').agg({'Age': np.mean})

titanic.groupby('Pclass').agg({'Survived': np.mean})

titanic.groupby('Pclass').agg({'Survived': [np.mean, np.sum]})

titanic.groupby(['Sex', 'Pclass']).agg({'Survived': [np.mean, np.sum]})

df = titanic.groupby(['Sex', 'Pclass']).agg({'Survived': [np.mean, np.sum]}).reset_index()
df
df.columns

MultiIndex([(     'Sex',     ''),
            (  'Pclass',     ''),
            ('Survived', 'mean'),
            ('Survived',  'sum')],
           )
df.loc[:,["Survived"]]
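If the two-level columns get in the way, one common sketch is to flatten them into single strings such as Survived_mean (the empty second level on Sex and Pclass is simply dropped):

df.columns = ['_'.join(level for level in col if level) for col in df.columns]
df.head()  # columns are now Sex, Pclass, Survived_mean, Survived_sum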
titanic.groupby("Pclass").agg({'Fare':lambda x : max(x) - min(x)})
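On pandas 0.25 and later, named aggregation does the same job without creating a MultiIndex at all; the keyword becomes the output column name. A sketch of the fare-range computation above:

titanic.groupby('Pclass').agg(
    fare_range=('Fare', lambda x: x.max() - x.min()),
    mean_fare=('Fare', 'mean'),
)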

Challenge

titanic.groupby("Pclass").apply(lambda x : x[["Fare", "Survived"]].corr())
titanic.head()
titanic.groupby('Sex').apply(lambda x : x[["Survived", "Age"]].corr())

Iterating over groups

titanic.groupby('Pclass').apply(lambda x : np.mean(x['Fare']))

Pclass
1    84.154687
2    20.662183
3    13.675550
dtype: float64
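The lambda above is equivalent to selecting the column and aggregating directly, which avoids apply's per-group Python overhead:

titanic.groupby('Pclass')['Fare'].mean()  # same three values as the apply version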
grouped = titanic.groupby('Pclass')
type(grouped)
pandas.core.groupby.generic.DataFrameGroupBy

for name, group in grouped:
    print(name)
    print(group.head(2))
    print("_" * 20)
1
PassengerId Survived Pclass \
1 2 1 1
3 4 1 1

Name Sex Age SibSp \
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1

Parch Ticket Fare Cabin Embarked
1 0 PC 17599 71.2833 C85 C
3 0 113803 53.1000 C123 S
____________________
2
PassengerId Survived Pclass Name \
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem)
15 16 1 2 Hewlett, Mrs. (Mary D Kingcome)

Sex Age SibSp Parch Ticket Fare Cabin Embarked
9 female 14.0 1 0 237736 30.0708 NaN C
15 female 55.0 0 0 248706 16.0000 NaN S
____________________
3
PassengerId Survived Pclass Name Sex Age \
0 1 0 3 Braund, Mr. Owen Harris male 22.0
2 3 1 3 Heikkinen, Miss. Laina female 26.0

SibSp Parch Ticket Fare Cabin Embarked
0 1 0 A/5 21171 7.250 NaN S
2 0 0 STON/O2. 3101282 7.925 NaN S
____________________
for name, group in titanic.groupby('Pclass'):
    print(f"Group name: {name}")
    if name == 2:
        print(group.loc[(group.Sex != "male") & (group.Fare != 0), "Fare"].mean())
    else:
        print(group.Fare.mean())
Group name: 1
84.15468749999992
Group name: 2
21.97012105263158
Group name: 3
13.675550101832997
# group on the basis of Pclass, find the names of the top 3 female passengers who paid the highest fare
grouped = titanic.groupby('Pclass')
for name, group in grouped:
    print(f"Pclass: {name}")
    print(group.loc[group.Sex == "female", ["Name", "Fare"]].sort_values(by='Fare', ascending=False).head(3))
    print("-" * 50)
Pclass: 1
Name Fare
258 Ward, Miss. Anna 512.3292
88 Fortune, Miss. Mabel Helen 263.0000
341 Fortune, Miss. Alice Elizabeth 263.0000
--------------------------------------------------
Pclass: 2
Name Fare
615 Herman, Miss. Alice 65.0000
754 Herman, Mrs. Samuel (Jane Laver) 65.0000
43 Laroche, Miss. Simonne Marie Anne Andree 41.5792
--------------------------------------------------
Pclass: 3
Name Fare
792 Sage, Miss. Stella Anna 69.55
863 Sage, Miss. Dorothy Edith "Dolly" 69.55
180 Sage, Miss. Constance Gladys 69.55
--------------------------------------------------
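The same question can also be answered without an explicit loop; one sketch is to filter, sort once, and take the head of each group:

(titanic.loc[titanic.Sex == 'female', ['Pclass', 'Name', 'Fare']]
        .sort_values('Fare', ascending=False)
        .groupby('Pclass')
        .head(3)  # top 3 fares within each class
        .sort_values(['Pclass', 'Fare'], ascending=[True, False]))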

transform

titanic.groupby('Pclass').agg({'Fare': np.mean})
titanic['mean_Fare'] = titanic.groupby('Pclass')['Fare'].transform('mean')
titanic.sample(5)
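The difference from agg is shape: agg collapses each group to a single row, while transform returns one value per original row, aligned to the DataFrame's index, which is why its result can be assigned straight back as a column. A quick check:

titanic.groupby('Pclass')['Fare'].agg('mean').shape        # (3,)   one row per class
titanic.groupby('Pclass')['Fare'].transform('mean').shape  # (891,) one value per passenger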
def min_max_diff(s):
    return np.max(s) - np.min(s)

titanic.loc[titanic.Fare > 0].groupby('Pclass').agg({'Fare': min_max_diff})
titanic['min_max_diff'] = titanic.loc[titanic.Fare > 0].groupby('Pclass')['Fare'].transform(min_max_diff)
titanic.sample(7)
def contribution(s):
    return round((s / np.sum(s)) * 100, 2)

contribution(titanic['Fare'])

0      0.03
1      0.25
2      0.03
3      0.19
4      0.03
       ...
886    0.05
887    0.10
888    0.08
889    0.10
890    0.03
Name: Fare, Length: 891, dtype: float64
titanic['Contribution'] = titanic.groupby('Pclass')['Fare'].transform(contribution)
titanic.sort_values(by=['Pclass', 'Contribution'], ascending=[True, False])
