pandas : Post 04

Random Sampling

import numpy as np
import pandas as pd
df = pd.read_csv(r'https://raw.githubusercontent.com/shekhar270779/Learn_ML/main/datasets/Property_Crimes.csv')
df.head()
png

Random sample 70% without replacement

nrows = df.shape[0]
nrows
2449
df_sample = df.sample(frac=0.70, replace=False, random_state=100)
df_sample.shape
(1714, 8)

Bootstrap sample

# randomly pick same no. of rows as in dataset but with replacement
bootstrap_sample = df.sample(frac=1, replace=True, random_state=100)
bootstrap_sample.shape
(2449, 8)

Challenge

  • Calculate the 95% confidence interval of the mean for the following sample:
np.random.seed(100)
arr = pd.Series(np.random.normal(10, 3, (100)))
arr.head()
0 4.750704
1 11.028041
2 13.459107
3 9.242692
4 12.943962
dtype: float64
type(arr)
pandas.core.series.Series
# bootstrap sample a large no. of times say 10000
means = []
for i in range(10000):
    means.append(np.mean(arr.sample(frac=1, replace=True)))

# sort
means = pd.Series(sorted(means))

print(np.percentile(means, 2.5), np.percentile(means, 97.5))
9.117631418512076 10.263385872742084

Dummy Variables

df.Group_Name.unique()
array(['Burglary - Property', 'Criminal Breach of Trust - Property',
'Dacoity -Property', 'Other heads of Property',
'Robbery - Property', 'Theft - Property', 'Total Property'],
dtype=object)
pd.get_dummies(df.Group_Name, prefix='Group').head()
png

Categorical Data

group_name_cat = df.Group_Name.astype('category')
type(group_name_cat)
pandas.core.series.Series
group_name_cat.head()
0 Burglary - Property
1 Burglary - Property
2 Burglary - Property
3 Burglary - Property
4 Burglary - Property
Name: Group_Name, dtype: category
Categories (7, object): ['Burglary - Property', 'Criminal Breach of Trust - Property', 'Dacoity -Property', 'Other heads of Property', 'Robbery - Property', 'Theft - Property', 'Total Property']
group_name_cat1 = pd.Categorical(df.Group_Name, df.Group_Name.unique())
group_name_cat1
['Burglary - Property', 'Burglary - Property', 'Burglary - Property', 'Burglary - Property', 'Burglary - Property', ..., 'Total Property', 'Total Property', 'Total Property', 'Total Property', 'Total Property']
Length: 2449
Categories (7, object): ['Burglary - Property', 'Criminal Breach of Trust - Property', 'Dacoity -Property', 'Other heads of Property', 'Robbery - Property', 'Theft - Property', 'Total Property']
type(group_name_cat1)
pandas.core.arrays.categorical.Categorical
group_name_cat1.codes
array([0, 0, 0, ..., 6, 6, 6], dtype=int8)
group_name_cat1.categories
Index(['Burglary - Property', 'Criminal Breach of Trust - Property',
'Dacoity -Property', 'Other heads of Property', 'Robbery - Property',
'Theft - Property', 'Total Property'],
dtype='object')
group_name_cat.cat.categories
Index(['Burglary - Property', 'Criminal Breach of Trust - Property',
'Dacoity -Property', 'Other heads of Property', 'Robbery - Property',
'Theft - Property', 'Total Property'],
dtype='object')
df.head(3)
png
df.Sub_Group_Name.unique()
array(['3. Burglary', '5. Criminal Breach of Trust', '1. Dacoity',
'6. Other Property', '2. Robbery', '4. Theft',
'7. Total Property Stolen & Recovered'], dtype=object)
subgroup_cat = df.Sub_Group_Name.astype('category')
subgroup_cat.cat.categories
Index(['1. Dacoity', '2. Robbery', '3. Burglary', '4. Theft',
'5. Criminal Breach of Trust', '6. Other Property',
'7. Total Property Stolen & Recovered'],
dtype='object')
subgroup_cat.cat.add_categories('missing')
0 3. Burglary
1 3. Burglary
2 3. Burglary
3 3. Burglary
4 3. Burglary
...
2444 7. Total Property Stolen & Recovered
2445 7. Total Property Stolen & Recovered
2446 7. Total Property Stolen & Recovered
2447 7. Total Property Stolen & Recovered
2448 7. Total Property Stolen & Recovered
Name: Sub_Group_Name, Length: 2449, dtype: category
Categories (8, object): ['1. Dacoity', '2. Robbery', '3. Burglary', '4. Theft', '5. Criminal Breach of Trust', '6. Other Property', '7. Total Property Stolen & Recovered', 'missing']

add categories

subgroup_cat = subgroup_cat.cat.add_categories('missing')
subgroup_cat.value_counts()
1. Dacoity 350
2. Robbery 350
3. Burglary 350
4. Theft 350
5. Criminal Breach of Trust 350
6. Other Property 350
7. Total Property Stolen & Recovered 349
missing 0
Name: Sub_Group_Name, dtype: int64

remove unused categories

subgroup_cat = subgroup_cat.cat.remove_unused_categories()
subgroup_cat.value_counts()
1. Dacoity 350
2. Robbery 350
3. Burglary 350
4. Theft 350
5. Criminal Breach of Trust 350
6. Other Property 350
7. Total Property Stolen & Recovered 349
Name: Sub_Group_Name, dtype: int64

remove categories that are in use (rows holding the removed category become NaN)

subgroup_cat.cat.remove_categories('3. Burglary')
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
...
2444 7. Total Property Stolen & Recovered
2445 7. Total Property Stolen & Recovered
2446 7. Total Property Stolen & Recovered
2447 7. Total Property Stolen & Recovered
2448 7. Total Property Stolen & Recovered
Name: Sub_Group_Name, Length: 2449, dtype: category
Categories (6, object): ['1. Dacoity', '2. Robbery', '4. Theft', '5. Criminal Breach of Trust', '6. Other Property', '7. Total Property Stolen & Recovered']
subgroup_ordcat = pd.Series(pd.Categorical(df.Sub_Group_Name, ordered=True))
subgroup_ordcat.cat.categories
Index(['1. Dacoity', '2. Robbery', '3. Burglary', '4. Theft',
'5. Criminal Breach of Trust', '6. Other Property',
'7. Total Property Stolen & Recovered'],
dtype='object')
subgroup_ordcat.head(3)
0 3. Burglary
1 3. Burglary
2 3. Burglary
dtype: category
Categories (7, object): ['1. Dacoity' < '2. Robbery' < '3. Burglary' < '4. Theft' < '5. Criminal Breach of Trust' < '6. Other Property' < '7. Total Property Stolen & Recovered']
subgroup_ordcat.sort_values()
874 1. Dacoity
938 1. Dacoity
937 1. Dacoity
936 1. Dacoity
935 1. Dacoity
...
2212 7. Total Property Stolen & Recovered
2211 7. Total Property Stolen & Recovered
2210 7. Total Property Stolen & Recovered
2218 7. Total Property Stolen & Recovered
2448 7. Total Property Stolen & Recovered
Length: 2449, dtype: category
Categories (7, object): ['1. Dacoity' < '2. Robbery' < '3. Burglary' < '4. Theft' < '5. Criminal Breach of Trust' < '6. Other Property' < '7. Total Property Stolen & Recovered']
grades = pd.Series(['A','A+','B', 'C', 'Excellence','A','A+','B+'])
grades
0 A
1 A+
2 B
3 C
4 Excellence
5 A
6 A+
7 B+
dtype: object
grades_cat = pd.Series(pd.Categorical(grades))
grades_cat
0 A
1 A+
2 B
3 C
4 Excellence
5 A
6 A+
7 B+
dtype: category
Categories (6, object): ['A', 'A+', 'B', 'B+', 'C', 'Excellence']
grades_cat1 = pd.Series(pd.Categorical(grades, ordered=True))
grades_cat1
0 A
1 A+
2 B
3 C
4 Excellence
5 A
6 A+
7 B+
dtype: category
Categories (6, object): ['A' < 'A+' < 'B' < 'B+' < 'C' < 'Excellence']
grades_cat1.cat.categories
Index(['A', 'A+', 'B', 'B+', 'C', 'Excellence'], dtype='object')
grades_cat1.sort_values()
0 A
5 A
1 A+
6 A+
2 B
7 B+
3 C
4 Excellence
dtype: category
Categories (6, object): ['A' < 'A+' < 'B' < 'B+' < 'C' < 'Excellence']
grades_cat1 = grades_cat1.cat.reorder_categories(['C','B','B+','A','A+','Excellence'])
grades_cat1
0 A
1 A+
2 B
3 C
4 Excellence
5 A
6 A+
7 B+
dtype: category
Categories (6, object): ['C' < 'B' < 'B+' < 'A' < 'A+' < 'Excellence']
grades_cat1.sort_values()
3 C
2 B
7 B+
0 A
5 A
1 A+
6 A+
4 Excellence
dtype: category
Categories (6, object): ['C' < 'B' < 'B+' < 'A' < 'A+' < 'Excellence']
grades_cat1.cat.codes
0 3
1 4
2 1
3 0
4 5
5 3
6 4
7 2
dtype: int8