shekhar pandey
9 min readNov 21, 2021

--

pandas : Post 01

pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool.

Getting Started
Installation

import pandas as pd
print(pd.__version__)
1.1.5import numpy as np
print(np.__version__)
1.19.5

Create a DataFrame from Array

# Set the seed for random values generator 
np.random.seed(100)
# Create a 2 dim array
arr = np.random.randint(0,100, (5,3))
arr
array([[ 8, 24, 67],
[87, 79, 48],
[10, 94, 52],
[98, 53, 66],
[98, 14, 34]])
# Creating dataframe from array
df = pd.DataFrame(arr)
df
png
type(df)pandas.core.frame.DataFrame# Set row names and column names of dataframe
rownames = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri']
colnames = ['A', 'B', 'C']
df = pd.DataFrame(arr, index=rownames, columns=colnames)
df
png

Access rows and columns

# Access a column
df['A']
Mon 8
Tue 87
Wed 10
Thu 98
Fri 98
Name: A, dtype: int64
# Access two columns
df[['A', 'C']]
png
# Access specific row
df.loc['Mon']
A 8
B 24
C 67
Name: Mon, dtype: int64
# Access a range of rows
df.loc['Mon':'Thu']
png
# Access a range of rows and columns
df.loc['Mon':'Wed', 'A':'B']
png

Dictionary to DataFrame

# Create a dictionary
d = {'Jan':[1,2,3,4,5],
'Feb':[10,20,30,40,50],
'Mar':[11,22,33,44,55]}

pd.DataFrame(d, columns=['Jan', 'Feb','Mar'])
png

Read data from a file

# reading data from a file
file_raw_url='https://raw.githubusercontent.com/shekhar270779/Learn_ML/main/datasets/ToothGrowth.csv'
tooth = pd.read_csv(file_raw_url)

tooth.head()
png
# find no. of rows and columns in file
tooth.shape
(60, 3)# find no. of dimentions of dataframe
tooth.ndim
2# Check datatype of columns
tooth.dtypes
len float64
supp object
dose float64
dtype: object
# convert supp column as category column
tooth.supp = tooth.supp.astype('category')
# get details about dataframe
tooth.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 len 60 non-null float64
1 supp 60 non-null category
2 dose 60 non-null float64
dtypes: category(1), float64(2)
memory usage: 1.2 KB
# Check for empty or na values in each column
tooth.isna().sum(axis=0)
len 0
supp 0
dose 0
dtype: int64

Read Churn.csv file

churn_df = pd.read_csv('https://raw.githubusercontent.com/shekhar270779/Learn_ML/main/datasets/Churn.csv')
churn_df.sample(5)
png
# check no. of rows/cols
churn_df.shape
(3333, 21)# no. of rows we can also get from len()
len(churn_df)
3333churn_df.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 state 3333 non-null object
1 account length 3333 non-null int64
2 area code 3333 non-null int64
3 phone number 3333 non-null object
4 international plan 3333 non-null object
5 voice mail plan 3333 non-null object
6 number vmail messages 3333 non-null int64
7 total day minutes 3333 non-null float64
8 total day calls 3333 non-null int64
9 total day charge 3333 non-null float64
10 total eve minutes 3333 non-null float64
11 total eve calls 3333 non-null int64
12 total eve charge 3333 non-null float64
13 total night minutes 3333 non-null float64
14 total night calls 3333 non-null int64
15 total night charge 3333 non-null float64
16 total intl minutes 3333 non-null float64
17 total intl calls 3333 non-null int64
18 total intl charge 3333 non-null float64
19 customer service calls 3333 non-null int64
20 churn 3333 non-null bool
dtypes: bool(1), float64(8), int64(8), object(4)
memory usage: 524.2+ KB
# Return the memory usage of each column in bytes.
churn_df.memory_usage(deep=True)
Index 128
state 196647
account length 26664
area code 26664
phone number 216645
international plan 196970
voice mail plan 197569
number vmail messages 26664
total day minutes 26664
total day calls 26664
total day charge 26664
total eve minutes 26664
total eve calls 26664
total eve charge 26664
total night minutes 26664
total night calls 26664
total night charge 26664
total intl minutes 26664
total intl calls 26664
total intl charge 26664
customer service calls 26664
churn 3333
dtype: int64
# convert datatype of churn field from boolean to integer
churn_df['churn'] = churn_df.churn.astype('int')
churn_df['churn'].sample(5)
491 1
2812 0
3125 1
2469 0
1795 0
Name: churn, dtype: int64
churn_df.columnsIndex(['state', 'account length', 'area code', 'phone number',
'international plan', 'voice mail plan', 'number vmail messages',
'total day minutes', 'total day calls', 'total day charge',
'total eve minutes', 'total eve calls', 'total eve charge',
'total night minutes', 'total night calls', 'total night charge',
'total intl minutes', 'total intl calls', 'total intl charge',
'customer service calls', 'churn'],
dtype='object')

Rename columns

churn_df.rename(columns={'account length': 'account_length'}, inplace=True)churn_df.columnsIndex(['state', 'account_length', 'area code', 'phone number',
'international plan', 'voice mail plan', 'number vmail messages',
'total day minutes', 'total day calls', 'total day charge',
'total eve minutes', 'total eve calls', 'total eve charge',
'total night minutes', 'total night calls', 'total night charge',
'total intl minutes', 'total intl calls', 'total intl charge',
'customer service calls', 'churn'],
dtype='object')
churn_df.rename(str.upper, axis='columns').head(3)
png
churn_df.rename(lambda x : x.replace(" ","_"), axis='columns').head(3)
png
churn_df.rename(str.title, axis=1).head(3)
png
churn_df.rename(lambda x : x.replace(" ","_"), axis=1, inplace=True)churn_df.head(3)
png
churn_df.describe()
png
!pip install pandas_summaryCollecting pandas_summary
Downloading pandas_summary-0.1.0-py3-none-any.whl (2.3 kB)
Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from pandas_summary) (1.1.5)
Collecting datatile
Downloading datatile-0.1.0-py3-none-any.whl (7.6 kB)
Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from pandas_summary) (1.19.5)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->pandas_summary) (2.8.2)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas->pandas_summary) (2018.9)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->pandas_summary) (1.15.0)
Installing collected packages: datatile, pandas-summary
Successfully installed datatile-0.1.0 pandas-summary-0.1.0
!pip install pandas_summaryRequirement already satisfied: pandas_summary in c:\users\shekhar2707\anaconda3\lib\site-packages (0.0.7)
Requirement already satisfied: numpy in c:\users\shekhar2707\anaconda3\lib\site-packages (from pandas_summary) (1.20.1)
Requirement already satisfied: pandas in c:\users\shekhar2707\anaconda3\lib\site-packages (from pandas_summary) (1.2.4)
Requirement already satisfied: pytz>=2017.3 in c:\users\shekhar2707\anaconda3\lib\site-packages (from pandas->pandas_summary) (2021.1)
Requirement already satisfied: python-dateutil>=2.7.3 in c:\users\shekhar2707\anaconda3\lib\site-packages (from pandas->pandas_summary) (2.8.1)
Requirement already satisfied: six>=1.5 in c:\users\shekhar2707\anaconda3\lib\site-packages (from python-dateutil>=2.7.3->pandas->pandas_summary) (1.15.0)
from pandas_summary import DataFrameSummarysummary = DataFrameSummary(churn_df)
summary.columns_stats
png
summary['state']top                 WV: 106
counts 3333
uniques 51
missing 0
missing_perc 0%
types categorical
Name: state, dtype: object
churn_df['state'].value_counts().head(5)WV 106
MN 84
NY 83
AL 80
OR 78
Name: state, dtype: int64
summary['account_length']mean 101.065
std 39.8221
variance 1585.8
min 1
max 243
mode 105
5% 35
25% 74
50% 101
75% 127
95% 167
iqr 53
kurtosis -0.107836
skewness 0.0966063
sum 336849
mad 31.8214
cv 0.394025
zeros_num 0
zeros_perc 0%
deviating_of_mean 7
deviating_of_mean_perc 0.21%
deviating_of_median 31
deviating_of_median_perc 0.93%
top_correlations
counts 3333
uniques 212
missing 0
missing_perc 0%
types numeric
Name: account_length, dtype: object

Common Operations

# Extract unique values of a column
churn_df['state'].unique()
array(['KS', 'OH', 'NJ', 'OK', 'AL', 'MA', 'MO', 'LA', 'WV', 'IN', 'RI',
'IA', 'MT', 'NY', 'ID', 'VT', 'VA', 'TX', 'FL', 'CO', 'AZ', 'SC',
'NE', 'WY', 'HI', 'IL', 'NH', 'GA', 'AK', 'MD', 'AR', 'WI', 'OR',
'MI', 'DE', 'UT', 'CA', 'MN', 'SD', 'NC', 'WA', 'NM', 'NV', 'DC',
'KY', 'ME', 'MS', 'TN', 'PA', 'CT', 'ND'], dtype=object)
churn_df['state'].nunique()51churn_df['state'].value_counts().head()WV 106
MN 84
NY 83
AL 80
OR 78
Name: state, dtype: int64
churn_df['state'].value_counts(normalize=True).head()WV 0.031803
MN 0.025203
NY 0.024902
AL 0.024002
OR 0.023402
Name: state, dtype: float64
# 'n' largest values
churn_df.nlargest(5, 'account_length')
png
churn_df.drop(columns=['churn']).head()
png
# drop by index
churn_df.drop(index=[0,2,4]).head()
png
# filter rows for specific condition
churn_df[~(churn_df['state'] == 'OH')].head()
png

Display Options

pd.set_option("display.max_rows", 5)churn_df
png
# Reset
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')
pd.reset_option('display.precision')
pd.reset_option('display.float_format')
pd.options.display.max_rows60pd.options.display.max_columns0pd.options.display.precision6pd.set_option('display.max_rows', 5)churn_df
png
pd.reset_option('display.max_rows')

Selecting data from pandas dataframe

dot notation to select specific column

churn_df.state.head()0    KS
1 OH
2 NJ
3 OH
4 OK
Name: state, dtype: object

.loc

churn_df.loc[:5, ['phone_number', 'churn']]
png

.iloc

churn_df.iloc[0:3, [0,1,2,-1]]
png
# Select second last row
churn_df.iloc[-2:-1]
png
# minutes per call -  create a new column
churn_df['minutes_per_call'] = churn_df.total_day_minutes / churn_df.total_day_calls
churn_df.head()
png
## create a function which calculate hypotenuse of a triangle
def hypotenuse(a,b):
return (a**2 + b**2)**0.5
df = pd.DataFrame(np.random.randint(1,20,(3000,3)), columns=['A','B','C'])
df['C'] = np.nan
df.head()
png
df['C'] = hypotenuse(df.A, df.B)
df.head()
png
### Challenge
z = np.zeros([10,10], dtype=int)
z_df = pd.DataFrame(z)
z_df.head()
png
%%time
# make all diagonals as 1
for i in range(len(z_df)):
z_df.iloc[i,i] = 1

print(z_df)
0 1 2 3 4 5 6 7 8 9
0 1 0 0 0 0 0 0 0 0 0
1 0 1 0 0 0 0 0 0 0 0
2 0 0 1 0 0 0 0 0 0 0
3 0 0 0 1 0 0 0 0 0 0
4 0 0 0 0 1 0 0 0 0 0
5 0 0 0 0 0 1 0 0 0 0
6 0 0 0 0 0 0 1 0 0 0
7 0 0 0 0 0 0 0 1 0 0
8 0 0 0 0 0 0 0 0 1 0
9 0 0 0 0 0 0 0 0 0 1
CPU times: user 17.3 ms, sys: 0 ns, total: 17.3 ms
Wall time: 20.9 ms
%%time
# make all diagonals in increasing order
for i in range(len(z_df)):
z_df.iat[i,i] = i+1

print(z_df)
0 1 2 3 4 5 6 7 8 9
0 1 0 0 0 0 0 0 0 0 0
1 0 2 0 0 0 0 0 0 0 0
2 0 0 3 0 0 0 0 0 0 0
3 0 0 0 4 0 0 0 0 0 0
4 0 0 0 0 5 0 0 0 0 0
5 0 0 0 0 0 6 0 0 0 0
6 0 0 0 0 0 0 7 0 0 0
7 0 0 0 0 0 0 0 8 0 0
8 0 0 0 0 0 0 0 0 9 0
9 0 0 0 0 0 0 0 0 0 10
CPU times: user 15.6 ms, sys: 0 ns, total: 15.6 ms
Wall time: 17 ms

Row filter

churn_df.head()
png
# select records where account_length > 100
row_mask = churn_df.account_length > 100
churn_df[row_mask].head()
png
# churn_df all columns that start with t
col_mask = churn_df.columns.str.startswith('t')
churn_df.loc[0:5, col_mask]
png
churn_df.loc[row_mask, col_mask].head()
png
# OR filter 

filter1 = churn_df.account_length > 100
filter2 = churn_df.total_night_calls > 100

churn_df[filter1 | filter2].head()
png
# AND filter condition
churn_df[filter1 & filter2].head()
png
# NOT filter condition
churn_df.loc[~ filter1 & filter2,:].head()
png

--

--