Pandas -01

9 min readNov 21, 2021

pandas : Post 01

pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool.

Getting Started
Installation

import pandas as pd
print(pd.__version__)1.1.5import numpy as np
print(np.__version__)1.19.5

Create a DataFrame from Array

# Set the seed for random values generator 
np.random.seed(100)
# Create a 2 dim array
arr = np.random.randint(0,100, (5,3))
arrarray([[ 8, 24, 67],
       [87, 79, 48],
       [10, 94, 52],
       [98, 53, 66],
       [98, 14, 34]])# Creating dataframe from array
df = pd.DataFrame(arr)
df

type(df)pandas.core.frame.DataFrame# Set row names and column names of dataframe
rownames = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri']
colnames = ['A', 'B', 'C']
df = pd.DataFrame(arr, index=rownames, columns=colnames)
df

Access rows and columns

# Access a column
df['A']Mon     8
Tue    87
Wed    10
Thu    98
Fri    98
Name: A, dtype: int64# Access two columns
df[['A', 'C']]

# Access specific row
df.loc['Mon']A     8
B    24
C    67
Name: Mon, dtype: int64# Access a range of rows 
df.loc['Mon':'Thu']

# Access a range of rows and columns
df.loc['Mon':'Wed', 'A':'B']

Dictionary to DataFrame

# Create a dictionary
d = {'Jan':[1,2,3,4,5],
    'Feb':[10,20,30,40,50],
    'Mar':[11,22,33,44,55]}

pd.DataFrame(d, columns=['Jan', 'Feb','Mar'])

Read data from a file

# reading data from a file
file_raw_url='https://raw.githubusercontent.com/shekhar270779/Learn_ML/main/datasets/ToothGrowth.csv'
tooth = pd.read_csv(file_raw_url)

tooth.head()

# find no. of rows and columns in file
tooth.shape(60, 3)# find no. of dimentions of dataframe
tooth.ndim2# Check datatype of columns
tooth.dtypeslen     float64
supp     object
dose    float64
dtype: object# convert supp column as category column
tooth.supp = tooth.supp.astype('category')# get details about dataframe
tooth.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   len     60 non-null     float64 
 1   supp    60 non-null     category
 2   dose    60 non-null     float64 
dtypes: category(1), float64(2)
memory usage: 1.2 KB# Check for empty or na values in each column
tooth.isna().sum(axis=0)len     0
supp    0
dose    0
dtype: int64

Read Churn.csv file

churn_df = pd.read_csv('https://raw.githubusercontent.com/shekhar270779/Learn_ML/main/datasets/Churn.csv')
churn_df.sample(5)

# check no. of rows/cols
churn_df.shape(3333, 21)# no. of rows we can also get from len()
len(churn_df)3333churn_df.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   3333 non-null   object 
 1   account length          3333 non-null   int64  
 2   area code               3333 non-null   int64  
 3   phone number            3333 non-null   object 
 4   international plan      3333 non-null   object 
 5   voice mail plan         3333 non-null   object 
 6   number vmail messages   3333 non-null   int64  
 7   total day minutes       3333 non-null   float64
 8   total day calls         3333 non-null   int64  
 9   total day charge        3333 non-null   float64
 10  total eve minutes       3333 non-null   float64
 11  total eve calls         3333 non-null   int64  
 12  total eve charge        3333 non-null   float64
 13  total night minutes     3333 non-null   float64
 14  total night calls       3333 non-null   int64  
 15  total night charge      3333 non-null   float64
 16  total intl minutes      3333 non-null   float64
 17  total intl calls        3333 non-null   int64  
 18  total intl charge       3333 non-null   float64
 19  customer service calls  3333 non-null   int64  
 20  churn                   3333 non-null   bool   
dtypes: bool(1), float64(8), int64(8), object(4)
memory usage: 524.2+ KB# Return the memory usage of each column in bytes.
churn_df.memory_usage(deep=True)Index                        128
state                     196647
account length             26664
area code                  26664
phone number              216645
international plan        196970
voice mail plan           197569
number vmail messages      26664
total day minutes          26664
total day calls            26664
total day charge           26664
total eve minutes          26664
total eve calls            26664
total eve charge           26664
total night minutes        26664
total night calls          26664
total night charge         26664
total intl minutes         26664
total intl calls           26664
total intl charge          26664
customer service calls     26664
churn                       3333
dtype: int64# convert datatype of churn field from boolean to integer
churn_df['churn'] = churn_df.churn.astype('int')
churn_df['churn'].sample(5)491     1
2812    0
3125    1
2469    0
1795    0
Name: churn, dtype: int64churn_df.columnsIndex(['state', 'account length', 'area code', 'phone number',
       'international plan', 'voice mail plan', 'number vmail messages',
       'total day minutes', 'total day calls', 'total day charge',
       'total eve minutes', 'total eve calls', 'total eve charge',
       'total night minutes', 'total night calls', 'total night charge',
       'total intl minutes', 'total intl calls', 'total intl charge',
       'customer service calls', 'churn'],
      dtype='object')

Rename columns

churn_df.rename(columns={'account length': 'account_length'}, inplace=True)churn_df.columnsIndex(['state', 'account_length', 'area code', 'phone number',
       'international plan', 'voice mail plan', 'number vmail messages',
       'total day minutes', 'total day calls', 'total day charge',
       'total eve minutes', 'total eve calls', 'total eve charge',
       'total night minutes', 'total night calls', 'total night charge',
       'total intl minutes', 'total intl calls', 'total intl charge',
       'customer service calls', 'churn'],
      dtype='object')churn_df.rename(str.upper, axis='columns').head(3)

churn_df.rename(lambda x : x.replace(" ","_"), axis='columns').head(3)

churn_df.rename(str.title, axis=1).head(3)

churn_df.rename(lambda x : x.replace(" ","_"), axis=1, inplace=True)churn_df.head(3)

churn_df.describe()

!pip install pandas_summaryCollecting pandas_summary
  Downloading pandas_summary-0.1.0-py3-none-any.whl (2.3 kB)
Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from pandas_summary) (1.1.5)
Collecting datatile
  Downloading datatile-0.1.0-py3-none-any.whl (7.6 kB)
Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from pandas_summary) (1.19.5)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->pandas_summary) (2.8.2)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas->pandas_summary) (2018.9)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->pandas_summary) (1.15.0)
Installing collected packages: datatile, pandas-summary
Successfully installed datatile-0.1.0 pandas-summary-0.1.0!pip install pandas_summaryRequirement already satisfied: pandas_summary in c:\users\shekhar2707\anaconda3\lib\site-packages (0.0.7)
Requirement already satisfied: numpy in c:\users\shekhar2707\anaconda3\lib\site-packages (from pandas_summary) (1.20.1)
Requirement already satisfied: pandas in c:\users\shekhar2707\anaconda3\lib\site-packages (from pandas_summary) (1.2.4)
Requirement already satisfied: pytz>=2017.3 in c:\users\shekhar2707\anaconda3\lib\site-packages (from pandas->pandas_summary) (2021.1)
Requirement already satisfied: python-dateutil>=2.7.3 in c:\users\shekhar2707\anaconda3\lib\site-packages (from pandas->pandas_summary) (2.8.1)
Requirement already satisfied: six>=1.5 in c:\users\shekhar2707\anaconda3\lib\site-packages (from python-dateutil>=2.7.3->pandas->pandas_summary) (1.15.0)from pandas_summary import DataFrameSummarysummary = DataFrameSummary(churn_df)
summary.columns_stats

summary['state']top                 WV: 106
counts                 3333
uniques                  51
missing                   0
missing_perc             0%
types           categorical
Name: state, dtype: objectchurn_df['state'].value_counts().head(5)WV    106
MN     84
NY     83
AL     80
OR     78
Name: state, dtype: int64summary['account_length']mean                          101.065
std                           39.8221
variance                       1585.8
min                                 1
max                               243
mode                              105
5%                                 35
25%                                74
50%                               101
75%                               127
95%                               167
iqr                                53
kurtosis                    -0.107836
skewness                    0.0966063
sum                            336849
mad                           31.8214
cv                           0.394025
zeros_num                           0
zeros_perc                         0%
deviating_of_mean                   7
deviating_of_mean_perc          0.21%
deviating_of_median                31
deviating_of_median_perc        0.93%
top_correlations                     
counts                           3333
uniques                           212
missing                             0
missing_perc                       0%
types                         numeric
Name: account_length, dtype: object

Common Operations

# Extract unique values of a column
churn_df['state'].unique()array(['KS', 'OH', 'NJ', 'OK', 'AL', 'MA', 'MO', 'LA', 'WV', 'IN', 'RI',
       'IA', 'MT', 'NY', 'ID', 'VT', 'VA', 'TX', 'FL', 'CO', 'AZ', 'SC',
       'NE', 'WY', 'HI', 'IL', 'NH', 'GA', 'AK', 'MD', 'AR', 'WI', 'OR',
       'MI', 'DE', 'UT', 'CA', 'MN', 'SD', 'NC', 'WA', 'NM', 'NV', 'DC',
       'KY', 'ME', 'MS', 'TN', 'PA', 'CT', 'ND'], dtype=object)churn_df['state'].nunique()51churn_df['state'].value_counts().head()WV    106
MN     84
NY     83
AL     80
OR     78
Name: state, dtype: int64churn_df['state'].value_counts(normalize=True).head()WV    0.031803
MN    0.025203
NY    0.024902
AL    0.024002
OR    0.023402
Name: state, dtype: float64# 'n' largest values 
churn_df.nlargest(5, 'account_length')

churn_df.drop(columns=['churn']).head()

# drop by index
churn_df.drop(index=[0,2,4]).head()

# filter rows for specific condition
churn_df[~(churn_df['state'] == 'OH')].head()

Display Options

pd.set_option("display.max_rows", 5)churn_df

# Reset
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')
pd.reset_option('display.precision')
pd.reset_option('display.float_format')pd.options.display.max_rows60pd.options.display.max_columns0pd.options.display.precision6pd.set_option('display.max_rows', 5)churn_df

pd.reset_option('display.max_rows')

Selecting data from pandas dataframe

dot notation to select specific column

churn_df.state.head()0    KS
1    OH
2    NJ
3    OH
4    OK
Name: state, dtype: object

.loc

churn_df.loc[:5, ['phone_number', 'churn']]

.iloc

churn_df.iloc[0:3, [0,1,2,-1]]

# Select second last row
churn_df.iloc[-2:-1]

# minutes per call -  create a new column
churn_df['minutes_per_call'] = churn_df.total_day_minutes / churn_df.total_day_callschurn_df.head()

## create a function which calculate hypotenuse of a triangle
def hypotenuse(a,b):
    return (a**2 + b**2)**0.5df = pd.DataFrame(np.random.randint(1,20,(3000,3)), columns=['A','B','C'])
df['C'] = np.nan
df.head()

df['C'] = hypotenuse(df.A, df.B)
df.head()

### Challenge
z = np.zeros([10,10], dtype=int)
z_df = pd.DataFrame(z)
z_df.head()

%%time
# make all diagonals as 1
for i in range(len(z_df)):
    z_df.iloc[i,i] = 1

print(z_df)0  1  2  3  4  5  6  7  8  9
0  1  0  0  0  0  0  0  0  0  0
1  0  1  0  0  0  0  0  0  0  0
2  0  0  1  0  0  0  0  0  0  0
3  0  0  0  1  0  0  0  0  0  0
4  0  0  0  0  1  0  0  0  0  0
5  0  0  0  0  0  1  0  0  0  0
6  0  0  0  0  0  0  1  0  0  0
7  0  0  0  0  0  0  0  1  0  0
8  0  0  0  0  0  0  0  0  1  0
9  0  0  0  0  0  0  0  0  0  1
CPU times: user 17.3 ms, sys: 0 ns, total: 17.3 ms
Wall time: 20.9 ms%%time
# make all diagonals in increasing order
for i in range(len(z_df)):
    z_df.iat[i,i] = i+1

print(z_df)0  1  2  3  4  5  6  7  8   9
0  1  0  0  0  0  0  0  0  0   0
1  0  2  0  0  0  0  0  0  0   0
2  0  0  3  0  0  0  0  0  0   0
3  0  0  0  4  0  0  0  0  0   0
4  0  0  0  0  5  0  0  0  0   0
5  0  0  0  0  0  6  0  0  0   0
6  0  0  0  0  0  0  7  0  0   0
7  0  0  0  0  0  0  0  8  0   0
8  0  0  0  0  0  0  0  0  9   0
9  0  0  0  0  0  0  0  0  0  10
CPU times: user 15.6 ms, sys: 0 ns, total: 15.6 ms
Wall time: 17 ms

Row filter

churn_df.head()

# select records where account_length > 100
row_mask = churn_df.account_length > 100
churn_df[row_mask].head()

# churn_df all columns that start with t
col_mask = churn_df.columns.str.startswith('t')
churn_df.loc[0:5, col_mask]

churn_df.loc[row_mask, col_mask].head()

# OR filter 

filter1 = churn_df.account_length > 100
filter2 = churn_df.total_night_calls > 100

churn_df[filter1 | filter2].head()

# AND filter condition
churn_df[filter1 & filter2].head()

# NOT filter condition
churn_df.loc[~ filter1 & filter2,:].head()