18 min readNov 21, 2021
Numpy -01
Post- Numpy 01:Array
This post covers basic operations on NumPy arrays.
import numpy as np
print(np.__version__)
1.20.1
Create numpy array from a list
The main data structure in NumPy is the ndarray (n-dimensional array).
# let's take a list of numbers and create a NumPy array from it
nums_list = [1,2,3,4]
print(f"List: {nums_list}")
arr = np.array(nums_list)
print(f"Array: {arr}")
print(type(arr))List: [1, 2, 3, 4]
Array: [1 2 3 4]
<class 'numpy.ndarray'>
Types of ndarray
- Scalar
- Vector
- 2 dim Matrix
- n dim Matrix
A NumPy array is a homogeneous data structure, i.e. all elements of an ndarray have the same data type.
Vectorization
- It means applying a mathematical operation to every element of an ndarray without explicitly writing a for loop.
# notice difference between List and array operation
print(nums_list * 2) # repetition of list elements
print(arr * 2) # vectorization takes place[1, 2, 3, 4, 1, 2, 3, 4]
[2 4 6 8]
Subset, Slicing
arr = np.array([100, 121, 144, 196, 225, 256])
arr[1:4]array([121, 144, 196])arr[-1::-1]array([256, 225, 196, 144, 121, 100])#### Create 2 dim array
arr = np.array([[1,2,3,4,5],
[1,4,9,16,25],
[1,8,18,32,50]])
# array
print(arr)
# datatype of elements of array
print(arr.dtype)
# dimension of array
print(arr.ndim)
# size i.e no. of elements in array
print(arr.size)
# shape, no. of elements in each dimension
print(arr.shape)
# total number of bytes consumed by the array's elements (size * itemsize)
print(arr.nbytes)
# total size of the array object in memory (element data plus the ndarray object overhead)
from sys import getsizeof
getsizeof(arr)[[ 1 2 3 4 5]
[ 1 4 9 16 25]
[ 1 8 18 32 50]]
int32
2
15
(3, 5)
60
180
Indexing
arrarray([[ 1, 2, 3, 4, 5],
[ 1, 4, 9, 16, 25],
[ 1, 8, 18, 32, 50]])arr[0:2, 2:5]array([[ 3, 4, 5],
[ 9, 16, 25]])# all elements of the 3rd column (index 2)
arr[:, 2]array([ 3, 9, 18])# print elements of specific index location
arr[[2,1,1], [1,4,3]]array([ 8, 25, 16])arr_float = arr.astype('float')
print(arr_float)[[ 1. 2. 3. 4. 5.]
[ 1. 4. 9. 16. 25.]
[ 1. 8. 18. 32. 50.]]
Special numpy arrays
ones = np.ones((3,3), dtype='int')
print(ones)[[1 1 1]
[1 1 1]
[1 1 1]]zeroes = np.zeros((3,3), dtype='int')
print(zeroes)[[0 0 0]
[0 0 0]
[0 0 0]]# to extract diagonal elements
np.diag(arr)array([ 1, 4, 18])
Operations
A = np.array(np.random.randint(low=5, high=15, size=(3,5)))
Aarray([[10, 13, 11, 9, 11],
[ 6, 8, 6, 9, 10],
[10, 5, 9, 14, 11]])B = np.array(np.random.randint(low=5, high=15, size=(3,5)))
Barray([[ 5, 14, 10, 9, 6],
[ 8, 9, 8, 11, 13],
[ 8, 5, 14, 6, 11]])A + Barray([[15, 27, 21, 18, 17],
[14, 17, 14, 20, 23],
[18, 10, 23, 20, 22]])A - Barray([[ 5, -1, 1, 0, 5],
[-2, -1, -2, -2, -3],
[ 2, 0, -5, 8, 0]])A * Barray([[ 50, 182, 110, 81, 66],
[ 48, 72, 48, 99, 130],
[ 80, 25, 126, 84, 121]])
Reference VS Copy
X = np.array(np.arange(3,13)).reshape((2,5))
Xarray([[ 3, 4, 5, 6, 7],
[ 8, 9, 10, 11, 12]])# y references the same object as X, so any change made through X is reflected in y
y = X
X[0,0] = 30
print(y)[[30 4 5 6 7]
[ 8 9 10 11 12]]y is XTrue# z contains a copy of the elements of X, so later changes to X do not affect z
z = X.copy()
X[0,0] = 300
print(z)[[30 4 5 6 7]
[ 8 9 10 11 12]]z is XFalse
Datatypes
- For integer values the available data types include int8, int16, int32 and int64
- int8 uses 8 bits per value whereas int64 uses 64 bits, so each type can store a different range of values.
- The default integer type is platform dependent: it is int32 in this session, but int64 on many other platforms
- If the values we want to store fall in a small range (-128 to 127), we should use int8 to save memory
# To find the range of values that can be stored in int32 datatype
np.iinfo('int32')iinfo(min=-2147483648, max=2147483647, dtype=int32)lucky_nums = np.array([1, 3, 6, 7, 9, 11], dtype='int8')
print(lucky_nums)
print(lucky_nums.dtype)[ 1 3 6 7 9 11]
int8# comparison of memory saved by switching to an appropriate datatype
getsizeof(np.array([1, 3, 6, 7, 9, 11])) - getsizeof(np.array([1, 3, 6, 7, 9, 11], dtype='int8'))18
Load data from file
np.loadtxt()
It works if input file has no missing data
marks = np.loadtxt(r'./Numpy_Datasets/marks.txt', delimiter='\t', dtype='int8')marksarray([[ 1, 80, 89, 90],
[ 2, 78, 90, 69],
[ 3, 50, 60, 70]], dtype=int8)marks[:,1:]array([[80, 89, 90],
[78, 90, 69],
[50, 60, 70]], dtype=int8)
np.genfromtxt()
- missing values become nan
marks_1 = np.genfromtxt(r'./Numpy_Datasets/marks_1.txt', delimiter='\t')
marks_1array([[ 1., 80., 89., 90.],
[ 2., 78., nan, 69.],
[ 3., 50., 60., 70.]])# reading data from csv file
marks_csv = np.genfromtxt(r'./Numpy_datasets/marks.csv', delimiter=',', skip_header=1)
marks_csvarray([[ 1., 80., 90., 70.],
[ 2., 97., 98., 72.]])# Define format of imported data
dt = np.dtype({'names': ['Name','Rollno','Phy','Chem','Math'],
'formats': ['U16', 'int16', 'int8', 'int8', 'int8']})
marks_1_csv = np.genfromtxt(r'./Numpy_datasets/marks_1.csv', delimiter=',', skip_header=1,
dtype=dt)marks_1_csvarray([('Ankit', 1, 89, 90, 98), ('Rahul', 2, 88, 89, 98),
('Vijay', 3, 79, 89, 89), ('Roshni', 4, 88, 89, 80)],
dtype=[('Name', '<U16'), ('Rollno', '<i2'), ('Phy', 'i1'), ('Chem', 'i1'), ('Math', 'i1')])marks_1_csv.ndim1marks_1_csv.shape(4,)marks_1_csv.dtypedtype([('Name', '<U16'), ('Rollno', '<i2'), ('Phy', 'i1'), ('Chem', 'i1'), ('Math', 'i1')])marks_1_csv[0:2]array([('Ankit', 1, 89, 90, 98), ('Rahul', 2, 88, 89, 98)],
dtype=[('Name', '<U16'), ('Rollno', '<i2'), ('Phy', 'i1'), ('Chem', 'i1'), ('Math', 'i1')])# extract phy marks and find avg. Phy marks
print(marks_1_csv['Phy'])
print(np.mean(marks_1_csv['Phy']))[89 88 79 88]
86.0# For Ankit, extract Phy and Chem marks
marks_1_csv[marks_1_csv['Name'] == 'Ankit'][['Phy', 'Chem']]array([(89, 90)],
dtype={'names':['Phy','Chem'], 'formats':['i1','i1'], 'offsets':[66,67], 'itemsize':69})
Export
# Exporting single array to a file
np.save(r'./TEMP/marks_1.npy', marks_1)# Export more than one array to a file
np.savez(r'./TEMP/marks.npz', marks_1, marks_1_csv)
Load the saved arrays
a = np.load(r'./TEMP/marks_1.npy')
aarray([[ 1., 80., 89., 90.],
[ 2., 78., nan, 69.],
[ 3., 50., 60., 70.]])# load multiple arrays
b = np.load(r'./TEMP/marks.npz', allow_pickle=True)
b<numpy.lib.npyio.NpzFile at 0x19fba5dafa0>b.files['arr_0', 'arr_1']b['arr_0']array([[ 1., 80., 89., 90.],
[ 2., 78., nan, 69.],
[ 3., 50., 60., 70.]])b['arr_1']array([('Ankit', 1, 89, 90, 98), ('Rahul', 2, 88, 89, 98),
('Vijay', 3, 79, 89, 89), ('Roshni', 4, 88, 89, 80)],
dtype=[('Name', '<U16'), ('Rollno', '<i2'), ('Phy', 'i1'), ('Chem', 'i1'), ('Math', 'i1')])
Load data from url
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'dt = np.dtype({'names':['Petal_len','Petal_width','Sepal_len', 'Sepal_width','Species'],
'formats':['float','float','float','float','U16']})
iris = np.genfromtxt(url, delimiter=',', dtype=dt)iris.ndim1iris[0:2]array([(5.1, 3.5, 1.4, 0.2, 'Iris-setosa'),
(4.9, 3. , 1.4, 0.2, 'Iris-setosa')],
dtype=[('Petal_len', '<f8'), ('Petal_width', '<f8'), ('Sepal_len', '<f8'), ('Sepal_width', '<f8'), ('Species', '<U16')])
Missing Data
- In plain Python, a missing value is usually represented as None
- In NumPy, a missing value is represented as np.nan and infinity as np.inf
np.nannan
The equality operator (==) should not be used to test for missing values (np.nan): np.nan == np.nan evaluates to False, so use np.isnan() instead.
# setting value of x as np.nan
x = np.nan
# check whether x is nan using == [not the correct way: nan never compares equal to anything, including itself]
x == np.nanFalsex is np.nanTruex in [np.nan]Truedata = np.genfromtxt(r'./Numpy_Datasets/data_miss.txt', delimiter='\t')
check for missing
len(data[np.isnan(data)])2
check for infinity
len(data[np.isinf(data)])1
replace missing or inf with 0
data[np.isnan(data) | np.isinf(data)] = 0
Import data from file and extract specific values
data = np.genfromtxt(r'./Numpy_Datasets/Mall_Customers_Int.csv', delimiter=',', skip_header=1)data.shape(200, 5)data[0:3,:]
# cust_id, genre, age, annual_income, spending_scorearray([[ 1., 1., 19., 15., 39.],
[ 2., 1., 21., 15., 81.],
[ 3., 0., 20., 16., 6.]])
filter data based on conditions
# find rows where genre = 1
## Create a boolean mask
mask = data[:, 1] == 1
## print first 10 mask values
print(mask[0:10])
# print the first 10 records selected by the mask
data[mask][0:10][ True True False False False False False False True False]
array([[ 1., 1., 19., 15., 39.],
[ 2., 1., 21., 15., 81.],
[ 9., 1., 64., 19., 3.],
[11., 1., 67., 19., 14.],
[15., 1., 37., 20., 13.],
[16., 1., 22., 20., 79.],
[18., 1., 20., 21., 66.],
[19., 1., nan, 23., 29.],
[21., 1., 35., 24., 35.],
[22., 1., 25., 24., 73.]])# find rows where missing value in annual income (4th column , i.e. col with index 3)
mask_income = np.isnan(data[:,3])
data[mask_income]array([[13., 0., 58., nan, 15.],
[27., 0., 45., nan, 32.],
[57., 0., 51., nan, 50.],
[68., 0., 68., nan, 48.]])# find recs if any value is missing
data[np.isnan(data).any(axis=1)]array([[ 8., 0., 23., 18., nan],
[13., 0., 58., nan, 15.],
[17., 0., 35., 21., nan],
[19., 1., nan, 23., 29.],
[27., 0., 45., nan, 32.],
[37., 0., nan, 34., 17.],
[57., 0., 51., nan, 50.],
[68., 0., 68., nan, 48.]])# get recs with no missing value
data[~ np.isnan(data).any(axis=1)]array([[ 1., 1., 19., 15., 39.],
[ 2., 1., 21., 15., 81.],
[ 3., 0., 20., 16., 6.],
[ 4., 0., 23., 16., 77.],
[ 5., 0., 31., 17., 40.],
[ 6., 0., 22., 17., 76.],
[ 7., 0., 35., 18., 6.],
[ 9., 1., 64., 19., 3.],
[ 10., 0., 30., 19., 72.],
[ 11., 1., 67., 19., 14.],
[ 12., 0., 35., 19., 99.],
[ 14., 0., 24., 20., 77.],
[ 15., 1., 37., 20., 13.],
[ 16., 1., 22., 20., 79.],
[ 18., 1., 20., 21., 66.],
[ 20., 0., 35., 23., 98.],
[ 21., 1., 35., 24., 35.],
[ 22., 1., 25., 24., 73.],
[ 23., 0., 46., 25., 5.],
[ 24., 1., 31., 25., 73.],
[ 25., 0., 54., 28., 14.],
[ 26., 1., 29., 28., 82.],
[ 28., 1., 35., 28., 61.],
[ 29., 0., 40., 29., 31.],
[ 30., 0., 23., 29., 87.],
[ 31., 1., 60., 30., 4.],
[ 32., 0., 21., 30., 73.],
[ 33., 1., 53., 33., 4.],
[ 34., 1., 18., 33., 92.],
[ 35., 0., 49., 33., 14.],
[ 36., 0., 21., 33., 81.],
[ 38., 0., 30., 34., 73.],
[ 39., 0., 36., 37., 26.],
[ 40., 0., 20., 37., 75.],
[ 41., 0., 65., 38., 35.],
[ 42., 1., 24., 38., 92.],
[ 43., 1., 48., 39., 36.],
[ 44., 0., 31., 39., 61.],
[ 45., 0., 49., 39., 28.],
[ 46., 0., 24., 39., 65.],
[ 47., 0., 50., 40., 55.],
[ 48., 0., 27., 40., 47.],
[ 49., 0., 29., 40., 42.],
[ 50., 0., 31., 40., 42.],
[ 51., 0., 49., 42., 52.],
[ 52., 1., 33., 42., 60.],
[ 53., 0., 31., 43., 54.],
[ 54., 1., 59., 43., 60.],
[ 55., 0., 50., 43., 45.],
[ 56., 1., 47., 43., 41.],
[ 58., 1., 69., 44., 46.],
[ 59., 0., 27., 46., 51.],
[ 60., 1., 53., 46., 46.],
[ 61., 1., 70., 46., 56.],
[ 62., 1., 19., 46., 55.],
[ 63., 0., 67., 47., 52.],
[ 64., 0., 54., 47., 59.],
[ 65., 1., 63., 48., 51.],
[ 66., 1., 18., 48., 59.],
[ 67., 0., 43., 48., 50.],
[ 69., 1., 19., 48., 59.],
[ 70., 0., 32., 48., 47.],
[ 71., 1., 70., 49., 55.],
[ 72., 0., 47., 49., 42.],
[ 73., 0., 60., 50., 49.],
[ 74., 0., 60., 50., 56.],
[ 75., 1., 59., 54., 47.],
[ 76., 1., 26., 54., 54.],
[ 77., 0., 45., 54., 53.],
[ 78., 1., 40., 54., 48.],
[ 79., 0., 23., 54., 52.],
[ 80., 0., 49., 54., 42.],
[ 81., 1., 57., 54., 51.],
[ 82., 1., 38., 54., 55.],
[ 83., 1., 67., 54., 41.],
[ 84., 0., 46., 54., 44.],
[ 85., 0., 21., 54., 57.],
[ 86., 1., 48., 54., 46.],
[ 87., 0., 55., 57., 58.],
[ 88., 0., 22., 57., 55.],
[ 89., 0., 34., 58., 60.],
[ 90., 0., 50., 58., 46.],
[ 91., 0., 68., 59., 55.],
[ 92., 1., 18., 59., 41.],
[ 93., 1., 48., 60., 49.],
[ 94., 0., 40., 60., 40.],
[ 95., 0., 32., 60., 42.],
[ 96., 1., 24., 60., 52.],
[ 97., 0., 47., 60., 47.],
[ 98., 0., 27., 60., 50.],
[ 99., 1., 48., 61., 42.],
[100., 1., 20., 61., 49.],
[101., 0., 23., 62., 41.],
[102., 0., 49., 62., 48.],
[103., 1., 67., 62., 59.],
[104., 1., 26., 62., 55.],
[105., 1., 49., 62., 56.],
[106., 0., 21., 62., 42.],
[107., 0., 66., 63., 50.],
[108., 1., 54., 63., 46.],
[109., 1., 68., 63., 43.],
[110., 1., 66., 63., 48.],
[111., 1., 65., 63., 52.],
[112., 0., 19., 63., 54.],
[113., 0., 38., 64., 42.],
[114., 1., 19., 64., 46.],
[115., 0., 18., 65., 48.],
[116., 0., 19., 65., 50.],
[117., 0., 63., 65., 43.],
[118., 0., 49., 65., 59.],
[119., 0., 51., 67., 43.],
[120., 0., 50., 67., 57.],
[121., 1., 27., 67., 56.],
[122., 0., 38., 67., 40.],
[123., 0., 40., 69., 58.],
[124., 1., 39., 69., 91.],
[125., 0., 23., 70., 29.],
[126., 0., 31., 70., 77.],
[127., 1., 43., 71., 35.],
[128., 1., 40., 71., 95.],
[129., 1., 59., 71., 11.],
[130., 1., 38., 71., 75.],
[131., 1., 47., 71., 9.],
[132., 1., 39., 71., 75.],
[133., 0., 25., 72., 34.],
[134., 0., 31., 72., 71.],
[135., 1., 20., 73., 5.],
[136., 0., 29., 73., 88.],
[137., 0., 44., 73., 7.],
[138., 1., 32., 73., 73.],
[139., 1., 19., 74., 10.],
[140., 0., 35., 74., 72.],
[141., 0., 57., 75., 5.],
[142., 1., 32., 75., 93.],
[143., 0., 28., 76., 40.],
[144., 0., 32., 76., 87.],
[145., 1., 25., 77., 12.],
[146., 1., 28., 77., 97.],
[147., 1., 48., 77., 36.],
[148., 0., 32., 77., 74.],
[149., 0., 34., 78., 22.],
[150., 1., 34., 78., 90.],
[151., 1., 43., 78., 17.],
[152., 1., 39., 78., 88.],
[153., 0., 44., 78., 20.],
[154., 0., 38., 78., 76.],
[155., 0., 47., 78., 16.],
[156., 0., 27., 78., 89.],
[157., 1., 37., 78., 1.],
[158., 0., 30., 78., 78.],
[159., 1., 34., 78., 1.],
[160., 0., 30., 78., 73.],
[161., 0., 56., 79., 35.],
[162., 0., 29., 79., 83.],
[163., 1., 19., 81., 5.],
[164., 0., 31., 81., 93.],
[165., 1., 50., 85., 26.],
[166., 0., 36., 85., 75.],
[167., 1., 42., 86., 20.],
[168., 0., 33., 86., 95.],
[169., 0., 36., 87., 27.],
[170., 1., 32., 87., 63.],
[171., 1., 40., 87., 13.],
[172., 1., 28., 87., 75.],
[173., 1., 36., 87., 10.],
[174., 1., 36., 87., 92.],
[175., 0., 52., 88., 13.],
[176., 0., 30., 88., 86.],
[177., 1., 58., 88., 15.],
[178., 1., 27., 88., 69.],
[179., 1., 59., 93., 14.],
[180., 1., 35., 93., 90.],
[181., 0., 37., 97., 32.],
[182., 0., 32., 97., 86.],
[183., 1., 46., 98., 15.],
[184., 0., 29., 98., 88.],
[185., 0., 41., 99., 39.],
[186., 1., 30., 99., 97.],
[187., 0., 54., 101., 24.],
[188., 1., 28., 101., 68.],
[189., 0., 41., 103., 17.],
[190., 0., 36., 103., 85.],
[191., 0., 34., 103., 23.],
[192., 0., 32., 103., 69.],
[193., 1., 33., 113., 8.],
[194., 0., 38., 113., 91.],
[195., 0., 47., 120., 16.],
[196., 0., 35., 120., 79.],
[197., 0., 45., 126., 28.],
[198., 1., 32., 126., 74.],
[199., 1., 32., 137., 18.],
[200., 1., 30., 137., 83.]])# get max value for each col ignoring nan and inf
data_clean = data[~ (np.isnan(data).any(axis=1) | np.isinf(data).any(axis=1))].copy()data_clean.max(axis=0)array([200., 1., 70., 137., 99.])np.nanmax(data, axis=0)array([200., 1., 70., 137., 99.])
Find position of value within array
np.where(), if else logic
new_spending_score = np.where(data_clean[:,1] == 1, data_clean[:,4], data_clean[:,4]*2)
new_spending_score[0:10]array([ 39., 81., 12., 154., 80., 152., 12., 3., 144., 14.])# find position of max values
np.argmax(data_clean, axis=0)array([191, 0, 53, 190, 10], dtype=int64)data_clean[np.argmax(data_clean[3])]array([ 5., 0., 31., 17., 40.])# find position for specific value , annual income is missing
data[np.argwhere(np.isnan(data[:, 3]))]array([[[13., 0., 58., nan, 15.]],
[[27., 0., 45., nan, 32.]],
[[57., 0., 51., nan, 50.]],
[[68., 0., 68., nan, 48.]]])
Random numbers
# create a 2 dim array of random integers
np.random.randint(low=1, high=30, size=(3,3))array([[22, 23, 16],
[23, 24, 15],
[28, 20, 8]])np.random.random(size=(2,2)).round(2)array([[0.57, 0.44],
[0.2 , 0.07]])
pseudo random numbers
np.random.seed(100)
np.random.randint(low=10, high=50, size=(4,4))array([[18, 34, 13, 49],
[33, 25, 20, 40],
[44, 12, 44, 24],
[44, 34, 25, 46]])#### randomstate
rn = np.random.RandomState(100)
rn.randint(low=10, high=50, size=(4,4))array([[18, 34, 13, 49],
[33, 25, 20, 40],
[44, 12, 44, 24],
[44, 34, 25, 46]])
Uniform distribution
data_uniform = np.random.uniform(1,10,30).round(2)
data_uniformarray([4.48, 3.26, 3.69, 8.71, 5.26, 6.97, 8.25, 3.28, 1.72, 7.59, 9.65,
9.58, 5.41, 6.69, 7.6 , 9.12, 2.46, 4.65, 4.75, 7.26, 4.82, 8.72,
8.62, 1.63, 3.72, 9.82, 1.32, 5.43, 9.57, 8.3 ])
Normal distribution
data_random = np.random.normal(10, 2, 30)
data_randomarray([ 9.74080688, 9.65833918, 11.58085618, 12.18363181, 9.18983347,
11.07288787, 5.73737068, 10.52615338, 7.45788403, 11.85820023,
6.13717429, 10.04843068, 7.57261692, 10.37627391, 8.8554722 ,
15.49023593, 10.24990796, 7.86950769, 11.2061074 , 7.87706473,
6.96713901, 10.65856501, 6.03275811, 10.33427357, 9.642212 ,
12.35647832, 9.11318403, 11.4779078 , 9.69030309, 9.82788871])
Random Sampling
np.random.choice(np.arange(100), 30, replace=False)array([87, 49, 48, 45, 88, 55, 19, 72, 60, 18, 66, 16, 61, 78, 5, 2, 47,
21, 6, 50, 37, 17, 46, 15, 20, 67, 98, 43, 41, 62])
Bootstrap Sampling
np.random.choice(np.arange(30), 30, replace=True)array([25, 16, 27, 22, 0, 15, 5, 25, 3, 7, 26, 29, 6, 12, 9, 5, 20,
7, 11, 11, 15, 4, 7, 25, 18, 1, 20, 29, 10, 26])
Binomial Distribution
np.random.binomial(n=10, p=0.35, size=100)array([2, 3, 2, 3, 5, 5, 6, 3, 7, 3, 5, 3, 3, 4, 4, 4, 4, 5, 4, 3, 5, 2,
4, 3, 4, 4, 3, 5, 3, 4, 3, 3, 4, 8, 2, 4, 6, 4, 5, 4, 3, 5, 2, 6,
6, 4, 3, 3, 3, 4, 3, 2, 3, 4, 3, 3, 3, 5, 3, 5, 5, 2, 4, 3, 7, 1,
4, 3, 7, 3, 2, 7, 3, 3, 4, 6, 3, 1, 6, 4, 4, 2, 5, 4, 6, 3, 4, 5,
3, 3, 2, 2, 4, 3, 2, 3, 6, 2, 4, 5])# A new mobile product is launched; 80% of its buyers are women.
# Out of 9 randomly selected people who purchased this mobile, what is the probability that exactly 6 of them are women?
outcome = np.random.binomial(n=9, p=0.8, size=100)
np.mean(outcome==6)0.13# Generate 100 random numbers that follow a normal distribution with mean 10 and sd 3
normal_data = np.random.normal(10, 3, 100)# bootstrap this normal data 1000 times, compute mean and sd of mean
means = []
for i in range(1000):
sample = np.random.choice(normal_data, 1000, replace=True)
means.append(np.mean(sample))
# print(means)
# standard deviation of the bootstrap means, i.e. the standard error of the mean
print(np.std(means))0.09753394897171634
Set Operations
import pandas as pddf = pd.read_csv(r'./Numpy_Datasets/cars93.csv')df.head()
df.dtypesManufacturer object
Model object
Type object
Min.Price float64
Price float64
Max.Price float64
MPG.city int64
MPG.highway int64
AirBags object
DriveTrain object
Cylinders object
EngineSize float64
Horsepower int64
RPM int64
Rev.per.mile int64
Man.trans.avail object
Fuel.tank.capacity float64
Passengers int64
Length int64
Wheelbase int64
Width int64
Turn.circle int64
Rear.seat.room float64
Luggage.room float64
Weight int64
Origin object
Make object
dtype: objectdf.shape(93, 27)# Unique value of manufacturer
np.unique(df.Manufacturer)array(['Acura', 'Audi', 'BMW', 'Buick', 'Cadillac', 'Chevrolet',
'Chrylser', 'Chrysler', 'Dodge', 'Eagle', 'Ford', 'Geo', 'Honda',
'Hyundai', 'Infiniti', 'Lexus', 'Lincoln', 'Mazda',
'Mercedes-Benz', 'Mercury', 'Mitsubishi', 'Nissan', 'Oldsmobile',
'Plymouth', 'Pontiac', 'Saab', 'Saturn', 'Subaru', 'Suzuki',
'Toyota', 'Volkswagen', 'Volvo'], dtype=object)keys, values = np.unique(df.Manufacturer, return_counts=True)
dict(zip(keys, values)){'Acura': 2,
'Audi': 2,
'BMW': 1,
'Buick': 4,
'Cadillac': 2,
'Chevrolet': 8,
'Chrylser': 1,
'Chrysler': 2,
'Dodge': 6,
'Eagle': 2,
'Ford': 8,
'Geo': 2,
'Honda': 3,
'Hyundai': 4,
'Infiniti': 1,
'Lexus': 2,
'Lincoln': 2,
'Mazda': 5,
'Mercedes-Benz': 2,
'Mercury': 2,
'Mitsubishi': 2,
'Nissan': 4,
'Oldsmobile': 4,
'Plymouth': 1,
'Pontiac': 5,
'Saab': 1,
'Saturn': 1,
'Subaru': 3,
'Suzuki': 1,
'Toyota': 4,
'Volkswagen': 4,
'Volvo': 2}# record corresponding to the first occurrence of each manufacturer
keys, index = np.unique(df.Manufacturer, return_index=True)
df.iloc[index].head()
A = np.random.choice(df.Manufacturer, size=5, replace=True)
print(A)
B = np.random.choice(df.Manufacturer, size=5, replace=True)
print(B)['Dodge' 'Pontiac' 'Ford' 'Chevrolet' 'Honda']
['Nissan' 'Oldsmobile' 'Honda' 'Cadillac' 'Dodge']# Find elements of A that are in B
A_in_B_mask = np.in1d(A, B)
A[A_in_B_mask]array(['Dodge', 'Honda'], dtype=object)# Elements of B present in A
B_in_A_mask = np.in1d(B, A)
B[B_in_A_mask]array(['Honda', 'Dodge'], dtype=object)
Union
np.union1d(A,B)array(['Cadillac', 'Chevrolet', 'Dodge', 'Ford', 'Honda', 'Nissan',
'Oldsmobile', 'Pontiac'], dtype=object)np.intersect1d(A,B)array(['Dodge', 'Honda'], dtype=object)# A-B
np.setdiff1d(A,B)array(['Chevrolet', 'Ford', 'Pontiac'], dtype=object)
Code Challenge
np.random.seed(100)
alphabets_list = []
for i in range(65, 65+26):
alphabets_list.append(chr(i))ABCDEFGHIJKLMNOPQRSTUVWXYZA = np.random.choice(alphabets_list, 10)
B = np.random.choice(alphabets_list, 20)
C = np.random.choice(alphabets_list, 5)
print(A)
print(B)
print(C)['I' 'Y' 'D' 'H' 'X' 'P' 'Q' 'K' 'U' 'C']
['V' 'C' 'C' 'O' 'C' 'R' 'Q' 'Y' 'P' 'E' 'L' 'Q' 'J' 'W' 'C' 'M' 'E' 'B'
'N' 'V']
['T' 'E' 'E' 'D' 'H']# Elements common in A & B but not in C
np.setdiff1d(np.intersect1d(A,B), C)array(['C', 'P', 'Q', 'Y'], dtype='<U1')# Elements of A that are in neither B nor C
np.setdiff1d(np.setdiff1d(A,B),C)array(['I', 'K', 'U', 'X'], dtype='<U1')
Statistical Summaries
df.Weight0 2705
1 3560
2 3375
3 3405
4 3640
...
88 3960
89 2985
90 2810
91 2985
92 3245
Name: Weight, Length: 93, dtype: int64# min, max
np.min(df.Weight)1695np.max(df.Weight)4105# Mean, Median
np.mean(df.Weight)3072.9032258064517np.median(df.Weight)3040.0# Percentile
np.percentile(df.Weight, 50)3040.0# Quantile
np.quantile(df.Weight, q=.50)3040.0# Standard deviation , Variance
np.std(df.Weight)586.7164519048623np.var(df.Weight)344236.1949358306# Coefficient of variation
np.std(df.Weight)/ np.mean(df.Weight)0.190932290668179# Correlation
np.corrcoef(df.Weight, df['MPG.city'])array([[ 1. , -0.84313855],
[-0.84313855, 1. ]])# covariance
np.cov(df.Weight, df['MPG.city'])array([[ 3.47977893e+05, -2.79509467e+03],
[-2.79509467e+03, 3.15822814e+01]])
Challenge
arr = np. array([112, 118, 132, 129, 121, 135, 148, 148, 136, 119, 104, 118, 115,
126, 141, 135, 125, 149, 170, 170, 158, 133, 114, 140, 145, 150,
178, 163, 172, 178, 199, 199, 184, 162, 146, 166, 171, 180, 193,
181, 183, 218, 230, 242, 209, 191, 172, 194, 196, 196, 236, 235,
229, 243, 264, 272, 237, 211, 180, 201, 204, 188, 235, 227, 234,
264, 302, 293, 259, 229, 203, 229, 242, 233, 267, 269, 270, 315,
364, 347, 312, 274, 237, 278, 284, 277, 317, 313, 318, 374, 413,
405, 355, 306, 271, 306, 315, 301, 356, 348, 355, 422, 465, 467,
404, 347, 305, 336, 340, 318, 362, 348, 363, 435, 491, 505, 404,
359, 310, 337, 360, 342, 406, 396, 420, 472, 548, 559, 463, 407,
362, 405, 417, 391, 419, 461, 472, 535, 622, 606, 508, 461, 390,
432], dtype=float)
arrarray([112., 118., 132., 129., 121., 135., 148., 148., 136., 119., 104.,
118., 115., 126., 141., 135., 125., 149., 170., 170., 158., 133.,
114., 140., 145., 150., 178., 163., 172., 178., 199., 199., 184.,
162., 146., 166., 171., 180., 193., 181., 183., 218., 230., 242.,
209., 191., 172., 194., 196., 196., 236., 235., 229., 243., 264.,
272., 237., 211., 180., 201., 204., 188., 235., 227., 234., 264.,
302., 293., 259., 229., 203., 229., 242., 233., 267., 269., 270.,
315., 364., 347., 312., 274., 237., 278., 284., 277., 317., 313.,
318., 374., 413., 405., 355., 306., 271., 306., 315., 301., 356.,
348., 355., 422., 465., 467., 404., 347., 305., 336., 340., 318.,
362., 348., 363., 435., 491., 505., 404., 359., 310., 337., 360.,
342., 406., 396., 420., 472., 548., 559., 463., 407., 362., 405.,
417., 391., 419., 461., 472., 535., 622., 606., 508., 461., 390.,
432.])# create another array arr_miss and introduce some missing values in it .
arr_miss = arr.copy()
# Introduce missing values
np.random.seed(100)
missing_loc = []
for i in range(5):
loc = np.random.randint(0, len(arr_miss)); missing_loc.append(loc)
arr_miss[loc] = np.nan
arr_missarray([112., 118., 132., 129., 121., 135., 148., 148., nan, 119., 104.,
118., 115., 126., 141., 135., 125., 149., 170., 170., 158., 133.,
114., 140., nan, 150., 178., 163., 172., 178., 199., 199., 184.,
162., 146., 166., 171., 180., 193., 181., 183., 218., 230., 242.,
209., 191., 172., 194., 196., 196., 236., 235., 229., 243., 264.,
272., 237., 211., 180., 201., 204., 188., 235., 227., 234., 264.,
302., nan, 259., 229., 203., 229., 242., 233., 267., 269., 270.,
315., 364., 347., 312., 274., 237., 278., 284., 277., 317., nan,
318., 374., 413., 405., 355., 306., 271., 306., 315., 301., 356.,
348., 355., 422., 465., nan, 404., 347., 305., 336., 340., 318.,
362., 348., 363., 435., 491., 505., 404., 359., 310., 337., 360.,
342., 406., 396., 420., 472., 548., 559., 463., 407., 362., 405.,
417., 391., 419., 461., 472., 535., 622., 606., 508., 461., 390.,
432.])# Fill missing values with avg of prior and next value
miss_index = np.where(np.isnan(arr_miss))miss_index(array([ 8, 24, 67, 87, 103], dtype=int64),)arr_miss[miss_index]array([nan, nan, nan, nan, nan])# mean value of elements , excluding nan
np.nanmean(arr_miss).round(2)280.64# Find mean based on prior and next values
for i in miss_index:
arr_miss[i] = (arr_miss[i-1] + arr_miss[i+1])/2.0
arr_miss[miss_index]array([133.5, 145. , 280.5, 317.5, 434.5])# Actual values
arr[miss_index]array([136., 145., 293., 313., 467.])
Challenge
# turn off scientific notation
np.set_printoptions(suppress=True)
# create array , setting seed first
np.random.seed(100)
arr = np.random.normal(100, 75, size=300).round(3)np.quantile(arr,.75) - np.quantile(arr,.25)105.18025# Find IQR [interquartile range, i.e. the difference between the 75th and 25th percentiles]
np.percentile(arr, q=75) - np.percentile(arr,q=25)105.18025# capping
# replace values greater than the 95th percentile with the 95th percentile
# replace values less than the 5th percentile with the 5th percentile
upper_cap = np.percentile(arr, q=95).round(3)
print(f"Upper cap is {upper_cap}")
lower_cap = np.percentile(arr, q=5).round(3)
print(f"Lower cap is {lower_cap}")Upper cap is 228.043
Lower cap is -26.388arr = np.where(arr < lower_cap, lower_cap, arr)arr = np.where(arr> upper_cap, upper_cap, arr)min(arr), max(arr)(-26.388, 228.043)
Reshaping and new Axis
arr = np.array([[1,2], [3,4], [5,6]])
print(arr)
arr.shape[[1 2]
[3 4]
[5 6]]
(3, 2)arr2 = arr.reshape(2,3)
arr2array([[1, 2, 3],
[4, 5, 6]])arr2 is arrFalse# use -1 to let NumPy infer the number of elements in that dimension
x = np.arange(12).reshape(-1, 3)
xarray([[ 0, 1, 2],
[ 3, 4, 5],
[ 6, 7, 8],
[ 9, 10, 11]])# resize changes the shape of array inplace
x.resize(6,2)xarray([[ 0, 1],
[ 2, 3],
[ 4, 5],
[ 6, 7],
[ 8, 9],
[10, 11]])# Add axis
a = np.arange(6)
print(a)
print(a.shape)
print(a.ndim)[0 1 2 3 4 5]
(6,)
1a1 = a[:, np.newaxis]
print(a1)
print(a1.shape)[[0]
[1]
[2]
[3]
[4]
[5]]
(6, 1)a2 = a[np.newaxis,:]
print(a2)
print(a2.shape)[[0 1 2 3 4 5]]
(1, 6)
Flatten an array
a1.flatten()array([0, 1, 2, 3, 4, 5])### Reshape array
z = np.arange(24)
print(z)
# reshape into 4 rows
print(z.reshape(4, -1))[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
[[ 0 1 2 3 4 5]
[ 6 7 8 9 10 11]
[12 13 14 15 16 17]
[18 19 20 21 22 23]]
Sequences and Repetitions
np.arange(1, 10, 2)array([1, 3, 5, 7, 9])np.arange(20, 1, -3)array([20, 17, 14, 11, 8, 5, 2])# linearly (equally) spaced values
np.linspace(1,5,10)array([1. , 1.44444444, 1.88888889, 2.33333333, 2.77777778,
3.22222222, 3.66666667, 4.11111111, 4.55555556, 5. ])np.linspace(1,5,10, retstep=True)(array([1. , 1.44444444, 1.88888889, 2.33333333, 2.77777778,
3.22222222, 3.66666667, 4.11111111, 4.55555556, 5. ]),
0.4444444444444444)# logspace: values equally spaced on a log scale, here from 10**1 to 10**5
np.logspace(1, 5, 10)array([1.00000000e+01, 2.78255940e+01, 7.74263683e+01, 2.15443469e+02,
5.99484250e+02, 1.66810054e+03, 4.64158883e+03, 1.29154967e+04,
3.59381366e+04, 1.00000000e+05])# repeat sequence 'x' for n times
np.tile([1,2,3,4], 3)array([1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4])# repeat each element 3 times
np.repeat([1,2,3,4], 3)array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4])a = np.array([1,2,3])
aarray([1, 2, 3])# Challenge to produce pattern
np.array([np.repeat(a, 3),np.tile(a,3)]).flatten()array([1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3])