Numpy Array

18 min readNov 21, 2021

Numpy -01
Post- Numpy 01:Array
This post covers basic operations on NumPy arrays.

import numpy as npprint(np.__version__)1.20.1

Create numpy array from a list

The main data structure in NumPy is the ndarray (n-dimensional array).

# let's take a list of numbers and create a numpy array from it
nums_list = [1,2,3,4]
print(f"List: {nums_list}")

arr = np.array(nums_list)
print(f"Array: {arr}")
print(type(arr))List: [1, 2, 3, 4]
Array: [1 2 3 4]
<class 'numpy.ndarray'>

Types of ndarray

Scalar
Vector
2 dim Matrix
n dim Matrix

A NumPy array is a homogeneous data structure, i.e. all elements within an ndarray have the same data type.

Vectorization

Vectorization means applying a mathematical operation to every element of an ndarray without explicitly writing a for loop.

# notice difference between List and array operation

print(nums_list * 2) # repetition of list elements

print(arr * 2) # vectorization takes place[1, 2, 3, 4, 1, 2, 3, 4]
[2 4 6 8]

Subset, Slicing

arr = np.array([100, 121, 144, 196, 225, 256])
arr[1:4]array([121, 144, 196])arr[-1::-1]array([256, 225, 196, 144, 121, 100])#### Create 2 dim array
arr = np.array([[1,2,3,4,5],
               [1,4,9,16,25],
               [1,8,18,32,50]])

# array
print(arr)

# datatype of elements of array
print(arr.dtype)

# dimension of array
print(arr.ndim)

# size i.e no. of elements in array
print(arr.size)

# shape, no. of elements in each dimension
print(arr.shape)

# total number of bytes consumed by the elements of the array
print(arr.nbytes) 

# size of the array object as reported by sys.getsizeof (here: element data plus ndarray overhead)
from sys import getsizeof
getsizeof(arr)[[ 1  2  3  4  5]
 [ 1  4  9 16 25]
 [ 1  8 18 32 50]]
int32
2
15
(3, 5)
60





180

Indexing

arrarray([[ 1,  2,  3,  4,  5],
       [ 1,  4,  9, 16, 25],
       [ 1,  8, 18, 32, 50]])arr[0:2, 2:5]array([[ 3,  4,  5],
       [ 9, 16, 25]])# all elements of column 3
arr[:, 2]array([ 3,  9, 18])# print elements of specific index location
arr[[2,1,1], [1,4,3]]array([ 8, 25, 16])arr_float = arr.astype('float')
print(arr_float)[[ 1.  2.  3.  4.  5.]
 [ 1.  4.  9. 16. 25.]
 [ 1.  8. 18. 32. 50.]]

Special numpy arrays

ones = np.ones((3,3), dtype='int')
print(ones)[[1 1 1]
 [1 1 1]
 [1 1 1]]zeroes = np.zeros((3,3), dtype='int')
print(zeroes)[[0 0 0]
 [0 0 0]
 [0 0 0]]# to extract diagonal elements
np.diag(arr)array([ 1,  4, 18])

Operations

A = np.array(np.random.randint(low=5, high=15, size=(3,5)))
Aarray([[10, 13, 11,  9, 11],
       [ 6,  8,  6,  9, 10],
       [10,  5,  9, 14, 11]])B = np.array(np.random.randint(low=5, high=15, size=(3,5)))
Barray([[ 5, 14, 10,  9,  6],
       [ 8,  9,  8, 11, 13],
       [ 8,  5, 14,  6, 11]])A + Barray([[15, 27, 21, 18, 17],
       [14, 17, 14, 20, 23],
       [18, 10, 23, 20, 22]])A - Barray([[ 5, -1,  1,  0,  5],
       [-2, -1, -2, -2, -3],
       [ 2,  0, -5,  8,  0]])A * Barray([[ 50, 182, 110,  81,  66],
       [ 48,  72,  48,  99, 130],
       [ 80,  25, 126,  84, 121]])

Reference VS Copy

X = np.array(np.arange(3,13)).reshape((2,5))
Xarray([[ 3,  4,  5,  6,  7],
       [ 8,  9, 10, 11, 12]])# y references the same array as X, so any change made to X is reflected in y
y = X 
X[0,0] = 30
print(y)[[30  4  5  6  7]
 [ 8  9 10 11 12]]y is XTrue# z contains copy of elements of X
z = X.copy()
X[0,0] = 300
print(z)[[30  4  5  6  7]
 [ 8  9 10 11 12]]z is XFalse

Datatypes

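For integer values NumPy offers data types such as int8, int16, int32 and int64.
An int8 value occupies 8 bits whereas an int64 value occupies 64 bits, so each type can store a different range of values (for example, int8 holds -128 to 127).
The default integer type is int32 on the machine used for this post; the default is platform dependent (NumPy 1.x uses int64 on most 64-bit Linux/macOS builds).
If the values we want to store fall within a small range, we should use a smaller type such as int8 to save memory.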

# To find the range of values that can be stored in int32 datatype
np.iinfo('int32')iinfo(min=-2147483648, max=2147483647, dtype=int32)lucky_nums = np.array([1, 3, 6, 7, 9, 11], dtype='int8')
print(lucky_nums)
print(lucky_nums.dtype)[ 1  3  6  7  9 11]
int8# comparison of memory saving by switiching to appropriate datatype
getsizeof(np.array([1, 3, 6, 7, 9, 11])) - getsizeof(np.array([1, 3, 6, 7, 9, 11], dtype='int8'))18

Load data from file

np.loadtxt()

It works if input file has no missing data

marks = np.loadtxt(r'./Numpy_Datasets/marks.txt', delimiter='\t', dtype='int8')marksarray([[ 1, 80, 89, 90],
       [ 2, 78, 90, 69],
       [ 3, 50, 60, 70]], dtype=int8)marks[:,1:]array([[80, 89, 90],
       [78, 90, 69],
       [50, 60, 70]], dtype=int8)

np.genfromtxt()

missing values become nan

marks_1 = np.genfromtxt(r'./Numpy_Datasets/marks_1.txt', delimiter='\t')
marks_1array([[ 1., 80., 89., 90.],
       [ 2., 78., nan, 69.],
       [ 3., 50., 60., 70.]])# reading data from csv file
marks_csv = np.genfromtxt(r'./Numpy_datasets/marks.csv', delimiter=',', skip_header=1)
marks_csvarray([[ 1., 80., 90., 70.],
       [ 2., 97., 98., 72.]])# Define format of imported data

dt = np.dtype({'names': ['Name','Rollno','Phy','Chem','Math'],
              'formats': ['U16', 'int16', 'int8', 'int8', 'int8']})

marks_1_csv = np.genfromtxt(r'./Numpy_datasets/marks_1.csv', delimiter=',', skip_header=1, 
              dtype=dt)marks_1_csvarray([('Ankit', 1, 89, 90, 98), ('Rahul', 2, 88, 89, 98),
       ('Vijay', 3, 79, 89, 89), ('Roshni', 4, 88, 89, 80)],
      dtype=[('Name', '<U16'), ('Rollno', '<i2'), ('Phy', 'i1'), ('Chem', 'i1'), ('Math', 'i1')])marks_1_csv.ndim1marks_1_csv.shape(4,)marks_1_csv.dtypedtype([('Name', '<U16'), ('Rollno', '<i2'), ('Phy', 'i1'), ('Chem', 'i1'), ('Math', 'i1')])marks_1_csv[0:2]array([('Ankit', 1, 89, 90, 98), ('Rahul', 2, 88, 89, 98)],
      dtype=[('Name', '<U16'), ('Rollno', '<i2'), ('Phy', 'i1'), ('Chem', 'i1'), ('Math', 'i1')])# extract phy marks and find avg. Phy marks
print(marks_1_csv['Phy'])

print(np.mean(marks_1_csv['Phy']))[89 88 79 88]
86.0
# For Ankit, extract Phy and Chem marks
marks_1_csv[marks_1_csv['Name'] == 'Ankit'][['Phy', 'Chem']]array([(89, 90)],
      dtype={'names':['Phy','Chem'], 'formats':['i1','i1'], 'offsets':[66,67], 'itemsize':69})

Export

# Exporting single array to a file
np.save(r'./TEMP/marks_1.npy', marks_1)
# Export more than one array to a file
np.savez(r'./TEMP/marks.npz', marks_1, marks_1_csv)

Load the saved arrays

a = np.load(r'./TEMP/marks_1.npy')
aarray([[ 1., 80., 89., 90.],
       [ 2., 78., nan, 69.],
       [ 3., 50., 60., 70.]])# load multiple arrays
b = np.load(r'./TEMP/marks.npz', allow_pickle=True)
b<numpy.lib.npyio.NpzFile at 0x19fba5dafa0>b.files['arr_0', 'arr_1']b['arr_0']array([[ 1., 80., 89., 90.],
       [ 2., 78., nan, 69.],
       [ 3., 50., 60., 70.]])b['arr_1']array([('Ankit', 1, 89, 90, 98), ('Rahul', 2, 88, 89, 98),
       ('Vijay', 3, 79, 89, 89), ('Roshni', 4, 88, 89, 80)],
      dtype=[('Name', '<U16'), ('Rollno', '<i2'), ('Phy', 'i1'), ('Chem', 'i1'), ('Math', 'i1')])

Load data from url

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'dt = np.dtype({'names':['Petal_len','Petal_width','Sepal_len', 'Sepal_width','Species'],
              'formats':['float','float','float','float','U16']})
iris = np.genfromtxt(url, delimiter=',', dtype=dt)iris.ndim1iris[0:2]array([(5.1, 3.5, 1.4, 0.2, 'Iris-setosa'),
       (4.9, 3. , 1.4, 0.2, 'Iris-setosa')],
      dtype=[('Petal_len', '<f8'), ('Petal_width', '<f8'), ('Sepal_len', '<f8'), ('Sepal_width', '<f8'), ('Species', '<U16')])

Missing Data

In Python, missing value is represented as None
In numpy, missing value is np.nan and infinity is np.inf

np.nannan

The equality comparison (==) should not be used to test for missing values (np.nan)

# setting value of x as np.nan
x = np.nan

# check whether x holds a nan value using == [not the correct way; use np.isnan(x) instead]
x == np.nanFalsex is np.nanTruex in [np.nan]Truedata = np.genfromtxt(r'./Numpy_Datasets/data_miss.txt', delimiter='\t')

check for missing

len(data[np.isnan(data)])2

check for infinity

len(data[np.isinf(data)])1

replace missing or inf with 0

data[np.isnan(data) | np.isinf(data)] = 0

Import data from file and extract specific values

data = np.genfromtxt(r'./Numpy_Datasets/Mall_Customers_Int.csv', delimiter=',', skip_header=1)data.shape(200, 5)data[0:3,:]

# cust_id, genre, age, annual_income, spending_scorearray([[ 1.,  1., 19., 15., 39.],
       [ 2.,  1., 21., 15., 81.],
       [ 3.,  0., 20., 16.,  6.]])

filter data based on conditions

# find rows where genre = 1

## Create a boolean mask
mask = data[:, 1] == 1

## print first 10 mask values
print(mask[0:10])

# print first 10 records selected by the mask
data[mask][0:10][ True  True False False False False False False  True False]





array([[ 1.,  1., 19., 15., 39.],
       [ 2.,  1., 21., 15., 81.],
       [ 9.,  1., 64., 19.,  3.],
       [11.,  1., 67., 19., 14.],
       [15.,  1., 37., 20., 13.],
       [16.,  1., 22., 20., 79.],
       [18.,  1., 20., 21., 66.],
       [19.,  1., nan, 23., 29.],
       [21.,  1., 35., 24., 35.],
       [22.,  1., 25., 24., 73.]])# find rows where missing value in annual income (4th column , i.e. col with index 3)
mask_income = np.isnan(data[:,3])
data[mask_income]array([[13.,  0., 58., nan, 15.],
       [27.,  0., 45., nan, 32.],
       [57.,  0., 51., nan, 50.],
       [68.,  0., 68., nan, 48.]])# find recs if any value is missing

data[np.isnan(data).any(axis=1)]array([[ 8.,  0., 23., 18., nan],
       [13.,  0., 58., nan, 15.],
       [17.,  0., 35., 21., nan],
       [19.,  1., nan, 23., 29.],
       [27.,  0., 45., nan, 32.],
       [37.,  0., nan, 34., 17.],
       [57.,  0., 51., nan, 50.],
       [68.,  0., 68., nan, 48.]])# get recs with no missing value
data[~ np.isnan(data).any(axis=1)]array([[  1.,   1.,  19.,  15.,  39.],
       [  2.,   1.,  21.,  15.,  81.],
       [  3.,   0.,  20.,  16.,   6.],
       [  4.,   0.,  23.,  16.,  77.],
       [  5.,   0.,  31.,  17.,  40.],
       [  6.,   0.,  22.,  17.,  76.],
       [  7.,   0.,  35.,  18.,   6.],
       [  9.,   1.,  64.,  19.,   3.],
       [ 10.,   0.,  30.,  19.,  72.],
       [ 11.,   1.,  67.,  19.,  14.],
       [ 12.,   0.,  35.,  19.,  99.],
       [ 14.,   0.,  24.,  20.,  77.],
       [ 15.,   1.,  37.,  20.,  13.],
       [ 16.,   1.,  22.,  20.,  79.],
       [ 18.,   1.,  20.,  21.,  66.],
       [ 20.,   0.,  35.,  23.,  98.],
       [ 21.,   1.,  35.,  24.,  35.],
       [ 22.,   1.,  25.,  24.,  73.],
       [ 23.,   0.,  46.,  25.,   5.],
       [ 24.,   1.,  31.,  25.,  73.],
       [ 25.,   0.,  54.,  28.,  14.],
       [ 26.,   1.,  29.,  28.,  82.],
       [ 28.,   1.,  35.,  28.,  61.],
       [ 29.,   0.,  40.,  29.,  31.],
       [ 30.,   0.,  23.,  29.,  87.],
       [ 31.,   1.,  60.,  30.,   4.],
       [ 32.,   0.,  21.,  30.,  73.],
       [ 33.,   1.,  53.,  33.,   4.],
       [ 34.,   1.,  18.,  33.,  92.],
       [ 35.,   0.,  49.,  33.,  14.],
       [ 36.,   0.,  21.,  33.,  81.],
       [ 38.,   0.,  30.,  34.,  73.],
       [ 39.,   0.,  36.,  37.,  26.],
       [ 40.,   0.,  20.,  37.,  75.],
       [ 41.,   0.,  65.,  38.,  35.],
       [ 42.,   1.,  24.,  38.,  92.],
       [ 43.,   1.,  48.,  39.,  36.],
       [ 44.,   0.,  31.,  39.,  61.],
       [ 45.,   0.,  49.,  39.,  28.],
       [ 46.,   0.,  24.,  39.,  65.],
       [ 47.,   0.,  50.,  40.,  55.],
       [ 48.,   0.,  27.,  40.,  47.],
       [ 49.,   0.,  29.,  40.,  42.],
       [ 50.,   0.,  31.,  40.,  42.],
       [ 51.,   0.,  49.,  42.,  52.],
       [ 52.,   1.,  33.,  42.,  60.],
       [ 53.,   0.,  31.,  43.,  54.],
       [ 54.,   1.,  59.,  43.,  60.],
       [ 55.,   0.,  50.,  43.,  45.],
       [ 56.,   1.,  47.,  43.,  41.],
       [ 58.,   1.,  69.,  44.,  46.],
       [ 59.,   0.,  27.,  46.,  51.],
       [ 60.,   1.,  53.,  46.,  46.],
       [ 61.,   1.,  70.,  46.,  56.],
       [ 62.,   1.,  19.,  46.,  55.],
       [ 63.,   0.,  67.,  47.,  52.],
       [ 64.,   0.,  54.,  47.,  59.],
       [ 65.,   1.,  63.,  48.,  51.],
       [ 66.,   1.,  18.,  48.,  59.],
       [ 67.,   0.,  43.,  48.,  50.],
       [ 69.,   1.,  19.,  48.,  59.],
       [ 70.,   0.,  32.,  48.,  47.],
       [ 71.,   1.,  70.,  49.,  55.],
       [ 72.,   0.,  47.,  49.,  42.],
       [ 73.,   0.,  60.,  50.,  49.],
       [ 74.,   0.,  60.,  50.,  56.],
       [ 75.,   1.,  59.,  54.,  47.],
       [ 76.,   1.,  26.,  54.,  54.],
       [ 77.,   0.,  45.,  54.,  53.],
       [ 78.,   1.,  40.,  54.,  48.],
       [ 79.,   0.,  23.,  54.,  52.],
       [ 80.,   0.,  49.,  54.,  42.],
       [ 81.,   1.,  57.,  54.,  51.],
       [ 82.,   1.,  38.,  54.,  55.],
       [ 83.,   1.,  67.,  54.,  41.],
       [ 84.,   0.,  46.,  54.,  44.],
       [ 85.,   0.,  21.,  54.,  57.],
       [ 86.,   1.,  48.,  54.,  46.],
       [ 87.,   0.,  55.,  57.,  58.],
       [ 88.,   0.,  22.,  57.,  55.],
       [ 89.,   0.,  34.,  58.,  60.],
       [ 90.,   0.,  50.,  58.,  46.],
       [ 91.,   0.,  68.,  59.,  55.],
       [ 92.,   1.,  18.,  59.,  41.],
       [ 93.,   1.,  48.,  60.,  49.],
       [ 94.,   0.,  40.,  60.,  40.],
       [ 95.,   0.,  32.,  60.,  42.],
       [ 96.,   1.,  24.,  60.,  52.],
       [ 97.,   0.,  47.,  60.,  47.],
       [ 98.,   0.,  27.,  60.,  50.],
       [ 99.,   1.,  48.,  61.,  42.],
       [100.,   1.,  20.,  61.,  49.],
       [101.,   0.,  23.,  62.,  41.],
       [102.,   0.,  49.,  62.,  48.],
       [103.,   1.,  67.,  62.,  59.],
       [104.,   1.,  26.,  62.,  55.],
       [105.,   1.,  49.,  62.,  56.],
       [106.,   0.,  21.,  62.,  42.],
       [107.,   0.,  66.,  63.,  50.],
       [108.,   1.,  54.,  63.,  46.],
       [109.,   1.,  68.,  63.,  43.],
       [110.,   1.,  66.,  63.,  48.],
       [111.,   1.,  65.,  63.,  52.],
       [112.,   0.,  19.,  63.,  54.],
       [113.,   0.,  38.,  64.,  42.],
       [114.,   1.,  19.,  64.,  46.],
       [115.,   0.,  18.,  65.,  48.],
       [116.,   0.,  19.,  65.,  50.],
       [117.,   0.,  63.,  65.,  43.],
       [118.,   0.,  49.,  65.,  59.],
       [119.,   0.,  51.,  67.,  43.],
       [120.,   0.,  50.,  67.,  57.],
       [121.,   1.,  27.,  67.,  56.],
       [122.,   0.,  38.,  67.,  40.],
       [123.,   0.,  40.,  69.,  58.],
       [124.,   1.,  39.,  69.,  91.],
       [125.,   0.,  23.,  70.,  29.],
       [126.,   0.,  31.,  70.,  77.],
       [127.,   1.,  43.,  71.,  35.],
       [128.,   1.,  40.,  71.,  95.],
       [129.,   1.,  59.,  71.,  11.],
       [130.,   1.,  38.,  71.,  75.],
       [131.,   1.,  47.,  71.,   9.],
       [132.,   1.,  39.,  71.,  75.],
       [133.,   0.,  25.,  72.,  34.],
       [134.,   0.,  31.,  72.,  71.],
       [135.,   1.,  20.,  73.,   5.],
       [136.,   0.,  29.,  73.,  88.],
       [137.,   0.,  44.,  73.,   7.],
       [138.,   1.,  32.,  73.,  73.],
       [139.,   1.,  19.,  74.,  10.],
       [140.,   0.,  35.,  74.,  72.],
       [141.,   0.,  57.,  75.,   5.],
       [142.,   1.,  32.,  75.,  93.],
       [143.,   0.,  28.,  76.,  40.],
       [144.,   0.,  32.,  76.,  87.],
       [145.,   1.,  25.,  77.,  12.],
       [146.,   1.,  28.,  77.,  97.],
       [147.,   1.,  48.,  77.,  36.],
       [148.,   0.,  32.,  77.,  74.],
       [149.,   0.,  34.,  78.,  22.],
       [150.,   1.,  34.,  78.,  90.],
       [151.,   1.,  43.,  78.,  17.],
       [152.,   1.,  39.,  78.,  88.],
       [153.,   0.,  44.,  78.,  20.],
       [154.,   0.,  38.,  78.,  76.],
       [155.,   0.,  47.,  78.,  16.],
       [156.,   0.,  27.,  78.,  89.],
       [157.,   1.,  37.,  78.,   1.],
       [158.,   0.,  30.,  78.,  78.],
       [159.,   1.,  34.,  78.,   1.],
       [160.,   0.,  30.,  78.,  73.],
       [161.,   0.,  56.,  79.,  35.],
       [162.,   0.,  29.,  79.,  83.],
       [163.,   1.,  19.,  81.,   5.],
       [164.,   0.,  31.,  81.,  93.],
       [165.,   1.,  50.,  85.,  26.],
       [166.,   0.,  36.,  85.,  75.],
       [167.,   1.,  42.,  86.,  20.],
       [168.,   0.,  33.,  86.,  95.],
       [169.,   0.,  36.,  87.,  27.],
       [170.,   1.,  32.,  87.,  63.],
       [171.,   1.,  40.,  87.,  13.],
       [172.,   1.,  28.,  87.,  75.],
       [173.,   1.,  36.,  87.,  10.],
       [174.,   1.,  36.,  87.,  92.],
       [175.,   0.,  52.,  88.,  13.],
       [176.,   0.,  30.,  88.,  86.],
       [177.,   1.,  58.,  88.,  15.],
       [178.,   1.,  27.,  88.,  69.],
       [179.,   1.,  59.,  93.,  14.],
       [180.,   1.,  35.,  93.,  90.],
       [181.,   0.,  37.,  97.,  32.],
       [182.,   0.,  32.,  97.,  86.],
       [183.,   1.,  46.,  98.,  15.],
       [184.,   0.,  29.,  98.,  88.],
       [185.,   0.,  41.,  99.,  39.],
       [186.,   1.,  30.,  99.,  97.],
       [187.,   0.,  54., 101.,  24.],
       [188.,   1.,  28., 101.,  68.],
       [189.,   0.,  41., 103.,  17.],
       [190.,   0.,  36., 103.,  85.],
       [191.,   0.,  34., 103.,  23.],
       [192.,   0.,  32., 103.,  69.],
       [193.,   1.,  33., 113.,   8.],
       [194.,   0.,  38., 113.,  91.],
       [195.,   0.,  47., 120.,  16.],
       [196.,   0.,  35., 120.,  79.],
       [197.,   0.,  45., 126.,  28.],
       [198.,   1.,  32., 126.,  74.],
       [199.,   1.,  32., 137.,  18.],
       [200.,   1.,  30., 137.,  83.]])# get max value for each col ignoring nan and inf

data_clean = data[~ (np.isnan(data).any(axis=1) | np.isinf(data).any(axis=1))].copy()data_clean.max(axis=0)array([200.,   1.,  70., 137.,  99.])np.nanmax(data, axis=0)array([200.,   1.,  70., 137.,  99.])

Find position of value within array

np.where(), if else logic

new_spending_score = np.where(data_clean[:,1] == 1, data_clean[:,4], data_clean[:,4]*2)
new_spending_score[0:10]array([ 39.,  81.,  12., 154.,  80., 152.,  12.,   3., 144.,  14.])# find position of max values
np.argmax(data_clean, axis=0)array([191,   0,  53, 190,  10], dtype=int64)data_clean[np.argmax(data_clean[3])]array([ 5.,  0., 31., 17., 40.])# find position for specific value , annual income is missing
data[np.argwhere(np.isnan(data[:, 3]))]array([[[13.,  0., 58., nan, 15.]],

       [[27.,  0., 45., nan, 32.]],

       [[57.,  0., 51., nan, 50.]],

       [[68.,  0., 68., nan, 48.]]])

Random numbers

# create a 2 dim array of random integers

np.random.randint(low=1, high=30, size=(3,3))array([[22, 23, 16],
       [23, 24, 15],
       [28, 20,  8]])np.random.random(size=(2,2)).round(2)array([[0.57, 0.44],
       [0.2 , 0.07]])

pseudo random numbers

np.random.seed(100)
np.random.randint(low=10, high=50, size=(4,4))array([[18, 34, 13, 49],
       [33, 25, 20, 40],
       [44, 12, 44, 24],
       [44, 34, 25, 46]])#### randomstate
rn = np.random.RandomState(100)
rn.randint(low=10, high=50, size=(4,4))array([[18, 34, 13, 49],
       [33, 25, 20, 40],
       [44, 12, 44, 24],
       [44, 34, 25, 46]])

Uniform distribution

data_uniform = np.random.uniform(1,10,30).round(2)
data_uniformarray([4.48, 3.26, 3.69, 8.71, 5.26, 6.97, 8.25, 3.28, 1.72, 7.59, 9.65,
       9.58, 5.41, 6.69, 7.6 , 9.12, 2.46, 4.65, 4.75, 7.26, 4.82, 8.72,
       8.62, 1.63, 3.72, 9.82, 1.32, 5.43, 9.57, 8.3 ])

Normal distribution

data_random = np.random.normal(10, 2, 30)
data_randomarray([ 9.74080688,  9.65833918, 11.58085618, 12.18363181,  9.18983347,
       11.07288787,  5.73737068, 10.52615338,  7.45788403, 11.85820023,
        6.13717429, 10.04843068,  7.57261692, 10.37627391,  8.8554722 ,
       15.49023593, 10.24990796,  7.86950769, 11.2061074 ,  7.87706473,
        6.96713901, 10.65856501,  6.03275811, 10.33427357,  9.642212  ,
       12.35647832,  9.11318403, 11.4779078 ,  9.69030309,  9.82788871])

Random Sampling

np.random.choice(np.arange(100), 30, replace=False)array([87, 49, 48, 45, 88, 55, 19, 72, 60, 18, 66, 16, 61, 78,  5,  2, 47,
       21,  6, 50, 37, 17, 46, 15, 20, 67, 98, 43, 41, 62])

Bootstrap Sampling

np.random.choice(np.arange(30), 30, replace=True)array([25, 16, 27, 22,  0, 15,  5, 25,  3,  7, 26, 29,  6, 12,  9,  5, 20,
        7, 11, 11, 15,  4,  7, 25, 18,  1, 20, 29, 10, 26])

Binomial Distribution

np.random.binomial(n=10, p=0.35, size=100)array([2, 3, 2, 3, 5, 5, 6, 3, 7, 3, 5, 3, 3, 4, 4, 4, 4, 5, 4, 3, 5, 2,
       4, 3, 4, 4, 3, 5, 3, 4, 3, 3, 4, 8, 2, 4, 6, 4, 5, 4, 3, 5, 2, 6,
       6, 4, 3, 3, 3, 4, 3, 2, 3, 4, 3, 3, 3, 5, 3, 5, 5, 2, 4, 3, 7, 1,
       4, 3, 7, 3, 2, 7, 3, 3, 4, 6, 3, 1, 6, 4, 4, 2, 5, 4, 6, 3, 4, 5,
       3, 3, 2, 2, 4, 3, 2, 3, 6, 2, 4, 5])
# A new mobile product is launched; 80% of its buyers were women.
# Out of 9 randomly selected people who purchased this mobile, what is the probability that exactly 6 of them were women?
outcome = np.random.binomial(n=9, p=0.8, size=100)
np.mean(outcome==6)0.13# Generate 100 random number follow normal dist , mean of 10, sd=3, 
normal_data = np.random.normal(10, 3, 100)# bootstrap this normal data 1000 times, compute mean and sd of mean
means = []
for i in range(1000):
    sample = np.random.choice(normal_data, 1000, replace=True)
    means.append(np.mean(sample))

# print(means)    

# standard deviation of the bootstrap means, i.e. the standard error of the mean
print(np.std(means))0.09753394897171634

Set Operations

import pandas as pddf = pd.read_csv(r'./Numpy_Datasets/cars93.csv')df.head()

df.dtypesManufacturer           object
Model                  object
Type                   object
Min.Price             float64
Price                 float64
Max.Price             float64
MPG.city                int64
MPG.highway             int64
AirBags                object
DriveTrain             object
Cylinders              object
EngineSize            float64
Horsepower              int64
RPM                     int64
Rev.per.mile            int64
Man.trans.avail        object
Fuel.tank.capacity    float64
Passengers              int64
Length                  int64
Wheelbase               int64
Width                   int64
Turn.circle             int64
Rear.seat.room        float64
Luggage.room          float64
Weight                  int64
Origin                 object
Make                   object
dtype: objectdf.shape(93, 27)# Unique value of manufacturer
np.unique(df.Manufacturer)array(['Acura', 'Audi', 'BMW', 'Buick', 'Cadillac', 'Chevrolet',
       'Chrylser', 'Chrysler', 'Dodge', 'Eagle', 'Ford', 'Geo', 'Honda',
       'Hyundai', 'Infiniti', 'Lexus', 'Lincoln', 'Mazda',
       'Mercedes-Benz', 'Mercury', 'Mitsubishi', 'Nissan', 'Oldsmobile',
       'Plymouth', 'Pontiac', 'Saab', 'Saturn', 'Subaru', 'Suzuki',
       'Toyota', 'Volkswagen', 'Volvo'], dtype=object)keys, values = np.unique(df.Manufacturer, return_counts=True)
dict(zip(keys, values)){'Acura': 2,
 'Audi': 2,
 'BMW': 1,
 'Buick': 4,
 'Cadillac': 2,
 'Chevrolet': 8,
 'Chrylser': 1,
 'Chrysler': 2,
 'Dodge': 6,
 'Eagle': 2,
 'Ford': 8,
 'Geo': 2,
 'Honda': 3,
 'Hyundai': 4,
 'Infiniti': 1,
 'Lexus': 2,
 'Lincoln': 2,
 'Mazda': 5,
 'Mercedes-Benz': 2,
 'Mercury': 2,
 'Mitsubishi': 2,
 'Nissan': 4,
 'Oldsmobile': 4,
 'Plymouth': 1,
 'Pontiac': 5,
 'Saab': 1,
 'Saturn': 1,
 'Subaru': 3,
 'Suzuki': 1,
 'Toyota': 4,
 'Volkswagen': 4,
 'Volvo': 2}# record corresponding to the first occurrence of each manufacturer
keys, index = np.unique(df.Manufacturer, return_index=True)
df.iloc[index].head()

A = np.random.choice(df.Manufacturer, size=5, replace=True)
print(A)

B = np.random.choice(df.Manufacturer, size=5, replace=True)
print(B)['Dodge' 'Pontiac' 'Ford' 'Chevrolet' 'Honda']
['Nissan' 'Oldsmobile' 'Honda' 'Cadillac' 'Dodge']# Find elements of A that are in B
A_in_B_mask = np.in1d(A, B)
A[A_in_B_mask]array(['Dodge', 'Honda'], dtype=object)# Elements of B present in A
B_in_A_mask = np.in1d(B, A)
B[B_in_A_mask]array(['Honda', 'Dodge'], dtype=object)

Union

np.union1d(A,B)array(['Cadillac', 'Chevrolet', 'Dodge', 'Ford', 'Honda', 'Nissan',
       'Oldsmobile', 'Pontiac'], dtype=object)np.intersect1d(A,B)array(['Dodge', 'Honda'], dtype=object)# A-B
np.setdiff1d(A,B)array(['Chevrolet', 'Ford', 'Pontiac'], dtype=object)

Code Challenge

np.random.seed(100)
alphabets_list = []
for i in range(65, 65+26):
    alphabets_list.append(chr(i))ABCDEFGHIJKLMNOPQRSTUVWXYZA = np.random.choice(alphabets_list, 10)
B = np.random.choice(alphabets_list, 20)
C = np.random.choice(alphabets_list, 5)
print(A)
print(B)
print(C)['I' 'Y' 'D' 'H' 'X' 'P' 'Q' 'K' 'U' 'C']
['V' 'C' 'C' 'O' 'C' 'R' 'Q' 'Y' 'P' 'E' 'L' 'Q' 'J' 'W' 'C' 'M' 'E' 'B'
 'N' 'V']
['T' 'E' 'E' 'D' 'H']# Elements common in A & B but not in C
np.setdiff1d(np.intersect1d(A,B), C)array(['C', 'P', 'Q', 'Y'], dtype='<U1')
# Elements of A that are in neither B nor C
np.setdiff1d(np.setdiff1d(A,B),C)array(['I', 'K', 'U', 'X'], dtype='<U1')

Statistical Summaries

df.Weight0     2705
1     3560
2     3375
3     3405
4     3640
      ... 
88    3960
89    2985
90    2810
91    2985
92    3245
Name: Weight, Length: 93, dtype: int64# min, max
np.min(df.Weight)1695np.max(df.Weight)4105# Mean, Median
np.mean(df.Weight)3072.9032258064517np.median(df.Weight)3040.0# Percentile
np.percentile(df.Weight, 50)3040.0# Quantile
np.quantile(df.Weight, q=.50)3040.0# Standard deviation , Variance
np.std(df.Weight)586.7164519048623np.var(df.Weight)344236.1949358306# Coeffecient of variation
np.std(df.Weight)/ np.mean(df.Weight)0.190932290668179# Correlation
np.corrcoef(df.Weight, df['MPG.city'])array([[ 1.        , -0.84313855],
       [-0.84313855,  1.        ]])# covariance
np.cov(df.Weight, df['MPG.city'])array([[ 3.47977893e+05, -2.79509467e+03],
       [-2.79509467e+03,  3.15822814e+01]])

Challenge

arr = np. array([112, 118, 132, 129, 121, 135, 148, 148, 136, 119, 104, 118, 115,
               126, 141, 135, 125, 149, 170, 170, 158, 133, 114, 140, 145, 150,
               178, 163, 172, 178, 199, 199, 184, 162, 146, 166, 171, 180, 193,
               181, 183, 218, 230, 242, 209, 191, 172, 194, 196, 196, 236, 235,
               229, 243, 264, 272, 237, 211, 180, 201, 204, 188, 235, 227, 234,
               264, 302, 293, 259, 229, 203, 229, 242, 233, 267, 269, 270, 315,
               364, 347, 312, 274, 237, 278, 284, 277, 317, 313, 318, 374, 413,
               405, 355, 306, 271, 306, 315, 301, 356, 348, 355, 422, 465, 467,
               404, 347, 305, 336, 340, 318, 362, 348, 363, 435, 491, 505, 404,
               359, 310, 337, 360, 342, 406, 396, 420, 472, 548, 559, 463, 407,
               362, 405, 417, 391, 419, 461, 472, 535, 622, 606, 508, 461, 390,
               432], dtype=float)
arrarray([112., 118., 132., 129., 121., 135., 148., 148., 136., 119., 104.,
       118., 115., 126., 141., 135., 125., 149., 170., 170., 158., 133.,
       114., 140., 145., 150., 178., 163., 172., 178., 199., 199., 184.,
       162., 146., 166., 171., 180., 193., 181., 183., 218., 230., 242.,
       209., 191., 172., 194., 196., 196., 236., 235., 229., 243., 264.,
       272., 237., 211., 180., 201., 204., 188., 235., 227., 234., 264.,
       302., 293., 259., 229., 203., 229., 242., 233., 267., 269., 270.,
       315., 364., 347., 312., 274., 237., 278., 284., 277., 317., 313.,
       318., 374., 413., 405., 355., 306., 271., 306., 315., 301., 356.,
       348., 355., 422., 465., 467., 404., 347., 305., 336., 340., 318.,
       362., 348., 363., 435., 491., 505., 404., 359., 310., 337., 360.,
       342., 406., 396., 420., 472., 548., 559., 463., 407., 362., 405.,
       417., 391., 419., 461., 472., 535., 622., 606., 508., 461., 390.,
       432.])# create another array arr_miss and introduce some missing values in it .
arr_miss = arr.copy()

# Introduce missing values
np.random.seed(100)
missing_loc = []
for i in range(5):
    loc = np.random.randint(0, len(arr_miss)); missing_loc.append(loc)
    arr_miss[loc] = np.nan
    
arr_missarray([112., 118., 132., 129., 121., 135., 148., 148.,  nan, 119., 104.,
       118., 115., 126., 141., 135., 125., 149., 170., 170., 158., 133.,
       114., 140.,  nan, 150., 178., 163., 172., 178., 199., 199., 184.,
       162., 146., 166., 171., 180., 193., 181., 183., 218., 230., 242.,
       209., 191., 172., 194., 196., 196., 236., 235., 229., 243., 264.,
       272., 237., 211., 180., 201., 204., 188., 235., 227., 234., 264.,
       302.,  nan, 259., 229., 203., 229., 242., 233., 267., 269., 270.,
       315., 364., 347., 312., 274., 237., 278., 284., 277., 317.,  nan,
       318., 374., 413., 405., 355., 306., 271., 306., 315., 301., 356.,
       348., 355., 422., 465.,  nan, 404., 347., 305., 336., 340., 318.,
       362., 348., 363., 435., 491., 505., 404., 359., 310., 337., 360.,
       342., 406., 396., 420., 472., 548., 559., 463., 407., 362., 405.,
       417., 391., 419., 461., 472., 535., 622., 606., 508., 461., 390.,
       432.])# Fill missing values with avg of prior and next value
miss_index = np.where(np.isnan(arr_miss))miss_index(array([  8,  24,  67,  87, 103], dtype=int64),)arr_miss[miss_index]array([nan, nan, nan, nan, nan])# mean value of elements , excluding nan
np.nanmean(arr_miss).round(2)280.64# Find mean based on prior and next values
for i in miss_index:
    arr_miss[i] = (arr_miss[i-1] + arr_miss[i+1])/2.0
arr_miss[miss_index]array([133.5, 145. , 280.5, 317.5, 434.5])# Actual values
arr[miss_index]array([136., 145., 293., 313., 467.])

Challenge

# turn off scientific notation
np.set_printoptions(suppress=True)

# create array , setting seed first
np.random.seed(100)
arr = np.random.normal(100, 75, size=300).round(3)np.quantile(arr,.75) - np.quantile(arr,.25)105.18025
# Find the IQR (interquartile range, i.e. the difference between the 75th and 25th percentiles)
np.percentile(arr, q=75) - np.percentile(arr,q=25)105.18025# capping 
# replace values greater than the 95th percentile with the 95th percentile
# replace values less than the 5th percentile with the 5th percentile
upper_cap = np.percentile(arr, q=95).round(3)
print(f"Upper cap is {upper_cap}")

lower_cap = np.percentile(arr, q=5).round(3)
print(f"Lower cap is {lower_cap}")Upper cap is 228.043
Lower cap is -26.388arr = np.where(arr < lower_cap, lower_cap, arr)arr = np.where(arr> upper_cap, upper_cap, arr)min(arr), max(arr)(-26.388, 228.043)

Reshaping and new Axis

arr = np.array([[1,2], [3,4], [5,6]])
print(arr)
arr.shape[[1 2]
 [3 4]
 [5 6]]





(3, 2)arr2 = arr.reshape(2,3)
arr2array([[1, 2, 3],
       [4, 5, 6]])arr2 is arrFalse# use -1 if we do not know no. of elements in that dimension
x = np.arange(12).reshape(-1, 3)
xarray([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])# resize changes the shape of array inplace
x.resize(6,2)xarray([[ 0,  1],
       [ 2,  3],
       [ 4,  5],
       [ 6,  7],
       [ 8,  9],
       [10, 11]])# Add axis
a = np.arange(6)
print(a)
print(a.shape)
print(a.ndim)[0 1 2 3 4 5]
(6,)
1a1 = a[:, np.newaxis]
print(a1)
print(a1.shape)[[0]
 [1]
 [2]
 [3]
 [4]
 [5]]
(6, 1)a2 = a[np.newaxis,:]
print(a2)
print(a2.shape)[[0 1 2 3 4 5]]
(1, 6)

Flatten an array

a1.flatten()array([0, 1, 2, 3, 4, 5])### Reshape array
z = np.arange(24)
print(z)

# reshape into 4 rows 
print(z.reshape(4, -1))[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]]

Sequences and Repetitions

np.arange(1, 10, 2)array([1, 3, 5, 7, 9])np.arange(20, 1, -3)array([20, 17, 14, 11,  8,  5,  2])
# linearly (equally) spaced values
np.linspace(1,5,10)array([1.        , 1.44444444, 1.88888889, 2.33333333, 2.77777778,
       3.22222222, 3.66666667, 4.11111111, 4.55555556, 5.        ])np.linspace(1,5,10, retstep=True)(array([1.        , 1.44444444, 1.88888889, 2.33333333, 2.77777778,
        3.22222222, 3.66666667, 4.11111111, 4.55555556, 5.        ]),
 0.4444444444444444)# logspace : it increases in log scale
np.logspace(1, 5, 10)array([1.00000000e+01, 2.78255940e+01, 7.74263683e+01, 2.15443469e+02,
       5.99484250e+02, 1.66810054e+03, 4.64158883e+03, 1.29154967e+04,
       3.59381366e+04, 1.00000000e+05])# repeat sequence 'x' for  n times

np.tile([1,2,3,4], 3)array([1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4])# repeat each element 3 times
np.repeat([1,2,3,4], 3)array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4])a = np.array([1,2,3])
aarray([1, 2, 3])# Challenge to produce pattern
np.array([np.repeat(a, 3),np.tile(a,3)]).flatten()array([1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3])

Create numpy array from a list

Main datastructure used is Numpy array (nd array i.e n dimensional array)

Types of ndarray

Numpy array is a homogeneous data structure i.e. all elements within ndarray are of same datatype

Vectorization

Subset, Slicing

Indexing

Special numpy arrays

Operations

Reference VS Copy

Datatypes

Load data from file

np.loadtxt()

np.genfromtxt()

Export

Load the saved arrays

Load data from url

Missing Data

Comparison equalto should not be used with missing (np.nan)

check for missing

check for infinity

replace missing or inf with 0

Import data from file and extract specific values

filter data based on conditions

Find position of value within array

np.where(), if else logic

Random numbers

pseudo random numbers

Uniform distribution

Normal distribution

Random Sampling

Bootstrap Sampling

Binomial Distribution

Set Operations

Union

Code Challenge

Statistical Summaries

Challenge

Challenge

Reshaping and new Axis

Flatten an array

Sequences and Repetitions

Written by shekhar pandey

No responses yet