Numpy

Post 04 — Date, Functions, Vectorization

import numpy as np

Search sorted

image.png

`np.searchsorted` returns the index at which a new number should be inserted into a sorted array so that the resulting array remains sorted.

np.set_printoptions(threshold=2000)
# create an array

np.random.seed(100)
arr = np.random.randint(1, 30, size=100)
arr.sort()
arr
array([ 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5,
5, 5, 5, 5, 7, 7, 8, 8, 8, 8, 9, 9, 9, 10, 10, 11, 11,
11, 11, 11, 12, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16,
16, 16, 16, 16, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 19, 19, 20,
20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 23, 23, 24, 24, 24, 25, 25,
25, 25, 26, 26, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29])
num = 6
pos = np.searchsorted(arr, num)
print(f"{num} should be inserted at index {pos} in array")
6 should be inserted at index 21 in array
# reconstruct array
arr = np.r_[arr[0:pos], num, arr[pos:]]
print(arr)
[ 1 1 2 2 3 3 3 3 3 3 3 3 4 4 4 4 5 5 5 5 5 6 7 7
8 8 8 8 9 9 9 10 10 11 11 11 11 11 12 13 13 13 14 14 14 14 15 15
15 15 16 16 16 16 16 16 17 17 17 17 17 18 18 18 18 18 19 19 20 20 20 21
21 22 22 22 22 22 22 23 23 24 24 24 25 25 25 25 26 26 27 28 28 28 28 28
28 28 28 29 29]
%%timeit
np.searchsorted(arr, [20, 29, 37])
4.93 µs ± 92.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
Compiler time: 0.31 s

Bisect module

import bisect
np.random.seed(100)
arr = np.random.randint(1, 30, size=100)
arr.sort()
arr
array([ 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5,
5, 5, 5, 5, 7, 7, 8, 8, 8, 8, 9, 9, 9, 10, 10, 11, 11,
11, 11, 11, 12, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16,
16, 16, 16, 16, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 19, 19, 20,
20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 23, 23, 24, 24, 24, 25, 25,
25, 25, 26, 26, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29])
num=6
bisect.bisect(arr, num)
21
arr = arr.tolist()
bisect.insort_left(arr, num)
arr = np.array(arr)
arr
array([ 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5,
5, 5, 5, 5, 6, 7, 7, 8, 8, 8, 8, 9, 9, 9, 10, 10, 11,
11, 11, 11, 11, 12, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16,
16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 19, 19,
20, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 23, 23, 24, 24, 24, 25,
25, 25, 25, 26, 26, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29])
### grades

grades = [30, 50, 80, 90]
grades_letters = ['E','D','C','B','A']
for marks in [20, 35,41,51,81,99, 30]:
print((marks, grades_letters[np.searchsorted(grades, marks)]))
(20, 'E')
(35, 'D')
(41, 'D')
(51, 'C')
(81, 'B')
(99, 'A')
(30, 'E')

Handling Dates

from datetime import datetime
today = datetime.now()
print(today)
2021-11-01 23:53:37.429048
#### Levels of granularity

# year
print(np.datetime64(today, 'Y'))

# month
print(np.datetime64(today, 'M'))

# Day
print(np.datetime64(today, 'D'))

# Hour
print(np.datetime64(today, 'h'))

# minute
print(np.datetime64(today, 'm'))

# sec
print(np.datetime64(today, 's'))

# milli sec
print(np.datetime64(today, 'ms'))

# nano sec
print(np.datetime64(today, 'ns'))
2021
2021-11
2021-11-01
2021-11-01T23
2021-11-01T23:53
2021-11-01T23:53:37
2021-11-01T23:53:37.429
2021-11-01T23:53:37.429048000
# sequence of dates
print("Dates of Nov 2021:")
nov_2021 = np.arange('2021-11-01','2021-12-01', dtype='datetime64[D]')
print(nov_2021)
Dates of Nov 2021:
['2021-11-01' '2021-11-02' '2021-11-03' '2021-11-04' '2021-11-05'
'2021-11-06' '2021-11-07' '2021-11-08' '2021-11-09' '2021-11-10'
'2021-11-11' '2021-11-12' '2021-11-13' '2021-11-14' '2021-11-15'
'2021-11-16' '2021-11-17' '2021-11-18' '2021-11-19' '2021-11-20'
'2021-11-21' '2021-11-22' '2021-11-23' '2021-11-24' '2021-11-25'
'2021-11-26' '2021-11-27' '2021-11-28' '2021-11-29' '2021-11-30']
np.timedelta64(nov_2021[15] - nov_2021[0] , "D")
numpy.timedelta64(15,'D')

Next Businessday

np.busday_offset('2021-10-29',1) # 29th Oct is Friday, so next bus day is monday i.e 1st nov
numpy.datetime64('2021-11-01')
np.datetime64('2021-11-02') + np.timedelta64(10, 'W')
numpy.datetime64('2022-01-11')
# Count no. of business days
np.busday_count(np.datetime64('2021-11-01'), np.datetime64('2021-11-30'))
21
# Find first friday of month
np.busday_offset(np.datetime64('2021-11-01'), 0, roll='forward', weekmask='Fri')
numpy.datetime64('2021-11-05')
# Find last Friday of nov 2021
np.busday_offset(np.datetime64('2021-12-01'), 0, roll='backward', weekmask='Fri')
numpy.datetime64('2021-11-26')
# convert to python datetime object
np.datetime64('2021-11-01').astype(datetime)
datetime.date(2021, 11, 1)
# create dates at gap of 2 days
np.arange('2021-11-01','2021-11-30', 2, dtype='datetime64[D]')
array(['2021-11-01', '2021-11-03', '2021-11-05', '2021-11-07',
'2021-11-09', '2021-11-11', '2021-11-13', '2021-11-15',
'2021-11-17', '2021-11-19', '2021-11-21', '2021-11-23',
'2021-11-25', '2021-11-27', '2021-11-29'], dtype='datetime64[D]')

Vectorization

def is_prime(n):
    """Return True if the integer ``n`` is a prime number, else False.

    Uses plain trial division by every integer in ``[2, n)``.

    Always returns a ``bool``: values of ``n`` that are not greater than 1
    (0, 1 and negatives) yield ``False`` instead of falling through and
    returning ``None``. This keeps the result type consistent when the
    function is wrapped with ``np.vectorize``.

    ``n`` must be a scalar; passing a list raises ``TypeError`` at the
    ``n > 1`` comparison.
    """
    if n > 1:
        for i in range(2, n):
            if n % i == 0:
                # Found a proper divisor, so n is composite.
                return False
        # No divisor in [2, n): n is prime.
        return True
    # 0, 1 and negative numbers are not prime.
    return False
for x in [10, 11, 13, 23, 17, 19, 29, 28]:
print(f"Is {x} prime ? {is_prime(x)}")
Is 10 prime ? False
Is 11 prime ? True
Is 13 prime ? True
Is 23 prime ? True
Is 17 prime ? True
Is 19 prime ? True
Is 29 prime ? True
Is 28 prime ? False
is_prime([10, 11, 13, 23, 17, 19, 29, 28])
---------------------------------------------------------------------------

TypeError Traceback (most recent call last)

<ipython-input-75-d70b9753b4f7> in <module>
----> 1 is_prime([10, 11, 13, 23, 17, 19, 29, 28])


<ipython-input-73-d4960ac7e06d> in is_prime(n)
1 def is_prime(n):
----> 2 if n > 1:
3 for i in range(2, n):
4 if n %i == 0:
5 return False


TypeError: '>' not supported between instances of 'list' and 'int'
# We are going to vectorize this function
is_prime_v = np.vectorize(is_prime)
is_prime_v([10, 11, 13, 23, 17, 19, 29, 28])
array([False, True, True, True, True, True, True, False])
is_prime_v(99)
array(False)
is_prime_v1 = np.vectorize(is_prime, otypes=[bool])
is_prime_v1([10,11,12,14,15])
array([False, True, False, False, False])

Apply along axis

np.random.seed(100)

arr = np.random.randint(1, 100, size=(50,4))
arr[:5,]
array([[ 9, 25, 68, 88],
[80, 49, 11, 95],
[53, 99, 54, 67],
[99, 15, 35, 25],
[16, 61, 59, 17]])
def min_max(x_list):
    """Return the smallest value of ``x_list`` divided by its largest value."""
    smallest = min(x_list)
    largest = max(x_list)
    return smallest / largest
# row wise ,i.e. for each row
np.apply_along_axis(min_max, axis=1, arr=arr)[0:5]
array([0.10227273, 0.11578947, 0.53535354, 0.15151515, 0.26229508])
9/88, 11/95
(0.10227272727272728, 0.11578947368421053)
# column wise , i.e for each column
np.apply_along_axis(min_max, axis=0, arr=arr)
array([0.01010101, 0.01010101, 0.03030303, 0.01052632])

Functions

iris = np.genfromtxt(r'./Numpy_Datasets/iris.csv', delimiter=',', skip_header=1)
dt = np.dtype({'names':['Sepal_len','Sepal_wid','Petal_len','Petal_wid'],
'formats':[float, float, float, float]})
iris.dtype = dt
iris[:5]
array([[(4.7, 3.2, 1.3, 0.2)],
[(5. , 3.6, 1.4, 0.2)],
[(5.4, 3.9, 1.7, 0.4)],
[(4.6, 3.4, 1.4, 0.3)],
[(4.9, 3.1, 1.5, 0.1)]],
dtype=[('Sepal_len', '<f8'), ('Sepal_wid', '<f8'), ('Petal_len', '<f8'), ('Petal_wid', '<f8')])
iris['Sepal_len'].shape
(105, 1)
iris['Sepal_len'].reshape(-1)
(105,)
sepal_len = iris['Sepal_len'].reshape(-1)
sepal_len[0:10]
array([4.7, 5. , 5.4, 4.6, 4.9, 5.4, 4.8, 4.8, 5.8, 5.7])
np.digitize(sepal_len, bins=[0,5,6,7,10])
array([1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1,
2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2, 3, 2, 3, 1, 3, 2, 2, 2,
3, 2, 3, 2, 3, 3, 2, 2, 2, 2, 3, 2, 3, 3, 3, 2, 2, 3, 2, 2, 2, 2,
2, 3, 2, 2, 3, 2, 4, 3, 1, 4, 3, 4, 3, 3, 3, 4, 3, 3, 2, 4, 3, 3,
3, 3, 3, 4, 4, 3, 3, 4, 3, 3, 3, 2, 3, 3, 3, 3, 2], dtype=int64)
np.clip(sepal_len, 5,7)
array([5. , 5. , 5.4, 5. , 5. , 5.4, 5. , 5. , 5.8, 5.7, 5.4, 5.1, 5.7,
5.4, 5.1, 5. , 5.1, 5. , 5. , 5.2, 5.2, 5. , 5.4, 5.2, 5.5, 5. ,
5. , 5. , 5. , 5. , 5.1, 5. , 5. , 5.3, 5. , 5.5, 6.5, 5.7, 6.3,
5. , 6.6, 5.2, 5.9, 5.6, 6.7, 5.6, 6.2, 5.6, 6.3, 6.4, 5.7, 5.5,
5.5, 5.8, 6. , 5.4, 6. , 6.7, 6.3, 5.6, 5.5, 6.1, 5.8, 5. , 5.6,
5.7, 5.7, 6.2, 5.1, 5.7, 6.3, 5.8, 7. , 6.3, 5. , 7. , 6.7, 7. ,
6.5, 6.8, 6.5, 7. , 6. , 6.9, 5.6, 7. , 6.3, 6.7, 6.2, 6.1, 6.4,
7. , 7. , 6.3, 6.1, 7. , 6.3, 6.4, 6. , 5.8, 6.7, 6.7, 6.5, 6.2,
5.9])
np.histogram(sepal_len, bins=10)
(array([ 6, 20, 11, 23, 9, 15, 12, 2, 3, 4], dtype=int64),
array([4.4 , 4.75, 5.1 , 5.45, 5.8 , 6.15, 6.5 , 6.85, 7.2 , 7.55, 7.9 ]))
### Capping at 10 percentile and 90 percentile

np.random.seed(100)
arr = np.random.normal(30, 10, 100).round(2)
arr
array([12.5 , 33.43, 41.53, 27.48, 39.81, 35.14, 32.21, 19.3 , 28.11,
32.55, 25.42, 34.35, 24.16, 38.17, 36.73, 28.96, 24.69, 40.3 ,
25.62, 18.82, 46.19, 45.42, 27.48, 21.58, 31.85, 39.37, 37.31,
43.62, 26.74, 30.56, 32.22, 15.57, 22.44, 38.16, 37.5 , 25.44,
41.9 , 13.09, 16.44, 17.68, 24.56, 23.32, 30.07, 23.87, 43. ,
12.67, 20.17, 33.58, 13.86, 44.71, 18.12, 24.5 , 20.6 , 21.72,
31.09, 35.08, 21.38, 42.49, 29.2 , 21.1 , 21.18, 30.19, 32.38,
30.14, 13.64, 19.56, 36.13, 37.36, 40.27, 15.68, 11.59, 33.66,
26.68, 23.11, 50.35, 24.49, 37.5 , 16.93, 35.81, 18.95, 36.9 ,
36.87, 14.33, 39.05, 37.79, 34.28, 31.09, 30.28, 24.21, 18.01,
12.94, 33.69, 48.77, 26.23, 48.32, 30.03, 29.24, 30.04, 28.15,
5.13])
ten_percentile = np.percentile(arr,10)
ninenty_percentile = np.percentile(arr, 90)

np.clip(arr, ten_percentile, ninenty_percentile)
array([15.669, 33.43 , 41.53 , 27.48 , 39.81 , 35.14 , 32.21 , 19.3 ,
28.11 , 32.55 , 25.42 , 34.35 , 24.16 , 38.17 , 36.73 , 28.96 ,
24.69 , 40.3 , 25.62 , 18.82 , 41.567, 41.567, 27.48 , 21.58 ,
31.85 , 39.37 , 37.31 , 41.567, 26.74 , 30.56 , 32.22 , 15.669,
22.44 , 38.16 , 37.5 , 25.44 , 41.567, 15.669, 16.44 , 17.68 ,
24.56 , 23.32 , 30.07 , 23.87 , 41.567, 15.669, 20.17 , 33.58 ,
15.669, 41.567, 18.12 , 24.5 , 20.6 , 21.72 , 31.09 , 35.08 ,
21.38 , 41.567, 29.2 , 21.1 , 21.18 , 30.19 , 32.38 , 30.14 ,
15.669, 19.56 , 36.13 , 37.36 , 40.27 , 15.68 , 15.669, 33.66 ,
26.68 , 23.11 , 41.567, 24.49 , 37.5 , 16.93 , 35.81 , 18.95 ,
36.9 , 36.87 , 15.669, 39.05 , 37.79 , 34.28 , 31.09 , 30.28 ,
24.21 , 18.01 , 15.669, 33.69 , 41.567, 26.23 , 41.567, 30.03 ,
29.24 , 30.04 , 28.15 , 15.669])

Broadcasting

np.array([10, 11, 12]) + 5
array([15, 16, 17])
np.random.seed(100)

x = np.random.randint(1,12,size=(5,3))
x
array([[ 9, 9, 4],
[ 8, 8, 1],
[11, 5, 3],
[ 6, 3, 3],
[ 3, 2, 1]])
# find mean of each column
np.mean(x, axis=0)
array([7.4, 5.4, 2.4])
# subtract column mean from each element
x -np.mean(x, axis=0)
array([[ 1.6, 3.6, 1.6],
[ 0.6, 2.6, -1.4],
[ 3.6, -0.4, 0.6],
[-1.4, -2.4, 0.6],
[-4.4, -3.4, -1.4]])
# Subtract row mean from each element
print(x)

print('Row means are:')

row_means = np.mean(x, axis=1).astype(int)
print(row_means)
[[ 9 9 4]
[ 8 8 1]
[11 5 3]
[ 6 3 3]
[ 3 2 1]]
Row means are:
[7 5 6 4 2]
row_means.reshape(5,1)
array([[7],
[5],
[6],
[4],
[2]])
x - row_means.reshape(5,1)
array([[ 2, 2, -3],
[ 3, 3, -4],
[ 5, -1, -3],
[ 2, -1, -1],
[ 1, 0, -1]])
# Challenge
np.random.seed(100)
x = np.random.random((10, 7)).round(3)
x
array([[0.543, 0.278, 0.425, 0.845, 0.005, 0.122, 0.671],
[0.826, 0.137, 0.575, 0.891, 0.209, 0.185, 0.108],
[0.22 , 0.979, 0.812, 0.172, 0.816, 0.274, 0.432],
[0.94 , 0.818, 0.336, 0.175, 0.373, 0.006, 0.252],
[0.796, 0.015, 0.599, 0.604, 0.105, 0.382, 0.036],
[0.89 , 0.981, 0.06 , 0.891, 0.577, 0.742, 0.63 ],
[0.582, 0.02 , 0.21 , 0.545, 0.769, 0.251, 0.286],
[0.852, 0.975, 0.885, 0.36 , 0.599, 0.355, 0.34 ],
[0.178, 0.238, 0.045, 0.505, 0.376, 0.593, 0.63 ],
[0.143, 0.934, 0.946, 0.602, 0.388, 0.363, 0.204]])
# Subtract every element in row by corresponding row mean and divide each element by sd
row_means = np.mean(x, axis=1).reshape(-1,1)
rows_std = np.std(x, axis=1).reshape(-1,1)
x_normalised = (x-row_means)/(rows_std)
x_normalised
array([[ 0.46925539, -0.48520596, 0.04424996, 1.55698116, -1.46848124,
-1.04707755, 0.93027824],
[ 1.29925738, -0.89867961, 0.49855755, 1.50660993, -0.66899679,
-0.74555773, -0.99119074],
[-1.00548241, 1.46201322, 0.91909916, -1.16152956, 0.93210309,
-0.82992936, -0.31627414],
[ 1.66544858, 1.27895589, -0.24800702, -0.75805065, -0.13079202,
-1.29343806, -0.51411674],
[ 1.48561331, -1.19044862, 0.81060153, 0.82773381, -0.8820676 ,
0.06706063, -1.11849305],
[ 0.72374166, 1.03972757, -2.15832759, 0.72721403, -0.36311096,
0.20983052, -0.17907522],
[ 0.83975005, -1.50155165, -0.71000837, 0.68560741, 1.61879528,
-0.53920166, -0.39339106],
[ 0.88416878, 1.3605576 , 1.01198042, -1.02138647, -0.0957204 ,
-1.04075187, -1.09884806],
[-0.91723464, -0.62516599, -1.56465348, 0.6745395 , 0.0465919 ,
1.10290686, 1.28301586],
[-1.21548515, 1.39410821, 1.43369749, 0.2988048 , -0.40720402,
-0.48968169, -1.01423965]])
np.mean(x_normalised, axis=1).round(3)
array([-0., -0., 0., 0., -0., -0., 0., 0., -0., 0.])

Ufuncs in Numpy

  • It supports broadcasting, typecasting
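  • Most common element-wise functions such as add(), sqrt(), exp() etc. are ufuncs (reductions like median() and mean() are ordinary NumPy functions, not ufuncs)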
  • They are written in C and linked to Python via numpy’s ufunc facility
def hyp(s1, s2):
    """Return the hypotenuse for perpendicular sides ``s1`` and ``s2``.

    Computes ``sqrt(s1**2 + s2**2)``. The operands must support ``**``,
    so plain Python lists raise ``TypeError``.
    """
    sum_of_squares = s1 ** 2 + s2 ** 2
    return np.sqrt(sum_of_squares)
# works on scalar
hyp(6,8)
10.0
hyp([2,3,4,5,6], [2,3,3,7,8])
---------------------------------------------------------------------------

TypeError Traceback (most recent call last)

<ipython-input-35-1590ad9bb1e0> in <module>
----> 1 hyp([2,3,4,5,6], [2,3,3,7,8])


<ipython-input-33-74e3b17ce128> in hyp(s1, s2)
1 def hyp(s1, s2):
----> 2 return np.sqrt(s1**2 + s2**2)


TypeError: unsupported operand type(s) for ** or pow(): 'list' and 'int'
# Making a ufunc
hyp_v = np.frompyfunc(hyp, nin=2, nout=1)
hyp_v([2,3,4,5,6], [2,3,3,7,8])
array([2.8284271247461903, 4.242640687119285, 5.0, 8.602325267042627,
10.0], dtype=object)
# Use vectorize function
hyp_vec = np.vectorize(hyp, otypes=[np.float16])
hyp_vec([2,3,4,5,6], [2,3,3,7,8])
array([ 2.828, 4.242, 5. , 8.6 , 10. ], dtype=float16)

Interpolation

image.png
travel_class = [1,3,4]
travel_fare = [100, 60, 20]

# find fare of travel class 2
np.interp(2, travel_class, travel_fare)
80.0
### Sinewave interpolation
np.set_printoptions(suppress=True)

x = np.linspace(0, 4*np.pi, 25)
y = np.sin(x)
# plot
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(x, y, 'ro-')
plt.show()
png
# interpolate
xvals = np.linspace(0, 4*np.pi, 100)
yinterp = np.interp(xvals, x, y)

plt.plot(x,y,'ro')
plt.plot(xvals, yinterp,'-x')
plt.show()
png
### Interpolate with Scipy
from scipy import interpolate
fx = interpolate.interp1d(x, y)
ynew = fx(xvals)

plt.plot(x,y,'ro')
plt.plot(xvals, ynew, '-x')
plt.show()
png
fx = interpolate.interp1d(x, y, kind='cubic')
ynew = fx(xvals)

plt.plot(x,y,'ro')
plt.plot(xvals, ynew, '-x')
plt.show()
png
# Challenge
arr = np.genfromtxt(r'./Numpy_Datasets/Class_and_Fare.csv', delimiter=',', skip_header=1)
print(arr.ndim)
print(arr.shape)
arr[:5,:]
2
(50, 2)





array([[ 3. , 10.4625],
[ 3. , 7.8792],
[ 3. , 8.6625],
[ 2. , 10.5 ],
[ 2. , 13. ]])
# missing values are
missing_indx = np.isnan(arr[:,1])
arr[missing_indx]
array([[ 3., nan],
[ 1., nan],
[ 3., nan],
[ 3., nan],
[ 3., nan],
[ 1., nan],
[ 2., nan],
[ 3., nan],
[ 3., nan],
[ 2., nan]])
X = arr[~missing_indx,0]
Y = arr[~missing_indx, 1]

y_interpolated = np.interp(arr[missing_indx,0], X, Y)
arr[missing_indx,1] = y_interpolated

plt.plot(X,Y,'ro')
plt.plot(arr[missing_indx,0], y_interpolated,'bo')
plt.show()
png