numpy_04_sort

11 min readNov 21, 2021

Numpy

Post 04 — Date, Functions, Vectorization

import numpy as np

Search sorted

`np.searchsorted` returns the index at which a new number should be inserted into a sorted array so that the array remains sorted.

np.set_printoptions(threshold=2000)# create an array

np.random.seed(100)
arr = np.random.randint(1, 30, size=100)
arr.sort()
arrarray([ 1,  1,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  5,
        5,  5,  5,  5,  7,  7,  8,  8,  8,  8,  9,  9,  9, 10, 10, 11, 11,
       11, 11, 11, 12, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16,
       16, 16, 16, 16, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 19, 19, 20,
       20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 23, 23, 24, 24, 24, 25, 25,
       25, 25, 26, 26, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29])num = 6
pos = np.searchsorted(arr, num)
print(f"{num} should be inserted at index {pos} in array")6 should be inserted at index 21 in array# reconstruct array
arr = np.r_[arr[0:pos], num, arr[pos:]]
print(arr)[ 1  1  2  2  3  3  3  3  3  3  3  3  4  4  4  4  5  5  5  5  5  6  7  7
  8  8  8  8  9  9  9 10 10 11 11 11 11 11 12 13 13 13 14 14 14 14 15 15
 15 15 16 16 16 16 16 16 17 17 17 17 17 18 18 18 18 18 19 19 20 20 20 21
 21 22 22 22 22 22 22 23 23 24 24 24 25 25 25 25 26 26 27 28 28 28 28 28
 28 28 28 29 29]%%timeit
np.searchsorted(arr, [20, 29, 37])4.93 µs ± 92.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
Compiler time: 0.31 s

Bisect module

The `bisect` module provides functionality similar to `searchsorted()`.

import bisectnp.random.seed(100)
arr = np.random.randint(1, 30, size=100)
arr.sort()
arrarray([ 1,  1,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  5,
        5,  5,  5,  5,  7,  7,  8,  8,  8,  8,  9,  9,  9, 10, 10, 11, 11,
       11, 11, 11, 12, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16,
       16, 16, 16, 16, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 19, 19, 20,
       20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 23, 23, 24, 24, 24, 25, 25,
       25, 25, 26, 26, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29])num=6
bisect.bisect(arr, num)21arr = arr.tolist()
bisect.insort_left(arr, num)
arr = np.array(arr)arrarray([ 1,  1,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  5,
        5,  5,  5,  5,  6,  7,  7,  8,  8,  8,  8,  9,  9,  9, 10, 10, 11,
       11, 11, 11, 11, 12, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16,
       16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 19, 19,
       20, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 23, 23, 24, 24, 24, 25,
       25, 25, 25, 26, 26, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29])### grades

grades = [30, 50, 80, 90]
grades_letters = ['E','D','C','B','A']for marks in [20, 35,41,51,81,99, 30]:
    print((marks, grades_letters[np.searchsorted(grades, marks)]))(20, 'E')
(35, 'D')
(41, 'D')
(51, 'C')
(81, 'B')
(99, 'A')
(30, 'E')

Handling Dates

from datetime import datetimetoday = datetime.now()
print(today)2021-11-01 23:53:37.429048#### Levels of granularity

# year
print(np.datetime64(today, 'Y'))

# month
print(np.datetime64(today, 'M'))

# Day
print(np.datetime64(today, 'D'))

# Hour
print(np.datetime64(today, 'h'))

# minute
print(np.datetime64(today, 'm'))

# sec
print(np.datetime64(today, 's'))

# milli sec
print(np.datetime64(today, 'ms'))

# nano sec
print(np.datetime64(today, 'ns'))2021
2021-11
2021-11-01
2021-11-01T23
2021-11-01T23:53
2021-11-01T23:53:37
2021-11-01T23:53:37.429
2021-11-01T23:53:37.429048000# sequence of dates
print("Dates of Nov 2021:")
nov_2021 = np.arange('2021-11-01','2021-12-01', dtype='datetime64[D]')
print(nov_2021)Dates of Nov 2021:
['2021-11-01' '2021-11-02' '2021-11-03' '2021-11-04' '2021-11-05'
 '2021-11-06' '2021-11-07' '2021-11-08' '2021-11-09' '2021-11-10'
 '2021-11-11' '2021-11-12' '2021-11-13' '2021-11-14' '2021-11-15'
 '2021-11-16' '2021-11-17' '2021-11-18' '2021-11-19' '2021-11-20'
 '2021-11-21' '2021-11-22' '2021-11-23' '2021-11-24' '2021-11-25'
 '2021-11-26' '2021-11-27' '2021-11-28' '2021-11-29' '2021-11-30']np.timedelta64(nov_2021[15] - nov_2021[0] , "D")numpy.timedelta64(15,'D')

Next business day

np.busday_offset('2021-10-29',1) # 29th Oct is Friday, so next bus day is monday i.e 1st novnumpy.datetime64('2021-11-01')np.datetime64('2021-11-02') + np.timedelta64(10, 'W')numpy.datetime64('2022-01-11')# Count no. of business days
np.busday_count(np.datetime64('2021-11-01'), np.datetime64('2021-11-30'))21# Find first friday of month
np.busday_offset(np.datetime64('2021-11-01'), 0, roll='forward', weekmask='Fri')numpy.datetime64('2021-11-05')# Find last Friday of Nov 2021
np.busday_offset(np.datetime64('2021-12-01'), 0, roll='backward', weekmask='Fri')numpy.datetime64('2021-11-26')# convert to python datetimeobject
np.datetime64('2021-11-01').astype(datetime)datetime.date(2021, 11, 1)# create dates at gap of 2 days
np.arange('2021-11-01','2021-11-30', 2,  dtype='datetime64[D]')array(['2021-11-01', '2021-11-03', '2021-11-05', '2021-11-07',
       '2021-11-09', '2021-11-11', '2021-11-13', '2021-11-15',
       '2021-11-17', '2021-11-19', '2021-11-21', '2021-11-23',
       '2021-11-25', '2021-11-27', '2021-11-29'], dtype='datetime64[D]')

Vectorization

def is_prime(n):
    """Return True if the integer ``n`` is prime, otherwise False.

    Uses plain trial division by every integer from 2 up to ``n - 1``.

    Parameters
    ----------
    n : int
        A single (scalar) integer. Passing a list raises ``TypeError``
        because ``n > 1`` is not defined for a list and an int; wrap the
        function with ``np.vectorize`` to apply it to a sequence.

    Returns
    -------
    bool
        True when ``n`` is greater than 1 and has no divisor in
        ``[2, n - 1]``; False otherwise (including 0, 1 and negatives).
    """
    if n > 1:
        for i in range(2, n):
            if n % i == 0:
                return False
        # No divisor found (for n == 2 the range is empty), so n is prime.
        return True
    # Numbers below 2 are not prime. Always returning a bool (instead of
    # falling through and returning None) keeps results consistent.
    return False


for x in [10, 11, 13, 23, 17, 19, 29, 28]:
    print(f"Is {x} prime ? {is_prime(x)}")
# Output of the loop above (continues on the following lines):
# Is 10 prime ? False
Is 11 prime ? True
Is 13 prime ? True
Is 23 prime ? True
Is 17 prime ? True
Is 19 prime ? True
Is 29 prime ? True
Is 28 prime ? Falseis_prime([10, 11, 13, 23, 17, 19, 29, 28])---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-75-d70b9753b4f7> in <module>
----> 1 is_prime([10, 11, 13, 23, 17, 19, 29, 28])


<ipython-input-73-d4960ac7e06d> in is_prime(n)
      1 def is_prime(n):
----> 2     if n > 1:
      3         for i in range(2, n):
      4             if n %i == 0:
      5                 return False


TypeError: '>' not supported between instances of 'list' and 'int'# We are going to vectorize this function
is_prime_v = np.vectorize(is_prime)is_prime_v([10, 11, 13, 23, 17, 19, 29, 28])array([False,  True,  True,  True,  True,  True,  True, False])is_prime_v(99)array(False)is_prime_v1 = np.vectorize(is_prime, otypes=[bool])is_prime_v1([10,11,12,14,15])array([False,  True, False, False, False])

Apply along axis

np.random.seed(100)

arr = np.random.randint(1, 100, size=(50,4))
arr[:5,]array([[ 9, 25, 68, 88],
       [80, 49, 11, 95],
       [53, 99, 54, 67],
       [99, 15, 35, 25],
       [16, 61, 59, 17]])def min_max(x_list):
    return min(x_list)/max(x_list)# row wise ,i.e.  for each row
np.apply_along_axis(min_max, axis=1, arr=arr)[0:5]array([0.10227273, 0.11578947, 0.53535354, 0.15151515, 0.26229508])9/88, 11/95(0.10227272727272728, 0.11578947368421053)# column wise , i.e for each column
np.apply_along_axis(min_max, axis=0, arr=arr)array([0.01010101, 0.01010101, 0.03030303, 0.01052632])

Functions

iris = np.genfromtxt(r'./Numpy_Datasets/iris.csv', delimiter=',', skip_header=1)
dt = np.dtype({'names':['Sepal_len','Sepal_wid','Petal_len','Petal_wid'],
              'formats':[float, float, float, float]})
iris.dtype = dtiris[:5]array([[(4.7, 3.2, 1.3, 0.2)],
       [(5. , 3.6, 1.4, 0.2)],
       [(5.4, 3.9, 1.7, 0.4)],
       [(4.6, 3.4, 1.4, 0.3)],
       [(4.9, 3.1, 1.5, 0.1)]],
      dtype=[('Sepal_len', '<f8'), ('Sepal_wid', '<f8'), ('Petal_len', '<f8'), ('Petal_wid', '<f8')])iris['Sepal_len'].shape(105, 1)iris['Sepal_len'].reshape(-1)(105,)sepal_len = iris['Sepal_len'].reshape(-1)sepal_len[0:10]array([4.7, 5. , 5.4, 4.6, 4.9, 5.4, 4.8, 4.8, 5.8, 5.7])np.digitize(sepal_len, bins=[0,5,6,7,10])array([1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1,
       2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2, 3, 2, 3, 1, 3, 2, 2, 2,
       3, 2, 3, 2, 3, 3, 2, 2, 2, 2, 3, 2, 3, 3, 3, 2, 2, 3, 2, 2, 2, 2,
       2, 3, 2, 2, 3, 2, 4, 3, 1, 4, 3, 4, 3, 3, 3, 4, 3, 3, 2, 4, 3, 3,
       3, 3, 3, 4, 4, 3, 3, 4, 3, 3, 3, 2, 3, 3, 3, 3, 2], dtype=int64)np.clip(sepal_len, 5,7)array([5. , 5. , 5.4, 5. , 5. , 5.4, 5. , 5. , 5.8, 5.7, 5.4, 5.1, 5.7,
       5.4, 5.1, 5. , 5.1, 5. , 5. , 5.2, 5.2, 5. , 5.4, 5.2, 5.5, 5. ,
       5. , 5. , 5. , 5. , 5.1, 5. , 5. , 5.3, 5. , 5.5, 6.5, 5.7, 6.3,
       5. , 6.6, 5.2, 5.9, 5.6, 6.7, 5.6, 6.2, 5.6, 6.3, 6.4, 5.7, 5.5,
       5.5, 5.8, 6. , 5.4, 6. , 6.7, 6.3, 5.6, 5.5, 6.1, 5.8, 5. , 5.6,
       5.7, 5.7, 6.2, 5.1, 5.7, 6.3, 5.8, 7. , 6.3, 5. , 7. , 6.7, 7. ,
       6.5, 6.8, 6.5, 7. , 6. , 6.9, 5.6, 7. , 6.3, 6.7, 6.2, 6.1, 6.4,
       7. , 7. , 6.3, 6.1, 7. , 6.3, 6.4, 6. , 5.8, 6.7, 6.7, 6.5, 6.2,
       5.9])np.histogram(sepal_len, bins=10)(array([ 6, 20, 11, 23,  9, 15, 12,  2,  3,  4], dtype=int64),
 array([4.4 , 4.75, 5.1 , 5.45, 5.8 , 6.15, 6.5 , 6.85, 7.2 , 7.55, 7.9 ]))### Capping at the 10th percentile and the 90th percentile

np.random.seed(100)
arr = np.random.normal(30, 10, 100).round(2)
arrarray([12.5 , 33.43, 41.53, 27.48, 39.81, 35.14, 32.21, 19.3 , 28.11,
       32.55, 25.42, 34.35, 24.16, 38.17, 36.73, 28.96, 24.69, 40.3 ,
       25.62, 18.82, 46.19, 45.42, 27.48, 21.58, 31.85, 39.37, 37.31,
       43.62, 26.74, 30.56, 32.22, 15.57, 22.44, 38.16, 37.5 , 25.44,
       41.9 , 13.09, 16.44, 17.68, 24.56, 23.32, 30.07, 23.87, 43.  ,
       12.67, 20.17, 33.58, 13.86, 44.71, 18.12, 24.5 , 20.6 , 21.72,
       31.09, 35.08, 21.38, 42.49, 29.2 , 21.1 , 21.18, 30.19, 32.38,
       30.14, 13.64, 19.56, 36.13, 37.36, 40.27, 15.68, 11.59, 33.66,
       26.68, 23.11, 50.35, 24.49, 37.5 , 16.93, 35.81, 18.95, 36.9 ,
       36.87, 14.33, 39.05, 37.79, 34.28, 31.09, 30.28, 24.21, 18.01,
       12.94, 33.69, 48.77, 26.23, 48.32, 30.03, 29.24, 30.04, 28.15,
        5.13])ten_percentile = np.percentile(arr,10)
ninenty_percentile = np.percentile(arr, 90)

np.clip(arr, ten_percentile, ninenty_percentile)array([15.669, 33.43 , 41.53 , 27.48 , 39.81 , 35.14 , 32.21 , 19.3  ,
       28.11 , 32.55 , 25.42 , 34.35 , 24.16 , 38.17 , 36.73 , 28.96 ,
       24.69 , 40.3  , 25.62 , 18.82 , 41.567, 41.567, 27.48 , 21.58 ,
       31.85 , 39.37 , 37.31 , 41.567, 26.74 , 30.56 , 32.22 , 15.669,
       22.44 , 38.16 , 37.5  , 25.44 , 41.567, 15.669, 16.44 , 17.68 ,
       24.56 , 23.32 , 30.07 , 23.87 , 41.567, 15.669, 20.17 , 33.58 ,
       15.669, 41.567, 18.12 , 24.5  , 20.6  , 21.72 , 31.09 , 35.08 ,
       21.38 , 41.567, 29.2  , 21.1  , 21.18 , 30.19 , 32.38 , 30.14 ,
       15.669, 19.56 , 36.13 , 37.36 , 40.27 , 15.68 , 15.669, 33.66 ,
       26.68 , 23.11 , 41.567, 24.49 , 37.5  , 16.93 , 35.81 , 18.95 ,
       36.9  , 36.87 , 15.669, 39.05 , 37.79 , 34.28 , 31.09 , 30.28 ,
       24.21 , 18.01 , 15.669, 33.69 , 41.567, 26.23 , 41.567, 30.03 ,
       29.24 , 30.04 , 28.15 , 15.669])

Broadcasting

np.array([10, 11, 12]) + 5array([15, 16, 17])np.random.seed(100)

x = np.random.randint(1,12,size=(5,3))
xarray([[ 9,  9,  4],
       [ 8,  8,  1],
       [11,  5,  3],
       [ 6,  3,  3],
       [ 3,  2,  1]])# find mean of each column 
np.mean(x, axis=0)array([7.4, 5.4, 2.4])# subtract column mean from each element
x -np.mean(x, axis=0)array([[ 1.6,  3.6,  1.6],
       [ 0.6,  2.6, -1.4],
       [ 3.6, -0.4,  0.6],
       [-1.4, -2.4,  0.6],
       [-4.4, -3.4, -1.4]])# Subtract row mean from each element
print(x)

print('Row means are:')

row_means = np.mean(x, axis=1).astype(int)
print(row_means)[[ 9  9  4]
 [ 8  8  1]
 [11  5  3]
 [ 6  3  3]
 [ 3  2  1]]
Row means are:
[7 5 6 4 2]row_means.reshape(5,1)array([[7],
       [5],
       [6],
       [4],
       [2]])x - row_means.reshape(5,1)array([[ 2,  2, -3],
       [ 3,  3, -4],
       [ 5, -1, -3],
       [ 2, -1, -1],
       [ 1,  0, -1]])# Challenge
np.random.seed(100)
x = np.random.random((10, 7)).round(3)
xarray([[0.543, 0.278, 0.425, 0.845, 0.005, 0.122, 0.671],
       [0.826, 0.137, 0.575, 0.891, 0.209, 0.185, 0.108],
       [0.22 , 0.979, 0.812, 0.172, 0.816, 0.274, 0.432],
       [0.94 , 0.818, 0.336, 0.175, 0.373, 0.006, 0.252],
       [0.796, 0.015, 0.599, 0.604, 0.105, 0.382, 0.036],
       [0.89 , 0.981, 0.06 , 0.891, 0.577, 0.742, 0.63 ],
       [0.582, 0.02 , 0.21 , 0.545, 0.769, 0.251, 0.286],
       [0.852, 0.975, 0.885, 0.36 , 0.599, 0.355, 0.34 ],
       [0.178, 0.238, 0.045, 0.505, 0.376, 0.593, 0.63 ],
       [0.143, 0.934, 0.946, 0.602, 0.388, 0.363, 0.204]])# Subtract the corresponding row mean from every element, then divide by the row's standard deviation
row_means = np.mean(x, axis=1).reshape(-1,1)rows_std = np.std(x, axis=1).reshape(-1,1)x_normalised = (x-row_means)/(rows_std)
x_normalisedarray([[ 0.46925539, -0.48520596,  0.04424996,  1.55698116, -1.46848124,
        -1.04707755,  0.93027824],
       [ 1.29925738, -0.89867961,  0.49855755,  1.50660993, -0.66899679,
        -0.74555773, -0.99119074],
       [-1.00548241,  1.46201322,  0.91909916, -1.16152956,  0.93210309,
        -0.82992936, -0.31627414],
       [ 1.66544858,  1.27895589, -0.24800702, -0.75805065, -0.13079202,
        -1.29343806, -0.51411674],
       [ 1.48561331, -1.19044862,  0.81060153,  0.82773381, -0.8820676 ,
         0.06706063, -1.11849305],
       [ 0.72374166,  1.03972757, -2.15832759,  0.72721403, -0.36311096,
         0.20983052, -0.17907522],
       [ 0.83975005, -1.50155165, -0.71000837,  0.68560741,  1.61879528,
        -0.53920166, -0.39339106],
       [ 0.88416878,  1.3605576 ,  1.01198042, -1.02138647, -0.0957204 ,
        -1.04075187, -1.09884806],
       [-0.91723464, -0.62516599, -1.56465348,  0.6745395 ,  0.0465919 ,
         1.10290686,  1.28301586],
       [-1.21548515,  1.39410821,  1.43369749,  0.2988048 , -0.40720402,
        -0.48968169, -1.01423965]])np.mean(x_normalised, axis=1).round(3)array([-0., -0.,  0.,  0., -0., -0.,  0.,  0., -0.,  0.])

Ufuncs in Numpy

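Ufuncs (universal functions) are functions that operate element by element on whole arrays.
They support broadcasting and type casting.
Most common element-wise functions, such as add(), sqrt(), sin() and exp(), are ufuncs; reductions such as mean() and median() are not.
The built-in ufuncs are written in C and exposed to Python via NumPy's ufunc facility.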

def hyp(s1, s2):
    """Return the hypotenuse length for a right triangle with legs s1 and s2.

    Computes ``sqrt(s1**2 + s2**2)``. The arguments must support the ``**``
    operator, so numbers work but plain Python lists raise ``TypeError``.
    """
    sum_of_squares = s1**2 + s2**2
    return np.sqrt(sum_of_squares)
# works on scalar
hyp(6,8)10.0hyp([2,3,4,5,6], [2,3,3,7,8])---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-35-1590ad9bb1e0> in <module>
----> 1 hyp([2,3,4,5,6], [2,3,3,7,8])


<ipython-input-33-74e3b17ce128> in hyp(s1, s2)
      1 def hyp(s1, s2):
----> 2     return np.sqrt(s1**2 + s2**2)


TypeError: unsupported operand type(s) for ** or pow(): 'list' and 'int'# Making a ufunc
hyp_v = np.frompyfunc(hyp, nin=2, nout=1)hyp_v([2,3,4,5,6], [2,3,3,7,8])array([2.8284271247461903, 4.242640687119285, 5.0, 8.602325267042627,
       10.0], dtype=object)# Use vectorize function
hyp_vec = np.vectorize(hyp, otypes=[np.float16])
hyp_vec([2,3,4,5,6], [2,3,3,7,8])array([ 2.828,  4.242,  5.   ,  8.6  , 10.   ], dtype=float16)

Interpolation

travel_class = [1,3,4]
travel_fare = [100, 60, 20]

# find fare of travel class 2
np.interp(2, travel_class, travel_fare)80.0### Sinewave interpolation
np.set_printoptions(suppress=True)

x = np.linspace(0, 4*np.pi, 25)
y = np.sin(x)# plot
import matplotlib.pyplot as plt
%matplotlib inlineplt.plot(x, y, 'ro-')
plt.show()

# interpolate
xvals = np.linspace(0, 4*np.pi, 100)
yinterp = np.interp(xvals, x, y)

plt.plot(x,y,'ro')
plt.plot(xvals, yinterp,'-x')
plt.show()

### Interpolate with Scipy
from scipy import interpolatefx = interpolate.interp1d(x, y)
ynew = fx(xvals)

plt.plot(x,y,'ro')
plt.plot(xvals, ynew, '-x')
plt.show()

fx = interpolate.interp1d(x, y, kind='cubic')
ynew = fx(xvals)

plt.plot(x,y,'ro')
plt.plot(xvals, ynew, '-x')
plt.show()

# Challenge
arr = np.genfromtxt(r'./Numpy_Datasets/Class_and_Fare.csv', delimiter=',', skip_header=1)
print(arr.ndim)
print(arr.shape)
arr[:5,:]2
(50, 2)





array([[ 3.    , 10.4625],
       [ 3.    ,  7.8792],
       [ 3.    ,  8.6625],
       [ 2.    , 10.5   ],
       [ 2.    , 13.    ]])# missing values are
missing_indx = np.isnan(arr[:,1])
arr[missing_indx]array([[ 3., nan],
       [ 1., nan],
       [ 3., nan],
       [ 3., nan],
       [ 3., nan],
       [ 1., nan],
       [ 2., nan],
       [ 3., nan],
       [ 3., nan],
       [ 2., nan]])X = arr[~missing_indx,0]
Y = arr[~missing_indx, 1]

y_interpolated = np.interp(arr[missing_indx,0], X, Y)
arr[missing_indx,1] = y_interpolated

plt.plot(X,Y,'ro')
plt.plot(arr[missing_indx,0], y_interpolated,'bo')
plt.show()