NumPy#
NumPy is a package for numerical computations
supports vectors, matrices and multidimensional arrays
fast numerical processing by means of vectorized functions
based on object type ndarray
NumPy array vs list#
NumPy array has fixed length, while lists can grow dynamically
All the elements of a NumPy array must have the same type
Math operations with NumPy arrays are allowed (just like with vectors)
Motivating example
import numpy as np
import matplotlib.pyplot as plt
from time import time
%config InlineBackend.figure_format = 'svg'
def inner_list(a: list, b: list):
    """Return the inner product of two sequences using pure Python.

    Elements are paired positionally with ``zip`` (so the longer sequence
    is truncated to the length of the shorter one), multiplied pairwise,
    and the products are added up with the built-in ``sum``.
    """
    return sum(x * y for x, y in zip(a, b))
def inner_np(a: np.ndarray, b: np.ndarray):
    """Return the dot product of ``a`` and ``b`` computed by NumPy.

    Delegates to ``a.dot(b)``, so ``a`` must be an object exposing a
    ``dot`` method (an ``np.ndarray`` here).

    The parameters are annotated with ``np.ndarray`` (the array type);
    ``np.array`` is the function that builds arrays, not a type.
    """
    return a.dot(b)
def measure_time(func, n, n_samples=10):
    """Return the mean time, in seconds, of one call to ``func``.

    For each of ``n_samples`` repetitions two fresh vectors of length ``n``
    are drawn with ``np.random.randn`` and passed to ``func(a, b)``.

    Parameters:
        func: callable taking two arguments; it receives NumPy arrays.
        n: length of each random input vector.
        n_samples: number of timed repetitions to average over.

    Returns:
        The arithmetic mean of the per-call durations.
    """
    # perf_counter is the clock intended for measuring short intervals.
    from time import perf_counter

    result = np.zeros(n_samples)
    for i in range(n_samples):
        # Draw the inputs BEFORE starting the clock: only ``func`` itself
        # must be timed, not the random number generation.
        a, b = np.random.randn(n), np.random.randn(n)
        begin = perf_counter()
        func(a, b)
        result[i] = perf_counter() - begin
    return result.mean()
def get_times_lists(func, step=10, max_size=200, n_samples=20):
    """Time ``func`` over a range of input sizes.

    The sizes are ``step, 2*step, ...`` up to and including ``max_size``
    (when it is a multiple of ``step``). Each size is timed with
    ``measure_time(func, size, n_samples)``.

    Returns:
        A NumPy array with one mean timing per size, in size order.
    """
    sizes = np.arange(step, max_size + 1, step)
    return np.array([measure_time(func, size, n_samples) for size in sizes])
def plot_time_vs_size(step=100, max_size=1000, n_samples=20):
    """Plot dot-product timings of ``inner_list`` and ``inner_np`` vs. size.

    Both functions are timed with ``get_times_lists`` on the sizes
    ``step, 2*step, ..., max_size``; the timings are multiplied by 1000
    (labelled as milliseconds) and drawn on a logarithmic y axis.
    """
    # Collect both timing series first, then draw them.
    timings_ms = [
        1000 * get_times_lists(func, step, max_size, n_samples)
        for func in (inner_list, inner_np)
    ]
    sizes = np.arange(step, max_size + 1, step)

    for series, color, label in zip(timings_ms, ('r', 'm'), ("list", "np")):
        plt.semilogy(sizes, series, c=color, lw=2, label=label)

    plt.xlim(0, max_size)
    plt.title("Dot product")
    plt.legend()
    plt.xlabel("size")
    plt.ylabel("time, ms")
    plt.grid(ls=":")
plot_time_vs_size(200, 10**4)
NumPy arrays creation#
Converting Python structures
Generation via built-in functions
Converting from Python structures#
lst = [1, 2, 3, 4, 5]
arr = np.array(lst)
print(f"list = {lst}, np.array = {arr}")
print(type(lst), type(arr))
list = [1, 2, 3, 4, 5], np.array = [1 2 3 4 5]
<class 'list'> <class 'numpy.ndarray'>
tpl = (1, 2, 3, 4, 5)
arr = np.array(tpl)
print(f"tuple = {tpl}, np.array = {arr}")
print(type(tpl), type(arr))
tuple = (1, 2, 3, 4, 5), np.array = [1 2 3 4 5]
<class 'tuple'> <class 'numpy.ndarray'>
The underlying data type can be specified by the argument dtype:
arr.dtype
dtype('int64')
np.array([1, 2, 3, 4, 5], dtype=np.float32)
array([1., 2., 3., 4., 5.], dtype=float32)
Numpy arrays generation#
arange — like range, but returns an array and accepts non-integer steps
linspace — uniform partition of a segment
logspace — log scale partition
zeros — creates an array of zeroes
ones — creates an array of ones
full — creates an array of the same values
np.arange(0, 5, 0.5)
array([0. , 0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5])
np.linspace(0, 5, 11, endpoint=False)
array([0. , 0.45454545, 0.90909091, 1.36363636, 1.81818182,
2.27272727, 2.72727273, 3.18181818, 3.63636364, 4.09090909,
4.54545455])
np.logspace(0, 9, 11, base=2)
array([ 1. , 1.86606598, 3.48220225, 6.49801917,
12.12573253, 22.627417 , 42.22425314, 78.79324245,
147.03338944, 274.37400641, 512. ])
np.zeros((3, 4))
array([[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.]])
np.ones((2, 2))
array([[1., 1.],
[1., 1.]])
np.full((2, 2), 42)
array([[42, 42],
[42, 42]])
# creates a diagonal matrix
np.diag([1, 2, 3])
array([[1, 0, 0],
[0, 2, 0],
[0, 0, 3]])
# creates an identity matrix
np.eye(3)
array([[1., 0., 0.],
[0., 1., 0.],
[0., 0., 1.]])
array = np.ones((2, 3))
print('Array shape = {}, number of dimensions = {}'.format(array.shape, array.ndim))
Array shape = (2, 3), number of dimensions = 2
array
array([[1., 1., 1.],
[1., 1., 1.]])
The reshape method changes the shape of an array without changing its data.
a = np.arange(0, 6)
print(a, a.shape)
[0 1 2 3 4 5] (6,)
array = a.reshape((2, 3))
print(array, array.shape)
[[0 1 2]
[3 4 5]] (2, 3)
Use ravel to flatten a multidimensional array into a vector
# can use -1 instead of one dimension
array = np.arange(0, 6, 0.5).reshape((3, -1))
print(array, array.shape)
array = np.ravel(array)
print(array, array.shape)
[[0. 0.5 1. 1.5]
[2. 2.5 3. 3.5]
[4. 4.5 5. 5.5]] (3, 4)
[0. 0.5 1. 1.5 2. 2.5 3. 3.5 4. 4.5 5. 5.5] (12,)
Indexing#
print(array[0])
print(array[-1])
print(array[1:-1])
print(array[1:-1:2])
print(array[::-1])
0.0
5.5
[0.5 1. 1.5 2. 2.5 3. 3.5 4. 4.5 5. ]
[0.5 1.5 2.5 3.5 4.5]
[5.5 5. 4.5 4. 3.5 3. 2.5 2. 1.5 1. 0.5 0. ]
Can use arrays of ints or booleans as indices
array[[0, 2, 4, 6, 8, 10]]
array([0., 1., 2., 3., 4., 5.])
array[[True, False, True, False, True, False, True, False, True, False, True, False]]
array([0., 1., 2., 3., 4., 5.])
Boolean indexing is commonly used for filtering
x = np.arange(10)
x
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
x[(x % 2 == 0) & (x > 5)]
array([6, 8])
x was not actually changed, but altering via boolean indexing is possible:
print(x)
x[x > 3] *= 2
print(x)
[0 1 2 3 4 5 6 7 8 9]
[ 0 1 2 3 8 10 12 14 16 18]
Random#
np.random.seed(101)
a = np.random.rand(5)
b = np.random.rand(5)
print(a)
print(b)
[0.51639863 0.57066759 0.02847423 0.17152166 0.68527698]
[0.83389686 0.30696622 0.89361308 0.72154386 0.18993895]
Arithmetics with arrays as vectors#
a + b
array([1.35029549, 0.87763381, 0.92208731, 0.89306552, 0.87521594])
a - b
array([-0.31749823, 0.26370137, -0.86513885, -0.55002221, 0.49533803])
a * b
array([0.4306232 , 0.17517567, 0.02544494, 0.1237604 , 0.13016079])
a / b
array([0.61925959, 1.85905663, 0.03186416, 0.2377148 , 3.60788015])
Inner product: \((a, b) = \sum\limits_{k=1}^n a_k b_k\)
a.dot(b)
0.8851650000094948
a @ b
0.8851650000094948
np.dot(a, b)
0.8851650000094948
sum, mean, std#
np.sum(a), a.sum()
(1.9723390789710982, 1.9723390789710982)
np.mean(b), b.mean()
(0.5891917955929626, 0.5891917955929626)
np.std(a), np.std(b)
(0.2506550421114526, 0.2860503677763882)
lst = list(range(2*10**6))
arr = np.arange(2*10**6)
print(sum(lst), arr.sum())
1999999000000 1999999000000
%%timeit
sum(lst)
13.2 ms ± 1.23 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
np.sum(arr)
1.17 ms ± 58.8 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
Matrix operations#
A = np.random.normal(size=(2, 2))
A
array([[-2.01816824, 0.74012206],
[ 0.52881349, -0.58900053]])
# transpose
A.T
array([[-2.01816824, 0.52881349],
[ 0.74012206, -0.58900053]])
np.transpose(A)
array([[-2.01816824, 0.52881349],
[ 0.74012206, -0.58900053]])
# calc determinant
np.linalg.det(A)
0.7973156409556252
# inverse matrix
B = np.linalg.inv(A)
A @ B
array([[ 1.00000000e+00, -4.51542844e-17],
[-1.01303503e-16, 1.00000000e+00]])
np.sum(A)
-1.3382332261785077
# sum of elements in every column
np.sum(A, axis=0)
array([-1.48935475, 0.15112152])
# sum of elements in every row
np.sum(A, axis=1)
array([-1.27804619, -0.06018704])
Matrix indexing#
A = np.arange(15).reshape((3, 5))
A
array([[ 0, 1, 2, 3, 4],
[ 5, 6, 7, 8, 9],
[10, 11, 12, 13, 14]])
B = np.random.normal(loc=5, scale=10, size=(3, 4))
B
array([[ 6.88695309, -2.58872056, -4.33237216, 14.55056509],
[ 6.90794322, 24.78757324, 31.0596728 , 11.83508886],
[ 8.02665449, 21.93722925, -12.06085931, -6.59119416]])
np.sort(B, axis=None)
array([-12.06085931, -6.59119416, -4.33237216, -2.58872056,
6.88695309, 6.90794322, 8.02665449, 11.83508886,
14.55056509, 21.93722925, 24.78757324, 31.0596728 ])
# access to element
A[1, 2]
7
# second row
A[1, :]
array([5, 6, 7, 8, 9])
# third column
A[:, 2]
array([ 2, 7, 12])
# slice
A[0, 1:4]
array([1, 2, 3])
# every second element of the last row
A[-1, ::2]
array([10, 12, 14])
# average over the whole matrix
np.mean(A)
7.0
# average over each column
np.mean(A, axis=0)
array([5., 6., 7., 8., 9.])
# average over each row
np.mean(A, axis=1)
array([ 2., 7., 12.])
B = np.arange(20, 30).reshape((5, 2))
B
array([[20, 21],
[22, 23],
[24, 25],
[26, 27],
[28, 29]])
# matrix product
A.dot(B)
array([[ 260, 270],
[ 860, 895],
[1460, 1520]])
A @ B
array([[ 260, 270],
[ 860, 895],
[1460, 1520]])
Concatenation#
np.concatenate, np.hstack, np.vstack, np.dstack
x = np.arange(6).reshape(3, 2)
y = np.arange(100, 112).reshape(3, 4)
x, y
(array([[0, 1],
[2, 3],
[4, 5]]),
array([[100, 101, 102, 103],
[104, 105, 106, 107],
[108, 109, 110, 111]]))
np.hstack((x, y))
array([[ 0, 1, 100, 101, 102, 103],
[ 2, 3, 104, 105, 106, 107],
[ 4, 5, 108, 109, 110, 111]])
np.vstack((x.T, y.T))
array([[ 0, 2, 4],
[ 1, 3, 5],
[100, 104, 108],
[101, 105, 109],
[102, 106, 110],
[103, 107, 111]])
np.concatenate((x, y), axis=1)
array([[ 0, 1, 100, 101, 102, 103],
[ 2, 3, 104, 105, 106, 107],
[ 4, 5, 108, 109, 110, 111]])
np.concatenate((x.T, y.T), axis=0)
array([[ 0, 2, 4],
[ 1, 3, 5],
[100, 104, 108],
[101, 105, 109],
[102, 106, 110],
[103, 107, 111]])
Broadcasting#
Broadcasting lifts the requirement that the operands have the same shape and allows arithmetic operations on arrays of different, but still compatible, shapes. The simplest example is multiplying a vector by a number:
2*np.arange(1, 4)
array([2, 4, 6])
Broadcasting rule
In order to broadcast, the sizes of the trailing axes of both arrays in an operation must either be equal, or one of them must be one.
If the numbers of dimensions differ, dummy dimensions of size one are added on the "left" of the shape of the array with fewer dimensions, for example:
a = np.ones((2,3,4))
b = np.ones(4)
c = a * b # here a.shape=(2,3,4) and b.shape is considered to be (1,1,4)
np.array([[0, 0, 0], [10, 10, 10], [20, 20, 20], [30, 30, 30]]) + np.arange(3)
array([[ 0, 1, 2],
[10, 11, 12],
[20, 21, 22],
[30, 31, 32]])
The same trick does not work with columns:
A = np.repeat(np.arange(0, 31, 10), 3).reshape(4, 3)
A
array([[ 0, 0, 0],
[10, 10, 10],
[20, 20, 20],
[30, 30, 30]])
A + np.arange(4)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[71], line 1
----> 1 A + np.arange(4)
ValueError: operands could not be broadcast together with shapes (4,3) (4,)
A possible solution is to transpose twice (adding a column vector, A + np.arange(4)[:, None], gives the same result):
(A.T + np.arange(4)).T
array([[ 0, 0, 0],
[11, 11, 11],
[22, 22, 22],
[33, 33, 33]])