Python numpy 学习笔记

Posted by Daniel on July 8, 2017

1、数组创建

np.set_printoptions(precision=6, linewidth=320, suppress=True, threshold=np.inf)

# 生成[0,1)之间的数据

np.random.random(10)
np.random.rand(4, 2)
np.random.rand(4, 3, 2)

# 标准正态分布

np.random.randn(2, 4)

# Two-by-four array of samples from N(3, 6.25)

2.5 * np.random.randn(2, 4) + 3
# 随机整数,范围区间为[low,high)

np.random.randint(-5, 5, size=(2,2))

# arange类似于range函数,只不过返回的不是列表,而是数组

# arange允许非整数值输入,产生一个非整型的数组

>>> m = np.array([np.arange(2), np.arange(2)])
>>> m.ndim  # num of dimensions/axes
2
>>> m.shape
(2, 2)
>>> m.dtype #数组元素类型
>>> m.itemsize  # 每个元素占字节数
>>> m.nbytes  #  所有元素占的字节
>>> m.size   # 数组元素数
>>> a = np.array([[1,2],[3,4]])
>>> a[0,1]
2
>>> np.zeros(10)
array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])
>>> np.zeros((2,3))
array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])
>>> np.empty((2,3))
array([[  7.70742408e-322,   0.00000000e+000,   2.78145267e-307],
       [  4.00537061e-307,   2.23419104e-317,   0.00000000e+000]])

# linspace(start, stop, N) 产生N个等距分布

>>> np.linspace(0, 1, 5)
array([ 0.  ,  0.25,  0.5 ,  0.75,  1.  ])
>>> np.logspace(0, 1, 5)  # 产生N个对数等距分布
array([1., 1.77827941, 3.16227766, 5.62341325, 10.])
>>> x_ticks = np.linspace(-1, 1, 5)
>>> y_ticks = np.linspace(-1, 1, 5)

#  r_ / c_ 来产生行向量或者列向量

>>> np.r_[0:1:.1]
array([ 0. ,  0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9])
>>> np.r_[(3,22,11), 4.0, [15, 6]]
array([3., 22., 11., 4., 15., 6.])
>>> np.c_[1:3:5j]
array([[ 1. ],
       [ 1.5],
       [ 2. ],
       [ 2.5],
       [ 3. ]])

2、numpy数据类型

>>> np.float64(42)
42.0
>>> np.int8(42.0)
42
>>> np.bool(42)
True
>>> np.arange(7, dtype=np.uint16)
array([0, 1, 2, 3, 4, 5, 6], dtype=uint16)

# 类型转换

# astype returns a new array converted to the requested dtype.
arr = np.array([1, 2, 3, 4, 5])
float_arr = arr.astype(np.float64)
# Byte strings that hold numbers can be parsed by casting them to float.
# np.bytes_ is used because the np.string_ alias was removed in NumPy 2.0.
numeric_strings = np.array(['1.25', '-9.6', '42'], dtype=np.bytes_)
numeric_strings.astype(float)

# 编码转换:将 UTF-8 字节串解码为 str (decode UTF-8 bytes to str)

>>> wordsList = [word.decode('UTF-8') for word in wordsList]

3、索引与切片

>>> a = np.arange(9)
>>> a[:7:2]
array([0, 2, 4, 6])
>>> a[::-1]
array([8, 7, 6, 5, 4, 3, 2, 1, 0])
>>> s = slice(3,7,2)
>>> a[s]
array([3, 5])
# 非零元素索引

>>> np.nonzero(a)

>>> b = np.arange(24).reshape(2,3,4) 
>>> b
array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]],

       [[12, 13, 14, 15],
        [16, 17, 18, 19],
        [20, 21, 22, 23]]])
>>> b[:,0,0]
array([ 0, 12])
>>> b[0, :, :]
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
>>> b[0, ...]
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
>>> b[0,1]
array([4, 5, 6, 7])
>>> b[...,1]
array([[ 1,  5,  9],
       [13, 17, 21]])
>>> b[:,1]
array([[ 4,  5,  6,  7],
       [16, 17, 18, 19]])
>>> b[0,:,1]
array([1, 5, 9])
>>> b[0,:,-1]
array([ 3,  7, 11])
>>> b[0,::2,-1]
array([ 3, 11])
#花式索引

arr = np.empty((8, 4))
for i in range(8):
    arr[i] = i
>>> arr[[4, 3, 0, 6]]
>>> arr[[-3, -5, -7]]

>>> b = np.arange(24).reshape(2,3,4)
>>> b
array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]],

       [[12, 13, 14, 15],
        [16, 17, 18, 19],
        [20, 21, 22, 23]]])
# ravel尽可能返回视图(内存不连续时会返回拷贝),对视图所做的修改会影响原始数组

>>> b.ravel()
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23])
# flatten返回拷贝,对拷贝所做的修改不会影响原始矩阵

>>> b.flatten()
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23])
>>> b.transpose()
>>> b.resize((2,12))
>>> b
array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]])

4、数组拼接

>>> a = np.arange(9).reshape(3,3)
>>> b = 2 * a
>>> np.hstack((a, b))
array([[ 0,  1,  2,  0,  2,  4],
       [ 3,  4,  5,  6,  8, 10],
       [ 6,  7,  8, 12, 14, 16]])
>>> np.concatenate((a, b), axis=1)
array([[ 0,  1,  2,  0,  2,  4],
       [ 3,  4,  5,  6,  8, 10],
       [ 6,  7,  8, 12, 14, 16]])
>>> np.vstack((a, b))
array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 0,  2,  4],
       [ 6,  8, 10],
       [12, 14, 16]])
>>> np.concatenate((a, b), axis=0)
array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 0,  2,  4],
       [ 6,  8, 10],
       [12, 14, 16]])
>>> np.dstack((a, b))
array([[[ 0,  0],
        [ 1,  2],
        [ 2,  4]],

       [[ 3,  6],
        [ 4,  8],
        [ 5, 10]],

       [[ 6, 12],
        [ 7, 14],
        [ 8, 16]]])
>>> oned = np.arange(2)
>>> twice_oned = 2 * oned
>>> np.column_stack((oned, twice_oned))
array([[0, 0],
       [1, 2]])
>>> np.row_stack((oned, twice_oned))
array([[0, 1],
       [0, 2]])
>>> a = np.arange(9).reshape(3, 3)
>>> np.hsplit(a, 3)
[array([[0],
       [3],
       [6]]), array([[1],
       [4],
       [7]]), array([[2],
       [5],
       [8]])]
>>> np.split(a, 3, axis=1)
[array([[0],
       [3],
       [6]]), array([[1],
       [4],
       [7]]), array([[2],
       [5],
       [8]])]
>>> np.vsplit(a, 3)
[array([[0, 1, 2]]), array([[3, 4, 5]]), array([[6, 7, 8]])]

# stack()增加维度

a=np.array([[1,2,3],[4,5,6]])
b=np.array([[7,8,9],[10,11,12]])
a
array([[1, 2, 3],
       [4, 5, 6]])
b
array([[ 7,  8,  9],
       [10, 11, 12]])
np.stack([a, b], axis=0)
array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])
np.stack([a, b], axis=1)
array([[[ 1,  2,  3],
        [ 7,  8,  9]],

       [[ 4,  5,  6],
        [10, 11, 12]]])
np.stack([a, b], axis=2)
array([[[ 1,  7],
        [ 2,  8],
        [ 3,  9]],

       [[ 4, 10],
        [ 5, 11],
        [ 6, 12]]])

5、数组增加一维

>>> import numpy as np
>>> a = np.random.rand(4)
>>> a
array([0.153286, 0.78193 , 0.813937, 0.211314])
>>> a[np.newaxis, ...]
array([[0.153286, 0.78193 , 0.813937, 0.211314]])
>>> a[np.newaxis, :]
array([[0.153286, 0.78193 , 0.813937, 0.211314]])
>>> a[:, np.newaxis]
array([[0.153286],
       [0.78193 ],
       [0.813937],
       [0.211314]])
>>> np.expand_dims(a, axis=0)
array([[0.153286, 0.78193 , 0.813937, 0.211314]])
>>> np.expand_dims(a, axis=1)
array([[0.153286],
       [0.78193 ],
       [0.813937],
       [0.211314]])
>>> np.reshape(a, (-1, 1))
array([[0.153286],
       [0.78193 ],
       [0.813937],
       [0.211314]])

>>> b = np.reshape(a, (-1, 1))
>>> np.squeeze(b, axis=1)
array([0.153286, 0.78193 , 0.813937, 0.211314])

6、where

>>> xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
>>> yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
>>> cond = np.array([True, False, True, True, False])
>>> np.where(cond, xarr, yarr)
array([ 1.1,  2.2,  1.3,  1.4,  2.5])
>>> np.where(arr > 0, 2, -2)
array([2, 2, 2, 2, 2])
>>> np.where(arr > 0, 2, arr)
array([2, 2, 2, 2, 2])

7、数组后一项减去前一项(相邻元素差分,等价于 np.diff)

od = np.array([21000, 21180, 21240, 22100, 22400])
dist = od[1:] - od[:-1]

8、数组排序

names = np.array(['bob', 'sue', 'jan', 'ad'])
weights = np.array([20.8, 93.2, 53.4, 61.8])
# np.sort returns a sorted copy and leaves weights untouched. Calling the
# in-place weights.sort() here would reorder weights itself, so the argsort
# below would be the identity permutation and names[ordered_indices] would no
# longer pair each name with its own weight.
sorted_weights = np.sort(weights)
# argsort returns the indices that put weights into ascending order.
ordered_indices = np.argsort(weights)
# Indexing both arrays with the same permutation keeps them aligned.
weights[ordered_indices]
names[ordered_indices]

9、squeeze 方法去除多余的轴

>>> a = np.arange(6)
>>> a.shape = (2,1,3)
# 将所有长度为1的维度去除

>>> b = a.squeeze()
b.shape=(2,3)

10、Flatten 数组

a = np.array([[0, 1], [2, 3]])
# flatten() returns a copy of the data, so changing b does not change a.
b = a.flatten()
# a.flat is an iterator over all elements of a; assigning through it
# (for example b[0] = 10) writes into a.
b = a.flat
# ravel() returns a view whenever possible, so changes to b can show up in a.
b = a.ravel()

11、对角线

a = np.array([11,21,31,12,22,32,13,23,33], )
a.shape = 3,3
a.diagonal()
# 使用偏移来查看它的次对角线,正数表示右移,负数表示左移

a.diagonal(offset=1)
a.diagonal(offset=-1)

# 获取矩阵相乘对角元素

A = np.random.uniform(0, 1, (5, 5))
B = np.random.uniform(0, 1, (5, 5))
# Slow version

np.diag(np.dot(A, B))
# Fast version

np.sum(A * B.T, axis=1)
# Faster version

np.einsum("ij,ji->i", A, B)

12、toString

a = np.array([[1, 2], [3, 4]], dtype=np.uint8)
# tobytes() replaces the deprecated tostring(); it dumps the raw element
# bytes only, so the (2, 2) shape is not stored in s.
s = a.tobytes()
# frombuffer() replaces the deprecated binary mode of fromstring(). It
# reinterprets the bytes as a flat uint8 array; the result is a read-only
# view of s, so call .copy() if a writable array is needed.
b = np.frombuffer(s, dtype=np.uint8)
b
array([1, 2, 3, 4], dtype=uint8)

from io import StringIO
s = StringIO("""1, 2, 3, 4, 5\n
                6,  ,  , 7, 8\n
                 ,  , 9,10,11\n""")
# The builtin int is used because the np.int alias was removed in NumPy 1.24.
# Blank lines are skipped, and empty fields get genfromtxt's default fill
# value for integers (-1).
a = np.genfromtxt(s, delimiter=",", dtype=int)
a = np.genfromtxt(sample_file, delimiter=",", dtype=np.int)

# dataframe其中一列为numpy array

df = pd.read_csv(filename, dtype={'emotion':np.int32, 'pixels':str, 'Usage':str})
df['pixels'] = df['pixels'].apply(lambda text: np.fromstring(text, sep=' '))

13、矩阵生成

# 随机的 3*3*3矩阵

np.random.random((3,3,3))

>>> a = np.array([[1,2,4],
                [2,5,3],
                [7,8,9]])
>>> A = np.mat(a)
>>> A
matrix([[1, 2, 4],
        [2, 5, 3],
        [7, 8, 9]])
>>> A = np.mat('1,2,4;2,5,3;7,8,9')
>>> A * A.I  # A.I表示A矩阵的逆矩阵

# 产生一个n乘n的单位矩阵

np.eye(3)
np.identity(3)
      
#  全为1的矩阵

np.ones((10,10))

14、替换最大值

a = np.random.random(10)
a[a.argmax()] = 0

15、类型转换

a = np.arange(10, dtype=np.int32)
a = a.astype(np.float32, copy=False)

16、遍历array

a = np.arange(9).reshape(3,3)
for index, value in np.ndenumerate(a):
    print(index, value)
for index in np.ndindex(a.shape):
    print(index, a[index])

17、矩阵操作

# 减去每行的均值

x = np.random.rand(5, 10)
y = x - x.mean(axis=1, keepdims=True)

# 按第n列的值对各行排序(此处n=1)

m = np.random.randint(0, 10, (3, 3))
m[m[:,1].argsort()]

# 矩阵互换两行

A = np.arange(25).reshape(5,5)
A[[2,1]] = A[[1,2]]

# SVD

Z = np.random.uniform(0,1,(10,10))
U, S, V = np.linalg.svd(Z)
rank = np.sum(S > 1e-10)

# 矩阵内积、外积、和、乘法

A = np.random.uniform(0, 1, 10)
B = np.random.uniform(0, 1, 10)
np.einsum('i->', A)       # np.sum(A)
np.einsum('i,i->i', A, B) # A * B
np.einsum('i,i', A, B)    # np.inner(A, B)
np.einsum('i,j->ij', A, B)    # np.outer(A, B)

18、滑动窗口内求均值

def moving_average(a, n=3):
    """Return the mean of every window of ``n`` consecutive values of ``a``.

    ``a`` is flattened by the cumulative sum, and the result holds one value
    per full window, i.e. ``a.size - n + 1`` floats.
    """
    # Running totals: entry i is the sum of a[0..i].
    csum = np.cumsum(a, dtype=float)
    # Subtracting the total from n positions earlier leaves the sum of the
    # last n values at each position from index n onwards.
    window_tail = csum[n:] - csum[:-n]
    csum[n:] = window_tail
    # The first n - 1 entries do not cover a full window, so drop them.
    return csum[n - 1:] / n

a = np.arange(20)
moving_average(a, n=3)

19、数组中最值

# 出现最频繁的数

a = np.random.randint(0, 10, 50)
# bin的数量比a中的最大值大1,每个bin给出了它的索引值在a中出现的次数

np.bincount(a).argmax()

# 取top-k

a = np.arange(10000)
np.random.shuffle(a)
n = 5
# Slow

a[np.argsort(a)[-n:]]
# Fast

a[np.argpartition(-a, n)[:n]]

20、np aggr axes

axes-2d axes-3d

a = np.arange(6).reshape((3, 2))
a
array([[0, 1],
       [2, 3],
       [4, 5]])
np.sum(a, axis=0, keepdims=True)
array([[6, 9]])
np.sum(a, axis=1, keepdims=True)
array([[1],
       [5],
       [9]])
np.max(a, axis=0, keepdims=True)
array([[4, 5]])
np.max(a, axis=1, keepdims=True)
array([[1],
       [3],
       [5]])

b = np.array([a, a])
b
array([[[0, 1],
        [2, 3],
        [4, 5]],

       [[0, 1],
        [2, 3],
        [4, 5]]])
np.sum(b, axis=0, keepdims=True)
array([[[ 0,  2],
        [ 4,  6],
        [ 8, 10]]])
np.sum(b, axis=1, keepdims=True)
array([[[6, 9]],

       [[6, 9]]])
np.sum(b, axis=2, keepdims=True)
array([[[1],
        [5],
        [9]],

       [[1],
        [5],
        [9]]])
np.max(b, axis=0, keepdims=True)
array([[[0, 1],
        [2, 3],
        [4, 5]]])
np.max(b, axis=1, keepdims=True)
array([[[4, 5]],

       [[4, 5]]])
np.max(b, axis=2, keepdims=True)
array([[[1],
        [3],
        [5]],

       [[1],
        [3],
        [5]]])

21、多维数组乘法

对于array对象,* 和 np.multiply 函数代表的是逐元素相乘(Hadamard 积),np.dot 和 np.matmul 函数代表矩阵乘法(矢量乘法,对一维数组即内积); 对于matrix对象,* 直接代表矩阵乘法(矢量乘法),np.multiply 函数表示逐元素相乘;

tensorflow乘法

1、c = tf.matmul(a, b)

二维tensor相乘 [m, k] x [k, n] = [m, n] 多维tensor相乘 [x, y, m, k] x [x, y, k, n] = [x, y, m, n],并且这里的x, y是支持广播的

2、c = tf.multiply(a, b)

按元素相乘,a, b 的形状必须相同,或者可以相互广播。

3、c = tf.tensordot(a, b, axes=(axes_a, axes_b))

表示任意多维,任意组合形式的矩阵相乘。(axes_a, axes_b)把 a 和 b 的元素的乘积沿着 axes_a 和 axes_b 加和。

a = tf.constant([1, 2, 3, 4])
b = tf.constant([2, 3, 4, 5])
c1 = tf.tensordot(a, b, axes=0) # axes=0 表示外积(张量积):a 的每个元素与 b 的每个元素两两相乘,结果形状为 (4, 4)

<tf.Tensor: id=195, shape=(4, 4), dtype=int32, numpy=
array([[ 2,  3,  4,  5],
       [ 4,  6,  8, 10],
       [ 6,  9, 12, 15],
       [ 8, 12, 16, 20]], dtype=int32)>
c2 = tf.tensordot(a, b, axes=1) # 表示内积,直接令 axes=1 

<tf.Tensor: id=206, shape=(), dtype=int32, numpy=40>

22、矩阵两两之间的相似性

from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
A =  np.array([[0, 1, 0, 0, 1], [0, 0, 1, 1, 1], [1, 1, 0, 1, 0]])
A_sparse = sparse.csr_matrix(A)
similarities = cosine_similarity(A_sparse)
print('pairwise dense output:\n {}\n'.format(similarities))

#also can output sparse matrices

similarities_sparse = cosine_similarity(A_sparse, dense_output=False)
print('pairwise sparse output:\n {}\n'.format(similarities_sparse))

top_k=3
for i in range(len(similarities)):
	sim_arr = similarities[i]
	# top_index=np.argsort(sim_arr)[-top_k:]

	top_index=np.argpartition(sim_arr, -top_k)[-top_k:]
	print(str(list(zip(top_index, sim_arr[top_index]))))


# 协方差

np.cov(A)
# 相关系数

np.corrcoef(A)

23、kNN求解

import random

import numpy as np
from numpy import ndarray

random.seed(5)

def generate_points(n: int=6) -> list[tuple]:
    """Return ``n`` random ``(x, y)`` points with integer coordinates in [0, 100].

    Coordinates come from the global ``random`` generator, so seeding it
    beforehand makes the result reproducible.
    """
    # x is drawn before y for each point, one point after another.
    return [
        (random.randint(0, 100), random.randint(0, 100))
        for _ in range(n)
    ]

def structured_array(points: list[tuple]) -> ndarray:
    """Pack ``(x, y)`` tuples into a 1-D structured array with int fields ``x`` and ``y``."""
    # Naming the fields lets callers read a whole coordinate column via
    # arr['x'] or arr['y'].
    point_dtype = np.dtype([('x', 'int'), ('y', 'int')])
    packed = np.array(points, dtype=point_dtype)
    return packed

def np_find_dist(s_array: ndarray) -> ndarray:
    """Return the squared Euclidean distance between every pair of points.

    ``s_array`` is read through its ``x`` and ``y`` fields; entry ``[i, j]``
    of the result is the squared distance between point i and point j.
    """
    n_points = s_array.shape[0]
    # View the points once as a column and once as a row, so that the
    # subtractions below broadcast to an (n_points, n_points) grid.
    as_column = s_array.reshape(n_points, 1)
    as_row = s_array.reshape(1, n_points)
    dx = as_column['x'] - as_row['x']
    dy = as_column['y'] - as_row['y']
    return dx**2 + dy**2

def np_k_nearest(dist: ndarray, k: int) -> ndarray:
    """Return, for each row of ``dist``, the column indices of its k + 1 smallest entries.

    Args:
        dist: 2-D array; each row is searched independently.
        k: number of neighbours wanted in addition to the closest entry, so
            the result has ``k + 1`` columns.

    Returns:
        Integer array of shape ``(dist.shape[0], k + 1)``. The indices within
        a row are in no particular order.

    Raises:
        ValueError: if ``k`` is not a valid position along axis 1 of ``dist``.
    """
    # Partitioning at position k puts the k + 1 smallest entries of each row
    # into columns 0..k. Using k rather than k + 1 as the pivot selects the
    # same entries but stays valid when k + 1 equals the row length.
    k_indices = np.argpartition(dist, k, axis=1)[:, :k + 1]
    return k_indices

def np_main(count: int = 6, top_k: int = 2):
    """Generate random points and look up the ``top_k`` closest points for each one.

    Args:
        count: number of random points to generate.
        top_k: number of closest points to return per point. The search runs
            over all points, so a point is normally among its own results
            (its distance to itself is 0).

    Returns:
        A tuple ``(results, s_array, k_indices, k)``: ``results`` is a list
        with one structured array of neighbours per point, ``s_array`` holds
        all points, ``k_indices`` holds the neighbour indices per point, and
        ``k`` is ``top_k - 1``, the value handed to ``np_k_nearest``.
    """
    points = generate_points(count)
    s_array = structured_array(points)
    np_dist = np_find_dist(s_array)
    # np_k_nearest returns k + 1 indices per row, so top_k - 1 is passed to
    # get top_k columns. Binding it to k also fixes the NameError the
    # unbound k used to raise below.
    k = top_k - 1
    k_indices = np_k_nearest(np_dist, k)
    # Fancy indexing: select the neighbour records of each point by index.
    results = [s_array[row[:k + 1]] for row in k_indices]
    return results, s_array, k_indices, k

results, s_array, k_indices, k = np_main(100, 1)

24、稀疏举例

Coordinate (COO)

COO 格式中每一个元素用一个三元组(行号,列号,数值)表示

from scipy import sparse
rows = np.array([0, 3, 1, 0])
cols = np.array([0, 3, 1, 2])
data = np.array([6, 5, 7, 8])
sparse.coo_matrix((data, (rows, cols)), shape=(4, 4)).toarray()
array([[6, 0, 8, 0],
       [0, 7, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 5]])
Compressed Sparse Column (CSC)

CSC是按列存储的稀疏矩阵,其中indptr中的数据代表矩阵中每一列所存的数据在data中的开始和结束的索引,例如indptr为[0, 2, 3, 6],即表示在data中,索引[0, 2)为第一列的数据,索引[2, 3)为第二列的数据,索引[3, 6)为第三列的数据。而indices中的数据代表所对应的data中的数据在其所在列中的所在行数,例如,这里的indices为[0, 2, 2, 0, 1, 2],表示在data中,数据1在第0行,数据2在第2行,数据3在第2行,数据4在第0行,数据5在第1行,数据6在第2行。 CSC 适合做矩阵运算以及列切片,在内存中,它的元素是按列存储。

indptr = np.array([0, 2, 3, 6])
indices = np.array([0, 2, 2, 0, 1, 2])
data = np.array([1, 2, 3, 4, 5, 6])
sparse.csc_matrix((data, indices, indptr), shape=(3, 3)).toarray()
array([[1, 0, 4],
       [0, 0, 5],
       [2, 3, 6]])
Compressed Sparse Row (CSR)

CSR是按行来存储的稀疏矩阵,其与CSC类似。indptr中的数据表示矩阵中每一行的数据在data中开始和结束的索引,而indices中的数据表示所对应的在data中的数据在矩阵中其所在行的所在列数。

indptr = np.array([0, 2, 3, 6])
indices = np.array([0, 2, 2, 0, 1, 2])
data = np.array([1, 2, 3, 4, 5, 6])
sparse.csr_matrix((data, indices, indptr), shape=(3, 3)).toarray()
array([[1, 0, 2],
       [0, 0, 3],
       [4, 5, 6]])

25、numpy array和tensor轴线对比

# tf.reduce_sum over the same 3x2 data as the numpy example that follows.
# The keyword is keepdims (as in numpy); the older keep_dims spelling is
# not accepted by TensorFlow 2.
x = tf.constant([[0, 1], [2, 3], [4, 5]])
tf.reduce_sum(x)  # 15

tf.reduce_sum(x, axis=0)  # shape=(2,), array([6, 9])

tf.reduce_sum(x, axis=0, keepdims=True)  #  shape=(1, 2) array([[6, 9]])

tf.reduce_sum(x, axis=1)  # shape=(3,) array([1, 5, 9])

tf.reduce_sum(x, axis=1, keepdims=True)  # shape=(3, 1) array([[1],[5],[9]])

tf.reduce_sum(x, axis=-1)   # shape=(3,)  array([1, 5, 9])

tf.reduce_sum(x, axis=[0, 1])  # 15


a = np.array([[0, 1], [2, 3], [4, 5]])
np.sum(a) # 15

np.sum(a, axis=0) # array([6, 9])

np.sum(a, axis=0, keepdims=True)  # array([[6, 9]])

np.sum(a, axis=1) # array([1, 5, 9])

np.sum(a, axis=1, keepdims=True) # array([[1],[5],[9]])

np.sum(a, axis=-1) # array([1, 5, 9])

26、Numexpr优化

import numpy as np
from numpy import ndarray
import numexpr as ne

rng = np.random.default_rng(seed=4000)

def generate_ndarray(rows: int) -> np.ndarray:
    """Return a ``(rows, 1)`` column of random floats drawn from the module-level ``rng``."""
    column_shape = (rows, 1)
    return rng.random(column_shape)

arr = generate_ndarray(10)


def numexpr_to_binary(np_array: np.ndarray) -> np.ndarray:
    """Threshold an N x 1 array at 0.5: values below 0.5 become 0, all others 1.

    The result has the same N x 1 layout as the input.
    """
    # N x 1 -> N. The local must stay named `temp`: the numexpr expression
    # below refers to it by that name.
    temp = np_array[:, 0]
    labels = ne.evaluate("where(temp<0.5, 0, 1)")
    # N -> N x 1
    return np.expand_dims(labels, axis=1)

arr = generate_ndarray(10)
result = numexpr_to_binary(arr)

mapping = np.column_stack((arr, result))
mapping