14.1.1 相似度或距离
计算马哈拉诺比斯距离矩阵(Python实现)
import numpy as np
def mahalanobis_distance(X):
"""计算所有样本之间的马哈拉诺比斯距离矩阵"""
n_samples = len(X[0])
# 计算协方差矩阵
S = np.cov(X)
# 计算协方差矩阵的逆矩阵
S = np.linalg.inv(S)
# 构造马哈拉诺比斯距离矩阵
D = np.zeros((n_samples, n_samples)) # 初始化为零矩阵
for i in range(n_samples):
for j in range(i + 1, n_samples):
xi = X[:, i][:, np.newaxis]
xj = X[:, j][:, np.newaxis]
D[i][j] = D[j][i] = np.sqrt((np.dot(np.dot((xi - xj).T, S), (xi - xj))))
return D
【测试】例14.2
if __name__ == "__main__":
X = np.array([[0, 0, 1, 5, 5],
[2, 0, 0, 0, 2]])
print(mahalanobis_distance(X))
# [[0. 1.83604716 1.91649575 2.81058198 1.94257172]
# [1.83604716 0. 0.38851434 1.94257172 2.52783249]
# [1.91649575 0.38851434 0. 1.55405738 2.27648631]
# [2.81058198 1.94257172 1.55405738 0. 1.83604716]
# [1.94257172 2.52783249 2.27648631 1.83604716 0. ]]
计算相关系数矩阵(Python实现)
import numpy as np
def correlation_coefficient(X):
"""计算所有样本之间的相关系数矩阵"""
n_samples = len(X[0])
# 计算均值
means = np.mean(X, axis=0)
# 计算误差平方和
variance = [np.square((X[:, i] - means[i])).sum() for i in range(n_samples)]
# 构造相关系数矩阵
D = np.identity(n_samples) # 初始化为单位矩阵
for i in range(n_samples):
for j in range(i + 1, n_samples):
xi, xj = X[:, i], X[:, j]
numerator = ((xi - means[i]) * (xj - means[j])).sum()
denominator = np.sqrt(variance[i] * variance[j])
if denominator:
D[i][j] = D[j][i] = numerator / denominator
else: # 当出现零方差时
D[i][j] = D[j][i] = np.nan
return D
【测试】例14.2(第2个样本为零方差)
if __name__ == "__main__":
X = np.array([[0, 0, 1, 5, 5],
[2, 0, 0, 0, 2]])
print(correlation_coefficient(X))
# [[ 1. nan -1. -1. -1.]
# [nan 1. nan nan nan]
# [-1. nan 1. 1. 1.]
# [-1. nan 1. 1. 1.]
# [-1. nan 1. 1. 1.]]
计算夹角余弦矩阵(Python实现)
import numpy as np
def cosine(X):
"""计算所有样本之间的夹角余弦矩阵"""
n_samples = len(X[0])
# 计算平方和
square = [np.square(X[:, i]).sum() for i in range(n_samples)]
# 构造夹角余弦矩阵
D = np.identity(n_samples) # 初始化为单位矩阵
for i in range(n_samples):
for j in range(i + 1, n_samples):
xi, xj = X[:, i], X[:, j]
numerator = (xi * xj).sum()
denominator = np.sqrt(square[i] * square[j])
if denominator:
D[i][j] = D[j][i] = numerator / denominator
else: # 当出现零平方和时
D[i][j] = D[j][i] = np.nan
return D
【测试】例14.2(第2个样本为零平方和)
if __name__ == "__main__":
X = np.array([[0, 0, 1, 5, 5],
[2, 0, 0, 0, 2]])
print(cosine(X))
# [[1. nan 0. 0. 0.37139068]
# [ nan 1. nan nan nan]
# [0. nan 1. 1. 0.92847669]
# [0. nan 1. 1. 0.92847669]
# [0.37139068 nan 0.92847669 0.92847669 1. ]]
14.1.2 类或簇
计算类的样本散布矩阵与样本协方差矩阵(Python实现)
import numpy as np
def scatter_matrix(X, G):
"""计算样本散布矩阵
:param X: 样本
:param G: 类别包含的样本
:return: 样本散步矩阵
"""
n_samples = len(G)
n_features = len(X)
# 计算类的中心
means = np.mean(X[:, G], axis=1)
A = np.zeros((n_features, n_features))
for i in range(n_samples):
A += np.dot((X[:, i] - means)[:, np.newaxis], (X[:, i] - means)[:, np.newaxis].T)
return A
def covariance_matrix(X, G):
"""计算样本协方差矩阵"""
n_features = len(X)
A = scatter_matrix(X, G)
S = A / (n_features - 1)
return S
【测试】例14.2(将第3个、第4个和第5个样本视作一个类;因为只有2个特征,所以样本散布矩阵与协方差矩阵一致)
if __name__ == "__main__":
X = np.array([[0, 0, 1, 5, 5],
[2, 0, 0, 0, 2]])
print(scatter_matrix(X, [2, 3, 4]))
# [[34. -0.66666667]
# [-0.66666667 2.66666667]]
print(covariance_matrix(X, [2, 3, 4]))
# [[34. -0.66666667]
# [-0.66666667 2.66666667]]
14.2 层次聚类
算法14.1的聚合聚类算法(原生Python实现)
合并规则:类间距离最小
类间距离:最短距离
停止条件:类的个数为 k k k
import numpy as np
class DSU:
def __init__(self, n: int):
self._n = n
self._array = [i for i in range(n)]
self._size = [1] * n
self._group_num = n
def find(self, i: int) -> int:
"""查询i所在的连通分支:O(1)"""
if self._array[i] != i:
self._array[i] = self.find(self._array[i])
return self._array[i]
def union(self, i: int, j: int) -> bool:
"""合并i和j所属的连通分支:O(1)"""
i, j = self.find(i), self.find(j)
if i != j:
self._group_num -= 1
if self._size[i] >= self._size[j]:
self._array[j] = i
self._size[i] += self._size[j]
else:
self._array[i] = j
self._size[j] += self._size[i]
return True
else:
return False
def is_connected(self, i: int, j: int) -> bool:
return self.find(i) == self.find(j)
@property
def array(self) -> list:
return self._array
@property
def group_num(self) -> int:
"""计算连通分支数量:O(1)"""
return self._group_num
@property
def max_group_size(self) -> int:
"""计算最大连通分支包含的数量:O(N)"""
import collections
return max(collections.Counter(self._array).values())
def single_linkage_agglomerative_clustering(D, k):
"""聚合聚类算法:合并规则为类间距离最小,类间距离为最短距离,停止条件为类的个数是k
时间复杂度:O(N^2 log(N^2))
:param D: 样本之间的距离矩阵
:param k: 停止时类的个数
:return: 每个样本所属的类别
"""
n_samples = len(D)
# 排序所有的样本
sorted_distance = []
for i in range(n_samples):
for j in range(i + 1, n_samples):
sorted_distance.append((D[i][j], i, j))
sorted_distance.sort()
# 初始化并查集
dsu = DSU(n_samples)
for d, i, j in sorted_distance:
if dsu.find(i) != dsu.find(j):
dsu.union(i, j)
if dsu.group_num == k:
break
return dsu.array
【测试】例14.1(将停止条件改为类的数量为 k k k)
if __name__ == "__main__":
D = np.array([[0, 7, 2, 9, 3],
[7, 0, 5, 4, 6],
[2, 5, 0, 8, 1],
[9, 4, 8, 0, 5],
[3, 6, 1, 5, 0]])
print(single_linkage_agglomerative_clustering(D, 2)) # [2, 1, 2, 1, 2]
14.3 k均值聚类
【补充解释】
n
l
=
∑
i
=
1
n
I
(
C
(
i
)
=
l
)
n_l=\sum_{i=1}^n I(C(i)=l)
nl=∑i=1nI(C(i)=l)是每个类的样本数,在上式(14.19)中没有出现。另有第
l
l
l个类的均值如下式
x
ˉ
l
=
∑
i
=
1
n
I
(
C
(
i
)
=
l
)
x
i
∑
i
=
1
n
I
(
C
(
i
)
=
l
)
\bar{x}_{l} = \frac{\sum_{i=1}^n I(C(i)=l) x_i}{\sum_{i=1}^n I(C(i)=l)}
xˉl=∑i=1nI(C(i)=l)∑i=1nI(C(i)=l)xi
式14.21的修正与推导
【参考文献】n个不同对象聚类为k个类别有多少种可能性? 李航博士,统计学习方法2nd.公式14.21的修正.
k均值聚类算法(Python实现)
import random
import numpy as np
def k_means_clustering(X, k, random_state=0, max_iter=100):
"""k均值聚类算法
:param X: 样本集
:param k: 聚类数
:param random_state: 随机种子
:param max_iter: 最大迭代次数
:return: 样本集合的聚类C
"""
n_samples = len(X[0])
# 随机选择k个样本点作为初始聚类中心
random.seed(random_state) # 选取随机种子
means = [X[:, i] for i in random.sample(range(n_samples), k)] # 将随机选择样本点作为初始聚类中心
G0 = [[] for _ in range(k)] # 每个初始聚类中心包含的样本点
for _ in range(max_iter):
G1 = [[] for _ in range(k)]
# 对样本进行聚类
for i in range(n_samples):
c0, d0 = -1, float("inf")
for c in range(k):
d = np.sqrt((np.square(X[:, i] - means[c])).sum())
if d < d0:
c0, d0 = c, d
G1[c0].append(i)
# 计算新的类中心
change = False
for c in range(k):
mean = np.average([X[:, i] for i in G1[c]], axis=0)
if not all(np.equal(mean, means[c])):
means[c] = mean
change = True
if not change:
break
G0 = G1
return G0
【测试】例14.2
if __name__ == "__main__":
X = np.array([[0, 0, 1, 5, 5],
[2, 0, 0, 0, 2]])
# 当随机种子(random_state)为1时,随机选择的初始聚类中心刚好与例14.2的解中的初始聚类中心相同
print(k_means_clustering(X, 2, random_state=1)) # [[1, 2, 3], [0, 4]]