14.1.1 相似度或距离

计算马哈拉诺比斯距离矩阵(Python实现)

import numpy as np

def mahalanobis_distance(X):
    """计算所有样本之间的马哈拉诺比斯距离矩阵"""
    n_samples = len(X[0])

    # 计算协方差矩阵
    S = np.cov(X)

    # 计算协方差矩阵的逆矩阵
    S = np.linalg.inv(S)

    # 构造马哈拉诺比斯距离矩阵
    D = np.zeros((n_samples, n_samples))  # 初始化为零矩阵

    for i in range(n_samples):
        for j in range(i + 1, n_samples):
            xi = X[:, i][:, np.newaxis]
            xj = X[:, j][:, np.newaxis]
            D[i][j] = D[j][i] = np.sqrt((np.dot(np.dot((xi - xj).T, S), (xi - xj))))

    return D

【测试】例14.2

if __name__ == "__main__":
    X = np.array([[0, 0, 1, 5, 5],
                  [2, 0, 0, 0, 2]])
    
    print(mahalanobis_distance(X))
    
    # [[0.         1.83604716 1.91649575 2.81058198 1.94257172]
    #  [1.83604716 0.         0.38851434 1.94257172 2.52783249]
    #  [1.91649575 0.38851434 0.         1.55405738 2.27648631]
    #  [2.81058198 1.94257172 1.55405738 0.         1.83604716]
    #  [1.94257172 2.52783249 2.27648631 1.83604716 0.        ]]

计算相关系数矩阵(Python实现)

import numpy as np

def correlation_coefficient(X):
    """计算所有样本之间的相关系数矩阵"""
    n_samples = len(X[0])

    # 计算均值
    means = np.mean(X, axis=0)

    # 计算误差平方和
    variance = [np.square((X[:, i] - means[i])).sum() for i in range(n_samples)]

    # 构造相关系数矩阵
    D = np.identity(n_samples)  # 初始化为单位矩阵

    for i in range(n_samples):
        for j in range(i + 1, n_samples):
            xi, xj = X[:, i], X[:, j]
            numerator = ((xi - means[i]) * (xj - means[j])).sum()
            denominator = np.sqrt(variance[i] * variance[j])
            if denominator:
                D[i][j] = D[j][i] = numerator / denominator
            else:  # 当出现零方差时
                D[i][j] = D[j][i] = np.nan

    return D

【测试】例14.2(第2个样本为零方差)

if __name__ == "__main__":
    X = np.array([[0, 0, 1, 5, 5],
                  [2, 0, 0, 0, 2]])

    print(correlation_coefficient(X))
    
    # [[ 1. nan -1. -1. -1.]
    #  [nan  1. nan nan nan]
    #  [-1. nan  1.  1.  1.]
    #  [-1. nan  1.  1.  1.]
    #  [-1. nan  1.  1.  1.]]

计算夹角余弦矩阵(Python实现)

import numpy as np

def cosine(X):
    """计算所有样本之间的夹角余弦矩阵"""
    n_samples = len(X[0])

    # 计算平方和
    square = [np.square(X[:, i]).sum() for i in range(n_samples)]

    # 构造夹角余弦矩阵
    D = np.identity(n_samples)  # 初始化为单位矩阵

    for i in range(n_samples):
        for j in range(i + 1, n_samples):
            xi, xj = X[:, i], X[:, j]
            numerator = (xi * xj).sum()
            denominator = np.sqrt(square[i] * square[j])
            if denominator:
                D[i][j] = D[j][i] = numerator / denominator
            else:  # 当出现零平方和时
                D[i][j] = D[j][i] = np.nan

    return D

【测试】例14.2(第2个样本为零平方和)

if __name__ == "__main__":
    X = np.array([[0, 0, 1, 5, 5],
                  [2, 0, 0, 0, 2]])

    print(cosine(X))

    # [[1.                nan 0.         0.         0.37139068]
    #  [       nan 1.                nan        nan        nan]
    #  [0.                nan 1.         1.         0.92847669]
    #  [0.                nan 1.         1.         0.92847669]
    #  [0.37139068        nan 0.92847669 0.92847669 1.        ]]

14.1.2 类或簇

计算类的样本散布矩阵与样本协方差矩阵(Python实现)

import numpy as np

def scatter_matrix(X, G):
    """计算样本散布矩阵

    :param X: 样本
    :param G: 类别包含的样本
    :return: 样本散步矩阵
    """
    n_samples = len(G)
    n_features = len(X)

    # 计算类的中心
    means = np.mean(X[:, G], axis=1)

    A = np.zeros((n_features, n_features))
    for i in range(n_samples):
        A += np.dot((X[:, i] - means)[:, np.newaxis], (X[:, i] - means)[:, np.newaxis].T)

    return A

def covariance_matrix(X, G):
    """计算样本协方差矩阵"""
    n_features = len(X)
    A = scatter_matrix(X, G)
    S = A / (n_features - 1)
    return S

【测试】例14.2(将第3个、第4个和第5个样本视作一个类;因为只有2个特征,所以样本散布矩阵与协方差矩阵一致)

if __name__ == "__main__":
    X = np.array([[0, 0, 1, 5, 5],
                  [2, 0, 0, 0, 2]])

    print(scatter_matrix(X, [2, 3, 4]))
    # [[34.         -0.66666667]
    #  [-0.66666667  2.66666667]]
    
   	print(covariance_matrix(X, [2, 3, 4]))
    # [[34.         -0.66666667]
    #  [-0.66666667  2.66666667]]

14.2 层次聚类

算法14.1的聚合聚类算法(原生Python实现)

合并规则:类间距离最小

类间距离:最短距离

停止条件:类的个数为 k k k

import numpy as np

class DSU:
    def __init__(self, n: int):
        self._n = n
        self._array = [i for i in range(n)]
        self._size = [1] * n
        self._group_num = n

    def find(self, i: int) -> int:
        """查询i所在的连通分支:O(1)"""
        if self._array[i] != i:
            self._array[i] = self.find(self._array[i])
        return self._array[i]

    def union(self, i: int, j: int) -> bool:
        """合并i和j所属的连通分支:O(1)"""
        i, j = self.find(i), self.find(j)
        if i != j:
            self._group_num -= 1
            if self._size[i] >= self._size[j]:
                self._array[j] = i
                self._size[i] += self._size[j]
            else:
                self._array[i] = j
                self._size[j] += self._size[i]
            return True
        else:
            return False

    def is_connected(self, i: int, j: int) -> bool:
        return self.find(i) == self.find(j)

    @property
    def array(self) -> list:
        return self._array

    @property
    def group_num(self) -> int:
        """计算连通分支数量:O(1)"""
        return self._group_num

    @property
    def max_group_size(self) -> int:
        """计算最大连通分支包含的数量:O(N)"""
        import collections
        return max(collections.Counter(self._array).values())

def single_linkage_agglomerative_clustering(D, k):
    """聚合聚类算法:合并规则为类间距离最小,类间距离为最短距离,停止条件为类的个数是k
    时间复杂度:O(N^2 log(N^2))

    :param D: 样本之间的距离矩阵
    :param k: 停止时类的个数
    :return: 每个样本所属的类别
    """
    n_samples = len(D)

    # 排序所有的样本
    sorted_distance = []
    for i in range(n_samples):
        for j in range(i + 1, n_samples):
            sorted_distance.append((D[i][j], i, j))
    sorted_distance.sort()

    # 初始化并查集
    dsu = DSU(n_samples)
    for d, i, j in sorted_distance:
        if dsu.find(i) != dsu.find(j):
            dsu.union(i, j)
            if dsu.group_num == k:
                break

    return dsu.array

【测试】例14.1(将停止条件改为类的数量为 k k k

if __name__ == "__main__":
    D = np.array([[0, 7, 2, 9, 3],
                  [7, 0, 5, 4, 6],
                  [2, 5, 0, 8, 1],
                  [9, 4, 8, 0, 5],
                  [3, 6, 1, 5, 0]])

    print(single_linkage_agglomerative_clustering(D, 2))  # [2, 1, 2, 1, 2]

14.3 k均值聚类

【补充解释】 n l = ∑ i = 1 n I ( C ( i ) = l ) n_l=\sum_{i=1}^n I(C(i)=l) nl=i=1nI(C(i)=l)是每个类的样本数,在上式(14.19)中没有出现。另有第 l l l个类的均值如下式
x ˉ l = ∑ i = 1 n I ( C ( i ) = l ) x i ∑ i = 1 n I ( C ( i ) = l ) \bar{x}_{l} = \frac{\sum_{i=1}^n I(C(i)=l) x_i}{\sum_{i=1}^n I(C(i)=l)} xˉl=i=1nI(C(i)=l)i=1nI(C(i)=l)xi

式14.21的修正与推导

【参考文献】n个不同对象聚类为k个类别有多少种可能性? 李航博士,统计学习方法2nd.公式14.21的修正.

k均值聚类算法(Python实现)

import random
import numpy as np

def k_means_clustering(X, k, random_state=0, max_iter=100):
    """k均值聚类算法

    :param X: 样本集
    :param k: 聚类数
    :param random_state: 随机种子
    :param max_iter: 最大迭代次数
    :return: 样本集合的聚类C
    """
    n_samples = len(X[0])

    # 随机选择k个样本点作为初始聚类中心
    random.seed(random_state)  # 选取随机种子
    means = [X[:, i] for i in random.sample(range(n_samples), k)]  # 将随机选择样本点作为初始聚类中心
    G0 = [[] for _ in range(k)]  # 每个初始聚类中心包含的样本点

    for _ in range(max_iter):
        G1 = [[] for _ in range(k)]

        # 对样本进行聚类
        for i in range(n_samples):
            c0, d0 = -1, float("inf")
            for c in range(k):
                d = np.sqrt((np.square(X[:, i] - means[c])).sum())
                if d < d0:
                    c0, d0 = c, d
            G1[c0].append(i)

        # 计算新的类中心
        change = False
        for c in range(k):
            mean = np.average([X[:, i] for i in G1[c]], axis=0)
            if not all(np.equal(mean, means[c])):
                means[c] = mean
                change = True

        if not change:
            break

        G0 = G1

    return G0

【测试】例14.2

if __name__ == "__main__":
    X = np.array([[0, 0, 1, 5, 5],
                  [2, 0, 0, 0, 2]])

    # 当随机种子(random_state)为1时,随机选择的初始聚类中心刚好与例14.2的解中的初始聚类中心相同
    print(k_means_clustering(X, 2, random_state=1))  # [[1, 2, 3], [0, 4]]