导入数据

import pandas as pd

# Load the sample CSV and keep its values as a NumPy array for later cells.
df = pd.read_csv(
    'D:/研/数学基础/统计分析/统计分析/聚类分析/datasets/ch1ex1.csv'
)
points = df.to_numpy()
df.head()

K-means聚类实验_聚类

散点图观察

# Inspect the data with a scatter plot of column 0 against column 1.
import matplotlib.pyplot as plt

xs, ys = points[:, 0], points[:, 1]
plt.scatter(xs, ys)
plt.show()

K-means聚类实验_统计分析_02

聚类操作

# Clustering step: fit a 3-cluster k-means model and label every point.
from sklearn.cluster import KMeans

df = pd.read_csv(
    'D:/研/数学基础/统计分析/统计分析/聚类分析/datasets/ch1ex1.csv'
)
points = df.to_numpy()

model = KMeans(n_clusters=3)
# fit() returns the estimator itself, so the prediction can be chained.
labels = model.fit(points).predict(points)
labels

K-means聚类实验_聚类_03

聚类中心

# Cluster centres: pull them from the fitted model and split into x / y.
centroids = model.cluster_centers_
centroids_x, centroids_y = centroids[:, 0], centroids[:, 1]

# Draw the points coloured by their cluster label, then the centres on top
# as large 'X' markers.
plt.scatter(xs, ys, c=labels)
plt.scatter(centroids_x, centroids_y, marker='X', s=200)
plt.show()

K-means聚类实验_python_04

K值对结果的影响

import pandas as pd

seeds_df = pd.read_csv(
    'D:/研/数学基础/统计分析/统计分析/聚类分析/datasets/seeds.csv'
)

# Take the 'grain_variety' column out of the frame and keep it as a list;
# pop() returns the column and removes it in one step.
varieties = list(seeds_df.pop('grain_variety'))

seeds_df.head()

K-means聚类实验_统计分析_05

# Elbow analysis: fit k-means for k = 1..5 on the seeds samples and plot
# the resulting inertia against k.
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

samples = seeds_df.values

ks = range(1, 6)
inertias = []

for k in ks:
    # Create a KMeans instance with k clusters: model.
    # n_init is pinned explicitly: its default changed from 10 to 'auto'
    # (a single k-means++ run) in scikit-learn 1.4, which makes the curve
    # depend on the installed version and noisier from run to run.
    model = KMeans(n_clusters=k, n_init=10)

    # Fit model to samples
    model.fit(samples)

    # Append the inertia to the list of inertias
    inertias.append(model.inertia_)

# Plot ks vs inertias
plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()

K-means聚类实验_统计分析_06


K-means的结果带有一定的随机性:初始聚类中心是随机选取的,下面的代码没有固定random_state,因此每次运行得到的标签编号和交叉表可能不同。

# Cluster the seeds samples into three groups; no random_state is set.
model = KMeans(n_clusters=3)
labels = model.fit_predict(samples)

# Cross-tabulate the cluster labels against the varieties list.
df = pd.DataFrame(
    {
        'labels': labels,
        'varieties': varieties,
    }
)
ct = pd.crosstab(index=df['labels'], columns=df['varieties'])
ct

K-means聚类实验_python_07

使用make_pipeline把标准化(StandardScaler)和KMeans聚类串成一个流水线,更方便

import pandas as pd

df = pd.read_csv(
    'D:/研/数学基础/统计分析/统计分析/聚类分析/datasets/fish.csv'
)

# Take the 'species' column out of the frame and keep it as a list;
# pop() returns the column and removes it in one step.
species = list(df.pop('species'))

df.head()

K-means聚类实验_python_08

from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

samples = df.to_numpy()

# Chain a StandardScaler and a 4-cluster KMeans into a single estimator.
scaler = StandardScaler()
kmeans = KMeans(n_clusters=4)
pipeline = make_pipeline(scaler, kmeans)

# fit() returns the pipeline itself, so the prediction can be chained.
labels = pipeline.fit(samples).predict(samples)

# Cross-tabulate the cluster labels against the species list.
df = pd.DataFrame(
    {
        'labels': labels,
        'species': species,
    }
)
ct = pd.crosstab(index=df['labels'], columns=df['species'])
ct

K-means聚类实验_kmeans_09