[飞桨机器学习]Kmeans算法
一、简介
k均值聚类算法(k-means clustering algorithm)是一种迭代求解的聚类分析算法,其步骤是,预将数据分为K组,则随机选取K个对象作为初始的聚类中心,然后计算每个对象与各个种子聚类中心之间的距离,把每个对象分配给距离它最近的聚类中心。聚类中心以及分配给它们的对象就代表一个聚类。每分配一个样本,聚类的聚类中心会根据聚类中现有的对象被重新计算。这个过程将不断重复直到满足某个终止条件。终止条件可以是没有(或最小数目)对象被重新分配给不同的聚类,没有(或最小数目)聚类中心再发生变化,误差平方和局部最小。
二、算法步骤
(1)从N个数据文档(样本)随机选取K个数据文档作为质心(聚类中心)。
(2)对每个数据文档测量其到每个质心的距离,并把它归到最近的质心的类。
(3)重新计算已经得到的各个类的质心。
(4)迭代(2)~(3)步,直至新的质心与原质心相等或二者之间的距离小于指定阈值,算法结束。
本文的实现以新旧质心之间的距离之和小于0.1(或迭代次数达到10次)作为收敛条件。
三、代码实现
1.导入库
import operator
import csv
import numpy as np
import random
2.数据集的划分
本次数据集划分采用了最简单的holdout方法,将训练集和测试集按七(训)三(测)的比例划分。
import random
import csv
import pandas as pd
def loadDataset(filename, split, trainingSet=None, testSet=None):
    """Randomly split the rows of a CSV file into a training and a test set.

    Every non-empty row is appended to ``trainingSet`` with probability
    ``split`` and to ``testSet`` otherwise (a simple hold-out split).

    Args:
        filename: Path of the CSV file to read.
        split: Probability in [0, 1] that a row goes to the training set.
        trainingSet: Optional list that receives the training rows; a new
            list is created when omitted.
        testSet: Optional list that receives the test rows; a new list is
            created when omitted.

    Returns:
        The ``(trainingSet, testSet)`` pair, i.e. the list objects that were
        passed in (or the freshly created ones).
    """
    # Mutable default arguments are shared between calls, so fresh lists are
    # created here instead of in the signature.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []
    with open(filename, 'r', newline='') as f:
        for row in csv.reader(f):
            # Skip blank lines (such as a trailing empty line) instead of
            # unconditionally dropping the last row of the file.
            if not row:
                continue
            if random.random() < split:  # random assignment to one of the sets
                trainingSet.append(row)
            else:
                testSet.append(row)
    return trainingSet, testSet
if __name__ == "__main__":
    # Hold-out split: each row goes to the training set with probability 0.7
    # and to the test set otherwise.
    train = []
    test = []
    loadDataset('', 0.7, train, test)  # fill in the path of the source data file
    print(train)
    print(test)
    # header=False: the loader used later treats every CSV line as a sample and
    # only strips the leading index column, so no header row may be written.
    train2 = pd.DataFrame(data=train)
    train2.to_csv('', header=False)  # fill in the path of the training CSV
    test2 = pd.DataFrame(data=test)
    test2.to_csv('', header=False)  # fill in the path of the test CSV
我运行时训练集为98条数据,部分示例如下
测试集52条数据,部分示例如下:
3.数据集的读取和划分
删除第一列序号,并把数据读入list。并将标签去掉单独存储。
# Class names; a sample's label is the index of its class name in this list.
names = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
# Integer labels of the training / test samples, filled in by loaddata().
train_lables = []
test_lables = []


def _label_index(label):
    """Return the index of ``label`` in ``names``; unknown names map to the last class."""
    try:
        return names.index(label)
    except ValueError:
        return len(names) - 1


def _read_samples(filename, labels):
    """Read one CSV split, refill ``labels`` in place and return the feature vectors."""
    samples = []
    row_labels = []
    with open(filename, 'r', newline='') as f:
        for row in csv.reader(f):
            if not row:  # tolerate blank lines
                continue
            # Row layout: [row index, feature values..., class name].
            row_labels.append(_label_index(row[-1]))
            samples.append(np.array([float(value) for value in row[1:-1]]))
    # Replace the contents in place so that repeated calls do not accumulate
    # labels and existing references to the list stay valid.
    labels[:] = row_labels
    return samples


def loaddata(filename1, filename2, train_set=None, test_set=None):
    """Load the training and test CSV files.

    The first column of every row (the row index) is dropped, the last column
    (the class name) is converted to an integer label and stored in the
    module-level ``train_lables`` / ``test_lables`` lists, and the remaining
    columns become a float NumPy vector.

    Args:
        filename1: Path of the training CSV file.
        filename2: Path of the test CSV file.
        train_set: Ignored; kept only for backward compatibility.
        test_set: Ignored; kept only for backward compatibility.

    Returns:
        ``(train_set, test_set)``: two lists of 1-D NumPy float arrays.

    Raises:
        ValueError: If a feature column cannot be converted to ``float``.
    """
    train_set = _read_samples(filename1, train_lables)
    test_set = _read_samples(filename2, test_lables)
    return train_set, test_set
4.计算聚类中心坐标和相应的数据
def get_core(core, k, data=None):
    """Assign every sample to its nearest centre and recompute the centres.

    Args:
        core: Sequence of ``k`` current cluster centres (NumPy vectors).
        k: Number of clusters.
        data: Optional sequence of sample vectors. When omitted, the
            module-level ``train_set`` is used.

    Returns:
        ``(newcore, newcoredata)`` where ``newcoredata[c]`` lists the indices
        of the samples assigned to cluster ``c`` and ``newcore[c]`` is the
        mean of those samples. A cluster that received no sample keeps its
        previous centre.
    """
    if data is None:
        data = train_set
    newcoredata = [[] for _ in range(k)]
    for idx, sample in enumerate(data):
        distance = [np.linalg.norm(sample - core[j]) for j in range(k)]
        # argmin returns the first minimum, so ties go to the lowest cluster.
        newcoredata[int(np.argmin(distance))].append(idx)
    newcore = []
    for j, members in enumerate(newcoredata):
        if members:
            newcore.append(np.mean([data[m] for m in members], axis=0))
        else:
            # Dividing by an empty cluster's size would yield NaN centres;
            # keep the old centre instead.
            newcore.append(np.array(core[j], dtype=float))
    return newcore, newcoredata
5.kmeans计算
def kmeans(k, max_iter=10, tol=0.1):
    """Cluster the module-level ``train_set`` into ``k`` groups with k-means.

    ``k`` distinct training samples are drawn at random as initial centres.
    The centres are then refined with ``get_core`` until the summed distance
    between old and new centres drops below ``tol`` or ``max_iter`` iterations
    have run. The initial centres and every iteration's result are printed.

    Args:
        k: Number of clusters.
        max_iter: Maximum number of refinement iterations.
        tol: Convergence threshold on the summed centre displacement.

    Returns:
        ``(now_core, newcoredata)``: the list of cluster centres and, per
        cluster, the list of training-sample indices from the last
        assignment step.

    Raises:
        ValueError: If ``k`` exceeds the number of training samples.
    """
    # random.sample yields k distinct, in-range indices. random.randint has an
    # inclusive upper bound, so randint(0, len(train_set)) could index one
    # past the end.
    seeds = random.sample(range(len(train_set)), k)
    now_core = [train_set[i] for i in seeds]
    print(now_core)
    newcoredata = [[] for _ in range(k)]
    for _ in range(max_iter):
        new_core, newcoredata = get_core(now_core, k)
        print(new_core, newcoredata)
        dis = sum(np.linalg.norm(new - old) for new, old in zip(new_core, now_core))
        if dis < tol:
            break
        now_core = new_core
    return now_core, newcoredata
6.获取聚类对应的标签
def get_response(neighbors):
    """Return the label that occurs most often in ``neighbors``.

    When several labels share the highest count, the one that appears first
    in ``neighbors`` wins. An empty ``neighbors`` raises ``IndexError``.
    """
    tally = {}
    for label in neighbors:
        tally[label] = tally.get(label, 0) + 1
    # sorted() is stable, so equally frequent labels keep first-seen order.
    ranking = sorted(tally.items(), key=lambda item: item[1], reverse=True)
    return ranking[0][0]
7.预测
def predection(test_data, core_label, centers=None):
    """Predict a class label for every sample from its nearest cluster centre.

    Args:
        test_data: Sequence of sample vectors (NumPy arrays).
        core_label: ``core_label[j]`` is the class label assigned to centre
            ``j``.
        centers: Optional sequence of cluster centres. When omitted, the
            module-level ``core`` is used, so existing two-argument calls
            keep working.

    Returns:
        A list with one predicted label per sample in ``test_data``.
    """
    if centers is None:
        centers = core
    result = []
    for sample in test_data:
        distance = [np.linalg.norm(sample - centre) for centre in centers]
        # index(min(...)) picks the first centre among equally close ones.
        result.append(core_label[distance.index(min(distance))])
    return result
8.主程序
if __name__ == "__main__":
    k = 3
    train_set, test_set = loaddata('iris_train.csv', 'iris_test.csv')
    core, data = kmeans(k)
    # Label every cluster with the majority class of its training members.
    core_label = [
        get_response([train_lables[j] for j in data[i]])
        for i in range(k)
    ]
    print(core_label)
    result = predection(test_set, core_label)
    print(test_lables)
    print(result)
    # Accuracy: share of test samples whose prediction equals the true label.
    acc_num = sum(
        1 for idx in range(len(test_lables)) if result[idx] == test_lables[idx]
    )
    print(acc_num/len(test_lables))
9.结果展示
准确率约为88.5%
本人正在学习人工智能方向,日常更新一些学习记录,欢迎互相交流
欢迎三连,互粉,等你呦
import operator
import csv
import numpy as np
import random
# Class names; a sample's label is the index of its class name in this list.
names = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
# Integer labels of the training / test samples, filled in by loaddata().
train_lables = []
test_lables = []


def _label_index(label):
    """Return the index of ``label`` in ``names``; unknown names map to the last class."""
    try:
        return names.index(label)
    except ValueError:
        return len(names) - 1


def _read_samples(filename, labels):
    """Read one CSV split, refill ``labels`` in place and return the feature vectors."""
    samples = []
    row_labels = []
    with open(filename, 'r', newline='') as f:
        for row in csv.reader(f):
            if not row:  # tolerate blank lines
                continue
            # Row layout: [row index, feature values..., class name].
            row_labels.append(_label_index(row[-1]))
            samples.append(np.array([float(value) for value in row[1:-1]]))
    # Replace the contents in place so that repeated calls do not accumulate
    # labels and existing references to the list stay valid.
    labels[:] = row_labels
    return samples


def loaddata(filename1, filename2, train_set=None, test_set=None):
    """Load the training and test CSV files.

    The first column of every row (the row index) is dropped, the last column
    (the class name) is converted to an integer label and stored in the
    module-level ``train_lables`` / ``test_lables`` lists, and the remaining
    columns become a float NumPy vector.

    Args:
        filename1: Path of the training CSV file.
        filename2: Path of the test CSV file.
        train_set: Ignored; kept only for backward compatibility.
        test_set: Ignored; kept only for backward compatibility.

    Returns:
        ``(train_set, test_set)``: two lists of 1-D NumPy float arrays.

    Raises:
        ValueError: If a feature column cannot be converted to ``float``.
    """
    train_set = _read_samples(filename1, train_lables)
    test_set = _read_samples(filename2, test_lables)
    return train_set, test_set
def get_core(core, k, data=None):
    """Assign every sample to its nearest centre and recompute the centres.

    Args:
        core: Sequence of ``k`` current cluster centres (NumPy vectors).
        k: Number of clusters.
        data: Optional sequence of sample vectors. When omitted, the
            module-level ``train_set`` is used.

    Returns:
        ``(newcore, newcoredata)`` where ``newcoredata[c]`` lists the indices
        of the samples assigned to cluster ``c`` and ``newcore[c]`` is the
        mean of those samples. A cluster that received no sample keeps its
        previous centre.
    """
    if data is None:
        data = train_set
    newcoredata = [[] for _ in range(k)]
    for idx, sample in enumerate(data):
        distance = [np.linalg.norm(sample - core[j]) for j in range(k)]
        # argmin returns the first minimum, so ties go to the lowest cluster.
        newcoredata[int(np.argmin(distance))].append(idx)
    newcore = []
    for j, members in enumerate(newcoredata):
        if members:
            newcore.append(np.mean([data[m] for m in members], axis=0))
        else:
            # Dividing by an empty cluster's size would yield NaN centres;
            # keep the old centre instead.
            newcore.append(np.array(core[j], dtype=float))
    return newcore, newcoredata
def kmeans(k, max_iter=10, tol=0.1):
    """Cluster the module-level ``train_set`` into ``k`` groups with k-means.

    ``k`` distinct training samples are drawn at random as initial centres.
    The centres are then refined with ``get_core`` until the summed distance
    between old and new centres drops below ``tol`` or ``max_iter`` iterations
    have run. The initial centres and every iteration's result are printed.

    Args:
        k: Number of clusters.
        max_iter: Maximum number of refinement iterations.
        tol: Convergence threshold on the summed centre displacement.

    Returns:
        ``(now_core, newcoredata)``: the list of cluster centres and, per
        cluster, the list of training-sample indices from the last
        assignment step.

    Raises:
        ValueError: If ``k`` exceeds the number of training samples.
    """
    # random.sample yields k distinct, in-range indices. random.randint has an
    # inclusive upper bound, so randint(0, len(train_set)) could index one
    # past the end.
    seeds = random.sample(range(len(train_set)), k)
    now_core = [train_set[i] for i in seeds]
    print(now_core)
    newcoredata = [[] for _ in range(k)]
    for _ in range(max_iter):
        new_core, newcoredata = get_core(now_core, k)
        print(new_core, newcoredata)
        dis = sum(np.linalg.norm(new - old) for new, old in zip(new_core, now_core))
        if dis < tol:
            break
        now_core = new_core
    return now_core, newcoredata
def get_response(neighbors):
    """Return the label that occurs most often in ``neighbors``.

    When several labels share the highest count, the one that appears first
    in ``neighbors`` wins. An empty ``neighbors`` raises ``IndexError``.
    """
    tally = {}
    for label in neighbors:
        tally[label] = tally.get(label, 0) + 1
    # sorted() is stable, so equally frequent labels keep first-seen order.
    ranking = sorted(tally.items(), key=lambda item: item[1], reverse=True)
    return ranking[0][0]
def predection(test_data, core_label, centers=None):
    """Predict a class label for every sample from its nearest cluster centre.

    Args:
        test_data: Sequence of sample vectors (NumPy arrays).
        core_label: ``core_label[j]`` is the class label assigned to centre
            ``j``.
        centers: Optional sequence of cluster centres. When omitted, the
            module-level ``core`` is used, so existing two-argument calls
            keep working.

    Returns:
        A list with one predicted label per sample in ``test_data``.
    """
    if centers is None:
        centers = core
    result = []
    for sample in test_data:
        distance = [np.linalg.norm(sample - centre) for centre in centers]
        # index(min(...)) picks the first centre among equally close ones.
        result.append(core_label[distance.index(min(distance))])
    return result
if __name__ == "__main__":
    k = 3
    train_set, test_set = loaddata('data/data42984/iris_train.csv', 'data/data42984/iris_test.csv')
    core, data = kmeans(k)
    # Label every cluster with the majority class of its training members.
    core_label = [
        get_response([train_lables[j] for j in data[i]])
        for i in range(k)
    ]
    print(core_label)
    result = predection(test_set, core_label)
    print(test_lables)
    print(result)
    # Accuracy: share of test samples whose prediction equals the true label.
    acc_num = sum(
        1 for idx in range(len(test_lables)) if result[idx] == test_lables[idx]
    )
    print(acc_num/len(test_lables))
[array([6.6, 2.9, 4.6, 1.3]), array([6.3, 3.3, 4.7, 1.6]), array([5.7, 4.4, 1.5, 0.4])]
[array([6.43157895, 2.71052632, 4.87368421, 1.52631579]), array([6.24146341, 3.04878049, 5.02926829, 1.8195122 ]), array([5.03684211, 3.45263158, 1.45 , 0.23684211])] [[39, 40, 41, 44, 48, 51, 52, 53, 55, 59, 60, 64, 68, 75, 77, 84, 85, 86, 95], [38, 42, 43, 45, 46, 47, 49, 50, 54, 56, 57, 58, 61, 62, 63, 65, 66, 67, 69, 70, 71, 72, 73, 74, 76, 78, 79, 80, 81, 82, 83, 87, 88, 89, 90, 91, 92, 93, 94, 96, 97], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]]
[array([6.01785714, 2.775 , 4.42142857, 1.39642857]), array([6.55 , 3.0875 , 5.46875 , 2.015625]), array([5.03684211, 3.45263158, 1.45 , 0.23684211])] [[38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 51, 52, 53, 54, 55, 58, 59, 60, 61, 62, 63, 64, 65, 68, 78, 84, 85, 95], [43, 49, 50, 56, 57, 66, 67, 69, 70, 71, 72, 73, 74, 75, 76, 77, 79, 80, 81, 82, 83, 86, 87, 88, 89, 90, 91, 92, 93, 94, 96, 97], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]]
[array([5.9 , 2.84545455, 4.39393939, 1.46969697]), array([6.79259259, 3.05925926, 5.6962963 , 2.04074074]), array([5.03684211, 3.45263158, 1.45 , 0.23684211])] [[38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 71, 76, 78, 81, 89, 92], [39, 66, 67, 68, 69, 70, 72, 73, 74, 75, 77, 79, 80, 82, 83, 84, 85, 86, 87, 88, 90, 91, 93, 94, 95, 96, 97], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]]
[array([5.92222222, 2.83888889, 4.45 , 1.49166667]), array([6.87083333, 3.09583333, 5.775 , 2.07916667]), array([5.03684211, 3.45263158, 1.45 , 0.23684211])] [[38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 71, 76, 78, 81, 84, 89, 92, 95, 97], [39, 66, 67, 68, 69, 70, 72, 73, 74, 75, 77, 79, 80, 82, 83, 85, 86, 87, 88, 90, 91, 93, 94, 96], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]]
[array([5.92222222, 2.83888889, 4.45 , 1.49166667]), array([6.87083333, 3.09583333, 5.775 , 2.07916667]), array([5.03684211, 3.45263158, 1.45 , 0.23684211])] [[38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 71, 76, 78, 81, 84, 89, 92, 95, 97], [39, 66, 67, 68, 69, 70, 72, 73, 74, 75, 77, 79, 80, 82, 83, 85, 86, 87, 88, 90, 91, 93, 94, 96], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]]
[1, 2, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2]
0.8846153846153846
运行代码请点击:https://aistudio.baidu.com/aistudio/projectdetail/619369?shared=1
欢迎三连