这个例子与改进约会网站配对效果的那个例子类似:先把每个手写数字转换成 32×32 像素的黑白图像,再转换成由 0 和 1 组成的字符图,然后把全部 1024 个像素点展开保存为一个 1×1024 的行向量,这样就可以用 kNN 计算欧几里得距离,找出最接近的训练样本,从而得到识别结果。



import operator
import os
from collections import Counter

import numpy as np
from numpy import *  # noqa: F401,F403 - kept for code that relies on bare NumPy names
4
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest samples.

    Distances are Euclidean. Inputs are converted to floating point before
    subtracting so that unsigned integer data (e.g. ``uint8`` pixels) cannot
    wrap around and produce bogus distances.

    Args:
        inX: Feature vector to classify, shaped ``(d,)`` or ``(1, d)``.
        dataSet: Training samples, one per row, shaped ``(n, d)``.
        labels: Sequence of ``n`` hashable labels; ``labels[i]`` belongs to
            row ``i`` of ``dataSet``.
        k: Number of nearest neighbours that vote; must satisfy
            ``1 <= k <= n``.

    Returns:
        The label with the most votes. When several labels tie, the one whose
        first voter is closest to ``inX`` wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            training samples.
    """
    samples = np.asarray(dataSet, dtype=float)
    query = np.asarray(inX, dtype=float)

    sample_count = samples.shape[0]
    if not 1 <= k <= sample_count:
        raise ValueError(
            'k must be between 1 and the number of training samples (%d), got %r'
            % (sample_count, k)
        )

    # Broadcasting subtracts the query from every row without materialising
    # a tiled copy of it.
    diff = samples - query
    distances = np.sqrt((diff ** 2).sum(axis=1))  # axis=1: sum across each row

    # A stable sort keeps equidistant samples in their dataSet order, which
    # makes the outcome reproducible.
    nearest = distances.argsort(kind='stable')[:k]

    # Counter remembers insertion order, so among equally common labels
    # most_common() returns the one seen first, i.e. the nearest one.
    votes = Counter(labels[index] for index in nearest)
    return votes.most_common(1)[0][0]
18
19
20 def img2vector(filename):
21 f = open(filename)
22 returnVect = zeros((1,1024))
23 for i in range(32):
24 line = f.readline()
25 for j in range(32):
26 returnVect[0,i*32+j] = int(line[j])
27 return returnVect
28
29
def _digit_label(fileName):
    """Return the integer found before the first underscore of ``fileName``."""
    return int(fileName.split('.')[0].split('_')[0])


def handwritingClassTest(trainingDir='trainingDigits', testDir='testDigits', k=3):
    """Evaluate the kNN digit classifier and print its error rate.

    Every file in ``trainingDir`` is loaded with ``img2vector`` into one row
    of the training matrix. Its label is the integer before the first
    underscore in the file name (for example ``3_12.txt`` gives 3). Each file
    in ``testDir`` is then classified with ``classify0`` and compared with
    the label taken from its own file name.

    Args:
        trainingDir: Directory holding the training images.
        testDir: Directory holding the images to classify.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        The fraction of test files that were misclassified, or ``0.0`` when
        ``testDir`` contains no files.

    Raises:
        ValueError: If a file name does not start with an integer label.
    """
    # Sort the listings: os.listdir() returns entries in arbitrary order,
    # and a fixed order makes runs reproducible.
    training_files = sorted(os.listdir(trainingDir))
    training_mat = np.zeros((len(training_files), 1024))
    hw_labels = []
    for row, file_name in enumerate(training_files):
        hw_labels.append(_digit_label(file_name))
        training_mat[row, :] = img2vector(os.path.join(trainingDir, file_name))

    test_files = sorted(os.listdir(testDir))
    error_count = 0
    for file_name in test_files:
        real_number = _digit_label(file_name)
        test_vector = img2vector(os.path.join(testDir, file_name))
        test_result = classify0(test_vector, training_mat, hw_labels, k)
        if test_result != real_number:
            error_count += 1
        # NOTE(review): the source's indentation was lost, so it cannot be
        # determined whether this line ran for every sample or only for
        # misclassified ones; it is emitted for every sample here - confirm.
        print('The classifier came back with: %d, the real answer is: %d' % (test_result, real_number))

    # Guard against an empty test directory instead of dividing by zero.
    error_rate = error_count / float(len(test_files)) if test_files else 0.0
    print('错误率为%f' % error_rate)
    return error_rate
54
if __name__ == '__main__':
    # Run the evaluation only when this file is executed as a script,
    # not when it is imported as a module.
    handwritingClassTest()


《机器学习实战》之k-近邻算法(手写识别系统)_数字转换