def majorityCnt(classList):
    """Return the label that occurs most often in ``classList``.

    Counts how many times each distinct value appears in the label column
    and returns the value with the highest count.

    Args:
        classList: Iterable of hashable class labels.

    Returns:
        The most frequent label. When several labels share the highest
        count, the one encountered first in ``classList`` is returned.

    Raises:
        IndexError: If ``classList`` is empty.
    """
    # Function-scope import keeps this block self-contained; the file's
    # top-level import section is not visible from here.
    from collections import Counter

    # most_common(1) yields [(label, count)]; indexing an empty result
    # raises IndexError, matching the previous sorted(...)[0][0] behaviour.
    return Counter(classList).most_common(1)[0][0]



def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dictionaries.

    Args:
        dataSet: Non-empty list of samples. Each sample is a sequence whose
            last element is the class label; the preceding elements are
            feature values.
        labels: Feature names, indexed by feature column. This list is
            read only; it is never modified.

    Returns:
        Either a class label (leaf) or a dict of the form
        ``{feature_name: {feature_value: subtree, ...}}``.

    Raises:
        IndexError: If ``dataSet`` is empty.
    """
    classList = [example[-1] for example in dataSet]

    # Stop condition 1: every sample carries the same label, so the
    # remaining feature columns no longer matter.
    if classList.count(classList[0]) == len(classList):
        return classList[0]

    # Stop condition 2: each sample has a single element left (only the
    # label column), so fall back to the most frequent label.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)

    # Index of the feature column selected by chooseBestFeatureToSplit
    # (defined elsewhere in the file).
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}

    # Remove the chosen feature name from a private copy. Deleting from
    # ``labels`` directly would corrupt the caller's list, which is still
    # needed after the tree has been built.
    subLabels = list(labels)
    del subLabels[bestFeat]

    # Distinct values taken by the chosen feature column.
    uniqueVals = {example[bestFeat] for example in dataSet}
    for value in uniqueVals:
        # NOTE(review): splitDataSet is defined elsewhere; presumably it
        # returns the samples whose bestFeat column equals ``value`` with
        # that column removed -- confirm against its definition.
        # Sharing subLabels across branches is safe because the recursive
        # call never mutates its ``labels`` argument.
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels
        )
    return myTree
# Demo: build a decision tree from the sample data and print it.
# createDataSet is defined elsewhere in the file; its result is unpacked
# into the sample list and the list of feature names.
myDat, labels = createDataSet()
myTree = createTree(myDat, labels)
print(myTree)