"#测试gini\n",
"gini=calGini((l,r),classLabels)\n",
"print(gini)\n"
]
},
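{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal hand-checkable sketch (added for illustration; the tiny rows are\n",
"# made up). Assuming calGini returns the size-weighted Gini of the groups,\n",
"# a pure left group and an even 'R'/'M' right group should give\n",
"# 0.5*0.0 + 0.5*0.5 = 0.25.\n",
"tinyLeft=[[0.1,'R'],[0.2,'R']]\n",
"tinyRight=[[0.3,'R'],[0.4,'M']]\n",
"print(calGini((tinyLeft,tinyRight),['R','M']))  # expected: 0.25\n"
]
},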
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"def getBestSplit(dataSet,featureNumbers):\n",
"    '''\n",
"    对于一个数据集,选择featureNumber个特征进行简单划分,得到最好的特征和划分结果\n",
"    args:\n",
"      dataSet:数据集,类型:list\n",
"      featureNumbers:选择的特征值数,类型:int\n",
"      classLabels:所有分类,类型:list\n",
"    ''' \n",
"    \n",
"    #样本数\n",
"    m=len(dataSet)\n",
"    if m==0:\n",
"        return None\n",
"    #样本特征值数+1(因为最后有一个标签)\n",
"    totalColumnNumber=len(dataSet[0])\n",
"    #随机选择的特征索引\n",
"    randomSelectedFeatures=[]\n",
"    \n",
"    \n",
"    \n",
"    #选择数目必须在特征数目范围内\n",
"    if totalColumnNumber-1>=featureNumbers:        \n",
"        #借助这个变量防止选择重复的特征进入\n",
"        indexList=list(range(totalColumnNumber-1))            \n",
"        for j in range(featureNumbers):\n",
"            #索引序列长度\n",
"            leftSize=len(indexList)\n",
"            #随机数\n",
"            randIndex=random.randrange(leftSize)\n",
"            #索引学列随机数处数据弹出,放入选择特征列表\n",
"            origIndex=indexList.pop(randIndex)\n",
"            #存入的是原始数据特征索引\n",
"            randomSelectedFeatures.append(origIndex)\n",
"    else:\n",
"        randomSelectedFeatures=range(totalColumnNumber-1)#特征全部被选择\n",
"    \n",
"    \n",
"   # print(\"current select features\")\n",
"   # print(randomSelectedFeatures)\n",
"\n",
"    #当前数据集的标签序列\n",
"    class_values=list(set(item[-1] for item in dataSet))\n",
"    \n",
"    #对于每个特征以及每个特征值进行简单划分\n",
"    #保留最小的基尼系数\n",
"    minGini=9999\n",
"    #存入最好的信息\n",
"    bestInfor={}\n",
"    #外层循环,对于每个特征\n",
"    for index in randomSelectedFeatures:\n",
"        #内层循环对于每个特征值\n",
"        tempFeatureValueList=list(set(item[index] for item in dataSet))\n",
"        #print(len(tempFeatureValueList))\n",
"        for tempValue in tempFeatureValueList:\n",
"            #简单分类\n",
"            groups=simpleSplit(dataSet,index,tempValue)            \n",
"            #print(\"currentIndex:%d,CurrentTempValue:%f\"%(index,tempValue))\n",
"            #计算基尼系数\n",
"            gini=calGini(groups,class_values)\n",
"            #print(\"computed gini:\",gini)            \n",
"            if gini
"                minGini=gini\n",
"                #保存目前最后的信息\n",
"                bestInfor[\"index\"]=index#存入原来索引                \n",
"                bestInfor[\"indexValue\"]=tempValue\n",
"                bestInfor[\"groups\"]=groups\n",
"                bestInfor[\"gini\"]=gini\n",
"                \n",
"    return bestInfor"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"52 0.017\n"
]
}
],
"source": [
"#测试最好分类函数\n",
"bestInfor=getBestSplit(dataSet,3)\n",
"print(bestInfor[\"index\"],bestInfor[\"indexValue\"])"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"def terminalLabel(subSet):\n",
"    '''\n",
"    树叶点对应的标签\n",
"    args:\n",
"      subSet:当前数据集,最后列是标签列,类型:list\n",
"    returns:\n",
"      当前列中最多的标签,类型:原标签类型\n",
"    '''\n",
"    #得到最后一列\n",
"    labelList=[item[-1] for item in subSet]\n",
"    #max函数,key后是函数,代表对前面的进行那种运算,这里是技术\n",
"    #max返回值是第一个参数,这里set是把labelList转换成集合,即去掉重复项\n",
"    #key:相当于循环调用labelList.count(set(labelList))中的每个元素,然后max取得最大值\n",
"    #返回set(labelList)中对应最大的那个标签\n",
"    return max(set(labelList), key=labelList.count)   # 输出 subSet 中出现次数较多的标签 \n",
"\n",
"    #下面的写法也是成立的,利用lambda表达式,表达式中x从全面取,这种写法可能更好理解些\n",
"    #return max(set(labelList), key=lambda x:labelList.count(x)) "
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"R\n"
]
}
],
"source": [
"#测试\n",
"label=terminalLabel(l)\n",
"print(label)"
]
},
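{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A tiny sketch of the max(set(...), key=list.count) idiom used above,\n",
"# on a made-up label list (added for illustration):\n",
"demoLabels=['R','M','R','R','M']\n",
"print(max(set(demoLabels), key=demoLabels.count))  # expected: 'R' (3 votes to 2)\n"
]
},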
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"#对得到的最好分类信息进行分割\n",
"def split(node, max_depth, min_size, n_features, depth):  # 创建子分割器 递归分类 直到分类结束\n",
"    '''\n",
"    :param node:        节点,类型:字典\n",
"                    bestInfor[\"index\"]=index#存入原来索引                \n",
"                    bestInfor[\"indexValue\"]=tempValue\n",
"                    bestInfor[\"groups\"]=groups\n",
"                    bestInfor[\"gini\"]=gini\n",
"    :param max_depth:   最大深度,int\n",
"    :param min_size:    最小,int\n",
"    :param n_features:  特征选取个数,int\n",
"    :param depth:       深度,int\n",
"    :return:\n",
"    '''\n",
"    left, right = node['groups']\n",
"    del (node['groups'])\n",
"\n",
"    if not left or not right:  # 如果只有一个子集\n",
"        node['left'] = node['right'] = terminalLabel(left + right)  # 投票出类型\n",
"        return\n",
"\n",
"    if depth >= max_depth:  # 如果即将超过\n",
"        node['left'], node['right'] = terminalLabel(left), terminalLabel(right)  # 投票出类型\n",
"        return\n",
"\n",
"    if len(left) <= min_size:  # 处理左子集\n",
"        node['left'] = terminalLabel(left)\n",
"    else:\n",
"        node['left'] = getBestSplit(left, n_features)  # node['left']是一个字典,形式为{'index':b_index, 'value':b_value, 'groups':b_groups},所以node是一个多层字典\n",
"        split(node['left'], max_depth, min_size, n_features, depth + 1)  # 递归,depth+1计算递归层数\n",
"\n",
"    if len(right) <= min_size:  # 处理右子集\n",
"        node['right'] = terminalLabel(right)\n",
"    else:\n",
"        node['right'] = getBestSplit(right, n_features)\n",
"        split(node['right'], max_depth, min_size, n_features, depth + 1)\n",
"        "
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"#构建一个决策树\n",
"def buildTree(train, max_depth, min_size, n_features):\n",
"    '''\n",
"    创建一个决策树\n",
"    :param train:       训练数据集\n",
"    :param max_depth:   决策树深度不能太深 不然容易导致过拟合\n",
"    :param min_size:    叶子节点的大小\n",
"    :param n_features:  选择的特征的个数\n",
"    :return\n",
"        root    返回决策树\n",
"    '''\n",
"    root = getBestSplit(train, n_features)  # 获取样本数据集\n",
"    split(root, max_depth, min_size, n_features, 1)  # 进行样本分割,构架决策树\n",
"    return root  # 返回决策树\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'index': 55, 'indexValue': 0.0114, 'gini': 0.0, 'left': {'index': 35, 'indexValue': 0.2288, 'gini': 0.0, 'left': 'R', 'right': {'index': 33, 'indexValue': 0.2907, 'gini': 0.0, 'left': 'R', 'right': {'index': 58, 'indexValue': 0.0057, 'gini': 0.0, 'left': {'index': 12, 'indexValue': 0.0493, 'gini': 0.0, 'left': 'R', 'right': 'R'}, 'right': 'R'}}}, 'right': {'index': 54, 'indexValue': 0.0063, 'gini': 0.0, 'left': {'index': 21, 'indexValue': 0.8384, 'gini': 0.0, 'left': 'M', 'right': 'M'}, 'right': {'index': 32, 'indexValue': 0.558, 'gini': 0.0, 'left': 'M', 'right': {'index': 58, 'indexValue': 0.0332, 'gini': 0.0, 'left': 'M', 'right': 'M'}}}}\n"
]
}
],
"source": [
"#测试决策树\n",
"#选择一个子集\n",
"s=putBackSample(dataSet,10)\n",
"tempTree=buildTree(s,10,1,3)\n",
"print(tempTree)"
]
},
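{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The nested dict above is hard to read, so here is a small helper that\n",
"# pretty-prints the tree with indentation. printTree is not part of the\n",
"# original code; it is a sketch added for illustration.\n",
"def printTree(node, depth=0):\n",
"    if isinstance(node, dict):\n",
"        # internal node: show the split test, then recurse into both branches\n",
"        print('%s[X%d < %.4f]' % ('  '*depth, node['index'], node['indexValue']))\n",
"        printTree(node['left'], depth+1)\n",
"        printTree(node['right'], depth+1)\n",
"    else:\n",
"        # leaf: show its class label\n",
"        print('%s[%s]' % ('  '*depth, node))\n",
"\n",
"printTree(tempTree)\n"
]
},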
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"#根据决策树进行预测\n",
"def predict(node, row):   # 预测模型分类结果\n",
"    '''\n",
"    在当前节点进行预测,row是待预测样本\n",
"    args:\n",
"       node:树节点\n",
"       row:待分类样本\n",
"    return:\n",
"       分类标签\n",
"    '''\n",
"    if row[node['index']] < node['indexValue']:\n",
"        if isinstance(node['left'], dict):       # isinstance 是 Python 中的一个内建函数。是用来判断一个对象是否是一个已知的类型。\n",
"            return predict(node['left'], row)\n",
"        else:\n",
"            return node['left']\n",
"    else:\n",
"        if isinstance(node['right'], dict):\n",
"            return predict(node['right'], row)\n",
"        else:\n",
"            return node['right']"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"R R\n"
]
}
],
"source": [
"#测试下\n",
"label=predict(tempTree,s[0])\n",
"print(label,s[0][-1])"
]
},
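{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A quick sanity check added for illustration: predict every row the tree was\n",
"# built from. With max_depth=10 and min_size=1 the tree usually fits its own\n",
"# training sample perfectly, so the fraction should be at or near 1.0.\n",
"correct=sum(1 for row in s if predict(tempTree,row)==row[-1])\n",
"print(correct/len(s))\n"
]
},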
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"#多个树的决策,多数服从少数\n",
"def baggingPredict(trees, row):\n",
"    \"\"\"\n",
"    多个树的决策,多数服从少数\n",
"    Args:\n",
"        trees           决策树的集合\n",
"        row             测试数据集的每一行数据\n",
"    Returns:\n",
"        返回随机森林中,决策树结果出现次数做大的\n",
"    \"\"\"\n",
"\n",
"    # 使用多个决策树trees对测试集test的第row行进行预测,再使用简单投票法判断出该行所属分类\n",
"    predictions = [predict(tree, row) for tree in trees]\n",
"    return max(set(predictions), key=predictions.count)\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"def subSample(dataSet, ratio):  \n",
"    '''\n",
"    按比例随机抽取数据,有重复抽样\n",
"    args:\n",
"      dataSet:数据集,类型:list\n",
"      ratio:0-1之间的数\n",
"    '''\n",
"    if ratio<0.0:\n",
"        return None\n",
"    if ratio>=1:\n",
"        return dataSet\n",
"    sampleNumber=int(len(dataSet)*ratio)\n",
"    subSet=putBackSample(dataSet,sampleNumber)\n",
"    return subSet"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"41\n"
]
}
],
"source": [
"#测试\n",
"subSet=subSample(dataSet,0.2)\n",
"print(len(subSet))"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"#随机森林主函数\n",
"def buildRandomForest(train, max_depth=10, min_size=1, sample_size=0.2, n_trees=10, n_features=3):\n",
"    \"\"\"\n",
"    random_forest(评估算法性能,返回模型得分)\n",
"    Args:\n",
"        train           训练数据集,类型:list        \n",
"        max_depth       决策树深度不能太深,不然容易导致过拟合\n",
"        min_size        叶子节点的大小\n",
"        sample_size     训练数据集的样本比例,0,1之间的数\n",
"        n_trees         决策树的个数\n",
"        n_features      选取的特征的个数\n",
"    Returns:\n",
"        trees:树序列\n",
"    \"\"\"\n",
"\n",
"    trees = list()\n",
"    # n_trees 表示决策树的数量\n",
"    for i in range(n_trees):\n",
"        # 随机抽样的训练样本, 随机采样保证了每棵决策树训练集的差异性\n",
"        sample = subSample(train, sample_size)\n",
"        # 创建一个决策树\n",
"        tree = buildTree(sample, max_depth, min_size, n_features)\n",
"        trees.append(tree)\n",
"    return trees\n",
"  \n"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"def predictByForest(trees,test):\n",
"    '''\n",
"    predictions     每一行的预测结果,bagging 预测最后的分类结果\n",
"    '''\n",
"    # 每一行的预测结果,bagging 预测最后的分类结果\n",
"    predictions = [baggingPredict(trees, row) for row in test]\n",
"    return predictions"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"def calQuota(predictions,labelClass,OrigClassLabels):\n",
"    '''\n",
"    计算分类指标\n",
"    args:\n",
"      predictions:预测值,类型:list\n",
"      labelClass:真实标签,类型:list\n",
"      OrigClassLabels:数据可能的标签库,一个正例一个负例标签\n",
"    '''\n",
"    \n",
"    Pos=OrigClassLabels[0]\n",
"    Nev=OrigClassLabels[1]    \n",
"    #真正例   \n",
"    #TP=len([item for item in labelClass if item==Pos and predictions[labelClass.index(item)]==Pos])\n",
"    TP=0\n",
"    TN=0\n",
"    FP=0\n",
"    FN=0\n",
"    for j in range(len(predictions)):        \n",
"        if predictions[j]==Pos and  labelClass[j]==Pos:\n",
"            TP+=1\n",
"        if predictions[j]==Nev and  labelClass[j]==Nev:\n",
"            TN+=1\n",
"        if predictions[j]==Pos and  labelClass[j]==Nev:\n",
"            FP+=1\n",
"        if predictions[j]==Nev and  labelClass[j]==Pos:\n",
"            FN+=1\n",
"#     #真负例,下面的做法不行,原因是index可能得到不同的索引\n",
"#     TN=len([item for item in labelClass if item==Nev and predictions[labelClass.index(item)]==Nev])\n",
"#     #伪正例\n",
"#     FP=len([item for item in labelClass if item==Nev and predictions[labelClass.index(item)]==Pos])\n",
"#     #伪负例\n",
"#     FN=len([item for item in labelClass if item==Pos and predictions[labelClass.index(item)]==Nev])\n",
"\n",
"    #Recall,TruePosProp=TP/(TP+FN)#识别的正例占整个正例的比率\n",
"    #FalsPosProp=FP/(FP+TN)#识别的正例占整个负例的比率\n",
"    #Precition=TP/(TP+FP)#识别的正确正例占识别出所有正例的比率\n",
"    \n",
"    return TP,TN,FP,FN"
]
},
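{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A sketch (added for illustration; calRates is not part of the original code)\n",
"# that turns the four counts into the metrics listed in the comments above,\n",
"# guarding against division by zero.\n",
"def calRates(TP,TN,FP,FN):\n",
"    recall=TP/(TP+FN) if TP+FN>0 else 0.0      # identified positives / all positives\n",
"    precision=TP/(TP+FP) if TP+FP>0 else 0.0   # correct positives / predicted positives\n",
"    f1=2*precision*recall/(precision+recall) if precision+recall>0 else 0.0\n",
"    return recall,precision,f1\n",
"\n",
"print(calRates(44,39,12,5))  # e.g. with counts like those computed below\n"
]
},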
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"#测试下:\n",
"trees=buildRandomForest(dataSet)\n",
"testSet=nonPutBackSample(dataSet,100)\n",
"prediction=predictByForest(trees,testSet)\n"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(44, 39, 12, 5)\n"
]
}
],
"source": [
"labelClass=[item[-1] for item in testSet]\n",
"\n",
"tp=calQuota(prediction,labelClass,list(classLabels))\n",
"print(tp)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"def accuracy( predicted,actual):  \n",
"    correct = 0\n",
"    for i in range(len(actual)):\n",
"        if actual[i] == predicted[i]:\n",
"            correct += 1\n",
"    return correct / float(len(actual)) * 100.0\n"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"83.0\n"
]
}
],
"source": [
"a=accuracy(prediction,labelClass)\n",
"print(a)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"def createCrossValideSets(trainSet,n_folds,bPutBack=True):\n",
"    '''\n",
"    产生交叉验证数据集\n",
"    Args:\n",
"        dataset     原始数据集       \n",
"        n_folds     数据的份数,数据集交叉验证的份数,采用无放回抽取\n",
"        bPutBack    是否放回\n",
"    '''\n",
"    subSetsList=[]\n",
"    subLen=int(len(trainSet)/n_folds)\n",
"    if bPutBack:\n",
"        for j in range(n_folds):\n",
"            subSet=putBackSample(trainSet,subLen)\n",
"            subSetsList.append(subSet)\n",
"    else:\n",
"        for j in range(n_folds):\n",
"            subSet=nonPutBackSample(trainSet,subLen)\n",
"            subSetsList.append(subSet)\n",
"    return subSetsList"
]
},
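{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Note: drawing each fold independently lets folds overlap and leaves some\n",
"# rows unused. A sketch of standard k-fold splitting, which shuffles once and\n",
"# partitions the data so every row lands in exactly one fold (added for\n",
"# illustration; createKFolds is not used below):\n",
"def createKFolds(trainSet,n_folds):\n",
"    shuffled=list(trainSet)\n",
"    random.shuffle(shuffled)\n",
"    subLen=len(shuffled)//n_folds\n",
"    return [shuffled[j*subLen:(j+1)*subLen] for j in range(n_folds)]\n"
]
},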
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"def randomForest(trainSet,testSet,max_depth=10, min_size=1, sample_size=0.2, n_trees=10, n_features=3):\n",
"    '''\n",
"    构造随机森林并测试\n",
"     Args:\n",
"        train           训练数据集,类型:list        \n",
"        testSet         测试集,类型:list\n",
"        max_depth       决策树深度不能太深,不然容易导致过拟合\n",
"        min_size        叶子节点的大小\n",
"        sample_size     训练数据集的样本比例,0,1之间的数\n",
"        n_trees         决策树的个数\n",
"        n_features      选取的特征的个数\n",
"    Returns:\n",
"        predition       测试集预测值,类型:list\n",
"    '''\n",
"    trees=buildRandomForest(trainSet,max_depth, min_size, sample_size, n_trees, n_features)\n",
"    predition=predictByForest(trees,testSet)\n",
"    return predition"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"def evaluteAlgorithm(trainSet,algorithm,n_folds,*args):\n",
"    '''\n",
"    评价算法函数\n",
"     Args:\n",
"        dataset     原始数据集\n",
"        algorithm   使用的算法\n",
"        n_folds     数据的份数,数据集交叉验证的份数,采用无放回抽取\n",
"        *args       其他的参数\n",
"    Returns:\n",
"        scores      模型得分\n",
"    '''\n",
"    folds = createCrossValideSets(trainSet, n_folds)\n",
"    scores = list()\n",
"    # 每次循环从 folds 从取出一个 fold 作为测试集,其余作为训练集,遍历整个 folds ,实现交叉验证\n",
"    for fold in folds:\n",
"        train_set = list(folds)\n",
"        train_set.remove(fold)\n",
"        # 将多个 fold 列表组合成一个 train_set 列表, 类似 union all\n",
"        \"\"\"\n",
"        In [20]: l1=[[1, 2, 'a'], [11, 22, 'b']]\n",
"        In [21]: l2=[[3, 4, 'c'], [33, 44, 'd']]\n",
"        In [22]: l=[]\n",
"        In [23]: l.append(l1)\n",
"        In [24]: l.append(l2)\n",
"        In [25]: l\n",
"        Out[25]: [[[1, 2, 'a'], [11, 22, 'b']], [[3, 4, 'c'], [33, 44, 'd']]]\n",
"        In [26]: sum(l, [])\n",
"        Out[26]: [[1, 2, 'a'], [11, 22, 'b'], [3, 4, 'c'], [33, 44, 'd']]\n",
"        \"\"\"\n",
"        train_set = sum(train_set, [])\n",
"        test_set = list()\n",
"        # fold 表示从原始数据集 dataset 提取出来的测试集\n",
"#         for row in fold:\n",
"#             row_copy = list(row)\n",
"#             row_copy[-1] = None\n",
"#             test_set.append(row_copy)\n",
"        predicted = algorithm(train_set, fold, *args)\n",
"    \n",
"        actual = [row[-1] for row in fold]\n",
"\n",
"        # 计算随机森林的预测结果的正确率\n",
"        accuracyValue = accuracy(predicted,actual)\n",
"        scores.append(accuracyValue)\n",
"    return scores"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"随机因子= 0.13436424411240122\n",
"决策树个数: 1\n",
"模型得分: [87.8048780487805, 90.2439024390244, 92.6829268292683, 85.36585365853658, 95.1219512195122]\n",
"平均准确度: 90.244%\n",
"随机因子= 0.13436424411240122\n",
"决策树个数: 10\n",
"模型得分: [92.6829268292683, 92.6829268292683, 87.8048780487805, 78.04878048780488, 100.0]\n",
"平均准确度: 90.244%\n"
]
}
],
"source": [
"    \n",
"    #综合测试函数\n",
"    n_folds = 5        # 分成5份数据,进行交叉验证\n",
"    max_depth = 20     # 调参(自己修改) #决策树深度不能太深,不然容易导致过拟合\n",
"    min_size = 1       # 决策树的叶子节点最少的元素数量\n",
"    sample_size = 1.0  # 做决策树时候的样本的比例\n",
"    # n_features = int((len(dataset[0])-1))\n",
"    n_features = 15     # 调参(自己修改) #准确性与多样性之间的权衡\n",
"    for n_trees in [1, 10]:  # 理论上树是越多越好\n",
"        scores = evaluteAlgorithm(dataSet, randomForest, n_folds, max_depth, min_size, sample_size, n_trees, n_features)\n",
"        # 每一次执行本文件时都能产生同一个随机数\n",
"        random.seed(1)\n",
"        print('随机因子=', random.random())  # 每一次执行本文件时都能产生同一个随机数\n",
"        print('决策树个数: %d' % n_trees)  # 输出决策树个数\n",
"        print('模型得分: %s' % scores)  # 输出五份随机样本的模型得分\n",
"        print('平均准确度: %.3f%%' % (sum(scores)/float(len(scores))))  # 输出五份随机样本的平均准确度\n"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"随机因子= 0.13436424411240122\n",
"决策树个数: 1\n",
"模型得分: [80.48780487804879, 75.60975609756098, 73.17073170731707, 75.60975609756098, 78.04878048780488]\n",
"平均准确度: 76.585%\n",
"随机因子= 0.13436424411240122\n",
"决策树个数: 10\n",
"模型得分: [87.8048780487805, 85.36585365853658, 90.2439024390244, 78.04878048780488, 92.6829268292683]\n",
"平均准确度: 86.829%\n"
]
}
],
"source": [
"    sample_size =0.5  # 做决策树时候的样本的比例\n",
"    \n",
"    for n_trees in [1, 10]:  # 理论上树是越多越好\n",
"        scores = evaluteAlgorithm(dataSet, randomForest, n_folds, max_depth, min_size, sample_size, n_trees, n_features)\n",
"        # 每一次执行本文件时都能产生同一个随机数\n",
"        random.seed(1)\n",
"        print('随机因子=', random.random())  # 每一次执行本文件时都能产生同一个随机数\n",
"        print('决策树个数: %d' % n_trees)  # 输出决策树个数\n",
"        print('模型得分: %s' % scores)  # 输出五份随机样本的模型得分\n",
"        print('平均准确度: %.3f%%' % (sum(scores)/float(len(scores))))  # 输出五份随机样本的平均准确度"
]
}
],
The remainder is in the attachment