Author:LieDra

前言

下面将对数据利用组合分类器进行处理分析。

介绍

我们使用多专家组合的全局方法,构造并行的架构:对于给定的一个测试集输入,所有的基学习器都先经过训练,并分别给出测试样本的输出,我们将各个输出都保存起来,以进行下一步判断。
我们选用了最常见的方法,即投票法,对于每个输出进行“求和”,即如果有三个及以上的基学习器得到的结果是同一类,那么最终的结果就是这一类。

代码示例

MyAPI.py

'''
version:
author:LieDra
Method:接口--输入非标准化数据或pca后数据
'''
import copy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier   
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
# from sklearn.linear_model.logistic import LogisticRegression
from sklearn.neural_network import MLPClassifier
    
# 只需要改get_data参数即可改变读取的文件
def get_data():
    df=pd.read_excel('D:/study/5/code/python/python Data analysis and mining/class/dataset/german.xls') #原始数据
    # df=pd.read_excel('D:/study/5/code/python/python Data analysis and mining/class/dataset/german-pca.xls') #pca降维后的数据
    x=df.ix[:,:-1]
    y=df.ix[:,-1]
    return x,y

def list_add(a,b):
    """Element-wise add list *b* into list *a*, in place.

    Both sequences must have the same length; *a* is mutated and nothing
    is returned (stdlib in-place convention).
    """
    assert len(a)==len(b)
    for idx, delta in enumerate(b):
        a[idx] += delta

def list_div(a,num):
    """Divide every element of list *a* by *num*, in place.

    Returns *a* itself (mutated) for call-chaining convenience.
    """
    for idx, value in enumerate(a):
        a[idx] = value / num
    return a

def get_result_of_5classifiers(l1,l2,l3,l4,l5):
    """Majority vote over five parallel binary (0/1) prediction sequences.

    Position i of the returned list is 1 when at least three of the five
    classifiers predicted 1 for sample i, else 0 (a per-sample sum > 2
    means a strict majority voted 1).

    Improvements over the original: single-pass zip comprehension instead
    of deepcopy + repeated in-place list_add calls; the inputs are never
    mutated and no sibling helper is needed.
    """
    return [1 if v1 + v2 + v3 + v4 + v5 > 2 else 0
            for v1, v2, v3, v4, v5 in zip(l1, l2, l3, l4, l5)]

def get_acc_of_5classifiers(result,real):
    """Accuracy: fraction of positions where *result* equals *real*.

    The two sequences are expected to be the same length; the ratio is
    taken against len(real), as in the original.

    Improvement: sum a generator of matches instead of materialising a
    throwaway list of 1s.
    """
    matches = sum(1 for predicted, actual in zip(result, real)
                  if predicted == actual)
    return matches / len(real)

def get_score(result,real):
    """Total misclassification cost of *result* against the truth *real*.

    A correct prediction costs nothing; predicting 0 where the truth
    differs costs 1, predicting 1 where the truth differs costs 5
    (asymmetric credit-risk style penalty).  Sequences are expected to
    be the same length.
    """
    penalty = {0: 1, 1: 5}
    total = 0
    for predicted, actual in zip(result, real):
        if predicted != actual:
            total += penalty.get(predicted, 0)
    return total

def AUC_performance(AUC):
    '''
    Print a qualitative verdict for an AUC value.

    Bands: >= 0.7 good; (0.6, 0.7) not very good; (0.5, 0.6] useless;
    <= 0.5 bad (ranking worse than chance).  The original's four
    independent ifs cover mutually exclusive ranges, so a single
    if/elif/else chain is equivalent.
    '''
    if AUC >= 0.7:
        print("good classifier")
    elif AUC > 0.6:
        print("not very good classifier")
    elif AUC > 0.5:
        print("useless classifier")
    else:
        print("bad classifier,with sorting problems")



def get_train_check(seed):
    """Return (x_train, x_check, y_train, y_check) for a given split seed.

    Reuses the module-level x_train/y_train unchanged and carves a fresh
    check subset out of the held-out x_test/y_test pool via
    train_test_split(train_size=0.25, random_state=seed); the discarded
    25% share is ignored, the remaining share becomes the check set.
    """
    _x_rest, x_check, _y_rest, y_check = train_test_split(
        x_test, y_test, train_size=0.25, random_state=seed)
    return x_train, x_check, y_train, y_check

def get_decision_tree_classifier():
    '''
    Train a depth-3 decision tree on the module-level training split.
    Raw (non-standardised) features are used.

    Prints accuracy on the check subset and returns the fitted tree.

    Fix: the original computed an unused local (`accuracy_test`) via an
    extra side-effect-free score() call on the test subset; removed.
    '''
    best_tree = DecisionTreeClassifier(max_depth=3,random_state=88)
    best_tree = best_tree.fit(x_train,y_train)
    print("decision tree:")
    print("accuracy on the check subset:{:.3f}".format(best_tree.score(x_check,y_check)))
    return best_tree

def get_random_forest_classifier():
    '''
    Fit a 10-tree random forest on the module-level training split.
    Raw (non-standardised) features are used.

    Returns the fitted forest; unlike the other factories this one
    prints no evaluation output.
    '''
    n_trees = 10
    forest = RandomForestClassifier(n_estimators=n_trees, random_state=38)
    forest.fit(x_train, y_train)
    return forest

def get_svm_classifier():
    '''
    Train an SVC on standardised features.

    Prints accuracy on the scaled training and check subsets and returns
    the fitted SVC.

    Fix: the original standardised the check subset with its OWN mean and
    std (preprocessing.scale on each set independently), leaking
    evaluation statistics into preprocessing.  Here one StandardScaler is
    fitted on the training data and applied to both sets; the scaled
    training data is numerically identical to before, so the returned
    model is unchanged — only the reported check accuracy is now honest.
    '''
    scaler = StandardScaler().fit(x_train)
    x_train_scaled = scaler.transform(x_train)
    x_check_scaled = scaler.transform(x_check)

    svm = SVC(random_state=66)
    svm.fit(x_train_scaled,y_train)
    print("accuracy on the scaled training subset:{:.3f}".format(svm.score(x_train_scaled,y_train)))
    print("accuracy on the scaled test subset:{:.3f}".format(svm.score(x_check_scaled,y_check)))
    return svm

def get_logistic_regression_classifier():
    '''
    Fit a logistic-regression model on the raw (non-standardised)
    module-level training split.

    Prints accuracy on the check subset and returns the fitted model.
    '''
    log_classifier = LogisticRegression(random_state=99)
    log_classifier.fit(x_train, y_train)
    check_accuracy = log_classifier.score(x_check, y_check)
    print("accuracy on the test subset:{:.3f}".format(check_accuracy))
    return log_classifier

def get_mlp_classifier():
    '''
    Train an MLP classifier on standardised features.

    Prints accuracy on the check subset and returns the fitted network.

    Fix: the original re-fitted the scaler on x_check
    (scaler.fit(x_check).transform(x_check)), standardising the
    evaluation data with its own statistics — a data-leakage bug.  The
    scaler is now fitted once on the training subset and reused for the
    check subset.  The scaled training data is identical to before, so
    the returned model is unchanged.
    '''
    scaler=StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train)
    x_check_scaled = scaler.transform(x_check)
    mlp_scaled=MLPClassifier(max_iter=100,random_state=99)
    mlp_scaled.fit(x_train_scaled,y_train)
    print("accuracy on the check subset:{:.3f}".format(mlp_scaled.score(x_check_scaled,y_check)))
    return mlp_scaled

# Module-level data load and split, executed once at import time so every
# classifier factory above trains and evaluates on the same data.
x,y = get_data()
# 80% of the samples form the training pool; the held-out 20% is split
# again with train_size=0.25 — x_test2 gets the 25% share (unused) and
# x_check the remaining share, used as the fixed check/evaluation set.
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.8,random_state=38)
x_test2,x_check,y_test2,y_check=train_test_split(x_test,y_test,train_size=0.25,random_state=38)

main.py

'''
version:
author:LieDra
Method:组合分类器
'''

# import numpy as np
# import matplotlib.pyplot as plt
from MyAPI import *

def main():
    """Evaluate a five-classifier majority-vote ensemble.

    Trains the five base classifiers once on the shared module-level
    split, then for 200 freshly resampled check subsets compares the
    voted prediction against each individual classifier on accuracy and
    misclassification score, printing per-iteration results followed by
    max/average aggregates.

    Improvements over the original: list(y_check) is built once per
    iteration instead of 12 times; dead commented-out code and the
    redundant line-continuation backslashes (inside parentheses) removed.
    All printed output is unchanged.
    """
    # The five base learners; each factory trains on the module-level data.
    decision_tree_classifier       = get_decision_tree_classifier()
    random_forest_classifier       = get_random_forest_classifier()
    svm_classifier                 = get_svm_classifier()
    logistic_regression_classifier = get_logistic_regression_classifier()
    mlp_classifier                 = get_mlp_classifier()

    # Per-iteration histories: the unnumbered lists track the combined
    # (voted) result, lists 1..5 the individual classifiers in the order
    # above (tree, forest, SVM, logistic regression, MLP).
    acc_list = []; score_list = []
    acc1_list = []; score1_list = []
    acc2_list = []; score2_list = []
    acc3_list = []; score3_list = []
    acc4_list = []; score4_list = []
    acc5_list = []; score5_list = []

    for i in range(200):
        print('第',i+1,'次',end=' ')
        x_train, x_check, y_train, y_check = get_train_check(i + 1)
        l1 = decision_tree_classifier.predict(x_check)
        l2 = random_forest_classifier.predict(x_check)
        # SVM and MLP were trained on standardised features, so the check
        # subset must be standardised before predicting with them.
        x_check_scaled1 = preprocessing.scale(x_check)
        l3 = svm_classifier.predict(x_check_scaled1)
        l4 = logistic_regression_classifier.predict(x_check)
        scaler = StandardScaler()
        x_check_scaled2 = scaler.fit(x_check).transform(x_check)
        l5 = mlp_classifier.predict(x_check_scaled2)
        result = get_result_of_5classifiers(l1, l2, l3, l4, l5)

        # Hoisted: built once here instead of before every call below.
        y_real = list(y_check)
        acc  = get_acc_of_5classifiers(result, y_real)
        acc1 = get_acc_of_5classifiers(l1, y_real)
        acc2 = get_acc_of_5classifiers(l2, y_real)
        acc3 = get_acc_of_5classifiers(l3, y_real)
        acc4 = get_acc_of_5classifiers(l4, y_real)
        acc5 = get_acc_of_5classifiers(l5, y_real)

        print('acc ',acc)
        print('acc:%.5f 决策树:%.5f 随机森林:%.5f SVM:%.5f 逻辑回归:%.5f MLP:%.5f'%(acc,acc1,acc2,acc3,acc4,acc5))
        acc_list.append(acc)
        acc1_list.append(acc1)
        acc2_list.append(acc2)
        acc3_list.append(acc3)
        acc4_list.append(acc4)
        acc5_list.append(acc5)

        score_list.append(get_score(result, y_real))
        score1_list.append(get_score(l1, y_real))
        score2_list.append(get_score(l2, y_real))
        score3_list.append(get_score(l3, y_real))
        score4_list.append(get_score(l4, y_real))
        score5_list.append(get_score(l5, y_real))

    # Aggregate report across all 200 check subsets.
    print('max acc:',max(acc_list))
    print('avg acc:',sum(acc_list)/len(acc_list))
    print('max acc:%.7f决策树:%.7f 随机森林:%.7f SVM:%.7f 逻辑回归:%.7f MLP:%.7f'%(
        max(acc_list),
        max(acc1_list),
        max(acc2_list),
        max(acc3_list),
        max(acc4_list),
        max(acc5_list)))
    print('avg acc:%.7f决策树:%.7f 随机森林:%.7f SVM:%.7f 逻辑回归:%.7f MLP:%.7f'%(
        sum(acc_list)/len(acc_list),
        sum(acc1_list)/len(acc1_list),
        sum(acc2_list)/len(acc2_list),
        sum(acc3_list)/len(acc3_list),
        sum(acc4_list)/len(acc4_list),
        sum(acc5_list)/len(acc5_list),
            ))
    print('avg score:%.7f决策树:%.7f 随机森林:%.7f SVM:%.7f 逻辑回归:%.7f MLP:%.7f'%(
        sum(score_list)/len(score_list),
        sum(score1_list)/len(score1_list),
        sum(score2_list)/len(score2_list),
        sum(score3_list)/len(score3_list),
        sum(score4_list)/len(score4_list),
        sum(score5_list)/len(score5_list),
            ))


if __name__ == "__main__":
    main()

结果

PCA前

···
第 100 次 acc  0.7533333333333333
acc:0.75333 决策树:0.68000 随机森林:0.70667 SVM:0.78000 逻辑回归:0.74000 MLP:0.79333
max acc: 0.7533333333333333
avg acc: 0.7117333333333333
max acc:0.7533333决策树:0.6933333 随机森林:0.7400000 SVM:0.7800000 逻辑回归:0.7466667 MLP:0.8333333
avg acc:0.7117333决策树:0.6534667 随机森林:0.7004000 SVM:0.7309333 逻辑回归:0.7030000 MLP:0.7694667
avg score:189.4000000决策树:238.8200000 随机森林:167.8600000 SVM:190.4000000 逻辑回归:192.8300000 MLP:144.5000000
···
第 200 次 acc  0.7
acc:0.70000 决策树:0.65333 随机森林:0.68667 SVM:0.73333 逻辑回归:0.70667 MLP:0.76000
max acc: 0.7533333333333333
avg acc: 0.7118666666666669
max acc:0.7533333决策树:0.7133333 随机森林:0.7400000 SVM:0.7800000 逻辑回归:0.7533333 MLP:0.8333333
avg acc:0.7118667决策树:0.6538667 随机森林:0.7000667 SVM:0.7316333 逻辑回归:0.7037000 MLP:0.7693333
avg score:189.2600000决策树:238.5800000 随机森林:168.5100000 SVM:189.6950000 逻辑回归:192.3450000 MLP:144.6400000

PCA后

第 100 次 acc  0.7533333333333333
acc:0.75333 决策树:0.72000 随机森林:0.68667 SVM:0.73333 逻辑回归:0.74667 MLP:0.74667
max acc: 0.76
avg acc: 0.7196666666666666
max acc:0.7600000决策树:0.7666667 随机森林:0.7200000 SVM:0.7466667 逻辑回归:0.7533333 MLP:0.7733333
avg acc:0.7196667决策树:0.6885333 随机森林:0.6675333 SVM:0.6994667 逻辑回归:0.7138000 MLP:0.7332667
avg score:177.0100000决策树:193.8400000 随机森林:189.4700000 SVM:194.1600000 逻辑回归:169.8900000 MLP:164.3700000
第 200 次 acc  0.7333333333333333
acc:0.73333 决策树:0.70000 随机森林:0.68000 SVM:0.70667 逻辑回归:0.72667 MLP:0.74000
max acc: 0.76
avg acc: 0.7205000000000003
max acc:0.7600000决策树:0.7666667 随机森林:0.7200000 SVM:0.7600000 逻辑回归:0.7666667 MLP:0.7800000
avg acc:0.7205000决策树:0.6878333 随机森林:0.6680333 SVM:0.7013333 逻辑回归:0.7137000 MLP:0.7341333
avg score:176.7650000决策树:194.7050000 随机森林:188.9350000 SVM:193.1800000 逻辑回归:169.9850000 MLP:163.8800000

其余省略

最终的测试结果将在(十)中展示。