1. Grid Search

Grid search exhaustively evaluates every combination of candidate hyperparameter values with cross-validation and keeps the combination with the best score.

from sklearn.model_selection import GridSearchCV
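A minimal, self-contained sketch of the GridSearchCV call on synthetic data (the grid, the estimator, and make_classification here are illustrative stand-ins; the actual tuning on the competition data is in section 3):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Toy data just to demonstrate the API
X_demo, y_demo = make_classification(n_samples=500, random_state=2021)
# Every combination in param_grid is scored with 5-fold cross-validation
search = GridSearchCV(estimator=RandomForestClassifier(random_state=2021),
                      param_grid={'n_estimators': [50, 100, 200], 'max_depth': [3, 6, None]},
                      scoring='roc_auc', cv=5)
search.fit(X_demo, y_demo)
print(search.best_params_, search.best_score_)  # best combination and its mean CV AUC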

2. Learning Curves

A learning curve plots the training-set score and the cross-validation score against the number of training examples. It shows how the model generalizes to unseen data, which makes it possible to judge whether the model suffers from high bias or high variance, and whether enlarging the training set would reduce overfitting.
How do you judge a model from its learning curve? Roughly: if both curves converge to a low score, the model is underfitting (high bias) and more data will not help; if the training score stays high while the cross-validation score lags well below it, the model is overfitting (high variance), and more data or stronger regularization may close the gap.

There is an article that covers this in more detail here.
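As a minimal, self-contained sketch of sklearn's learning_curve (make_classification stands in for the real dataset loaded in section 3):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import learning_curve
import numpy as np

# Toy data just to demonstrate the call
X_demo, y_demo = make_classification(n_samples=500, random_state=2021)
# For each training-set size, the estimator is refit and cross-validated
sizes, train_scores, test_scores = learning_curve(
    RandomForestClassifier(random_state=2021), X_demo, y_demo,
    cv=5, train_sizes=np.linspace(0.1, 1.0, 5))
print(sizes)                      # absolute training-set sizes used
print(train_scores.mean(axis=1))  # mean training score per size
print(test_scores.mean(axis=1))   # mean cross-validation score per size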

3. Code Implementation

Note: the dataset is Attachment 1 of Problem C from the 2021 Huashu Cup (华数杯), with the columns containing NULL values removed.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import warnings

warnings.filterwarnings('ignore')
sns.set(style="darkgrid",palette='deep')
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False    # keep minus signs readable with SimHei

df = pd.read_excel('2.xlsx')
X = df.iloc[:, :-1]  # features: every column except the last
y = df.iloc[:, -1]   # label: the last column
print(Counter(y))    # class distribution before resampling

# Oversample the minority class with SMOTE (see the note after this block)
smo = SMOTE(sampling_strategy=0.6, random_state=2021)
X_smo, y_smo = smo.fit_resample(X, y)
print(Counter(y_smo))  # class distribution after resampling
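For concreteness: sampling_strategy=0.6 tells SMOTE to synthesize minority-class samples until the minority count is roughly 0.6 times the majority count. For example, 1000 majority and 100 minority samples would become 1000 and 600 after resampling.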

X_train, X_test, y_train, y_test = train_test_split(X_smo, y_smo, test_size=0.2, random_state=2021)


# Min-max scaling to [0, 1]; the scaler is fit on the training set only,
# so no information from the test set leaks into preprocessing
scaler = MinMaxScaler().fit(X_train)
X_trained = scaler.transform(X_train)
X_tested = scaler.transform(X_test)


# Baseline forest with hand-picked hyperparameters
rf = RandomForestClassifier(n_estimators=100, random_state=2021, max_depth=15,
                            min_samples_split=2, min_samples_leaf=1)
rf.fit(X_trained, y_train)
pred = rf.predict(X_tested)
print('f1:', f1_score(y_test, pred))
# ROC AUC should be computed from predicted probabilities, not hard labels
print('auc:', roc_auc_score(y_test, rf.predict_proba(X_tested)[:, 1]))



# Step 1: grid-search n_estimators while the other hyperparameters are held
# at conservative values. The parameters are tuned greedily, one group at a
# time, to keep each grid small.
param_test1 = {'n_estimators': np.arange(30, 50)}
gsearch1 = GridSearchCV(
    estimator=RandomForestClassifier(min_samples_split=100, min_samples_leaf=20,
                                     max_depth=8, max_features='sqrt', random_state=2021),
    param_grid=param_test1, scoring='roc_auc', cv=5)
gsearch1.fit(X_trained, y_train)
print('best_estimator:', gsearch1.best_params_, gsearch1.best_score_)
best_estimator = gsearch1.best_params_['n_estimators']


# Step 2: grid-search the maximum tree depth (max_depth) and the minimum number
# of samples required to split an internal node (min_samples_split).
# min_samples_split must be at least 2, so the grid starts at 2.
param_test2 = {'max_depth': np.arange(1, 10), 'min_samples_split': np.arange(2, 10)}
gsearch2 = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=best_estimator, min_samples_leaf=20,
                                     max_features='sqrt', oob_score=True, random_state=2021),
    param_grid=param_test2, scoring='roc_auc', cv=5)
gsearch2.fit(X_trained, y_train)
print('best_max_depth:', gsearch2.best_params_, gsearch2.best_score_)
best_max_depth, best_min_samples_split = gsearch2.best_params_['max_depth'], gsearch2.best_params_['min_samples_split']


# Step 3: grid-search the minimum number of samples per leaf (min_samples_leaf),
# keeping the values found in steps 1 and 2
param_test3 = {'min_samples_leaf': np.arange(1, 10)}
gsearch3 = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=best_estimator, max_depth=best_max_depth,
                                     min_samples_split=best_min_samples_split,
                                     max_features='sqrt', oob_score=True, random_state=2021),
    param_grid=param_test3, scoring='roc_auc', cv=5)
gsearch3.fit(X_trained, y_train)
print('best_min_samples_leaf:', gsearch3.best_params_, gsearch3.best_score_)
best_min_samples_leaf = gsearch3.best_params_['min_samples_leaf']


# Refit the forest with all of the tuned hyperparameters
rf = RandomForestClassifier(n_estimators=best_estimator, max_depth=best_max_depth,
                            min_samples_leaf=best_min_samples_leaf,
                            min_samples_split=best_min_samples_split,
                            max_features='sqrt', random_state=2021)
rf.fit(X_trained, y_train)
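To check whether the tuning helped, the refit forest can be scored on the same held-out test set as the baseline above (a small follow-up sketch; every name comes from the code above):

pred = rf.predict(X_tested)
proba = rf.predict_proba(X_tested)[:, 1]
print('tuned f1:', f1_score(y_test, pred))
print('tuned auc:', roc_auc_score(y_test, proba))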


from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, title, X, y, ylim=None, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    # Refit and cross-validate the estimator at each training-set size
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=5, n_jobs=-1, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    # Shaded bands show one standard deviation across the CV folds
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color='r')
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color='g')
    plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score')
    plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross-validation score')
    plt.xlabel('Training examples')
    plt.ylabel('Score')
    plt.legend(loc='best')
    plt.grid(True)
    if ylim:
        plt.ylim(ylim)
    plt.title(title)
    plt.show()


if __name__ == '__main__':
    # Learning curve of the tuned forest on the original dataset
    plot_learning_curve(rf, '', X, y, ylim=(0.01, 1.05), train_sizes=np.linspace(.1, 1, 15))
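One caveat: the curve above is plotted on the raw X and y, while the model was trained on min-max-scaled data. Random forests are largely insensitive to feature scaling, but if you want each cross-validation fold inside learning_curve to see exactly the preprocessing the model was trained with (and avoid any leakage), the forest can be wrapped in a Pipeline. A sketch, reusing the names defined above:

from sklearn.pipeline import make_pipeline

pipe = make_pipeline(MinMaxScaler(), rf)  # the scaler is refit inside every CV fold
plot_learning_curve(pipe, 'RF with in-fold scaling', X, y,
                    ylim=(0.01, 1.05), train_sizes=np.linspace(.1, 1, 15))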