Table of Contents

  • 1. Imports
  • 2. Data Loading
  • 3. Feature Engineering
  • 4. Splitting the Dataset
  • 5. Model Selection
  • 6. Model Tuning
  • 7. Model Evaluation
  • 8. Model Fusion


1. Imports

import numpy as np
import pandas as pd
import seaborn as sns  # data visualization
import matplotlib.pyplot as plt
%matplotlib inline

# suppress warnings
import warnings
warnings.filterwarnings("ignore")

2. Data Loading

Two simple cases:

  1. When the data sits in the working directory:
train = pd.read_csv('train.csv', encoding='gbk')
test = pd.read_csv('test.csv')
  2. When the data sits elsewhere:
root_path = '/opt/data/datasets/getting-started/titanic/input'
train = pd.read_csv('%s/%s' % (root_path, 'train.csv'))
test = pd.read_csv('%s/%s' % (root_path, 'test.csv'))
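
os.path.join builds the same path in a platform-independent way:

import os
train = pd.read_csv(os.path.join(root_path, 'train.csv'))
test = pd.read_csv(os.path.join(root_path, 'test.csv'))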

Reading data with a date column as the index:

weather_2012 = pd.read_csv('weather_2012.csv', parse_dates=True, index_col='Date/Time')
# drop duplicate rows
data.drop_duplicates(inplace=True)
# check whether the positive and negative classes are balanced
y = data.status
X = data.drop('status', axis=1)
y.value_counts()

3. Feature Engineering

3.1 Inspecting the Data

# first 5 rows:
train.head(5)
# shape of the data (rows, columns):
print(data.shape)
# dtype of each feature:
data.dtypes
# list all feature names:
data.columns
# inspect one feature (e.g. source) in detail:
data["source"]
# column names, non-NaN counts, and dtypes:
train.info()
# if there are too many columns for info() to show them all:
train.info(null_counts=True, verbose=True)  # in newer pandas: show_counts=True
# summary statistics of the numeric columns:
train.describe()
# total number of missing values per column:
df.isnull().sum()

The pandas .str accessor provides a set of convenient string methods, such as contains used here:

# find the times when it was snowing
weather_description = weather_2012['Weather']
is_snowing = weather_description.str.contains('Snow')
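
As a quick use of this boolean series, resampling it by month gives the fraction of snowy observations per month (a sketch following the same weather_2012 example):

# monthly fraction of snowy observations (True -> 1.0, False -> 0.0)
is_snowing.astype(float).resample('M').mean().plot(kind='bar')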

To get the median temperature for each month, there is a very handy function: resample()

# 'M' resamples by month; median() takes the median (mean() the average); kind='bar' draws a bar chart
weather_2012['Temp (C)'].resample('M').median().plot(kind='bar')

Concatenating two columns side by side:

stats = pd.concat([temperature, snowiness], axis=1)

Plotting each feature in its own subplot:

stats.plot(kind='bar', subplots=True, figsize=(15, 10))

Counting the values of a feature, sorted by frequency:

data['City'].value_counts()

3.2 Feature Analysis (statistics and plotting)

Handling missing values

# percentage of missing values per column
for feature in data.columns:
    summ = data[feature].isnull().sum()
    if summ:
        print('%.4f' % (summ * 100 / len(data)), '%', '<==>', feature)

# fill missing values with the mean:
for feature in data.columns:
    summ = data[feature].isnull().sum()
    if summ:
        data[feature].fillna(data[feature].mean(), inplace = True)
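
The snippets below operate on X_num (numeric columns) and X_cate (categorical columns), which are not constructed in the original; a minimal sketch, assuming they are split from the feature frame X by dtype:

# assumption: split the features by dtype; non-numeric columns are treated as categorical
X_num = X.select_dtypes(include=[np.number]).copy()
X_cate = X.select_dtypes(exclude=[np.number]).copy()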

# fill missing values with the mode:
for col in X_cate.columns:
    summ = X_cate[col].isnull().sum()
    if summ:
        X_cate[col].fillna(X_cate[col].mode()[0], inplace = True)
        
# percentage of missing values per column, sorted descending (top 10 printed)
col_missing = {}
for col in X_num.columns:
    summ = X_num[col].isnull().sum()
    if summ:
        col_missing[col] = float('%.4f'%(summ*100/len(data)))
col_missing = sorted(col_missing.items(), key = lambda d:-d[1])
for col, rate in col_missing[:10]:
    print(rate, '%', col)
    
# fill missing values with the median
for col in X_num.columns:
    summ = X_num[col].isnull().sum()
    if summ:
        X_num[col].fillna(X_num[col].median(), inplace = True)

# fill with a constant value
X_cate['student_feature'].fillna(-1, inplace = True)

# drop features with little variation (std < 0.1)
for col in X_num.columns:
    rate = X_num[col].std()
    if rate < 0.1:
        print(col, rate)
        X_num.drop(col, axis=1, inplace=True)
# inspect, with the missing percentage of each column:
X_cate.describe().T.assign(missing_pct=X_cate.apply(lambda x: (len(x) - x.count()) / len(x)))

# min-max normalization
X_num = X_num.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
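
The same rescaling can be done with scikit-learn's MinMaxScaler, which also remembers the fitted min/max so the identical transform can be applied to a test set:

from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
X_num[X_num.columns] = mms.fit_transform(X_num)
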
# class counts of the target:
train['Survived'].value_counts()

# values and counts of a string-typed feature:
data['reg_preference_for_trad'].value_counts()

# map a string feature to integer codes (via enumerate)
dic = {}
for i, val in enumerate(list(data['reg_preference_for_trad'].unique())):   
    dic[val] = i
dic
list(data['reg_preference_for_trad'].unique())
data['reg_preference_for_trad'] = data['reg_preference_for_trad'].map(dic)
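
pd.factorize collapses the enumerate-and-map above into one call; an equivalent one-liner:

# codes are assigned in order of first appearance, like the enumerate loop above
data['reg_preference_for_trad'], uniques = pd.factorize(data['reg_preference_for_trad'])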

# correlation of the numeric columns: corr() shows how the features relate to each other
train_corr = train.drop('PassengerId',axis=1).corr()
train_corr

# corr() values near 0 mean no correlation; > 0 positive, < 0 negative correlation.
# correlation heatmap
a = plt.subplots(figsize=(15, 9))  # set the figure size
a = sns.heatmap(train_corr, vmin=-1, vmax=1, annot=True, square=True)  # draw the heatmap

# relationship between individual features and the target
# mean survival rate by Pclass
train.groupby(['Pclass'])[['Pclass', 'Survived']].mean()

# bar chart of survival rate by Pclass
train[['Pclass','Survived']].groupby(['Pclass']).mean().plot.bar()

# age
g = sns.FacetGrid(train, col='Survived', height=5)  # 'size=' in older seaborn
g.map(plt.hist, 'Age', bins=40)
train.groupby(['Age'])['Survived'].mean().plot()

# one-hot encode a feature:
train_test = pd.get_dummies(train_test,columns=['Pclass'])

# create a new feature by summing two features, then one-hot encode all three
train_test['SibSp_Parch'] = train_test['SibSp'] + train_test['Parch']
train_test = pd.get_dummies(train_test,columns = ['SibSp','Parch','SibSp_Parch'])

# drop the Name feature:
del train_test['Name']

# find the rows where Fare is missing:
train_test.loc[train_test["Fare"].isnull()]

# Fare depends on Pclass and Embarked, so fill with the group means computed on train:
train.groupby(by=["Pclass", "Embarked"]).Fare.mean()

# fill with the mean for Pclass=3 and Embarked=S, 14.644083:
train_test["Fare"].fillna(14.644083, inplace=True)

Concatenating and saving the features

X = pd.concat([X_date, X_cate, X_num], axis=1)
import pickle
with open('feature.pkl', 'wb') as f:
    pickle.dump(X, f)

4. Splitting the Dataset

# standardize the data:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
# fit the scaler on the training set, then transform both sets
ss.fit(missing_age_X_train)
missing_age_X_train = ss.transform(missing_age_X_train)
missing_age_X_test = ss.transform(missing_age_X_test)

# predict the missing ages with Bayesian ridge regression
from sklearn import linear_model
lin = linear_model.BayesianRidge()
lin.fit(missing_age_X_train, missing_age_Y_train)
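
The missing_age_* splits used above are not constructed in the original. A minimal sketch, assuming Age is predicted from the other numeric columns of train_test and the predictions are written back:

# assumption: build the splits from the numeric columns of train_test
age_df = train_test.select_dtypes(include=[np.number])
known_age = age_df[age_df['Age'].notnull()]
unknown_age = age_df[age_df['Age'].isnull()]
missing_age_X_train = known_age.drop('Age', axis=1)
missing_age_Y_train = known_age['Age']
missing_age_X_test = unknown_age.drop('Age', axis=1)
# after fitting: write the predicted ages back
train_test.loc[train_test['Age'].isnull(), 'Age'] = lin.predict(missing_age_X_test)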


# split the dataset
features = [x for x in data.columns if x not in ['status']]
from sklearn.model_selection import train_test_split
# preprocessing: standardization
from sklearn.preprocessing import StandardScaler
# features and label
X = data[features]
y = data.status
# train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2333)
# standardize the features
std = StandardScaler()
X_train = std.fit_transform(X_train)
X_test = std.transform(X_test)

Loading the saved features

# drop duplicates right after loading the dataset
data.drop_duplicates(inplace=True)
import pickle
# load the saved features
with open('feature.pkl', 'rb') as f:
    X = pickle.load(f)

# check whether the positive and negative classes are balanced
y = data.status
y.value_counts()

5. Model Selection

Note that linear models should be trained on standardized data; tree models do not need standardization.

# logistic regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)
# SVM (linear, polynomial, RBF, and sigmoid kernels)
from sklearn import svm
svm_linear = svm.SVC(kernel='linear', probability=True).fit(X_train, y_train)
svm_poly = svm.SVC(kernel='poly', probability=True).fit(X_train, y_train)
svm_rbf = svm.SVC(probability=True).fit(X_train, y_train)
svm_sigmoid = svm.SVC(kernel='sigmoid', probability=True).fit(X_train, y_train)
# decision tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=4)
dt.fit(X_train, y_train)
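
Before tuning, a quick 5-fold cross-validated AUC comparison can show which candidates are worth pursuing; a sketch using the estimators defined above:

from sklearn.model_selection import cross_val_score

for name, model in [('lr', lr), ('svm_linear', svm_linear), ('svm_rbf', svm_rbf), ('dt', dt)]:
    scores = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=5)
    print(name, 'AUC: %.4f' % scores.mean())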

6. Model Tuning

Imports

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier
# LR model
lr = LogisticRegression()
param = {'C': [1e-3, 0.01, 0.1, 1, 10, 100, 1e3], 'penalty': ['l1', 'l2']}
gsearch = GridSearchCV(lr, param_grid=param, scoring='roc_auc', cv=5)
gsearch.fit(X_train, y_train)
print('Best parameters:', gsearch.best_params_)
print('Best CV score on the training set:', gsearch.best_score_)
# refit with the best parameters
lr = LogisticRegression(C=0.1, penalty='l1')  # newer sklearn needs solver='liblinear' for l1
lr.fit(X_train, y_train)
model_metrics(lr, X_train, X_test, y_train, y_test)

# SVM models
# 1) linear SVM
svm_linear = svm.SVC(kernel='linear', probability=True)
param = {'C': [0.01, 0.1, 1]}
gsearch = GridSearchCV(svm_linear, param_grid=param, scoring='roc_auc', cv=5)
gsearch.fit(X_train, y_train)
print('Best parameters:', gsearch.best_params_)
print('Best CV score on the training set:', gsearch.best_score_)
print('Score on the test set:', gsearch.score(X_test, y_test))
# refit with the best parameters
svm_linear = svm.SVC(C=0.01, kernel='linear', probability=True)
svm_linear.fit(X_train, y_train)
model_metrics(svm_linear, X_train, X_test, y_train, y_test)

# 2) polynomial SVM
svm_poly = svm.SVC(kernel='poly', probability=True)
param = {'C': [0.01, 0.1, 1]}
gsearch = GridSearchCV(svm_poly, param_grid=param, scoring='roc_auc', cv=5)
gsearch.fit(X_train, y_train)
print('Best parameters:', gsearch.best_params_)
print('Best CV score on the training set:', gsearch.best_score_)
print('Score on the test set:', gsearch.score(X_test, y_test))
# refit with the best parameters
svm_poly = svm.SVC(C=0.01, kernel='poly', probability=True)
svm_poly.fit(X_train, y_train)
model_metrics(svm_poly, X_train, X_test, y_train, y_test)

# 3) RBF SVM
svm_rbf = svm.SVC(probability=True)
param = {'gamma': [0.01, 0.1, 1, 10],
         'C': [0.01, 0.1, 1]}
gsearch = GridSearchCV(svm_rbf, param_grid=param, scoring='roc_auc', cv=5)
gsearch.fit(X_train, y_train)
print('Best parameters:', gsearch.best_params_)
print('Best CV score on the training set:', gsearch.best_score_)
print('Score on the test set:', gsearch.score(X_test, y_test))
# refit with the best parameters
svm_rbf = svm.SVC(gamma=0.01, C=0.01, probability=True)
svm_rbf.fit(X_train, y_train)
model_metrics(svm_rbf, X_train, X_test, y_train, y_test)

# 4) sigmoid SVM
svm_sigmoid = svm.SVC(kernel='sigmoid', probability=True)
param = {'C': [0.01, 0.1, 1]}
gsearch = GridSearchCV(svm_sigmoid, param_grid=param, scoring='roc_auc', cv=5)
gsearch.fit(X_train, y_train)
print('Best parameters:', gsearch.best_params_)
print('Best CV score on the training set:', gsearch.best_score_)
print('Score on the test set:', gsearch.score(X_test, y_test))
# refit with the best parameters
svm_sigmoid = svm.SVC(C=0.01, kernel='sigmoid', probability=True)
svm_sigmoid.fit(X_train, y_train)
model_metrics(svm_sigmoid, X_train, X_test, y_train, y_test)
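
The tuned decision-tree, XGBoost, and LightGBM parameters reused in section 8 come from the same grid-search pattern. The exact grids are not shown in the original; a sketch for the decision tree, with the grid values as assumptions:

# assumed grid; the values actually searched are not given in the original
dt = DecisionTreeClassifier()
param = {'max_depth': range(3, 14, 2), 'min_samples_split': range(50, 601, 100)}
gsearch = GridSearchCV(dt, param_grid=param, scoring='roc_auc', cv=5)
gsearch.fit(X_train, y_train)
print('Best parameters:', gsearch.best_params_)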

7. Model Evaluation

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score,roc_curve, auc
import matplotlib.pyplot as plt
%matplotlib inline

def model_metrics(clf, X_train, X_test, y_train, y_test):
	# predictions and probabilities on the training and test sets
	y_train_pred = clf.predict(X_train)
	y_test_pred = clf.predict(X_test)
	y_train_proba = clf.predict_proba(X_train)[:,1]
	y_test_proba = clf.predict_proba(X_test)[:,1]

	# Accuracy
	print('[Accuracy]', end=' ')
	print('Train:', '%.4f' % accuracy_score(y_train, y_train_pred), end=' ')
	print('Test:', '%.4f' % accuracy_score(y_test, y_test_pred))

	# Precision
	print('[Precision]', end=' ')
	print('Train:', '%.4f' % precision_score(y_train, y_train_pred), end=' ')
	print('Test:', '%.4f' % precision_score(y_test, y_test_pred))

	# Recall
	print('[Recall]', end=' ')
	print('Train:', '%.4f' % recall_score(y_train, y_train_pred), end=' ')
	print('Test:', '%.4f' % recall_score(y_test, y_test_pred))

	# F1-score
	print('[F1-score]', end=' ')
	print('Train:', '%.4f' % f1_score(y_train, y_train_pred), end=' ')
	print('Test:', '%.4f' % f1_score(y_test, y_test_pred))

	# AUC
	print('[AUC]', end=' ')
	print('Train:', '%.4f' % roc_auc_score(y_train, y_train_proba), end=' ')
	print('Test:', '%.4f' % roc_auc_score(y_test, y_test_proba))

	# ROC curves
	fpr_train, tpr_train, thresholds_train = roc_curve(y_train, y_train_proba, pos_label=1)
	fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_test_proba, pos_label=1)

	label = ["Train - AUC:{:.4f}".format(auc(fpr_train, tpr_train)),
	         "Test - AUC:{:.4f}".format(auc(fpr_test, tpr_test))]
	plt.plot(fpr_train, tpr_train)
	plt.plot(fpr_test, tpr_test)
	plt.plot([0, 1], [0, 1], 'd--')
	plt.xlabel('False Positive Rate')
	plt.ylabel('True Positive Rate')
	plt.legend(label, loc=4)
	plt.title("ROC curve")
	plt.show()

Running the evaluation

# logistic regression
model_metrics(lr, X_train, X_test, y_train, y_test)
# linear SVM
model_metrics(svm_linear, X_train, X_test, y_train, y_test)
# polynomial SVM
model_metrics(svm_poly, X_train, X_test, y_train, y_test)
# RBF SVM
model_metrics(svm_rbf, X_train, X_test, y_train, y_test)
# sigmoid SVM
model_metrics(svm_sigmoid, X_train, X_test, y_train, y_test)
# decision tree
model_metrics(dt, X_train, X_test, y_train, y_test)

8. Model Fusion

(1) Stacking

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier

from mlxtend.classifier import StackingClassifier

# parameters obtained from tuning
lr = LogisticRegression(C=0.1, penalty='l1')  # newer sklearn: add solver='liblinear'
svm_linear = svm.SVC(C=0.01, kernel='linear', probability=True)
svm_poly = svm.SVC(C=0.01, kernel='poly', probability=True)
svm_rbf = svm.SVC(gamma=0.01, C=0.01, probability=True)
svm_sigmoid = svm.SVC(C=0.01, kernel='sigmoid', probability=True)
dt = DecisionTreeClassifier(max_depth=11, min_samples_split=550, min_samples_leaf=80, max_features=19)
xgb = XGBClassifier(learning_rate=0.01, n_estimators=180, max_depth=3, min_child_weight=5,
                    gamma=0.4, subsample=0.5, colsample_bytree=0.9, reg_alpha=1,
                    objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
lgb = LGBMClassifier(learning_rate=0.1, n_estimators=60, max_depth=3, min_child_weight=11,
                     min_split_gain=0.1, subsample=0.5, colsample_bytree=0.8, reg_alpha=1e-5,
                     nthread=4, scale_pos_weight=1, seed=27)  # LightGBM calls xgboost's gamma min_split_gain
import warnings
warnings.filterwarnings("ignore")

sclf = StackingClassifier(classifiers=[svm_linear, svm_poly, svm_rbf, 
                                       svm_sigmoid, dt, xgb, lgb], meta_classifier=lr)
sclf.fit(X_train, y_train.values)
model_metrics(sclf, X_train, X_test, y_train, y_test)

A variant: use the class probabilities produced by the first-level classifiers as new features (use_probas=True). There are two ways to handle these probabilities.

Suppose there are 2 first-level classifiers and 3 classes, with output probabilities p1 = [0.2, 0.5, 0.3] and p2 = [0.3, 0.4, 0.4]:

  • If average_probas=True, the outputs are averaged across classifiers: p = [0.25, 0.45, 0.35]
  • [Recommended] If average_probas=False, all outputs are kept as new features: p = [0.2, 0.5, 0.3, 0.3, 0.4, 0.4]

sclf = StackingClassifier(classifiers=[svm_linear, svm_poly, svm_rbf, svm_sigmoid, dt, xgb, lgb],
                          meta_classifier=lr, use_probas=True, average_probas=False)
sclf.fit(X_train, y_train.values)
model_metrics(sclf, X_train, X_test, y_train, y_test)

(2) Voting

from sklearn.ensemble import VotingClassifier

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=0.1,max_iter=100)

import xgboost as xgb
xgb_model = xgb.XGBClassifier(max_depth=6, n_estimators=100)

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200,min_samples_leaf=2,max_depth=6,oob_score=True)

from sklearn.ensemble import GradientBoostingClassifier
gbdt = GradientBoostingClassifier(learning_rate=0.1,min_samples_leaf=2,max_depth=6,n_estimators=100)

vot = VotingClassifier(estimators=[('lr', lr), ('rf', rf),('gbdt',gbdt),('xgb',xgb_model)], voting='hard')
vot.fit(train_data_X_sd,train_data_Y)

test["Survived"] = vot.predict(test_data_X_sd)
test[['PassengerId','Survived']].set_index('PassengerId').to_csv('vot5.csv')

A good model can be saved for reuse

# if the score is good enough (e.g. 0.81339 for 'rf10.pkl'), save the model for later use
from sklearn.externals import joblib  # removed in newer scikit-learn: use `import joblib`
joblib.dump(rf, 'rf10.pkl')
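
Loading it back later:

rf = joblib.load('rf10.pkl')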