Table of Contents
- 1. Importing packages
- 2. Loading the data
- 3. Feature engineering
- 4. Splitting the dataset
- 5. Model selection
- 6. Model tuning
- 7. Model evaluation
- 8. Model fusion
1. Importing packages
import numpy as np
import pandas as pd
import seaborn as sns  # data visualization
import matplotlib.pyplot as plt
%matplotlib inline
# suppress warnings
import warnings
warnings.filterwarnings("ignore")
2. Loading the data
Two common cases:
- When the data file sits in the working directory:
train = pd.read_csv('train.csv', encoding='gbk')
test = pd.read_csv('test.csv')
- When the data lives elsewhere:
root_path = '/opt/data/datasets/getting-started/titanic/input'
train = pd.read_csv('%s/%s' % (root_path, 'train.csv'))
test = pd.read_csv('%s/%s' % (root_path, 'test.csv'))
Reading data with a date column as the index:
weather_2012 = pd.read_csv('weather_2012.csv', parse_dates=True, index_col='Date/Time')
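Once the DatetimeIndex is in place, rows can be selected by partial date strings; a small usage sketch (the 'Temp (C)' column is the one used later in this post):
# select all of March 2012 by partial string indexing on the DatetimeIndex
march = weather_2012.loc['2012-03']
march['Temp (C)'].plot(figsize=(15, 5))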
# drop duplicate rows
data.drop_duplicates(inplace=True)
# check whether positive and negative samples are balanced
y = data.status
X = data.drop('status', axis=1)
y.value_counts()
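For proportions rather than raw counts, value_counts takes a normalize flag:
# class proportions, which make imbalance easier to read
y.value_counts(normalize=True)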
3. Feature engineering
3.1 Inspecting the data
# first 5 rows:
train.head(5)
# shape of the data (rows, columns):
print(data.shape)
# dtype of each feature:
data.dtypes
# list of all feature names:
data.columns
# inspect one feature (e.g. source) in detail:
data["source"]
# per-column name, non-NaN count, and dtype:
train.info()
# if there are too many columns to display them all, force full output:
train.info(null_counts=True, verbose=True)  # newer pandas renamed null_counts to show_counts
# summary statistics for the numeric columns:
train.describe()
# total number of nulls in each column:
df.isnull().sum()
pandas' str accessor provides a series of convenient string functions, such as contains here:
# times when the weather description mentions snow
weather_description = weather_2012['Weather']
is_snowing = weather_description.str.contains('Snow')
To get the median temperature for each month, there is a very useful function called resample():
# 'M' resamples by month, median() takes the median (mean() for the average), kind='bar' draws a bar chart
weather_2012['Temp (C)'].resample('M').median().plot(kind='bar')
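The next snippet concatenates temperature and snowiness, which are not defined above; a minimal sketch of how they could be built from the series already at hand (assuming monthly median temperature and the monthly fraction of snowy observations):
# monthly median temperature and monthly fraction of snowy observations
temperature = weather_2012['Temp (C)'].resample('M').median()
snowiness = is_snowing.astype(float).resample('M').mean()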
Concatenating two columns side by side:
stats = pd.concat([temperature, snowiness], axis=1)
Plotting the features in separate subplots:
stats.plot(kind='bar', subplots=True, figsize=(15, 10))
Counting the values of a feature, sorted by frequency:
data['City'].value_counts()
3.2 Feature analysis (statistics and plotting)
Handling missing values
# percentage of missing values per feature, relative to all samples
for feature in data.columns:
    summ = data[feature].isnull().sum()
    if summ:
        print('%.4f' % (summ * 100 / 4754), '%', '<==>', feature)  # 4754 is the total sample count in this dataset
# fill missing values with the column mean:
for feature in data.columns:
    summ = data[feature].isnull().sum()
    if summ:
        data[feature].fillna(data[feature].mean(), inplace=True)
# fill missing values with the mode (for categorical columns):
for col in X_cate.columns:
    summ = X_cate[col].isnull().sum()
    if summ:
        X_cate[col].fillna(X_cate[col].mode()[0], inplace=True)
# proportion of missing values in each column, sorted descending
col_missing = {}
for col in X_num.columns:
    summ = X_num[col].isnull().sum()
    if summ:
        col_missing[col] = float('%.4f' % (summ * 100 / len(data)))
col_missing = sorted(col_missing.items(), key=lambda d: -d[1])
for col, rate in col_missing[:10]:
    print(rate, '%', col)
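The same per-column missing ratios can be had without an explicit loop; a vectorized alternative:
# isnull().mean() gives the fraction of NaNs per column directly
missing_pct = X_num.isnull().mean().mul(100)
print(missing_pct[missing_pct > 0].sort_values(ascending=False).head(10))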
# fill missing values with the median (for numeric columns)
for col in X_num.columns:
    summ = X_num[col].isnull().sum()
    if summ:
        X_num[col].fillna(X_num[col].median(), inplace=True)
# fill with a fixed value
X_cate['student_feature'].fillna(-1, inplace=True)
# drop features whose values barely vary
for col in X_num.columns:
    rate = X_num[col].std()
    if rate < 0.1:
        print(col, rate)
        X_num.drop(col, axis=1, inplace=True)
# inspect, with the missing percentage per column attached:
X_cate.describe().T.assign(missing_pct=data.apply(lambda x: (len(x) - x.count()) / len(x)))
# min-max normalization
X_num = X_num.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
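scikit-learn offers the same scaling as MinMaxScaler, which also remembers the fitted min and max so new data can be transformed consistently; a sketch:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# fit_transform returns a numpy array; wrap it back into a DataFrame to keep the column names
X_num_scaled = pd.DataFrame(scaler.fit_transform(X_num), columns=X_num.columns, index=X_num.index)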
# count of each class label:
train['Survived'].value_counts()
# values taken by a string-typed feature and their counts:
data['reg_preference_for_trad'].value_counts()
# convert a string feature to integers (via enumerate)
dic = {}
for i, val in enumerate(list(data['reg_preference_for_trad'].unique())):
    dic[val] = i
dic
list(data['reg_preference_for_trad'].unique())
data['reg_preference_for_trad'] = data['reg_preference_for_trad'].map(dic)
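pd.factorize collapses the enumerate-and-map pattern into one call:
# codes are integers in order of first appearance; uniques maps codes back to the original strings
codes, uniques = pd.factorize(data['reg_preference_for_trad'])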
# correlation between numeric features via the corr() function
train_corr = train.drop('PassengerId', axis=1).corr()
train_corr
# corr() returns the correlation matrix (not covariance): values near 0 mean no linear correlation, > 0 positive, < 0 negative.
# correlation heatmap
a = plt.subplots(figsize=(15, 9))  # enlarge the canvas
a = sns.heatmap(train_corr, vmin=-1, vmax=1, annot=True, square=True)  # draw the heatmap
# relate each feature to the target
# group by Pclass
train.groupby(['Pclass'])[['Pclass', 'Survived']].mean()
# bar chart of survival rate per Pclass
train[['Pclass', 'Survived']].groupby(['Pclass']).mean().plot.bar()
# age
g = sns.FacetGrid(train, col='Survived', height=5)  # 'size' was renamed 'height' in newer seaborn
g.map(plt.hist, 'Age', bins=40)
train.groupby(['Age'])['Survived'].mean().plot()
# one-hot encode a feature:
train_test = pd.get_dummies(train_test, columns=['Pclass'])
# add a new feature by combining two existing ones
train_test['SibSp_Parch'] = train_test['SibSp'] + train_test['Parch']
train_test = pd.get_dummies(train_test, columns=['SibSp', 'Parch', 'SibSp_Parch'])
# drop the Name feature:
del train_test['Name']
# find the rows where Fare is missing:
train_test.loc[train_test["Fare"].isnull()]
# fare depends on Pclass and Embarked, so fill with the group mean computed on train:
train.groupby(by=["Pclass", "Embarked"]).Fare.mean()
# fill with the mean for Pclass=3 and Embarked=S, 14.644083
train_test["Fare"].fillna(14.644083, inplace=True)
Concatenating and saving the features
X = pd.concat([X_date, X_cate, X_num], axis=1)
import pickle
with open('feature.pkl', 'wb') as f:
    pickle.dump(X, f)
4. Splitting the dataset
# standardize the data:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
# fit the scaler on the training set, then transform both train and test
ss.fit(missing_age_X_train)
missing_age_X_train = ss.transform(missing_age_X_train)
missing_age_X_test = ss.transform(missing_age_X_test)
# predict the missing ages with Bayesian ridge regression
from sklearn import linear_model
lin = linear_model.BayesianRidge()
lin.fit(missing_age_X_train,missing_age_Y_train)
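The fitted model can then fill in the missing ages; a sketch, assuming missing_age_X_test holds the (scaled) feature rows whose Age is NaN:
# write the predicted ages back into the rows where Age is missing
train.loc[train['Age'].isnull(), 'Age'] = lin.predict(missing_age_X_test)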
# split the dataset
features = [x for x in data.columns if x not in ['status']]
from sklearn.model_selection import train_test_split
# preprocessing: standardization
from sklearn.preprocessing import StandardScaler
# features and label
X = data[features]
y = data.status
# train and test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2333)
# standardize the features (fit on train only)
std = StandardScaler()
X_train = std.fit_transform(X_train)
X_test = std.transform(X_test)
Loading previously saved features
# drop duplicates right after loading the dataset
data.drop_duplicates(inplace=True)
import pickle
# load the saved features
with open('feature.pkl', 'rb') as f:
    X = pickle.load(f)
# check whether positive and negative samples are balanced
y = data.status
y.value_counts()
5. Model selection
Note that linear models require standardized data, while tree models can use the raw features.
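One way to keep that distinction from leaking is a Pipeline, which ties the scaler to the model that needs it and fits the scaler only on the data it is trained on; a sketch on the raw (unscaled) split:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
# the scaler travels with the linear model; tree models can be fit on the raw features directly
lr_pipe = make_pipeline(StandardScaler(), LogisticRegression())
lr_pipe.fit(X_train, y_train)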
# logistic regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)
# SVM (linear, polynomial, RBF, and sigmoid kernels)
from sklearn import svm
svm_linear = svm.SVC(kernel = 'linear', probability=True).fit(X_train, y_train)
svm_poly = svm.SVC(kernel = 'poly', probability=True).fit(X_train, y_train)
svm_rbf = svm.SVC(probability=True).fit(X_train, y_train)
svm_sigmoid = svm.SVC(kernel = 'sigmoid',probability=True).fit(X_train, y_train)
# decision tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=4)
dt.fit(X_train, y_train)
6. Model tuning
Imports:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier
# logistic regression
lr = LogisticRegression(solver='liblinear')  # liblinear supports both the l1 and l2 penalties in the grid below
param = {'C': [1e-3, 0.01, 0.1, 1, 10, 100, 1e3], 'penalty': ['l1', 'l2']}
gsearch = GridSearchCV(lr, param_grid=param, scoring='roc_auc', cv=5)
gsearch.fit(X_train, y_train)
print('Best params:', gsearch.best_params_)
print('Best CV score on the training set:', gsearch.best_score_)
# refit with the best parameters
lr = LogisticRegression(C=0.1, penalty='l1', solver='liblinear')  # l1 needs the liblinear (or saga) solver in newer scikit-learn
lr.fit(X_train, y_train)
model_metrics(lr, X_train, X_test, y_train, y_test)  # model_metrics is defined in section 7
# SVM models
# 1) linear SVM
svm_linear = svm.SVC(kernel='linear', probability=True)
param = {'C': [0.01, 0.1, 1]}
gsearch = GridSearchCV(svm_linear, param_grid=param, scoring='roc_auc', cv=5)
gsearch.fit(X_train, y_train)
print('Best params:', gsearch.best_params_)
print('Best CV score on the training set:', gsearch.best_score_)
print('Score on the test set:', gsearch.score(X_test, y_test))
# refit with the best parameters
svm_linear = svm.SVC(C=0.01, kernel='linear', probability=True)
svm_linear.fit(X_train, y_train)
model_metrics(svm_linear, X_train, X_test, y_train, y_test)
# 2) polynomial SVM
svm_poly = svm.SVC(kernel='poly', probability=True)
param = {'C': [0.01, 0.1, 1]}
gsearch = GridSearchCV(svm_poly, param_grid=param, scoring='roc_auc', cv=5)
gsearch.fit(X_train, y_train)
print('Best params:', gsearch.best_params_)
print('Best CV score on the training set:', gsearch.best_score_)
print('Score on the test set:', gsearch.score(X_test, y_test))
# refit with the best parameters
svm_poly = svm.SVC(C=0.01, kernel='poly', probability=True)
svm_poly.fit(X_train, y_train)
model_metrics(svm_poly, X_train, X_test, y_train, y_test)
# 3) RBF (Gaussian) SVM
svm_rbf = svm.SVC(probability=True)
param = {'gamma': [0.01, 0.1, 1, 10],
         'C': [0.01, 0.1, 1]}
gsearch = GridSearchCV(svm_rbf, param_grid=param, scoring='roc_auc', cv=5)  # search over svm_rbf (the original mistakenly reused svm_poly)
gsearch.fit(X_train, y_train)
print('Best params:', gsearch.best_params_)
print('Best CV score on the training set:', gsearch.best_score_)
print('Score on the test set:', gsearch.score(X_test, y_test))
# refit with the best parameters
svm_rbf = svm.SVC(gamma=0.01, C=0.01, probability=True)
svm_rbf.fit(X_train, y_train)
model_metrics(svm_rbf, X_train, X_test, y_train, y_test)
# 4) sigmoid SVM
svm_sigmoid = svm.SVC(kernel='sigmoid', probability=True)
param = {'C': [0.01, 0.1, 1]}
gsearch = GridSearchCV(svm_sigmoid, param_grid=param, scoring='roc_auc', cv=5)
gsearch.fit(X_train, y_train)
print('Best params:', gsearch.best_params_)
print('Best CV score on the training set:', gsearch.best_score_)
print('Score on the test set:', gsearch.score(X_test, y_test))
# refit with the best parameters
svm_sigmoid = svm.SVC(C=0.01, kernel='sigmoid', probability=True)
svm_sigmoid.fit(X_train, y_train)
model_metrics(svm_sigmoid, X_train, X_test, y_train, y_test)
7. Model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score,roc_curve, auc
import matplotlib.pyplot as plt
%matplotlib inline
def model_metrics(clf, X_train, X_test, y_train, y_test):
    # predictions on the train and test sets
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)
    y_train_proba = clf.predict_proba(X_train)[:, 1]
    y_test_proba = clf.predict_proba(X_test)[:, 1]
    # accuracy
    print('[accuracy]', end=' ')
    print('train:', '%.4f' % accuracy_score(y_train, y_train_pred), end=' ')
    print('test:', '%.4f' % accuracy_score(y_test, y_test_pred))
    # precision
    print('[precision]', end=' ')
    print('train:', '%.4f' % precision_score(y_train, y_train_pred), end=' ')
    print('test:', '%.4f' % precision_score(y_test, y_test_pred))
    # recall
    print('[recall]', end=' ')
    print('train:', '%.4f' % recall_score(y_train, y_train_pred), end=' ')
    print('test:', '%.4f' % recall_score(y_test, y_test_pred))
    # f1-score
    print('[f1-score]', end=' ')
    print('train:', '%.4f' % f1_score(y_train, y_train_pred), end=' ')
    print('test:', '%.4f' % f1_score(y_test, y_test_pred))
    # AUC
    print('[auc]', end=' ')
    print('train:', '%.4f' % roc_auc_score(y_train, y_train_proba), end=' ')
    print('test:', '%.4f' % roc_auc_score(y_test, y_test_proba))
    # ROC curves
    fpr_train, tpr_train, thresholds_train = roc_curve(y_train, y_train_proba, pos_label=1)
    fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_test_proba, pos_label=1)
    label = ["Train - AUC:{:.4f}".format(auc(fpr_train, tpr_train)),
             "Test - AUC:{:.4f}".format(auc(fpr_test, tpr_test))]
    plt.plot(fpr_train, tpr_train)
    plt.plot(fpr_test, tpr_test)
    plt.plot([0, 1], [0, 1], 'd--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(label, loc=4)
    plt.title("ROC curve")
Running the evaluation on each model:
# logistic regression
model_metrics(lr, X_train, X_test, y_train, y_test)
# linear SVM
model_metrics(svm_linear, X_train, X_test, y_train, y_test)
# polynomial SVM
model_metrics(svm_poly, X_train, X_test, y_train, y_test)
# RBF SVM
model_metrics(svm_rbf, X_train, X_test, y_train, y_test)
# sigmoid SVM
model_metrics(svm_sigmoid, X_train, X_test, y_train, y_test)
# decision tree
model_metrics(dt, X_train, X_test, y_train, y_test)
8. Model fusion
(1) Stacking
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from mlxtend.classifier import StackingClassifier
# parameters obtained from the tuning step
lr = LogisticRegression(C=0.1, penalty='l1', solver='liblinear')  # liblinear supports the l1 penalty
svm_linear = svm.SVC(C=0.01, kernel='linear', probability=True)
svm_poly = svm.SVC(C=0.01, kernel='poly', probability=True)
svm_rbf = svm.SVC(gamma=0.01, C=0.01, probability=True)
svm_sigmoid = svm.SVC(C=0.01, kernel='sigmoid', probability=True)
dt = DecisionTreeClassifier(max_depth=11, min_samples_split=550, min_samples_leaf=80, max_features=19)
xgb = XGBClassifier(learning_rate=0.01, n_estimators=180, max_depth=3, min_child_weight=5,
                    gamma=0.4, subsample=0.5, colsample_bytree=0.9, reg_alpha=1,
                    objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
lgb = LGBMClassifier(learning_rate=0.1, n_estimators=60, max_depth=3, min_child_weight=11,
                     min_split_gain=0.1, subsample=0.5, colsample_bytree=0.8, reg_alpha=1e-5,
                     nthread=4, scale_pos_weight=1, seed=27)  # LightGBM has no gamma parameter; its equivalent is min_split_gain
import warnings
warnings.filterwarnings("ignore")
sclf = StackingClassifier(classifiers=[svm_linear, svm_poly, svm_rbf,
svm_sigmoid, dt, xgb, lgb], meta_classifier=lr)
sclf.fit(X_train, y_train.values)
model_metrics(sclf, X_train, X_test, y_train, y_test)
2) Using the class probabilities produced by the first-level classifiers as new features
With use_probas=True, there are two ways to handle the output probabilities.
Suppose there are 2 first-level classifiers and 3 classes, with output probabilities p1=[0.2,0.5,0.3] and p2=[0.3,0.4,0.4].
If average_probas=True, the classifiers' outputs are averaged: p=[0.25,0.45,0.35]
[Recommended] If average_probas=False, all classifier outputs are kept as new features: p=[0.2,0.5,0.3,0.3,0.4,0.4], as the quick check below illustrates.
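A quick numpy check of the two behaviors on the example above:
import numpy as np
p1 = np.array([0.2, 0.5, 0.3])
p2 = np.array([0.3, 0.4, 0.4])
print(np.mean([p1, p2], axis=0))   # average_probas=True  -> [0.25 0.45 0.35]
print(np.concatenate([p1, p2]))    # average_probas=False -> [0.2 0.5 0.3 0.3 0.4 0.4]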
sclf = StackingClassifier(classifiers=[svm_linear, svm_poly, svm_rbf, svm_sigmoid, dt, xgb, lgb], meta_classifier=lr, use_probas=True,average_probas=False)
sclf.fit(X_train, y_train.values)
model_metrics(sclf, X_train, X_test, y_train, y_test)
(2) Voting
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=0.1,max_iter=100)
import xgboost as xgb
xgb_model = xgb.XGBClassifier(max_depth=6, n_estimators=100)  # min_samples_leaf and num_round are not XGBClassifier parameters, so they are dropped here
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200,min_samples_leaf=2,max_depth=6,oob_score=True)
from sklearn.ensemble import GradientBoostingClassifier
gbdt = GradientBoostingClassifier(learning_rate=0.1,min_samples_leaf=2,max_depth=6,n_estimators=100)
vot = VotingClassifier(estimators=[('lr', lr), ('rf', rf),('gbdt',gbdt),('xgb',xgb_model)], voting='hard')
vot.fit(train_data_X_sd,train_data_Y)
test["Survived"] = vot.predict(test_data_X_sd)
test[['PassengerId','Survived']].set_index('PassengerId').to_csv('vot5.csv')
A well-performing model can be saved for later reuse:
# if the score is good enough, save the model so it can be loaded directly next time (0.81339, 'rf10.pkl')
import joblib  # sklearn.externals.joblib was removed in newer scikit-learn; use the standalone joblib package
joblib.dump(rf, 'rf10.pkl')
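Loading it back later is symmetric:
# reload the saved model and use it directly
rf = joblib.load('rf10.pkl')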