泰坦尼克救援预测

# Display a screenshot introducing the problem statement.
# NOTE(review): hard-coded absolute Windows path — only resolves on the
# original author's machine; replace with a relative path if re-run elsewhere.
from IPython.display import Image
Image(filename=r'C:\Users\a\Desktop\暑假\Titantic\QQ截图20190827081938.png',width=800)

泰坦尼克之灾机器学习——泰坦尼克救援预测

第一步:数据分析

# Step 1: exploratory data analysis.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Read the training file and show summary statistics for the numeric columns
# (the 714 non-null Age count reveals 177 missing ages).
data = pd.read_csv('titanic_train.csv')
data.describe()

PassengerId

Survived

Pclass

Age

SibSp

Parch

Fare

count

891.000000

891.000000

891.000000

714.000000

891.000000

891.000000

891.000000

mean

446.000000

0.383838

2.308642

29.699118

0.523008

0.381594

32.204208

std

257.353842

0.486592

0.836071

14.526497

1.102743

0.806057

49.693429

min

1.000000

0.000000

1.000000

0.420000

0.000000

0.000000

0.000000

25%

223.500000

0.000000

2.000000

20.125000

0.000000

0.000000

7.910400

50%

446.000000

0.000000

3.000000

28.000000

0.000000

0.000000

14.454200

75%

668.500000

1.000000

3.000000

38.000000

1.000000

0.000000

31.000000

max

891.000000

1.000000

3.000000

80.000000

8.000000

6.000000

512.329200

可以看到数据中的Age列、Cabin列以及Embarked列有缺失值;Cabin列缺失值数量太多,直接舍去。另外,Ticket列对实际获救情况应该也没有什么关系

# Peek at the first five rows to see the raw column layout.
data.head(n=5)

PassengerId

Survived

Pclass

Name

Sex

Age

SibSp

Parch

Ticket

Fare

Cabin

Embarked

0

1

0

3

Braund, Mr. Owen Harris

male

22.0

1

0

A/5 21171

7.2500

NaN

S

1

2

1

1

Cumings, Mrs. John Bradley (Florence Briggs Th...

female

38.0

1

0

PC 17599

71.2833

C85

C

2

3

1

3

Heikkinen, Miss. Laina

female

26.0

0

0

STON/O2. 3101282

7.9250

NaN

S

3

4

1

1

Futrelle, Mrs. Jacques Heath (Lily May Peel)

female

35.0

1

0

113803

53.1000

C123

S

4

5

0

3

Allen, Mr. William Henry

male

35.0

0

0

373450

8.0500

NaN

S

查阅当时的背景资料发现在当时泰坦尼克失事之后的逃生政策是妇女和儿童优先,下面查看一下妇女和儿童的逃生率,可以看到女性的平均获救人数是0.74远大于男性的

# The evacuation policy was "women and children first": compare the mean
# survival rate by sex. Female survival (~0.74) far exceeds male (~0.19).
data.groupby('Sex')[['Survived']].mean()

Survived

Sex

female

0.742038

male

0.188908

查看获救人群的平均年龄 然而好像说明不了什么问题

# Mean age of victims vs. survivors — the two groups look similar, so this
# view alone is not very conclusive.
data.groupby('Survived')[['Age']].mean()

Age

Survived

0

30.626179

1

28.343690

查看女性人群的平均年龄 然而好像说明不了什么问题

# Mean age of female passengers split by survival — again not conclusive.
data.loc[data['Sex'] == 'female'].groupby('Survived')[['Age']].mean()

Age

Survived

0

25.046875

1

28.847716

查阅资料发现当时儿童的定义为14岁以下,查看儿童的获救率 发现随着年龄的增长获救率会降低这也印证了妇女和儿童优先的逃生政策

# Children were defined as under 14 at the time. Sweep an age cap from 0 to 19
# and print survival rate by sex for passengers at or below each cap; survival
# falls as the cap rises, consistent with "women and children first", so age
# is treated as an important feature.
for age_cap in range(20):
    youngest = data[data['Age'] <= age_cap]
    print(youngest.pivot_table(index='Sex', values='Survived'))
Empty DataFrame
Columns: []
Index: []
        Survived
Sex             
female       1.0
male         0.8
        Survived
Sex             
female  0.600000
male    0.642857
        Survived
Sex             
female  0.583333
male    0.722222
        Survived
Sex             
female  0.705882
male    0.652174
        Survived
Sex             
female  0.761905
male    0.652174
        Survived
Sex             
female  0.739130
male    0.666667
        Survived
Sex             
female  0.750000
male    0.615385
        Survived
Sex             
female  0.730769
male    0.607143
        Survived
Sex             
female  0.633333
male    0.593750
        Survived
Sex             
female  0.612903
male    0.575758
        Survived
Sex             
female  0.593750
male    0.555556
        Survived
Sex             
female  0.593750
male    0.567568
        Survived
Sex             
female  0.617647
male    0.567568
        Survived
Sex             
female  0.631579
male    0.538462
        Survived
Sex             
female  0.651163
male    0.525000
        Survived
Sex             
female  0.673469
male    0.431373
        Survived
Sex             
female  0.690909
male    0.396552
        Survived
Sex             
female  0.676471
male    0.338028
        Survived
Sex             
female  0.706667
male    0.292135

在当时的社会等级制度严格 查看一下三个船舱等级对应的获救率 发现船舱等级不同获救率也会有很大的不同所以船舱等级也是一个重要特征

# Class hierarchy was strict at the time: survival rate differs sharply across
# the three cabin classes, so Pclass is an important feature.
data.groupby('Pclass')[['Survived']].mean()

Survived

Pclass

1

0.629630

2

0.472826

3

0.242363

那么登船地点会不会影响获救率呢 看起来登船地点对应的获救率也有较大区别,可能不同的登船地点上到船上的位置不同,距离逃生地点的远近也不同

# Does the port of embarkation matter? The rates differ noticeably — perhaps
# boarding point affected cabin location and distance to the lifeboats.
data.groupby('Embarked')[['Survived']].mean()

Survived

Embarked

C

0.553571

Q

0.389610

S

0.336957

那么同行的兄弟姐妹/配偶数量会不会影响获救率呢 可以看到有1个同行亲属时获救率最高,数量更多时获救率明显降低

# Survival rate by number of siblings/spouses aboard: highest at SibSp == 1,
# then dropping off steeply for larger values.
data.groupby('SibSp')[['Survived']].mean()

Survived

SibSp

0

0.345395

1

0.535885

2

0.464286

3

0.250000

4

0.166667

5

0.000000

8

0.000000

那么同行的父母和子女数量会不会影响获救率呢 可以看到有父母或子女同行时获救率总体更高(但并非单调递增)

# Survival rate by number of parents/children aboard: travelling with family
# generally corresponds to a higher rate than travelling alone (Parch == 0).
data.groupby('Parch')[['Survived']].mean()

Survived

Parch

0

0.343658

1

0.550847

2

0.500000

3

0.600000

4

0.000000

5

0.200000

6

0.000000

至此,我们认为重要特征为Pclass,Sex,Age,Embarked,SibSp,Parch

构造一个新的数据表

# Selected features: Pclass, Sex, Age, SibSp, Parch, Embarked (plus Fare and
# the Survived label) — build the working table from them.
columns = ['Pclass','Sex','Age','SibSp','Parch','Embarked','Survived','Fare']
# .copy() makes the subset an independent DataFrame so the imputation and
# label-encoding cells below can mutate it without SettingWithCopyWarning.
new_data = data[columns].copy()
new_data.head()

Pclass

Sex

Age

SibSp

Parch

Embarked

Survived

Fare

0

3

male

22.0

1

0

S

0

7.2500

1

1

female

38.0

1

0

C

1

71.2833

2

3

female

26.0

0

0

S

1

7.9250

3

1

female

35.0

1

0

S

1

53.1000

4

3

male

35.0

0

0

S

0

8.0500

查看缺失值

# Count missing values per column (Age: 177, Embarked: 2 at this point).
new_data.isna().sum()
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Embarked      2
Survived      0
Fare          0
dtype: int64

填充缺失值,年龄填充为中位数,登船地点填充为众数

# Impute missing values: Age with its median (Embarked, with the mode 'S',
# is handled in a later cell).
# Assign the filled column back instead of using chained `inplace=True`
# fillna, which triggers SettingWithCopyWarning and is deprecated in pandas.
new_data['Age'] = new_data['Age'].fillna(new_data['Age'].median())
print(new_data['Age'].median())
print(new_data['Embarked'].mode())
# Inspect the summary statistics after imputation.
new_data.describe()
28.0
0    S
dtype: object

Pclass

Age

SibSp

Parch

Survived

Fare

count

891.000000

891.000000

891.000000

891.000000

891.000000

891.000000

mean

2.308642

29.361582

0.523008

0.381594

0.383838

32.204208

std

0.836071

13.019697

1.102743

0.806057

0.486592

49.693429

min

1.000000

0.420000

0.000000

0.000000

0.000000

0.000000

25%

2.000000

22.000000

0.000000

0.000000

0.000000

7.910400

50%

3.000000

28.000000

0.000000

0.000000

0.000000

14.454200

75%

3.000000

35.000000

1.000000

0.000000

1.000000

31.000000

max

3.000000

80.000000

8.000000

6.000000

1.000000

512.329200

查看空值

# Re-check missing values: only Embarked (2) remains.
new_data.isna().sum()
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Embarked    2
Survived    0
Fare        0
dtype: int64

将登船地点填充

# Fill the two missing embarkation ports with the mode, 'S', then verify
# no missing values remain.
missing_port = new_data['Embarked'].isna()
new_data.loc[missing_port, 'Embarked'] = 'S'
new_data.isna().sum()
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Embarked    0
Survived    0
Fare        0
dtype: int64

将男性和女性的字符值转化为数值男0,女1

# Encode the Sex strings as numbers: male -> 0, female -> 1.
for label, code in (('male', 0), ('female', 1)):
    new_data.loc[new_data['Sex'] == label, 'Sex'] = code

将登船地点对应的字符值转化为数值 C:0,Q:1,S:2

# Encode the embarkation port strings as numbers: C -> 0, Q -> 1, S -> 2.
for port, code in (('C', 0), ('Q', 1), ('S', 2)):
    new_data.loc[new_data['Embarked'] == port, 'Embarked'] = code
new_data.head()

Pclass

Sex

Age

SibSp

Parch

Embarked

Survived

Fare

0

3

0

22.0

1

0

2

0

7.2500

1

1

1

38.0

1

0

0

1

71.2833

2

3

1

26.0

0

0

2

1

7.9250

3

1

1

35.0

1

0

2

1

53.1000

4

3

0

35.0

0

0

2

0

8.0500

第二步:初步建模调整参数

使用线性回归

# Step 2, model 1: linear regression scored as a classifier by thresholding
# its continuous output at 0.60.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
# NOTE: `random_state` is only valid with shuffle=True (modern sklearn raises
# otherwise), and shuffle defaults to False, so it is dropped — the
# deterministic unshuffled splits are unchanged.
kf = KFold(n_splits=5)
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Embarked"]
LR = LinearRegression()
accuracys = []
for train, test in kf.split(new_data):
    LR.fit(new_data.loc[train, predictors], new_data.loc[train, 'Survived'])
    pred = LR.predict(new_data.loc[test, predictors])
    # Map the regression scores to 0/1 survival predictions.
    pred[pred >= 0.60] = 1
    pred[pred < 0.60] = 0
    # Fraction of correct predictions on this fold.
    accuracy = (pred == new_data.loc[test, 'Survived'].values).mean()
    accuracys.append(accuracy)
print(np.mean(accuracys))
0.8035653756826313

使用逻辑回归

# Model 2: L2-regularised logistic regression, evaluated two equivalent ways.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
# random_state dropped: it is invalid without shuffle=True; splits unchanged.
kf = KFold(n_splits=5)
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Embarked"]
lr = LogisticRegression(C=0.1, solver='liblinear', penalty='l2')
# Method 1: library cross-validation. cross_val_score clones the estimator,
# so the redundant up-front fit on the full data has been removed.
print(cross_val_score(lr, new_data[predictors], new_data['Survived'], cv=kf).mean())
# Method 2: manual fold loop thresholding P(survived) at 0.50 — exactly the
# default decision rule, so both printed numbers agree.
accuracys = []
for train, test in kf.split(new_data):
    lr.fit(new_data.loc[train, predictors], new_data.loc[train, 'Survived'])
    pos_proba = lr.predict_proba(new_data.loc[test, predictors])[:, 1]
    pos_proba[pos_proba >= 0.50] = 1
    pos_proba[pos_proba < 0.50] = 0
    accuracy = (pos_proba == new_data.loc[test, 'Survived'].values).mean()
    accuracys.append(accuracy)
print(np.mean(accuracys))
0.7956939300734418
0.7956939300734418

使用决策树

# Model 3: a single decision tree with mild pre-pruning to limit overfitting.
from sklearn import tree
dt = tree.DecisionTreeClassifier(min_samples_split=4, min_samples_leaf=4)
# random_state dropped: it is invalid without shuffle=True; splits unchanged.
kf = KFold(n_splits=5)
accuracys = []
for train, test in kf.split(new_data):
    dt.fit(new_data.loc[train, predictors], new_data.loc[train, 'Survived'])
    pred = dt.predict(new_data.loc[test, predictors])
    accuracy = (pred == new_data.loc[test, 'Survived'].values).mean()
    accuracys.append(accuracy)
print(np.mean(accuracys))
# Cross-check with the library scorer; small differences can come from the
# tree's internal randomness between the two evaluations.
print(cross_val_score(dt, new_data[predictors], new_data['Survived'], cv=kf).mean())
0.804758018956751
0.8036344234511331

第三步:使用集成学习算法,随机森林

# Step 3, ensemble model: random forest of 80 pruned trees.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
alg = RandomForestClassifier(random_state=1, n_estimators=80, min_samples_split=4, min_samples_leaf=4)
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Embarked"]
# random_state dropped: it is invalid without shuffle=True; splits unchanged.
kf = KFold(n_splits=5)
# Method 1: library cross-validation.
scores = cross_val_score(alg, new_data[predictors], new_data["Survived"], cv=kf)
print(scores.mean())
# Method 2: manual fold loop — matches method 1 because the splits and the
# forest's fixed random_state are identical.
accuracys = []
for train, test in kf.split(new_data):
    alg.fit(new_data.loc[train, predictors], new_data.loc[train, 'Survived'])
    pred = alg.predict(new_data.loc[test, predictors])
    accuracy = (pred == new_data.loc[test, 'Survived'].values).mean()
    accuracys.append(accuracy)
print(np.mean(accuracys))
0.820475801895675
0.820475801895675