import pandas as pd
import keras
from keras import layers
import numpy as np
# Load the training set into a DataFrame.
data = pd.read_csv(filepath_or_buffer="./data/tt_train.csv")
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
# 查看所有的列索引名,方便去复制要用的列
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
# Keep the label (Survived) together with the columns that will be turned
# into model inputs; the label is split off once preprocessing is done.
x = data[[
    'Survived',
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]]
Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S |
1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C |
2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S |
3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S |
4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S |
# Next, handle Embarked (the port of embarkation); first check which distinct values it takes
array(['S', 'C', 'Q', nan], dtype=object)
# Comparing the column against a single category yields a True/False value
# for every sample.
x['Embarked'].eq('S').head()  # the full Series is long, so show only the first 5 rows
0 True
1 False
2 True
3 True
4 True
Name: Embarked, dtype: bool
# 转换成1/0
0 1
1 0
2 1
3 1
4 1
Name: Embarked, dtype: int32
# Work on an explicit copy before adding columns; x was sliced out of `data`,
# so assigning new columns to it directly makes pandas emit a warning.
x = x.copy()
# One-hot encode Embarked by hand, using the "compare, then cast to 0/1"
# trick: the column has three non-null categories (S, C, Q), so three
# indicator columns are added. A row whose Embarked is NaN compares False
# against every category and therefore gets 0 in all three columns.
x.loc[:, 'Embarked_S'] = (x.Embarked == 'S').astype('int')
x.loc[:, 'Embarked_C'] = (x.Embarked == 'C').astype('int')
x.loc[:, 'Embarked_Q'] = (x.Embarked == 'Q').astype('int')
Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | Embarked_S | Embarked_C | Embarked_Q | |
0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | 1 | 0 | 0 |
1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | 0 | 1 | 0 |
2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | 1 | 0 | 0 |
3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | 1 | 0 | 0 |
4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | 1 | 0 | 0 |
# Drop the original string-valued Embarked column.
del x['Embarked']
# One-hot encode the remaining string column (Sex) into Sex_female / Sex_male,
# the same way the test set is processed further down. Without this step x
# still contains strings and cannot be fed to the model.
x = pd.get_dummies(x)
Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked_S | Embarked_C | Embarked_Q | |
0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | 1 | 0 | 0 |
1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | 0 | 1 | 0 |
2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | 1 | 0 | 0 |
3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | 1 | 0 | 0 |
4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | 1 | 0 | 0 |
Survived | Pclass | Age | SibSp | Parch | Fare | Embarked_S | Embarked_C | Embarked_Q | Sex_female | Sex_male | |
0 | 0 | 3 | 22.0 | 1 | 0 | 7.2500 | 1 | 0 | 0 | 0 | 1 |
1 | 1 | 1 | 38.0 | 1 | 0 | 71.2833 | 0 | 1 | 0 | 1 | 0 |
2 | 1 | 3 | 26.0 | 0 | 0 | 7.9250 | 1 | 0 | 0 | 1 | 0 |
3 | 1 | 1 | 35.0 | 1 | 0 | 53.1000 | 1 | 0 | 0 | 1 | 0 |
4 | 0 | 3 | 35.0 | 0 | 0 | 8.0500 | 1 | 0 | 0 | 0 | 1 |
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
Survived 891 non-null int64
Pclass 891 non-null int64
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Fare 891 non-null float64
Embarked_S 891 non-null int32
Embarked_C 891 non-null int32
Embarked_Q 891 non-null int32
Sex_female 891 non-null uint8
Sex_male 891 non-null uint8
dtypes: float64(2), int32(3), int64(4), uint8(2)
memory usage: 54.0 KB
# Handle the missing values in Age by replacing them with the column mean
# (the mean is computed over the non-missing entries).
x['Age'] = x['Age'].fillna(x['Age'].mean())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
Survived 891 non-null int64
Pclass 891 non-null int64
Age 891 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Fare 891 non-null float64
Embarked_S 891 non-null int32
Embarked_C 891 non-null int32
Embarked_Q 891 non-null int32
Sex_female 891 non-null uint8
Sex_male 891 non-null uint8
dtypes: float64(2), int32(3), int64(4), uint8(2)
memory usage: 54.0 KB
Survived | Pclass | Age | SibSp | Parch | Fare | Embarked_S | Embarked_C | Embarked_Q | Sex_female | Sex_male | |
0 | 0 | 3 | 22.0 | 1 | 0 | 7.2500 | 1 | 0 | 0 | 0 | 1 |
1 | 1 | 1 | 38.0 | 1 | 0 | 71.2833 | 0 | 1 | 0 | 1 | 0 |
2 | 1 | 3 | 26.0 | 0 | 0 | 7.9250 | 1 | 0 | 0 | 1 | 0 |
3 | 1 | 1 | 35.0 | 1 | 0 | 53.1000 | 1 | 0 | 0 | 1 | 0 |
4 | 0 | 3 | 35.0 | 0 | 0 | 8.0500 | 1 | 0 | 0 | 0 | 1 |
# Pclass is a category code (1/2/3), not a quantity, so expand it into three
# 0/1 indicator columns and then drop the original column.
x.loc[:, 'P1'] = x['Pclass'].eq(1).astype('int')
x.loc[:, 'P2'] = x['Pclass'].eq(2).astype('int')
x.loc[:, 'P3'] = x['Pclass'].eq(3).astype('int')
x.drop('Pclass', axis=1, inplace=True)
Survived | Age | SibSp | Parch | Fare | Embarked_S | Embarked_C | Embarked_Q | Sex_female | Sex_male | P1 | P2 | P3 | |
0 | 0 | 22.0 | 1 | 0 | 7.2500 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
1 | 1 | 38.0 | 1 | 0 | 71.2833 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
2 | 1 | 26.0 | 0 | 0 | 7.9250 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
3 | 1 | 35.0 | 1 | 0 | 53.1000 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
4 | 0 | 35.0 | 0 | 0 | 8.0500 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
# Preprocessing is finished: take the prediction target out as y and remove
# it from the feature matrix x.
y = data['Survived']
x.drop('Survived', axis=1, inplace=True)
(x.shape, y.shape)
((891, 12), (891,))
# Sequential model holding a single fully connected layer: 12 input features,
# 1 output unit, sigmoid activation.
model = keras.Sequential([
    layers.Dense(1, input_dim=12, activation='sigmoid'),
])
Layer (type) Output Shape Param #
dense_1 (Dense) (None, 1) 13
Total params: 13
Trainable params: 13
Non-trainable params: 0
# Compile the model. The loss and metric settings only take effect when they
# are passed to model.compile(); as bare assignments they would merely create
# unrelated variables and model.fit() would refuse to run on the uncompiled
# model.
model.compile(
    # NOTE(review): the optimizer argument was lost from this cell;
    # 'adam' is assumed here -- confirm against the original notebook.
    optimizer='adam',
    loss='binary_crossentropy',  # binary cross-entropy as the two-class classification loss
    metrics=['acc'],  # report accuracy (fraction of correct predictions) during training
)
# Train the model; the returned object carries information recorded during
# training. verbose=0 suppresses the per-epoch output on stdout, which would
# otherwise make the exported markdown far too long.
history = model.fit(x=x, y=y, epochs=300, verbose=0)
# 查看保留了哪些数据(该对象的history属性就是一个字典,keys()就是取字典的键)
dict_keys(['loss', 'acc'])
import matplotlib.pyplot as plt
%matplotlib inline
# Here 300 == len(history.history.get('loss')) == len(history.history.get('acc')) == epochs
# Load the test set; it goes through the same preprocessing as the training
# data.
df = pd.read_csv(filepath_or_buffer="./data/tt_test.csv")
PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | Q |
1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | S |
2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
3 | 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | NaN | S |
4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S |
# The test set has no label column (Survived), so only the feature columns
# are selected and nothing has to be split off afterwards.
xt = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
# Copy before adding columns so pandas does not warn about writing to a slice.
xt = xt.copy()
# One-hot encode Embarked BEFORE dropping it. Deleting the column without
# creating Embarked_S/C/Q leaves xt with 9 columns, which does not match the
# 12 inputs the model was built with (input_dim=12).
xt.loc[:, 'Embarked_S'] = (xt.Embarked == 'S').astype('int')
xt.loc[:, 'Embarked_C'] = (xt.Embarked == 'C').astype('int')
xt.loc[:, 'Embarked_Q'] = (xt.Embarked == 'Q').astype('int')
del xt['Embarked']
# One-hot encode the remaining string column (Sex -> Sex_female / Sex_male).
xt = pd.get_dummies(xt)
# Fill missing ages with the mean of the known ages.
xt['Age'] = xt.Age.fillna(xt.Age.mean())
# Guard against missing fares as well: a NaN input makes the prediction NaN.
# This is a no-op when Fare has no missing values.
xt['Fare'] = xt.Fare.fillna(xt.Fare.mean())
# Expand the Pclass category code into three indicator columns, then drop it.
xt.loc[:, 'P1'] = (xt.Pclass == 1).astype('int')
xt.loc[:, 'P2'] = (xt.Pclass == 2).astype('int')
xt.loc[:, 'P3'] = (xt.Pclass == 3).astype('int')
del xt['Pclass']
Age | SibSp | Parch | Fare | Embarked_S | Embarked_C | Embarked_Q | Sex_female | Sex_male | P1 | P2 | P3 | |
0 | 34.5 | 0 | 0 | 7.8292 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
1 | 47.0 | 1 | 0 | 7.0000 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
2 | 62.0 | 0 | 0 | 9.6875 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 |
3 | 27.0 | 0 | 0 | 8.6625 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
4 | 22.0 | 1 | 1 | 12.2875 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
# Run the model on the preprocessed test features; each row gets one sigmoid
# output.
predictions = model.predict(xt)
# Build the submission file: threshold the outputs at 0.5 to obtain 0/1
# labels and pair them with the passenger ids.
submission = pd.DataFrame(
    {
        "PassengerId": df["PassengerId"],
        "Survived": (predictions.reshape(-1) > 0.5).astype('int'),
    }
)
submission.to_csv("./data/tt_upload.csv", index=False)
