#1.第一步,导包
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pylab import mpl
#修改字符集
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False
from datetime import datetime
#这是matplotlib的第二个模块,可以在绘图时同时进行numpy计算
import pylab as pl
#这是对matplotlib的高级封装
import seaborn as sn
#这是日历包
import calendar
# 2. Data acquisition, inspection and preprocessing
# 2.1 Load the raw data set from train.csv (resolved against the working directory)
#     and echo it so the columns can be inspected.
bikedata = pd.read_csv(filepath_or_buffer="train.csv")
print(bikedata)
datetime season holiday workingday casual registered \
0 2011/1/1 0:00 1 0 0 3 13
1 2011/1/1 1:00 1 0 0 8 32
2 2011/1/1 2:00 1 0 0 5 27
3 2011/1/1 3:00 1 0 0 3 10
4 2011/1/1 4:00 1 0 0 0 1
5 2011/1/1 5:00 1 0 0 0 1
6 2011/1/1 6:00 1 0 0 2 0
7 2011/1/1 7:00 1 0 0 1 2
8 2011/1/1 8:00 1 0 0 1 7
9 2011/1/1 9:00 1 0 0 8 6
10 2011/1/1 10:00 1 0 0 12 24
11 2011/1/1 11:00 1 0 0 26 30
12 2011/1/1 12:00 1 0 0 29 55
13 2011/1/1 13:00 1 0 0 47 47
14 2011/1/1 14:00 1 0 0 35 71
15 2011/1/1 15:00 1 0 0 40 70
16 2011/1/1 16:00 1 0 0 41 52
17 2011/1/1 17:00 1 0 0 15 52
18 2011/1/1 18:00 1 0 0 9 26
19 2011/1/1 19:00 1 0 0 6 31
20 2011/1/1 20:00 1 0 0 11 25
21 2011/1/1 21:00 1 0 0 3 31
22 2011/1/1 22:00 1 0 0 11 17
23 2011/1/1 23:00 1 0 0 15 24
24 2011/1/2 0:00 1 0 0 4 13
25 2011/1/2 1:00 1 0 0 1 16
26 2011/1/2 2:00 1 0 0 1 8
27 2011/1/2 3:00 1 0 0 2 4
28 2011/1/2 4:00 1 0 0 2 1
29 2011/1/2 6:00 1 0 0 0 2
... ... ... ... ... ... ...
10856 2012/12/18 18:00 4 0 1 13 512
10857 2012/12/18 19:00 4 0 1 19 334
10858 2012/12/18 20:00 4 0 1 4 264
10859 2012/12/18 21:00 4 0 1 9 159
10860 2012/12/18 22:00 4 0 1 5 127
10861 2012/12/18 23:00 4 0 1 1 80
10862 2012/12/19 0:00 4 0 1 6 35
10863 2012/12/19 1:00 4 0 1 1 14
10864 2012/12/19 2:00 4 0 1 1 2
10865 2012/12/19 3:00 4 0 1 0 5
10866 2012/12/19 4:00 4 0 1 1 6
10867 2012/12/19 5:00 4 0 1 2 29
10868 2012/12/19 6:00 4 0 1 3 109
10869 2012/12/19 7:00 4 0 1 3 360
10870 2012/12/19 8:00 4 0 1 13 665
10871 2012/12/19 9:00 4 0 1 8 309
10872 2012/12/19 10:00 4 0 1 17 147
10873 2012/12/19 11:00 4 0 1 31 169
10874 2012/12/19 12:00 4 0 1 33 203
10875 2012/12/19 13:00 4 0 1 30 183
10876 2012/12/19 14:00 4 0 1 33 185
10877 2012/12/19 15:00 4 0 1 28 209
10878 2012/12/19 16:00 4 0 1 37 297
10879 2012/12/19 17:00 4 0 1 26 536
10880 2012/12/19 18:00 4 0 1 23 546
10881 2012/12/19 19:00 4 0 1 7 329
10882 2012/12/19 20:00 4 0 1 10 231
10883 2012/12/19 21:00 4 0 1 4 164
10884 2012/12/19 22:00 4 0 1 12 117
10885 2012/12/19 23:00 4 0 1 4 84
count
0 16
1 40
2 32
3 13
4 1
5 1
6 2
7 3
8 8
9 14
10 36
11 56
12 84
13 94
14 106
15 110
16 93
17 67
18 35
19 37
20 36
21 34
22 28
23 39
24 17
25 17
26 9
27 6
28 3
29 2
... ...
10856 525
10857 353
10858 268
10859 168
10860 132
10861 81
10862 41
10863 15
10864 3
10865 5
10866 7
10867 31
10868 112
10869 363
10870 678
10871 317
10872 164
10873 200
10874 236
10875 213
10876 218
10877 237
10878 334
10879 562
10880 569
10881 336
10882 241
10883 168
10884 129
10885 88
[10886 rows x 7 columns]
# 2.2 Inspect the data
# Each entry pairs the heading printed to stdout with the view of the frame it introduces:
# summary statistics, shape, first five rows, last five rows and column dtypes.
for heading, view in (
    ("数据描述", bikedata.describe()),
    ("数据的大小", bikedata.shape),
    ("数据的前五行", bikedata.head()),
    ("数据的后五行", bikedata.tail()),
    ("数据的类型", bikedata.dtypes),
):
    print(heading)
    print(view)
数据描述
season holiday workingday casual registered \
count 10886.000000 10886.000000 10886.000000 10886.000000 10886.000000
mean 2.506614 0.028569 0.680875 36.021955 155.552177
std 1.116174 0.166599 0.466159 49.960477 151.039033
min 1.000000 0.000000 0.000000 0.000000 0.000000
25% 2.000000 0.000000 0.000000 4.000000 36.000000
50% 3.000000 0.000000 1.000000 17.000000 118.000000
75% 4.000000 0.000000 1.000000 49.000000 222.000000
max 4.000000 1.000000 1.000000 367.000000 886.000000
count
count 10886.000000
mean 191.574132
std 181.144454
min 1.000000
25% 42.000000
50% 145.000000
75% 284.000000
max 977.000000
数据的大小
(10886, 7)
数据的前五行
datetime season holiday workingday casual registered count
0 2011/1/1 0:00 1 0 0 3 13 16
1 2011/1/1 1:00 1 0 0 8 32 40
2 2011/1/1 2:00 1 0 0 5 27 32
3 2011/1/1 3:00 1 0 0 3 10 13
4 2011/1/1 4:00 1 0 0 0 1 1
数据的后五行
datetime season holiday workingday casual registered \
10881 2012/12/19 19:00 4 0 1 7 329
10882 2012/12/19 20:00 4 0 1 10 231
10883 2012/12/19 21:00 4 0 1 4 164
10884 2012/12/19 22:00 4 0 1 12 117
10885 2012/12/19 23:00 4 0 1 4 84
count
10881 336
10882 241
10883 168
10884 129
10885 88
数据的类型
datetime object
season int64
holiday int64
workingday int64
casual int64
registered int64
count int64
dtype: object
# 2.3 Feature extraction
# The raw timestamp is text such as "2011/1/1 0:00": a date and a clock time
# separated by whitespace. Split it once and reuse the pieces.
stamp_parts = bikedata['datetime'].str.split()
clock_parts = stamp_parts.str[1].str.split(':')
# 2.3.1 Calendar date ("2011/1/1"), added to the original DataFrame
bikedata['date'] = stamp_parts.str[0]
# 2.3.2 Hour of day. Stored as an integer so that later sorting, grouping and
#       plotting order the hours numerically (0, 1, 2, ...) instead of
#       lexicographically ('0', '1', '10', '11', ...).
bikedata['hour'] = clock_parts.str[0].astype(int)
# 2.3.3 Minute, kept as the original text (e.g. "00")
bikedata['minute'] = clock_parts.str[1]
print(bikedata.head())
datetime season holiday workingday casual registered count \
0 2011/1/1 0:00 1 0 0 3 13 16
1 2011/1/1 1:00 1 0 0 8 32 40
2 2011/1/1 2:00 1 0 0 5 27 32
3 2011/1/1 3:00 1 0 0 3 10 13
4 2011/1/1 4:00 1 0 0 0 1 1
date hour minute
0 2011/1/1 0 00
1 2011/1/1 1 00
2 2011/1/1 2 00
3 2011/1/1 3 00
4 2011/1/1 4 00
# 2.3.4 Derive the weekday name and the month name from the date column.
# datetime.weekday() returns an integer 0-6 meaning Monday through Sunday, and
# .month returns the month number. calendar.day_name[i] and
# calendar.month_name[i] turn those integers into the full names.
# NOTE: the column is called 'weekend' but it holds the weekday name.
parsed_dates = bikedata['date'].apply(
    lambda text: datetime.strptime(text, '%Y/%m/%d')
)
bikedata['weekend'] = parsed_dates.apply(
    lambda moment: calendar.day_name[moment.weekday()]
)
bikedata['month'] = parsed_dates.apply(
    lambda moment: calendar.month_name[moment.month]
)
print(bikedata.head())
datetime season holiday workingday casual registered count \
0 2011/1/1 0:00 1 0 0 3 13 16
1 2011/1/1 1:00 1 0 0 8 32 40
2 2011/1/1 2:00 1 0 0 5 27 32
3 2011/1/1 3:00 1 0 0 3 10 13
4 2011/1/1 4:00 1 0 0 0 1 1
date hour minute weekend month
0 2011/1/1 0 00 Saturday January
1 2011/1/1 1 00 Saturday January
2 2011/1/1 2 00 Saturday January
3 2011/1/1 3 00 Saturday January
4 2011/1/1 4 00 Saturday January
# 2.4 Data conversion
# 2.4.1 Replace the numeric season code with its English name.
# Two approaches are possible: apply a hand-written function with an if/elif
# chain to every value, or hand Series.map a lookup table. The lookup table is
# used here; codes missing from the table would become NaN.
season_names = dict(zip(range(1, 5), ('Spring', 'Summer', 'Fall', 'Winter')))
bikedata['season'] = bikedata['season'].map(season_names)
print(bikedata.head())
datetime season holiday workingday casual registered count \
0 2011/1/1 0:00 Spring 0 0 3 13 16
1 2011/1/1 1:00 Spring 0 0 8 32 40
2 2011/1/1 2:00 Spring 0 0 5 27 32
3 2011/1/1 3:00 Spring 0 0 3 10 13
4 2011/1/1 4:00 Spring 0 0 0 1 1
date hour minute weekend month
0 2011/1/1 0 00 Saturday January
1 2011/1/1 1 00 Saturday January
2 2011/1/1 2 00 Saturday January
3 2011/1/1 3 00 Saturday January
4 2011/1/1 4 00 Saturday January
# 2.4.2 Convert the discrete columns to pandas categorical dtype
varlist = ['hour', 'weekend', 'month', 'season', 'minute', 'holiday', 'workingday']
for each in varlist:
    # The loop body must be indented; converting one column at a time keeps
    # every other column's dtype untouched.
    bikedata[each] = bikedata[each].astype('category')
# Printed once, after all conversions, to confirm the new dtypes.
print(bikedata.dtypes)
datetime object
season category
holiday category
workingday category
casual int64
registered int64
count int64
date object
hour category
minute category
weekend category
month category
dtype: object
# The raw 'datetime' column is now represented by 'date', 'hour' and 'minute',
# so remove it from the frame in place.
bikedata.drop(columns=['datetime'], inplace=True)
# 2.5 Preprocessing: data cleaning
# 2.5.1 Look for missing values by comparing the per-column 'count' row of
#       describe() with the number of rows.
# NOTE(review): the recorded output of this call lists only casual, registered
# and count, so the other columns are not covered by this check.
print(bikedata.describe())
casual registered count
count 10886.000000 10886.000000 10886.000000
mean 36.021955 155.552177 191.574132
std 49.960477 151.039033 181.144454
min 0.000000 0.000000 1.000000
25% 4.000000 36.000000 42.000000
50% 17.000000 118.000000 145.000000
75% 49.000000 222.000000 284.000000
max 367.000000 886.000000 977.000000
# The 'count' row of describe() is identical for every column shown, so there
# are no missing values there; continue with the next step.
# 2.5.2 Look for outliers with box plots
fig, axes = plt.subplots(nrows=2, ncols=2)  # figure plus a 2x2 grid of axes
# Wide enough for two columns of panels; the previous width of 1 inch left no
# room for the plots.
fig.set_size_inches(12, 12)
# Panel 1: overall distribution of count (vertical boxes).
sn.boxplot(data=bikedata, y='count', orient='v', ax=axes[0][0])
# Panel 2: count on the y axis, grouped by season on the x axis.
sn.boxplot(data=bikedata, y='count', x='season', orient='v', ax=axes[0][1])
# Panel 3: count grouped by hour of day.
sn.boxplot(data=bikedata, y='count', x='hour', orient='v', ax=axes[1][0])
# Panel 4: count grouped by the workingday flag.
sn.boxplot(data=bikedata, y='count', x='workingday', orient='v', ax=axes[1][1])
# Axis labels and titles. Each panel is addressed by its own grid position;
# previously all four calls targeted axes[0][0] or axes[0][1], leaving the
# bottom row unlabelled and overwriting the first panel's title.
axes[0][0].set(ylabel='骑行人数', title='骑行人数')
axes[0][1].set(xlabel='季节', ylabel='骑行人数', title='不同季节骑行人数')
axes[1][0].set(xlabel='时间', ylabel='骑行人数', title='一天内不同时间骑行人数')
axes[1][1].set(xlabel='工作日', ylabel='骑行人数', title='工作日骑行人数')
plt.savefig('Abnormal_value_analysis.png')
plt.show()
字体错误(不影响)
# 2.5.3 Remove outliers
# A row is kept when its count lies within three standard deviations of the
# mean, i.e. |count - mean| <= 3 * std; everything else is treated as an outlier.
count_col = bikedata['count']
deviation = (count_col - count_col.mean()).abs()
within_three_sigma = deviation <= 3 * count_col.std()
bikedata1 = bikedata[within_three_sigma]
print('去除异常值前', bikedata.shape)
print('去除异常值后', bikedata1.shape)
去除异常值前 (10886, 11)
去除异常值后 (10739, 11)
# 2.5.4 Persist the cleaned frame (outliers removed) to deal_data.csv
bikedata1.to_csv(path_or_buf='deal_data.csv')
# 3. Analysis and visualisation
# 3.1 Average ride count per month
fig, ax = plt.subplots()  # figure and its single axes
fig.set_size_inches(12, 20)  # figure size in inches
# 3.1.1 Bar order: January..December. Taken from calendar.month_name (index 0
#       is an empty string, hence the slice) because the 'month' column was
#       built from the same table; a misspelt entry here would silently drop
#       that month's bar.
sortOrder = list(calendar.month_name)[1:]
# 3.1.2 Mean count per month
# 3.1.2.1 Aggregate and turn the group key back into an ordinary column
monthAggregated = pd.DataFrame(bikedata1.groupby('month')['count'].mean()).reset_index()
print(monthAggregated)
# 3.1.2.2 Sort from the highest mean to the lowest (descending); this only
#         affects the printed table, the plot order comes from sortOrder.
monthSorted = monthAggregated.sort_values(by='count', ascending=False)
print('排序后')
print(monthSorted)
# 3.1.3 Bar chart: x and y name the columns to plot, order fixes the bar
#       sequence, ax pins the drawing to the axes created above.
sn.barplot(data=monthSorted, x='month', y='count', order=sortOrder, ax=ax)
# English labels are used here; a Chinese-label variant was disabled in the
# original, presumably because of the font errors noted next to the plots.
ax.set(xlabel='month', ylabel='people_count', title='count_month')
plt.savefig('month_count.png')
plt.show()
month count
0 April 177.013363
1 August 218.130631
2 December 174.349451
3 February 110.003330
4 January 90.366516
5 July 225.133929
6 June 231.093855
7 March 145.399108
8 May 212.294118
9 November 193.677278
10 October 205.184510
11 September 213.777273
排序后
month count
6 June 231.093855
5 July 225.133929
1 August 218.130631
11 September 213.777273
8 May 212.294118
10 October 205.184510
9 November 193.677278
0 April 177.013363
2 December 174.349451
7 March 145.399108
3 February 110.003330
4 January 90.366516
出现字体错误(不重要)
# 3.2 Ride count by hour of day for each day of the week
weekOrder = [
    'Sunday',
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
]
fig1, ax1 = plt.subplots()
fig1.set_size_inches(12, 20)
# 3.2.1 Mean count for every (hour, weekday) pair, with the group keys turned
#       back into ordinary columns.
# NOTE(review): if 'hour' holds text, the groups sort lexicographically
# (0, 1, 10, 11, ...), which is the order seen in the recorded output.
hourly_mean = bikedata1.groupby(['hour', 'weekend'], sort=True)['count'].mean()
hourAggregated = hourly_mean.to_frame().reset_index()
print(hourAggregated)
hour weekend count
0 0 Friday 53.234375
1 0 Monday 35.492308
2 0 Saturday 98.212121
3 0 Sunday 96.227273
4 0 Thursday 37.476923
5 0 Tuesday 27.328125
6 0 Wednesday 36.246154
7 1 Friday 24.453125
8 1 Monday 18.076923
9 1 Saturday 70.015152
10 1 Sunday 79.454545
11 1 Thursday 15.415385
12 1 Tuesday 11.904762
13 1 Wednesday 15.615385
14 10 Friday 156.812500
15 10 Monday 140.984615
16 10 Saturday 269.530303
17 10 Sunday 264.636364
18 10 Thursday 128.323077
19 10 Tuesday 129.187500
20 10 Wednesday 132.353846
21 11 Friday 186.828125
22 11 Monday 171.338462
23 11 Saturday 339.484848
24 11 Sunday 321.242424
25 11 Thursday 156.230769
26 11 Tuesday 145.609375
27 11 Wednesday 148.938462
28 12 Friday 236.359375
29 12 Monday 214.584615
.. ... ... ...
138 5 Tuesday 24.015625
139 5 Wednesday 25.046154
140 6 Friday 91.359375
141 6 Monday 89.246154
142 6 Saturday 21.121212
143 6 Sunday 15.136364
144 6 Thursday 108.230769
145 6 Tuesday 105.375000
146 6 Wednesday 105.815385
147 7 Friday 254.109375
148 7 Monday 260.400000
149 7 Saturday 47.242424
150 7 Sunday 34.742424
151 7 Thursday 307.692308
152 7 Tuesday 297.609375
153 7 Wednesday 297.246154
154 8 Friday 450.866667
155 8 Monday 417.555556
156 8 Saturday 117.560606
157 8 Sunday 83.954545
158 8 Thursday 473.483333
159 8 Tuesday 458.741935
160 8 Wednesday 448.431034
161 9 Friday 262.406250
162 9 Monday 226.353846
163 9 Saturday 190.606061
164 9 Sunday 158.666667
165 9 Thursday 241.815385
166 9 Tuesday 236.140625
167 9 Wednesday 238.769231
[168 rows x 3 columns]
字体异常(不重要)
# 3.2.2 Point plot of the hourly means, one line per weekday.
# x and y name the plotted columns, hue names the column that splits the data
# into separate lines, and hue_order fixes the sequence of those lines.
# hue_order is seaborn's keyword; the earlier camelCase spelling is not a
# pointplot parameter, so the intended weekday ordering never took effect.
sn.pointplot(
    data=hourAggregated,
    x='hour',
    y='count',
    hue='weekend',
    hue_order=weekOrder,
    ax=ax1,
)
# Axis labels and title
ax1.set(xlabel='time', ylabel='people_count', title='week_time_count')
print(hourAggregated)
plt.savefig('week_time_count_result.png')
plt.show()
hour weekend count
0 0 Friday 53.234375
1 0 Monday 35.492308
2 0 Saturday 98.212121
3 0 Sunday 96.227273
4 0 Thursday 37.476923
5 0 Tuesday 27.328125
6 0 Wednesday 36.246154
7 1 Friday 24.453125
8 1 Monday 18.076923
9 1 Saturday 70.015152
10 1 Sunday 79.454545
11 1 Thursday 15.415385
12 1 Tuesday 11.904762
13 1 Wednesday 15.615385
14 10 Friday 156.812500
15 10 Monday 140.984615
16 10 Saturday 269.530303
17 10 Sunday 264.636364
18 10 Thursday 128.323077
19 10 Tuesday 129.187500
20 10 Wednesday 132.353846
21 11 Friday 186.828125
22 11 Monday 171.338462
23 11 Saturday 339.484848
24 11 Sunday 321.242424
25 11 Thursday 156.230769
26 11 Tuesday 145.609375
27 11 Wednesday 148.938462
28 12 Friday 236.359375
29 12 Monday 214.584615
.. ... ... ...
138 5 Tuesday 24.015625
139 5 Wednesday 25.046154
140 6 Friday 91.359375
141 6 Monday 89.246154
142 6 Saturday 21.121212
143 6 Sunday 15.136364
144 6 Thursday 108.230769
145 6 Tuesday 105.375000
146 6 Wednesday 105.815385
147 7 Friday 254.109375
148 7 Monday 260.400000
149 7 Saturday 47.242424
150 7 Sunday 34.742424
151 7 Thursday 307.692308
152 7 Tuesday 297.609375
153 7 Wednesday 297.246154
154 8 Friday 450.866667
155 8 Monday 417.555556
156 8 Saturday 117.560606
157 8 Sunday 83.954545
158 8 Thursday 473.483333
159 8 Tuesday 458.741935
160 8 Wednesday 448.431034
161 9 Friday 262.406250
162 9 Monday 226.353846
163 9 Saturday 190.606061
164 9 Sunday 158.666667
165 9 Thursday 241.815385
166 9 Tuesday 236.140625
167 9 Wednesday 238.769231
[168 rows x 3 columns]
字体异常(不重要)
总结回顾
- 所需的库
本次案例,所需的库有如下:
首先,是python进行数据分析最常用的三个库:
1. numpy进行数据矩阵化读取与对这些数据进行简单处理。
2. pandas进行数据表格化读取与对这些数据进行清洗转换等。
3. matplotlib进行数据的可视化处理
其次,是其他所需的库或模块:
1. pylab是matplotlib的一个模块,本次案例主要是对数据可视化时进行字体转换
2. datetime模块,主要是对时间类型数据进行处理与转换
3. seaborn 是matplotlib的高级封装包,能够更加简洁高效的绘制可视化图形,并且
在绘制过程中进行额外的操作,比如更改颜色,背景色等。
4. calendar 是一个日历包,本案例中,主要是能够根据一个date类型数据获取其所
在的月份和是星期几
- 所需的流程
- 数据观察
数据观察使用的是describe()方法,在整体上感知数据,可以直观地观察数据是否有缺失值,
以及最大值、最小值、1/4分位值、中位值等统计量。
- 数据预处理
数据预处理分为三步,数据清洗,数据提取,数据转换:
1. 数据清洗,通常是对缺失值进行填充,数值型用平均值填充,字符型用
出现次数最多的填充(本案例经检查没有缺失值,因此未实际填充);对异常值进行删除,
使用的是3倍标准差准则,即:|数值-平均值| > 3*标准差则为异常值。
2. 数据提取,就是用已存在的列的数据获取所需的列的数据。在本案例中使用的
是datetime的数据获取所需的weekend(本案例拼写错误,尴尬)星期的数据和月
份的数据。
3. 数据转换,就是将字符型数据转成数值型,数值型转成字符型。常用的方法是使
用series的map方法进行匹配映射。
- 数据可视化
本案例中,数据可视化处理是使用的seaborn包进行可视化处理,图形是barplot柱形图与
pointplot点图(折线连接的均值点)。分别可视化的是不同月份的平均骑行人数,以及一周内每天
各个小时的平均骑行人数趋势与分布。
链接:https://pan.baidu.com/s/1srOwBb56Qo9cGqrdxYpsjA 提取码:n36g