科比投篮问题研究 Python 科比投篮数据分析

转载

云端创新梦想家 2024-08-22 14:13:52

文章标签 科比投篮问题研究 Python 子节点决策树 Back 文章分类 Python 后端开发

一、数据分析

科比投篮问题研究 Python 科比投篮数据分析_Back

可以看到有些数据是字符串形式，有些是数值形式，有数据缺失

action_type（投篮方式）
combined_shot_type（组合投篮方式）
game_event_id（比赛事件ID）
game_id（比赛ID）
lat（投篮纬度）
loc_x （投篮x坐标）
loc_y（投篮y坐标）
lon（投篮经度）
minutes_remaining（离本节比赛结束还剩多少分钟）
period（第几节）
playoffs（是不是季后赛）
season（赛季）
seconds_remaining（离本节比赛结束的剩余时间中，不足一分钟的秒数）
shot_distance（投篮时距篮筐距离）
shot_made_flag （是否进球（目标tag））
shot_type（2分球还是3分球区域）
shot_zone_area（投篮区域的表示方法一）
shot_zone_basic（投篮区域的表示方法二）
shot_zone_range（投篮区域的表示方法三）
team_id（队伍ID）
team_name（队伍名字）
game_date（比赛时间）
matchup（比赛双方队伍）
opponent（对手队伍名字）
shot_id（投篮ID）

二、数据预处理

（一）读入并查看数据

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier #随机森林
from sklearn.model_selection import KFold #交叉验证
# Load the shot log from the CSV file into the DataFrame `raw` and preview it.
filename="data.csv"
raw=pd.read_csv(filename)
print(raw.shape) #Print the (rows, columns) shape of the loaded table
raw.head()

输出结果：

科比投篮问题研究 Python 科比投篮数据分析_子节点_02

（二）去掉缺失值

# shot_made_flag is the label column and has missing values, so only the
# rows that actually carry a label are kept in `kobe`.
# Series.notna() is True where a value is present and False where it is
# missing (the inverse of isna()).
has_label = raw['shot_made_flag'].notna()
kobe = raw[has_label]
print(kobe.shape)

科比投篮问题研究 Python 科比投篮数据分析_Back_03

（三）投篮位置特征可视化

# plt.subplot(211): the first digit is the number of rows, the second the
# number of columns, the third the slot to draw into.
alpha = 0.02  # very low opacity, so overlapping points build up darker areas
plt.figure(figsize=(10,10))  # figsize is the figure (width, height) in inches

# loc_x and loc_y
plt.subplot(121)  # 1 row, 2 columns, first slot
# One-letter colour codes must be lower case ('r'); the upper-case 'R' used
# previously is not a valid Matplotlib colour.
plt.scatter(kobe.loc_x, kobe.loc_y, color='r', alpha=alpha)
plt.title('loc_x and loc_y')

# lat and lon
plt.subplot(122)  # 1 row, 2 columns, second slot
plt.scatter(kobe.lon, kobe.lat, color='b', alpha=alpha)
plt.title('lat and lon')

科比投篮问题研究 Python 科比投篮数据分析_Back_04

（四）特征处理

# Polar coordinates (distance and angle) built from loc_x / loc_y serve as
# a compact description of the shot location.

# Distance from the origin of the (loc_x, loc_y) coordinates.
raw['dist'] = np.sqrt(raw['loc_x']**2 + raw['loc_y']**2)

# Angle = arctan(loc_y / loc_x). Rows with loc_x == 0 would divide by zero,
# so they receive pi/2 instead.
loc_x_zero = raw['loc_x'] == 0
# The column is created as float (filled with pi/2) and the remaining rows
# are written through .loc. Chained indexing such as
# raw['angle'][mask] = ... may assign into a temporary copy and leave `raw`
# unchanged, and an int column cannot hold the arctan results.
raw['angle'] = np.pi / 2
raw.loc[~loc_x_zero, 'angle'] = np.arctan(
    raw.loc[~loc_x_zero, 'loc_y'] / raw.loc[~loc_x_zero, 'loc_x']
)

# Merge the two clock columns into one feature:
# remaining seconds = remaining minutes * 60 + remaining seconds.
raw['remaining_time'] = raw['minutes_remaining']*60 + raw['seconds_remaining']

# Distinct values of the categorical shot descriptors.
print(kobe.action_type.unique())
print(kobe.combined_shot_type.unique())
print(kobe.shot_type.unique())

# How often each distinct shot_type value occurs.
print(kobe.shot_type.value_counts())

科比投篮问题研究 Python 科比投篮数据分析_决策树_05

# Show the distinct values of the season column.
kobe['season'].unique()

科比投篮问题研究 Python 科比投篮数据分析_子节点_06

# Each season label is split on '-'; only the part after the dash is kept
# and stored back into raw['season'] as an integer.
raw['season'] = [int(label.split('-')[1]) for label in raw['season']]
raw['season'].unique()

科比投篮问题研究 Python 科比投篮数据分析_科比投篮问题研究 Python_07

# Print the distinct values of both team columns.
for column in ('team_id', 'team_name'):
    print(kobe[column].unique())

科比投篮问题研究 Python 科比投篮数据分析_科比投篮问题研究 Python_08

# Put matchup and opponent side by side for comparison
# (the column label was previously misspelled as 'opponet').
pd.DataFrame({'matchup':kobe.matchup,'opponent':kobe.opponent})

科比投篮问题研究 Python 科比投篮数据分析_子节点_09

（五）特征相关性

# Plot the engineered `dist` feature against the dataset's shot_distance.
plt.figure(figsize=(5, 5))

plt.scatter(raw['dist'], raw['shot_distance'], color='blue')
plt.title('dist and shot_distance')
# dist was added by hand; it is linearly related to the existing
# shot_distance column, so the two are redundant and one can be dropped.

科比投篮问题研究 Python 科比投篮数据分析_子节点_10

# Group the labelled shots by shot_zone_area, print how many shots fall in
# each area, then print the number of groups.
gs = kobe.groupby('shot_zone_area')
print(kobe.shot_zone_area.value_counts())
print(gs.ngroups)

科比投篮问题研究 Python 科比投篮数据分析_决策树_11

import matplotlib.cm as cm
# Wide figure that holds the three zone scatter plots drawn in subplots 131-133 below.
plt.figure(figsize=(20,10))

def scatter_plot_by_category(feat):
    """Scatter the shot locations in ``kobe``, one colour per value of ``feat``.

    Reads the module-level ``kobe`` DataFrame, groups it by the column named
    ``feat`` and draws each group's (loc_x, loc_y) points on the current
    matplotlib axes with an opacity of 0.1.
    """
    opacity = 0.1
    grouped = kobe.groupby(feat)
    # cm.rainbow acts as a palette: evenly spaced positions in [0, 1] are
    # mapped to one RGBA row per group.
    palette = cm.rainbow(np.linspace(0, 1, len(grouped)))
    # Iterating a GroupBy yields (key, sub-frame) pairs; only the sub-frame
    # is needed for plotting.
    for (_, shots), rgba in zip(grouped, palette):
        plt.scatter(shots.loc_x, shots.loc_y, color=rgba, alpha=opacity)
    
# Draw the three shot-zone encodings next to each other (1 row, 3 columns),
# each coloured by its own categories and titled with the column name.
zone_features = ('shot_zone_area', 'shot_zone_basic', 'shot_zone_range')
for slot, zone_feature in enumerate(zone_features, start=1):
    plt.subplot(1, 3, slot)
    scatter_plot_by_category(zone_feature)
    plt.title(zone_feature)

科比投篮问题研究 Python 科比投篮数据分析_Back_12

# Columns that are not used as model features.
drops = [
    'shot_id', 'team_id', 'team_name', 'shot_zone_area', 'shot_zone_range',
    'shot_zone_basic', 'matchup', 'lon', 'lat', 'seconds_remaining',
    'minutes_remaining', 'shot_distance', 'loc_x', 'loc_y', 'game_event_id',
    'game_id', 'game_date',
]
# Remove them in one call. The columns= keyword replaces the positional
# axis argument (drop(name, 1)), which pandas 2.0 no longer accepts.
raw = raw.drop(columns=drops)
raw.head()

科比投篮问题研究 Python 科比投篮数据分析_科比投篮问题研究 Python_13

（六）将特征进行one-hot编码

#One-hot encode a categorical feature
print(raw['combined_shot_type'].value_counts())
pd.get_dummies(raw['combined_shot_type'],prefix='combined_shot_type')[0:3]
#get_dummies turns each distinct string value into its own indicator column; prefix is prepended to the new column names; [0:3] keeps the first 3 rows

科比投篮问题研究 Python 科比投篮数据分析_Back_14

（七）拼接one-hot编码的新特征

# Categorical columns to replace with one-hot indicator columns.
categorical_vars = ['action_type', 'combined_shot_type', 'shot_type',
                    'opponent', 'period', 'season']
for var in categorical_vars:
    dummies = pd.get_dummies(raw[var], prefix=var)
    # axis=1 appends the indicator columns side by side (axis=0 would stack
    # rows). It is passed by keyword because pandas 2.0 no longer accepts a
    # positional axis for concat or drop.
    raw = pd.concat([raw, dummies], axis=1)
    # The original column is now redundant.
    raw = raw.drop(columns=var)
raw.head()

科比投篮问题研究 Python 科比投篮数据分析_子节点_16

三、使用随机森林算法分类

（一）划分训练集和测试集

# Rows with a known shot_made_flag form the training set; rows where it is
# missing are the ones to predict.
has_label = raw['shot_made_flag'].notna()
train_kobe = raw[has_label]
train_label = train_kobe['shot_made_flag']
# columns= replaces the positional axis argument (drop(name, 1)), which
# pandas 2.0 no longer accepts.
train_kobe = train_kobe.drop(columns='shot_made_flag')
test_kobe = raw[~has_label].drop(columns='shot_made_flag')

（二）找到最好的n_estimators

1.np.linspace() 生成(start,stop)区间指定元素个数num的list，均匀分布
np.linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None)
endpoint : bool, optional #是否包括右边界点
retstep : bool, optional #返回步长

np.linspace(2.0, 3.0, num=5, retstep=True)
(array([ 2., 2.25, 2.5, 2.75, 3.]),0.25)

np.linspace(2.0, 3.0, num=5)
array([ 2., 2.25, 2.5, 2.75, 3.])

np.linspace(2.0, 3.0, num=5, endpoint=False)
    array([ 2.,  2.2,  2.4,  2.6,  2.8])

2.np.logspace() log分布间距生成list
np.logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None)
start : float #基底base的start次幂作为左边界
stop : float #基底base的stop次幂作为右边界
np.logspace(2.0, 3.0, num=4)
    array([100., 215.443469, 464.15888336, 1000.])

np.logspace(2.0, 3.0, num=4, endpoint=False)
array([ 100.,177.827941, 316.22776602, 562.34132519])

np.logspace(2.0, 3.0, num=4, base=2.0)
array([ 4., 5.0396842, 6.34960421, 8.])

3.np.arange() 在半开区间[start,stop)内按指定步长step生成数组（不包含stop）
arange([start,] stop[, step,], dtype=None)
np.arange(3)
array([0, 1, 2])

np.arange(3.0)
array([ 0., 1., 2.])

np.arange(3,7)
array([3, 4, 5, 6])

np.arange(3,7,2)
array([3, 5])

在scikit-learn中，RF的分类类是RandomForestClassifier，回归类是RandomForestRegressor。
RF框架参数
1) n_estimators: 即森林中决策树（弱学习器）的个数。
2) oob_score :即是否采用袋外样本来评估模型的好坏。默认是False。
3) criterion: 即CART树做划分时对特征的评价标准。
RF决策树参数
1) RF划分时考虑的最大特征数max_features
2) 决策树最大深度max_depth: 默认可以不输入，如果不输入的话，决策树在建立子树的时候不会限制子树的深度。
3) 内部节点再划分所需最小样本数min_samples_split: 这个值限制了子树继续划分的条件，如果某节点的样本数少于min_samples_split，则不会继续再尝试选择最优特征来进行划分。
4) 叶子节点最少样本数min_samples_leaf: 这个值限制了叶子节点最少的样本数，如果某叶子节点的样本数小于这个值，则会和兄弟节点一起被剪枝。
5）叶子节点最小的样本权重和min_weight_fraction_leaf：这个值限制了叶子节点所有样本权重和的最小值，如果小于这个值，则会和兄弟节点一起被剪枝。
6) 最大叶子节点数max_leaf_nodes: 通过限制最大叶子节点数，可以防止过拟合，默认是"None"，即不限制最大的叶子节点数。
7) 节点划分最小不纯度min_impurity_split: 这个值限制了决策树的增长，如果某节点的不纯度(基于基尼系数，均方差)小于这个阈值，则该节点不再生成子节点。注意：该参数在较新版本的scikit-learn中已被移除，应改用min_impurity_decrease。

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix,log_loss
import time
import numpy as np

# Search for the number of trees that gives the lowest cross-validated
# log loss.
print('Finding best n_estimators for RandomForestClassifier...')
n_folds=10          # number of cross-validation folds; also used to average
min_score=100000    # lowest mean log loss seen so far (lower is better)
best_n=0
scores_n=[]
range_n=np.logspace(0,2,num=3).astype(int)  # 10**0, 10**1, 10**2 trees
for n in range_n:
    print('the number of trees:{0}'.format(n))
    t1=time.time()
    rfc_score=0.
    rfc=RandomForestClassifier(n_estimators=n)
    kf=KFold(n_splits=n_folds,shuffle=True)
    for train_k,test_k in kf.split(train_kobe):
        # train_k and test_k are positional index arrays, hence .iloc
        rfc.fit(train_kobe.iloc[train_k],train_label.iloc[train_k])
        # log_loss expects predicted probabilities. Feeding it the hard 0/1
        # labels from predict() only measures a scaled error rate.
        proba=rfc.predict_proba(train_kobe.iloc[test_k])
        rfc_score+=log_loss(train_label.iloc[test_k],proba,labels=rfc.classes_)/n_folds
    scores_n.append(rfc_score)
    if rfc_score<min_score:
        min_score=rfc_score
        best_n=n
    t2=time.time()
    print('Done processing {0} trees ({1:.3f}sec)'.format(n,t2-t1))
print(best_n,min_score)

科比投篮问题研究 Python 科比投篮数据分析_Back_17

（三）找到最好的max_depth

# Search for the max_depth that gives the lowest cross-validated log loss,
# using the number of trees selected earlier (best_n).
print('Finding best max_depth for RandomForestClassifier...')
n_folds=10          # number of cross-validation folds; also used to average
min_score=100000    # lowest mean log loss seen so far (lower is better)
best_m=0
scores_m=[]
range_m=np.logspace(0,2,num=3).astype(int)  # depths 10**0, 10**1, 10**2
for m in range_m:
    print('the max depth : {0}'.format(m))
    t1=time.time()

    rfc_score=0.
    rfc=RandomForestClassifier(max_depth=m,n_estimators=best_n)
    kf=KFold(n_splits=n_folds,shuffle=True)
    for train_k,test_k in kf.split(train_kobe):
        # train_k and test_k are positional index arrays, hence .iloc
        rfc.fit(train_kobe.iloc[train_k],train_label.iloc[train_k])
        # log_loss expects predicted probabilities. Feeding it the hard 0/1
        # labels from predict() only measures a scaled error rate.
        proba=rfc.predict_proba(train_kobe.iloc[test_k])
        rfc_score+=log_loss(train_label.iloc[test_k],proba,labels=rfc.classes_)/n_folds
    scores_m.append(rfc_score)
    if rfc_score<min_score:
        min_score=rfc_score
        best_m=m
    t2=time.time()
    # m is a depth, not a tree count, so the message says so.
    print('Done processing max depth {0} ({1:.3f}sec)'.format(m,t2-t1))
print(best_m,min_score)

科比投篮问题研究 Python 科比投篮数据分析_子节点_18

（四）可视化参数效果

# Show the cross-validated score of both parameter searches side by side
# (1 row, 2 columns).
plt.figure(figsize=(10, 5))
panels = (
    (range_n, scores_n, 'number of trees'),
    (range_m, scores_m, 'max depth'),
)
for slot, (values, scores, x_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, slot)
    plt.plot(values, scores)
    plt.ylabel('score')
    plt.xlabel(x_label)

科比投篮问题研究 Python 科比投篮数据分析_决策树_19

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。

上一篇：简述dem数据源及其tedian dem数据来源

下一篇：Unity 编辑器黄色箭头 unity的editor在哪

提问和评论都可以，用心的回复会被更多人看到评论

发布评论

相关文章

官方博客	全部文章	热门标签	班级博客
了解我们	网站地图	意见反馈

鸿蒙开发者社区	51CTO学堂
51CTO	软考资讯