# Chapter 5: machine-learning packages
# 5.2.1 Handling missing values
import numpy as np
import pandas as pd
import os

# Switch the working directory to the desktop so data files can be opened
# by bare name.  A raw string keeps the Windows backslashes readable
# (same path value as the doubly-escaped form).
os.chdir(r"C:\Users\Administrator\Desktop")

# Load the demo data set that contains missing entries.
data = pd.read_excel("missing.xlsx")
print(data)
a    b    c    d
0  2.0   kj  4.0  7.0
1  2.0   kl  6.0  9.0
2  NaN   kl  5.0  NaN
3  5.0  NaN  NaN  9.0
4  6.0   kk  6.0  8.0
# A numeric array containing missing values (np.nan).
c = np.array([[1, 2, 3, 4],
              [4, 5, 6, np.nan],
              [5, 6, 7, 8],
              [9, 4, np.nan, 8]])
C = pd.DataFrame(c)  # wrap the array in a DataFrame
# The structure to be filled must be an array or DataFrame of numeric dtype.

# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22, so the
# old import raises ImportError; SimpleImputer below is its replacement.
# from sklearn.preprocessing import Imputer   # removed API -- do not use
from sklearn.impute import SimpleImputer

# Mean imputation: replace each NaN with the mean of its column.
# missing_values/strategy must be passed as keyword arguments: positional
# use was deprecated in 0.25 and raises an error from scikit-learn 1.0.
imp = SimpleImputer(missing_values=np.nan, strategy="mean")
fC = imp.fit_transform(C)  # returns the filled data as a NumPy array
print(fC)
[[1.         2.         3.         4.        ]
 [4.         5.         6.         6.66666667]
 [5.         6.         7.         8.        ]
 [9.         4.         5.33333333 8.        ]]


C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:70: FutureWarning: Pass missing_values=nan, strategy=mean as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error
  warnings.warn(f"Pass {args_msg} as keyword args. From version "
# Median imputation: replace each NaN with the median of its column.
# Keyword arguments avoid the scikit-learn >= 1.0 positional-argument error.
imp = SimpleImputer(missing_values=np.nan, strategy="median")
fc = imp.fit_transform(c)  # fill the raw ndarray directly
print(fc)
[[1. 2. 3. 4.]
 [4. 5. 6. 8.]
 [5. 6. 7. 8.]
 [9. 4. 6. 8.]]


C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:70: FutureWarning: Pass missing_values=nan, strategy=median as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error
  warnings.warn(f"Pass {args_msg} as keyword args. From version "
# Most-frequent (mode) imputation, applied to the numeric columns "a" and "c".
# Keyword arguments avoid the scikit-learn >= 1.0 positional-argument error.
fD = data[["a", "c"]]
imp = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
fD = imp.fit_transform(fD)
print(fD)
[[2. 4.]
 [2. 6.]
 [2. 5.]
 [5. 6.]
 [6. 6.]]


C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:70: FutureWarning: Pass missing_values=nan, strategy=most_frequent as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error
  warnings.warn(f"Pass {args_msg} as keyword args. From version "
# 5.2.2 Data normalization
data1 = np.load("data.npy")  # read the demo array from a NumPy binary file
print(data1)
[[1.00000000e+00 1.70000000e+01 6.61764706e+01 3.20000000e+01
  1.61496618e+03 1.31562500e+01]
 [2.00000000e+00 8.00000000e+00 6.86875000e+01 3.60000000e+01
  1.43564581e+02 3.80555556e+00]
 [3.00000000e+00 1.60000000e+01 6.58437500e+01 4.30000000e+01
  1.34413138e+03 1.26976744e+01]
 ...
 [8.33000000e+02 1.00000000e+01 6.79500000e+01 2.40000000e+01
  1.15874171e+02 2.79166667e+00]
 [8.34000000e+02 2.10000000e+01 6.65000000e+01 4.10000000e+01
  5.38712893e+02 2.03170732e+01]
 [8.35000000e+02 1.10000000e+01 7.82727273e+01 9.00000000e+00
  6.29832333e+01 9.44444444e+00]]
data1 = data1[:,1:]  # drop column 0 (presumably a record-ID column -- see the printed output), keep the rest
print(data1)
[[  17.           66.17647059   32.         1614.96618125   13.15625   ]
 [   8.           68.6875       36.          143.56458056    3.80555556]
 [  16.           65.84375      43.         1344.13137674   12.69767442]
 ...
 [  10.           67.95         24.          115.87417083    2.79166667]
 [  21.           66.5          41.          538.71289268   20.31707317]
 [  11.           78.27272727    9.           62.98323333    9.44444444]]
# Mean-impute any NaNs in data1 before scaling, as in 5.2.1.
# Keyword arguments avoid the scikit-learn >= 1.0 positional-argument error.
imp1 = SimpleImputer(missing_values=np.nan, strategy="mean")
data1 = imp1.fit_transform(data1)
print(data1)
[[  17.           66.17647059   32.         1614.96618125   13.15625   ]
 [   8.           68.6875       36.          143.56458056    3.80555556]
 [  16.           65.84375      43.         1344.13137674   12.69767442]
 ...
 [  10.           67.95         24.          115.87417083    2.79166667]
 [  21.           66.5          41.          538.71289268   20.31707317]
 [  11.           78.27272727    9.           62.98323333    9.44444444]]


C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:70: FutureWarning: Pass missing_values=nan, strategy=mean as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error
  warnings.warn(f"Pass {args_msg} as keyword args. From version "
# Standardization: subtract the column mean and divide by the column
# standard deviation, so each feature ends up with mean 0 and std 1.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x = scaler.fit_transform(data1)  # fit and transform in a single call
print(x)
[[ 0.20025842 -0.82760637  0.05554634  2.84353829  0.76954149]
 [-0.68918721 -0.09224269  0.20662516 -0.18554119 -0.65156099]
 [ 0.10143112 -0.92504475  0.47101308  2.28598816  0.69984796]
 ...
 [-0.49153262 -0.30822213 -0.24661129 -0.24254565 -0.80565008]
 [ 0.59556758 -0.73285966  0.39547367  0.62792513  1.85783108]
 [-0.39270533  2.71482439 -0.81315684 -0.35142881  0.20542766]]
# Min-max normalization: rescale every feature linearly into [0, 1].
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()
x1 = mms.fit_transform(data1)  # fit and transform in a single step
print(x1)
[[0.38095238 0.04406273 0.2519685  0.33941778 0.1381392 ]
 [0.16666667 0.17158327 0.28346457 0.03015525 0.03188131]
 [0.35714286 0.0271658  0.33858268 0.28249311 0.13292812]
 ...
 [0.21428571 0.13412995 0.18897638 0.02433521 0.02035985]
 [0.47619048 0.06049291 0.32283465 0.11320842 0.2195122 ]
 [0.23809524 0.65836106 0.07086614 0.01321848 0.0959596 ]]
# 5.2.3 Principal component analysis
# Per-capita disposable-income sources of rural residents, 2016.
Data = pd.read_excel('农村居民人均可支配收入来源2016.xlsx')
X = Data.iloc[:,1:]  # drop the region-name column, keep the numeric income variables
R = X.corr()    # Pearson correlation matrix of the income sources
print(R)
工资性收入     经营净收入     财产净收入     转移净收入
工资性收入  1.000000 -0.388997  0.826683  0.401917
经营净收入 -0.388997  1.000000 -0.205737 -0.314542
财产净收入  0.826683 -0.205737  1.000000  0.297458
转移净收入  0.401917 -0.314542  0.297458  1.000000
# Standardize X so every variable contributes on a comparable scale.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Principal component analysis: keep just enough components to explain
# 95% of the total variance.
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
pca.fit(X)
Y = pca.transform(X)                   # scores of the retained components
tzxl = pca.components_                 # eigenvectors (loadings), one row per component
tz = pca.explained_variance_           # eigenvalues
gxl = pca.explained_variance_ratio_    # variance-contribution ratio per component

# Reproduce the first component's score for the first four rows by hand:
# a score is the elementwise product of a standardized row with the first
# loading vector, summed up.
Y00 = sum(X[0, :] * tzxl[0, :])
Y01 = sum(X[1, :] * tzxl[0, :])
Y02 = sum(X[2, :] * tzxl[0, :])
Y03 = sum(X[3, :] * tzxl[0, :])

print(Y)    # component scores
print(gxl)  # variance-contribution ratios
[[ 4.33588394  1.2871025   1.68836853]
 [ 1.53736203  2.12655221  0.56027639]
 [-0.46694468  0.18931331  1.38413422]
 [-0.10431271 -1.123358    1.17241467]
 [-0.55472192  0.67855307 -1.03614749]
 [-0.7860694   0.60996214 -0.01608431]
 [-1.74256785  1.33152775 -1.02791134]
 [-0.43551643  1.06930104 -0.9947622 ]
 [ 5.26192283 -1.30032837 -0.65586678]
 [ 1.26240024  0.53263985 -1.0411674 ]
 [ 1.66736332  1.59956567 -0.2701487 ]
 [-0.25754205 -0.62850484 -0.50489853]
 [-0.44568285  0.65255314 -0.33371388]
 [-0.39402814 -0.24401899 -0.0721038 ]
 [-0.79480747  1.26287224 -0.06924108]
 [-0.31006603 -0.65108872 -0.54600265]
 [-0.37598829 -0.48118327 -1.40014355]
 [-0.08226864 -0.94042225 -0.36095876]
 [ 0.97778119 -0.73376053 -0.52563007]
 [-0.61802252 -0.67589739 -0.56974344]
 [-1.17447251  0.42174493  0.46729999]
 [ 0.37012714 -1.03216645 -0.91151552]
 [-0.1364243  -0.56892611 -0.50361372]
 [-0.87126924 -0.98624593  1.31181932]
 [-1.62901512  0.41953719  0.98034596]
 [-1.58629868  0.32183819  0.58906561]
 [-0.18741213 -1.21196511  0.65763622]
 [-0.78267223 -1.0472276   0.9187884 ]
 [ 0.17356232 -1.22932478  0.06572846]
 [-0.53868768 -0.12444272  0.97917061]
 [-1.31161213  0.47579784  0.06460487]]
[0.5676807  0.22505502 0.1701918 ]
# Composite ranking: overall score = sum of each principal-component
# score weighted by its variance-contribution ratio.
F = gxl[0] * Y[:, 0] + gxl[1] * Y[:, 1] + gxl[2] * Y[:, 2]
dq = list(Data['地区'].values)  # region names, used as the index
# Build the region-indexed Series and sort it by score, highest first.
Rs = pd.Series(F, index=dq).sort_values(ascending=False)
print(Rs)
北京     3.038413
上海     2.582823
天津     1.446676
浙江     1.260543
江苏     0.659315
广东     0.300473
河北     0.013099
山西    -0.112498
福建    -0.162941
青海    -0.166951
宁夏    -0.167162
黑龙江   -0.175883
重庆    -0.177313
山东    -0.178765
陕西    -0.267225
江西    -0.290871
四川    -0.291196
辽宁    -0.311699
湖南    -0.319781
内蒙古   -0.338537
安徽    -0.373579
河南    -0.415474
海南    -0.492279
贵州    -0.493301
甘肃    -0.523621
湖北    -0.560027
广西    -0.599919
新疆    -0.626501
云南    -0.663495
西藏    -0.727826
吉林    -0.864497
dtype: float64