import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import torch
import torch.nn as nn
from torch.optim import SGD,Adam
from torchviz import make_dot
import torch.utils.data as Data
import hiddenlayer as hl
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.preprocessing import StandardScaler,MinMaxScaler
下载数据集到本地,spambase.data
https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/
# Read the raw data with pandas. header=None states that the file has no
# header row, so read_csv assigns integer column labels automatically
# (unless explicit column names are supplied).
# The raw string keeps the Windows backslashes literal: "\d" and "\s" are not
# valid escape sequences and make current Python versions emit a warning.
spam = pd.read_csv(r"D:\design_cov\spambase.data", header=None)
# Preview the first rows.
spam.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | |
0 | 0.00 | 0.64 | 0.64 | 0.0 | 0.32 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.000 | 0.0 | 0.778 | 0.000 | 0.000 | 3.756 | 61 | 278 | 1 |
1 | 0.21 | 0.28 | 0.50 | 0.0 | 0.14 | 0.28 | 0.21 | 0.07 | 0.00 | 0.94 | ... | 0.00 | 0.132 | 0.0 | 0.372 | 0.180 | 0.048 | 5.114 | 101 | 1028 | 1 |
2 | 0.06 | 0.00 | 0.71 | 0.0 | 1.23 | 0.19 | 0.19 | 0.12 | 0.64 | 0.25 | ... | 0.01 | 0.143 | 0.0 | 0.276 | 0.184 | 0.010 | 9.821 | 485 | 2259 | 1 |
3 | 0.00 | 0.00 | 0.00 | 0.0 | 0.63 | 0.00 | 0.31 | 0.63 | 0.31 | 0.63 | ... | 0.00 | 0.137 | 0.0 | 0.137 | 0.000 | 0.000 | 3.537 | 40 | 191 | 1 |
4 | 0.00 | 0.00 | 0.00 | 0.0 | 0.63 | 0.00 | 0.31 | 0.63 | 0.31 | 0.63 | ... | 0.00 | 0.135 | 0.0 | 0.135 | 0.000 | 0.000 | 3.537 | 40 | 191 | 1 |
5 rows × 58 columns
# iloc[rows, cols]: the first slice selects rows, the second selects columns.
# Columns 0-56 (57 columns in total) are the features; column 57 is the target.
feature_frame = spam.iloc[:, :57]
target_series = spam.iloc[:, 57]
x = feature_frame.values
y = target_series.values
print(x)
first_row = x[0]
print(len(x), len(first_row))
print(y)
print(len(y))
[[0.000e+00 6.400e-01 6.400e-01 ... 3.756e+00 6.100e+01 2.780e+02]
[2.100e-01 2.800e-01 5.000e-01 ... 5.114e+00 1.010e+02 1.028e+03]
[6.000e-02 0.000e+00 7.100e-01 ... 9.821e+00 4.850e+02 2.259e+03]
...
[3.000e-01 0.000e+00 3.000e-01 ... 1.404e+00 6.000e+00 1.180e+02]
[9.600e-01 0.000e+00 0.000e+00 ... 1.147e+00 5.000e+00 7.800e+01]
[0.000e+00 0.000e+00 6.500e-01 ... 1.250e+00 5.000e+00 4.000e+01]]
4601 57
[1 1 1 ... 0 0 0]
4601
# train_test_split() randomly partitions the samples into a training set and
# a test set (slicing by hand is possible too, but a random split is more
# objective).
#   arrays       - the feature matrix and the labels, split together
#   test_size    - a float is the fraction of samples used for testing,
#                  an int is the absolute number of test samples
#   random_state - seed for the shuffle; any fixed integer (0 included) makes
#                  the split reproducible, only None gives a new split per run
split_parts = train_test_split(x, y, test_size=0.25, random_state=123)
train_x, test_x, train_y, test_y = split_parts
n_train_rows = len(train_x)
n_train_cols = len(train_x[0])
print(n_train_rows)  # rows
print(n_train_cols)  # columns
3450
57
机器学习算法实践中,我们往往有着将不同规格的数据转换到同一规格,或将不同分布的数据转换到某个特定分布的需求,这种需求统称为将数据“无量纲化”。
在距离类模型,譬如K近邻,KMeans聚类中,无量纲化可以帮我们提升模型精度,避免某一个取值范围特别大的特征对距离计算造成影响。
一个特例是决策树和树的集成算法们,对决策树我们不需要无量纲化,决策树可以把任意数据都处理得很好。
数据的无量纲化可以是线性的,也可以是非线性的。线性的无量纲化包括中心化(Zero-centered或者Mean-subtraction)处理和缩放处理(Scale)。中心化的本质是让所有记录减去一个固定值,即让样本数据平移到某个位置。缩放的本质是通过除以一个固定值,将数据固定在某个范围之中,取对数也算是一种缩放处理。
preprocessing.MinMaxScaler
当数据(x)按照最小值中心化后,再按极差(最大值 - 最小值)缩放,数据移动了最小值个单位,并且会被收敛到[0,1]之间,而这个过程,就叫做数据归一化(Normalization,又称Min-Max Scaling)。
scales(缩放器)的几种预处理方法
fit(): Method calculates the parameters μ and σ and saves them as internal objects.
解释:简单来说,就是求得训练集X的均值,方差,最大值,最小值,这些训练集X固有的属性。
transform(): Method using these calculated parameters apply the transformation to a particular dataset.
解释:在fit的基础上,进行标准化,降维,归一化等操作(看具体用的是哪个工具,如PCA,StandardScaler等)。
fit_transform(): joins the fit() and transform() method for transformation of dataset.
解释:fit_transform是fit和transform的组合,既包括了训练又包含了转换。
transform()和fit_transform()二者的功能都是对数据进行某种统一处理(比如标准化~N(0,1),将数据缩放(映射)到某个固定区间,归一化,正则化等)
# Min-max normalisation into [0, 1]: the scaler is fitted on the training
# split only, and the very same mapping is then applied to the test split.
scales = MinMaxScaler(feature_range=(0, 1))
train_x = scales.fit_transform(train_x)
test_x = scales.transform(test_x)
# Notebook cell: display the scaled training features.
train_x
array([[0. , 0. , 0. , ..., 0.00054471, 0.00030036,
0.00044192],
[0. , 0. , 0.0745098 , ..., 0.00177304, 0.00330396,
0.00795455],
[0. , 0. , 0.04901961, ..., 0.00312846, 0.00280336,
0.01313131],
...,
[0.18527919, 0. , 0.07058824, ..., 0.00245574, 0.00530637,
0.03011364],
[0. , 0. , 0. , ..., 0.00131094, 0.00750901,
0.01243687],
[0. , 0. , 0. , ..., 0. , 0. ,
0.00025253]])
# Notebook cell: display the scaled test-set features.
test_x
array([[0. , 0. , 0. , ..., 0.00059464, 0.00070084,
0.00296717],
[0. , 0. , 0. , ..., 0.00236042, 0.0015018 ,
0.00448232],
[0. , 0. , 0. , ..., 0.00077167, 0.00140168,
0.00227273],
...,
[0. , 0. , 0. , ..., 0.00085066, 0.00070084,
0.00189394],
[0. , 0.06582633, 0.18431373, ..., 0.02315025, 0.02442932,
0.02001263],
[0. , 0.03221289, 0.09019608, ..., 0.00104948, 0.00120144,
0.00688131]])
def _to_tensor(array, dtype):
    """Cast a NumPy array to *dtype* and wrap the resulting copy in a tensor."""
    return torch.from_numpy(array.astype(dtype))


# Convert the splits to tensors: float32 features, int64 class labels.
x_train_nots = _to_tensor(train_x, np.float32)
y_train = _to_tensor(train_y, np.int64)
x_test_nots = _to_tensor(test_x, np.float32)
y_test = _to_tensor(test_y, np.int64)
# Bundle the training features and labels into a TensorDataset.
train_data = Data.TensorDataset(x_train_nots, y_train)
print(x_train_nots)
print(
    *(t.size() for t in (x_train_nots, y_train, x_test_nots)),
    sep="\n",
)
tensor([[0.0000, 0.0000, 0.0000, ..., 0.0005, 0.0003, 0.0004],
[0.0000, 0.0000, 0.0745, ..., 0.0018, 0.0033, 0.0080],
[0.0000, 0.0000, 0.0490, ..., 0.0031, 0.0028, 0.0131],
...,
[0.1853, 0.0000, 0.0706, ..., 0.0025, 0.0053, 0.0301],
[0.0000, 0.0000, 0.0000, ..., 0.0013, 0.0075, 0.0124],
[0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0003]])
torch.Size([3450, 57])
torch.Size([3450])
torch.Size([1151, 57])
# Mini-batch loader over the training dataset.
train_note_loader = Data.DataLoader(
    train_data,      # dataset the batches are drawn from
    batch_size=64,   # number of samples per batch
    shuffle=True,    # shuffle the samples before batching
    num_workers=0,   # 0 = load the batches in the main process
)
class MLPclassifica(nn.Module):
    """Fully connected classifier: 57 features -> 30 -> 10 -> 2 class scores.

    The network returns raw, unnormalised scores (logits), which is the input
    nn.CrossEntropyLoss expects. Take the arg-max over the last dimension to
    obtain the predicted class.
    """

    def __init__(self):
        super().__init__()
        self.hidden1 = nn.Sequential(
            nn.Linear(57, 30, bias=True),
            nn.ReLU(),
        )
        self.hidden2 = nn.Sequential(
            nn.Linear(30, 10),
            nn.ReLU(),
        )
        # Two outputs, one per class, so that CrossEntropyLoss can be used.
        # No Sigmoid here: CrossEntropyLoss applies log-softmax itself, and
        # squashing the scores into (0, 1) first caps how confident the
        # softmax can become and weakens the gradients. The Sequential
        # wrapper is kept so the state_dict key stays "classifica.0.*".
        self.classifica = nn.Sequential(
            nn.Linear(10, 2),
        )

    def forward(self, x):
        """Return the two class logits for inputs whose last dimension is 57."""
        fc1 = self.hidden1(x)
        fc2 = self.hidden2(fc1)
        return self.classifica(fc2)
# Test-set evaluation and logging happen every `print_step` iterations.
print_step = 25
# hiddenlayer objects that record and draw the training curves.
history1 = hl.History()
canvas1 = hl.Canvas()
# Model, Adam optimiser and cross-entropy loss over the two output scores.
mlpc = MLPclassifica()
optimizer = Adam(mlpc.parameters(), lr=0.01)
loss_func = nn.CrossEntropyLoss()
# Notebook cell: display the layer summary.
mlpc
MLPclassifica(
(hidden1): Sequential(
(0): Linear(in_features=57, out_features=30, bias=True)
(1): ReLU()
)
(hidden2): Sequential(
(0): Linear(in_features=30, out_features=10, bias=True)
(1): ReLU()
)
(classifica): Sequential(
(0): Linear(in_features=10, out_features=2, bias=True)
(1): Sigmoid()
)
)
# Smoke test: push one random 57-feature sample through the network.
# NOTE(review): this rebinds the module-level names x and y, which earlier
# held the spam feature matrix and its labels.
x=torch.randn(1,57).requires_grad_(True)
y=mlpc(x)
x.size()
# NOTE(review): `a` is not read again in the visible code before it is
# reassigned further down; presumably a leftover experiment - confirm.
a=torch.Tensor(y)
# Single pass (one epoch) over the training batches, evaluating on the test
# set every `print_step` iterations.
for step, (b_x, b_y) in enumerate(train_note_loader):
    # Forward pass and loss on the current mini-batch.
    output = mlpc(b_x)
    train_loss = loss_func(output, b_y)
    # Backward pass and parameter update.
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()
    niter = step + 1  # 1-based iteration counter (only one epoch here)
    if niter % print_step == 0:
        # Evaluation needs no gradients: do not build an autograd graph for
        # the whole test set on every logging step.
        with torch.no_grad():
            test_output = mlpc(x_test_nots)
        # Predicted class = index of the largest score in each row.
        _, pre_lab = torch.max(test_output, 1)
        test_accuracy = accuracy_score(y_test, pre_lab)
        print(test_accuracy)
        # Log a plain float so the history does not hold on to the graph.
        history1.log(
            niter,
            train_loss=train_loss.item(),
            test_accuracy=test_accuracy,
        )
        with canvas1:
            canvas1.draw_plot(history1['train_loss'])
            canvas1.draw_plot(history1['test_accuracy'])
# Train for 15 epochs, evaluating on the test set every `print_step`
# iterations and redrawing the loss / accuracy curves.
for epoch in range(15):
    for step, (b_x, b_y) in enumerate(train_note_loader):
        # Forward pass and loss on the current mini-batch.
        output = mlpc(b_x)
        train_loss = loss_func(output, b_y)
        # Backward pass and parameter update.
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        # Global 1-based iteration counter across epochs.
        niter = epoch * len(train_note_loader) + step + 1
        if niter % print_step == 0:
            # Evaluation needs no gradients: do not build an autograd graph
            # for the whole test set on every logging step.
            with torch.no_grad():
                test_output = mlpc(x_test_nots)
            # Predicted class = index of the largest score in each row.
            _, pre_lab = torch.max(test_output, 1)
            test_accuracy = accuracy_score(y_test, pre_lab)
            # Log a plain float so the history does not hold on to the graph.
            history1.log(
                niter,
                train_loss=train_loss.item(),
                test_accuracy=test_accuracy,
            )
            print('niter:-', niter, test_accuracy)
            with canvas1:
                canvas1.draw_plot(history1['train_loss'])
                canvas1.draw_plot(history1['test_accuracy'])
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.datasets import fetch_california_housing
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import SGD,Adam
import torch.utils.data as Data
from matplotlib import pyplot as plt
import seaborn as sns
# Use a data set provided through scikit-learn.
housedata = fetch_california_housing()
# Hold out 30% of the samples for testing, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    housedata.data,
    housedata.target,
    test_size=0.3,
    random_state=42,
)
# Standardise the features; the scaler is fitted on the training split only.
scale = StandardScaler()
train_x = scale.fit_transform(X_train)
test_x = scale.transform(X_test)
# Inspect the data as a 2-D table.
# Option 1: put the scaled features and the target side by side.
feature_table = pd.DataFrame(train_x)
target_table = pd.DataFrame(Y_train)
a = pd.concat([feature_table, target_table], axis=1)
print(a)
0 1 2 3 4 5 6 \
0 0.133506 0.509357 0.181060 -0.273850 -0.184117 -0.010825 -0.805682
1 -0.532218 -0.679873 -0.422630 -0.047868 -0.376191 -0.089316 -1.339473
2 0.170990 -0.362745 0.073128 -0.242600 -0.611240 -0.044800 -0.496645
3 -0.402916 -1.155565 0.175848 -0.008560 -0.987495 -0.075230 1.690024
4 -0.299285 1.857152 -0.259598 -0.070993 0.086015 -0.066357 0.992350
... ... ... ... ... ... ... ...
14443 1.308827 0.509357 0.281603 -0.383849 -0.675265 -0.007030 -0.875918
14444 -0.434100 0.350793 0.583037 0.383154 0.285105 0.063443 -0.763541
14445 -0.494787 0.588640 -0.591570 -0.040978 0.287736 0.017201 -0.758858
14446 0.967171 -1.076283 0.390149 -0.067164 0.306154 0.004821 0.903385
14447 -0.683202 1.857152 -0.829656 -0.087729 1.044630 -0.081672 0.992350
7 0
0 0.780934 1.93800
1 1.245270 1.69700
2 -0.277552 2.59800
3 -0.706938 1.36100
4 -1.430902 5.00001
... ... ...
14443 0.810891 2.29200
14444 1.075513 0.97800
14445 0.601191 2.22100
14446 -1.186252 2.83500
14447 -1.415923 3.25000
[14448 rows x 9 columns]
# Option 2: a DataFrame with named feature columns plus a "target" column.
housedatadf = pd.DataFrame(train_x, columns=housedata.feature_names)
housedatadf["target"] = Y_train
print(housedatadf)
MedInc HouseAge AveRooms AveBedrms Population AveOccup \
0 0.133506 0.509357 0.181060 -0.273850 -0.184117 -0.010825
1 -0.532218 -0.679873 -0.422630 -0.047868 -0.376191 -0.089316
2 0.170990 -0.362745 0.073128 -0.242600 -0.611240 -0.044800
3 -0.402916 -1.155565 0.175848 -0.008560 -0.987495 -0.075230
4 -0.299285 1.857152 -0.259598 -0.070993 0.086015 -0.066357
... ... ... ... ... ... ...
14443 1.308827 0.509357 0.281603 -0.383849 -0.675265 -0.007030
14444 -0.434100 0.350793 0.583037 0.383154 0.285105 0.063443
14445 -0.494787 0.588640 -0.591570 -0.040978 0.287736 0.017201
14446 0.967171 -1.076283 0.390149 -0.067164 0.306154 0.004821
14447 -0.683202 1.857152 -0.829656 -0.087729 1.044630 -0.081672
Latitude Longitude target
0 -0.805682 0.780934 1.93800
1 -1.339473 1.245270 1.69700
2 -0.496645 -0.277552 2.59800
3 1.690024 -0.706938 1.36100
4 0.992350 -1.430902 5.00001
... ... ... ...
14443 -0.875918 0.810891 2.29200
14444 -0.763541 1.075513 0.97800
14445 -0.758858 0.601191 2.22100
14446 0.903385 -1.186252 2.83500
14447 0.992350 -1.415923 3.25000
[14448 rows x 9 columns]
# Heat map of the pairwise correlations between all columns of housedatadf.
column_labels = housedatadf.columns
# rowvar=False: every column (not every row) is treated as one variable.
corr_matrix = np.corrcoef(housedatadf.values, rowvar=False)
datacor = pd.DataFrame(corr_matrix, index=column_labels, columns=column_labels)
plt.figure(figsize=(8, 6))
ax = sns.heatmap(
    datacor,
    square=True,
    annot=True,
    fmt=".3f",
    linewidths=0.5,
    cmap="YlGnBu",
)
plt.show()
# Convert all four arrays to float32 tensors (regression: targets are floats).
train_xt, train_yt, test_xt, test_yt = (
    torch.from_numpy(array.astype(np.float32))
    for array in (train_x, Y_train, test_x, Y_test)
)
# Pair features with targets for each split.
train_data = Data.TensorDataset(train_xt, train_yt)
test_data = Data.TensorDataset(test_xt, test_yt)
# Shuffled mini-batches of 64 training samples, loaded in the main process.
train_loader = Data.DataLoader(
    train_data,
    batch_size=64,
    shuffle=True,
    num_workers=0,
)
class MLP(nn.Module):
    """Fully connected regressor: 8 inputs -> 100 -> 100 -> 50 -> 1 output."""

    def __init__(self):
        super().__init__()
        self.hidden1 = nn.Linear(in_features=8, out_features=100, bias=True)
        self.hidden2 = nn.Linear(in_features=100, out_features=100)
        self.hidden3 = nn.Linear(in_features=100, out_features=50)
        self.predict = nn.Linear(in_features=50, out_features=1)

    def forward(self, x):
        """Apply the three ReLU-activated hidden layers, then the linear output layer."""
        hidden = x
        for layer in (self.hidden1, self.hidden2, self.hidden3):
            hidden = F.relu(layer(hidden))
        return self.predict(hidden)
# Instantiate the regression network and print its layer summary.
mlpreg = MLP()
print(mlpreg)
MLP(
(hidden1): Linear(in_features=8, out_features=100, bias=True)
(hidden2): Linear(in_features=100, out_features=100, bias=True)
(hidden3): Linear(in_features=100, out_features=50, bias=True)
(predict): Linear(in_features=50, out_features=1, bias=True)
)
# Train the regressor with plain SGD on the mean-squared error.
optimizer = SGD(mlpreg.parameters(), lr=0.01)
loss_func = nn.MSELoss()
train_loss_all = []  # mean training loss of each epoch
for epoch in range(30):
    train_loss = 0
    train_num = 0
    for b_x, b_y in train_loader:
        # The network emits shape (batch, 1) while the targets are (batch,).
        # Without dropping the trailing axis, MSELoss broadcasts the pair to
        # (batch, batch) and minimises the wrong quantity.
        output = mlpreg(b_x)[:, 0]
        loss = loss_func(output, b_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # loss is the batch mean: weight it by the batch size so that the
        # epoch average below is a per-sample mean.
        train_loss += loss.item() * b_x.size(0)
        train_num += b_x.size(0)
        # Per-batch progress: epoch index, batch loss, batch size.
        print(epoch, loss.item(), b_x.size(0))
    train_loss_all.append(train_loss / train_num)
0 1.2354446649551392 64
0 1.1876583099365234 64
0 1.2846877574920654 64 省略
# Plot the mean training loss recorded for every epoch.
plt.figure(figsize=(10, 6))
plt.plot(train_loss_all, "ro-", label="Train loss")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.grid()
plt.legend()
plt.show()
# Predict on the test set and report the mean absolute error.
# Inference needs no gradients, so run it under no_grad instead of building
# an autograd graph and stripping it afterwards through the legacy `.data`.
with torch.no_grad():
    pre_y = mlpreg(test_xt)
pre_y = pre_y.numpy()
mae = mean_absolute_error(Y_test, pre_y)
print("mae:", mae)
mae: 0.8968341607985509
# Compare the predictions with the true values, with the samples ordered by
# increasing true value.
index = np.argsort(Y_test)
plt.figure(figsize=(12, 5))
plt.plot(np.arange(len(Y_test)), Y_test[index], "r", label="Original Y")
plt.scatter(np.arange(len(pre_y)), pre_y[index], s=3, c="b", label="Prediction")
# The position has to be passed as `loc=`: a lone positional string is taken
# as the sequence of legend labels (one label per character).
plt.legend(loc="upper left")
plt.grid()
plt.xlabel("Index")
plt.ylabel("Y")
plt.show()