基础知识
import torch

torch.empty(5, 3)                      # 5x3 tensor, memory left uninitialized
torch.rand(5, 3)                       # 5x3 tensor sampled uniformly from [0, 1)
torch.zeros(5, 3, dtype=torch.long)    # 5x3 integer tensor of zeros

x = torch.tensor([5.5, 3])             # build a tensor from a Python list
x = x.new_ones(5, 3, dtype=torch.double)    # 5x3 tensor of ones (float64)
x = torch.randn_like(x, dtype=torch.float)  # same shape as x, standard-normal float32 values
相加
x+y # element-wise addition via the + operator (assumes x, y defined above with broadcastable shapes)
torch.add(x,y) # functional form; equivalent to x + y
索引
x[:,1] # 索引操作
随机数据
x = torch.randn(4, 4)    # 4x4 tensor of standard-normal samples
y = x.view(16)           # reshape to a flat vector of 16 elements
z = x.view(-1, 8)        # -1 lets torch infer the first dimension (here 2)
print(x.size(), y.size(), z.size())   # show the three shapes
torch.Tensor 转 numpy
a = torch.ones(5)   # tensor of five ones
b = a.numpy()       # NumPy view of the same data (torch.Tensor -> ndarray)
numpy 转 torch.Tensor
a = np.ones(5)             # NumPy array of five ones (float64)
b = torch.from_numpy(a)    # wrap as a tensor (ndarray -> torch.Tensor)
自动求导
# Autograd demo.
# Option 1: request gradient tracking at construction time.
x = torch.randn(3, 4, requires_grad=True)
# Option 2: flip the flag on an existing tensor.
x = torch.randn(3, 4)
x.requires_grad = True
b = torch.randn(3, 4, requires_grad=True)
t = x + b
y = t.sum()                       # scalar, e.g. tensor(-7.2647, grad_fn=<SumBackward0>)
y.backward(retain_graph=True)     # gradients accumulate across backward() calls by default
b.grad                            # dy/db: 3x4 tensor of ones
x.requires_grad, b.requires_grad, t.requires_grad   # (True, True, True)
t.is_leaf, x.is_leaf              # t is an intermediate node, x is a leaf -> (False, True)
处理数据分包
目的:将数据打乱分成多份。
def get_data(train_ds: TensorDataset, valid_ds: TensorDataset, bs: int):
    """Wrap the two datasets in DataLoaders.

    The training loader shuffles every epoch; the validation loader keeps
    the original order and uses a doubled batch size.
    """
    train_dl = DataLoader(train_ds, batch_size=bs, shuffle=True)
    valid_dl = DataLoader(valid_ds, batch_size=bs * 2)
    return train_dl, valid_dl
# Pack the train / test features and labels into TensorDatasets.
# NOTE(review): assumes X_train, Y_train, X_test, Y_test are NumPy arrays
# produced earlier (e.g. by train_test_split) — confirm against caller.
train_ds = TensorDataset(
torch.from_numpy(X_train).float(),
torch.from_numpy(Y_train).float()
)
valid_ds = TensorDataset(
torch.from_numpy(X_test).float(),
torch.from_numpy(Y_test).float()
)
# Batch both splits via get_data (train shuffled at bs, valid at 2*bs).
bs = 16
train_dl, valid_dl = get_data(train_ds, valid_ds, bs)
线性回归模型
import warnings
import sklearn
from sklearn import preprocessing
import torch.nn.functional as F
import torch
from sklearn import datasets
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from torch import nn
import numpy as np
import matplotlib.pyplot as plt
# 其实线性回归就是一个不加激活函数的全连接层
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
warnings.filterwarnings("ignore")
class LinearRegressionModel(nn.Module):
    """Linear regression as a single fully-connected layer with no activation.

    The model owns its loss function and optimizer so training is fully
    self-contained through ``fit``.
    """

    def __init__(self, input_dim, output_dim, lr=0.03):
        """
        Args:
            input_dim:  number of input features.
            output_dim: number of regression targets.
            lr:         SGD learning rate (default 0.03, the original
                        hard-coded value — now parameterized).
        """
        super(LinearRegressionModel, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)
        # MSE loss — inputs should be standardized (roughly within [-1, 1])
        # for this learning rate to remain stable.
        self.loss_func = nn.MSELoss()
        self.opt = torch.optim.SGD(self.parameters(), lr=lr)  # SGD optimizer
        self.val_loss = None  # last validation loss computed by fit()

    def forward(self, x):
        return self.linear(x)

    def loss_batch(self, xb, yb, opt=None):
        """Compute the loss on one batch; take an SGD step when opt is given.

        Returns:
            (loss_value, batch_size) so callers can form a size-weighted mean.
        """
        loss = self.loss_func(self(xb), yb)  # forward pass + loss
        if opt is not None:
            loss.backward()   # back-propagate
            opt.step()        # update the weights
            opt.zero_grad()   # gradients accumulate by default — reset each step
        return loss.item(), len(xb)

    def fit(self, steps, train_dl: DataLoader, valid_dl: DataLoader):
        """Train for `steps` epochs, tracking validation loss after each."""
        with tqdm(total=steps) as pbar:
            for step in range(steps):
                self.train()  # training mode
                for xb, yb in train_dl:
                    self.loss_batch(xb, yb, self.opt)
                self.eval()   # evaluation mode
                with torch.no_grad():
                    losses, nums = zip(
                        *[self.loss_batch(xb, yb) for xb, yb in valid_dl]
                    )
                # Size-weighted mean of the per-batch validation losses.
                self.val_loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)
                pbar.set_description("model")
                pbar.set_postfix(step=step, val_loss=self.val_loss)
                pbar.update(1)
                pbar.write('当前数据损失:{}'.format(self.val_loss))
训练线性回归模型
def get_data(train_ds: TensorDataset, valid_ds: TensorDataset, bs: int):
    """Return a shuffled training loader at bs and a validation loader at 2*bs."""
    loaders = (
        DataLoader(train_ds, batch_size=bs, shuffle=True),
        DataLoader(valid_ds, batch_size=2 * bs),
    )
    return loaders
if __name__ == '__main__':
# Alternative: generate a synthetic regression problem (commented out):
# X, Y = make_regression(n_features=1, n_samples=506, n_informative=2, n_targets=1, noise=1.2)
# n_features (default=100): number of features per sample
# n_samples (default=100): number of samples
# n_informative (default=10): features actually used to build the linear model
# n_targets (default=1): dimension of each target y (scalar by default)
# noise (default=0.0): std-dev of the Gaussian noise added to the output
# Load the Boston housing data (13 features, 506 samples).
# NOTE(review): datasets.load_boston was removed in scikit-learn 1.2 —
# this line fails on modern sklearn; port to another dataset if upgrading.
loaded_data = datasets.load_boston()
X = loaded_data.data
Y = loaded_data.target
# Standardize features to zero mean / unit variance.
X = sklearn.preprocessing.StandardScaler().fit_transform(X)
X_train, X_test, Y_train, Y_test = train_test_split(
X, Y.reshape(-1, 1), test_size=0.01
)
train_ds = TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(Y_train).float())
valid_ds = TensorDataset(torch.from_numpy(X_test).float(), torch.from_numpy(Y_test).float())
bs = 16
train_dl, valid_dl = get_data(train_ds, valid_ds, bs)
# Train the model: 13 input features -> 1 regression target.
model = LinearRegressionModel(13, 1)
model.fit(1000, train_dl, valid_dl)
pass
单分类模型
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import torch
from torch import nn
import numpy as np
import torch.nn.functional as F
import warnings
from sklearn.datasets import make_blobs
from torch import optim
from tqdm import tqdm
warnings.filterwarnings("ignore")
# torch.nn.functional(一般用于没有可学习的参数) 和 nn.Module (一般用于有可学习的参数)
# 创建一个model来更简化代码
# 必须继承nn.Module 且在其构造函数中需要调用nn.Module的构造函数
# 无需写反向传播函数,nn.Module 能够利用 autograd 自动实现反向传播
# Module 中的可学习参数可以通过named_parameters()或者parameters()返回迭代器
class Mnist_NN(nn.Module):
    """Two-hidden-layer fully-connected classifier (input -> 128 -> 256 -> output).

    nn.Module subclasses only need to define forward(); autograd derives the
    backward pass. Learnable parameters are reachable via parameters() /
    named_parameters().
    """

    def __init__(self, input_dim, output_dim, lr=0.001):
        """
        Args:
            input_dim:  number of input features.
            output_dim: number of classes.
            lr:         SGD learning rate (default 0.001, the original
                        hard-coded value — now parameterized).
        """
        super().__init__()
        self.hidden1 = nn.Linear(input_dim, 128)
        self.hidden2 = nn.Linear(128, 256)
        self.out = nn.Linear(256, output_dim)
        # cross_entropy expects raw logits and integer class labels.
        self.loss_func = F.cross_entropy
        self.opt = optim.SGD(self.parameters(), lr=lr)  # plain SGD
        self.val_loss = None  # last validation loss computed by fit()

    def forward(self, x):
        x = F.relu(self.hidden1(x))
        x = F.relu(self.hidden2(x))
        return self.out(x)  # raw logits; softmax is folded into the loss

    def loss_batch(self, xb, yb, opt=None):
        """Loss on one batch; performs an optimizer step when opt is given.

        Returns:
            (loss_value, batch_size) so callers can form a size-weighted mean.
        """
        loss = self.loss_func(self(xb), yb)
        if opt is not None:
            loss.backward()
            opt.step()
            opt.zero_grad()  # gradients accumulate by default — reset each step
        return loss.item(), len(xb)

    def fit(self, steps, train_dl: DataLoader, valid_dl: DataLoader):
        """Train for `steps` epochs, tracking validation loss after each."""
        with tqdm(total=steps) as pbar:
            for step in range(steps):
                self.train()  # training mode
                for xb, yb in train_dl:
                    self.loss_batch(xb, yb, self.opt)
                self.eval()   # evaluation mode
                with torch.no_grad():
                    losses, nums = zip(
                        *[self.loss_batch(xb, yb) for xb, yb in valid_dl]
                    )
                # Size-weighted mean of the per-batch validation losses.
                self.val_loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)
                pbar.set_description("model")
                pbar.set_postfix(step=step, val_loss=self.val_loss)
                pbar.update(1)
                pbar.write('当前数据损失:{}'.format(self.val_loss))

    def predict(self, X):
        """Return raw logits for X (apply softmax/argmax downstream)."""
        return self(X)
训练单分类模型
def get_data(train_ds: TensorDataset, valid_ds: TensorDataset, bs: int):
    """Wrap datasets in loaders: training shuffled at bs, validation at bs * 2."""
    train_loader = DataLoader(train_ds, shuffle=True, batch_size=bs)
    valid_loader = DataLoader(valid_ds, batch_size=bs * 2)
    return train_loader, valid_loader
if __name__ == '__main__':
# Generate a synthetic classification problem.
x, y = make_blobs(n_features=10, n_samples=100, centers=3, random_state=3)
# n_features: number of features per sample
# n_samples: number of samples
# centers: number of cluster centers (i.e. the number of label classes)
# random_state: seed that fixes the generated data
# cluster_std: per-class variance; pass a list of length `centers` to customize
# Split into train / test (default 75% / 25%).
x_train, x_test, y_train, y_test = train_test_split(x, y)
train_ds = TensorDataset(torch.from_numpy(x_train), torch.from_numpy(y_train).long())
valid_ds = TensorDataset(torch.from_numpy(x_test), torch.from_numpy(y_test).long())
bs = 16
# make_blobs yields float64, so make double the default tensor type.
# NOTE(review): set_default_tensor_type is deprecated in recent torch —
# consider torch.set_default_dtype(torch.float64) when upgrading.
torch.set_default_tensor_type(torch.DoubleTensor)
train_dl, valid_dl = get_data(train_ds, valid_ds, bs)
# Train the classifier: 10 features -> 3 classes.
model = Mnist_NN(10, 3)
model.fit(100, train_dl, valid_dl)
pass
Dataset
from torch.utils.data import Dataset
class OneData(Dataset):
    """Skeleton for a custom torch Dataset.

    Subclasses of Dataset must implement __getitem__ and __len__.
    """

    def __init__(self):
        pass

    def __getitem__(self, index):
        # Fix: __getitem__ must accept the item index — the original
        # signature `__getitem__(self,)` made every `ds[i]` raise TypeError.
        pass

    def __len__(self):
        pass
模型的保存与读取
# Persist only the learned parameters (state_dict), not the whole module.
# NOTE(review): `model` must be an already-constructed nn.Module with the
# same architecture when loading — confirm against caller.
torch.save(model.state_dict(),'data/model.pkl')
model.load_state_dict(torch.load('data/model.pkl'))