RNN:
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-ycy6d8iB-1584285348969)(https://cdn.kesci.com/upload/image/q5jjvcykud.png?imageView2/0/w/320/h/320)]
H t = ϕ ( X t W x h + H t − 1 W h h + b h ) H_{t} = ϕ(X_{t}W_{xh} + H_{t-1}W_{hh} + b_{h}) Ht=ϕ(XtWxh+Ht−1Whh+bh)
GRU:
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-r6S2i6jZ-1584285348971)(https://cdn.kesci.com/upload/image/q5jk0q9suq.png?imageView2/0/w/640/h/640)]
R
t
=
σ
(
X
t
W
x
r
+
H
t
−
1
W
h
r
+
b
r
)
R_{t} = σ(X_tW_{xr} + H_{t−1}W_{hr} + b_r)
Rt=σ(XtWxr+Ht−1Whr+br)
Z
t
=
σ
(
X
t
W
x
z
+
H
t
−
1
W
h
z
+
b
z
)
Z_{t} = σ(X_tW_{xz} + H_{t−1}W_{hz} + b_z)
Zt=σ(XtWxz+Ht−1Whz+bz)
H
~
t
=
t
a
n
h
(
X
t
W
x
h
+
(
R
t
⊙
H
t
−
1
)
W
h
h
+
b
h
)
\widetilde{H}_t = tanh(X_tW_{xh} + (R_t ⊙H_{t−1})W_{hh} + b_h)
H
t=tanh(XtWxh+(Rt⊙Ht−1)Whh+bh)
H
t
=
Z
t
⊙
H
t
−
1
+
(
1
−
Z
t
)
⊙
H
~
t
H_t = Z_t⊙H_{t−1} + (1−Z_t)⊙\widetilde{H}_t
Ht=Zt⊙Ht−1+(1−Zt)⊙H
t
• 重置⻔有助于捕捉时间序列⾥短期的依赖关系;
• 更新⻔有助于捕捉时间序列⾥⻓期的依赖关系。
载入数据集
import os os.listdir('/home/input')#数据集文件夹 import numpy as np import torch from torch import nn, optim import torch.nn.functional as F import sys sys.path.append("../input/") import d2l_jay4504 as d2l device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') (corpus_indices, char_to_idx, idx_to_char, vocab_size) = d2l.load_data_jay_lyrics()
初始化参数
num_inputs, num_hiddens, num_outputs = vocab_size, 256, vocab_size print('will use', device) def get_params(): def _one(shape): ts = torch.tensor(np.random.normal(0, 0.01, size=shape), device=device, dtype=torch.float32) #正态分布 return torch.nn.Parameter(ts, requires_grad=True) def _three(): return (_one((num_inputs, num_hiddens)), _one((num_hiddens, num_hiddens)), torch.nn.Parameter(torch.zeros(num_hiddens, device=device, dtype=torch.float32), requires_grad=True)) W_xz, W_hz, b_z = _three() # 更新门参数 W_xr, W_hr, b_r = _three() # 重置门参数 W_xh, W_hh, b_h = _three() # 候选隐藏状态参数 # 输出层参数 W_hq = _one((num_hiddens, num_outputs)) b_q = torch.nn.Parameter(torch.zeros(num_outputs, device=device, dtype=torch.float32), requires_grad=True) return nn.ParameterList([W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hq, b_q]) def init_gru_state(batch_size, num_hiddens, device): #隐藏状态初始化 return (torch.zeros((batch_size, num_hiddens), device=device), )
GRU模型
def gru(inputs, state, params): W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hq, b_q = params H, = state outputs = [] for X in inputs: Z = torch.sigmoid(torch.matmul(X, W_xz) + torch.matmul(H, W_hz) + b_z) R = torch.sigmoid(torch.matmul(X, W_xr) + torch.matmul(H, W_hr) + b_r) H_tilda = torch.tanh(torch.matmul(X, W_xh) + R * torch.matmul(H, W_hh) + b_h) H = Z * H + (1 - Z) * H_tilda Y = torch.matmul(H, W_hq) + b_q outputs.append(Y) return outputs, (H,)
训练模型
num_epochs, num_steps, batch_size, lr, clipping_theta = 160, 35, 32, 1e2, 1e-2 pred_period, pred_len, prefixes = 40, 50, ['分开', '不分开'] d2l.train_and_predict_rnn(gru, get_params, init_gru_state, num_hiddens, vocab_size, device, corpus_indices, idx_to_char, char_to_idx, False, num_epochs, num_steps, lr, clipping_theta, batch_size, pred_period, pred_len, prefixes)
输出:
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-YiCbWe1Y-1584285348972)(https://imgkr.cn-bj.ufileos.com/2796f004-0af8-41ec-951d-38d45ca897e0.png)]
简洁实现
num_hiddens=256 num_epochs, num_steps, batch_size, lr, clipping_theta = 160, 35, 32, 1e2, 1e-2 pred_period, pred_len, prefixes = 40, 50, ['分开', '不分开'] lr = 1e-2 # 注意调整学习率 gru_layer = nn.GRU(input_size=vocab_size, hidden_size=num_hiddens) model = d2l.RNNModel(gru_layer, vocab_size).to(device) d2l.train_and_predict_rnn_pytorch(model, num_hiddens, vocab_size, device, corpus_indices, idx_to_char, char_to_idx, num_epochs, num_steps, lr, clipping_theta, batch_size, pred_period, pred_len, prefixes)
输出:
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-s7GZt0zB-1584285348975)(https://imgkr.cn-bj.ufileos.com/31d8288d-7d11-43af-a65b-3c46692cd4b4.png)]
LSTM
长短期记忆long short-term memory:
遗忘门:控制上一时间步的记忆细胞
输入门:控制当前时间步的输入
输出门:控制从记忆细胞到隐藏状态
记忆细胞:⼀种特殊的隐藏状态的信息的流动
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-V5S3OxqQ-1584285348976)(https://cdn.kesci.com/upload/image/q5jk2bnnej.png?imageView2/0/w/640/h/640)]
I
t
=
σ
(
X
t
W
x
i
+
H
t
−
1
W
h
i
+
b
i
)
I_t = σ(X_tW_{xi} + H_{t−1}W_{hi} + b_i)
It=σ(XtWxi+Ht−1Whi+bi)
F
t
=
σ
(
X
t
W
x
f
+
H
t
−
1
W
h
f
+
b
f
)
F_t = σ(X_tW_{xf} + H_{t−1}W_{hf} + b_f)
Ft=σ(XtWxf+Ht−1Whf+bf)
O
t
=
σ
(
X
t
W
x
o
+
H
t
−
1
W
h
o
+
b
o
)
O_t = σ(X_tW_{xo} + H_{t−1}W_{ho} + b_o)
Ot=σ(XtWxo+Ht−1Who+bo)
C
~
t
=
t
a
n
h
(
X
t
W
x
c
+
H
t
−
1
W
h
c
+
b
c
)
\widetilde{C}_t = tanh(X_tW_{xc} + H_{t−1}W_{hc} + b_c)
C
t=tanh(XtWxc+Ht−1Whc+bc)
C
t
=
F
t
⊙
C
t
−
1
+
I
t
⊙
C
~
t
C_t = F_t ⊙C_{t−1} + I_t ⊙\widetilde{C}_t
Ct=Ft⊙Ct−1+It⊙C
t
H
t
=
O
t
⊙
t
a
n
h
(
C
t
)
H_t = O_t⊙tanh(C_t)
Ht=Ot⊙tanh(Ct)
初始化参数
num_inputs, num_hiddens, num_outputs = vocab_size, 256, vocab_size print('will use', device) def get_params(): def _one(shape): ts = torch.tensor(np.random.normal(0, 0.01, size=shape), device=device, dtype=torch.float32) return torch.nn.Parameter(ts, requires_grad=True) def _three(): return (_one((num_inputs, num_hiddens)), _one((num_hiddens, num_hiddens)), torch.nn.Parameter(torch.zeros(num_hiddens, device=device, dtype=torch.float32), requires_grad=True)) W_xi, W_hi, b_i = _three() # 输入门参数 W_xf, W_hf, b_f = _three() # 遗忘门参数 W_xo, W_ho, b_o = _three() # 输出门参数 W_xc, W_hc, b_c = _three() # 候选记忆细胞参数 # 输出层参数 W_hq = _one((num_hiddens, num_outputs)) b_q = torch.nn.Parameter(torch.zeros(num_outputs, device=device, dtype=torch.float32), requires_grad=True) return nn.ParameterList([W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o, W_xc, W_hc, b_c, W_hq, b_q]) def init_lstm_state(batch_size, num_hiddens, device): return (torch.zeros((batch_size, num_hiddens), device=device), torch.zeros((batch_size, num_hiddens), device=device))
LSTM模型
def lstm(inputs, state, params): [W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o, W_xc, W_hc, b_c, W_hq, b_q] = params (H, C) = state outputs = [] for X in inputs: I = torch.sigmoid(torch.matmul(X, W_xi) + torch.matmul(H, W_hi) + b_i) F = torch.sigmoid(torch.matmul(X, W_xf) + torch.matmul(H, W_hf) + b_f) O = torch.sigmoid(torch.matmul(X, W_xo) + torch.matmul(H, W_ho) + b_o) C_tilda = torch.tanh(torch.matmul(X, W_xc) + torch.matmul(H, W_hc) + b_c) C = F * C + I * C_tilda H = O * C.tanh() Y = torch.matmul(H, W_hq) + b_q outputs.append(Y) return outputs, (H, C)
训练模型
num_epochs, num_steps, batch_size, lr, clipping_theta = 160, 35, 32, 1e2, 1e-2 pred_period, pred_len, prefixes = 40, 50, ['分开', '不分开'] d2l.train_and_predict_rnn(lstm, get_params, init_lstm_state, num_hiddens, vocab_size, device, corpus_indices, idx_to_char, char_to_idx, False, num_epochs, num_steps, lr, clipping_theta, batch_size, pred_period, pred_len, prefixes)
输出:
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-u6yjmgTB-1584285348977)(https://imgkr.cn-bj.ufileos.com/2b6c7757-175f-4de4-96db-c34984cd6795.png)]
简洁实现
num_hiddens=256 num_epochs, num_steps, batch_size, lr, clipping_theta = 160, 35, 32, 1e2, 1e-2 pred_period, pred_len, prefixes = 40, 50, ['分开', '不分开'] lr = 1e-2 # 注意调整学习率 lstm_layer = nn.LSTM(input_size=vocab_size, hidden_size=num_hiddens) model = d2l.RNNModel(lstm_layer, vocab_size) d2l.train_and_predict_rnn_pytorch(model, num_hiddens, vocab_size, device, corpus_indices, idx_to_char, char_to_idx, num_epochs, num_steps, lr, clipping_theta, batch_size, pred_period, pred_len, prefixes)
深度循环神经网络
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-lAOtlHvs-1584285348979)(https://cdn.kesci.com/upload/image/q5jk3z1hvz.png?imageView2/0/w/320/h/320)]
H
t
(
1
)
=
ϕ
(
X
t
W
x
h
(
1
)
+
H
t
−
1
(
1
)
W
h
h
(
1
)
+
b
h
(
1
)
)
\boldsymbol{H}_t^{(1)} = \phi(\boldsymbol{X}_t \boldsymbol{W}_{xh}^{(1)} + \boldsymbol{H}_{t-1}^{(1)} \boldsymbol{W}_{hh}^{(1)} + \boldsymbol{b}_h^{(1)})
Ht(1)=ϕ(XtWxh(1)+Ht−1(1)Whh(1)+bh(1))
H
t
(
ℓ
)
=
ϕ
(
H
t
(
ℓ
−
1
)
W
x
h
(
ℓ
)
+
H
t
−
1
(
ℓ
)
W
h
h
(
ℓ
)
+
b
h
(
ℓ
)
)
\boldsymbol{H}_t^{(\ell)} = \phi(\boldsymbol{H}_t^{(\ell-1)} \boldsymbol{W}_{xh}^{(\ell)} + \boldsymbol{H}_{t-1}^{(\ell)} \boldsymbol{W}_{hh}^{(\ell)} + \boldsymbol{b}_h^{(\ell)})
Ht(ℓ)=ϕ(Ht(ℓ−1)Wxh(ℓ)+Ht−1(ℓ)Whh(ℓ)+bh(ℓ))
O
t
=
H
t
(
L
)
W
h
q
+
b
q
\boldsymbol{O}_t = \boldsymbol{H}_t^{(L)} \boldsymbol{W}_{hq} + \boldsymbol{b}_q
Ot=Ht(L)Whq+bq
num_hiddens=256 num_epochs, num_steps, batch_size, lr, clipping_theta = 160, 35, 32, 1e2, 1e-2 pred_period, pred_len, prefixes = 40, 50, ['分开', '不分开'] lr = 1e-2 # 注意调整学习率 gru_layer = nn.LSTM(input_size=vocab_size, hidden_size=num_hiddens,num_layers=2) model = d2l.RNNModel(gru_layer, vocab_size).to(device) d2l.train_and_predict_rnn_pytorch(model, num_hiddens, vocab_size, device, corpus_indices, idx_to_char, char_to_idx, num_epochs, num_steps, lr, clipping_theta, batch_size, pred_period, pred_len, prefixes)
输出:
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-LPDnzNUI-1584285348981)(https://imgkr.cn-bj.ufileos.com/524263d2-c8b0-4ad6-8d3e-69e970187cbb.png)]
gru_layer = nn.LSTM(input_size=vocab_size, hidden_size=num_hiddens,num_layers=6) model = d2l.RNNModel(gru_layer, vocab_size).to(device) d2l.train_and_predict_rnn_pytorch(model, num_hiddens, vocab_size, device, corpus_indices, idx_to_char, char_to_idx, num_epochs, num_steps, lr, clipping_theta, batch_size, pred_period, pred_len, prefixes)
输出:
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-cdLIhYUy-1584285348982)(https://imgkr.cn-bj.ufileos.com/43fd13df-1c5b-4dc3-aeba-3d21c7844b8b.png)]
双向循环神经网络
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-o1iBelTn-1584285348984)(https://cdn.kesci.com/upload/image/q5j8hmgyrz.png?imageView2/0/w/320/h/320)]
H
→
t
=
ϕ
(
X
t
W
x
h
(
f
)
+
H
→
t
−
1
W
h
h
(
f
)
+
b
h
(
f
)
)
H
←
t
=
ϕ
(
X
t
W
x
h
(
b
)
+
H
←
t
+
1
W
h
h
(
b
)
+
b
h
(
b
)
)
\begin{aligned} \overrightarrow{\boldsymbol{H}}_t &= \phi(\boldsymbol{X}_t \boldsymbol{W}_{xh}^{(f)} + \overrightarrow{\boldsymbol{H}}_{t-1} \boldsymbol{W}_{hh}^{(f)} + \boldsymbol{b}_h^{(f)})\\ \overleftarrow{\boldsymbol{H}}_t &= \phi(\boldsymbol{X}_t \boldsymbol{W}_{xh}^{(b)} + \overleftarrow{\boldsymbol{H}}_{t+1} \boldsymbol{W}_{hh}^{(b)} + \boldsymbol{b}_h^{(b)}) \end{aligned}
H
tH
t=ϕ(XtWxh(f)+H
t−1Whh(f)+bh(f))=ϕ(XtWxh(b)+H
t+1Whh(b)+bh(b))
H
t
=
(
H
→
t
,
H
←
t
)
\boldsymbol{H}_t=(\overrightarrow{\boldsymbol{H}}_{t}, \overleftarrow{\boldsymbol{H}}_t)
Ht=(H
t,H
t)
O
t
=
H
t
W
h
q
+
b
q
\boldsymbol{O}_t = \boldsymbol{H}_t \boldsymbol{W}_{hq} + \boldsymbol{b}_q
Ot=HtWhq+bq
num_hiddens=128 num_epochs, num_steps, batch_size, lr, clipping_theta = 160, 35, 32, 1e-2, 1e-2 pred_period, pred_len, prefixes = 40, 50, ['分开', '不分开'] lr = 1e-2 # 注意调整学习率 gru_layer = nn.GRU(input_size=vocab_size, hidden_size=num_hiddens,bidirectional=True) model = d2l.RNNModel(gru_layer, vocab_size).to(device) d2l.train_and_predict_rnn_pytorch(model, num_hiddens, vocab_size, device, corpus_indices, idx_to_char, char_to_idx, num_epochs, num_steps, lr, clipping_theta, batch_size, pred_period, pred_len, prefixes)
输出:
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-GhF608ea-1584285348985)(https://imgkr.cn-bj.ufileos.com/4bcdb31e-b134-485d-bdd5-0eb4ec923061.png)]