Architecture diagram: (figure omitted)
Question 1:
How do we handle sentences of different lengths? Pad every sentence to the maximum length.
Question 2:
The padded positions must be handled with a mask mechanism when computing attention (a small sketch follows).
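A minimal sketch of both answers on toy id sequences (the numbers are made up; the real pipeline below builds the sequences with the Keras tokenizer):
import tensorflow as tf
import torch

seqs = [[5, 2, 9], [7, 3], [4, 8, 1, 6]]                                      # toy id sequences of different lengths
padded = tf.keras.preprocessing.sequence.pad_sequences(seqs, padding='post')  # question 1: pad to the maximum length with trailing zeros
print(padded)                                                                 # shape (3, 4)
mask = torch.tensor(padded != 0, dtype=torch.long)                            # question 2: 1 at real tokens, 0 at padding
print(mask)                                                                   # used later to suppress attention on padded positions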
Data processing
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import jieba
from nltk.translate.bleu_score import sentence_bleu
from opencc import OpenCC
# Preprocess an English sentence and add start/end markers
def preprocess_english_sentence(w):
    w = w.lower().strip()
    # Insert a space between a word and the punctuation that follows it
    # "he is a boy." => "he is a boy ."
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    # Replace everything except (a-z, A-Z, ".", "?", "!", ",") with a space
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w = w.strip()
    # Add start and end markers so the model knows
    # where prediction should begin and end
    w = '<start> ' + w + ' <end>'
    return w
# Preprocess a (traditional) Chinese sentence and add start/end markers
def preprocess_chinese_sentence(w):
    # Convert traditional Chinese characters to simplified Chinese
    w = OpenCC('t2s').convert(w)
    # Word segmentation with jieba
    ws = [i for i in jieba.cut(w)]
    w = ' '.join(ws)
    # Add start and end markers so the model knows where prediction should begin and end
    w = '<start> ' + w + ' <end>'
    return w
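A quick sanity check of the two preprocessing functions (the sample pair below is my own, not taken from the dataset):
print(preprocess_english_sentence("He is a boy."))
# expected: <start> he is a boy . <end>
print(preprocess_chinese_sentence("他是個男孩。"))
# expected (roughly): <start> 他 是 个 男孩 。 <end>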
# Read the data file and preprocess both sides of each pair
def read_data(path):
    english = []
    chinese = []
    with open(path, encoding='utf-8') as f:
        for line in tqdm(f):
            eng_one = line.split('\t')[0]
            eng_one = preprocess_english_sentence(eng_one)
            chin_one = line.split('\t')[1]
            chin_one = preprocess_chinese_sentence(chin_one)
            english.append(eng_one)
            chinese.append(chin_one)
    return english, chinese
target_eng, input_chin = read_data("cmn.txt")
print(target_eng[-1])
print(input_chin[-1])
Converting sentences to integer ids
# Turn a list of preprocessed sentences into padded integer sequences
def creat_tokenize_padd(list_data):  # takes a single list of sentences
    # Initialize the Tokenizer; the sentences are already preprocessed,
    # so tokens are simply split on whitespace
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    # Build the vocabulary from the data
    tokenizer.fit_on_texts(list_data)
    # Convert the text sequences into id sequences
    tensor = tokenizer.texts_to_sequences(list_data)
    # Pad (and truncate) every sentence to a fixed length of 20, padding at the end
    t_tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post', maxlen=20)
    return t_tensor, tokenizer
input_chin_tensor, input_chin_tokenizer = creat_tokenize_padd(input_chin)
target_eng_tensor, target_eng_tokenizer = creat_tokenize_padd(target_eng)
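To see what comes out, inspect the fitted tokenizer and the padded tensor (the exact values depend on the corpus):
print(input_chin_tensor.shape)                      # (num_pairs, 20) after padding/truncation
print(input_chin_tokenizer.word_index['<start>'])   # id assigned to the start marker
print(input_chin_tokenizer.sequences_to_texts([input_chin_tensor[0].tolist()]))  # round trip back to tokens (pad id 0 is skipped)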
Sentence padding
MAX_INPUT_LENGTH = max([len(i) for i in input_chin_tensor])
MAX_OUTPUT_LENGTH = max([len(i) for i in target_eng_tensor])
print("Maximum input sentence length:", MAX_INPUT_LENGTH)
print("Maximum output sentence length:", MAX_OUTPUT_LENGTH)
print(input_chin_tensor[-1])
print(target_eng_tensor[-1])
print(type(input_chin_tensor))
print(input_chin_tensor.shape)
print(type(target_eng_tensor))
print(target_eng_tensor.shape)
target_index_word = target_eng_tokenizer.index_word
target_word_index = target_eng_tokenizer.word_index
input_index_word = input_chin_tokenizer.index_word
input_word_index = input_chin_tokenizer.word_index
print(input_word_index["我"])  # sanity check: prints the id of 我
print(input_index_word[4])  # sanity check: prints the word with id 4
print("Source vocabulary size: {}, target vocabulary size: {}".format(len(input_index_word), len(target_index_word)))
input_chin_tensor_train, input_chin_tensor_val, target_eng_tensor_train, target_eng_tensor_val = train_test_split(input_chin_tensor, target_eng_tensor, test_size=0.3)
print("Training set size: {}, validation set size: {}".format(len(input_chin_tensor_train), len(input_chin_tensor_val)))
# To enlarge the training data one could sample pairs, e.g. training_pairs = [tensorsFromPair(random.choice(pairs)) for i in range(n_iters)]
EncoderRNN
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        # hidden_size doubles as the embedding_dim
        self.embedding = nn.Embedding(input_size, hidden_size)  # embedding layer, used in forward()
        # input_size = source vocabulary size, hidden_size = embedding dimension
        self.gru = nn.GRU(hidden_size, hidden_size)
        # GRU with identical input and hidden dimensions;
        # an extra num_layers argument would set the number of GRU layers (default 1)
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)
    def forward(self, input, hidden):  # hidden: torch.Size([1, 1, 256])
        # input has shape (seq_len=1, batch_size=1)
        embedded = self.embedding(input).view(1, 1, -1)
        # after embedding the shape is (seq_len=1, batch_size=1, embedding_dim)
        output = embedded
        # hidden shape: (num_layers*num_directions=1, batch_size=1, hidden_size)
        # output shape: (seq_len=1, batch_size=1, hidden_size)
        output, hidden = self.gru(output, hidden)
        return output, hidden
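A quick shape check of the encoder on a single token id (the vocabulary size and id below are arbitrary; the training loop later feeds it one word at a time in the same way):
enc = EncoderRNN(input_size=100, hidden_size=256)
hidden = enc.initHidden()
token = torch.LongTensor([3])          # one word id, seq_len = batch_size = 1
out, hidden = enc(token, hidden)
print(out.size(), hidden.size())       # torch.Size([1, 1, 256]) torch.Size([1, 1, 256])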
AttnDecoderRNN, version 1: no mask + training code
How the mask is applied:
Every sentence is padded to a fixed length. When computing attention, the decoder scores its state against every encoder output vector, and the scores at padded positions should not contribute, so a mask mechanism is used to suppress them.
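The idea in isolation (a hedged sketch; the masked decoder variant further below writes -10 into the padded positions before the softmax, and masked_fill with a large negative value is an equivalent formulation):
import torch
import torch.nn.functional as F

scores = torch.randn(1, 6)                    # raw attention scores over 6 encoder positions
mask = torch.tensor([[1, 1, 1, 1, 0, 0]])     # 1 = real token, 0 = padding
masked = scores.masked_fill(mask == 0, -1e9)  # padded positions get a very negative score
weights = F.softmax(masked, dim=1)            # so their attention weight is ~0
print(weights)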
# Version 1: no mask applied
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, max_length):
        super(AttnDecoderRNN, self).__init__()
        # hidden_size doubles as the embedding_dim, i.e. the word vector dimension
        self.hidden_size = hidden_size
        self.output_size = output_size  # target vocabulary size, passed in at construction
        self.max_length = max_length
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)  # embedding layer (not used in this variant: the decoder consumes the previous GRU output directly)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_2 = nn.Linear(self.max_length, 1)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)  # hidden_size = 256
    def forward(self, decode_input, encoder_outputs, hidden, mask):
        # hidden = encoder_hidden, shape torch.Size([1, 1, 256])
        # encoder_outputs: the encoded source sentence, shape (max_length, hidden_size)
        # mask: a 1-D vector of 1s (real tokens) and 0s (padding); unused in this version
        decode_input = decode_input.view(1, -1)  # torch.Size([1, 256])
        input_1 = decode_input.repeat(len(encoder_outputs), 1)
        # torch.cat concatenates two (max_length, 256) tensors into (max_length, 512)
        attn_weights = torch.tanh(self.attn(torch.cat([input_1, encoder_outputs], 1)))
        attn_weights = self.attn_2(attn_weights).view(1, -1)
        # index = list(mask).index(0)  # first padded position
        # attn_weights[0][index:] = int(-10)  # after F.softmax the padded positions would get near-zero probability
        attn_weights = F.softmax(attn_weights, dim=1)
        # attn_weights shape: torch.Size([1, max_length])
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))  # unsqueeze(0) adds a leading batch dimension
        # attn_applied.size(): torch.Size([1, 1, 256])
        output = torch.cat([decode_input, attn_applied[0]], 1)
        # output.size(): torch.Size([1, 512])
        output = self.attn_combine(output).unsqueeze(0)
        # output.size(): torch.Size([1, 1, 256])
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)  # hidden in: torch.Size([1, 1, 256])
        # output.size(): torch.Size([1, 1, 256]), hidden out: torch.Size([1, 1, 256])
        # output_last = self.out(output[0])  # alternative without log_softmax
        output_last = F.log_softmax(self.out(output[0]), dim=1)
        # output_last.size(): torch.Size([1, output_size])
        return output_last, output, hidden, attn_weights
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
hidden_size = 256
vocab_input_size = len(input_word_index) + 1
vocab_target_size = len(target_word_index) + 1
max_length = 38
learning_rate = 0.001
epoch = 1
encoder = EncoderRNN(vocab_input_size, hidden_size).to(device)  # instantiate the encoder
attentionRNN_decoder = AttnDecoderRNN(hidden_size, vocab_target_size, max_length).to(device)  # instantiate the decoder
# encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
# decoder_optimizer = optim.SGD(attentionRNN_decoder.parameters(), lr=learning_rate)
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(attentionRNN_decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss().to(device)  # together with log_softmax this is the cross-entropy loss
for epo in tqdm(range(epoch)):
    for setence_idx, input_setence in enumerate(input_chin_tensor_train):
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        loss = 0
        mask = torch.zeros(max_length).to(device)  # build the padding mask
        input_setence = torch.LongTensor(input_setence).to(device)
        encoder_outputs = torch.zeros(max_length, hidden_size).to(device)
        hidden = encoder.initHidden().to(device)  # initialize once per sentence so the GRU carries context across words
        for word_inx in range(len(input_setence)):
            encoder_output, hidden = encoder(input_setence[word_inx], hidden)  # encoder is the EncoderRNN model
            encoder_outputs[word_inx] = encoder_output[0][0]
            if input_setence[word_inx] != 0:
                mask[word_inx] = 1  # mark real (non-padding) positions with 1
        # ------- encoder pass finished -----------------------
        # print("encoder_outputs", encoder_outputs.size())  # torch.Size([38, 256])
        target_setence = torch.LongTensor(target_eng_tensor_train[setence_idx]).to(device)
        decoder_input = attentionRNN_decoder.initHidden().to(device)  # the first decoder input is a zero vector of shape [1, 256]
        for word_idx in range(len(target_setence)):
            decoder_output, decoder_input, hidden, attn_weights = attentionRNN_decoder(decoder_input, encoder_outputs, hidden, mask)
            loss += criterion(decoder_output, target_setence[word_idx].view(1))
        # if setence_idx % 100 == 0:  # print the loss every 100 sentences
        #     print(setence_idx, (loss / max_length).item())
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()
    # print("epoch", epo, "average loss:", loss_epoch / len(input_chin_tensor_train))  # (loss_epoch is not accumulated in this version)
    torch.save(encoder.state_dict(), str(epo) + 'nomaskencoder.pth')
    torch.save(attentionRNN_decoder.state_dict(), str(epo) + 'nomaskattentionRNN_decoder.pth')
AttnDecoderRNN, version 2: with mask + training code
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, max_length):
        super(AttnDecoderRNN, self).__init__()
        # hidden_size doubles as the embedding_dim, i.e. the word vector dimension
        self.hidden_size = hidden_size
        self.output_size = output_size  # target vocabulary size, passed in at construction
        self.max_length = max_length
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)  # same as above, not used in forward()
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_2 = nn.Linear(self.max_length, 1)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)  # hidden_size = 256
    def forward(self, decode_input, encoder_outputs, hidden, mask):
        # hidden = encoder_hidden, shape torch.Size([1, 1, 256])
        # encoder_outputs: the encoded source sentence, shape (max_length, hidden_size)
        # mask: a 1-D vector of 1s (real tokens) and 0s (padding)
        decode_input = decode_input.view(1, -1)  # torch.Size([1, 256])
        input_1 = decode_input.repeat(len(encoder_outputs), 1)
        # torch.cat concatenates two (max_length, 256) tensors into (max_length, 512)
        attn_weights = torch.tanh(self.attn(torch.cat([input_1, encoder_outputs], 1)))
        attn_weights = self.attn_2(attn_weights).view(1, -1)
        index = list(mask).index(0)  # first padded position (assumes post-padding and at least one padded slot)
        attn_weights[0][index:] = int(-10)  # after F.softmax the padded positions get near-zero probability
        attn_weights = F.softmax(attn_weights, dim=1)
        # attn_weights shape: torch.Size([1, max_length])
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))  # unsqueeze(0) adds a leading batch dimension
        # attn_applied.size(): torch.Size([1, 1, 256])
        output = torch.cat([decode_input, attn_applied[0]], 1)
        # output.size(): torch.Size([1, 512])
        output = self.attn_combine(output).unsqueeze(0)
        # output.size(): torch.Size([1, 1, 256])
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)  # hidden in: torch.Size([1, 1, 256])
        # output.size(): torch.Size([1, 1, 256]), hidden out: torch.Size([1, 1, 256])
        # output_last = self.out(output[0])  # alternative without log_softmax
        output_last = F.log_softmax(self.out(output[0]), dim=1)
        # output_last.size(): torch.Size([1, output_size])
        return output_last, output, hidden, attn_weights
# Version 2: with the mask
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
hidden_size = 256
vocab_input_size = len(input_word_index) + 1
vocab_target_size = len(target_word_index) + 1
max_length = 38
learning_rate = 0.001
epoch = 1
encoder = EncoderRNN(vocab_input_size, hidden_size).to(device)  # instantiate the encoder
attentionRNN_decoder = AttnDecoderRNN(hidden_size, vocab_target_size, max_length).to(device)  # instantiate the decoder
# encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
# decoder_optimizer = optim.SGD(attentionRNN_decoder.parameters(), lr=learning_rate)
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(attentionRNN_decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss().to(device)  # together with log_softmax this is the cross-entropy loss
for epo in range(epoch):
    for setence_idx, input_setence in tqdm(enumerate(input_chin_tensor_train)):
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        loss = 0
        mask = torch.zeros(max_length).to(device)  # build the padding mask
        input_setence = torch.LongTensor(input_setence).to(device)
        encoder_outputs = torch.zeros(max_length, hidden_size).to(device)
        hidden = encoder.initHidden().to(device)  # initialize once per sentence so the GRU carries context across words
        for word_inx in range(len(input_setence)):
            encoder_output, hidden = encoder(input_setence[word_inx], hidden)  # encoder is the EncoderRNN model
            encoder_outputs[word_inx] = encoder_output[0][0]
            if input_setence[word_inx] != 0:
                mask[word_inx] = 1  # mark real (non-padding) positions with 1
        # ------- encoder pass finished -----------------------
        # print("encoder_outputs", encoder_outputs.size())  # torch.Size([38, 256])
        target_setence = torch.LongTensor(target_eng_tensor_train[setence_idx]).to(device)
        decoder_input = attentionRNN_decoder.initHidden().to(device)  # the first decoder input is a zero vector of shape [1, 256]
        for word_idx in range(len(target_setence)):
            decoder_output, decoder_input, hidden, attn_weights = attentionRNN_decoder(decoder_input, encoder_outputs, hidden, mask)
            loss += criterion(decoder_output, target_setence[word_idx].view(1))
        if setence_idx % 100 == 0:  # print the loss every 100 sentences
            print(setence_idx, (loss / max_length).item())
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()
    # print("epoch", epo, "average loss:", loss_epoch / len(input_chin_tensor_train))  # (loss_epoch is not accumulated in this version)
    torch.save(encoder.state_dict(), str(epo) + 'encoder.pth')
    torch.save(attentionRNN_decoder.state_dict(), str(epo) + 'attentionRNN_decoder.pth')
Evaluation:
encoder = EncoderRNN(vocab_input_size, hidden_size).to(device)  # instantiate the models
encoder.load_state_dict(torch.load('0encoder.pth'))  # the training loop above saves str(epo)+'encoder.pth'; with epoch=1 that is '0encoder.pth'
attentionRNN_decoder = AttnDecoderRNN(hidden_size, vocab_target_size, max_length).to(device)
attentionRNN_decoder.load_state_dict(torch.load('0attentionRNN_decoder.pth'))
def data_val(input_chin_tensor_val, target_eng_tensor_val):
    with torch.no_grad():
        for setence_idx, input_setence in tqdm(enumerate(input_chin_tensor_val)):
            loss = 0
            input_setence = torch.LongTensor(input_setence).to(device)
            encoder_outputs = torch.zeros(max_length, hidden_size).to(device)
            mask = torch.zeros(max_length).to(device)  # rebuild the padding mask, exactly as in training
            hidden = encoder.initHidden().to(device)  # initialize once per sentence
            for word_inx in range(len(input_setence)):
                encoder_output, hidden = encoder(input_setence[word_inx], hidden)  # encoder is the EncoderRNN model
                encoder_outputs[word_inx] = encoder_output[0][0]
                if input_setence[word_inx] != 0:
                    mask[word_inx] = 1
            target_setence = torch.LongTensor(target_eng_tensor_val[setence_idx]).to(device)
            decoder_input = attentionRNN_decoder.initHidden().to(device)  # same zero start vector as in training
            shuchu_juzi = []  # predicted word ids for this sentence
            for word_idx in range(len(target_setence)):
                decoder_output, decoder_input, hidden, attn_weights = attentionRNN_decoder(decoder_input, encoder_outputs, hidden, mask)
                output_word = np.argmax(decoder_output.data.cpu().numpy())
                shuchu_juzi.append(output_word)
            print(shuchu_juzi)  # ids; map them back to words with target_index_word
            # loss += criterion(decoder_output.to(device), torch.LongTensor([target_setence[word_idx]]).to(device))
            # print("loss", loss / 20)
data_val(input_chin_tensor_val, target_eng_tensor_val)
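sentence_bleu is imported at the top but never called; a hedged sketch of how it could score one decoded sentence against its reference once both are token lists (the tokens below are illustrative):
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

reference = ['<start>', 'i', 'am', 'a', 'student', '.', '<end>']
candidate = ['<start>', 'i', 'am', 'student', '.', '<end>']
# smoothing avoids zero scores when short sentences miss some n-gram orders
score = sentence_bleu([reference], candidate, smoothing_function=SmoothingFunction().method1)
print(score)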
Result check:
Standalone test of the attention decoder (RNN + attention):
# Only the imports this standalone test actually needs
from __future__ import unicode_literals, print_function, division
import torch
import torch.nn as nn
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, max_length):
        super(AttnDecoderRNN, self).__init__()
        # hidden_size doubles as the embedding_dim, i.e. the word vector dimension
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.max_length = max_length
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)  # same embedding layer as above (unused here)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_2 = nn.Linear(self.max_length, 1)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)  # hidden_size = 256
    def forward(self, decode_input, encoder_outputs, hidden):
        decode_input = decode_input.view(1, -1)  # torch.Size([1, 256])
        input_1 = decode_input.repeat(len(encoder_outputs), 1)
        # torch.cat concatenates two (10, 256) tensors into (10, 512)
        attn_weights = torch.tanh(self.attn(torch.cat([input_1, encoder_outputs], 1)))
        attn_weights = F.softmax(self.attn_2(attn_weights), dim=0).view(1, -1)
        # attn_weights shape: torch.Size([1, 10])
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))  # unsqueeze(0) adds a leading batch dimension
        # torch.bmm multiplies batched matrices:
        # attn_applied = torch.bmm([1, 1, 10], [1, 10, 256]) -> torch.Size([1, 1, 256])
        output = torch.cat([decode_input, attn_applied[0]], 1)
        # output.size(): torch.Size([1, 512])
        output = self.attn_combine(output).unsqueeze(0)
        # output.size(): torch.Size([1, 1, 256])
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)  # hidden in: torch.Size([1, 1, 256])
        # output.size(): torch.Size([1, 1, 256]), hidden out: torch.Size([1, 1, 256])
        output_last = F.log_softmax(self.out(output[0]), dim=1)
        # output_last.size(): torch.Size([1, 6756])
        return output_last, output, hidden, attn_weights
attentionRNN_decoder = AttnDecoderRNN(256, 6756, 10)
decode_input = torch.randn(1, 256)        # fake previous decoder output
encoder_outputs = torch.randn(10, 256)    # fake encoder outputs for 10 positions
hidden = torch.randn(1, 1, 256)           # fake hidden state
output_last, output, hidden, attn_weights = attentionRNN_decoder(decode_input, encoder_outputs, hidden)
print(output_last.size())     # torch.Size([1, 6756])
print(output.size())          # torch.Size([1, 1, 256])
print(hidden.size())          # torch.Size([1, 1, 256])
print(attn_weights.size())    # torch.Size([1, 10])