This walkthrough uses the IWSLT TED talk en_zh dataset and follows the experiments in the book 《TensorFlow 实战 Google 深度学习框架》(2nd edition). Most of the code is taken directly from the book; I am writing it up as a blog post to consolidate what I learned.

First, download the dataset from the IWSLT site and extract it:

wget https://wit3.fbk.eu/archive/2015-01//texts/en/zh/en-zh.tgz
tar xzvf en-zh.tgz
cd en-zh/

After extraction the directory contains the files listed below. We only need the two text files train.tags.en-zh.en and train.tags.en-zh.zh, but both still contain HTML-style tags, so they need some preprocessing first (a quick check of how many lines this affects is sketched after the listing):

IWSLT15.TED.dev2010.en-zh.en.xml  IWSLT15.TED.tst2011.en-zh.en.xml  IWSLT15.TED.tst2013.en-zh.en.xml  train.tags.en-zh.zh
IWSLT15.TED.dev2010.en-zh.zh.xml  IWSLT15.TED.tst2011.en-zh.zh.xml  IWSLT15.TED.tst2013.en-zh.zh.xml  train.zh
IWSLT15.TED.tst2010.en-zh.en.xml  IWSLT15.TED.tst2012.en-zh.en.xml  README
IWSLT15.TED.tst2010.en-zh.zh.xml  IWSLT15.TED.tst2012.en-zh.zh.xml  train.tags.en-zh.en
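
As a quick sanity check (this snippet is my own addition, not from the book), you can count how many lines of the raw file contain tag-like markup; the rule mirrors the filter used in the preprocessing script further down:

# Count lines of the raw file that contain tag-like markup; the
# preprocessing step below will drop exactly these lines.
with open("train.tags.en-zh.en") as f:
    lines = f.readlines()
tagged = sum(1 for line in lines if '<' in line or '>' in line)
print(len(lines), tagged)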

Processing the raw data involves tokenization, vocabulary building, and converting words to numeric IDs; when the data is later fed to the model it also has to be padded. I use stanfordcorenlp as the tokenizer for both English and Chinese; background on the tool can be found in a separate blog post. A tiny example of its tokenized output is shown first, followed by the full preprocessing code:
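
A minimal sketch of what stanfordcorenlp returns (the CoreNLP path below is the one used in the script; point it at wherever you unpacked the package):

from stanfordcorenlp import StanfordCoreNLP

# The path must point at the unpacked CoreNLP package; adjust as needed.
nlp_en = StanfordCoreNLP("../stanford-corenlp-full-2018-10-05", lang='en')
print(nlp_en.word_tokenize("It is very beautiful!"))  # ['It', 'is', 'very', 'beautiful', '!']
nlp_en.close()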

#coding:utf-8
import collections
from operator import itemgetter
from stanfordcorenlp import StanfordCoreNLP
import tqdm

# Step 1: get the English and Chinese files into a one-sentence-per-line
# format, dropping lines that contain HTML tags.
def deletehtml(filename1, filename2):
    f1 = open(filename1, 'r')
    f2 = open(filename2, 'r')

    data1 = f1.readlines()
    data2 = f2.readlines()
    assert len(data1) == len(data2)  # opening the files with codecs caused an error here, not sure why
    fw1 = open(filename1 + ".deletehtml", 'w')
    fw2 = open(filename2 + ".deletehtml", 'w')

    print("deletehtml...")

    for line1, line2 in tqdm.tqdm(zip(data1, data2)):
        line1 = line1.strip()
        line2 = line2.strip()
        if line1 and line2:
            if '<' not in line1 and '>' not in line1 and '<' not in line2 and '>' not in line2:
                fw1.write(line1 + "\n")
                fw2.write(line2 + "\n")
    fw1.close()
    f1.close()
    fw2.close()
    f2.close()

    return filename1 + ".deletehtml", filename2 + ".deletehtml"

# Step 2: tokenize and build the vocabulary
def segement_sentence(filename, vocab_size, lang='en'):
    nlp = StanfordCoreNLP("../stanford-corenlp-full-2018-10-05", lang=lang)
    with open(filename, 'r') as f:
        data = f.readlines()
    counter = collections.Counter()
    f1 = open(filename + ".segment", 'w')
    print("segmenting...")
    for line in tqdm.tqdm(data):
        line = line.strip()
        word_list = nlp.word_tokenize(line.strip())
        sentence = ' '.join(word_list)
        f1.write(sentence + "\n")
        for word in word_list:
            counter[word] += 1
    f1.close()
    nlp.close()

    sorted_word_to_cnt = sorted(counter.items(), key=itemgetter(1), reverse=True)
    sorted_words = ["<unk>", "<sos>", "<eos>"] + [x[0] for x in sorted_word_to_cnt]

    if len(sorted_words) > vocab_size:
        sorted_words = sorted_words[:vocab_size]
    assert len(sorted_words) <= vocab_size
    with open(filename + ".vocab", 'w') as fw:
        for word in sorted_words:
            fw.write(word + "\n")
    return filename + ".segment"

# Step 3: convert the text into numeric IDs
def convert_to_id(filename, vocab_file):
    with open(vocab_file, "r") as f:
        data = f.readlines()
    vocab = [w.strip() for w in data]
    word_to_id = {k: v for (k, v) in zip(vocab, range(len(vocab)))}

    with open(filename, "r") as f:
        data = f.readlines()
    f1 = open(filename + ".id", 'w')
    print("converting...")
    for line in tqdm.tqdm(data):
        words = line.strip().split() + ["<eos>"]
        ids = ' '.join([str(word_to_id[word])
                        if word in word_to_id else str(word_to_id["<unk>"])
                        for word in words])
        f1.write(ids + "\n")
    f1.close()
    return filename + ".id"

def main():
    src = "train.tags.en-zh.en"  # still contains HTML tags
    trg = "train.tags.en-zh.zh"  # same
    src_vocab_size = 10000
    trg_vocab_size = 4000

    src1, trg1 = deletehtml(src, trg)

    src2 = segement_sentence(src1, src_vocab_size, lang='en')
    trg2 = segement_sentence(trg1, trg_vocab_size, lang='zh')

    src3 = convert_to_id(src + ".deletehtml.segment", src + ".deletehtml.vocab")
    trg3 = convert_to_id(trg + ".deletehtml.segment", trg + ".deletehtml.vocab")

if __name__ == '__main__':
    main()
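
Before moving on, it is worth checking that the generated files line up. The following sketch is my own addition (it assumes the script above has been run in the current directory and uses the file names it produces); it reads the first tokenized sentence, its ID form, and the vocabulary, and maps the IDs back to words:

# Read the first tokenized English sentence, its ID-converted form, and the vocabulary.
with open("train.tags.en-zh.en.deletehtml.segment") as f:
    first_sentence = f.readline().strip()
with open("train.tags.en-zh.en.deletehtml.segment.id") as f:
    first_ids = f.readline().strip().split()
with open("train.tags.en-zh.en.deletehtml.vocab") as f:
    vocab = [w.strip() for w in f]

print(first_sentence)
print(first_ids)  # same sentence as IDs, ending with the <eos> ID (2)
# Mapping the IDs back should reproduce the sentence, with rare words replaced by <unk>.
print(' '.join(vocab[int(i)] for i in first_ids))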

After running the preprocessing code we have one sentence per line, with every sentence converted to IDs and terminated by an <eos> token. Two more steps are needed before training: ① sentences have different lengths, so the shorter sentences in a batch must be padded to the length of the longest one for the encoder to process the batch; ② the decoder input has to start with <sos>, so the target sequence x y z <eos> must also be turned into the decoder input <sos> x y z. A small toy illustration of these two steps follows, and then the complete training code:
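
The sketch below is not from the book; the two toy ID sequences are made up, but the <sos> shift (①) and the padded_batch call (②) are the same operations used in the training code:

import tensorflow as tf

SOS_ID = 1
# Two toy target sentences as ID strings, already ending with the <eos> ID (2).
toy = tf.data.Dataset.from_tensor_slices(["4 5 6 2", "7 8 2"])
toy = toy.map(lambda s: tf.string_to_number(tf.string_split([s]).values, tf.int32))
# ① build the decoder input by prepending <sos> and dropping the last token
toy = toy.map(lambda trg_label: (tf.concat([[SOS_ID], trg_label[:-1]], axis=0), trg_label))
# ② pad every sequence in the batch to the length of the longest one
toy = toy.padded_batch(2, (tf.TensorShape([None]), tf.TensorShape([None])))

with tf.Session() as sess:
    trg_input, trg_label = sess.run(toy.make_one_shot_iterator().get_next())
    print(trg_input)   # [[1 4 5 6], [1 7 8 0]]
    print(trg_label)   # [[4 5 6 2], [7 8 2 0]]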

#coding:utf-8
import tensorflow as tf

MAX_LEN = 50
SOS_ID = 1

SRC_TRAIN_DATA = "../train.tags.en-zh.en.deletehtml.segment.id"
TRG_TRAIN_DATA = "../train.tags.en-zh.zh.deletehtml.segment.id"
CHECKPOINT_PATH = "./seq2seq_ckpt"

HIDDEN_SIZE = 1024
NUM_LAYERS = 2
SRC_VOCAB_SIZE = 10000
TRG_VOCAB_SIZE = 4000
BATCH_SIZE = 100
NUM_EPOCH = 5
KEEP_PROB = 0.8
MAX_GRAD_NORM = 5
SHARE_EMB_AND_SOFTMAX = True

class NMTModel(object):
    def __init__(self):
        # two-layer LSTM cells for the encoder and the decoder
        self.enc_cell = tf.nn.rnn_cell.MultiRNNCell(
            [tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE) for _ in range(NUM_LAYERS)])
        self.dec_cell = tf.nn.rnn_cell.MultiRNNCell(
            [tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE) for _ in range(NUM_LAYERS)])

        self.src_embedding = tf.get_variable(
            "src_emb", [SRC_VOCAB_SIZE, HIDDEN_SIZE])
        self.trg_embedding = tf.get_variable(
            "trg_emb", [TRG_VOCAB_SIZE, HIDDEN_SIZE])

        # optionally share the target embedding with the softmax weights
        if SHARE_EMB_AND_SOFTMAX:
            self.softmax_weight = tf.transpose(self.trg_embedding)
        else:
            self.softmax_weight = tf.get_variable("weight", [HIDDEN_SIZE, TRG_VOCAB_SIZE])
        self.softmax_bias = tf.get_variable("softmax_bias", [TRG_VOCAB_SIZE])

    def forward(self, src_input, src_size, trg_input, trg_label, trg_size):
        batch_size = tf.shape(src_input)[0]
        src_emb = tf.nn.embedding_lookup(self.src_embedding, src_input)
        trg_emb = tf.nn.embedding_lookup(self.trg_embedding, trg_input)

        src_emb = tf.nn.dropout(src_emb, KEEP_PROB)
        trg_emb = tf.nn.dropout(trg_emb, KEEP_PROB)

        with tf.variable_scope("encoder"):
            enc_outputs, enc_state = tf.nn.dynamic_rnn(
                self.enc_cell, src_emb, src_size, dtype=tf.float32)

        # the decoder starts from the encoder's final state
        with tf.variable_scope("decoder"):
            dec_outputs, _ = tf.nn.dynamic_rnn(
                self.dec_cell, trg_emb, trg_size, initial_state=enc_state)

        output = tf.reshape(dec_outputs, [-1, HIDDEN_SIZE])
        logits = tf.matmul(output, self.softmax_weight) + self.softmax_bias
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=tf.reshape(trg_label, [-1]), logits=logits)

        # mask out the padded positions when summing the loss
        label_weights = tf.sequence_mask(trg_size, maxlen=tf.shape(trg_label)[1], dtype=tf.float32)
        label_weights = tf.reshape(label_weights, [-1])

        cost = tf.reduce_sum(loss * label_weights)
        cost_per_token = cost / tf.reduce_sum(label_weights)

        trainable_variables = tf.trainable_variables()

        grads = tf.gradients(cost / tf.to_float(batch_size), trainable_variables)
        grads, _ = tf.clip_by_global_norm(grads, MAX_GRAD_NORM)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
        train_op = optimizer.apply_gradients(zip(grads, trainable_variables))

        return cost_per_token, train_op

def run_epoch(session, cost_op, train_op, saver, step):
    while True:
        try:
            cost, _ = session.run([cost_op, train_op])
            if step % 10 == 0:
                print("steps %d, per token cost is %.3f" % (step, cost))
            if step % 200 == 0:
                saver.save(session, CHECKPOINT_PATH, global_step=step)
            step += 1
        except tf.errors.OutOfRangeError:
            break
    return step


def MakeDataset(file_path):
    dataset = tf.data.TextLineDataset(file_path)
    dataset = dataset.map(lambda string: tf.string_split([string]).values)
    dataset = dataset.map(lambda string: tf.string_to_number(string, tf.int32))
    # keep each sentence together with its length
    dataset = dataset.map(lambda x: (x, tf.size(x)))
    return dataset

def MakeSrcTrgDataset(src_path, trg_path, batch_size):
    src_data = MakeDataset(src_path)
    trg_data = MakeDataset(trg_path)

    dataset = tf.data.Dataset.zip((src_data, trg_data))

    # drop empty sentences and sentences longer than MAX_LEN
    def FilterLength(src_tuple, trg_tuple):
        ((src_input, src_len), (trg_label, trg_len)) = (src_tuple, trg_tuple)
        src_len_ok = tf.logical_and(tf.greater(src_len, 1), tf.less_equal(src_len, MAX_LEN))
        trg_len_ok = tf.logical_and(tf.greater(trg_len, 1), tf.less_equal(trg_len, MAX_LEN))
        return tf.logical_and(src_len_ok, trg_len_ok)
    dataset = dataset.filter(FilterLength)

    # build the decoder input "<sos> x y z" from the label "x y z <eos>"
    def MakeTrgInput(src_tuple, trg_tuple):
        ((src_input, src_len), (trg_label, trg_len)) = (src_tuple, trg_tuple)
        trg_input = tf.concat([[SOS_ID], trg_label[:-1]], axis=0)
        return ((src_input, src_len), (trg_input, trg_label, trg_len))
    dataset = dataset.map(MakeTrgInput)
    dataset = dataset.shuffle(10000)

    # pad the variable-length sentences within each batch
    padded_shapes = (
        (tf.TensorShape([None]),
         tf.TensorShape([])),
        (tf.TensorShape([None]),
         tf.TensorShape([None]),
         tf.TensorShape([])))
    batched_dataset = dataset.padded_batch(batch_size, padded_shapes)
    return batched_dataset


def main():
    initializer = tf.random_uniform_initializer(-0.05, 0.05)
    with tf.variable_scope("nmt_model", reuse=None, initializer=initializer):
        train_model = NMTModel()

    data = MakeSrcTrgDataset(SRC_TRAIN_DATA, TRG_TRAIN_DATA, BATCH_SIZE)
    iterator = data.make_initializable_iterator()
    (src, src_size), (trg_input, trg_label, trg_size) = iterator.get_next()

    cost_op, train_op = train_model.forward(src, src_size, trg_input, trg_label, trg_size)
    saver = tf.train.Saver()
    step = 0

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7, allow_growth=True)
    session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    with session as sess:
        tf.global_variables_initializer().run()
        for i in range(NUM_EPOCH):
            print("In iteration: %d" % (i + 1))
            sess.run(iterator.initializer)
            step = run_epoch(sess, cost_op, train_op, saver, step)

if __name__ == '__main__':
    main()
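
The inference script below hard-codes a specific checkpoint (seq2seq_ckpt-9000). If you would rather load whichever checkpoint the training run wrote last, a small sketch (assuming the checkpoints were saved in the current directory, as with CHECKPOINT_PATH = "./seq2seq_ckpt"):

import tensorflow as tf

# tf.train.latest_checkpoint reads the "checkpoint" file maintained by
# tf.train.Saver and returns the path of the most recent checkpoint.
latest = tf.train.latest_checkpoint(".")
print(latest)  # e.g. ./seq2seq_ckpt-9000, which can then be passed to saver.restore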

When running inference on a new sentence, the number of decoding steps is not known in advance and has to be controlled by a stopping condition, which is what tf.while_loop is for. A toy example of tf.while_loop comes first, followed by the full inference code:
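
This toy example is unrelated to the model itself; it only shows the cond/body/loop_vars pattern that the decoding loop below relies on: cond and body both receive the loop variables, and the loop keeps calling body while cond returns True.

import tensorflow as tf

# Keep doubling x until it reaches at least 100, counting the steps taken.
def cond(x, step):
    return tf.less(x, 100)

def body(x, step):
    return x * 2, step + 1

final_x, num_steps = tf.while_loop(cond, body, (tf.constant(1), tf.constant(0)))

with tf.Session() as sess:
    print(sess.run([final_x, num_steps]))  # [128, 7]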

#coding:utf-8
import tensorflow as tf


CHECKPOINT_PATH = "./seq2seq_ckpt-9000"

HIDDEN_SIZE = 1024
NUM_LAYERS = 2
SRC_VOCAB_SIZE = 10000
TRG_VOCAB_SIZE = 4000
BATCH_SIZE = 100
SHARE_EMB_AND_SOFTMAX = True
SOS_ID = 1
EOS_ID = 2

class NMTModel(object):
    def __init__(self):
        self.enc_cell = tf.nn.rnn_cell.MultiRNNCell(
            [tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE) for _ in range(NUM_LAYERS)])
        self.dec_cell = tf.nn.rnn_cell.MultiRNNCell(
            [tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE) for _ in range(NUM_LAYERS)])

        self.src_embedding = tf.get_variable(
            "src_emb", [SRC_VOCAB_SIZE, HIDDEN_SIZE])
        self.trg_embedding = tf.get_variable(
            "trg_emb", [TRG_VOCAB_SIZE, HIDDEN_SIZE])

        if SHARE_EMB_AND_SOFTMAX:
            self.softmax_weight = tf.transpose(self.trg_embedding)
        else:
            self.softmax_weight = tf.get_variable("weight", [HIDDEN_SIZE, TRG_VOCAB_SIZE])
        self.softmax_bias = tf.get_variable("softmax_bias", [TRG_VOCAB_SIZE])

    def inference(self, src_input):
        # the input is a single sentence, so wrap it into a batch of size 1
        src_size = tf.convert_to_tensor([len(src_input)], dtype=tf.int32)
        src_input = tf.convert_to_tensor([src_input], dtype=tf.int32)
        src_emb = tf.nn.embedding_lookup(self.src_embedding, src_input)

        with tf.variable_scope("encoder"):
            enc_outputs, enc_state = tf.nn.dynamic_rnn(
                self.enc_cell, src_emb, src_size, dtype=tf.float32)
        # upper bound on the number of decoding steps
        MAX_DEC_LEN = 100

        # use the same scope name as the training graph so the variables match
        with tf.variable_scope("decoder/rnn/multi_rnn_cell"):
            # a dynamically sized TensorArray holds the generated word IDs
            init_array = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True, clear_after_read=False)
            init_array = init_array.write(0, SOS_ID)

            init_loop_var = (enc_state, init_array, 0)

            # keep decoding until <eos> is produced or MAX_DEC_LEN is reached
            def continue_loop_condition(state, trg_ids, step):
                return tf.reduce_all(tf.logical_and(
                    tf.not_equal(trg_ids.read(step), EOS_ID),
                    tf.less(step, MAX_DEC_LEN - 1)))

            def loop_body(state, trg_ids, step):
                # feed the previously generated word back into the decoder
                trg_input = [trg_ids.read(step)]
                trg_emb = tf.nn.embedding_lookup(self.trg_embedding, trg_input)

                dec_outputs, next_state = self.dec_cell.call(state=state, inputs=trg_emb)
                output = tf.reshape(dec_outputs, [-1, HIDDEN_SIZE])
                logits = (tf.matmul(output, self.softmax_weight) + self.softmax_bias)
                # greedy decoding: take the most likely next word
                next_id = tf.argmax(logits, axis=1, output_type=tf.int32)

                trg_ids = trg_ids.write(step + 1, next_id[0])
                return next_state, trg_ids, step + 1

            state, trg_ids, step = tf.while_loop(
                continue_loop_condition, loop_body, init_loop_var)
            return trg_ids.stack()

def main():
    from stanfordcorenlp import StanfordCoreNLP
    nlp = StanfordCoreNLP("../../stanford-corenlp-full-2018-10-05", lang='en')
    with tf.variable_scope("nmt_model", reuse=None):
        model = NMTModel()
    vocab_file = "../train.tags.en-zh.en.deletehtml.vocab"
    sentence = "It is very beautiful!"
    with open(vocab_file, 'r') as f:
        data = f.readlines()
    words = [w.strip() for w in data]
    word_to_id = {k: v for (k, v) in zip(words, range(len(words)))}
    # tokenize the test sentence and convert it to source-vocabulary IDs
    wordlist = nlp.word_tokenize(sentence.strip()) + ["<eos>"]
    # print(wordlist)
    idlist = [str(word_to_id[w]) if w in word_to_id else str(word_to_id["<unk>"]) for w in wordlist]
    idlist = [int(i) for i in idlist]
    # print(idlist)

    output_op = model.inference(idlist)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7, allow_growth=True)
    session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    saver = tf.train.Saver()
    saver.restore(session, CHECKPOINT_PATH)

    output = session.run(output_op)

    # map the generated IDs back to Chinese words with the target vocabulary
    vocab_file2 = "../train.tags.en-zh.zh.deletehtml.vocab"
    with open(vocab_file2, 'r') as f2:
        data2 = f2.readlines()
    words = [w.strip() for w in data2]
    id_to_word = {k: v for (k, v) in zip(range(len(words)), words)}
    print([id_to_word[i] for i in output])
    session.close()

    nlp.close()

if __name__ == '__main__':
    main()

The prediction output is:

['<sos>', '这', '是', '非常', '美丽', '的', '!', '<eos>']

To keep the post from getting too long, I have stuck to the essentials; for the underlying theory please refer to the book mentioned at the beginning, and feel free to leave questions in the comments. My environment is python3.6.5 + tensorflow-gpu==1.12 + cuda9.0.