The dataset is the Complete Tang Poems (全唐诗). Download link: https://pan.baidu.com/s/13pNWfffr5HSN79WNb3Y0_w (extraction code: koss)
Unlike traditional neural networks, whose inputs and outputs have fixed sizes, an RNN lets us feed in and produce sequences of vectors; RNNs were created precisely to model sequence data. The code in this post is ported from char-rnn, a Torch-based model for English text that needs only minor changes to work on Chinese. char-rnn takes a text file as input, trains an RNN model on it, and then uses the model to generate text similar to the training data.
The code below has been modified to run on TensorFlow 1.4 and a GPU platform.
#coding=utf-8
import collections
import numpy as np
import tensorflow as tf
import io
import sys
import os

# Python 2 only: switch the default string encoding to utf-8 so the
# Chinese corpus can be handled without explicit str/unicode conversions
reload(sys)
sys.setdefaultencoding('utf-8')

#------------------------------- Data preprocessing ---------------------------#

poetry_file = 'poetry.txt'

# Poem collection
poetrys = []
# Python 2's built-in open() has no encoding parameter; io.open() does
with io.open(poetry_file, "r", encoding='utf-8') as f:
    for line in f:
        # print line
        try:
            title, content = line.strip().split(':')
            content = content.replace(' ', '')
            # Skip poems containing markup or bracket characters
            if '_' in content or '(' in content or '(' in content or '《' in content or '[' in content:
                continue
            # Skip poems that are too short or too long
            if len(content) < 5 or len(content) > 79:
                continue
            # '[' and ']' mark the start and end of a poem
            content = '[' + content + ']'
            poetrys.append(content)
        except Exception as e:
            pass

# Sort the poems by length
poetrys = sorted(poetrys, key=lambda line: len(line))
print(u"唐诗总数: ")
print(len(poetrys))
print(u"测试")

# Count how often each character occurs
all_words = []
for poetry in poetrys:
    all_words += [word for word in poetry]
counter = collections.Counter(all_words)
count_pairs = sorted(counter.items(), key=lambda x: -x[1])
words, _ = zip(*count_pairs)

# Keep the most frequent characters (here: all of them), plus a space used for padding
words = words[:len(words)] + (' ',)
# Map each character to an integer ID
word_num_map = dict(zip(words, range(len(words))))
# Convert every poem into a vector of IDs (see TensorFlow exercise 1)
to_num = lambda word: word_num_map.get(word, len(words))
poetrys_vector = [list(map(to_num, poetry)) for poetry in poetrys]
# [[314, 3199, 367, 1556, 26, 179, 680, 0, 3199, 41, 506, 40, 151, 4, 98, 1],
#  [339, 3, 133, 31, 302, 653, 512, 0, 37, 148, 294, 25, 54, 833, 3, 1, 965, 1315, 377, 1700, 562, 21, 37, 0, 2, 1253, 21, 36, 264, 877, 809, 1],
#  ...]

# Train on 64 poems per batch
batch_size = 64
n_chunk = len(poetrys_vector) // batch_size
x_batches = []
y_batches = []
for i in range(n_chunk):
    start_index = i * batch_size
    end_index = start_index + batch_size

    batches = poetrys_vector[start_index:end_index]
    length = max(map(len, batches))
    # Pad every poem in the batch to the same length with the space ID
    xdata = np.full((batch_size, length), word_num_map[' '], np.int32)
    for row in range(batch_size):
        xdata[row, :len(batches[row])] = batches[row]
    # Targets are the inputs shifted left by one character
    ydata = np.copy(xdata)
    ydata[:, :-1] = xdata[:, 1:]
    """
    xdata             ydata
    [6,2,4,6,9]       [2,4,6,9,9]
    [1,4,2,8,5]       [4,2,8,5,5]
    """
    x_batches.append(xdata)
    y_batches.append(ydata)

#--------------------------------------- RNN --------------------------------------#

input_data = tf.placeholder(tf.int32, [batch_size, None])
output_targets = tf.placeholder(tf.int32, [batch_size, None])

# Define the RNN
def neural_network(model='lstm', rnn_size=128, num_layers=2):
    if model == 'rnn':
        cell_fun = tf.nn.rnn_cell.BasicRNNCell
    elif model == 'gru':
        cell_fun = tf.nn.rnn_cell.GRUCell
    elif model == 'lstm':
        cell_fun = tf.nn.rnn_cell.BasicLSTMCell

    cell = cell_fun(rnn_size, state_is_tuple=True)
    cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=True)

    initial_state = cell.zero_state(batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w", [rnn_size, len(words)+1])
        softmax_b = tf.get_variable("softmax_b", [len(words)+1])
        with tf.device("/gpu:0"):
            embedding = tf.get_variable("embedding", [len(words)+1, rnn_size])
            inputs = tf.nn.embedding_lookup(embedding, input_data)

    outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, initial_state=initial_state, scope='rnnlm')
    output = tf.reshape(outputs, [-1, rnn_size])

    logits = tf.matmul(output, softmax_w) + softmax_b
    probs = tf.nn.softmax(logits)
    return logits, last_state, probs, cell, initial_state

# saver.save() needs the checkpoint's parent directory to exist, so create it up front
ckpt_dir = "./ckpt_dir"
if not os.path.exists(ckpt_dir):
    os.makedirs(ckpt_dir)

# Training
def train_neural_network():
    logits, last_state, _, _, _ = neural_network()
    targets = tf.reshape(output_targets, [-1])
    loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [targets], [tf.ones_like(targets, dtype=tf.float32)], len(words))
    cost = tf.reduce_mean(loss)
    learning_rate = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 5)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train_op = optimizer.apply_gradients(zip(grads, tvars))

    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())

        saver = tf.train.Saver(tf.all_variables())

        for epoch in range(295):
            # Decay the learning rate exponentially with the epoch number
            sess.run(tf.assign(learning_rate, 0.002 * (0.97 ** epoch)))
            n = 0
            for batche in range(n_chunk):
                train_loss, _, _ = sess.run([cost, last_state, train_op], feed_dict={input_data: x_batches[n], output_targets: y_batches[n]})
                n += 1
                print(epoch, batche, train_loss)
            # Save a checkpoint every 7 epochs
            if epoch % 7 == 0:
                saver.save(sess, ckpt_dir + '/poetry.module', global_step=epoch)

train_neural_network()
Here I will only discuss my own thoughts on debugging and tuning; for a detailed understanding of the code itself, please contact the original author.
First is the #coding=utf-8 issue. This line tells the Python interpreter that the script file itself is encoded in UTF-8; without it, the default ASCII environment will almost certainly report encoding errors.
Next is the UTF-8 encoding of the dataset. The file is opened with the encoding='utf-8' option, but the Python environment is never told that its default string encoding should be UTF-8, so parsing every title and content raises an error and the processed dataset ends up with size 0. Setting sys's default encoding solves this.
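To make both fixes concrete, here is a minimal, self-contained Python 2 sketch (the string literals are illustrative, not from the corpus):

#coding=utf-8
# The line above declares this file's own encoding, so the interpreter
# accepts the Chinese literals below instead of raising a SyntaxError.
import sys

# Python 2 only: site.py removes setdefaultencoding() at startup, so sys
# must be reloaded before the default codec can be switched to utf-8.
reload(sys)
sys.setdefaultencoding('utf-8')

# With the ASCII default, mixing a byte string and a unicode string forces
# an implicit decode and raises UnicodeDecodeError; with utf-8 it succeeds.
line = '春眠不觉晓' + u',处处闻啼鸟'
print(line)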
Also note that Python 2's built-in open function has no encoding parameter; that option belongs to io.open, so this call has to be changed.
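A short sketch of the difference under Python 2, reading the same poetry.txt as the scripts (codecs.open is an older standard-library equivalent, not used in the scripts):

import io
import codecs

# Python 2's built-in open() accepts no encoding argument:
#   open('poetry.txt', 'r', encoding='utf-8')  ->  TypeError
# io.open() matches Python 3's open() and yields already-decoded lines:
with io.open('poetry.txt', 'r', encoding='utf-8') as f:
    first_line = f.readline()   # a unicode object

# codecs.open() has the same effect here:
with codecs.open('poetry.txt', 'r', encoding='utf-8') as f:
    first_line = f.readline()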
Finally, there are a few API usage issues; for example, saver.save now requires the checkpoint's parent directory to already exist.
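As a minimal sketch, this is the guard the training script above adds before any save (same ckpt_dir and checkpoint prefix as in the script):

import os

ckpt_dir = './ckpt_dir'
# saver.save() does not create missing directories itself; if the parent
# directory of the checkpoint prefix is absent, the save call fails.
if not os.path.exists(ckpt_dir):
    os.makedirs(ckpt_dir)

# later, inside the session:
# saver.save(sess, ckpt_dir + '/poetry.module', global_step=epoch)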
Next comes the generation (inference) code:
#coding=utf-8
import collections
import numpy as np
import tensorflow as tf
import io
import sys
import os
import pdb
import time

# Python 2 only: switch the default string encoding to utf-8 (see the notes above)
reload(sys)
sys.setdefaultencoding('utf-8')

#------------------------------- Data preprocessing ---------------------------#

poetry_file = 'poetry.txt'

# Poem collection
poetrys = []
with io.open(poetry_file, "r", encoding='utf-8') as f:
    for line in f:
        try:
            title, content = line.strip().split(':')
            content = content.replace(' ', '')
            # Skip poems containing markup or bracket characters
            if '_' in content or '(' in content or '(' in content or '《' in content or '[' in content:
                continue
            # Skip poems that are too short or too long
            if len(content) < 5 or len(content) > 79:
                continue
            # '[' and ']' mark the start and end of a poem
            content = '[' + content + ']'
            poetrys.append(content)
        except Exception as e:
            pass

# Sort the poems by length
poetrys = sorted(poetrys, key=lambda line: len(line))
print(u'唐诗总数: ', len(poetrys))

# Count how often each character occurs
all_words = []
for poetry in poetrys:
    all_words += [word for word in poetry]
counter = collections.Counter(all_words)
count_pairs = sorted(counter.items(), key=lambda x: -x[1])
words, _ = zip(*count_pairs)

# Keep the most frequent characters (here: all of them), plus a space used for padding
words = words[:len(words)] + (' ',)
# Map each character to an integer ID
word_num_map = dict(zip(words, range(len(words))))
# Convert every poem into a vector of IDs
to_num = lambda word: word_num_map.get(word, len(words))
poetrys_vector = [list(map(to_num, poetry)) for poetry in poetrys]
# [[314, 3199, 367, 1556, 26, 179, 680, 0, 3199, 41, 506, 40, 151, 4, 98, 1],
#  [339, 3, 133, 31, 302, 653, 512, 0, 37, 148, 294, 25, 54, 833, 3, 1, 965, 1315, 377, 1700, 562, 21, 37, 0, 2, 1253, 21, 36, 264, 877, 809, 1],
#  ...]

# Generate one poem at a time
batch_size = 1
n_chunk = len(poetrys_vector) // batch_size
x_batches = []
y_batches = []
for i in range(n_chunk):
    start_index = i * batch_size
    end_index = start_index + batch_size

    batches = poetrys_vector[start_index:end_index]
    length = max(map(len, batches))
    xdata = np.full((batch_size, length), word_num_map[' '], np.int32)
    for row in range(batch_size):
        xdata[row, :len(batches[row])] = batches[row]
    ydata = np.copy(xdata)
    ydata[:, :-1] = xdata[:, 1:]
    """
    xdata             ydata
    [6,2,4,6,9]       [2,4,6,9,9]
    [1,4,2,8,5]       [4,2,8,5,5]
    """
    x_batches.append(xdata)
    y_batches.append(ydata)


#--------------------------------------- RNN --------------------------------------#

input_data = tf.placeholder(tf.int32, [batch_size, None])
output_targets = tf.placeholder(tf.int32, [batch_size, None])

# Define the RNN (must match the architecture used for training)
def neural_network(model='lstm', rnn_size=128, num_layers=2):
    if model == 'rnn':
        cell_fun = tf.nn.rnn_cell.BasicRNNCell
    elif model == 'gru':
        cell_fun = tf.nn.rnn_cell.GRUCell
    elif model == 'lstm':
        cell_fun = tf.nn.rnn_cell.BasicLSTMCell

    cell = cell_fun(rnn_size, state_is_tuple=True)
    cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=True)

    initial_state = cell.zero_state(batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w", [rnn_size, len(words)+1])
        softmax_b = tf.get_variable("softmax_b", [len(words)+1])
        with tf.device("/gpu:0"):
            embedding = tf.get_variable("embedding", [len(words)+1, rnn_size])
            inputs = tf.nn.embedding_lookup(embedding, input_data)

    outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, initial_state=initial_state, scope='rnnlm')
    output = tf.reshape(outputs, [-1, rnn_size])

    logits = tf.matmul(output, softmax_w) + softmax_b
    probs = tf.nn.softmax(logits)
    return logits, last_state, probs, cell, initial_state

#------------------------------- Generating poems ---------------------------------#
# Use the trained model

def gen_poetry():
    # Sample one character from the predicted distribution
    def to_word(weights):
        t = np.cumsum(weights)
        s = np.sum(weights)
        sample = int(np.searchsorted(t, np.random.rand(1)*s))
        return words[sample]

    _, last_state, probs, cell, initial_state = neural_network()

    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())

        saver = tf.train.Saver(tf.all_variables())
        saver.restore(sess, './ckpt_dir/poetry.module-294')

        state_ = sess.run(cell.zero_state(1, tf.float32))

        # Feed the start-of-poem marker '[' and then sample until ']' appears
        x = np.array([list(map(word_num_map.get, '['))])
        [probs_, state_] = sess.run([probs, last_state], feed_dict={input_data: x, initial_state: state_})
        word = to_word(probs_)
        #word = words[np.argmax(probs_)]
        poem = ''
        while word != ']':
            poem += word
            x = np.zeros((1,1))
            x[0,0] = word_num_map[word]
            [probs_, state_] = sess.run([probs, last_state], feed_dict={input_data: x, initial_state: state_})
            word = to_word(probs_)
            #word = words[np.argmax(probs_)]
        return poem


# Acrostic generation; as noted below, this function is buggy as written
def gen_poetry_with_head(head):
    def to_word(weights):
        t = np.cumsum(weights)
        s = np.sum(weights)
        sample = int(np.searchsorted(t, np.random.rand(1)*s))
        return words[sample]

    _, last_state, probs, cell, initial_state = neural_network()

    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())

        saver = tf.train.Saver(tf.all_variables())
        saver.restore(sess, './ckpt_dir/poetry.module-294')

        state_ = sess.run(cell.zero_state(1, tf.float32))
        poem = ''
        i = 0
        # print head
        # pdb.set_trace()
        for word in head:
            while word != ',' and word != '。':
                poem += word
                # print poem
                # print head
                # print word
                x = np.array([list(map(word_num_map.get, word))])
                [probs_, state_] = sess.run([probs, last_state], feed_dict={input_data: x, initial_state: state_})
                word = to_word(probs_)
                time.sleep(1)
            if i % 2 == 0:
                poem += ','
            else:
                poem += '。'
            i += 1
        return poem

print(gen_poetry())
# print(gen_poetry_with_head(u'一二三四'))
The acrostic ("hidden head") poem code has usage problems and I don't recommend using it; it took me a long time to get it working. For now I'm listing the original author's code as-is, and I'll cover the fixes and tuning for this part in a separate post.
Result:
There is a hint of something there, but on closer inspection the problems are still serious: the output reads as gibberish, and the model's tuning is far from adequate.