This post walks through a Keras implementation of a BiLSTM + Attention model, where the Attention part is a custom layer, and then uses the model for a news-headline text classification task.
Full code and data: https://github.com/huanghao128/zh-nlp-demo
Data preprocessing
The dataset here is only meant to demonstrate text classification, so instead of full articles it uses headlines. The raw data was crawled from Toutiao and can be downloaded here: https://github.com/fate233/toutiao-text-classfication-dataset
I did some cleanup on this dataset, keeping only the category and the headline and dropping the keywords. Each processed line is in a "category_!_headline" format, where the separator is literally "_!_".
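For illustration, a processed line looks like this (a made-up example, not taken from the actual dataset):

科技_!_某品牌新款手机发布会定于下月举行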
First, read the data, segment the headlines with HanLP, build the vocabulary, and save it to a file.
from pyhanlp import HanLP
import numpy as np
from tqdm import tqdm

# Read the raw dataset, segment each headline, and save the vocabulary
def read_toutiao_dataset(data_path, save_vocab_path):
    with open(data_path, "r", encoding="utf8") as fo:
        all_lines = fo.readlines()
    datas, labels = [], []
    word_vocabs = {}
    for line in tqdm(all_lines):
        content_words = []
        category, content = line.strip().split("_!_")
        for term in HanLP.segment(content):
            if term.word not in word_vocabs:
                word_vocabs[term.word] = len(word_vocabs) + 1
            content_words.append(term.word)
        datas.append(content_words)
        labels.append(category)
    with open(save_vocab_path, "w", encoding="utf8") as fw:
        for word, index in word_vocabs.items():
            fw.write(word + "\n")
    return datas, labels
Next, read the vocabulary file back and build the word-index mappings. special_words holds the special tokens, i.e. the padding token <PAD> and the unknown-word token <UNK>; because they are prepended to the vocabulary, <PAD> gets index 0 and <UNK> index 1, which is why sequences can later be padded with 0.
def read_word_vocabs(save_vocab_path, special_words):
    with open(save_vocab_path, "r", encoding="utf8") as fo:
        word_vocabs = [word.strip() for word in fo]
    word_vocabs = special_words + word_vocabs
    idx2vocab = {idx: char for idx, char in enumerate(word_vocabs)}  # index -> word
    vocab2idx = {char: idx for idx, char in idx2vocab.items()}       # word -> index
    return idx2vocab, vocab2idx
Then index the segmented data: using the vocabulary mapping, turn each headline into a sequence of word ids and each category into a label id.
def process_dataset(datas, labels, category2idx, vocab2idx):
    new_datas, new_labels = [], []
    for data, label in zip(datas, labels):
        index_data = [vocab2idx[word] if word in vocab2idx else vocab2idx['<UNK>'] for word in data]
        index_label = category2idx[label]
        new_datas.append(index_data)
        new_labels.append(index_label)
    return new_datas, new_labels
The functions above only define the processing steps; now we actually run them: build the vocabulary and category indices and turn the data into the form the model can consume.
# Path of the raw dataset
data_path = "/your/path/toutiao_news_dataset.txt"
# Path of the saved vocabulary file
save_vocab_path = "/your/path/word_vocabs.txt"
# Special tokens
special_words = ['<PAD>', '<UNK>']
# Category names
category_lists = ["民生故事","文化","娱乐","体育","财经","房产","汽车","教育","科技","军事",
                  "旅游","国际","证券","农业","电竞游戏"]
# Category indexing
category2idx = {cate: idx for idx, cate in enumerate(category_lists)}
idx2category = {idx: cate for idx, cate in enumerate(category_lists)}
# Read the data, segment it, and save the vocabulary
datas, labels = read_toutiao_dataset(data_path, save_vocab_path)
# Read the saved vocabulary and build the word-index mappings
idx2vocab, vocab2idx = read_word_vocabs(save_vocab_path, special_words)
# Index everything: headlines become word-id sequences, categories become label ids
all_datas, all_labels = process_dataset(datas, labels, category2idx, vocab2idx)
Model construction
- The custom Attention layer. The attention scoring function used here is the additive model, written out below.
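Concretely, for the BiLSTM outputs $h_1, \dots, h_T$ the layer computes

$$ e_t = v^\top \tanh(W h_t + b), \qquad \alpha_t = \frac{\exp(e_t)}{\sum_{k=1}^{T} \exp(e_k)}, \qquad c = \sum_{t=1}^{T} \alpha_t h_t $$

where $W$, $b$ and $v$ correspond to the weights att_weight, att_bias and att_var in the code, and the context vector $c$ is the fixed-size sentence representation passed on to the output layer.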
import numpy
import keras
from keras import backend as K
from keras import activations
from keras.engine.topology import Layer
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.models import Model
from keras.layers import Input, Dense, Embedding, LSTM, Bidirectional

K.clear_session()

class AttentionLayer(Layer):
    def __init__(self, attention_size=None, **kwargs):
        self.attention_size = attention_size
        super(AttentionLayer, self).__init__(**kwargs)

    def get_config(self):
        config = super().get_config()
        config['attention_size'] = self.attention_size
        return config

    def build(self, input_shape):
        assert len(input_shape) == 3
        self.time_steps = input_shape[1]
        hidden_size = input_shape[2]
        if self.attention_size is None:
            self.attention_size = hidden_size
        self.W = self.add_weight(name='att_weight', shape=(hidden_size, self.attention_size),
                                 initializer='uniform', trainable=True)
        self.b = self.add_weight(name='att_bias', shape=(self.attention_size,),
                                 initializer='uniform', trainable=True)
        self.V = self.add_weight(name='att_var', shape=(self.attention_size,),
                                 initializer='uniform', trainable=True)
        super(AttentionLayer, self).build(input_shape)

    def call(self, inputs):
        # inputs: (batch_size, time_steps, hidden_size)
        V = K.reshape(self.V, (-1, 1))              # use a local variable instead of overwriting the weight
        H = K.tanh(K.dot(inputs, self.W) + self.b)  # (batch_size, time_steps, attention_size)
        score = K.softmax(K.dot(H, V), axis=1)      # attention weights over the time steps
        outputs = K.sum(score * inputs, axis=1)     # weighted sum over time: (batch_size, hidden_size)
        return outputs

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[2]
- Putting the whole BiLSTM + Attention model together.
def create_classify_model(max_len, vocab_size, embedding_size, hidden_size, attention_size, class_nums):
    # Input layer
    inputs = Input(shape=(max_len,), dtype='int32')
    # Embedding layer
    x = Embedding(vocab_size, embedding_size)(inputs)
    # BiLSTM layer
    x = Bidirectional(LSTM(hidden_size, dropout=0.2, return_sequences=True))(x)
    # Attention layer
    x = AttentionLayer(attention_size=attention_size)(x)
    # Output layer
    outputs = Dense(class_nums, activation='softmax')(x)
    model = Model(inputs=inputs, outputs=outputs)
    model.summary()  # print the model structure and parameter counts
    return model
Model training and saving
With the training data prepared and the model defined, we can now train, evaluate, and save the model.
# Hyper-parameters
MAX_LEN = 30
EMBEDDING_SIZE = 100
HIDDEN_SIZE = 64
ATT_SIZE = 50
BATCH_SIZE = 64
EPOCHS = 20
VOCAB_SIZE = len(vocab2idx)
CLASS_NUMS = len(category2idx)
count = len(all_labels)  # total number of samples
# Dataset split ratios
rate1, rate2 = 0.8, 0.9  # train-0.8, test-0.1, dev-0.1
# Pad the sequences and one-hot encode the labels
new_datas = sequence.pad_sequences(all_datas, maxlen=MAX_LEN)
new_labels = keras.utils.to_categorical(all_labels, CLASS_NUMS)
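# Optional note: the split below is purely sequential, so if the data file happens to be
# grouped by category the train/test/dev sets will be skewed. Shuffling first, e.g.
#     indices = numpy.random.permutation(count)
#     new_datas, new_labels = new_datas[indices], new_labels[indices]
# would avoid that.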
# Split into training, test, and validation sets by the ratios above
x_train, y_train = new_datas[:int(count*rate1)], new_labels[:int(count*rate1)]
x_test, y_test = new_datas[int(count*rate1):int(count*rate2)], new_labels[int(count*rate1):int(count*rate2)]
x_val, y_val = new_datas[int(count*rate2):], new_labels[int(count*rate2):]
# Build the model from the parameters above
model = create_classify_model(MAX_LEN, VOCAB_SIZE, EMBEDDING_SIZE, HIDDEN_SIZE, ATT_SIZE, CLASS_NUMS)
# Choose the loss function and optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train the model
model.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(x_test, y_test))
# Evaluate on the validation set
score, acc = model.evaluate(x_val, y_val, batch_size=BATCH_SIZE)
print('score:', score, 'accuracy:', acc)
# Save the trained model
model.save("/your/path/news_classify_model.h5")
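If you also want training to stop early and keep only the best weights, the standard Keras EarlyStopping and ModelCheckpoint callbacks can be passed to fit. A minimal sketch (the checkpoint path here is just a placeholder):

from keras.callbacks import EarlyStopping, ModelCheckpoint

callbacks = [
    EarlyStopping(monitor='val_loss', patience=3),
    ModelCheckpoint("/your/path/news_classify_best.h5", monitor='val_loss', save_best_only=True),
]
model.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS,
          validation_data=(x_test, y_test), callbacks=callbacks)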
Loading the model and predicting on new data
The loading and prediction code below is separate from the training code above: after training and saving the model, we reload it here, and the vocabulary file has to be loaded as well (the AttentionLayer class and the read_word_vocabs function defined earlier are assumed to be available). New data must first go through the same preprocessing as training data: segmentation, indexing, and padding. This demonstrates prediction for a single headline; batch prediction works much the same way (see the sketch at the end).
from keras.models import load_model
from keras.preprocessing import sequence
import numpy as np
from pyhanlp import HanLP

np.set_printoptions(suppress=True)

# Path of the saved model
model_path = "/your/path/news_classify_model.h5"
# Path of the vocabulary file
save_vocab_path = "/your/path/word_vocabs.txt"
# Special tokens
special_words = ['<PAD>', '<UNK>']
# Category names
category_lists = ["民生故事","文化","娱乐","体育","财经","房产","汽车","教育","科技","军事",
                  "旅游","国际","证券","农业","电竞游戏"]
maxlen = 30  # maximum sequence length (same as in training)
ATT_SIZE = 50  # attention size of the custom layer
# Category indexing
category2idx = {cate: idx for idx, cate in enumerate(category_lists)}
idx2category = {idx: cate for idx, cate in enumerate(category_lists)}
# Vocabulary indexing
idx2vocab, vocab2idx = read_word_vocabs(save_vocab_path, special_words)
# Load the trained model (the custom AttentionLayer must be passed in)
model = load_model(model_path, custom_objects={'AttentionLayer': AttentionLayer(ATT_SIZE)}, compile=False)
# The headline to classify
content = "科创板技术系统准备就绪,13日进行预通关测试"
content_words = [term.word for term in HanLP.segment(content)]
sent2id = [vocab2idx[word] if word in vocab2idx else vocab2idx['<UNK>'] for word in content_words]
# Pad the same way as during training (pad_sequences pads at the front by default)
sent2id_new = sequence.pad_sequences([sent2id], maxlen=maxlen)
# Predict
y_pred = model.predict(sent2id_new)
print(y_pred)
# Map the predictions to category names
result = {}
for idx, pred in enumerate(y_pred[0]):
    result[idx2category[idx]] = pred
# Sort the categories by predicted probability
result_sorted = sorted(result.items(), key=lambda item: item[1], reverse=True)
print(result_sorted)
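Batch prediction follows the same steps: segment and index each headline, pad them together, and call model.predict once. A minimal sketch (the titles below are just placeholders):

# Segment and index several headlines, pad them together, and predict in one call
contents = ["标题一", "标题二"]  # placeholder headlines
batch_ids = []
for c in contents:
    words = [term.word for term in HanLP.segment(c)]
    batch_ids.append([vocab2idx.get(w, vocab2idx['<UNK>']) for w in words])
batch_input = sequence.pad_sequences(batch_ids, maxlen=maxlen)
batch_pred = model.predict(batch_input)  # shape: (num_headlines, num_classes)
for probs in batch_pred:
    print(idx2category[int(np.argmax(probs))])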