This post documents a Keras implementation of a BiLSTM+Attention model, where the Attention part is a custom layer, and then applies the model to a news-headline text classification task.

Full code and data: https://github.com/huanghao128/zh-nlp-demo

Data Preprocessing

The dataset here is only meant to demonstrate the text classification task, so we use news headlines rather than full-length articles. The raw dataset was crawled from Toutiao and can be downloaded here: https://github.com/fate233/toutiao-text-classfication-dataset

I did some light processing on this dataset, keeping only the category and the title and dropping the keywords; each processed line has the form "category_!_title".

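For illustration, here is a made-up line in that format (not an actual row from the dataset) and how it splits on the "_!_" separator:

# illustrative only: one line in the "category_!_title" format
line = "科技_!_某条新闻标题"
category, title = line.strip().split("_!_")  # category = "科技", title = "某条新闻标题"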


Read the data described above, tokenize it with HanLP, build the vocabulary, and save the vocabulary to a file.

from pyhanlp import HanLP
import numpy as np
from tqdm import tqdm

# read the raw dataset, tokenize each title, and save the vocabulary
def read_toutiao_dataset(data_path, save_vocab_path):
    with open(data_path, "r", encoding="utf8") as fo:
        all_lines = fo.readlines()
    datas, labels = [], []
    word_vocabs = {}
    for line in tqdm(all_lines):
        content_words = []
        category, content = line.strip().split("_!_")
        for term in HanLP.segment(content):
            if term.word not in word_vocabs:
                word_vocabs[term.word] = len(word_vocabs)+1
            content_words.append(term.word)
        datas.append(content_words)
        labels.append(category)
    with open(save_vocab_path, "w", encoding="utf8") as fw:
        for word, index in word_vocabs.items():
            fw.write(word+"\n")
    return datas, labels
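
If you have not used pyhanlp before: each element returned by HanLP.segment has a .word attribute (the token text) and a .nature attribute (the part of speech); only .word is used above. A small illustration (the exact tokens depend on HanLP's dictionaries and version):

# illustrative only: segment a title and keep the token texts
tokens = [term.word for term in HanLP.segment("科创板技术系统准备就绪")]
print(tokens)  # something like ['科创板', '技术', '系统', '准备', '就绪']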

Read the vocabulary file and build the word-index mappings; special_words holds special tokens such as the padding token <PAD> and the unknown-word token <UNK>.

def read_word_vocabs(save_vocab_path, special_words):
    with open(save_vocab_path, "r", encoding="utf8") as fo:
        word_vocabs = [word.strip() for word in fo]
    word_vocabs = special_words + word_vocabs
    idx2vocab = {idx: char for idx, char in enumerate(word_vocabs)} # index-to-word mapping
    vocab2idx = {char: idx for idx, char in idx2vocab.items()} # word-to-index mapping
    return idx2vocab, vocab2idx

Index the tokenized data from the previous step: using the vocabulary mapping, turn each token sequence into a sequence of word IDs, and map each category label to its ID.

def process_dataset(datas, labels, category2idx, vocab2idx):
    new_datas, new_labels = [], []
    for data, label in zip(datas, labels):
        index_data = [vocab2idx[word] if word in vocab2idx else vocab2idx['<UNK>'] for word in data]
        index_label = category2idx[label]
        new_datas.append(index_data)
        new_labels.append(index_label)
    return new_datas, new_labels

The functions above only define the processing steps; now we actually run them: build the index mappings for the vocabulary and the categories, and convert the data into a form the model can consume.

# path to the raw dataset
data_path = "/you/path/toutiao_news_dataset.txt"
# path where the vocabulary file is saved
save_vocab_path = "/you/path/word_vocabs.txt"
# special tokens
special_words = ['<PAD>', '<UNK>']
# the category list
category_lists = ["民生故事","文化","娱乐","体育","财经","房产","汽车","教育","科技","军事",
                "旅游","国际","证券","农业","电竞游戏"]
# category-to-index and index-to-category mappings
category2idx = {cate: idx for idx, cate in enumerate(category_lists)}
idx2category = {idx: cate for idx, cate in enumerate(category_lists)}

# read the data, tokenize it, and save the vocabulary
datas, labels = read_toutiao_dataset(data_path, save_vocab_path)
# load the saved vocabulary and build the word-index mappings
idx2vocab, vocab2idx = read_word_vocabs(save_vocab_path, special_words)
# index all titles and labels: titles become word-ID sequences, categories become label IDs
all_datas, all_labels = process_dataset(datas, labels, category2idx, vocab2idx)
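
As a quick optional sanity check (illustrative only), we can confirm that the mappings line up the way the rest of the code assumes, in particular that the padding token sits at index 0:

# optional sanity checks: the special tokens occupy the lowest indices,
# so 0 can safely be used as the padding value later
assert vocab2idx['<PAD>'] == 0 and vocab2idx['<UNK>'] == 1
print(len(all_datas), len(all_labels))            # number of samples
print(all_datas[0], idx2category[all_labels[0]])  # first title as word IDs and its category name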

Building the Model

  1. The custom Attention layer; the attention scoring function used here is the additive model (written out as a formula after the code below).
import numpy as np
import keras
from keras import backend as K
from keras import activations
from keras.engine.topology import Layer
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.models import Model
from keras.layers import Input, Dense, Embedding, LSTM, Bidirectional
K.clear_session()

class AttentionLayer(Layer):
    def __init__(self, attention_size=None, **kwargs):
        self.attention_size = attention_size
        super(AttentionLayer, self).__init__(**kwargs)
        
    def get_config(self):
        config = super().get_config()
        config['attention_size'] = self.attention_size
        return config
        
    def build(self, input_shape):
        assert len(input_shape) == 3
        
        self.time_steps = input_shape[1]
        hidden_size = input_shape[2]
        if self.attention_size is None:
            self.attention_size = hidden_size
            
        self.W = self.add_weight(name='att_weight', shape=(hidden_size, self.attention_size),
                                initializer='uniform', trainable=True)
        self.b = self.add_weight(name='att_bias', shape=(self.attention_size,),
                                initializer='uniform', trainable=True)
        self.V = self.add_weight(name='att_var', shape=(self.attention_size,),
                                initializer='uniform', trainable=True)
        super(AttentionLayer, self).build(input_shape)
    
    def call(self, inputs):
        # additive attention: score each time step, then return the weighted sum
        V = K.reshape(self.V, (-1, 1))
        H = K.tanh(K.dot(inputs, self.W) + self.b)
        score = K.softmax(K.dot(H, V), axis=1)
        outputs = K.sum(score * inputs, axis=1)
        return outputs
    
    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[2]
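
For reference, the layer above implements additive attention over the BiLSTM outputs $h_1, \dots, h_T$:

$$ e_t = v^\top \tanh(W h_t + b), \qquad \alpha_t = \frac{\exp(e_t)}{\sum_{k=1}^{T} \exp(e_k)}, \qquad c = \sum_{t=1}^{T} \alpha_t h_t $$

where $W$, $b$, and $v$ are the trainable weights att_weight, att_bias, and att_var, and $c$ is the fixed-length sentence vector the layer returns.
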
  2. Assembling the full BiLSTM+Attention model.
def create_classify_model(max_len, vocab_size, embedding_size, hidden_size, attention_size, class_nums):
    # input layer
    inputs = Input(shape=(max_len,), dtype='int32')
    # embedding layer
    x = Embedding(vocab_size, embedding_size)(inputs)
    # BiLSTM layer
    x = Bidirectional(LSTM(hidden_size, dropout=0.2, return_sequences=True))(x)
    # attention layer
    x = AttentionLayer(attention_size=attention_size)(x)
    # output layer
    outputs = Dense(class_nums, activation='softmax')(x)
    model = Model(inputs=inputs, outputs=outputs)
    model.summary()  # print the model architecture and parameter counts
    return model
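
To make the data flow concrete, these are the tensor shapes produced by each layer above (batch dimension omitted; derived from the layer definitions, not from an actual model.summary() run):

# tensor shapes through the network (batch dimension omitted)
# Input:      (max_len,)                   integer word IDs
# Embedding:  (max_len, embedding_size)
# BiLSTM:     (max_len, 2 * hidden_size)   forward and backward states concatenated
# Attention:  (2 * hidden_size,)           weighted sum over the time dimension
# Dense:      (class_nums,)                softmax probabilities over the categories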

Training and Saving the Model

With the training data prepared and the model defined, we can now actually train, evaluate, and save the model.

# hyperparameter initialization
MAX_LEN = 30
EMBEDDING_SIZE = 100
HIDDEN_SIZE = 64
ATT_SIZE = 50
BATCH_SIZE = 64
EPOCHS = 20
VOCAB_SIZE = len(vocab2idx)
CLASS_NUMS = len(category2idx)
count = len(all_labels)  # total number of samples
# dataset split ratios
rate1, rate2 = 0.8, 0.9 # train-0.8, test-0.1, dev-0.1

# pad the sequences and one-hot encode the labels
new_datas = sequence.pad_sequences(all_datas, maxlen=MAX_LEN)
new_labels = keras.utils.to_categorical(all_labels, CLASS_NUMS)
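
# Optional (assumption: the raw file may be grouped by category): shuffle the
# samples before splitting so that train/test/dev each contain every class.
shuffle_idx = np.random.permutation(count)
new_datas, new_labels = new_datas[shuffle_idx], new_labels[shuffle_idx]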

# split into training, test, and validation sets by the ratios above
x_train, y_train = new_datas[:int(count*rate1)], new_labels[:int(count*rate1)]
x_test, y_test = new_datas[int(count*rate1):int(count*rate2)], new_labels[int(count*rate1):int(count*rate2)]
x_val, y_val = new_datas[int(count*rate2):], new_labels[int(count*rate2):]

# build the model with the parameters above
model = create_classify_model(MAX_LEN, VOCAB_SIZE, EMBEDDING_SIZE, HIDDEN_SIZE, ATT_SIZE, CLASS_NUMS)
# choose the loss function and optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# train the model
model.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(x_test, y_test))
# evaluate on the validation set
score, acc = model.evaluate(x_val, y_val, batch_size=BATCH_SIZE)
print('score:', score, 'accuracy:', acc)

# save the trained model
model.save("/you/path/news_classify_model.h5")

Loading the Model to Predict New Data

The loading and prediction code below is separate from the training script above: the model was saved after training, and here we reload it together with the vocabulary file. New data has to go through the same preprocessing (tokenization, indexing, padding) before prediction. The example below handles a single input; batch prediction works much the same way.

from keras.models import load_model
import numpy as np
from pyhanlp import HanLP
np.set_printoptions(suppress=True)

# path to the saved model
model_path = "/you/path/news_classify_model.h5"
# path to the vocabulary file
save_vocab_path = "/you/path/word_vocabs.txt"
# special tokens
special_words = ['<PAD>', '<UNK>']
# the category list (same order as in training)
category_lists = ["民生故事","文化","娱乐","体育","财经","房产","汽车","教育","科技","军事",
                "旅游","国际","证券","农业","电竞游戏"]
maxlen = 30  # maximum sequence length (must match training)
ATT_SIZE = 50  # attention size used when the model was built

# category index mappings
category2idx = {cate: idx for idx, cate in enumerate(category_lists)}
idx2category = {idx: cate for idx, cate in enumerate(category_lists)}
# vocabulary index mappings
idx2vocab, vocab2idx = read_word_vocabs(save_vocab_path, special_words)

# load the trained model; the custom layer class must be registered via custom_objects
model = load_model(model_path, custom_objects={'AttentionLayer': AttentionLayer}, compile=False)

# the text to classify
content = "科创板技术系统准备就绪,13日进行预通关测试"
content_words = [term.word for term in HanLP.segment(content)]
sent2id = [vocab2idx[word] if word in vocab2idx else vocab2idx['<UNK>'] for word in content_words]
sent2id_new = np.array([[0] * (maxlen - len(sent2id)) + sent2id[:maxlen]])  # pre-pad with 0, matching pad_sequences' default used in training

# run the prediction
y_pred = model.predict(sent2id_new)
print(y_pred)
# map predicted probabilities back to category names
result = {}
for idx, pred in enumerate(y_pred[0]):
    result[idx2category[idx]] = pred
# sort categories by predicted probability (descending)
result_sorted = sorted(result.items(), key=lambda item: item[1], reverse=True)
print(result_sorted)
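
If only the single most likely category is needed, the argmax of the probability vector can be mapped straight back through idx2category:

# top-1 prediction: highest-probability index mapped back to its category name
top_idx = int(np.argmax(y_pred[0]))
print(idx2category[top_idx], y_pred[0][top_idx])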