Keras实现文本预处理

原创

wx6464351503832 2023-05-17 15:19:58 ©著作权

©著作权归作者所有：来自51CTO博客作者wx6464351503832的原创作品，请联系作者获取转载授权，否则将追究法律责任

from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Toy corpus: two Chinese sentences that are already segmented into
# space-separated words, so whitespace splitting yields one token per word.
texts = [
    "今天 北京 下 暴雨 了",
    "我 今天 打车 回家",
]
text1, text2 = texts

# Break a single sentence into its list of words (split on spaces).
first_sentence_words = text_to_word_sequence(text1)
print(first_sentence_words)
# ['今天', '北京', '下', '暴雨', '了']

# Build the vocabulary from the corpus. Per the Keras docs, num_words caps the
# vocabulary used when converting texts to sequences; the statistics printed
# below are collected for every word seen during fitting.
tokenizer = Tokenizer(num_words=10)
tokenizer.fit_on_texts(texts)

# Number of documents (texts) the tokenizer was fitted on.
print(tokenizer.document_count)
# 2

# word -> total occurrence count. This is an OrderedDict kept in order of
# first appearance, NOT sorted by frequency; the frequency ranking shows up
# in word_index instead.
print(tokenizer.word_counts)
# OrderedDict([('今天', 2), ('北京', 1), ('下', 1), ('暴雨', 1), ('了', 1), ('我', 1), ('打车', 1), ('回家', 1)])

# word -> number of documents that contain the word.
# (Depending on the Keras version this may print wrapped in defaultdict(...).)
print(tokenizer.word_docs)
# {'了': 1, '暴雨': 1, '北京': 1, '下': 1, '今天': 2, '打车': 1, '回家': 1, '我': 1}

# word -> unique integer id. Ids start at 1 and more frequent words receive
# smaller ids ('今天' appears twice, so it gets id 1).
print(tokenizer.word_index)
# {'今天': 1, '北京': 2, '下': 3, '暴雨': 4, '了': 5, '我': 6, '打车': 7, '回家': 8}

# word id -> number of documents that contain that word (word_docs keyed by id).
print(tokenizer.index_docs)
# {5: 1, 4: 1, 2: 1, 3: 1, 1: 2, 7: 1, 8: 1, 6: 1}

# Pad every sequence with zeros up to maxlen, first in front of the values
# ('pre') and then after them ('post').
id_sequences = [[1, 2, 3], [4, 5, 6]]
for padding_side in ('pre', 'post'):
    print(pad_sequences(id_sequences, maxlen=10, padding=padding_side))
# padding='pre':
# [[0 0 0 0 0 0 0 1 2 3]
#  [0 0 0 0 0 0 0 4 5 6]]
# padding='post':
# [[1 2 3 0 0 0 0 0 0 0]
#  [4 5 6 0 0 0 0 0 0 0]]