需求是在48万条中文新闻标题里,给测试集中50条标题的每一条找出最相近的20条新闻。拿到这个需求第一反应当然是计算词向量,生成每句话的语义向量然后计算相似度啦,也想过TFIDF来提取每条新闻的关键字,但是原始数据集是没有分类标签的,所以这一步比较难做。下一步打算计算每条新闻的语义向量以后对新闻进行聚类,然后再应用TFIDF提取关键字,直觉上这样可以提升准确度同时降低运算量。完整代码可以从这个github链接上找到
先说说目前已经做到的程度吧。首先所给的训练集是一条一条的新闻,需要对新闻进行分词来构建语料库,这里用了常用的结巴库来进行分词,同时删除stopwords,也就是去掉文本中没有意义的一些词来提升对语义的理解。
先定义一些小工具函数来帮助我们构建语料库。其中的clear_list用来清除列表中不在语料字典里的词这个后边会说到。
import jieba
from logs.config import cfg
def stop_words():
    """Load the stop-word list from cfg.stop_words_path.

    Returns:
        list[str]: one stop word per file line, trailing newline removed.
    """
    # Use a context manager so the file handle is always closed (the
    # original left it open), and rstrip('\n') instead of line[:-1] so
    # the last word is not truncated when the file has no trailing
    # newline. encoding is set explicitly: the list is Chinese text and
    # must not depend on the platform default codec.
    stopwords_list = []
    with open(cfg.stop_words_path, 'r', encoding='utf-8') as f:
        for line in f:
            stopwords_list.append(line.rstrip('\n'))
    return stopwords_list
def jieba_fenci(raw, stopwords_list):
    """Segment `raw` with jieba and drop any stop words.

    Args:
        raw: the sentence/title to segment.
        stopwords_list: iterable of words to filter out.

    Returns:
        list[str]: segmented words with stop words removed.
    """
    # The original removed elements from word_list while iterating it;
    # list.remove() shifts the remaining items left so the iterator
    # skips the element right after each removal — consecutive stop
    # words survived the filter. Build a new list instead.
    stopwords = set(stopwords_list)  # O(1) membership per word
    return [w for w in jieba.cut(raw, cut_all=False) if w not in stopwords]
def clear_list(l, vocab):
    """Return the words of `l` that are present in `vocab`, order kept.

    Args:
        l: list of words.
        vocab: container of known vocabulary words.

    Returns:
        list[str]: the in-vocabulary subset of `l`.
    """
    # The original called l.remove(word) inside the loop over l, which
    # makes the iterator skip the element following every removed word,
    # so in-vocabulary words could be silently dropped. A comprehension
    # filters without mutating the input.
    vocab_set = set(vocab)  # O(1) lookups even when vocab is a list
    return [word for word in l if word in vocab_set]
有了工具们就可以轻松构建语料库啦
from utils import handle_words as hw
import pandas as pd
stopwords_list = hw.stop_words()#加载stop_words
def get_words(words_path, df_path):
    """Build the corpus file: one segmented title per line.

    Args:
        words_path: destination path of the corpus file.
        df_path: CSV with a 'title' column to segment.

    Returns:
        str: words_path, for convenient chaining.
    """
    df = pd.read_csv(df_path)
    with open(words_path, 'wb') as out:
        for i in range(df.shape[0]):
            w_l = hw.jieba_fenci(df['title'][i], stopwords_list)  # segment + drop stop words
            # Join tokens with spaces: gensim's LineSentence splits each
            # line on whitespace, so words written back-to-back (as the
            # original did) would fuse a whole title into one token.
            out.write(' '.join(w_l).encode('utf-8'))
            out.write('\n'.encode('utf-8'))
            if (i + 1) % 10000 == 0:
                print(i)  # progress every 10k titles
    print('words prepared')
    return words_path
构建完语料库以后就可以在语料中进行学习,这里使用了gensim库,用来构建语义模型,这个库用来处理自然语言十分强大,这里只用到了word2vec模型。word2vec模型本身也十分强大,细节的可以看看
这篇博客,总之呢就是给每个分词构建一个特征向量来表示词义,将词义表示成了向量之后我们就大有可为了,通过向量的加减、距离等方式我们就可以得到词语意义上的转移、区别,我觉得这可以说是NLP的敲门砖喽。关于word2vec的参数前边的博客里也都有,就不多写了,下面是训练过程。
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from utils.get_words import get_words
from logs.config import cfg
def train_model():
    """Prepare the corpus, train a Word2Vec model and save it to disk.

    All paths and hyper-parameters come from the shared cfg object.
    """
    words_path = cfg.words_path
    train_df_path = cfg.train_df_path
    # Read once; the original assigned model_output_path twice.
    model_output_path = cfg.model_output_path
    print('preparing words')
    get_words(words_path, train_df_path)  # build the corpus file
    print('train model using {}'.format(train_df_path))
    # LineSentence streams the corpus line by line, so the full corpus
    # never has to fit in memory.
    model = Word2Vec(LineSentence(words_path),
                     size=cfg.train_size, window=cfg.train_window,
                     min_count=cfg.train_min_count, workers=cfg.train_workers)
    model.save(model_output_path)
    print('training done,saving model as {}'.format(model_output_path))
训练之后就可以在我们的数据上应用模型计算语义向量了。因为需求时间比较紧张(其实就是不会),这里用了十分暴力的办法,遍历所有的新闻,计算训练集和测试集新闻两两之间的相似度,复杂度爆棚,运行速度更是令人发指,不过贵在简单..值得一提的是这里删除了测试集里标题分词后不在语料字典里的词,因为不在语料字典里的词对寻找训练集中的相似标题肯定一点作用都没有,还顺带解决了模型报错的问题,最后将相似度排序,取了每个测试标题最相似的前20个训练集标题的索引保存下来。
import numpy as np
import pandas as pd
import heapq
from gensim.models import Word2Vec
from logs.config import cfg
from utils import handle_words as hw
def calc_test_data():
    """Score every test title against every train title with word2vec
    and store the indices of the 20 most similar train titles per test
    title in a 'top20' column, appended to cfg.top20_path as CSV.
    """
    model_path = cfg.model_output_path
    print('loading model from {}'.format(model_path))
    model = Word2Vec.load(model_path)  # load the trained model
    # A set gives O(1) membership tests inside clear_list; the original
    # used a list, making every vocabulary lookup O(|vocab|).
    vocab = set(model.wv.vocab.keys())
    test_data_path = cfg.test_df_path
    print('loading test data from {}'.format(test_data_path))
    test = pd.read_csv(test_data_path, encoding='gbk')
    test['top20'] = np.zeros(test.shape[0])
    train_data_path = cfg.train_df_path
    print('loading train data from {}'.format(train_data_path))
    train = pd.read_csv(train_data_path)
    stopwords_list = hw.stop_words()
    # Segment each test title once, keeping only in-vocabulary words so
    # n_similarity does not raise on out-of-vocabulary tokens.
    test_words = []
    for i in range(test.shape[0]):
        l1 = hw.jieba_fenci(test['title'][i], stopwords_list)
        test_words.append(hw.clear_list(l1, vocab))
    # Size the similarity matrix from the actual data; the original
    # hard-coded (50, 485686) and broke for any other dataset size.
    anss = np.zeros((test.shape[0], train.shape[0]))
    print(anss.shape)
    for i in range(train.shape[0]):
        l1 = hw.jieba_fenci(train['title'][i], stopwords_list)
        l1 = hw.clear_list(l1, vocab)
        for j in range(test.shape[0]):
            l2 = test_words[j]
            try:
                anss[j, i] = model.n_similarity(l1, l2)
            except (KeyError, ValueError, ZeroDivisionError):
                # Empty word lists or stray OOV tokens make n_similarity
                # fail; score those pairs as completely dissimilar
                # instead of swallowing every exception with a bare
                # except as the original did.
                anss[j, i] = 0
        if i % 1000 == 0:
            print('{} words done'.format(i))
    print('all done')
    top20_path = cfg.top20_path
    print('saving result as {}'.format(top20_path))
    for i in range(test.shape[0]):
        ans = anss[i, :]
        # Indices of the 20 highest-similarity train titles.
        top = heapq.nlargest(20, range(len(ans)), ans.__getitem__)
        # The original assigned to the undefined name `an` (NameError);
        # store the stringified index list on the test frame, matching
        # what write_result() parses back.
        test['top20'][i] = str(top)
    test.to_csv(cfg.top20_path, mode='a')  # append the result CSV
最后按条目保存了运行的结果顺便打印了一下结果看看,效果好像还不错。
import pandas as pd
from logs.config import cfg
def write_result():
    """Print each test title with its 20 most-similar train titles and
    write them to output/<n>.txt, one file per test title.
    """
    import ast  # local import: stdlib, used only for parsing below

    result_path = cfg.top20_path
    train_data_path = cfg.train_df_path
    print('loading result from {}'.format(result_path))
    result = pd.read_csv(result_path, encoding='gbk')
    print('loading train data from {}'.format(train_data_path))
    train_data = pd.read_csv(train_data_path)
    # NOTE(review): 50 test titles assumed, matching the rest of the
    # pipeline — confirm against the test CSV.
    for i in range(50):
        print('=========================================')
        print(result['title'][i])
        print('_________________________________________')
        # 'top20' holds the str() of a Python list, e.g. "[3, 14, ...]".
        # Parse it back with ast.literal_eval instead of the original's
        # manual split(',') plus bracket slicing (l[j][1:], l[j][:-1]),
        # which broke whenever the formatting shifted.
        indices = ast.literal_eval(result['top20'][i])
        with open('output/{}.txt'.format(i + 1), 'w', encoding='utf-8') as f:
            print('st')
            for idx in indices[:20]:
                title = train_data['title'][int(idx)]
                f.write(title)
                f.write('\n')
                print(title, '\n')
            print('en')
下面是一部分运行结果,单横线上是测试集,下边是找到的相似标题。