gensim文档:

https://radimrehurek.com/gensim/tutorial.html

安装

pip install gensim

代码示例

# -*- coding: utf-8 -*-

import logging

import jieba
from gensim import corpora, models, similarities

# Configure the root logger to emit DEBUG-level messages and above.
logging.basicConfig(level=logging.DEBUG)
# Set jieba's own log level to INFO (presumably to keep its debug output
# out of the DEBUG stream enabled above).
jieba.setLogLevel(logging.INFO)


class DocumentSimilar(object):
    """TF-IDF similarity index over a collection of documents.

    On construction the documents are segmented with jieba, converted to a
    gensim bag-of-words corpus, weighted with a TF-IDF model and indexed with
    ``MatrixSimilarity``. ``get_similar`` then scores a new document against
    every indexed document.
    """

    # Tokens discarded after segmentation. Defined once on the class so the
    # set is not rebuilt on every ``split_word`` call.
    STOP_WORDS = frozenset({":", "的", ",", "”"})

    def __init__(self, documents):
        """Store ``documents`` and build the similarity index immediately.

        :param documents: collection of text documents; each one is passed
            to ``jieba.cut`` when the index is built.
        """
        self.documents = documents
        self.dictionary = None
        self.tfidf = None
        self.similar_matrix = None
        self.calculate_similar_matrix()

    @staticmethod
    def split_word(document):
        """Segment ``document`` into words and remove stop words.

        :param document: text to segment with ``jieba.cut``.
        :return: list of tokens in segmentation order, without stop words.
        """
        text = [
            word
            for word in jieba.cut(document)
            if word not in DocumentSimilar.STOP_WORDS
        ]

        logging.debug(text)

        return text

    def calculate_similar_matrix(self):
        """Build the dictionary, TF-IDF model and similarity index.

        Populates ``self.dictionary``, ``self.tfidf`` and
        ``self.similar_matrix`` from ``self.documents``.
        """
        words = [self.split_word(document) for document in self.documents]

        self.dictionary = corpora.Dictionary(words)
        corpus = [self.dictionary.doc2bow(word) for word in words]
        self.tfidf = models.TfidfModel(corpus)
        corpus_tfidf = self.tfidf[corpus]
        # Pin the vector width to the vocabulary size rather than letting
        # gensim infer it by scanning the corpus (which it warns about).
        self.similar_matrix = similarities.MatrixSimilarity(
            corpus_tfidf, num_features=len(self.dictionary)
        )

    def get_similar(self, document):
        """Compute the similarity of ``document`` to each indexed document.

        :param document: text to compare with the corpus.
        :return: the result of querying the similarity index with the
            TF-IDF vector of ``document``; it is iterated in parallel with
            the documents by the demo script in this file.
        """
        words = self.split_word(document)
        bow = self.dictionary.doc2bow(words)
        return self.similar_matrix[self.tfidf[bow]]


if __name__ == '__main__':
    # Sample corpus used to build the similarity index.
    documents = [
        "货运物流供应商Flexport完成10亿美元融资",
        "一笔300亿并购落地,一个新游戏帝国崛起",
        "讯轻科技”累计完成近千万元融资",
        "窝趣公寓完成近2亿元B轮融资主打品质和轻松社交的居住环境",
        "IBM的区块链副总裁JesseLund:比特币将达到100万美元",
    ]
    # The document to compare against the corpus.
    new_doc = "窝趣公寓完成近2亿元B轮融资"

    doc_similar = DocumentSimilar(documents)
    scores = doc_similar.get_similar(new_doc)

    # Print each similarity score next to the document it was computed for.
    for value, document in zip(scores, documents):
        print("{:.2f}".format(value), document)

输出结果

0.03 货运物流供应商Flexport完成10亿美元融资
0.00 一笔300亿并购落地,一个新游戏帝国崛起
0.10 讯轻科技”累计完成近千万元融资
0.66 窝趣公寓完成近2亿元B轮融资主打品质和轻松社交的居住环境
0.00 IBM的区块链副总裁JesseLund:比特币将达到100万美元


参考:

  1. python使用gensim进行文本相似度计算