基于简单的计算sentence间的相似度,并进行page ranking实现抽取文章摘要。使用jieba库实现抽取关键词。可以有很多优化的点,后面慢慢更新吧。
#!/usr/bin/python
# encoding: UTF-8
import re
import math
import jieba
import jieba.analyse
import numpy as np
import networkx as nx
import random
import sys
class DocumentHandler:
    """Extract an abstract and keywords from a (Chinese) article.

    Sentences are ranked with TextRank: a PageRank pass over a
    sentence-similarity graph.  Keywords come from jieba's TF-IDF
    extractor.
    """

    def __init__(self, file_path):
        """Load the article at *file_path* into memory."""
        self.full_text = ''  # raw article text, filled by read_file
        self.read_file(file_path)

    def read_file(self, file_path):
        """Read the whole file (UTF-8) into ``self.full_text``.

        Uses a context manager so the handle is closed even if read()
        raises; mode 'r' (not the original 'r+') because the file is
        never written, and 'r+' fails on read-only files.
        """
        with open(file_path, 'r', encoding='UTF-8') as fi:
            self.full_text = fi.read()

    def split_sentence(self, full_text):
        """Split *full_text* into a list of non-empty sentences.

        Sentences are delimited by newlines or the Chinese full stop '。'.
        """
        return [sent for sent in re.split(u'[\n。]', full_text) if sent]

    def cal_sim(self, word_list_1, word_list_2):
        """Similarity between two sentences given as word lists.

        TextRank measure: |overlap| / (log|s1| + log|s2|), computed over
        the *distinct* words of each sentence.  Returns 0.0 when there is
        no overlap or when the denominator vanishes (e.g. two one-word
        sentences: log(1) + log(1) == 0).
        """
        word_set_1 = set(word_list_1)
        word_set_2 = set(word_list_2)
        # set intersection replaces the original O(n*m) list scan
        occur_sum = float(len(word_set_1 & word_set_2))
        if occur_sum < 1e-6:
            return 0.0
        denominator = math.log(len(word_set_1)) + math.log(len(word_set_2))
        if abs(denominator) < 1e-6:
            return 0.0
        return occur_sum / denominator

    def text_rank(self, sentences, top_num=5, pagerank_config=None):
        """Rank *sentences* with TextRank and return the *top_num* best.

        Returns a list of dicts ``{"sent", "score", "index"}`` sorted by
        descending PageRank score.  *pagerank_config* is forwarded to
        ``nx.pagerank``; the default is ``{'alpha': 0.85}`` (kept as
        ``None`` in the signature to avoid a shared mutable default).
        """
        if pagerank_config is None:
            pagerank_config = {'alpha': 0.85}
        sents_num = len(sentences)
        # tokenize every sentence with jieba
        sent_word_list = [list(jieba.cut(sent)) for sent in sentences]
        # symmetric similarity matrix; compute only the upper triangle
        sim_graph = np.zeros((sents_num, sents_num))
        for x in range(sents_num):
            for y in range(x, sents_num):
                similarity = self.cal_sim(sent_word_list[x], sent_word_list[y])
                sim_graph[x, y] = similarity
                sim_graph[y, x] = similarity
        # from_numpy_matrix was removed in networkx 3.0; from_numpy_array
        # exists since networkx 2.0 and is the supported spelling
        nx_graph = nx.from_numpy_array(sim_graph)
        scores = nx.pagerank(nx_graph, **pagerank_config)
        sorted_scores = sorted(scores.items(), key=lambda item: item[1],
                               reverse=True)
        # slice before building the dicts -- no need to materialize them all
        return [{"sent": sentences[index], "score": score, "index": index}
                for index, score in sorted_scores[:top_num]]

    def extract_abstracts(self, full_text, top_num=5):
        """Return the *top_num* top-ranked sentences in document order."""
        sents = self.split_sentence(full_text)
        rank_res = self.text_rank(sents, top_num=top_num)
        # restore the original sentence order so the abstract reads naturally
        return sorted(rank_res, key=lambda x: x['index'])

    def getAbstract(self, top_num=5):
        """Build the abstract: top sentences joined with '。'."""
        res = self.extract_abstracts(self.full_text, top_num=top_num)
        # str.join avoids the quadratic '+=' concatenation of the original
        abstract = "".join(content["sent"] + "。" for content in res)
        return abstract.strip()

    def getKeywords(self, top_num=5):
        """Return up to *top_num* keywords via jieba's TF-IDF extractor.

        Tokens of 5+ characters are dropped, matching the original filter
        (presumably to discard mis-segmented long spans -- TODO confirm).
        """
        keywords = jieba.analyse.extract_tags(self.full_text)
        return [keyword for keyword in keywords if len(keyword) < 5][:top_num]
# main processor
def main(file_path):
    """Entry point: print the abstract and keywords of the article at *file_path*."""
    handler = DocumentHandler(file_path)
    print(handler.getAbstract())
    print(handler.getKeywords())
if __name__ == '__main__':
    # Require exactly one argument: the path of the plain-text file to process.
    if len(sys.argv) < 2:
        print('Usage: python digist_keyword.py <file path>')
        sys.exit()
    main(sys.argv[1])
使用百度百科的“百度”词条进行了测试,测试结果如下:
从创立之初,百度便将“让人们最平等便捷地获取信息,找到所求”作为自己的使命,成立以来,公司秉承“用户至上”的理念,不断坚持技术创新,致力于为用户提供“简单可依赖”的互联网搜索产品及服务,其中包括:以网络搜索为主的功能性搜索;以贴吧为主的社区搜索,针对各区域、行业所需的垂直搜索;以及门户频道、IM等,全面覆盖了中文网络世界所有的搜索需求。作为一家以技术为信仰的高科技公司,百度将技术创新作为立身之本,着力于互联网核心技术突破与人才培养,在搜索、人工智能、云计算、大数据等技术领域处于全球领先水平。百度是用户获取信息的最主要入口,随着移动互联网的发展,百度网页搜索完成了由PC向移动的转型,由连接人与信息扩展到连接人与服务,用户可以在PC、Pad、手机上访问百度主页,通过文字、语音、图像多种交互方式瞬间找到所需要的信息和服务。作为百度旗下核心产品,hao123及时收录包括音乐、视频、小说、游戏等热门分类的网站,与搜索完美结合,为中国互联网用户提供最简单便捷的网上导航服务,重新定义了上网导航的概念。百度商业服务是原有的百度推广(以搜索推广为主)的基础上,将数据产品、交易产品、媒体产品、信用产品和咨询服务进行了深度的整合, 并已将咨询服务、百度内容联盟加入到整体的商业服务框架中来。
['百度', '搜索', '服务', '用户', '互联网']