一、需求背景
运营同学提出:需要将文章当中出现频率高的词语统计出来....
简简单单一句话,平平淡淡写代码....
文章id是已知的,且文章内容保存在MySQL中,统计的高频词结果保存在excel文件中即可
二、分步解析
1 连接MySQL查询数据
需要导入pymysql第三方库(没有安装的同学请pip install pymysql)
数据库连接配置信息和查询SQL为虚拟,请自行修改
from pprint import pprint
import pymysql
def execute_sql(dbinfo, sql):
    """Run *sql* against the database described by *dbinfo* and return all rows.

    :param dbinfo: dict of pymysql.connect() keyword arguments (host, port, ...)
    :param sql: SQL statement to execute
    :return: tuple of result rows (each row is itself a tuple of columns)
    """
    conn = pymysql.connect(**dbinfo)  # open the database connection
    try:
        # The cursor context manager closes the cursor, and the finally block
        # closes the connection, even when execute() raises — the original
        # leaked both on any query error.
        with conn.cursor() as cursor:
            cursor.execute(sql)
            return cursor.fetchall()
    finally:
        conn.close()
if __name__ == "__main__":
# Database connection settings (placeholder values — adjust for your environment)
db_info = {
'host': "192.168.1.1",
'port': 3306,
'user': "test",
'passwd': "test",
'db': "test_db",
'charset': "utf8"
}
# Query: fetch the content column of the target articles
s = 'select content from article where id in (1,2,3);'
# Run the query and fetch the raw rows
datas = execute_sql(db_info, s)
pprint(datas)
运行结果:
2 分词并统计高频词
需要安装导入jieba第三方库,使用jieba.lcut()分词
使用collections.Counter().most_common()统计出高频词
注意:前面连接MySQL查询到的数据为二维元组(嵌套元组)
from collections import Counter
from pprint import pprint
import jieba
def high_frequency_words(text, num):
    """Count word frequencies in *text* and return the top *num* words.

    :param text: text to analyse
    :param num: number of high-frequency words to report
    :return: list of (word, count) tuples, most frequent first
    """
    # jieba tokenization, keeping only words longer than one character
    # (drops single characters and most punctuation).
    words = [word for word in jieba.lcut(text) if len(word) > 1]
    # Counter.most_common() already returns a list of (word, count) pairs;
    # the original wrapped it in a pointless copying comprehension.
    return Counter(words).most_common(num)
if __name__ == "__main__":
# Run the query and fetch the raw rows (db_info and s defined earlier)
datas = execute_sql(db_info, s)
# Top-10 high-frequency words per article; rows are tuples of columns,
# so the comprehension iterates both dimensions of the result set
results = [high_frequency_words(v, 10) for d in datas for v in d]
pprint(results)
运行结果:
问题--高频词中包含“\r\n”等符号:
MySQL查询结果文本内容中包含“\r\n”等特殊符号,可是我们并不想要
解决:
jieba.lcut()分词时将其排除掉
from collections import Counter
from pprint import pprint
import jieba
def high_frequency_words(text, num, exclusionary=()):
    """Count word frequencies in *text* and return the top *num* words.

    :param text: text to analyse
    :param num: number of high-frequency words to report
    :param exclusionary: words to drop. May be a container of words, or a
        string — in which case any word that is a substring of it (e.g.
        "\\r\\n") is excluded, matching the existing call sites. Defaults
        to excluding nothing, so two-argument calls keep working.
    :return: list of (word, count) tuples, most frequent first
    """
    # jieba tokenization: keep words longer than one character and not excluded.
    words = [
        word
        for word in jieba.lcut(text)
        if len(word) > 1 and word not in exclusionary
    ]
    # Counter.most_common() already returns a list of (word, count) pairs;
    # no need to re-wrap it in a comprehension.
    return Counter(words).most_common(num)
if __name__ == "__main__":
# Run the query and fetch the raw rows (db_info and s defined earlier)
datas = execute_sql(db_info, s)
# Top-10 high-frequency words per article, excluding "\r\n" artifacts
# carried over from the MySQL text content
results = [high_frequency_words(v, 10, '\r\n') for d in datas for v in d]
pprint(results)
运行结果:
3 统计结果导出excel
前面已经统计出我们想要的高频词,为方便运营同学查看,这里使用xlwt模块将结果保存到excel中(注意:xlwt 只能生成旧版 .xls 格式,保存文件时应使用 .xls 扩展名)
需要先安装导入xlwt第三方库
根据不同的文章,新建不同的worksheet,统计结果保存到同一个excel中
import datetime
import xlwt
def to_excel(result, file_name=None, ids=None):
    """Write per-article high-frequency-word lists into one Excel workbook.

    One worksheet is created per article; each sheet gets a two-column
    header row followed by one (word, count) row per entry.

    :param result: list of per-article results, each a list of (word, count)
    :param file_name: optional filename prefix; a timestamp is always appended
    :param ids: optional sheet names (article ids); defaults to [1, 2, 3]
        as before, but can now be supplied for other articles
    """
    if ids is None:
        ids = [1, 2, 3]  # article id list (previously hard-coded)
    heads = ['词语', '频次']  # header row
    sj = datetime.datetime.now().strftime("%Y-%m-%d %H%M%S")  # timestamp
    wb = xlwt.Workbook(encoding='utf-8')  # create the workbook
    for index, value in enumerate(result):
        # Guard against more results than known ids (was an IndexError):
        # fall back to a 1-based positional sheet name.
        sheet_name = str(ids[index]) if index < len(ids) else str(index + 1)
        ws = wb.add_sheet(sheet_name, cell_overwrite_ok=True)
        # Header row
        for col, head in enumerate(heads):
            ws.write(0, col, head)
        print(f"写入表头成功!{heads}")
        # Data rows: one (word, count) pair per row, offset below the header
        for idx, val in enumerate(value):
            for col, d in enumerate(val):
                ws.write(idx + 1, col, d)
        print(f"写入表格内容成功!共{len(value)}行")
    # BUG FIX: xlwt writes the legacy BIFF (.xls) format; saving it with a
    # .xlsx extension produces a file Excel refuses to open. Use .xls.
    out = f"{sj}.xls" if file_name is None else f"{file_name + sj}.xls"
    wb.save(out)
    print(f"excel保存成功!【{out}】")
    print("=" * 100)
if __name__ == "__main__":
# Run the query and fetch the raw rows (db_info and s defined earlier)
datas = execute_sql(db_info, s)
# Top-10 high-frequency words per article, excluding "\r\n" artifacts
results = [high_frequency_words(v, 10, '\r\n') for d in datas for v in d]
# Export the statistics to an Excel workbook, one sheet per article
to_excel(results)
运行结果:
三、完整代码
import datetime
from collections import Counter
import jieba
import pymysql
import xlwt
def execute_sql(dbinfo, sql):
    """Run *sql* against the database described by *dbinfo* and return all rows.

    :param dbinfo: dict of pymysql.connect() keyword arguments (host, port, ...)
    :param sql: SQL statement to execute
    :return: tuple of result rows (each row is itself a tuple of columns)
    """
    conn = pymysql.connect(**dbinfo)  # open the database connection
    try:
        # The cursor context manager closes the cursor, and the finally block
        # closes the connection, even when execute() raises — the original
        # leaked both on any query error.
        with conn.cursor() as cursor:
            cursor.execute(sql)
            return cursor.fetchall()
    finally:
        conn.close()
def high_frequency_words(text, num, exclusionary=()):
    """Count word frequencies in *text* and return the top *num* words.

    :param text: text to analyse
    :param num: number of high-frequency words to report
    :param exclusionary: words to drop. May be a container of words, or a
        string — in which case any word that is a substring of it (e.g.
        "\\r\\n") is excluded, matching the existing call sites. Defaults
        to excluding nothing, so two-argument calls keep working.
    :return: list of (word, count) tuples, most frequent first
    """
    # jieba tokenization: keep words longer than one character and not excluded.
    words = [
        word
        for word in jieba.lcut(text)
        if len(word) > 1 and word not in exclusionary
    ]
    # Counter.most_common() already returns a list of (word, count) pairs;
    # no need to re-wrap it in a comprehension.
    return Counter(words).most_common(num)
def to_excel(result, file_name=None, ids=None):
    """Write per-article high-frequency-word lists into one Excel workbook.

    One worksheet is created per article; each sheet gets a two-column
    header row followed by one (word, count) row per entry.

    :param result: list of per-article results, each a list of (word, count)
    :param file_name: optional filename prefix; a timestamp is always appended
    :param ids: optional sheet names (article ids); defaults to [1, 2, 3]
        as before, but can now be supplied for other articles
    """
    if ids is None:
        ids = [1, 2, 3]  # article id list (previously hard-coded)
    heads = ['词语', '频次']  # header row
    sj = datetime.datetime.now().strftime("%Y-%m-%d %H%M%S")  # timestamp
    wb = xlwt.Workbook(encoding='utf-8')  # create the workbook
    for index, value in enumerate(result):
        # Guard against more results than known ids (was an IndexError):
        # fall back to a 1-based positional sheet name.
        sheet_name = str(ids[index]) if index < len(ids) else str(index + 1)
        ws = wb.add_sheet(sheet_name, cell_overwrite_ok=True)
        # Header row
        for col, head in enumerate(heads):
            ws.write(0, col, head)
        print(f"写入表头成功!{heads}")
        # Data rows: one (word, count) pair per row, offset below the header
        for idx, val in enumerate(value):
            for col, d in enumerate(val):
                ws.write(idx + 1, col, d)
        print(f"写入表格内容成功!共{len(value)}行")
    # BUG FIX: xlwt writes the legacy BIFF (.xls) format; saving it with a
    # .xlsx extension produces a file Excel refuses to open. Use .xls.
    out = f"{sj}.xls" if file_name is None else f"{file_name + sj}.xls"
    wb.save(out)
    print(f"excel保存成功!【{out}】")
    print("=" * 100)
if __name__ == "__main__":
# Database connection settings (placeholder values — adjust for your environment)
db_info = {
'host': "192.168.1.1",
'port': 3306,
'user': "test",
'passwd': "test",
'db': "test_db",
'charset': "utf8"
}
# Query: fetch the content column of the target articles
s = 'select content from article where id in (1,2,3);'
# Run the query and fetch the raw rows
datas = execute_sql(db_info, s)
# Top-10 high-frequency words per article, excluding "\r\n" artifacts
results = [high_frequency_words(v, 10, '\r\n') for d in datas for v in d]
# Export the statistics to an Excel workbook, one sheet per article
to_excel(results)