I. Background

The operations team raised a request: count up the words that appear most frequently in our articles.

A simple one-sentence request, and some plain, everyday code to go with it...

The article ids are known, the article content lives in MySQL, and the resulting high-frequency word counts just need to be saved to an Excel file.

II. Step-by-step breakdown

1 Connect to MySQL and query the data

This requires the third-party library pymysql (if you don't have it, run pip install pymysql).

The database connection settings and the query SQL below are placeholders; adjust them to your own environment.

from pprint import pprint
import pymysql


def execute_sql(dbinfo, sql):
    conn = pymysql.connect(**dbinfo)  # connect to the database
    cursor = conn.cursor()  # create a cursor
    cursor.execute(sql)  # execute the SQL statement
    result = cursor.fetchall()  # fetch all result rows
    conn.close()  # close the connection
    return result


if __name__ == "__main__":
    # database connection settings
    db_info = {
        'host': "192.168.1.1",
        'port': 3306,
        'user': "test",
        'passwd': "test",
        'db': "test_db",
        'charset': "utf8"
    }
    # query SQL
    s = 'select content from article where id in (1,2,3);'
    # run the query to fetch the raw data
    datas = execute_sql(db_info, s)
    pprint(datas)

Output:

(screenshot of the query result omitted)
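A small aside: the id list is hard-coded in the SQL string above, which is fine here. If the ids ever come from outside, it is safer to let pymysql fill in placeholders itself. A minimal sketch of such a variant (the helper name execute_sql_params is my own, not part of the original script):

import pymysql


def execute_sql_params(dbinfo, sql, params=None):
    conn = pymysql.connect(**dbinfo)
    try:
        cursor = conn.cursor()
        cursor.execute(sql, params)  # pymysql fills the %s placeholders safely
        return cursor.fetchall()
    finally:
        conn.close()  # close the connection even if the query fails


# usage, assuming the db_info dict from above:
# execute_sql_params(db_info,
#                    'select content from article where id in (%s, %s, %s);',
#                    (1, 2, 3))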

2 Segment the text and count high-frequency words

This requires installing and importing the third-party library jieba; jieba.lcut() performs the word segmentation.

collections.Counter().most_common() then extracts the high-frequency words.

Note: the data returned by the MySQL query above is a two-dimensional tuple (a tuple of one-element row tuples).
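To make that shape concrete, here is a tiny illustration (the sample strings below are made up, not real query output):

# fetchall() returns one tuple per row, so a single-column query
# yields a tuple of 1-element tuples:
datas = (("text of article 1 ...",), ("text of article 2 ...",))
# a nested comprehension flattens it, as the main block below does:
texts = [v for d in datas for v in d]
print(texts)  # ['text of article 1 ...', 'text of article 2 ...']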

from collections import Counter
from pprint import pprint
import jieba


def high_frequency_words(text, num):
    """
    Count how often each word appears in the text.
    :param text: the text to analyse
    :param num: how many of the most frequent words to return
    :return: a list of (word, count) tuples
    """
    # segment with jieba and keep only words longer than one character
    words = [word for word in jieba.lcut(text) if len(word) > 1]
    # count the words and return the num most frequent ones
    return Counter(words).most_common(num)


if __name__ == "__main__":
    # run the query (db_info and s as defined in step 1)
    datas = execute_sql(db_info, s)
    # count the top 10 high-frequency words for each article
    results = [high_frequency_words(v, 10) for d in datas for v in d]
    pprint(results)

Output:

(screenshot of the word-count output omitted)

Problem: the high-frequency words include symbols such as "\r\n":

The text content returned from MySQL contains special sequences such as "\r\n" that we don't want in the results.

Solution:

Exclude them during jieba.lcut() segmentation.

from collections import Counter
from pprint import pprint
import jieba


def high_frequency_words(text, num, exclusionary):
    """
    Count how often each word appears in the text.
    :param text: the text to analyse
    :param num: how many of the most frequent words to return
    :param exclusionary: a collection of words to exclude
    :return: a list of (word, count) tuples
    """
    # segment with jieba, keep only words longer than one character,
    # and drop anything in the exclusion collection
    words = [word for word in jieba.lcut(text) if len(word) > 1 and word not in exclusionary]
    # count the words and return the num most frequent ones
    return Counter(words).most_common(num)


if __name__ == "__main__":
    # run the query (db_info and s as defined in step 1)
    datas = execute_sql(db_info, s)
    # count the top 10 high-frequency words, excluding "\r\n"
    results = [high_frequency_words(v, 10, ['\r\n']) for d in datas for v in d]
    pprint(results)

Output:

(screenshot of the cleaned word-count output omitted)
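An alternative worth mentioning (a sketch of my own, not what this article does): normalise the text before segmentation, so the exclusion list stays empty or small.

import re


def clean_text(text):
    # collapse runs of whitespace (including "\r\n") into single spaces
    return re.sub(r"\s+", " ", text)


# usage: high_frequency_words(clean_text(v), 10, [])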

3 Export the results to Excel

The previous steps produced the high-frequency words we want; to make them easy for the operations team to review, we save the results to an Excel file with the xlwt module.

This requires installing and importing the third-party library xlwt.

Each article gets its own worksheet, and all the results are saved into a single Excel file.

import datetime
import xlwt


def to_excel(result, file_name=None):
    ids = [1, 2, 3]  # article id list, matching the query
    heads = ['word', 'count']  # header row
    sj = datetime.datetime.now().strftime("%Y-%m-%d %H%M%S")  # current timestamp
    wb = xlwt.Workbook(encoding='utf-8')  # create the workbook
    # one worksheet per article
    for index, value in enumerate(result):
        ws = wb.add_sheet(str(ids[index]), cell_overwrite_ok=True)
        # write the header row
        for col, head in enumerate(heads):
            ws.write(0, col, head)
        print(f"Header written: {heads}")
        # write one (word, count) row per entry
        for idx, val in enumerate(value):
            for col, d in enumerate(val):
                ws.write(idx + 1, col, d)
        print(f"Sheet written: {len(value)} rows")
    # save the workbook; note that xlwt writes the legacy .xls format,
    # so use the .xls extension rather than .xlsx
    name = f"{sj}.xls" if file_name is None else f"{file_name}{sj}.xls"
    wb.save(name)
    print(f"Excel file saved: [{name}]")
    print("=" * 100)

        
if __name__ == "__main__":
    # run the query (db_info and s as defined in step 1)
    datas = execute_sql(db_info, s)
    # count the top 10 high-frequency words, excluding "\r\n"
    results = [high_frequency_words(v, 10, ['\r\n']) for d in datas for v in d]
    # export the results to Excel
    to_excel(results)

Output:

(console log and a screenshot of the generated Excel file omitted)
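If a genuine .xlsx file is required, the openpyxl library can replace xlwt; here is a minimal sketch under that assumption (to_xlsx is my own helper name, not part of the original script):

from openpyxl import Workbook


def to_xlsx(result, ids, file_name="hfw.xlsx"):
    wb = Workbook()
    wb.remove(wb.active)  # drop the default empty sheet
    for article_id, rows in zip(ids, result):
        ws = wb.create_sheet(title=str(article_id))
        ws.append(['word', 'count'])  # header row
        for word, count in rows:
            ws.append([word, count])  # one row per (word, count) tuple
    wb.save(file_name)


# usage: to_xlsx(results, [1, 2, 3])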

III. Complete code

import datetime
from collections import Counter
import jieba
import pymysql
import xlwt


def execute_sql(dbinfo, sql):
    conn = pymysql.connect(**dbinfo)  # connect to the database
    cursor = conn.cursor()  # create a cursor
    cursor.execute(sql)  # execute the SQL statement
    result = cursor.fetchall()  # fetch all result rows
    conn.close()  # close the connection
    return result


def high_frequency_words(text, num, exclusionary):
    """
    Count how often each word appears in the text.
    :param text: the text to analyse
    :param num: how many of the most frequent words to return
    :param exclusionary: a collection of words to exclude
    :return: a list of (word, count) tuples
    """
    # segment with jieba, keep only words longer than one character,
    # and drop anything in the exclusion collection
    words = [word for word in jieba.lcut(text) if len(word) > 1 and word not in exclusionary]
    # count the words and return the num most frequent ones
    return Counter(words).most_common(num)


def to_excel(result, file_name=None):
    ids = [1, 2, 3]  # article id list, matching the query
    heads = ['word', 'count']  # header row
    sj = datetime.datetime.now().strftime("%Y-%m-%d %H%M%S")  # current timestamp
    wb = xlwt.Workbook(encoding='utf-8')  # create the workbook
    # one worksheet per article
    for index, value in enumerate(result):
        ws = wb.add_sheet(str(ids[index]), cell_overwrite_ok=True)
        # write the header row
        for col, head in enumerate(heads):
            ws.write(0, col, head)
        print(f"Header written: {heads}")
        # write one (word, count) row per entry
        for idx, val in enumerate(value):
            for col, d in enumerate(val):
                ws.write(idx + 1, col, d)
        print(f"Sheet written: {len(value)} rows")
    # save the workbook; note that xlwt writes the legacy .xls format,
    # so use the .xls extension rather than .xlsx
    name = f"{sj}.xls" if file_name is None else f"{file_name}{sj}.xls"
    wb.save(name)
    print(f"Excel file saved: [{name}]")
    print("=" * 100)


if __name__ == "__main__":
    # database connection settings
    db_info = {
        'host': "192.168.1.1",
        'port': 3306,
        'user': "test",
        'passwd': "test",
        'db': "test_db",
        'charset': "utf8"
    }
    # query SQL
    s = 'select content from article where id in (1,2,3);'
    # run the query to fetch the raw data
    datas = execute_sql(db_info, s)
    # count the top 10 high-frequency words, excluding "\r\n"
    results = [high_frequency_words(v, 10, ['\r\n']) for d in datas for v in d]
    # export the results to Excel
    to_excel(results)
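One last usage note: to_excel takes an optional file-name prefix, so the final call could also be written like this (the prefix string is just an example):

# saves as "<prefix><timestamp>.xls"
to_excel(results, file_name="article_hfw_")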