I. Background

The operations team raised a request: count up the words that appear most frequently in our articles.

A simple one-sentence request, and some plain, everyday code to go with it...

The article ids are known, the article content lives in MySQL, and the resulting high-frequency word counts just need to be saved to an Excel file.

II. Step-by-step breakdown

1 Connect to MySQL and query the data

This requires the third-party library pymysql (if you don't have it, run pip install pymysql).

The database connection settings and the query SQL below are placeholders; adjust them to your own environment.

from pprint import pprint
import pymysql


def execute_sql(dbinfo, sql):
    conn = pymysql.connect(**dbinfo)  # connect to the database
    cursor = conn.cursor()  # create a cursor
    cursor.execute(sql)  # execute the SQL statement
    result = cursor.fetchall()  # fetch all result rows
    conn.close()  # close the connection
    return result


if __name__ == "__main__":
    # database connection settings
    db_info = {
        'host': "192.168.1.1",
        'port': 3306,
        'user': "test",
        'passwd': "test",
        'db': "test_db",
        'charset': "utf8"
    }
    # query SQL
    s = 'select content from article where id in (1,2,3);'
    # run the query to fetch the raw data
    datas = execute_sql(db_info, s)
    pprint(datas)

Output:

(screenshot of the query result omitted)
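A small aside: the id list is hard-coded in the SQL string above, which is fine here. If the ids ever come from outside, it is safer to let pymysql fill in placeholders itself. A minimal sketch of such a variant (the helper name execute_sql_params is my own, not part of the original script):

import pymysql


def execute_sql_params(dbinfo, sql, params=None):
    conn = pymysql.connect(**dbinfo)
    try:
        cursor = conn.cursor()
        cursor.execute(sql, params)  # pymysql fills the %s placeholders safely
        return cursor.fetchall()
    finally:
        conn.close()  # close the connection even if the query fails


# usage, assuming the db_info dict from above:
# execute_sql_params(db_info,
#                    'select content from article where id in (%s, %s, %s);',
#                    (1, 2, 3))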

2 Segment the text and count high-frequency words

This requires installing and importing the third-party library jieba; jieba.lcut() performs the word segmentation.

collections.Counter().most_common() then extracts the high-frequency words.

Note: the data returned by the MySQL query above is a two-dimensional tuple (a tuple of one-element row tuples).
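To make that shape concrete, here is a tiny illustration (the sample strings below are made up, not real query output):

# fetchall() returns one tuple per row, so a single-column query
# yields a tuple of 1-element tuples:
datas = (("text of article 1 ...",), ("text of article 2 ...",))
# a nested comprehension flattens it, as the main block below does:
texts = [v for d in datas for v in d]
print(texts)  # ['text of article 1 ...', 'text of article 2 ...']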

from collections import Counter
from pprint import pprint
import jieba


def high_frequency_words(text, num):
    """
    Count how often each word appears in the text.
    :param text: the text to analyse
    :param num: how many of the most frequent words to return
    :return: a list of (word, count) tuples
    """
    # segment with jieba and keep only words longer than one character
    words = [word for word in jieba.lcut(text) if len(word) > 1]
    # count the words and return the num most frequent ones
    return Counter(words).most_common(num)


if __name__ == "__main__":
    # run the query (db_info and s as defined in step 1)
    datas = execute_sql(db_info, s)
    # count the top 10 high-frequency words for each article
    results = [high_frequency_words(v, 10) for d in datas for v in d]
    pprint(results)

Output:

(screenshot of the word-count output omitted)

Problem: the high-frequency words include symbols such as "\r\n":

The text content returned from MySQL contains special sequences such as "\r\n" that we don't want in the results.

Solution:

Exclude them during jieba.lcut() segmentation.

from collections import Counter
from pprint import pprint
import jieba


def high_frequency_words(text, num, exclusionary):
    """
    Count how often each word appears in the text.
    :param text: the text to analyse
    :param num: how many of the most frequent words to return
    :param exclusionary: a collection of words to exclude
    :return: a list of (word, count) tuples
    """
    # segment with jieba, keep only words longer than one character,
    # and drop anything in the exclusion collection
    words = [word for word in jieba.lcut(text) if len(word) > 1 and word not in exclusionary]
    # count the words and return the num most frequent ones
    return Counter(words).most_common(num)


if __name__ == "__main__":
    # run the query (db_info and s as defined in step 1)
    datas = execute_sql(db_info, s)
    # count the top 10 high-frequency words, excluding "\r\n"
    results = [high_frequency_words(v, 10, ['\r\n']) for d in datas for v in d]
    pprint(results)

Output:

(screenshot of the cleaned word-count output omitted)
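An alternative worth mentioning (a sketch of my own, not what this article does): normalise the text before segmentation, so the exclusion list stays empty or small.

import re


def clean_text(text):
    # collapse runs of whitespace (including "\r\n") into single spaces
    return re.sub(r"\s+", " ", text)


# usage: high_frequency_words(clean_text(v), 10, [])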

3 Export the results to Excel

The previous steps produced the high-frequency words we want; to make them easy for the operations team to review, we save the results to an Excel file with the xlwt module.

This requires installing and importing the third-party library xlwt.

Each article gets its own worksheet, and all the results are saved into a single Excel file.

import datetime
import xlwt


def to_excel(result, file_name=None):
    ids = [1, 2, 3]  # article id list, matching the query
    heads = ['word', 'count']  # header row
    sj = datetime.datetime.now().strftime("%Y-%m-%d %H%M%S")  # current timestamp
    wb = xlwt.Workbook(encoding='utf-8')  # create the workbook
    # one worksheet per article
    for index, value in enumerate(result):
        ws = wb.add_sheet(str(ids[index]), cell_overwrite_ok=True)
        # write the header row
        for col, head in enumerate(heads):
            ws.write(0, col, head)
        print(f"Header written: {heads}")
        # write one (word, count) row per entry
        for idx, val in enumerate(value):
            for col, d in enumerate(val):
                ws.write(idx + 1, col, d)
        print(f"Sheet written: {len(value)} rows")
    # save the workbook; note that xlwt writes the legacy .xls format,
    # so use the .xls extension rather than .xlsx
    name = f"{sj}.xls" if file_name is None else f"{file_name}{sj}.xls"
    wb.save(name)
    print(f"Excel file saved: [{name}]")
    print("=" * 100)

        
if __name__ == "__main__":
    # run the query (db_info and s as defined in step 1)
    datas = execute_sql(db_info, s)
    # count the top 10 high-frequency words, excluding "\r\n"
    results = [high_frequency_words(v, 10, ['\r\n']) for d in datas for v in d]
    # export the results to Excel
    to_excel(results)

Output:

(console log and a screenshot of the generated Excel file omitted)
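If a genuine .xlsx file is required, the openpyxl library can replace xlwt; here is a minimal sketch under that assumption (to_xlsx is my own helper name, not part of the original script):

from openpyxl import Workbook


def to_xlsx(result, ids, file_name="hfw.xlsx"):
    wb = Workbook()
    wb.remove(wb.active)  # drop the default empty sheet
    for article_id, rows in zip(ids, result):
        ws = wb.create_sheet(title=str(article_id))
        ws.append(['word', 'count'])  # header row
        for word, count in rows:
            ws.append([word, count])  # one row per (word, count) tuple
    wb.save(file_name)


# usage: to_xlsx(results, [1, 2, 3])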

III. Complete code

import datetime
from collections import Counter
import jieba
import pymysql
import xlwt


def execute_sql(dbinfo, sql):
    conn = pymysql.connect(**dbinfo)  # connect to the database
    cursor = conn.cursor()  # create a cursor
    cursor.execute(sql)  # execute the SQL statement
    result = cursor.fetchall()  # fetch all result rows
    conn.close()  # close the connection
    return result


def high_frequency_words(text, num, exclusionary):
    """
    Count how often each word appears in the text.
    :param text: the text to analyse
    :param num: how many of the most frequent words to return
    :param exclusionary: a collection of words to exclude
    :return: a list of (word, count) tuples
    """
    # segment with jieba, keep only words longer than one character,
    # and drop anything in the exclusion collection
    words = [word for word in jieba.lcut(text) if len(word) > 1 and word not in exclusionary]
    # count the words and return the num most frequent ones
    return Counter(words).most_common(num)


def to_excel(result, file_name=None):
    ids = [1, 2, 3]  # article id list, matching the query
    heads = ['word', 'count']  # header row
    sj = datetime.datetime.now().strftime("%Y-%m-%d %H%M%S")  # current timestamp
    wb = xlwt.Workbook(encoding='utf-8')  # create the workbook
    # one worksheet per article
    for index, value in enumerate(result):
        ws = wb.add_sheet(str(ids[index]), cell_overwrite_ok=True)
        # write the header row
        for col, head in enumerate(heads):
            ws.write(0, col, head)
        print(f"Header written: {heads}")
        # write one (word, count) row per entry
        for idx, val in enumerate(value):
            for col, d in enumerate(val):
                ws.write(idx + 1, col, d)
        print(f"Sheet written: {len(value)} rows")
    # save the workbook; note that xlwt writes the legacy .xls format,
    # so use the .xls extension rather than .xlsx
    name = f"{sj}.xls" if file_name is None else f"{file_name}{sj}.xls"
    wb.save(name)
    print(f"Excel file saved: [{name}]")
    print("=" * 100)


if __name__ == "__main__":
    # database connection settings
    db_info = {
        'host': "192.168.1.1",
        'port': 3306,
        'user': "test",
        'passwd': "test",
        'db': "test_db",
        'charset': "utf8"
    }
    # query SQL
    s = 'select content from article where id in (1,2,3);'
    # run the query to fetch the raw data
    datas = execute_sql(db_info, s)
    # count the top 10 high-frequency words, excluding "\r\n"
    results = [high_frequency_words(v, 10, ['\r\n']) for d in datas for v in d]
    # export the results to Excel
    to_excel(results)
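One last usage note: to_excel takes an optional file-name prefix, so the final call could also be written like this (the prefix string is just an example):

# saves as "<prefix><timestamp>.xls"
to_excel(results, file_name="article_hfw_")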