Python3.6 word批量转换为txt提取

原创

betterbertter 2021-08-29 10:10:05 博主文章分类：python ©著作权

文章标签 python txt文件遍历文件另存为 python库 文章分类 代码人生

©著作权归作者所有：来自51CTO博客作者betterbertter的原创作品，请联系作者获取转载授权，否则将追究法律责任

1.流程：批量读取文件夹下文件，批量转换word为txt文件，读取txt文件内容

2.word文件放入： D:\jianli ，文件夹下放入一个word文件

Python3.6 word批量转换为txt提取_txt文件

代码如下：

注意导入库

import os
import re
import sys
import psutil
import win32com.client as wc
import configparser
import time

# -*- coding:utf-8 -*-

import os
import re
import sys
import psutil
import win32com.client as wc
import configparser
import time

'''自动简历解析 Python脚本'''


# 关闭 wps 进程
def printPids():
    """Force-close WPS if a ``wps.exe`` process is currently running.

    Walks the process ids reported by ``psutil`` and, on the first process
    whose name is ``wps.exe``, runs ``taskkill /F /IM wps.exe``. The ``/IM``
    switch targets processes by image name rather than by pid, so the
    command is issued once and the scan stops, instead of being repeated
    for every matching process.

    Any error raised while inspecting a single process is printed and that
    process is skipped.
    """
    for pid in psutil.pids():
        try:
            name = psutil.Process(pid).name()
        except Exception as e:
            # Best effort: the pid list is a snapshot, so a process may no
            # longer be inspectable by the time it is queried.
            print(e)
            continue
        if name == 'wps.exe':
            print("关闭wps.exe")
            os.system('taskkill /F /IM wps.exe')
            return


def getWordPath(path1, path2):
    """Save the Word document at ``path1`` as a text file at ``path2``.

    Opens the document through Word COM automation and calls ``SaveAs``
    with file format ``2`` (plain text in Word's ``WdSaveFormat``
    enumeration; ``12`` would be docx). The document and the Word
    application are always closed afterwards, even when opening or saving
    fails.

    Args:
        path1: Path of the source document. Per the original author's note
            this must be a full absolute path written with backslashes;
            relative paths and forward slashes were not found by Word.
        path2: Path of the file to write.
    """
    word = wc.Dispatch("Word.Application")
    try:
        doc = word.Documents.Open(path1)
        try:
            doc.SaveAs(path2, 2, False, "", True, "", False, False, False,
                       False)
        finally:
            # SaveAs leaves the saved file open in Word, so it must be
            # closed explicitly. False = do not save again / do not prompt.
            doc.Close(False)
    finally:
        # Close and Quit are methods: without the call parentheses they
        # were never invoked and the Word instance was left running.
        word.Quit()


# 遍历文件夹
def walkFile(file):
    """Collect every ``.doc`` file below the directory ``file``.

    The directory tree is walked recursively. A file is selected when its
    own name ends with ``.doc``; ``.docx`` files and names that merely
    contain ``.doc`` somewhere (``a.doc.bak``, or any file inside a folder
    such as ``old.docs``) are ignored. Each selected path is printed.

    Args:
        file: Root directory to search.

    Returns:
        A list of one-element lists, ``[[path], ...]``, one per match.
    """
    file_list = []
    for root, _dirs, files in os.walk(file):
        for f in files:
            # Test the file name suffix, not the whole path, so directory
            # names can neither cause nor prevent a match.
            if not f.endswith(".doc"):
                continue
            pathtem = os.path.join(root, f)
            print(pathtem)
            file_list.append([pathtem])
    return file_list


# 删除指定路径下的所有文件
def removeFilesByPath(file):
    """Delete every file found below the directory ``file``.

    The tree is walked recursively and each file is removed; the
    directories themselves are left in place.
    """
    for folder, _subdirs, names in os.walk(file):
        for name in names:
            os.remove(os.path.join(folder, name))


# 转换 doc 为 txt 文件，以便 Python 读取解析
def fileDocToDocx(path1, path2, file_list):
    """Convert each listed Word file to a ``.txt`` file under ``path2``.

    Despite the name, the output is text: every source path has its
    leading ``path1`` prefix replaced by ``path2`` and its extension
    replaced by ``.txt``, and the file is then converted with
    ``getWordPath``. Only the leading prefix and the real extension are
    rewritten, so folder names containing ``.docx`` or repeating the
    ``path1`` string no longer corrupt the output path.

    Args:
        path1: Root prefix of the source paths.
        path2: Prefix to put in front of the generated ``.txt`` paths.
        file_list: List of one-element lists ``[[source_path], ...]``.

    Returns:
        A list of one-element lists ``[[txt_path], ...]`` in input order.
    """
    txtFilesList = []
    for entry in file_list:
        source = entry[0]
        # Strip path1 only where it is the leading prefix.
        if source.startswith(path1):
            relative = source[len(path1):]
        else:
            relative = source
        stem, _ext = os.path.splitext(relative)
        target = path2 + stem + ".txt"
        print(source, target)
        txtFilesList.append([target])
        # Save the Word document as text at the computed location.
        getWordPath(source, target)
    return txtFilesList


# 正则表达式 提取中文
def translate_zh(str):
    """Return only the Chinese text of ``str``, runs joined by commas.

    Characters outside the range U+4E00..U+9FA5 act as separators; each
    remaining run of consecutive Chinese characters becomes one item of the
    comma-separated result. An input without Chinese characters yields an
    empty string.
    """
    # Splitting on every non-Chinese character leaves the Chinese runs plus
    # empty strings between adjacent separators; drop the empty ones.
    pieces = re.split(r'[^\u4e00-\u9fa5]', str)
    return ",".join(filter(None, pieces))


# 读取text文本内容
def readTxt(filePath, encoding=None):
    """Read a text file and return its content as one compacted string.

    Every line has its spaces and newline removed. A line that is exactly
    four characters long after that is prefixed with ``####``. The lines
    are then concatenated and each ``????`` in the result is shortened to
    ``??``.

    Args:
        filePath: Path of the text file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform's default encoding, as before.

    Returns:
        The processed content as a single string.
    """
    parts = []
    # The with-block closes the file even if reading or decoding fails.
    with open(filePath, mode='r', encoding=encoding) as fileobj:
        for line in fileobj:
            line = line.replace(" ", "").replace("\n", "")
            if len(line) == 4:
                line = "####" + line
            parts.append(line)
    return "".join(parts).replace("????", "??")


# 解析word 智联数据
if __name__ == '__main__':
    # Source folder and output folder (currently the same location).
    path1 = "D:\\jianli\\"
    path2 = "D:\\jianli\\"
    # Find every .doc file, then convert each one to a .txt file.
    txtxFilesList = fileDocToDocx(path1, path2, walkFile(path1))
    print("待解析文件集合大小:", len(txtxFilesList))
    # Pause, close any running wps.exe, then pause again before reading
    # the generated text files.
    time.sleep(1)
    printPids()
    time.sleep(2)
    # Read each generated text file and print its content; a failure on
    # one file is reported and the loop moves on to the next.
    for entry in txtxFilesList:
        try:
            filePath = entry[0]
            print("文件路径-----：", filePath)
            print("文档内容", readTxt(filePath))
        except Exception as err:
            print("解析异常：", filePath, err)

文档存放路径可以自行修改，运行代码后，同级目录下会出现 xxx.txt 文件

Python3.6 word批量转换为txt提取_遍历文件_02