读取流程: 二进制对象 -> 解压 -> 读取 XML 文件
# -*- encoding: utf-8 -*-
from zipfile import ZipFile
from urllib import urlopen
from io import BytesIO
from bs4 import BeautifulSoup
# Alternative input: download the document and wrap the downloaded bytes
# in BytesIO so ZipFile can read them from memory instead of from disk.
# url = "http://www.pythonscraping.com/pages/AWordDocument.docx"
# word_file = urlopen(url).read()
# word_file = BytesIO(word_file)

# The .docx is opened as a ZIP archive and the "word/document.xml" member
# is read from it. Both handles are closed as soon as the bytes are read.
with open("AWordDocument.docx", "rb") as word_file:
    with ZipFile(word_file) as document:
        xml_content = document.read("word/document.xml")

# The member bytes are decoded as UTF-8; the output is the raw XML markup.
text = xml_content.decode("utf-8")
# Call form of print works the same on Python 2 and Python 3 here.
print(text)
方式二: 使用 python-docx 库
解析出文本内容
安装: pip install python-docx
import docx
# Open the document with python-docx (the `docx` module imported above).
doc = docx.Document("AWordDocument.docx")
# Prints the Document object itself, not its text content.
print(doc)
# Print the text of every paragraph.
for p in doc.paragraphs:
    print(p.text)
更多内容参考
官方文档:http://python-docx.readthedocs.io/en/latest/index.html