公共文件 common.py
所有文件路径都集中定义在这个文件里,其他脚本统一从这里读取。
def filePath():
    """Return the hard-coded path of the Word document to process."""
    return 'D:\\document\\test.docx'
def wordImagesSavePath():
    """Return the hard-coded directory where files (images) extracted from Word are saved.

    The returned string ends with a path separator, so callers can append a
    file name directly.
    """
    return 'D:\\image\\'
Doc 转 Docx(需Windows操作系统)
from win32com import client
def doc2docx(fn, prog_id="kwps.Application"):
    """Convert a ``.doc`` file to ``.docx`` through an office COM application.

    The converted file is written next to the source, with ``x`` appended to
    the original name (``report.doc`` -> ``report.docx``).

    Args:
        fn: Path of the ``.doc`` file to convert.
        prog_id: COM ProgID of the office application to drive. It has to
            match the installed office suite: ``"Word.Application"`` for
            Microsoft Office, ``"wps.Application"`` for WPS and
            ``"kwps.Application"`` (the default) for the WPS preview edition.
    """
    word = client.Dispatch(prog_id)
    try:
        doc = word.Documents.Open(fn)
        try:
            # 12 is the save-format code passed to SaveAs
            # (wdFormatXMLDocument in Word's WdSaveFormat enumeration).
            doc.SaveAs("{}x".format(fn), 12)
        finally:
            # Close the document even when saving fails.
            doc.Close()
    finally:
        # Always shut the application down so no background process is left.
        word.Quit()
| 版本 | 程序 |
| --- | --- |
| Microsoft Office | Word.Application |
| Wps | wps.Application |
| Wps 抢先版 | kwps.Application |
解析 Docx 样式
from docx import Document
def getContent(path):
    """Return the text of a ``.docx`` file together with its formatting.

    Args:
        path: Path of the document, passed to ``Document``.

    Returns:
        A list with one dict per paragraph. Each dict holds the paragraph
        format values (``alignment``, ``leftIndent``, ``rightIndent``,
        ``firstLineIndent``, ``lineSpacing``, ``spaceBefore``,
        ``spaceAfter``) and a ``style`` list with one dict per run, carrying
        the run's font attributes and its ``text``.
    """
    doc = Document(path)
    wordStyles = []
    for para in doc.paragraphs:
        fmt = para.paragraph_format
        section = {
            # Paragraph alignment
            'alignment': fmt.alignment,
            # Left / right / first-line indentation
            'leftIndent': fmt.left_indent,
            'rightIndent': fmt.right_indent,
            'firstLineIndent': fmt.first_line_indent,
            # Line spacing
            'lineSpacing': fmt.line_spacing,
            # Spacing before / after the paragraph
            'spaceBefore': fmt.space_before,
            'spaceAfter': fmt.space_after,
            # Per-run styles, filled in below
            'style': [],
        }
        for run in para.runs:
            font = run.font
            section['style'].append({
                # Font name and size
                'name': font.name,
                'size': font.size,
                # Bold / italic
                'bold': font.bold,
                'italic': font.italic,
                # Font colour and highlight colour
                'rgb': font.color.rgb,
                'highlightColor': font.highlight_color,
                # Underline, strike-through and double strike-through
                'underline': font.underline,
                'strike': font.strike,
                'doubleStrike': font.double_strike,
                # Subscript / superscript
                'subscript': font.subscript,
                'superscript': font.superscript,
                # Text of the run
                'text': run.text,
            })
        wordStyles.append(section)
    return wordStyles
解析 Docx 图片(行号)
import docx
from docx.document import Document
from docx.text.paragraph import Paragraph
from docx.image.image import Image
from docx.parts.image import ImagePart
from docx.oxml.shape import CT_Picture
from PIL import Image
from io import BytesIO
# 这里引用了文章开头的 common.py 获取图片保存地址的
from common.common import wordImagesSavePath
def getPicture(document: Document, paragraph: Paragraph):
    """Return the first embedded image of a paragraph, or ``None``.

    Args:
        document: The document that owns ``paragraph``; its part is used to
            resolve the image relationship.
        paragraph: The paragraph to inspect.

    Returns:
        The ``image`` attribute of the related image part for the first
        ``pic:pic`` element found in the paragraph, or ``None`` when the
        paragraph has no picture or the picture carries no ``r:embed``
        relationship id.
    """
    pictures = paragraph._element.xpath('.//pic:pic')
    if not pictures:
        return None
    picture: CT_Picture = pictures[0]
    embed_ids = picture.xpath('.//a:blip/@r:embed')
    if not embed_ids:
        # No embedded relationship id on the blip: nothing to look up.
        return None
    related_part: ImagePart = document.part.related_parts[embed_ids[0]]
    return related_part.image
def getPictures(path, fileNamePrefix):
    """Save every paragraph picture of a document and report where it was found.

    Each picture is written to
    ``wordImagesSavePath() + fileNamePrefix + <line> + '.' + <ext>``, where
    ``<line>`` is the 1-based index of the paragraph that contains it.

    Args:
        path: Path of the ``.docx`` document.
        fileNamePrefix: Prefix used for the names of the saved image files.

    Returns:
        A list of dicts, one per saved picture, with the keys ``line``
        (1-based paragraph index) and ``picture`` (the image extension as a
        string).
    """
    doc = docx.Document(path)
    imageIndex = []
    for line, para in enumerate(doc.paragraphs, start=1):
        image = getPicture(doc, para)
        if image is None:
            continue
        # Read the extension before building the index entry; the entry
        # needs it.
        ext = image.ext
        imageIndex.append({
            'line': line,
            'picture': str(ext)
        })
        # Decode the binary content and save the picture locally.
        im = Image.open(BytesIO(image.blob))
        im.save(wordImagesSavePath() + fileNamePrefix + str(line) + '.' + str(ext))
    return imageIndex
解析 Docx 表格
from docx import Document
def getTables(path):
    """Return the cell texts of the first table of a document, grouped by row.

    Args:
        path: Path of the ``.docx`` document.

    Returns:
        A list of rows, each row being a list with the text of its cells,
        for the first table in the document; ``None`` when the document has
        no table.

    NOTE(review): the function returns from inside its table loop, so only
    one table is ever reported; confirm that callers do not expect every
    table.
    """
    doc = Document(path)
    # List of all table objects in the document.
    tables = doc.tables
    if not tables:
        return None
    table = tables[0]
    # Text of every cell of the table, as one flat list.
    contents = [cell.text for cell in table._cells]
    # Regroup the flat list into rows using the column count.
    step = len(table.columns)
    return [contents[i:i + step] for i in range(0, len(contents), step)]
创建 Docx 文档
import io
import sys
from docx import Document
from docx.shared import RGBColor
import json
def hexToRgb(value):
    """Convert a hexadecimal colour string to a tuple of integers.

    Leading ``#`` characters are removed and the remaining digits are cut
    into chunks of ``len // 3`` characters, each parsed as base 16. A
    six-digit value therefore yields ``(R, G, B)`` with components in
    0..255; a three-digit value yields single-digit components (0..15).

    Args:
        value: Colour string such as ``'#FF8000'`` or ``'FF8000'``.

    Returns:
        Tuple of the parsed components.

    Raises:
        ValueError: If fewer than three hex digits are given or a chunk is
            not valid hexadecimal.
    """
    value = value.lstrip('#')
    lv = len(value)
    step = lv // 3
    if step == 0:
        # A zero step would make range() fail with an unrelated message.
        raise ValueError('colour needs at least 3 hex digits: %r' % (value,))
    return tuple(int(value[i:i + step], 16) for i in range(0, lv, step))
def jsonHandler(jsonStr):
    """Parse a JSON string and return the decoded Python object."""
    return json.loads(jsonStr)
def _applyRunStyle(run, item):
    """Copy the text and font settings of one JSON item onto a run."""
    # Text
    if item.get('content') is not None:
        run.text = item['content']
    # Colour, given as a hexadecimal string
    if item.get('color') is not None:
        rgb = hexToRgb(item['color'])
        run.font.color.rgb = RGBColor(rgb[0], rgb[1], rgb[2])
    # Bold is only switched on for the value 1
    if item.get('bold') == 1:
        run.bold = True
    # Font name
    if item.get('name') is not None:
        run.font.name = item['name']
    # Font size: strip the letters of a "px" suffix, then scale by 10000.
    # NOTE(review): the factor 10000 is kept from the original code; confirm
    # it matches the length unit expected by run.font.size.
    if item.get('fontSize') is not None:
        footsie = str(item['fontSize']).replace('p', '').replace('x', '')
        run.font.size = int(footsie) * 10000


def createWord(entities, path):
    """Create a ``.docx`` file from a JSON description and save it.

    Args:
        entities: JSON string holding a list of paragraph objects. Each
            object must have an ``items`` list and may have ``aligns``
            (converted with ``int`` and used as the paragraph alignment).
            Each item may define ``content``, ``color``, ``bold``, ``name``
            and ``fontSize``; keys that are missing or ``None`` are ignored.
        path: Destination path of the document.

    Raises:
        KeyError: If a paragraph object has no ``items`` key.
    """
    doc = Document()
    for wordEntity in jsonHandler(entities):
        # One paragraph per entity; style the paragraph that was just added.
        para = doc.add_paragraph('')
        if wordEntity.get('aligns') is not None:
            para.paragraph_format.alignment = int(wordEntity['aligns'])
        for item in wordEntity['items']:
            _applyRunStyle(para.add_run(), item)
    doc.save(path)
创建 xlsx 文档
import json
import xlwt
def get_background(col):
    """Build the ``xlwt.Pattern`` (cell background) for one cell description.

    Args:
        col: Dict describing the cell. ``backgroundStyle`` selects the
            pattern: ``'0'`` for no background, ``'1'`` for a solid
            background. ``backgroundColor`` is a hexadecimal colour string
            converted with ``convertColor``.

    Returns:
        The configured pattern. When ``backgroundStyle`` is present but
        ``None`` the pattern is returned with no background and the colour
        is not applied.
    """
    # Start from an empty background pattern.
    bg = xlwt.Pattern()
    if 'backgroundStyle' in col:
        style = col['backgroundStyle']
        if style is None:
            bg.pattern = xlwt.Pattern.NO_PATTERN
            return bg
        # May be: NO_PATTERN, SOLID_PATTERN, or 0x00 through 0x12
        if style == '0':
            # No background colour
            bg.pattern = xlwt.Pattern.NO_PATTERN
        if style == '1':
            # Solid background colour
            bg.pattern = xlwt.Pattern.SOLID_PATTERN
    # Colour index, may be: 8 through 63.
    # 0 = Black, 1 = White, 2 = Red, 3 = Green, 4 = Blue, 5 = Yellow,
    # 6 = Magenta, 7 = Cyan, 16 = Maroon, 17 = Dark Green, 18 = Dark Blue,
    # 19 = Dark Yellow (almost brown), 20 = Dark Magenta, 21 = Teal,
    # 22 = Light Gray, 23 = Dark Gray
    if col.get('backgroundColor') is not None:
        bg.pattern_fore_colour = convertColor(col['backgroundColor'])
    return bg
def get_font(col):
    """Build the ``xlwt.Font`` for one cell description.

    Args:
        col: Dict describing the cell. Recognised keys: ``fontName``,
            ``fontBold``, ``fontColor`` (hexadecimal colour string),
            ``fontSize`` (font size number), ``underline`` and ``italic``.
            The flag keys use ``'0'`` for off and ``'1'`` for on. Keys that
            are missing or ``None``, and flags with any other value, leave
            the font default untouched.

    Returns:
        The configured font.
    """
    def flag(key):
        """Return True/False for a '1'/'0' flag, or None when it is unset."""
        value = col.get(key)
        if value == '1':
            return True
        if value == '0':
            return False
        return None

    font = xlwt.Font()
    if col.get('fontName') is not None:
        font.name = col['fontName']
    bold = flag('fontBold')
    if bold is not None:
        font.bold = bold
    if col.get('fontColor') is not None:
        font.colour_index = convertColor(col['fontColor'])
    if col.get('fontSize') is not None:
        # Font height is the font size multiplied by 20.
        font.height = 20 * int(col['fontSize'])
    # Underline: '0' switches it off, matching how fontBold is handled.
    underline = flag('underline')
    if underline is not None:
        font.underline = underline
    # Italic: '0' switches it off, matching how fontBold is handled.
    italic = flag('italic')
    if italic is not None:
        font.italic = italic
    return font
def get_alignment(col):
    """Build the ``xlwt.Alignment`` for one cell description.

    Args:
        col: Dict describing the cell. ``horz`` is ``'0'`` (left), ``'1'``
            (centre) or ``'2'`` (right); ``vert`` is ``'0'`` (top), ``'1'``
            (centre) or ``'2'`` (bottom); ``wrap`` is ``'0'`` (off) or
            ``'1'`` (on). Missing, ``None`` or unrecognised values leave the
            default untouched.

    Returns:
        The configured alignment.
    """
    def lookup(key, table):
        """Translate the string code stored under ``key``; None if unknown."""
        value = col.get(key)
        return table.get(value) if isinstance(value, str) else None

    alignment = xlwt.Alignment()
    # 0x01 left aligned, 0x02 horizontally centred, 0x03 right aligned
    horz = lookup('horz', {'0': 0x01, '1': 0x02, '2': 0x03})
    if horz is not None:
        alignment.horz = horz
    # 0x00 top aligned, 0x01 vertically centred, 0x02 bottom aligned
    vert = lookup('vert', {'0': 0x00, '1': 0x01, '2': 0x02})
    if vert is not None:
        alignment.vert = vert
    # Automatic line wrapping
    wrap = lookup('wrap', {'0': 0, '1': 1})
    if wrap is not None:
        alignment.wrap = wrap
    return alignment
def get_borders(col):
    """Build the ``xlwt.Borders`` for one cell description.

    Args:
        col: Dict describing the cell. ``left``, ``right``, ``top`` and
            ``bottom`` hold the line style of each side (converted with
            ``int``); ``leftColor``, ``rightColor``, ``topColor`` and
            ``bottomColor`` hold hexadecimal colour strings converted with
            ``convertColor``. Missing or ``None`` keys are skipped.

    Returns:
        The configured borders.
    """
    sides = ('left', 'right', 'top', 'bottom')
    borders = xlwt.Borders()
    # Line styles, as listed by the original author: 1 thin solid,
    # 2 medium solid, 3 thin dashed, 4 medium dashed, 5 thick solid,
    # 6 double, 7 thin dotted, 8 thick dashed, 9 thin dash-dot,
    # 10 thick dash-dot, 11 thin dash-dot-dot, 12 thick dash-dot-dot,
    # 13 slanted dash-dot.
    for side in sides:
        style = col.get(side)
        if style is not None:
            setattr(borders, side, int(style))
    for side in sides:
        colour = col.get(side + 'Color')
        if colour is not None:
            setattr(borders, side + '_colour', convertColor(colour))
    return borders
def convertColor(color):
    """Convert a hexadecimal colour string to a colour index.

    The string is parsed with ``hexToRgb`` and the first three components
    are mapped to an index by ``colorHandler``.
    """
    colorRGB = hexToRgb(color)
    return colorHandler(colorRGB[0], colorRGB[1], colorRGB[2])
def hexToRgb(value):
    """Convert a hexadecimal colour string to a tuple of integers.

    Leading ``#`` characters are removed and the remaining digits are cut
    into chunks of ``len // 3`` characters, each parsed as base 16. A
    six-digit value therefore yields ``(R, G, B)`` with components in
    0..255; a three-digit value yields single-digit components (0..15).

    Args:
        value: Colour string such as ``'#FF8000'`` or ``'FF8000'``.

    Returns:
        Tuple of the parsed components.

    Raises:
        ValueError: If fewer than three hex digits are given or a chunk is
            not valid hexadecimal.
    """
    value = value.lstrip('#')
    lv = len(value)
    step = lv // 3
    if step == 0:
        # A zero step would make range() fail with an unrelated message.
        raise ValueError('colour needs at least 3 hex digits: %r' % (value,))
    return tuple(int(value[i:i + step], 16) for i in range(0, lv, step))
def colorHandler(R, G, B):
    """Map an RGB colour to the closest supported colour index.

    The colour is first classified by the ordering of its components and
    then has to fall inside the value ranges of that colour family:
    "low" is 0..80, "middle" is 85..165 and "high" is 175..255.

    Args:
        R: Red component.
        G: Green component.
        B: Blue component.

    Returns:
        0 black, 1 white, 23 gray, 2 red, 6 magenta (used for orange),
        5 yellow, 3 green, 7 cyan, 4 blue, 16 maroon (used for purple).
        Colours that match no family fall back to 0.
    """
    def low(v):
        return 0 <= v <= 80

    def middle(v):
        return 125 - 40 <= v <= 125 + 40

    def high(v):
        return 255 - 80 <= v <= 255

    # Black, white or gray: all three components are equal.
    if R == G == B:
        if R == 0:
            return 0
        if R == 255:
            return 1
        if 0 <= R <= 255:
            return 23
    # Red
    if R >= G == B and high(R) and low(G) and low(B):
        return 2
    # Orange (mapped to the magenta index)
    if R >= G >= B and high(R) and middle(G) and low(B):
        return 6
    # Yellow
    if R == G >= B and high(R) and high(G) and low(B):
        return 5
    # Green
    if R <= G >= B and low(R) and high(G) and low(B):
        return 3
    # Cyan
    if R <= G == B and low(R) and high(G) and high(B):
        return 7
    # Blue
    if R <= G <= B and low(R) and low(G) and high(B):
        return 4
    # Purple (mapped to the maroon index)
    if R >= G <= B and high(R) and low(G) and high(B):
        return 16
    return 0
def createExcel(text, path):
    """Create an Excel workbook from a JSON description and save it.

    Args:
        text: JSON string holding a list of sheet objects. Each sheet has
            ``sheetName`` and ``pyExcelRowEntityList``; each row has
            ``pyExcelColEntities``; each column dict has ``text`` plus the
            style keys read by ``get_font``, ``get_background``,
            ``get_alignment`` and ``get_borders``.
        path: Destination path of the workbook.

    Sheets without a name, sheets without rows and rows without columns are
    skipped.
    """
    excel = xlwt.Workbook()
    for sheet in json.loads(text):
        # A worksheet can only be added when it has a name.
        if sheet['sheetName'] is None:
            continue
        she = excel.add_sheet(sheet['sheetName'])
        rows = sheet['pyExcelRowEntityList']
        if rows is None:
            continue
        for rowIndex, row in enumerate(rows):
            cols = row['pyExcelColEntities']
            if cols is None:
                continue
            for colIndex, col in enumerate(cols):
                # Build the cell style from the column description.
                col_style = xlwt.XFStyle()
                col_style.font = get_font(col)
                col_style.pattern = get_background(col)
                col_style.alignment = get_alignment(col)
                col_style.borders = get_borders(col)
                # Write the cell.
                she.write(rowIndex, colIndex, col['text'], col_style)
    # Save the workbook.
    excel.save(path)
关于系统参数的处理
import io
import sys
# Re-wrap stdout so that everything printed is encoded as UTF-8.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
# Read the external arguments: output path first, then the JSON-like payload.
path = sys.argv[1]
jsonStr = sys.argv[2]
# Rebuild valid JSON from the payload by re-inserting double quotes around
# keys and values. The order of the replacements matters: the first four add
# quotes around every ',', ':', '{' and '}', the last three remove the quotes
# that ended up next to '[' / ']' and between adjacent objects.
# NOTE(review): presumably the caller passes the JSON with its double quotes
# stripped - confirm. Every occurrence is replaced, so a value that itself
# contains ',', ':', '{' or '}' is altered as well.
jsonStr = jsonStr.replace(',', '","')
jsonStr = jsonStr.replace(':', '":"')
jsonStr = jsonStr.replace('{', '{"')
jsonStr = jsonStr.replace('}', '"}')
jsonStr = jsonStr.replace('"[', '[')
jsonStr = jsonStr.replace(']"', ']')
jsonStr = jsonStr.replace('}","{', '},{')
createExcel(jsonStr, path)