一、Markdown中不同的文本内容会分成不同的文本块,并通过markdown的语法控制进行文本的拼接,组成新的文件。
二、利用Python3实现(.md)文件转换成(.html)文件
在cmd命令行下进入(.py)文件目录下,使用命令进行执行
>python md2html.py <file.md> <file.html>
import sys, re
#生成器模块
def lines(file):
#在文本最后加一空行
for line in file: yield line
yield '\n'
def blocks(file):
#生成单独的文本块
block = []
for line in lines(file):
if line.strip():
block.append(line)
elif block:
yield ''.join(block).strip()
block = []
#文本块处理程序
class Handler:
"""
处理程序父类
"""
def callback(self, prefix, name, *args):
method = getattr(self, prefix + name, None)
if callable(method): return method(*args)
def start(self, name):
self.callback('start_', name)
def end(self, name):
self.callback('end_', name)
def sub(self, name):
def substitution(match):
result = self.callback('sub_', name, match)
if result is None: result = match.group(0)
return result
return substitution
class HTMLRenderer(Handler):
"""
HTML处理程序,给文本块加相应的HTML标记
"""
def start_document(self):
print('<html><head><title>Python文本解析</title></head><body>')
def end_document(self):
print('</body></html>')
def start_paragraph(self):
print('<p style="color: #444;">')
def end_paragraph(self):
print('</p>')
def start_heading(self):
print('<h2 style="color: #68BE5D;">')
def end_heading(self):
print('</h2>')
def start_list(self):
print('<ul style="color: #363736;">')
def end_list(self):
print('</ul>')
def start_listitem(self):
print('<li>')
def end_listitem(self):
print('</li>')
def start_title(self):
print('<h1 style="color: #1ABC9C;">')
def end_title(self):
print('</h1>')
def sub_emphasis(self, match):
return('<em>%s</em>' % match.group(1))
def sub_url(self, match):
return('<a target="_blank" style="text-decoration: none;color: #BC1A4B;" href="%s">%s</a>' % (match.group(1), match.group(1)))
def sub_mail(self, match):
return('<a style="text-decoration: none;color: #BC1A4B;" href="mailto:%s">%s</a>' % (match.group(1), match.group(1)))
def feed(self, data):
print(data)
#规则,判断每个文本块应该如何处理
class Rule:
"""
规则父类
"""
def action(self, block, handler):
"""
加标记
"""
handler.start(self.type)
handler.feed(block)
handler.end(self.type)
return True
class HeadingRule(Rule):
"""
一号标题规则
"""
type = 'heading'
def condition(self, block):
"""
判断文本块是否符合规则
"""
return not '\n' in block and len(block) <= 70 and not block[-1] == ':'
class TitleRule(HeadingRule):
"""
二号标题规则
"""
type = 'title'
first = True
def condition(self, block):
if not self.first: return False
self.first = False
return HeadingRule.condition(self, block)
class ListItemRule(Rule):
"""
列表项规则
"""
type = 'listitem'
def condition(self, block):
return block[0] == '-'
def action(self, block, handler):
handler.start(self.type)
handler.feed(block[1:].strip())
handler.end(self.type)
return True
class ListRule(ListItemRule):
"""
列表规则
"""
type = 'list'
inside = False
def condition(self, block):
return True
def action(self, block, handler):
if not self.inside and ListItemRule.condition(self, block):
handler.start(self.type)
self.inside = True
elif self.inside and not ListItemRule.condition(self, block):
handler.end(self.type)
self.inside = False
return False
class ParagraphRule(Rule):
"""
段落规则
"""
type = 'paragraph'
def condition(self, block):
return True
class Code(Rule):
'''
代码框规则
高亮显示规则
。。。
'''
pass
# 对整个文本进行解析
class Parser:
"""
解析器父类
"""
def __init__(self, handler):
self.handler = handler
self.rules = []
self.filters = []
def addRule(self, rule):
"""
添加规则
"""
self.rules.append(rule)
def addFilter(self, pattern, name):
"""
添加过滤器
"""
def filter(block, handler):
return re.sub(pattern, handler.sub(name), block)
self.filters.append(filter)
def parse(self, file):
"""
解析
"""
self.handler.start('document')
for block in blocks(file):
for filter in self.filters:
block = filter(block, self.handler)
for rule in self.rules:
if rule.condition(block):
last = rule.action(block, self.handler)
if last: break
self.handler.end('document')
class BasicTextParser(Parser):
"""
纯文本解析器
"""
def __init__(self, handler):
Parser.__init__(self, handler)
self.addRule(ListRule())
self.addRule(ListItemRule())
self.addRule(TitleRule())
self.addRule(HeadingRule())
self.addRule(ParagraphRule())
self.addFilter(r'\*(.+?)\*', 'emphasis')
self.addFilter(r'(http://[\.a-zA-Z/]+)', 'url')
self.addFilter(r'([\.a-zA-Z]+@[\.a-zA-Z]+[a-zA-Z]+)', 'mail')
"""
运行测试程序
"""
handler = HTMLRenderer()
parser = BasicTextParser(handler)
parser.parse(sys.stdin)
三、利用Python3将文本转化成pdf文件
命令>python md2pdf.py 源文件 目标文件 [options]
Options:
-h --help show help document.
-v --version show version information.
-o --output translate sourcefile into html file.
-p --print translate sourcefile into pdf file and html file respectively.
-P --Print translate sourcefile into pdf file only.
import os,re
import sys,getopt
from enum import Enum
from subprocess import call
from functools import reduce
from docopt import docopt
__version__ = '1.0'
# 定义三个枚举类
# 定义表状态
class TABLE(Enum):
Init = 1
Format = 2
Table = 3
# 有序序列状态
class ORDERLIST(Enum):
Init = 1
List = 2
# 块状态
class BLOCK(Enum):
Init = 1
Block = 2
CodeBlock = 3
# 定义全局状态,并初始化状态
table_state = TABLE.Init
orderList_state = ORDERLIST.Init
block_state = BLOCK.Init
is_code = False
is_normal = True
temp_table_first_line = []
temp_table_first_line_str = ""
need_mathjax = False
def test_state(input):
global table_state, orderList_state, block_state, is_code, temp_table_first_line, temp_table_first_line_str
Code_List = ["python\n", "c++\n", "c\n"]
result = input
# 构建正则表达式规则
# 匹配块标识
pattern = re.compile(r'```(\s)*\n')
a = pattern.match(input)
# 普通块
if a and block_state == BLOCK.Init:
result = "<blockquote>"
block_state = BLOCK.Block
is_normal = False
# 特殊代码块
elif len(input) > 4 and input[0:3] == '```' and (input[3:9] == "python" or input[3:6] == "c++" or input[3:4]== "c") and block_state == BLOCK.Init:
block_state = BLOCK.Block
result = "<code></br>"
is_code = True
is_normal = False
# 块结束
elif block_state == BLOCK.Block and input == '```\n':
if is_code:
result = "</code>"
else:
result = "</blockquote>"
block_state = BLOCK.Init
is_code = False
is_normal = False
elif block_state == BLOCK.Block:
pattern = re.compile(r'[\n\r\v\f\ ]')
result = pattern.sub(" ", result)
pattern = re.compile(r'\t')
result = pattern.sub(" " * 4, result)
result = "<span>" + result + "</span></br>"
is_normal = False
# 解析有序序列
if len(input) > 2 and input[0].isdigit() and input[1] == '.' and orderList_state == ORDERLIST.Init:
orderList_state = ORDERLIST.List
result = "<ol><li>" + input[2:] + "</li>"
is_normal = False
elif len(input) > 2 and input[0].isdigit() and input[1] == '.' and orderList_state == ORDERLIST.List:
result = "<li>" + input[2:] + "</li>"
is_normal = False
elif orderList_state == ORDERLIST.List and (len(input) <= 2 or input[0].isdigit() == False or input[1] != '.'):
result = "</ol>" + input
orderList_state = ORDERLIST.Init
# 解析表格
pattern = re.compile(r'^((.+)\|)+((.+))$')
match = pattern.match(input)
if match:
l = input.split('|')
l[-1] = l[-1][:-1]
# 将空字符弹出列表
if l[0] == '':
l.pop(0)
if l[-1] == '':
l.pop(-1)
if table_state == TABLE.Init:
table_state = TABLE.Format
temp_table_first_line = l
temp_table_first_line_str = input
result = ""
elif table_state == TABLE.Format:
# 如果是表头与表格主题的分割线
if reduce(lambda a, b: a and b, [all_same(i,'-') for i in l], True):
table_state = TABLE.Table
result = "<table><thread><tr>"
is_normal = False
# 添加表头
for i in temp_table_first_line:
result += "<th>" + i + "</th>"
result += "</tr>"
result += "</thread><tbody>"
is_normal = False
else:
result = temp_table_first_line_str + "</br>" + input
table_state = TABLE.Init
elif table_state == TABLE.Table:
result = "<tr>"
for i in l:
result += "<td>" + i + "</td>"
result += "</tr>"
elif table_state == TABLE.Table:
table_state = TABLE.Init
result = "</tbody></table>" + result
elif table_state == TABLE.Format:
pass
return result
# 判断 lst 是否全由字符 sym 构成
def all_same(lst, sym):
return not lst or sym * len(lst) == lst
# 处理标题
def handleTitle(s, n):
temp = "<h" + repr(n) + ">" + s[n:] + "</h" + repr(n) + ">"
return temp
# 处理无序列表
def handleUnorderd(s):
s = "<ul><li>" + s[1:]
s += "</li></ul>"
return s
def tokenTemplate(s, match):
pattern = ""
if match == '*':
pattern = "\*([^\*]*)\*"
if match == '~~':
pattern = "\~\~([^\~\~]*)\~\~"
if match == '**':
pattern = "\*\*([^\*\*]*)\*\*"
return pattern
# 处理特殊标识,比如 **, *, ~~
def tokenHandler(s):
l = ['b', 'i', 'S']
j = 0
for i in ['**', '*', '~~']:
pattern = re.compile(tokenTemplate(s,i))
match = pattern.finditer(s)
k = 0
for a in match:
if a:
content = a.group(1)
x,y = a.span()
c = 3
if i == '*':
c = 5
s = s[:x+c*k] + "<" + l[j] + ">" + content + "</" + l[j] + ">" + s[y+c*k:]
k += 1
pattern = re.compile(r'\$([^\$]*)\$')
a = pattern.search(s)
if a:
global need_mathjax
need_mathjax = True
j += 1
return s
# 处理链接
def link_image(s):
# 超链接
pattern = re.compile(r'\\\[(.*)\]\((.*)\)')
match = pattern.finditer(s)
for a in match:
if a:
text, url = a.group(1,2)
x, y = a.span()
s = s[:x] + "<a href=" + url + " target=\"_blank\">" + text + "</a>" + s[y:]
# 图像链接
pattern = re.compile(r'!\[(.*)\]\((.*)\)')
match = pattern.finditer(s)
for a in match:
if a:
text, url = a.group(1,2)
x, y = a.span()
s = s[:x] + "<img src=" + url + " target=\"_blank\">" + "</a>" + s[y:]
# 角标
pattern = re.compile(r'(.)\^\[([^\]]*)\]')
match = pattern.finditer(s)
k = 0
for a in match:
if a:
sym,index = a.group(1,2)
x, y = a.span()
s = s[:x+8*k] + sym + "<sup>" + index + "</sup>" + s[y+8*k:]
k += 1
return s
def parse(input):
global block_state, is_normal
is_normal = True
result = input
# 检测当前 input 解析状态
result = test_state(input)
if block_state == BLOCK.Block:
return result
# 分析标题标记 #
title_rank = 0
for i in range(6, 0, -1):
if input[:i] == '#'*i:
title_rank = i
break
if title_rank != 0:
# 处理标题,转化为相应的 HTML 文本
result = handleTitle(input, title_rank)
return result
# 分析分割线标记 --
if len(input) > 2 and all_same(input[:-1], '-') and input[-1] == '\n':
result = "<hr>"
return result
# 解析无序列表
unorderd = ['+', '-']
if result != "" and result[0] in unorderd :
result = handleUnorderd(result)
is_normal = False
f = input[0]
count = 0
sys_q = False
while f == '>':
count += 1
f = input[count]
sys_q = True
if sys_q:
result = "<blockquote style=\"color:#8fbc8f\"> "*count + "<b>" + input[count:] + "</b>" + "</blockquote>"*count
is_normal = False
# 处理特殊标记,比如 ***, ~~~
result = tokenHandler(result)
# 解析图像链接
result = link_image(result)
pa = re.compile(r'^(\s)*$')
a = pa.match(input)
if input[-1] == "\n" and is_normal == True and not a :
result+="</br>"
return result
def run(source_file, dest_file, dest_pdf_file, only_pdf):
# 获取文件名
file_name = source_file
# 转换后的 HTML 文件名
dest_name = dest_file
# 转换后的 PDF 文件名
dest_pdf_name = dest_pdf_file
# 获取文件后缀
_, suffix = os.path.splitext(file_name)
if suffix not in [".md",".markdown",".mdown","mkd"]:
print('Error: the file should be in markdown format')
sys.exit(1)
if only_pdf:
dest_name = ".~temp~.html"
f = open(file_name, "r")
f_r = open(dest_name, "w")
# 往文件中填写 HTML 的一些属性
f_r.write("""<style type="text/css">div {display: block;font-family: "Times New Roman",Georgia,Serif}\
#wrapper { width: 100%;height:100%; margin: 0; padding: 0;}#left { float:left; \
width: 10%; height: 100%; }#second { float:left; width: 80%;height: 100%; \
}#right {float:left; width: 10%; height: 100%; \
}</style><div id="wrapper"> <div id="left"></div><div id="second">""")
f_r.write("""<meta charset="utf-8"/>""")
# 逐行解析 markdwon 文件
for eachline in f:
result = parse(eachline)
if result != "":
f_r.write(result)
f_r.write("""</br></br></div><div id="right"></div></div>""")
# 公式支持
global need_mathjax
if need_mathjax:
f_r.write("""<script type="text/x-mathjax-config">\
MathJax.Hub.Config({tex2jax: {inlineMath: [['$','$'], ['\\(','\\)']]}});\
</script><script type="text/javascript" \
src="http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>""")
# 文件操作完成之后记得关闭!!!
f_r.close()
f.close()
# 调用扩展 wkhtmltopdf 将 HTML 文件转换成 PDF
if dest_pdf_name != "" or only_pdf:
call(["wkhtmltopdf", dest_name, dest_pdf_name])
# 如果有必要,删除中间过程生成的 HTML 文件
if only_pdf:
call(["rm", dest_name])
# 主函数
def main():
dest_file = "translation_result.html"
dest_pdf_file = "translation_result.pdf"
only_pdf = False
args = docopt(__doc__, version=__version__)
dest_file = args['<outputfile>'] if args['--output'] else dest_file
dest_pdf_file = args['<outputfile>'] if args['--print'] or args['--Print'] else ""
run(args['<sourcefile>'], dest_file, dest_pdf_file, args['--Print'])
if __name__=="__main__":
main()