# 1. 从 PDF 文件获取文本
import pdfplumber
# Open "4.pdf" and print what pdfplumber exposes for its first page.
# Everything that touches the page stays inside the `with` block so the
# document is still open while the page is being read.
with pdfplumber.open("4.pdf") as pdf:
    first_page = pdf.pages[0]
    # `chars` can be empty (e.g. a page with no text layer); skip the sample
    # instead of raising IndexError on `chars[0]`.
    if first_page.chars:
        print(first_page.chars[0])  # PDF info: first entry of the page's chars list
    print(first_page.extract_text())  # text of the page
    print(first_page.extract_tables())  # tables of the page
# 2. PDF 单页纵向切割
from PyPDF4 import PdfFileReader, PdfFileWriter
import math
input_file_path = '11.pdf'
# NOTE(review): nothing is written to this path; the write that used it was
# disabled in the original script. Kept so the module-level name still exists.
output_file_path_l = 'l.pdf'
output_file_path_r = 'r.pdf'

# Cut page 0 of the input into strips and collect them in one output document.
#
# The page is divided into a left and a right column, each half the page
# width. Each column is sliced from top to bottom into strips whose height
# equals the full page width. All left-column strips are added first, then all
# right-column strips, and the result is written to `output_file_path_r`.
#
# Every input handle is tracked and closed in `finally`. They are closed only
# after the output has been written -- presumably the reader pulls page data
# from its stream lazily, so the streams must outlive `pdf_output.write`
# (NOTE(review): confirm against the PyPDF4 docs).
open_inputs = []
try:
    source_file = open(input_file_path, 'rb')
    open_inputs.append(source_file)
    pdf_input = PdfFileReader(source_file)
    pdf_output = PdfFileWriter()

    page0 = pdf_input.getPage(0)
    width = float(page0.mediaBox.getWidth())
    height = float(page0.mediaBox.getHeight())

    # Each strip is as tall as the source page is wide.
    page_height = width
    new_page_count = math.ceil(height / page_height)

    # (x_left, x_right) of the left column, then of the right column.
    for x_left, x_right in ((0, width / 2), (width / 2, width)):
        for i in range(new_page_count):
            # A fresh reader per strip, as in the original script: presumably
            # so each strip gets its own page object whose mediaBox can be
            # changed without affecting strips already added
            # (NOTE(review): confirm).
            strip_file = open(input_file_path, 'rb')
            open_inputs.append(strip_file)
            pdf_input = PdfFileReader(strip_file)
            new_page = pdf_input.getPage(0)

            # Strips are counted from the top of the page, while the mediaBox
            # corners are given as distances from the bottom edge.
            y = page_height * i
            top = height - y
            bottom = height - page_height * (i + 1)

            new_page.mediaBox.lowerLeft = (x_left, bottom)
            new_page.mediaBox.lowerRight = (x_right, bottom)
            new_page.mediaBox.upperLeft = (x_left, top)
            new_page.mediaBox.upperRight = (x_right, top)
            pdf_output.addPage(new_page)

    with open(output_file_path_r, 'wb') as output_file:
        pdf_output.write(output_file)
finally:
    for handle in open_inputs:
        handle.close()