0. Related material
1. Batch-downloading files in Python (by passing URLs)
Python实现批量下载文件_mob64ca12d94299的技术博客_51CTO博客
2. Batch-downloading PDF files in Python (web crawler)
python批量下载pdf文件(python练手) - 知乎 (zhihu.com)
1. Constant definitions
1.1 UrlConstant.py
Purpose: defines the URLs, request headers, and request bodies used by the HTTP calls.
all_files_query_post_url = "http://processon.xx.com/home/org/workbench/files"
# curl "http://processon.xx.com/home/org/workbench/files" ^
# -H "Accept: */*" ^
# -H "Accept-Language: zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6" ^
# -H "Connection: keep-alive" ^
# -H "Content-Type: application/x-www-form-urlencoded; charset=UTF-8" ^
# -H ^"Cookie: processon_userKey=62fc4b2b08ec0415f36fa53b; _sid=4ade628f9742e5c9948ef5be14b2ed2d; usid=665494af529ebe35b425d5de; JSESSIONID=3348E2EB97320B0A5E0BE293D52456C6; processon_referrer=http^%^3A//processon.xx.com/^" ^
# -H "Origin: http://processon.xx.com" ^
# -H "Referer: http://processon.xx.com/home/diagrams" ^
# -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0" ^
# -H "X-Requested-With: XMLHttpRequest" ^
# --data-raw "resource=history&folderId=&searchTitle=&sort=&view=&orgId=619b880aed5803396608606c&page=1&curPage=1&role=&urlFrom=diagrams&teamId=" ^
# --insecure
all_files_query_post_header = {
    "Accept": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Cookie": "processon_userKey=62fc4b2b08ec0415f36fa53b; _sid=4ade628f9742e5c9948ef5be14b2ed2d; usid=665494af529ebe35b425d5de; JSESSIONID=3348E2EB97320B0A5E0BE293D52456C6; processon_referrer=http://processon.xx.com/",
    "Origin": "http://processon.xx.com",
    "Referer": "http://processon.xx.com/home/diagrams",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0",
    "X-Requested-With": "XMLHttpRequest"
}

all_files_query_post_data = {
    "resource": "history",
    "folderId": "",
    "searchTitle": "",
    "sort": "",
    "view": "",
    "orgId": "619b880aed5803396608606c",
    "page": "1",
    "curPage": "1",
    "role": "",
    "urlFrom": "diagrams",
    "teamId": ""
}
per_file_download_get_url = "http://processon.xx.com/chart_image/diagram_export"
# http://processon.xx.com/chart_image/diagram_export?type=pdf&chartId=664c15f3529ebe35b425d29b&mind=
# blob:http://processon.xx.com/e1028417-3e2a-4b7a-bb3e-81eb86e47535
per_file_download_get_header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Connection": "keep-alive",
    "Cookie": "processon_userKey=62fc4b2b08ec0415f36fa53b; _sid=4ade628f9742e5c9948ef5be14b2ed2d; usid=665494af529ebe35b425d5de; JSESSIONID=3348E2EB97320B0A5E0BE293D52456C6; processon_referrer=http://processon.xx.com/",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"
}
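The cookie and session IDs captured above expire over time, so it can be worth verifying them with a one-off request before wiring everything together. A minimal smoke-test sketch (the script name and the constant package path are assumptions, matching how main.py imports this file in section 3):
# smoke_test_constants.py - quick check that the captured session values still work (assumed helper, not part of the original project)
import requests
from constant import UrlConstant

response = requests.post(
    UrlConstant.all_files_query_post_url,
    headers=UrlConstant.all_files_query_post_header,
    data=UrlConstant.all_files_query_post_data,
    verify=False,
    timeout=30,
)
print(response.status_code)   # 200 means the session cookie was accepted
print(response.text[:200])    # peek at the body to confirm it is the expected JSON, not a login page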
2. Utility modules (util)
2.1 FileUtils.py
Purpose: create a directory, clear a directory, and list all files in it.
import os
from typing import List


class FileUtils:
    @staticmethod
    def create_dir(directory: str) -> None:
        """Create the directory if it does not already exist."""
        if not os.path.exists(directory):
            os.makedirs(directory)

    @staticmethod
    def clear_directory(directory: str) -> None:
        """Delete every file (and symlink) directly under the directory; subdirectories are only reported."""
        if os.path.exists(directory):
            for filename in os.listdir(directory):
                file_path = os.path.join(directory, filename)
                try:
                    if os.path.isfile(file_path) or os.path.islink(file_path):
                        os.unlink(file_path)
                    elif os.path.isdir(file_path):
                        # shutil.rmtree(file_path)  # uncomment (and import shutil) to delete subdirectories too
                        print(f"{file_path} is a directory; skipping")
                except Exception as e:
                    print(f"Failed to delete {file_path}. Reason: {e}")

    @staticmethod
    def list_files(directory: str) -> List[str]:
        """Return the full paths of all entries in the directory, or an empty list if it does not exist."""
        if os.path.exists(directory):
            return [os.path.join(directory, f) for f in os.listdir(directory)]
        return []
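A quick way to exercise these helpers in isolation; the directory name demo_dir is just an example, and the import assumes FileUtils.py sits in a util package, as main.py in section 3 expects:
from util.FileUtils import FileUtils

FileUtils.create_dir("demo_dir")            # create the directory if it does not exist
open("demo_dir/sample.txt", "w").close()    # drop an empty placeholder file into it
print(FileUtils.list_files("demo_dir"))     # full paths of everything in the directory
FileUtils.clear_directory("demo_dir")       # delete the files (subdirectories are skipped)
print(FileUtils.list_files("demo_dir"))     # now an empty list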
2.2 HttpUtils.py
Purpose: issue HTTP requests (POST/GET), download files, and convert filename encodings.
# Module-level helper functions; main.py imports this module as HttpUtils and calls them directly.
import os
import re
import urllib.parse
from typing import Any, Dict

import requests

timeout_def = 30


def get(url_input: str) -> Any:
    """
    Issue a GET request and return the response, or None on failure.
    """
    try:
        response = requests.get(url=url_input, timeout=timeout_def)
        response.raise_for_status()
        return response
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None


def post(url_input: str, header_input: Dict[str, str] = None,
         data_input: Dict[str, Any] = None, verify_input: bool = True) -> Any:
    """
    Issue a POST request and return the response, or None on failure.
    Headers and TLS verification are optional.
    """
    try:
        response = requests.post(url=url_input, data=data_input, headers=header_input,
                                 verify=verify_input, timeout=timeout_def)
        response.raise_for_status()
        return response
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None


def decode_filename(encoded_filename: str) -> str:
    # Try to decode the filename with different encodings
    possible_encodings = ['utf-8']  # , 'iso-8859-1', 'gbk', 'big5'
    for encoding in possible_encodings:
        try:
            decoded_bytes = encoded_filename.encode('iso-8859-1')
            decoded_filename = decoded_bytes.decode(encoding)
            print(f"Using encoding {encoding}: {decoded_filename}")
            return decoded_filename
        except Exception as e:
            print(f"Failed to decode using {encoding}: {e}")
    return encoded_filename


def download_file(url: str, params: Dict[str, Any], dest_dir: str, headers: Dict[str, str],
                  verify: bool = True, timeout: int = 30) -> bool:
    try:
        response = requests.get(url, params=params, headers=headers, verify=verify,
                                timeout=timeout, stream=True)
        response.raise_for_status()
        # Read the filename from the response headers
        content_disposition = response.headers.get('Content-Disposition')
        filename = None
        if content_disposition:
            # Try to parse the filename out of Content-Disposition
            filename_match = re.search(r"filename\*=UTF-8''(.+)", content_disposition)
            if filename_match:
                filename = urllib.parse.unquote(filename_match.group(1))
            else:
                filename_match = re.search(r'filename="(.+)"', content_disposition)
                if filename_match:
                    filename = filename_match.group(1)
                else:
                    filename_match = re.search(r'filename=(.+)', content_disposition)
                    if filename_match:
                        filename = filename_match.group(1)
        # Strip any surrounding quotes
        if filename:
            filename = filename.strip('"')
        # URL-decode and repair the character encoding
        if filename:
            filename = urllib.parse.unquote(filename)
            filename = decode_filename(filename)
        # If no filename was provided, fall back to the chartId plus the file type
        if not filename:
            file_type = params["type"]
            chart_id = params["chartId"]
            filename = f"{chart_id}.{file_type}"
        # Make sure the destination directory exists
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
        # Replace characters that are illegal in filenames
        filename = re.sub(r'[\/:*?"<>|]', '-', filename)
        dest_path = os.path.join(dest_dir, filename)
        print(f"Downloading file to {dest_path}")
        # Stream the body to disk in chunks
        with open(dest_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
        return True
    except requests.exceptions.RequestException as e:
        print(f"File download failed: {e}")
        return False


def main():
    # Example call
    url = 'https://api.example.com/data'
    print("Issuing a GET request")
    get_data = get(url)
    if get_data:
        print('GET response:', get_data)
    else:
        print('Failed to get a GET response')


if __name__ == "__main__":
    main()
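download_file takes quite a few arguments, so here is a single export call in isolation. This is a sketch, not part of the module: the chartId is the sample one from the diagram_export comment in UrlConstant.py, and verify=False mirrors how main.py calls it over plain HTTP.
from util import HttpUtils
from constant import UrlConstant

params = {"type": "pdf", "chartId": "664c15f3529ebe35b425d29b", "mind": ""}
ok = HttpUtils.download_file(
    UrlConstant.per_file_download_get_url,     # .../chart_image/diagram_export
    params,
    "pdf",                                     # destination directory, created if missing
    UrlConstant.per_file_download_get_header,
    verify=False,
)
print("download succeeded:", ok)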
2.3 JSONParseUtils.py
Purpose: parse JSON and map it to the required list of chartIds.
import json
from typing import List, Dict, Any


class JSONParseUtils:
    @staticmethod
    def get_chart_ids(data: Dict[str, Any]) -> List[str]:
        """
        Extract every chartId from the given JSON data and return them as a list.
        """
        chart_ids = []
        charts = data.get('charts', [])
        for chart in charts:
            chart_id = chart.get('chartId')
            if chart_id:
                chart_ids.append(chart_id)
        return chart_ids


def main():
    # Simulate fetching JSON from somewhere; a truncated sample of the real response is used here.
    # The "charts" array is shortened to a single entry -- insert the full JSON response to test with real data.
    json_string = '''
    {
        "charts": [{"chartId": "664c15f3529ebe35b425d29b"}],
        "urlFrom": "diagrams",
        "uc": {
            "timeZone": null,
            "locale": null,
            "dateFormat": null,
            "phone": null,
            "tenantKey": "A0000001",
            "email": "liujinhui@boulderaitech.com",
            "photoUrl": "/images/default/default/profile-full-male.png",
            "orgId": "619b880aed5803396608606c",
            "userId": "62fc4b2b08ec0415f36fa53b",
            "userName": "liujinhui",
            "fullName": "刘金辉"
        },
        "userView": "list"
    }
    '''
    # Parse the JSON string into a Python dict
    data = json.loads(json_string)
    # Extract all chartIds with the utility class
    chart_ids = JSONParseUtils.get_chart_ids(data)
    # Print the extracted chartId list
    print("All chartIds:", chart_ids)


if __name__ == "__main__":
    main()
3. Entry point
3.1 main.py
from util import HttpUtils, JSONParseUtils, FileUtils
from constant import UrlConstant
from typing import List


def idempotent_load_file(id_list: List[str]):
    print("All chartIds:", id_list)
    # Recreate the output directories so reruns stay idempotent
    FileUtils.FileUtils.create_dir("pos")
    FileUtils.FileUtils.clear_directory("pos")
    FileUtils.FileUtils.create_dir("pdf")
    FileUtils.FileUtils.clear_directory("pdf")
    base_url = UrlConstant.per_file_download_get_url
    for chart_id in id_list[:51]:  # only the first 51 charts
        pos_params = {"type": "pos", "chartId": chart_id, "mind": ""}
        pdf_params = {"type": "pdf", "chartId": chart_id, "mind": ""}
        print(f"Downloading the pos file for {chart_id}")
        HttpUtils.download_file(base_url, pos_params, "pos", UrlConstant.per_file_download_get_header, verify=False)
        print(f"Downloading the pdf file for {chart_id}")
        HttpUtils.download_file(base_url, pdf_params, "pdf", UrlConstant.per_file_download_get_header, verify=False)
    # 5. Print the file lists of the pos and pdf folders
    print("Files in the pos folder:", FileUtils.FileUtils.list_files("pos"))
    print("Files in the pdf folder:", FileUtils.FileUtils.list_files("pdf"))


def main():
    # 1. Call the POST endpoint to get the file-list JSON
    response = HttpUtils.post(UrlConstant.all_files_query_post_url,
                              UrlConstant.all_files_query_post_header,
                              UrlConstant.all_files_query_post_data, False)
    if response is None:
        print("No response received")
        return
    data = response.json()
    # 2. Parse the JSON and map it to a list[str] of chartIds
    chart_id_list = JSONParseUtils.JSONParseUtils.get_chart_ids(data)
    # 3. For each chartId in the list, pull the file in each format (pos & pdf)
    idempotent_load_file(chart_id_list)


if __name__ == '__main__':
    main()
4. Verification
4.1 Overall structure
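The project layout is not shown explicitly, but the imports in main.py (from util import ..., from constant import UrlConstant) imply roughly the structure below; the __init__.py files are an assumption so that util and constant resolve as packages:
project/
├── main.py
├── constant/
│   ├── __init__.py
│   └── UrlConstant.py
└── util/
    ├── __init__.py
    ├── FileUtils.py
    ├── HttpUtils.py
    └── JSONParseUtils.py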
4.2 Running it
C:/ProgramData/anaconda3/python.exe d:/git/project/main.py