〇、相关资料

1、python批量下载文件(传递url)

Python实现批量下载文件_mob64ca12d94299的技术博客_51CTO博客

2、python批量下载pdf(爬虫)

python批量下载pdf文件(python练手) - 知乎 (zhihu.com)

一、常量定义

1.1 UrlConstant.py

作用:定义调用的url、请求头、请求体等信息

# Values shared by both requests below. They appear to have been captured from
# a logged-in browser session (NOTE(review): the Cookie carries session ids and
# will presumably stop working once that session expires -- confirm/refresh).
_BASE_URL = "http://processon.xx.com"
_ACCEPT_LANGUAGE = "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6"
_SESSION_COOKIE = (
    "processon_userKey=62fc4b2b08ec0415f36fa53b; "
    "_sid=4ade628f9742e5c9948ef5be14b2ed2d; "
    "usid=665494af529ebe35b425d5de; "
    "JSESSIONID=3348E2EB97320B0A5E0BE293D52456C6; "
    "processon_referrer=http://processon.xx.com/"
)
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"
)

# --- Request 1: POST that lists the files of the workbench -----------------
# Equivalent curl call (headers and form body are spelled out as dicts below):
#   curl "http://processon.xx.com/home/org/workbench/files" \
#     -H "<headers from all_files_query_post_header>" \
#     --data-raw "resource=history&folderId=&searchTitle=&sort=&view=&orgId=619b880aed5803396608606c&page=1&curPage=1&role=&urlFrom=diagrams&teamId=" \
#     --insecure
all_files_query_post_url = _BASE_URL + "/home/org/workbench/files"

# Headers sent with the file-listing POST.
all_files_query_post_header = {
    "Accept": "*/*",
    "Accept-Language": _ACCEPT_LANGUAGE,
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Cookie": _SESSION_COOKIE,
    "Origin": _BASE_URL,
    "Referer": _BASE_URL + "/home/diagrams",
    "User-Agent": _USER_AGENT,
    "X-Requested-With": "XMLHttpRequest",
}

# Form fields sent with the file-listing POST (all values are strings).
all_files_query_post_data = {
    "resource": "history",
    "folderId": "",
    "searchTitle": "",
    "sort": "",
    "view": "",
    "orgId": "619b880aed5803396608606c",
    "page": "1",
    "curPage": "1",
    "role": "",
    "urlFrom": "diagrams",
    "teamId": "",
}

# --- Request 2: GET that exports a single chart ----------------------------
# Example of a full export URL with its query string:
#   http://processon.xx.com/chart_image/diagram_export?type=pdf&chartId=664c15f3529ebe35b425d29b&mind=
# Example of the blob URL seen in the browser for the same download:
#   blob:http://processon.xx.com/e1028417-3e2a-4b7a-bb3e-81eb86e47535
per_file_download_get_url = _BASE_URL + "/chart_image/diagram_export"

# Headers sent with the per-chart export GET.
per_file_download_get_header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": _ACCEPT_LANGUAGE,
    "Connection": "keep-alive",
    "Cookie": _SESSION_COOKIE,
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": _USER_AGENT,
}

二、util工具类

2.1 FileUtils.py

作用:创建目录、清除目录、查看所有文件

import os
from typing import List

class FileUtils:
    """Filesystem helpers: create a directory, empty it, and list its entries."""

    @staticmethod
    def create_dir(directory: str) -> None:
        """Create ``directory`` (with missing parents) if nothing exists at that path.

        Args:
            directory: Path of the directory to create.
        """
        if not os.path.exists(directory):
            # exist_ok=True closes the gap between the check above and the
            # creation: if something else creates the directory first, that is
            # not an error.
            os.makedirs(directory, exist_ok=True)

    @staticmethod
    def clear_directory(directory: str) -> None:
        """Delete the files and symlinks directly inside ``directory``.

        Sub-directories are deliberately left in place; they are only
        reported on stdout. Nothing happens when ``directory`` is missing or
        is not a directory. A failed deletion is printed and skipped so the
        remaining entries are still processed.

        Args:
            directory: Path of the directory to empty.
        """
        if not os.path.isdir(directory):
            return
        for filename in os.listdir(directory):
            file_path = os.path.join(directory, filename)
            if os.path.isfile(file_path) or os.path.islink(file_path):
                try:
                    os.unlink(file_path)
                except OSError as e:
                    print(f'无法删除{file_path}。原因: {e}')
            elif os.path.isdir(file_path):
                print(f"{file_path}is dir")

    @staticmethod
    def list_files(directory: str) -> List[str]:
        """Return the entries of ``directory``, each joined with ``directory``.

        Args:
            directory: Path of the directory to list.

        Returns:
            One path per entry (files and sub-directories alike), in the order
            ``os.listdir`` yields them; an empty list when ``directory`` is
            missing or is not a directory.
        """
        if not os.path.isdir(directory):
            return []
        return [os.path.join(directory, name) for name in os.listdir(directory)]

2.2 HttpUtils.py

作用:调用http请求(post/get)、文件下载、编码转换

import requests
from typing import Any, Dict
timeout_def = 30
import os
import urllib
import re
def get(url_input: str) -> Any:
    """Send a GET request and return the response.

    This is a module-level function, so it must not be wrapped in
    ``@staticmethod``: before Python 3.10 a bare staticmethod object is not
    callable and every call would raise ``TypeError``.

    Args:
        url_input: URL to request.

    Returns:
        The ``requests.Response`` on success, or ``None`` when the request
        fails or the server answers with an HTTP error status (the error is
        printed).
    """
    try:
        response = requests.get(url=url_input, timeout=timeout_def)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f'请求失败:{e}')
        return None
    return response


def post(url_input: str, header_input: Dict[str, Any], data_input: Dict[str, Any], verify_input: bool = True) -> Any:
    """Send a form-encoded POST request and return the response.

    Python keeps only the last definition of a name, so an earlier
    two-argument ``post(url_input, data_input)`` defined above this one could
    never be called; only this signature is kept. It is a plain module-level
    function (no ``@staticmethod``, which is not callable at module level
    before Python 3.10).

    Args:
        url_input: URL to post to.
        header_input: HTTP headers to send.
        data_input: Form fields sent as the request body.
        verify_input: Whether to verify the server's TLS certificate.

    Returns:
        The ``requests.Response`` on success, or ``None`` when the request
        fails, times out, or the server answers with an HTTP error status
        (the error is printed).
    """
    try:
        response = requests.post(
            url=url_input,
            data=data_input,
            headers=header_input,
            verify=verify_input,
            # Without a timeout requests waits indefinitely for the server.
            timeout=timeout_def,
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f'请求失败:{e}')
        return None
    return response

import urllib.parse
def decode_filename(encoded_filename) -> str:
    """Repair a filename whose UTF-8 bytes were mis-read as ISO-8859-1.

    The text is encoded back to bytes with ISO-8859-1 and those bytes are
    decoded with each candidate encoding in turn. The first successful
    decoding is returned. When no candidate works (for example because the
    name is already correct Unicode that does not fit ISO-8859-1) the input
    is returned unchanged.

    Args:
        encoded_filename: Filename text to repair.

    Returns:
        The repaired filename, or ``encoded_filename`` itself when it cannot
        be re-decoded.
    """
    # Candidate encodings, tried in order. Others that could be added:
    # 'iso-8859-1', 'gbk', 'big5'.
    possible_encodings = ['utf-8']
    for encoding in possible_encodings:
        try:
            decoded_filename = encoded_filename.encode('iso-8859-1').decode(encoding)
        except UnicodeError as e:
            # Covers both the encode and the decode step; move on to the next
            # candidate instead of giving up after the first failure.
            print(f"Failed to decode using {encoding}: {e}")
            continue
        print(f"Using encoding {encoding}: {decoded_filename}")
        return decoded_filename
    return encoded_filename

def _filename_from_content_disposition(content_disposition: str) -> str:
    """Return the raw filename found in a Content-Disposition value, or "" if none."""
    # Each pattern stops at the next ';' (or closing quote) so that parameters
    # following the filename are not swallowed into it.
    extended = re.search(r"filename\*=UTF-8''([^;]+)", content_disposition, re.IGNORECASE)
    if extended:
        return urllib.parse.unquote(extended.group(1).strip())
    quoted = re.search(r'filename="([^"]+)"', content_disposition)
    if quoted:
        return quoted.group(1)
    bare = re.search(r'filename=([^;]+)', content_disposition)
    if bare:
        return bare.group(1).strip()
    return ""


def download_file(url: str, params: Dict[str, Any], dest_dir: str, headers: Dict[str, str], verify: bool = True, timeout: int = 30) -> bool:
    """Download ``url`` into ``dest_dir``, streaming the body to disk.

    The filename is taken from the response's Content-Disposition header when
    present; otherwise it falls back to ``"<chartId>.<type>"`` built from
    ``params``. Characters that are not allowed in filenames are replaced
    with ``-``.

    This is a plain module-level function (no ``@staticmethod``, which is not
    callable at module level before Python 3.10).

    Args:
        url: URL to download from.
        params: Query-string parameters; ``"type"`` and ``"chartId"`` are read
            when the server does not supply a filename.
        dest_dir: Directory to write the file into; created if missing.
        headers: HTTP headers to send.
        verify: Whether to verify the server's TLS certificate.
        timeout: Request timeout in seconds.

    Returns:
        ``True`` when the file was written, ``False`` when the HTTP request
        failed (the error is printed).

    Raises:
        KeyError: If no filename is supplied by the server and ``params``
            lacks ``"type"`` or ``"chartId"``.
        OSError: If the destination directory or file cannot be written.
    """
    try:
        # Using the response as a context manager releases the streamed
        # connection even when writing fails half-way.
        with requests.get(url, params=params, headers=headers, verify=verify, timeout=timeout, stream=True) as response:
            response.raise_for_status()

            # Take the filename from the response header when there is one.
            filename = _filename_from_content_disposition(response.headers.get('Content-Disposition') or "")
            if filename:
                # Drop stray quotes, undo percent-encoding, then repair
                # UTF-8 bytes that were read as ISO-8859-1.
                filename = filename.strip('"')
                filename = urllib.parse.unquote(filename)
                filename = decode_filename(filename)

            # No filename supplied: fall back to chartId and file type.
            if not filename:
                file_type = params["type"]
                chart_id = params["chartId"]
                filename = f"{chart_id}.{file_type}"

            # Make sure the destination directory exists.
            os.makedirs(dest_dir, exist_ok=True)

            # Replace characters that are illegal in filenames; the backslash
            # is included because it is a path separator on Windows.
            filename = re.sub(r'[\\/:*?"<>|]', '-', filename)

            dest_path = os.path.join(dest_dir, filename)

            print(f"文件下载路径为{dest_path}")

            # Stream the body to disk; the with-block closes the file.
            with open(dest_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        return True
    except requests.exceptions.RequestException as e:
        print(f'文件下载失败:{e}')
        return False


def main():
    """Demo: issue one GET request through ``get`` and print the outcome."""
    demo_url = 'https://api.example.com/data'

    print("执行 GET 请求")
    result = get(demo_url)
    if not result:
        # ``get`` returned None (request failed) or a falsy response.
        print('未能获取GET请求数据')
        return
    print('GET请求数据:', result)


if __name__ == "__main__":
    main()

2.3 JSONParseUtils.py

作用:解析接口返回的JSON数据,提取其中所有的chartId并返回list

import json
from typing import List, Dict, Any

class JSONParseUtils:
    """Helpers for pulling values out of already-parsed JSON responses."""

    @staticmethod
    def get_chart_ids(data: Dict[str, Any]) -> List[str]:
        """Collect every non-empty ``chartId`` found under ``data['charts']``.

        Args:
            data: Parsed JSON object; its ``'charts'`` entry is read as a list
                of dicts (a missing entry is treated as an empty list).

        Returns:
            The ``chartId`` values in their original order, skipping entries
            whose ``chartId`` is missing or falsy.
        """
        candidate_ids = (entry.get('chartId') for entry in data.get('charts', []))
        return [candidate for candidate in candidate_ids if candidate]

def main():
    """Demo: parse a sample response and print the chartIds it contains."""
    # Stand-in for a real response body. It must be strictly valid JSON:
    # placeholders such as ``[...]`` or ``#`` comments make json.loads raise
    # JSONDecodeError. Replace the "charts" array with real data as needed.
    json_string = '''
    {
        "charts": [
            {"chartId": "664c15f3529ebe35b425d29b"},
            {"title": "entry without a chartId is skipped"}
        ],
        "urlFrom": "diagrams",
        "uc": {
            "timeZone": null,
            "locale": null,
            "dateFormat": null,
            "phone": null,
            "tenantKey": "A0000001",
            "email": "liujinhui@boulderaitech.com",
            "photoUrl": "/images/default/default/profile-full-male.png",
            "orgId": "619b880aed5803396608606c",
            "userId": "62fc4b2b08ec0415f36fa53b",
            "userName": "liujinhui",
            "fullName": "刘金辉"
        },
        "userView": "list"
    }
    '''

    # Parse the JSON text into a Python dict.
    data = json.loads(json_string)

    # Extract every chartId with the utility class.
    chart_ids = JSONParseUtils.get_chart_ids(data)

    # Print the extracted chartId list.
    print("所有的 chartId:", chart_ids)

if __name__ == "__main__":
    main()

三、入口

3.1 main.py

from util import HttpUtils, JSONParseUtils, FileUtils
from constant import UrlConstant
import os
from typing import List

def idempotent_load_file(id_list: List[str], max_files: int = 51):
    """Download the ``pos`` export of each chart into a freshly emptied ``pos`` folder.

    The ``pos`` folder is created if missing and its files are removed before
    downloading, so running the function again starts from the same state.

    Args:
        id_list: Chart ids to download.
        max_files: Upper bound on how many ids from the start of ``id_list``
            are downloaded (defaults to the previously hard-coded 51).
    """
    print("所有的 chartId:", id_list)
    FileUtils.FileUtils.create_dir("pos")
    FileUtils.FileUtils.clear_directory("pos")

    failed_ids = []
    for chart_id in id_list[:max_files]:
        pos_params = {"type": "pos", "chartId": chart_id, "mind": ""}

        print(f"正在下载: {chart_id} 的 pos 文件")
        # Use the shared URL constant instead of repeating the literal here.
        succeeded = HttpUtils.download_file(
            UrlConstant.per_file_download_get_url,
            pos_params,
            "pos",
            UrlConstant.per_file_download_get_header,
            verify=False,
        )
        if not succeeded:
            # download_file reports HTTP failures through its return value;
            # remember them so they are not silently lost.
            failed_ids.append(chart_id)

    if failed_ids:
        print("下载失败的 chartId:", failed_ids)

    # Print the contents of the pos and pdf folders.
    print("pos文件夹中的文件:", FileUtils.FileUtils.list_files("pos"))
    print("pdf文件夹中的文件:", FileUtils.FileUtils.list_files("pdf"))

def main():
    """Fetch the chart list, extract the chart ids, and download each chart."""
    # 1. Call the POST endpoint to get the chart list as JSON.
    response = HttpUtils.post(UrlConstant.all_files_query_post_url, UrlConstant.all_files_query_post_header, UrlConstant.all_files_query_post_data, False)
    if response is None:
        print("不存在")
        # Nothing to parse; continuing would fail on ``None.json()``.
        return
    try:
        data = response.json()
    except ValueError as e:
        # The body was not JSON (NOTE(review): possibly a login page when the
        # session cookie is no longer valid -- confirm).
        print(f"响应不是合法的JSON:{e}")
        return

    # 2. Parse the JSON and map it to a list of chart ids.
    chart_id_list = JSONParseUtils.JSONParseUtils.get_chart_ids(data)

    # 3. Download the export file for every chart id.
    idempotent_load_file(chart_id_list)

if __name__ == '__main__':
    main()

四、验证

4.1 整体框架

基于python实现批量下载processon pos格式的流程图文件_批量下载

4.2 调用

C:/ProgramData/anaconda3/python.exe d:/git/project/main.py

4.3 生成效果

基于python实现批量下载processon pos格式的流程图文件_批量下载_02

基于python实现批量下载processon pos格式的流程图文件_processon_03