一,说明

pandas在1.0之前支持分块,之后的,我并没有找到。read_excel整体将excel加载到内存,可能出现性能问题,还未找到替代方案

pandas支持xlsx,xls文件

二,代码

1,返回结果格式

example_excel_json = {
    "sheet_name1": [
        {"SkuId": "12345", "Name": "Product A"},
        {"SkuId": "67890", "Name": "Product B"}
    ],
    "sheet_name2": [
        {"SkuId": "24680", "Name": "Product C"},
        {"SkuId": "78901", "Name": "Product D"}
    ]
}

2,安装包

import pandas as pd


3,主要读取代码

缓存excel数据

memory = Memory(location='./cache', verbose=0)
@memory.cache
def get_excel_data_cache(file_path: str, sheet_name: str = None, header: int | None = 0):
    """
    获取excel数据,返回json格式数据
    :param file_path: excel文件路径
    :param sheet_name: 工作表名称,默认读取所有工作表
    :param header: 表头行数,默认第一行
    """
    try:
        with pd.ExcelFile(file_path) as excel_file:
            current_sheet_data = pd.read_excel(excel_file, sheet_name=sheet_name, header=header, dtype=str)
            return current_sheet_data
    except Exception as e:
        print(e)
        print('文件:' + file_path + '\n')
        raise NoneException("读取文件异常:" + e.args[0])

校验excel文件

def check_excel_file(file_path: str):
    """
    检查excel文件是否存在,是否为excel文件
    """
    is_exists = os.path.exists(file_path)
    if not is_exists:
        raise NoneException("文件不存在")
    if not file_path.lower().endswith(".xlsx") and not file_path.lower().endswith(".xls"):
        raise NoneException("文件格式错误,仅支持excel文件")

读取代码


 获取excel数据,返回json格式数据
 :param file_path: excel文件路径
 :param sheet_name: 工作表名称,默认读取所有工作表
 :param header: 表头行数,默认第一行
 :param filter_dict: 过滤条件,字典类型,key为列名,value为过滤值
 :return: json格式数据
def excel_pandas(file_path: str, sheet_name: str = None, header: int = 0, filter_dict: Dict[str, str] = None):
    excel_json_pandas = {}
    try:
        if file_path.startswith('/'):
            file_path = os.getcwd() + file_path
        check_excel_file(file_path)
        with pd.ExcelFile(file_path) as excel_data:
            if sheet_name is not None and sheet_name not in excel_data.sheet_names:
                raise NoneException("sheet_name[" + sheet_name + "]不存在")
            for current_sheet_name in excel_data.sheet_names:
                if sheet_name and sheet_name != current_sheet_name:
                    continue
                sheet_list_dict = []
                # current_sheet_data = pd.read_excel(file_path, sheet_name=current_sheet_name, header=header, dtype=str)
                current_sheet_data = get_excel_data_cache(file_path, sheet_name=current_sheet_name, header=header)
                conditions = []
                if filter_dict is not None:
                    for dict_key, dict_value in filter_dict.items():
                        if dict_key not in current_sheet_data.columns:
                            raise NoneException("过滤条件中字段:" + dict_key + "不在当前sheet中")
                        conditions.append(current_sheet_data[dict_key] == dict_value)
                if conditions and len(conditions) > 0:
                    final_condition = conditions[0]
                    for condition in conditions[1:]:
                        final_condition = final_condition & condition
                    current_sheet_data = current_sheet_data[final_condition]
                for index, row in current_sheet_data.iterrows():
                    row_dict = {}
                    for column_name, cell_value in row.items():
                        row_dict[column_name] = cell_value
                    sheet_list_dict.append(row_dict)
                excel_json_pandas[current_sheet_name] = sheet_list_dict
    except NoneException as e:
        raise NoneException("读取文件异常:" + e.args[0])
    except Exception as e:
        print(e)
        print('文件:' + file_path + '\n')
        raise NoneException("读取文件异常:" + e.args[0])
    return excel_json_pandas

使用

获取所有:
sheet_name = 'sheet1'
header = 0
json = excel_pandas(file_path, sheet_name=sheet_name, header=header, filter_dict=filter_dict)

单个匹配
sheet_name = 'sheet1'
header = 0
filter_dict = {'SkuId': '1171424'}
json = excel_pandas(file_path, sheet_name=sheet_name, header=header, filter_dict=filter_dict)

name = json[sheet_name][0]['名称']