一,说明
pandas在1.0之前支持分块,之后的,我并没有找到。read_excel整体将excel加载到内存,可能出现性能问题,还未找到替代方案
pandas支持xlsx,xls文件
二,代码
1,返回结果格式
example_excel_json = {
"sheet_name1": [
{"SkuId": "12345", "Name": "Product A"},
{"SkuId": "67890", "Name": "Product B"}
],
"sheet_name2": [
{"SkuId": "24680", "Name": "Product C"},
{"SkuId": "78901", "Name": "Product D"}
]
}
2,安装包
import pandas as pd
3,主要读取代码
缓存excel数据
memory = Memory(location='./cache', verbose=0)
@memory.cache
def get_excel_data_cache(file_path: str, sheet_name: str = None, header: int | None = 0):
"""
获取excel数据,返回json格式数据
:param file_path: excel文件路径
:param sheet_name: 工作表名称,默认读取所有工作表
:param header: 表头行数,默认第一行
"""
try:
with pd.ExcelFile(file_path) as excel_file:
current_sheet_data = pd.read_excel(excel_file, sheet_name=sheet_name, header=header, dtype=str)
return current_sheet_data
except Exception as e:
print(e)
print('文件:' + file_path + '\n')
raise NoneException("读取文件异常:" + e.args[0])
校验excel文件
def check_excel_file(file_path: str):
"""
检查excel文件是否存在,是否为excel文件
"""
is_exists = os.path.exists(file_path)
if not is_exists:
raise NoneException("文件不存在")
if not file_path.lower().endswith(".xlsx") and not file_path.lower().endswith(".xls"):
raise NoneException("文件格式错误,仅支持excel文件")
读取代码
获取excel数据,返回json格式数据
:param file_path: excel文件路径
:param sheet_name: 工作表名称,默认读取所有工作表
:param header: 表头行数,默认第一行
:param filter_dict: 过滤条件,字典类型,key为列名,value为过滤值
:return: json格式数据
def excel_pandas(file_path: str, sheet_name: str = None, header: int = 0, filter_dict: Dict[str, str] = None):
excel_json_pandas = {}
try:
if file_path.startswith('/'):
file_path = os.getcwd() + file_path
check_excel_file(file_path)
with pd.ExcelFile(file_path) as excel_data:
if sheet_name is not None and sheet_name not in excel_data.sheet_names:
raise NoneException("sheet_name[" + sheet_name + "]不存在")
for current_sheet_name in excel_data.sheet_names:
if sheet_name and sheet_name != current_sheet_name:
continue
sheet_list_dict = []
# current_sheet_data = pd.read_excel(file_path, sheet_name=current_sheet_name, header=header, dtype=str)
current_sheet_data = get_excel_data_cache(file_path, sheet_name=current_sheet_name, header=header)
conditions = []
if filter_dict is not None:
for dict_key, dict_value in filter_dict.items():
if dict_key not in current_sheet_data.columns:
raise NoneException("过滤条件中字段:" + dict_key + "不在当前sheet中")
conditions.append(current_sheet_data[dict_key] == dict_value)
if conditions and len(conditions) > 0:
final_condition = conditions[0]
for condition in conditions[1:]:
final_condition = final_condition & condition
current_sheet_data = current_sheet_data[final_condition]
for index, row in current_sheet_data.iterrows():
row_dict = {}
for column_name, cell_value in row.items():
row_dict[column_name] = cell_value
sheet_list_dict.append(row_dict)
excel_json_pandas[current_sheet_name] = sheet_list_dict
except NoneException as e:
raise NoneException("读取文件异常:" + e.args[0])
except Exception as e:
print(e)
print('文件:' + file_path + '\n')
raise NoneException("读取文件异常:" + e.args[0])
return excel_json_pandas
使用
获取所有:
sheet_name = 'sheet1'
header = 0
json = excel_pandas(file_path, sheet_name=sheet_name, header=header, filter_dict=filter_dict)
单个匹配
sheet_name = 'sheet1'
header = 0
filter_dict = {'SkuId': '1171424'}
json = excel_pandas(file_path, sheet_name=sheet_name, header=header, filter_dict=filter_dict)
name = json[sheet_name][0]['名称']