# Import the required packages.
import json

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
# Read JSON, JSONL, and Parquet files.
def read_json_file(file_path):
    """Load and return the JSON document stored at ``file_path``.

    The file is read as UTF-8. This is a best-effort loader: any failure is
    reported on stdout and the function returns ``None`` instead of raising.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` if the file is missing, is not
        valid JSON, or any other error occurs while reading it.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            return json.loads(handle.read())
    except FileNotFoundError:
        print(f"File {file_path} not found.")
    except json.JSONDecodeError:
        print(f"File {file_path} is not a valid JSON file.")
    except Exception as exc:
        print(f"An error occurred: {exc}")
    # Every failure path falls through to here.
    return None
def read_jsonl_file(file_path):
    """Read a JSON Lines file and return its records as a list.

    The file is read as UTF-8 and every non-blank line is parsed as one JSON
    value. Whitespace-only lines (for example a trailing empty line) are
    skipped rather than treated as errors.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        ValueError: If a non-blank line is not valid JSON. The message gives
            the 1-based line number and the offending line, and the original
            ``json.JSONDecodeError`` is attached as the cause.
        OSError: If the file cannot be opened.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            if not line.strip():
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Fail with a descriptive error instead of an unrelated
                # ZeroDivisionError, keeping the bad line in the message.
                raise ValueError(
                    f"{file_path}: line {line_number} is not valid JSON: {line!r}"
                ) from err
    return data
def read_praquet_file(file_path):
    """Read a Parquet file and return its rows as a list.

    The file is loaded with ``pyarrow.parquet.read_table`` and converted to a
    pandas DataFrame; each row yielded by ``DataFrame.iterrows`` becomes one
    list element.

    Args:
        file_path: Path of the Parquet file to read.

    Returns:
        A list containing one row object per DataFrame row, in order.
    """
    frame = pq.read_table(file_path).to_pandas()
    rows = []
    for _, record in frame.iterrows():
        rows.append(record)
    return rows
# Write JSON, JSONL, and Parquet files.
def save_json(file_path, data):
    """Write ``data`` to ``file_path`` as pretty-printed JSON.

    The file is written as UTF-8 with a 4-space indent, and non-ASCII
    characters are emitted as-is rather than escaped. A confirmation message
    is printed once the file has been written.

    Args:
        file_path: Destination path; an existing file is overwritten.
        data: JSON-serializable value to store.
    """
    with open(file_path, mode='w', encoding='utf-8') as out:
        json.dump(data, out, ensure_ascii=False, indent=4)
    print(f'Save {file_path} is ok!')
def save_jsonl(file_path, data):
    """Write an iterable of records to ``file_path`` in JSON Lines format.

    Each item is serialized onto its own line, UTF-8 encoded, with non-ASCII
    characters kept unescaped. This is best-effort: any error is reported on
    stdout instead of being raised.

    Args:
        file_path: Destination path; an existing file is overwritten.
        data: Iterable of JSON-serializable items, one per output line.
    """
    try:
        with open(file_path, 'w', encoding='utf-8') as out:
            out.writelines(
                json.dumps(record, ensure_ascii=False) + '\n' for record in data
            )
        print(f"Data saved to {file_path}")
    except Exception as exc:
        print(f"An error occurred while saving the data: {exc}")
def save_parquet(file_path, data):
    """Write tabular data to ``file_path`` as a Parquet file.

    A list is first converted with ``pd.DataFrame(data)``; a DataFrame is
    written as-is. The frame is converted to a pyarrow Table and written with
    ``pyarrow.parquet.write_table``, then a confirmation message is printed.

    Args:
        file_path: Destination path of the Parquet file.
        data: A pandas DataFrame, or a list accepted by ``pd.DataFrame``.

    Raises:
        ValueError: If ``data`` is neither a list nor a pandas DataFrame.
    """
    if isinstance(data, list):
        data = pd.DataFrame(data)
    if not isinstance(data, pd.DataFrame):
        raise ValueError("data must be a pandas DataFrame or a list of lists")
    # ``pa`` is the top-level pyarrow package, imported at the top of the
    # file; Table lives there, not in pyarrow.parquet.
    pq.write_table(pa.Table.from_pandas(data), file_path)
    print(f'Save {file_path} is ok!')
# Example data for the Parquet writer: two columns with three rows each.
data = {'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}
df = pd.DataFrame(data)

# Save the example frame to a Parquet file. This runs at module top level,
# so 'output.parquet' is written whenever the file is executed or imported.
save_parquet('output.parquet', df)