前言
在工作的过程中遇到了大批量数据入库的操作,总共三张Excel,格式不复杂,以往以手工方式通过navicat入库,这里通过Python实现调用Excel然后入库,没有采用多线程和多进程,后续会尝试用,文章主要用到了对于dataframe.dropna和字符串拼接方法的运用。
#-*-coding:utf-8 -*-
import pandas as pd
import os
import pymysql
# import multiprocessing as mp
'''
author:shikailiang
function:平台的数据导入到mysql中,共三张表采用三个方法单线程执行
notice:为实现插入速度的加快,采用一次性构建三百条的插入SQL语句
'''
def sale_out_in_database(sale_out,data_path):
# 拼接路径,获取数据
sale_out_path=data_path+"\\"+sale_out
sale_out_df=pd.read_excel(sale_out_path)
# 采用第2,3列不为nan的方式去除错误行
drop_list=sale_out_df.columns.tolist()[2:4]
sale_out_df.dropna(subset=drop_list,inplace=True)
# 获取列名
column_list=sale_out_df.iloc[1,:].tolist()
# 去除不为nan的错误行
sale_out_df1=sale_out_df[(sale_out_df.iloc[:,0] !=sale_out_df.iloc[0,0]) & (sale_out_df.iloc[:,0] !=sale_out_df.iloc[1,0]) ]
sale_out_df1.columns=column_list
# 获取行数
rowcount=sale_out_df1.shape[0]
# 下面的循环为拼接SQL,一次性插入三百行的方式
j=1
sql=""
sql1 = "insert into tb_sale_out_test(日期,订单编号,客户,物料编码,物料名称,单位,实发数量,仓库) values"
for i in sale_out_df1.iterrows():
sql2=(("(" + "'{}'," * 8)[:-1] + ")").format(i[1]["单据日期"], i[1]["单据号"], i[1]["客户"], i[1]["物料编码"], i[1]["物料名称"],
i[1]["主单位"], i[1]["主数量"], i[1]["仓库名称"])
# sql = (("insert into tb_sale_out_test(日期,订单编号,客户,物料编码,物料名称,单位,实发数量,仓库) values"
# "(" + "'{}'," * 8)[:-1] + ")").format(i[1]["单据日期"], i[1]["单据号"], i[1]["客户"], i[1]["物料编码"], i[1]["物料名称"],
# i[1]["主单位"], i[1]["主数量"], i[1]["仓库名称"])
sql = sql+","+sql2
if divmod(j, 300)[1] == 0 or j == rowcount:
# 如果执行错误回滚当前事务
try:
cursor.execute(sql1 + sql[1:])
except:
conn.rollback()
print(purchase_in, sql1 + sql[1:])
continue
sql = ""
j=j+1
# 每一百条打印
if divmod(j,100)[1]==0:
print("已经插入tb_sale_out_test"+str(j)+"条")
print("插入"+str(j-1)+"条")
def purchase_order_in_database(purchase_order,data_path):
# 注释同上
purchase_order_path=data_path+"\\"+purchase_order
purchase_order_df=pd.read_excel(purchase_order_path)
drop_list=purchase_order_df.columns.tolist()[2:4]
purchase_order_df.dropna(subset=drop_list, inplace=True)
column_list = purchase_order_df.iloc[1, :].tolist()
purchase_order_df1 = purchase_order_df[(purchase_order_df.iloc[:, 0] != purchase_order_df.iloc[0, 0]) & (purchase_order_df.iloc[:, 0] != purchase_order_df.iloc[1, 0])]
purchase_order_df1.columns = column_list
rowcount=purchase_order_df1.shape[0]
j = 1
sql = ""
sql1="insert into tb_purchase_order_test(material_code,material_name,create_date,unit_price) values"
for i in purchase_order_df1.iterrows():
# sql = (("insert into tb_purchase_order_test(material_code,material_name,create_date,unit_price) values"
# "(" + "'{}'," * 4)[:-1] + ")").format(i[1]["物料编码"], i[1]["物料名称"], i[1]["计划到货日期"], i[1]["含税净价"])
sql2= (("(" + "'{}'," * 4)[:-1] + ")").format(i[1]["物料编码"], i[1]["物料名称"], i[1]["计划到货日期"], i[1]["含税净价"])
sql = sql + "," + sql2
if divmod(j, 300)[1] == 0 or j == rowcount:
try:
cursor.execute(sql1 + sql[1:])
except:
conn.rollback()
print(purchase_in, sql1 + sql[1:])
continue
sql = ""
conn.commit()
j = j + 1
if divmod(j, 100)[1] == 0:
print("已经插入tb_purchase_order_test" + str(j) + "条")
print("插入"+str(j-1)+"条")
def purchase_in_in_database(purchase_in,data_path):
# 单独的读取Excel文件报错,目前复制出来单独存为txt解决采用open方式读取文件
purchase_in_path = data_path + "\\" + purchase_in
f=open(purchase_in_path)
purchase_order_df = pd.DataFrame(f.readlines())
# 根据\t和\n对series分裂组装成dataframe
purchase_order_df = purchase_order_df[0].str.split('\t|\n', expand=True)
# 获取列名
purchase_order_df_column=purchase_order_df.iloc[0,:]
# 赋值数据行
purchase_order_df = purchase_order_df.iloc[1:,:]
# 赋值列名
purchase_order_df.columns=purchase_order_df_column
# 获取行数
rowcount=purchase_order_df.shape[0]
# 下面的循环为拼接SQL,一次性插入三百行的方式
j=1
sql=""
sql1 = "insert into tb_purchase_in_test(单据编号,入库日期,物料编码,物料名称,单位,实收数量,仓库,unit_price,仓位) values"
for i in purchase_order_df.iterrows():
sql2 = (("(" + "'{}'," * 9)[
:-1] + ")").format(i[1]["单据编号"], i[1]["入库日期"], i[1]["物料编码"], i[1]["物料名称"], i[1]["单位"], i[1]["实收数量"],i[1]["仓库"],i[1]["unit_price"], i[1]["仓位"])
sql = sql+","+sql2
if divmod(j, 300)[1] == 0 or j == rowcount:
try:
# sql=(("insert into tb_purchase_in_test(单据编号,入库日期,物料编码,物料名称,实收数量,仓位) values("+"'{}'," * 6)[:-1]+ ")").format(
# i[1]["单据编号"],i[1]["入库日期"],i[1]["物料编码"],i[1]["物料名称"],i[1]["实收数量"],i[1]["仓位"])
cursor.execute(sql1+sql[1:])
except:
conn.rollback()
print(purchase_in, sql1+sql[1:])
continue
sql = ""
conn.commit()
j = j + 1
if divmod(j, 100)[1] == 0:
print("已经插入tb_purchase_in_test" + str(j) + "条")
print("插入"+str(j-1)+"条")
if __name__ == "__main__":
# 连接mysql
conn = pymysql.connect(host="", user="",password="",database="",charset="utf8")
cursor = conn.cursor()
# 读取data路径并判断采用不同的method处理
last_path = os.path.abspath(os.path.dirname(os.getcwd()))
data_path = last_path + r"\data"
data_list = os.listdir(data_path)
purchase_in = [i for i in data_list if i[:6] == "采购入库单."][0]
purchase_order = [i for i in data_list if i[:9] == "采购订单明细查询."][0]
sale_out = [i for i in data_list if i[:9] == "销售订单执行查询."][0]
# pool = Pool(processes=3)
# p1=mp.Process(target=sale_out_in_database,args=(sale_out,data_path))
# p2 = mp.Process(target=purchase_order_in_database, args=(purchase_order,data_path))
# p3 = mp.Process(target=purchase_in_in_database, args=(purchase_in,data_path))
# p1.start()
# p2.start()
# p3.start()
# pool.apply(sale_out_in_database,(sale_out,data_path))
# pool.apply(purchase_order_in_database, (purchase_order,data_path))
# pool.apply(purchase_in_in_database, (purchase_in,data_path))
# pool.close()
# pool.join()
# 执行方法
sale_out_in_database(sale_out, data_path)
purchase_order_in_database(purchase_order, data_path)
purchase_in_in_database(purchase_in, data_path)
# 关闭数据库
cursor.close()
conn.close()