前言

在工作的过程中遇到了大批量数据入库的操作,总共三张Excel,格式不复杂,以往以手工方式通过navicat入库,这里通过Python实现调用Excel然后入库,没有采用多线程和多进程,后续会尝试用,文章主要用到了对于dataframe.dropna和字符串拼接方法的运用。

pandas.DataFrame.dropna官方文档

Python拼接字符串的7种方法总结

#-*-coding:utf-8 -*-
import pandas as pd
import os
import pymysql
# import multiprocessing as mp
'''
author:shikailiang
function:平台的数据导入到mysql中,共三张表采用三个方法单线程执行
notice:为实现插入速度的加快,采用一次性构建三百条的插入SQL语句
'''

def sale_out_in_database(sale_out,data_path):
    # 拼接路径,获取数据
    sale_out_path=data_path+"\\"+sale_out
    sale_out_df=pd.read_excel(sale_out_path)
    # 采用第2,3列不为nan的方式去除错误行
    drop_list=sale_out_df.columns.tolist()[2:4]
    sale_out_df.dropna(subset=drop_list,inplace=True)
    # 获取列名
    column_list=sale_out_df.iloc[1,:].tolist()
    # 去除不为nan的错误行
    sale_out_df1=sale_out_df[(sale_out_df.iloc[:,0] !=sale_out_df.iloc[0,0]) & (sale_out_df.iloc[:,0] !=sale_out_df.iloc[1,0]) ]
    sale_out_df1.columns=column_list
    # 获取行数
    rowcount=sale_out_df1.shape[0]
    # 下面的循环为拼接SQL,一次性插入三百行的方式
    j=1
    sql=""
    sql1 = "insert into tb_sale_out_test(日期,订单编号,客户,物料编码,物料名称,单位,实发数量,仓库) values"
    for i in sale_out_df1.iterrows():
        sql2=(("(" + "'{}'," * 8)[:-1] + ")").format(i[1]["单据日期"], i[1]["单据号"], i[1]["客户"], i[1]["物料编码"], i[1]["物料名称"],
                                                      i[1]["主单位"], i[1]["主数量"], i[1]["仓库名称"])
        # sql = (("insert into tb_sale_out_test(日期,订单编号,客户,物料编码,物料名称,单位,实发数量,仓库) values"
        #         "(" + "'{}'," * 8)[:-1] + ")").format(i[1]["单据日期"], i[1]["单据号"], i[1]["客户"], i[1]["物料编码"], i[1]["物料名称"],
        #                                               i[1]["主单位"], i[1]["主数量"], i[1]["仓库名称"])
        sql = sql+","+sql2
        if divmod(j, 300)[1] == 0 or j == rowcount:
            # 如果执行错误回滚当前事务
            try:
                cursor.execute(sql1 + sql[1:])
            except:
                conn.rollback()
                print(purchase_in, sql1 + sql[1:])
                continue
            sql = ""
        j=j+1
        # 每一百条打印
        if divmod(j,100)[1]==0:
            print("已经插入tb_sale_out_test"+str(j)+"条")
    print("插入"+str(j-1)+"条")
def purchase_order_in_database(purchase_order,data_path):
    # 注释同上
    purchase_order_path=data_path+"\\"+purchase_order
    purchase_order_df=pd.read_excel(purchase_order_path)
    drop_list=purchase_order_df.columns.tolist()[2:4]
    purchase_order_df.dropna(subset=drop_list, inplace=True)
    column_list = purchase_order_df.iloc[1, :].tolist()
    purchase_order_df1 = purchase_order_df[(purchase_order_df.iloc[:, 0] != purchase_order_df.iloc[0, 0]) & (purchase_order_df.iloc[:, 0] != purchase_order_df.iloc[1, 0])]
    purchase_order_df1.columns = column_list
    rowcount=purchase_order_df1.shape[0]
    j = 1
    sql = ""
    sql1="insert into tb_purchase_order_test(material_code,material_name,create_date,unit_price) values"
    for i in purchase_order_df1.iterrows():
        # sql = (("insert into tb_purchase_order_test(material_code,material_name,create_date,unit_price) values"
        #         "(" + "'{}'," * 4)[:-1] + ")").format(i[1]["物料编码"], i[1]["物料名称"], i[1]["计划到货日期"], i[1]["含税净价"])
        sql2= (("(" + "'{}'," * 4)[:-1] + ")").format(i[1]["物料编码"], i[1]["物料名称"], i[1]["计划到货日期"], i[1]["含税净价"])
        sql = sql + "," + sql2
        if divmod(j, 300)[1] == 0 or j == rowcount:
            try:
                cursor.execute(sql1 + sql[1:])
            except:
                conn.rollback()
                print(purchase_in, sql1 + sql[1:])
                continue
            sql = ""
        conn.commit()
        j = j + 1
        if divmod(j, 100)[1] == 0:
            print("已经插入tb_purchase_order_test" + str(j) + "条")
    print("插入"+str(j-1)+"条")
def purchase_in_in_database(purchase_in,data_path):
    # 单独的读取Excel文件报错,目前复制出来单独存为txt解决采用open方式读取文件
    purchase_in_path = data_path + "\\" + purchase_in
    f=open(purchase_in_path)
    purchase_order_df = pd.DataFrame(f.readlines())
    # 根据\t和\n对series分裂组装成dataframe
    purchase_order_df = purchase_order_df[0].str.split('\t|\n', expand=True)
    # 获取列名
    purchase_order_df_column=purchase_order_df.iloc[0,:]
    # 赋值数据行
    purchase_order_df = purchase_order_df.iloc[1:,:]
    # 赋值列名
    purchase_order_df.columns=purchase_order_df_column
    # 获取行数
    rowcount=purchase_order_df.shape[0]
    # 下面的循环为拼接SQL,一次性插入三百行的方式
    j=1
    sql=""
    sql1 = "insert into tb_purchase_in_test(单据编号,入库日期,物料编码,物料名称,单位,实收数量,仓库,unit_price,仓位) values"
    for i in purchase_order_df.iterrows():
        sql2 = (("(" + "'{}'," * 9)[
               :-1] + ")").format(i[1]["单据编号"], i[1]["入库日期"], i[1]["物料编码"], i[1]["物料名称"], i[1]["单位"], i[1]["实收数量"],i[1]["仓库"],i[1]["unit_price"], i[1]["仓位"])
        sql = sql+","+sql2
        if divmod(j, 300)[1] == 0 or j == rowcount:
            try:
            #     sql=(("insert into tb_purchase_in_test(单据编号,入库日期,物料编码,物料名称,实收数量,仓位) values("+"'{}'," * 6)[:-1]+ ")").format(
            #     i[1]["单据编号"],i[1]["入库日期"],i[1]["物料编码"],i[1]["物料名称"],i[1]["实收数量"],i[1]["仓位"])
                cursor.execute(sql1+sql[1:])
            except:
                conn.rollback()
                print(purchase_in, sql1+sql[1:])
                continue
            sql = ""
            conn.commit()
        j = j + 1
        if divmod(j, 100)[1] == 0:
            print("已经插入tb_purchase_in_test" + str(j) + "条")
    print("插入"+str(j-1)+"条")
if __name__ == "__main__":
    # 连接mysql
    conn = pymysql.connect(host="", user="",password="",database="",charset="utf8")
    cursor = conn.cursor()
    # 读取data路径并判断采用不同的method处理
    last_path = os.path.abspath(os.path.dirname(os.getcwd()))
    data_path = last_path + r"\data"
    data_list = os.listdir(data_path)
    purchase_in = [i for i in data_list if i[:6] == "采购入库单."][0]
    purchase_order = [i for i in data_list if i[:9] == "采购订单明细查询."][0]
    sale_out = [i for i in data_list if i[:9] == "销售订单执行查询."][0]
    # pool = Pool(processes=3)
    # p1=mp.Process(target=sale_out_in_database,args=(sale_out,data_path))
    # p2 = mp.Process(target=purchase_order_in_database, args=(purchase_order,data_path))
    # p3 = mp.Process(target=purchase_in_in_database, args=(purchase_in,data_path))
    # p1.start()
    # p2.start()
    # p3.start()
    # pool.apply(sale_out_in_database,(sale_out,data_path))
    # pool.apply(purchase_order_in_database, (purchase_order,data_path))
    # pool.apply(purchase_in_in_database, (purchase_in,data_path))
    # pool.close()
    # pool.join()
    # 执行方法
    sale_out_in_database(sale_out, data_path)
    purchase_order_in_database(purchase_order, data_path)
    purchase_in_in_database(purchase_in, data_path)
    # 关闭数据库
    cursor.close()
    conn.close()