mysql插入千万级别数据

背景

项目中期需要进行性能测试:大数据平台要处理千万级别的数据并验证其性能,因此需要先向 MySQL 中插入千万级别的测试数据

第一次插入数据

最直接的想法,就是通过jdbc方式,将数据插入到MySQL中,因为原来造测试数据,进行自测时,也是同样的做法


// Logger for this test class.
private static final Logger LOG = LoggerFactory.getLogger(CreateLeagalPersonTest.class);
// Random source; not referenced in the visible snippets — presumably used by the create* data generators (TODO confirm).
private static Random rdn = new Random();
// Shared JDBC connection; assigned by before()/initConn() and closed by the insert methods.
private static Connection conn = null;
// Number of rows generated with flag == true (20 million).
private static int correctDataCount = 20000000;
// Number of rows generated with flag == false (10 million); their indexes follow the correctDataCount rows.
private static int errorDataCount = 10000000;


/**
 * Opens the shared JDBC connection before each test.
 *
 * @throws ClassNotFoundException if the MySQL driver class is not on the classpath
 * @throws SQLException           if the connection cannot be established
 */
@Before
public void before() throws ClassNotFoundException, SQLException {
    // 1. Connect to the database: load the MySQL JDBC driver class first.
    Class.forName("com.mysql.jdbc.Driver");
    // NOTE(review): host and credentials are hard-coded; consider externalising them.
    String url = "jdbc:mysql://10.117.5.223:3306/bigdata";
    String user = "root";
    String password = "123456";
    // DriverManager.getConnection either returns a connection or throws SQLException;
    // it never returns null, so no null check (or System.exit) is needed here.
    conn = DriverManager.getConnection(url, user, password);
}

/**
 * Baseline approach: inserts {@code correctDataCount} rows one at a time,
 * executing one INSERT per row through a single prepared statement.
 *
 * <p>The statement and the shared connection are closed when the method ends,
 * whether it completes normally or an insert fails.
 *
 * @throws SQLException if preparing or executing the statement fails
 */
@Test
public void test() throws SQLException {
    // 2. SQL with one '?' placeholder per column.
    String sql = "INSERT INTO tb_test_leagal_person (ORG_CODE, ORG_NAME ,ORG_ADDRESS ,ORG_TYPE_CODE, REG_NO ," +
            "REG_DATE ,STATUS ,STATUS_NAME ,ECNOMIC_CODE, ECNOMIC_CODE_NAME ,INDUSTRY_CODE ,INDUSTRY_CODE_NAME ," +
            "REG_CAPITAL ,UNI_SCID ) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?)";

    // 3. try-with-resources closes the statement first, then the connection,
    //    even when an insert throws.
    try (Connection connection = conn;
         PreparedStatement pst = connection.prepareStatement(sql)) {
        for (int index = 0; index < correctDataCount; index++) {

            // 4. Bind a concrete value to every placeholder.
            pst.setString(1, createORGCODE(index, true)); // organization code
            pst.setString(2, createORGNAME(index, true)); // organization name
            pst.setString(3, creteORGADDRESS(index, true)); // organization address
            pst.setString(4, createORGTYPECODE(index, true)); // organization type
            pst.setString(5, createREGNO(index, true)); // registration number
            pst.setDate(6, creteDate(index, true)); // founding date
            pst.setString(7, createSTATUS(index, true)); // status
            pst.setString(8, createSTATUSNAME(index, true)); // status name
            pst.setString(9, createECNOMICCODE(index, true)); // economic type
            pst.setString(10, createECNOMIC_CODE_NAME(index, true)); // economic type name
            pst.setString(11, createStringType("行业类别", index, true)); // industry category
            pst.setString(12, createStringType("行业类别名称", index, true)); // industry category name
            pst.setInt(13, createIntType(index, true)); // registered capital
            pst.setString(14, createStringType("统一信用代码", index, true)); // unified credit code

            // 5. Execute the single-row insert and report the outcome.
            int len = pst.executeUpdate();
            System.out.println(len > 0 ? "添加成功 " + index : "添加失败");
        }
    }
}

效果

很直观的发现,效率比较慢,需要进行优化

第二种方式:通过批处理方式

上面的做法是通过一条一条的记录插入的,效率比较低,那就改为批处理的

 //批处理插入
/**
 * Batch approach: binds rows to one prepared statement and sends them to the
 * server in JDBC batches of 5000 rows.
 *
 * <p>SQL errors are printed and swallowed (best effort). The statement and the
 * shared connection are always closed when the method ends.
 */
@Test
public void testInsertBatchData() {
    // 2. SQL with one '?' placeholder per column.
    String sql = "INSERT INTO tb_test_leagal_person (ORG_CODE, ORG_NAME ,ORG_ADDRESS ,ORG_TYPE_CODE, REG_NO ," +
            "REG_DATE ,STATUS ,STATUS_NAME ,ECNOMIC_CODE, ECNOMIC_CODE_NAME ,INDUSTRY_CODE ,INDUSTRY_CODE_NAME ," +
            "REG_CAPITAL ,UNI_SCID ) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?)";

    // NOTE(review): presumably a resume point left over from an interrupted run — confirm.
    final int firstIndex = 4450000;
    final int batchSize = 5000;

    // try-with-resources closes the statement and then the connection, and copes with
    // prepareStatement failing (the old finally block dereferenced a null statement).
    try (Connection connection = conn;
         PreparedStatement pst = connection.prepareStatement(sql)) {
        // Count queued rows instead of testing index % batchSize, so the first
        // batch is a full one regardless of the starting index.
        int pending = 0;
        for (int index = firstIndex; index < correctDataCount; index++) {

            // 4. Bind a concrete value to every placeholder.
            pst.setString(1, createORGCODE(index, true)); // organization code
            pst.setString(2, createORGNAME(index, true)); // organization name
            pst.setString(3, creteORGADDRESS(index, true)); // organization address
            pst.setString(4, createORGTYPECODE(index, true)); // organization type
            pst.setString(5, createREGNO(index, true)); // registration number
            pst.setDate(6, creteDate(index, true)); // founding date
            pst.setString(7, createSTATUS(index, true)); // status
            pst.setString(8, createSTATUSNAME(index, true)); // status name
            pst.setString(9, createECNOMICCODE(index, true)); // economic type
            pst.setString(10, createECNOMIC_CODE_NAME(index, true)); // economic type name
            pst.setString(11, createStringType("行业类别", index, true)); // industry category
            pst.setString(12, createStringType("行业类别名称", index, true)); // industry category name
            pst.setInt(13, createIntType(index, true)); // registered capital
            pst.setString(14, createStringType("统一信用代码", index, true)); // unified credit code

            pst.addBatch();
            pending++;
            if (pending == batchSize) {
                pst.executeBatch();
                System.out.println(("插入成功..." + index));
                pst.clearBatch();
                pending = 0;
            }
        }
        // Send whatever is left in the last, partially filled batch.
        if (pending > 0) {
            pst.executeBatch();
        }
    } catch (SQLException e) {
        // Best effort: report the failure and let the test method return.
        e.printStackTrace();
    }
}

第三种方式

同事说使用mysql 命令可以快速加载文件到mysql中

生成csv文件

/**
 * File-based approach: writes the generated rows to a CSV file.
 * Indexes 0 .. correctDataCount-1 are generated with flag {@code true}; the
 * following errorDataCount indexes are generated with flag {@code false}.
 * Both ranges go to the same output path.
 */
@Test
public void testWrtieToCSV() {
    final String csvFile = "dir/tb_test_leagal_person_2.csv";
    final int validEnd = correctDataCount;
    final int invalidEnd = validEnd + errorDataCount;

    doCreateCVSData(0, validEnd, true, csvFile);
    doCreateCVSData(validEnd, invalidEnd, false, csvFile);
}

/**
 * Generates one CSV row per index in {@code [start, end)} and appends the rows to
 * {@code outputPath}. Rows are buffered in memory and written in chunks so the
 * file is opened once per chunk rather than once per row.
 *
 * @param start      first index to generate (inclusive)
 * @param end        last index to generate (exclusive)
 * @param flag       passed through to every create* generator
 * @param outputPath CSV file to append to
 * @return the rows that have not been written to the file; always empty, because
 *         the final partial chunk is flushed before returning
 */
private List<String> doCreateCVSData(int start, int end, boolean flag, String outputPath) {
    final int flushThreshold = 1000000;
    List<String> pending = new ArrayList<>();
    for (int index = start; index < end; index++) {
        pending.add(buildCsvRow(index, flag));
        if (pending.size() >= flushThreshold) {
            appendRows(outputPath, pending);
            pending.clear();
        }
    }
    // Flush the last partial chunk; previously these rows were only returned
    // to the caller and never reached the file.
    if (!pending.isEmpty()) {
        appendRows(outputPath, pending);
        pending.clear();
    }
    return pending;
}

/** Builds the comma-separated row for one index: the index itself followed by the 14 generated fields. */
private String buildCsvRow(int index, boolean flag) {
    StringBuilder row = new StringBuilder();
    row.append(index);
    row.append(',').append(createORGCODE(index, flag)); // organization code
    row.append(',').append(createORGNAME(index, flag)); // organization name
    row.append(',').append(creteORGADDRESS(index, flag)); // organization address
    row.append(',').append(createORGTYPECODE(index, flag)); // organization type
    row.append(',').append(createREGNO(index, flag)); // registration number
    row.append(',').append(creteDate(index, flag)); // founding date
    row.append(',').append(createSTATUS(index, flag)); // status
    row.append(',').append(createSTATUSNAME(index, flag)); // status name
    row.append(',').append(createECNOMICCODE(index, flag)); // economic type
    row.append(',').append(createECNOMIC_CODE_NAME(index, flag)); // economic type name
    row.append(',').append(createStringType("行业类别", index, flag)); // industry category
    row.append(',').append(createStringType("行业类别名称", index, flag)); // industry category name
    row.append(',').append(createIntType(index, flag)); // registered capital
    row.append(',').append(createStringType("统一信用代码", index, flag)); // unified credit code
    return row.toString();
}

/** Appends every row plus a trailing newline to the file, opening it once; I/O errors are printed and swallowed. */
private static void appendRows(String outputPath, List<String> rows) {
    try (FileWriter writer = new FileWriter(outputPath, true)) {
        for (String row : rows) {
            writer.write(row);
            writer.write("\n");
        }
    } catch (java.io.IOException e) {
        // Best effort, as before: report the failure and carry on.
        e.printStackTrace();
    }
}

/**
 * Appends one line, followed by a newline character, to the given file.
 * The file is opened in append mode and closed again on every call, including
 * when the write fails.
 *
 * @param fileName path of the file to append to
 * @param line     text to write (without the trailing newline)
 * @return always {@code true}
 * @throws Exception if the file cannot be opened, written or closed
 */
public static boolean writeLinesToFile1(String fileName, String line) throws Exception {
    // try-with-resources flushes and closes the writer even if write() throws.
    try (FileWriter fw = new FileWriter(fileName, true)) {
        fw.write(line + "\n");
    }
    return true;
}

加载文件到mysql中

# Load the CSV file into MySQL: fields are separated by ',' and rows by '\n'.
LOAD DATA LOCAL INFILE 'C:\\Users\\wangyg\\Desktop\\tb_test_leagal_person_2.csv' INTO TABLE bigdata.tb_test_leagal_person_2 FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n';

缺陷:

将数据写入csv文件时,同样会有大量的IO,导致所花费的时间很长,所以从某种角度来讲,时间还是一样长,不能解决问题

最终方式

最终,在网上查询资料后,改用下面这种方式(拼接多值 INSERT 并手动提交事务),2千万+数据不到半小时就插入完成

  /**
   * Entry point for the multi-row INSERT approach: opens the connection, then
   * asks insert() for the range 0 .. 2000000 with flag {@code true}.
   */
  @Test
  public void testInsert() throws SQLException, ClassNotFoundException {
    initConn();

    final int rangeStart = 0;
    final int rangeEnd = 2000000;
    insert(rangeStart, rangeEnd, true);
    // Second phase (flag == false rows), currently disabled:
    // insert(correctDataCount, correctDataCount+errorDataCount, false);
  }

/**
 * (Re)opens the shared JDBC connection.
 *
 * <p>If the shared connection is still open (for example because the
 * {@code @Before} hook already opened one), it is closed first so that it is
 * not leaked when the field is overwritten.
 *
 * @throws ClassNotFoundException if the MySQL driver class is not on the classpath
 * @throws SQLException           if closing the old or opening the new connection fails
 */
private void initConn() throws ClassNotFoundException, SQLException {
    // 1. Connect to the database: load the MySQL JDBC driver class first.
    Class.forName("com.mysql.jdbc.Driver");
    // NOTE(review): host and credentials are hard-coded; consider externalising them.
    String url = "jdbc:mysql://10.117.5.223:3306/bigdata";
    String user = "root";
    String password = "123456";
    if (conn != null && !conn.isClosed()) {
        conn.close();
    }
    // DriverManager.getConnection either returns a connection or throws SQLException;
    // it never returns null, so no null check (or System.exit) is needed here.
    conn = DriverManager.getConnection(url, user, password);
}

// Number of rows packed into each multi-row INSERT statement (and committed together) by insert().
private static int singleCount = 100000;

/**
 * Bulk insert: generates one row per index in {@code [startCount, endCount)} and
 * writes them to tb_test_leagal_person_3 with multi-row INSERT statements of up to
 * {@code singleCount} rows each, committing after every statement.
 *
 * <p>SQL errors are printed and swallowed (best effort). The statement and the
 * shared connection are always closed when the method ends, and the elapsed time
 * is printed in seconds.
 *
 * <p>NOTE(review): generated values are concatenated into the SQL text without
 * escaping, so the create* generators must never produce a single quote.
 *
 * @param startCount first index to insert (inclusive)
 * @param endCount   last index to insert (exclusive)
 * @param flag       passed through to every create* generator
 */
public static void insert(int startCount, int endCount, boolean flag) {
    // Start time.
    long begin = System.currentTimeMillis();
    // Fixed head of every statement; the row tuples are appended after VALUES.
    String prefix = " INSERT INTO tb_test_leagal_person_3 (ORG_CODE, ORG_NAME ,ORG_ADDRESS ,ORG_TYPE_CODE, REG_NO ," +
            "STATUS ,STATUS_NAME ,ECNOMIC_CODE, ECNOMIC_CODE_NAME ,INDUSTRY_CODE ,INDUSTRY_CODE_NAME ," +
            "REG_CAPITAL ,UNI_SCID ) VALUES";
    // A plain Statement is used because the SQL text differs per chunk; JDBC does not
    // allow addBatch(String) on a PreparedStatement. try-with-resources closes the
    // statement and then the connection, also when a chunk fails.
    try (Connection connection = conn;
         java.sql.Statement statement = connection.createStatement()) {
        // Commit manually, once per chunk.
        connection.setAutoCommit(false);

        StringBuilder values = new StringBuilder();
        int rowsInChunk = 0;
        int committedRows = 0;
        for (int index = startCount; index < endCount; index++) {
            if (rowsInChunk > 0) {
                values.append(',');
            }
            appendRowTuple(values, index, flag);
            rowsInChunk++;

            boolean chunkFull = rowsInChunk >= singleCount;
            boolean lastRow = index == endCount - 1;
            if (chunkFull || lastRow) {
                statement.executeUpdate(prefix + values);
                connection.commit();
                committedRows += rowsInChunk;
                System.out.println("提交成功..." + committedRows);
                // Reuse the buffer for the next chunk.
                values.setLength(0);
                rowsInChunk = 0;
            }
        }
    } catch (SQLException e) {
        // Best effort, as before: report the failure and still print the timing.
        e.printStackTrace();
    }
    // Elapsed time, reported in seconds.
    long elapsedMillis = System.currentTimeMillis() - begin;
    System.out.println("cost : " + elapsedMillis / 1000 + " s");
}

/** Appends one parenthesised value tuple for the given index; string columns are single-quoted. */
private static void appendRowTuple(StringBuilder sql, int index, boolean flag) {
    sql.append('(');
    appendQuoted(sql, createORGCODE(index, flag)); // organization code
    sql.append(',');
    appendQuoted(sql, createORGNAME(index, flag)); // organization name
    sql.append(',');
    appendQuoted(sql, creteORGADDRESS(index, flag)); // organization address
    sql.append(',');
    appendQuoted(sql, createORGTYPECODE(index, flag)); // organization type
    sql.append(',');
    appendQuoted(sql, createREGNO(index, flag)); // registration number
    sql.append(',');
    // The founding date column is intentionally not part of this INSERT.
    appendQuoted(sql, createSTATUS(index, flag)); // status
    sql.append(',');
    appendQuoted(sql, createSTATUSNAME(index, flag)); // status name
    sql.append(',');
    appendQuoted(sql, createECNOMICCODE(index, flag)); // economic type
    sql.append(',');
    appendQuoted(sql, createECNOMIC_CODE_NAME(index, flag)); // economic type name
    sql.append(',');
    appendQuoted(sql, createStringType("行业类别", index, flag)); // industry category
    sql.append(',');
    appendQuoted(sql, createStringType("行业类别名称", index, flag)); // industry category name
    sql.append(',');
    sql.append(createIntType(index, flag)); // registered capital (numeric, unquoted)
    sql.append(',');
    appendQuoted(sql, createStringType("统一信用代码", index, flag)); // unified credit code
    sql.append(')');
}

/** Appends the value wrapped in single quotes (no escaping is performed). */
private static void appendQuoted(StringBuilder sql, String value) {
    sql.append('\'').append(value).append('\'');
}

注意点

  1. 创建表时,指定MYISAM
-- Table targeted by the LOAD DATA example; uses the MyISAM engine and the utf8 charset.
-- NOTE(review): there is no REG_DATE column here, while the CSV rows written by
-- doCreateCVSData include a date field — confirm the column mapping.
CREATE TABLE bigdata.tb_test_leagal_person_2 (
id INT AUTO_INCREMENT, -- auto-increment primary key
ORG_CODE VARCHAR(9), -- organization code
ORG_NAME VARCHAR(140), -- organization name
ORG_ADDRESS VARCHAR(300), -- organization address
ORG_TYPE_CODE VARCHAR(100), -- organization type
REG_NO VARCHAR(100), -- registration number
`STATUS` VARCHAR(100), -- status
STATUS_NAME VARCHAR(100), -- status name
ECNOMIC_CODE VARCHAR(100), -- economic type
ECNOMIC_CODE_NAME VARCHAR(100), -- economic type name
INDUSTRY_CODE VARCHAR(100), -- industry category
INDUSTRY_CODE_NAME VARCHAR(100), -- industry category name
REG_CAPITAL INT(10), -- registered capital
UNI_SCID VARCHAR(100), -- unified credit code
PRIMARY KEY (id)
) ENGINE MYISAM, CHARSET utf8,
AUTO_INCREMENT 0;
  2. SQL拼接多个value值
  3. 设置事务为非自动提交(注意:MyISAM 引擎不支持事务,该设置只对 InnoDB 等事务型引擎生效)
  4. SET GLOBAL max_allowed_packet=100 *1024*1024*20;(用于保证单条拼接后的 SQL 不超过包大小限制;注意该变量的上限为 1GB,此处约 2GB 的值已超出上限)