The business requirement is to take a table with more than 20 million rows and re-insert its data into split tables.
Add the dependencies
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-batch</artifactId>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>${mysql.connector.java.version}</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>druid</artifactId>
<version>${druid.version}</version>
</dependency>
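The MyBatisPagingItemReader / MyBatisBatchItemWriter and the BaseMapper interfaces used further down come from mybatis-spring and MyBatis-Plus, which the list above does not show; presumably the project also declares something like the following (artifact choice and version property are my assumption):
<dependency>
<groupId>com.baomidou</groupId>
<artifactId>mybatis-plus-boot-starter</artifactId>
<version>${mybatis.plus.version}</version>
</dependency>
A data source is also needed for both the business tables and the Spring Batch metadata tables; a minimal sketch with placeholder URL and credentials:
spring.datasource.type=com.alibaba.druid.pool.DruidDataSource
spring.datasource.url=jdbc:mysql://localhost:3306/your_db?useUnicode=true&characterEncoding=utf8&serverTimezone=Asia/Shanghai
spring.datasource.username=your_user
spring.datasource.password=your_password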
Add the batch configuration
# On startup, run the SQL script shipped with Spring Batch to create its metadata tables
spring.sql.init.schema-locations=classpath:org/springframework/batch/core/schema-mysql.sql
# Whether to run the SQL script at startup: always or never. Use always for the first start, then switch to never
spring.sql.init.mode=never
# Do not launch jobs automatically at startup (the job is triggered manually instead)
spring.batch.job.enabled=false
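Because spring.batch.job.enabled=false, nothing launches the job at startup, so it has to be triggered by hand. Below is a minimal sketch of a trigger endpoint; the controller class, request path and the explicit run.id request parameter are my own additions, not part of the original project. Note that a plain JobLauncher.run(...) does not apply the job's incrementer, so the parameter is passed explicitly here:
import org.springframework.batch.core.Job;
import org.springframework.batch.core.JobExecution;
import org.springframework.batch.core.JobParameters;
import org.springframework.batch.core.JobParametersBuilder;
import org.springframework.batch.core.launch.JobLauncher;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;

@RestController
public class JobTriggerController {
    @Autowired
    private JobLauncher jobLauncher;
    @Autowired
    private Job myJob; // the "myJob" bean defined in the job configuration below

    @GetMapping("/job/run")
    public String run(@RequestParam(defaultValue = "44") long runId) throws Exception {
        // re-using the same run.id resumes a previously failed JobInstance instead of starting a new one
        JobParameters params = new JobParametersBuilder()
                .addLong("run.id", runId)
                .toJobParameters();
        JobExecution execution = jobLauncher.run(myJob, params);
        return execution.getStatus().toString();
    }
}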
Add the application startup class
//Enable batch processing
@EnableBatchProcessing
@SpringBootApplication
public class GatherPlusApplication {
public static void main(String[] args) {
SpringApplication.run(GatherPlusApplication.class, args);
}
}
Add the job configuration
@Configuration
public class MonitorGatherHistoryJobConfig {
@Autowired
private JobBuilderFactory jobBuilderFactory;
@Autowired
private StepBuilderFactory stepBuilderFactory;
@Autowired
private SqlSessionFactory sqlSessionFactory;
@Bean
@StepScope //required: without step scope the reader cannot receive the partition parameters from the step execution context
public MyBatisPagingItemReader<BitMonitorGather1History> itemReader( @Value("#{stepExecutionContext[startIndex]}")Long startIndex,
@Value("#{stepExecutionContext[endIndex]}")Long endIndex) {
System.out.println("----------MyBatisPagingItemReader开始-----startIndex: " + startIndex + " -----endIndex:" + endIndex);
MyBatisPagingItemReader<BitMonitorGather1History> reader = new MyBatisPagingItemReader<>();
reader.setSqlSessionFactory(sqlSessionFactory);
//each page query reads 1000 rows
reader.setPageSize(1000);
reader.setQueryId("water.gather.plus.gather.mapper.BitMonitorGather1HistoryMapper.selectTempForList");
Map<String, Object> map = new HashMap<>(2);
map.put("startIndex",startIndex);
map.put("endIndex",endIndex);
reader.setParameterValues(map);
return reader;
}
@Bean
public ItemProcessor<BitMonitorGather1History, BitMonitorGatherHistory> itemProcessor() {
return new ItemProcessor<BitMonitorGather1History, BitMonitorGatherHistory>() {
//here I only reformat the date: MIN stores the minute as yyyyMMddHHmm, so appending "00" gives a full yyyyMMddHHmmss string for Hutool's DateUtil.parse (e.g. 202310240530 becomes 2023-10-24 05:30:00)
@Override
public BitMonitorGatherHistory process(BitMonitorGather1History bitMonitorGather1History) throws Exception {
BitMonitorGatherHistory bitMonitorGatherHistory = new BitMonitorGatherHistory();
BeanUtil.copyProperties(bitMonitorGather1History, bitMonitorGatherHistory, "insertTime");
Long min = bitMonitorGatherHistory.getMin();
DateTime dateTime = DateUtil.parse(min + "00");
bitMonitorGatherHistory.setInsertTime(dateTime);
return bitMonitorGatherHistory;
}
};
}
//The insert step. A pitfall here: I started with MyBatis' generated insert method, and the job failed halfway through.
//Stepping into the source showed that the executed SQL sometimes differed: several of the fields I read are sometimes null,
//and the generated insert drops any column whose value is null, so a single chunk ended up with several different INSERT statements.
//The statements executed in one batch must be identical, so I hand-wrote the insert statement instead.
@Bean
public MyBatisBatchItemWriter<BitMonitorGatherHistory> itemWriter() {
MyBatisBatchItemWriter<BitMonitorGatherHistory> itemWriter = new MyBatisBatchItemWriter<>();
itemWriter.setSqlSessionFactory(sqlSessionFactory);
itemWriter.setStatementId("water.gather.plus.gather.mapper.BitMonitorGatherHistoryMapper.save"); //the hand-written insert statement below
return itemWriter;
}
@Bean
public Step workStep() {
return stepBuilderFactory.get("workStep")
.<BitMonitorGather1History, BitMonitorGatherHistory>chunk(1000) // chunk size: 1000 items are read, processed and written per transaction
.reader(itemReader(null,null))
.processor(itemProcessor())
.writer(itemWriter())
.build();
}
@Bean
public Step masterStep() {
return stepBuilderFactory.get("masterStep")
.partitioner(workStep().getName(), partitioner())
.partitionHandler(partitionHandler())
.build();
}
@Bean
public MonitorGatherJobListener monitorGatherJobListener() {
return new MonitorGatherJobListener();
}
@Bean
public PartitionHandler partitionHandler() {
TaskExecutorPartitionHandler handler = new TaskExecutorPartitionHandler();
handler.setGridSize(50); //here I run 50 threads
handler.setTaskExecutor(new SimpleAsyncTaskExecutor());
handler.setStep(workStep());
try {
handler.afterPropertiesSet();
} catch (Exception e) {
e.printStackTrace();
}
return handler;
}
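// Note (my addition, not in the original post): SimpleAsyncTaskExecutor spawns a fresh thread per partition,
// so the actual concurrency equals the number of partitions produced by the partitioner, not the gridSize.
// If the partitioner emits hundreds of one-hour ranges, a bounded executor caps the thread count, e.g.:
//   ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
//   executor.setCorePoolSize(50);
//   executor.setMaxPoolSize(50);
//   executor.setQueueCapacity(Integer.MAX_VALUE);
//   executor.initialize();
//   handler.setTaskExecutor(executor);
// or simply call setConcurrencyLimit(50) on the SimpleAsyncTaskExecutor.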
@Bean
public MonitorGatherTimePartitioner partitioner() {
return new MonitorGatherTimePartitioner();
}
@Bean
public Job myJob() {
return jobBuilderFactory.get("myJob")
.start(masterStep())
.incrementer(myParamIncrementer()) //lets the job be launched more than once
.listener(monitorGatherJobListener())
.build();
}
@Bean
public MyParamIncrementer myParamIncrementer(){
return new MyParamIncrementer();
}
}
//The query executed by the itemReader
public interface BitMonitorGather1HistoryMapper extends BaseMapper<BitMonitorGather1History> {
@Select("select * from bit_monitor_gather_1_history where and MIN >= #{startIndex} and MIN < #{endIndex} limit #{_pagesize} OFFSET #{_skiprows}")
List<BitMonitorGather1History> selectTempForList();
}
//Because of the insert problem described above, the statement is hand-written so that every execution uses exactly the same SQL.
public interface BitMonitorGatherHistoryMapper extends BaseMapper<BitMonitorGatherHistory> {
@Insert("INSERT INTO bit_monitor_gather_history (\n" +
"\tID,\n" +
"\tMEASURE_ID,\n" +
"\tFACTORY_SN,\n" +
"\tMONITOR_CHILD_ID,\n" +
"\tPLC_NAME,\n" +
"\tFACTORY_NAME,\n" +
"\tDEVICE_STATUS,\n" +
"\t`MIN`,\n" +
"\tMEASURE_POINT_NAME,\n" +
"\tMEASURE_POINT_NAME_SHOW,\n" +
"\tMEASURE_POINT_RAW_DATA,\n" +
"\tMEASURE_POINT_UNIT,\n" +
"\tMEASURE_POINT_DATA_TYPE,\n" +
"\tMEASURE_POINT_TIME_STAMP,\n" +
"\tINSERT_TIME,\n" +
"\tPARENT_SYSTEM_NAME\n" +
")\n" +
"VALUES\n" +
"\t(\n" +
"\t\t#{id}, #{measureId}, #{factorySn}, #{monitorChildId}, #{plcName}, #{factoryName}, #{deviceStatus}, #{min}, #{measurePointName},#{measurePointNameShow}, #{measurePointRawData}, #{measurePointUnit}, #{measurePointDataType}, #{measurePointTimeStamp}, #{insertTime}, #{parentSystemName}\n" +
"\t);")
int save(BitMonitorGatherHistory bitMonitorGatherHistory);
}
//Add the partitioner. The table holds roughly 40,000 rows per hour, so each partition covers a one-hour range
public class MonitorGatherTimePartitioner implements Partitioner {
@Override
public Map<String, ExecutionContext> partition(int gridSize) {
DateTime startTime = DateUtil.parse("20231024000000");
DateTime lastTime = DateUtil.parse("20231116000000");
Map<String, ExecutionContext> partitions = new HashMap<>();
long between = DateUtil.between(startTime, lastTime,DateUnit.HOUR)+1;
for (int i = 0; i < between; i++) {
if (startTime.compareTo(lastTime) >=0){
break;
}
String start = startTime.toString("yyyyMMddHHmm");
String end = DateUtil.offsetHour(startTime,1).toString("yyyyMMddHHmm");
// 创建分区的参数集合
Map<String, Object> partitionParams = new HashMap<>();
partitionParams.put("startIndex", Long.valueOf(start));
partitionParams.put("endIndex", Long.valueOf(end));
// 创建 ExecutionContext 并添加到分区集合中
ExecutionContext context = new ExecutionContext(partitionParams);
partitions.put("partition" + i, context);
startTime = DateUtil.offsetHour(startTime,1);
}
return partitions;
}
}
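To make the partitioning concrete (my own illustration, derived from the code above): for the configured range the partitioner produces 552 one-hour partitions, each with its own ExecutionContext, for example:
partition0 -> startIndex=202310240000, endIndex=202310240100
partition1 -> startIndex=202310240100, endIndex=202310240200
... up to the last partition ending at 202311160000. These values are what the @StepScope reader receives as startIndex and endIndex.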
//Add the job listener
public class MonitorGatherJobListener implements JobExecutionListener {
@Override
public void beforeJob(JobExecution jobExecution) {
long begin = System.currentTimeMillis();
jobExecution.getExecutionContext().putLong("begin", begin);
System.err.println("-------------------------【开始时间:】---->"+begin+"<-----------------------------");
}
@Override
public void afterJob(JobExecution jobExecution) {
long begin = jobExecution.getExecutionContext().getLong("begin");
long end = System.currentTimeMillis();
System.err.println("-------------------------【结束时间:】---->"+end+"<-----------------------------");
System.err.println("-------------------------【总耗时:】---->"+(end - begin)+"<-----------------------------");
}
}
//Add a custom parameter incrementer. Because the insert problem above made the job fail several times, my run never completed, so I hard-code the id of that run; launching the job again with the same id resumes from where it failed instead of starting over.
//If your job should start from scratch on every launch, use the incrementer that ships with Spring Batch instead.
public class MyParamIncrementer implements JobParametersIncrementer {
@Override
public JobParameters getNext(JobParameters jobParameters) {
return new JobParametersBuilder(jobParameters)
.addLong("run.id", 44L)
.toJobParameters();
}
}
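For the start-fresh-every-time case mentioned above, the stock incrementer can be swapped in instead of the custom class; a minimal sketch, where only the .incrementer(...) line of the job definition changes:
import org.springframework.batch.core.launch.support.RunIdIncrementer;

//in MonitorGatherHistoryJobConfig: RunIdIncrementer generates a new run.id on every launch,
//so each launch creates a brand-new JobInstance instead of resuming the previous one
@Bean
public Job myJob() {
return jobBuilderFactory.get("myJob")
.start(masterStep())
.incrementer(new RunIdIncrementer())
.listener(monitorGatherJobListener())
.build();
}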
Processing 20 million rows takes a while, so be patient...