问题说明:
若使用java直接批量导入大量数据到elasticsearch服务器则会报错:
Unable to parse response body
org.elasticsearch.client.ResponseException: method [POST], host [http://127.0.0.1:9200], URI [/_bulk?timeout=1m], status line [HTTP/1.1 413 Request Entity Too Large]
解决办法:
使用easyexcel
分批次导入
下面是解决办法
1、首先添加pom
依赖
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.5.5</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<groupId>com.kv</groupId>
<artifactId>demo</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>demo</name>
<description>Demo project for Spring Boot</description>
<properties>
<java.version>1.8</java.version>
<elasticsearch.version>7.12.0</elasticsearch.version>
<hutool.version>5.7.14</hutool.version>
<poi-ooxml.version>5.0.0</poi-ooxml.version>
<fastjson.version>1.2.78</fastjson.version>
<easyexcel.version>2.2.11</easyexcel.version>
</properties>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>easyexcel</artifactId>
<version>${easyexcel.version}</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>${fastjson.version}</version>
</dependency>
<!--ES的client-->
<dependency>
<groupId>org.elasticsearch.client</groupId>
<artifactId>elasticsearch-rest-high-level-client</artifactId>
<version>${elasticsearch.version}</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<configuration>
<excludes>
<exclude>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</exclude>
</excludes>
</configuration>
</plugin>
</plugins>
</build>
</project>
2、配置项目
- 配置ES地址:
@Configuration
@ConfigurationProperties(prefix="elasticsearch")
public class ElasticSearchConfig {
private String host;
private int port;
public String getHost() {
return host;
}
public void setHost(String host) {
this.host = host;
}
public int getPort() {
return port;
}
public void setPort(int port) {
this.port = port;
}
@Bean
public RestHighLevelClient client(){
return new RestHighLevelClient(RestClient.builder(
new HttpHost(host,port,"http")
));
}
}
2、配置文件application.yml
server:
port: 8080
elasticsearch:
host: localhost
port: 9200
3、创建索引添加映射
索引就是类似与数据库的库
,而映射就类似与数据库表的字段
创建一个news
的索引:http://192.168.8.125:9200/news
注意是put
请求
返回结果:
然后添加映射:http://192.168.8.125:9200/news/_mapping
也是put请求
请求体:
{
"properties": {
"title": {
"type": "text",
"index": true
},
"body": {
"type": "text",
"index": true
}
}
}
返回结果:
4、创建java实体类news
@Data
@AllArgsConstructor
@NoArgsConstructor
public class News {
private String title;
private String body;
}
5、创建业务类用于导入数据
NewsService
public interface NewsService {
/**
* 导入数据
* @param news
*/
public void importData(List<News> news);
/**
* 关闭链接
*/
public void closeClient();
}
NewsServiceImpl
@Service
public class NewsServiceImpl implements NewsService {
@Autowired
private RestHighLevelClient client;
@Override
public void importData(List<News> news) {
BulkRequest request = new BulkRequest();
for (News n : news) {
request.add(new IndexRequest().index("news").source(JSON.toJSONString(n), XContentType.JSON));
}
try {
//客户端发送请求,获取响应对象
BulkResponse responses = client.bulk(request, RequestOptions.DEFAULT);
//打印结果信息
System.out.println("took:"+responses.getTook());
System.out.println("items:"+responses.getItems());
} catch (IOException e) {
e.printStackTrace();
}
}
/**
* 关闭链接
*/
@Override
public void closeClient() {
if(client!=null){
try {
client.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
6、创建NewsListener
监听器
我使用的easyexcel来读取excel
easyexcel官网
NewsListener:
@Slf4j
public class NewsListener extends AnalysisEventListener<News> {
/**
* 每隔5条存储数据库,实际使用中可以3000条,然后清理list ,方便内存回收
*/
private static final int BATCH_COUNT = 3000;
/**
* 缓存的数据
*/
private List<News> list = new ArrayList<>(BATCH_COUNT);
private NewsService newsService;
/**
* 如果使用了spring,请使用这个构造方法。每次创建Listener的时候需要把spring管理的类传进来
*
* @param newsService
*/
public NewsListener(NewsService newsService) {
this.newsService =newsService ;
}
/**
* 这个每一条数据解析都会来调用
*
* @param data one row value. Is is same as {@link AnalysisContext#readRowHolder()}
* @param analysisContext
*/
@Override
public void invoke(News data, AnalysisContext analysisContext) {
log.info("解析到一条数据:{}", JSON.toJSONString(data));
list.add(data);
// 达到BATCH_COUNT了,需要去存储一次数据库,防止数据几万条数据在内存,容易OOM
if (list.size() >= BATCH_COUNT) {
saveData();
// 存储完成清理 list
list = new ArrayList<>(BATCH_COUNT);
}
}
/**
* 所有数据解析完成了 都会来调用
*
* @param analysisContext
*/
@Override
public void doAfterAllAnalysed(AnalysisContext analysisContext) {
// 这里也要保存数据,确保最后遗留的数据也存储到数据库
saveData();
//关闭链接
newsService.closeClient();
log.info("所有数据解析完成,关闭连接!");
}
/**
* 加上存储数据库
*/
private void saveData() {
log.info("{}条数据,开始存储数据库!", list.size());
newsService.importData(list);
log.info("存储数据库成功!");
}
}
7、导入测试
@Slf4j
@SpringBootTest
class DemoApplicationTests {
@Autowired
private NewsService newsService;
@Test
public void testSave(){
String fileName = "src/main/resources/excel/news_no_duplicated20210204.xlsx";
// 这里每次会读取3000条数据 然后返回过来 直接调用使用数据就行
// 这里 需要指定读用哪个class去读,然后读取第一个sheet 文件流会自动关闭
EasyExcel.read(fileName, News.class, new NewsListener(newsService)).sheet().doRead();
}
}
说明:news_no_duplicated20210204.xlsx文件就只有两列分别与news实体类对应
,即:title、body
两列