前言
在项目开发过程中遇到这样的业务需求,在网上也找了许多资料,但是都比较复杂,需要花点时间去理解,用了各种方法踩坑之后,也请教了一下大佬ES方面的相关知识,最主要还是因为刚用ES不久,所以对ES的用法,数据结构什么的,都不是很熟悉,导致花了比较长的时间去实现这个业务需求,现在就对这个聚合搜索的具体实现代码列出来,供大家参考。
正文
ES索引的Mapping
{
  "mappings": {
    "properties": {
      "aid": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "content": {
        "type": "keyword",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "tagCode": {
        "type": "keyword",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "tagValue": {
        "type": "keyword",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "createDateTime": {
        "type": "date",
        "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "fromModule": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "fromWeb": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "html": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "publisher": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "releaseDateTime": {
        "type": "date",
        "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "title": {
        "type": "keyword",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "updateDateTime": {
        "type": "date",
        "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "url": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "webCla": {
        "type": "keyword",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      }
    }
  }
}
刚开始使用ES的时候,并不知道ES的精确匹配(term、wildcard等不分词的查询)要求对应字段的数据类型是Keyword:Text类型的字段会先被分词,查询匹配的是分词后的词项,而不是完整的原始值。一开始创建索引时字段都用的Text类型,所以进行模糊查询匹配的时候,一直查出不符合预期的数据。
在需要做时间范围筛选或者排序的字段上,记得用上date类型,并且加上format,这样多种格式的日期都能被自动解析。
JAVA代码
pom依赖
<!-- Spring Boot starter for Spring Data Elasticsearch; no version element is declared here -->
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
</dependency>
Entity 类
package com.crawler.service.docment;
import com.baomidou.mybatisplus.annotation.TableField;
import com.crawler.common.base.IdEntity;
import com.crawler.common.constant.GlobalConstant;
import lombok.Data;
import org.springframework.data.elasticsearch.annotations.Document;
import org.springframework.data.elasticsearch.annotations.Field;
import org.springframework.data.elasticsearch.annotations.FieldType;
import java.util.List;
/**
 * Elasticsearch document for a crawled article, stored in the {@code idx_article} index.
 *
 * <p>Fields declared as {@code FieldType.Keyword} are indexed as a single un-analyzed term;
 * fields declared as {@code FieldType.Text} are analyzed. Accessors come from Lombok's
 * {@code @Data}.
 */
@Data
@Document(indexName = "idx_article", type = GlobalConstant.TYPE_OR_FAMILY)
public class Article extends IdEntity {

    /** Article ID. */
    @Field(type = FieldType.Keyword)
    private String aid;

    /** Title. */
    @Field(type = FieldType.Keyword)
    private String title;

    /**
     * Article body.
     *
     * <p>No analyzer is declared: keyword fields are not analyzed, and {@code analyzer} /
     * {@code search_analyzer} are not supported mapping parameters for the keyword type.
     */
    @Field(type = FieldType.Keyword)
    private String content;

    /** Publisher. */
    @Field(type = FieldType.Text)
    private String publisher;

    /**
     * Release date.
     *
     * <p>{@code fielddata} is not set: it is a text-field parameter; date fields are sortable
     * without it.
     */
    @Field(type = FieldType.Date)
    private String releaseDateTime;

    /** Article URL. */
    @Field(type = FieldType.Text)
    private String url;

    /** Raw HTML, analyzed with the {@code ik_max_word} analyzer at index and search time. */
    @Field(type = FieldType.Text, analyzer = "ik_max_word", searchAnalyzer = "ik_max_word")
    private String html;

    /** ID of the source website. */
    @Field(type = FieldType.Keyword)
    private String fromWebId;

    /** Source website. */
    @Field(type = FieldType.Text)
    private String fromWeb;

    /** ID of the source website module. */
    @Field(type = FieldType.Keyword)
    private String fromModuleId;

    /** Source website module. */
    @Field(type = FieldType.Text)
    private String fromModule;

    /** Creation timestamp. */
    @Field(type = FieldType.Date)
    private String createDateTime;

    /** Last-update timestamp. */
    @Field(type = FieldType.Date)
    private String updateDateTime;

    /** Engine type: 1, 2, 3, 4, 5. */
    @Field(type = FieldType.Keyword)
    private String webCla;

    /** Tag codes, e.g. {@code code1,code2}. */
    @Field(type = FieldType.Keyword)
    private String tagCode;

    /** Tag values, e.g. {@code {code1:"",code2:""}}. */
    @Field(type = FieldType.Keyword)
    private String tagValue;

    /** List of {@code webCla} values; marked as not backed by a table column for MyBatis-Plus. */
    @TableField(exist = false)
    private List<String> webClasses;
}
Repository类
就简单的继承了ElasticsearchRepository父类
package com.crawler.service.repository;
import com.crawler.service.docment.Article;
import com.crawler.service.entity.Company;
import org.springframework.data.elasticsearch.repository.ElasticsearchRepository;
import org.springframework.stereotype.Repository;
/**
 * Spring Data Elasticsearch repository for {@link Article} documents with {@code String} ids.
 *
 * <p>Declares no methods of its own; everything available is inherited from
 * {@link ElasticsearchRepository}.
 */
@Repository
public interface ArticleRepository extends ElasticsearchRepository<Article, String> {
}
Controller
/**
 * Returns a page of public-opinion news articles matching the criteria in the request body.
 *
 * @param article search criteria, bound from the POST request body
 * @return a success {@code CommonResult} wrapping the page returned by the article service
 */
@ApiOperation(value = "获取舆情新闻列表", notes = "根据关键字、分类及时间范围分页获取舆情新闻列表")
@RequestMapping(value = "/listByKeyword", method = RequestMethod.POST)
@ResponseBody
public CommonResult listByKeyword(@RequestBody Article article) {
    Page<Article> articles = iArticleService.listByKeyword(article);
    return CommonResult.success(articles);
}
Impl实现类
主要的逻辑都在这里实现,我写的比较简单,坑都是在这边踩完了,具体的说明都写在了注释里面。
package com.crawler.service.service.impl;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.crawler.service.docment.Article;
import com.crawler.service.mapper.ArticleMapper;
import com.crawler.service.repository.ArticleRepository;
import com.crawler.service.service.IArticleService;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.PageRequest;
import org.springframework.data.domain.Pageable;
import org.springframework.data.domain.Sort;
import org.springframework.stereotype.Service;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
/**
 * Article service that searches the Elasticsearch article index through
 * {@link ArticleRepository}.
 *
 * @author LinZS
 * @date 2020/12/9 15:02
 */
@Service
@Slf4j
public class IArticleServiceImpl extends ServiceImpl<ArticleMapper, Article> implements IArticleService {

    @Autowired
    private ArticleRepository articleRepository;

    /**
     * Searches articles by keyword, category and release-date window, newest first.
     *
     * <p>The resulting query is roughly equivalent to:
     * {@code (title LIKE %kw% OR content LIKE %kw%) AND (webCla = a OR webCla = b ...)
     * AND releaseDateTime BETWEEN (today - timeLimit days) AND now ORDER BY releaseDateTime DESC}.
     * Each of the three conditions is only added when the corresponding criterion is present.
     *
     * @param article search criteria: keyword, webClasses, timeLimit (days), start (page index
     *                passed to {@code PageRequest.of}) and limit (page size)
     * @return the requested page of matching articles
     */
    @Override
    public Page<Article> listByKeyword(Article article) {
        // Root query; every active criterion is attached to it.
        BoolQueryBuilder queryBuilder = new BoolQueryBuilder();

        // Keyword condition: "should" clauses act as OR across title and content.
        // Both fields are mapped as keyword, so the wildcard runs against the whole
        // un-analyzed value (for a text field one would query the ".keyword" sub-field).
        if (StringUtils.isNotBlank(article.getKeyword())) {
            // Escape wildcard metacharacters so user input is matched literally instead of
            // being interpreted as part of the pattern.
            String pattern = "*" + escapeWildcard(article.getKeyword()) + "*";
            BoolQueryBuilder keyBuilder = new BoolQueryBuilder();
            keyBuilder.should(QueryBuilders.wildcardQuery("title", pattern));
            keyBuilder.should(QueryBuilders.wildcardQuery("content", pattern));
            queryBuilder.must(keyBuilder);
        }

        // Category condition: the front end sends the ticked check boxes as a list, which is
        // OR-ed together (webCla = '1' OR webCla = '2' ...).
        if (article.getWebClasses() != null && !article.getWebClasses().isEmpty()) {
            BoolQueryBuilder orBuilder = new BoolQueryBuilder();
            article.getWebClasses().forEach(s ->
                    orBuilder.should(QueryBuilders.matchQuery("webCla", s)));
            queryBuilder.must(orBuilder);
        }

        // Release-date window: from the day "timeLimit" days ago up to the current instant.
        if (article.getTimeLimit() > 0) {
            Calendar calendar = Calendar.getInstance();
            calendar.add(Calendar.DAY_OF_YEAR, -article.getTimeLimit());
            // rangeQuery with gte/lte is inclusive on both ends (gt/lt are the exclusive
            // variants). The lower bound is formatted with day resolution only.
            QueryBuilder queryRange = QueryBuilders.rangeQuery("releaseDateTime")
                    .gte(new SimpleDateFormat("yyyy-MM-dd").format(calendar.getTime()))
                    .lte(new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date()));
            queryBuilder.filter(queryRange);
        }

        // Page the result and sort by release time, newest first.
        // This is shallow from/size paging; for large data sets consider scroll-based deep paging.
        Pageable pageable = PageRequest.of(
                article.getStart(), article.getLimit(), Sort.Direction.DESC, "releaseDateTime");
        return articleRepository.search(queryBuilder, pageable);
    }

    /**
     * Backslash-escapes the wildcard metacharacters ({@code *}, {@code ?} and {@code \}) in
     * {@code raw} so that it can be embedded in a wildcard pattern as literal text.
     */
    private static String escapeWildcard(String raw) {
        StringBuilder escaped = new StringBuilder(raw.length() + 8);
        for (int i = 0; i < raw.length(); i++) {
            char c = raw.charAt(i);
            if (c == '\\' || c == '*' || c == '?') {
                escaped.append('\\');
            }
            escaped.append(c);
        }
        return escaped.toString();
    }
}
整个嵌套下来,相当于SQL:select * from table where (title like concat('%', keyword, '%') or content like concat('%', keyword, '%')) and (webCla = '1' or webCla = '2' ...) and (releaseDateTime >= 'xx' and releaseDateTime <= 'xx'(或者用 between ... and ...,一个意思)) order by releaseDateTime desc;
聚合查询结果
这个是分页的数据,总共hits是44条,我limit了1,所以展示1条