前言

在项目开发过程中遇到了这样的业务需求。我在网上找了许多资料,但大多比较复杂,需要花时间去理解;用各种方法踩坑之后,又向大佬请教了 ES 方面的相关知识。最主要还是因为刚接触 ES 不久,对 ES 的用法、数据结构等都不是很熟悉,所以花了比较长的时间才实现这个业务需求。现在把这个聚合搜索的具体实现代码列出来,供大家参考。

正文

ES索引的Mapping

{
  "mappings": {
    "properties": {
      "aid": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "content": {
        "type": "keyword",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "tagCode": {
        "type": "keyword",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "tagValue": {
        "type": "keyword",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "createDateTime": {
        "type": "date",
        "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "fromModule": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "fromWeb": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "html": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "publisher": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "releaseDateTime": {
        "type": "date",
        "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "title": {
        "type": "keyword",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "updateDateTime": {
        "type": "date",
        "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "url": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "webCla": {
        "type": "keyword",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      }
    }
  }
}

刚开始使用 ES 的时候,我并不知道精确匹配(以及不分词的模糊匹配)要求对应字段的数据类型是 keyword;一开始建索引时全部用了 text 类型,导致做模糊查询匹配时一直查出不符合预期的数据。

在需要做时间范围筛选或者排序的字段上,记得使用 date 类型,并且加上 format,这样可以保证多种日期格式都能被自动解析。

JAVA代码

pom依赖
<!-- Spring Boot starter for Spring Data Elasticsearch. No <version> is declared here,
     so it is presumably managed by the Spring Boot parent/BOM (confirm in the full pom). -->
<dependency>
		<groupId>org.springframework.boot</groupId>
		<artifactId>spring-boot-starter-data-elasticsearch</artifactId>
</dependency>
Entity 类
package com.crawler.service.docment;

import com.baomidou.mybatisplus.annotation.TableField;
import com.crawler.common.base.IdEntity;
import com.crawler.common.constant.GlobalConstant;
import lombok.Data;
import org.springframework.data.elasticsearch.annotations.Document;
import org.springframework.data.elasticsearch.annotations.Field;
import org.springframework.data.elasticsearch.annotations.FieldType;

import java.util.List;

/**
 * Elasticsearch document for a crawled article, stored in the {@code idx_article} index.
 *
 * <p>Fields that are matched exactly or with non-analyzed wildcard patterns are mapped as
 * {@code keyword}; free-text fields are mapped as {@code text}; timestamps are mapped as
 * {@code date} so they can be range-filtered and sorted.
 *
 * <p>NOTE(review): the search service also reads {@code getKeyword()}, {@code getTimeLimit()},
 * {@code getStart()} and {@code getLimit()} from this type; they are not declared here and are
 * presumably inherited from {@link IdEntity} - confirm.
 */
@Data
@Document(indexName = "idx_article", type = GlobalConstant.TYPE_OR_FAMILY)
public class Article extends IdEntity {

    /** Article ID. */
    @Field(type = FieldType.Keyword)
    private String aid;

    /** Title; keyword-typed so it can be matched without tokenization. */
    @Field(type = FieldType.Keyword)
    private String title;

    /**
     * Body text. Keyword fields are never analyzed, so no analyzer / search analyzer is
     * configured: those parameters are only valid on text fields.
     */
    @Field(type = FieldType.Keyword)
    private String content;

    /** Publisher of the article. */
    @Field(type = FieldType.Text)
    private String publisher;

    /**
     * Release date/time. {@code fielddata} is a text-only mapping parameter and is not needed
     * to sort or range-filter a date field, so it is not set.
     */
    @Field(type = FieldType.Date)
    private String releaseDateTime;

    /** Source URL of the article. */
    @Field(type = FieldType.Text)
    private String url;

    /** Raw HTML, tokenized with the IK analyzer. */
    @Field(type = FieldType.Text, analyzer = "ik_max_word", searchAnalyzer = "ik_max_word")
    private String html;

    /** ID of the source website. */
    @Field(type = FieldType.Keyword)
    private String fromWebId;

    /** Name of the source website. */
    @Field(type = FieldType.Text)
    private String fromWeb;

    /** ID of the module on the source website. */
    @Field(type = FieldType.Keyword)
    private String fromModuleId;

    /** Name of the module on the source website. */
    @Field(type = FieldType.Text)
    private String fromModule;

    /** Creation timestamp. */
    @Field(type = FieldType.Date)
    private String createDateTime;

    /** Last-update timestamp. */
    @Field(type = FieldType.Date)
    private String updateDateTime;

    /** Engine type: 1, 2, 3, 4, 5. */
    @Field(type = FieldType.Keyword)
    private String webCla;

    /** Tag codes, in the form {@code code1,code2}. */
    @Field(type = FieldType.Keyword)
    private String tagCode;

    /** Tag values, in the form {@code {code1:"",code2:""}}. */
    @Field(type = FieldType.Keyword)
    private String tagValue;

    /**
     * Engine types selected by the caller, used as an OR filter against {@code webCla};
     * marked as not backed by a database column for MyBatis-Plus.
     */
    @TableField(exist = false)
    private List<String> webClasses;
}
Repository类

就简单的继承了ElasticsearchRepository父类

package com.crawler.service.repository;

import com.crawler.service.docment.Article;
import com.crawler.service.entity.Company;
import org.springframework.data.elasticsearch.repository.ElasticsearchRepository;
import org.springframework.stereotype.Repository;

/**
 * Spring Data repository for {@link Article} documents.
 *
 * <p>Declares no methods of its own; all operations come from
 * {@link ElasticsearchRepository}. The ID type is declared as {@code String}
 * (presumably matching the id inherited from the entity's base class - confirm).
 */
@Repository
public interface ArticleRepository extends ElasticsearchRepository<Article, String> {
}
Controller
/**
 * POST {@code /listByKeyword}: runs the article search described by the request body and
 * wraps the resulting page in a success response.
 *
 * NOTE(review): the Swagger {@code notes} text describes a company-info list while this
 * endpoint returns articles; it looks copy-pasted and should be confirmed.
 *
 * @param article search criteria deserialized from the request body
 * @return a success result carrying the page of matching articles
 */
@ApiOperation(value = "获取舆情新闻列表", notes = "获取公司基本信息列表")
@RequestMapping(value = "/listByKeyword", method = RequestMethod.POST)
@ResponseBody
public CommonResult listByKeyword(@RequestBody Article article) {
    return CommonResult.success(iArticleService.listByKeyword(article));
}
Impl实现类

主要的逻辑都在这里实现,我写的比较简单,坑都是在这边踩完了,具体的说明都写在了注释里面。

package com.crawler.service.service.impl;

import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.crawler.service.docment.Article;
import com.crawler.service.mapper.ArticleMapper;
import com.crawler.service.repository.ArticleRepository;
import com.crawler.service.service.IArticleService;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.PageRequest;
import org.springframework.data.domain.Pageable;
import org.springframework.data.domain.Sort;
import org.springframework.stereotype.Service;

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;

/**
 * Article search service backed by Elasticsearch.
 *
 * @author LinZS
 * @date 2020/12/9 15:02
 */
@Service
@Slf4j
public class IArticleServiceImpl extends ServiceImpl<ArticleMapper, Article> implements IArticleService {

    @Autowired
    private ArticleRepository articleRepository;

    /**
     * Searches articles with an optional keyword, an optional set of engine types and an
     * optional "last N days" window, sorted by release time (newest first).
     *
     * <p>Roughly equivalent to:
     * {@code (title LIKE %kw% OR content LIKE %kw%) AND webCla IN (...)
     * AND releaseDateTime BETWEEN <N days ago> AND <now> ORDER BY releaseDateTime DESC}.
     * Every condition whose input is absent is simply left out; with no conditions at all
     * the empty bool query matches every document.
     *
     * @param article search criteria: keyword, webClasses, timeLimit (days), start and limit
     * @return the requested page of matching articles
     */
    @Override
    public Page<Article> listByKeyword(Article article) {
        BoolQueryBuilder queryBuilder = new BoolQueryBuilder();

        // Keyword condition: substring match on title OR content. Both fields are keyword-typed,
        // so the wildcard runs against the whole un-tokenized value (on a text field one would
        // target the ".keyword" sub-field instead).
        if (StringUtils.isNotBlank(article.getKeyword())) {
            // Escape the user's input so '*', '?' and '\' are matched literally, not as pattern syntax.
            String pattern = "*" + escapeWildcard(article.getKeyword()) + "*";
            BoolQueryBuilder keyBuilder = new BoolQueryBuilder()
                    .should(QueryBuilders.wildcardQuery("title", pattern))
                    .should(QueryBuilders.wildcardQuery("content", pattern))
                    // At least one of the two fields must match (made explicit rather than implied).
                    .minimumShouldMatch(1);
            queryBuilder.must(keyBuilder);
        }

        // Category condition: the front end sends the ticked check-boxes as a list, which is
        // matched as webCla = '1' OR webCla = '2' ... A terms query is the exact-match form for
        // a keyword field, and filter context skips scoring (results are sorted by date anyway).
        List<String> webClasses = article.getWebClasses();
        if (webClasses != null && !webClasses.isEmpty()) {
            queryBuilder.filter(QueryBuilders.termsQuery("webCla", webClasses));
        }

        // Time window: from the start of the day timeLimit days ago up to the current moment.
        if (article.getTimeLimit() > 0) {
            Calendar calendar = Calendar.getInstance();
            // Capture "now" once so both bounds are derived from the same instant.
            Date now = calendar.getTime();
            calendar.add(Calendar.DAY_OF_YEAR, -article.getTimeLimit());
            // gte/lte are inclusive bounds; gt/lt are the exclusive variants.
            // NOTE(review): the bounds are formatted in the JVM default time zone and sent
            // without a zone - confirm this matches how releaseDateTime was indexed.
            QueryBuilder queryRange = QueryBuilders.rangeQuery("releaseDateTime")
                    .gte(new SimpleDateFormat("yyyy-MM-dd").format(calendar.getTime()))
                    .lte(new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(now));
            queryBuilder.filter(queryRange);
        }

        // Page and sort by release time, newest first. This is shallow from/size paging; for
        // large result sets consider scroll (or search_after) instead.
        // NOTE(review): PageRequest.of expects a zero-based page index - confirm that
        // getStart() is a page number and not a row offset.
        Pageable pageable = PageRequest.of(article.getStart(), article.getLimit(), Sort.Direction.DESC, "releaseDateTime");
        return articleRepository.search(queryBuilder, pageable);
    }

    /** Backslash-escapes the wildcard metacharacters ('*', '?', '\') in {@code raw}. */
    private static String escapeWildcard(String raw) {
        StringBuilder escaped = new StringBuilder(raw.length() + 8);
        for (int i = 0; i < raw.length(); i++) {
            char c = raw.charAt(i);
            if (c == '*' || c == '?' || c == '\\') {
                escaped.append('\\');
            }
            escaped.append(c);
        }
        return escaped.toString();
    }
}

整个嵌套下来,相当于 SQL:select * from table where (title like concat('%', keyword, '%') or content like concat('%', keyword, '%')) and (webCla = '1' or webCla = '2' ...) and (releaseDateTime >= 'xx' and releaseDateTime <= 'xx')(或者用 between ... and,一个意思)order by releaseDateTime desc;

聚合查询结果

es进行聚合查询 es聚合查询分页_大数据

这个是分页的数据,总共hits是44条,我limit了1,所以展示1条

es进行聚合查询 es聚合查询分页_大数据_02