es进行聚合查询 es聚合查询分页

转载

编程小达人之心 2024-07-03 14:39:27

文章标签 es进行聚合查询 es 大数据 java elasticsearch 文章分类 架构后端开发

前言

在项目开发过程中遇到这样的业务需求，在网上也找了许多资料，但是都比较复杂，需要花点时间去理解，用了各种方法踩坑之后，也请教了一下大佬ES方面的相关知识，最主要还是因为刚用ES不久，所以对ES的用法，数据结构什么的，都不是很熟悉，导致花了比较长的时间去实现这个业务需求，现在就对这个聚合搜索的具体实现代码列出来，供大家参考。

正文

ES索引的Mapping

{
  "mappings": {
    "properties": {
      "aid": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "content": {
        "type": "keyword",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "tagCode": {
        "type": "keyword",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "tagValue": {
        "type": "keyword",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "createDateTime": {
        "type": "date",
        "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "fromModule": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "fromWeb": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "html": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "publisher": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "releaseDateTime": {
        "type": "date",
        "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "title": {
        "type": "keyword",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "updateDateTime": {
        "type": "date",
        "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "url": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "webCla": {
        "type": "keyword",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      }
    }
  }
}

刚开始使用ES的时候，并不知道ES的精确查询（以及不分词的wildcard模糊查询）要求对应字段的数据类型是keyword；一开始创建索引时这些字段都用了text类型，而text字段会被分词，所以做模糊查询匹配的时候，一直查出不符合预期的数据。

在需要做时间范围筛选或者排序的字段上，记得用上date类型，并且加上format（多个格式之间用||分隔），这样多种日期格式的值都能被自动解析。

JAVA代码

pom依赖

<!-- Spring Boot starter that pulls in Spring Data Elasticsearch. -->
<!-- NOTE(review): no <version> is given here; presumably it is managed by the Spring Boot parent/BOM - confirm. -->
<dependency>
		<groupId>org.springframework.boot</groupId>
		<artifactId>spring-boot-starter-data-elasticsearch</artifactId>
</dependency>

Entity 类

package com.crawler.service.docment;

import com.baomidou.mybatisplus.annotation.TableField;
import com.crawler.common.base.IdEntity;
import com.crawler.common.constant.GlobalConstant;
import lombok.Data;
import org.springframework.data.elasticsearch.annotations.Document;
import org.springframework.data.elasticsearch.annotations.Field;
import org.springframework.data.elasticsearch.annotations.FieldType;

import java.util.List;

/**
 * Elasticsearch document for a crawled article, stored in the {@code idx_article} index.
 *
 * <p>Accessors are generated by Lombok's {@code @Data}.
 *
 * <p>NOTE(review): the service layer calls {@code getKeyword()}, {@code getTimeLimit()},
 * {@code getStart()} and {@code getLimit()} on this type, but no such fields are declared
 * here; presumably they are inherited from {@code IdEntity} - confirm.
 */
@Data
@Document(indexName = "idx_article", type = GlobalConstant.TYPE_OR_FAMILY)
public class Article extends IdEntity {

    @Field(type = FieldType.Keyword)
    private String aid; // article ID
    @Field(type = FieldType.Keyword)
    private String title; // title
    // NOTE(review): analyzers normally apply to Text fields; confirm whether ik_max_word
    // has any effect (or is even accepted) on a Keyword field.
    @Field(type = FieldType.Keyword, analyzer = "ik_max_word", searchAnalyzer = "ik_max_word")
    private String content; // body text
    @Field(type = FieldType.Text)
    private String publisher; // publisher
    // NOTE(review): fielddata is normally a Text-field option; confirm it is meaningful on a Date field.
    @Field(type = FieldType.Date, fielddata=true)
    private String releaseDateTime; // release date
    @Field(type = FieldType.Text)
    private String url;
    @Field(type = FieldType.Text, analyzer = "ik_max_word", searchAnalyzer = "ik_max_word")
    private String html; // raw HTML
    @Field(type = FieldType.Keyword)
    private String fromWebId; // source website ID
    @Field(type = FieldType.Text)
    private String fromWeb; // source website
    @Field(type = FieldType.Keyword)
    private String fromModuleId; // source website module ID
    @Field(type = FieldType.Text)
    private String fromModule; // source website module
    @Field(type = FieldType.Date)
    private String createDateTime;
    @Field(type = FieldType.Date)
    private String updateDateTime;
    @Field(type = FieldType.Keyword)
    private String webCla; // engine type: 1, 2, 3, 4, 5
    @Field(type = FieldType.Keyword)
    private String tagCode; // e.g. code1,code2
    @Field(type = FieldType.Keyword)
    private String tagValue; // e.g. {code1:"",code2:""}
    // Marked as not a database column for MyBatis-Plus and carries no @Field mapping;
    // the service reads it as the list of webCla values to filter by.
    @TableField(exist = false)
    private List<String> webClasses;
}

Repository类

就简单的继承了ElasticsearchRepository父类

package com.crawler.service.repository;

import com.crawler.service.docment.Article;
import com.crawler.service.entity.Company;
import org.springframework.data.elasticsearch.repository.ElasticsearchRepository;
import org.springframework.stereotype.Repository;

/**
 * Spring Data Elasticsearch repository for {@link Article} documents with {@code String} ids.
 *
 * <p>Declares no methods of its own; everything is inherited from
 * {@link ElasticsearchRepository}.
 */
@Repository
public interface ArticleRepository extends ElasticsearchRepository<Article, String> {
}

Controller

/**
 * Returns one page of news articles matching the search criteria in the request body.
 *
 * @param article request body carrying the search criteria; passed unchanged to
 *                {@code iArticleService.listByKeyword}
 * @return a successful {@code CommonResult} wrapping the resulting page of articles
 */
// The notes text previously described a different endpoint (company basic info);
// it now describes what this endpoint actually returns.
@ApiOperation(value = "获取舆情新闻列表", notes = "按关键字、网站分类和时间范围分页查询舆情新闻列表")
@RequestMapping(value = "/listByKeyword", method = RequestMethod.POST)
@ResponseBody
public CommonResult listByKeyword(@RequestBody Article article) {
    return CommonResult.success(iArticleService.listByKeyword(article));
}

Impl实现类

主要的逻辑都在这里实现，我写的比较简单，坑都是在这边踩完了，具体的说明都写在了注释里面。

package com.crawler.service.service.impl;

import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.crawler.service.docment.Article;
import com.crawler.service.mapper.ArticleMapper;
import com.crawler.service.repository.ArticleRepository;
import com.crawler.service.service.IArticleService;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.PageRequest;
import org.springframework.data.domain.Pageable;
import org.springframework.data.domain.Sort;
import org.springframework.stereotype.Service;

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;

/**
 * {@link IArticleService} implementation that searches articles through
 * {@link ArticleRepository} (Elasticsearch).
 *
 * @author LinZS
 * @date 2020/12/9 15:02
 */
@Service
@Slf4j
public class IArticleServiceImpl extends ServiceImpl<ArticleMapper, Article> implements IArticleService {

    @Autowired
    private ArticleRepository articleRepository;

    /**
     * Searches articles by optional keyword, optional {@code webCla} values and an optional
     * "last N days" window, sorted by {@code releaseDateTime} descending.
     *
     * <p>The resulting query is roughly:
     * {@code (title LIKE %kw% OR content LIKE %kw%) AND (webCla = a OR webCla = b ...)
     * AND releaseDateTime BETWEEN from AND now}. Criteria that are not supplied are skipped;
     * with no criteria at all the empty bool query is sent as-is.
     *
     * @param article carrier of the search criteria (keyword, webClasses, timeLimit) and of
     *                the paging values (start, limit)
     * @return the requested page of matching articles
     */
    @Override
    public Page<Article> listByKeyword(Article article) {
        BoolQueryBuilder queryBuilder = QueryBuilders.boolQuery();

        if (StringUtils.isNotBlank(article.getKeyword())) {
            queryBuilder.must(buildKeywordQuery(article.getKeyword()));
        }
        if (article.getWebClasses() != null && !article.getWebClasses().isEmpty()) {
            queryBuilder.must(buildWebClassQuery(article.getWebClasses()));
        }
        if (article.getTimeLimit() > 0) {
            // filter(): the date window only narrows the result set, it does not need scoring.
            queryBuilder.filter(buildReleaseDateRange(article.getTimeLimit()));
        }

        // This is plain from/size paging; for very deep paging over large data sets,
        // consider scroll (or search_after) instead.
        // NOTE(review): PageRequest.of expects a zero-based page index; confirm that
        // getStart() is a page index and not a row offset.
        Pageable pageable = PageRequest.of(
                article.getStart(), article.getLimit(), Sort.Direction.DESC, "releaseDateTime");
        return articleRepository.search(queryBuilder, pageable);
    }

    /**
     * Builds "title contains keyword OR content contains keyword" as two wildcard queries.
     * Both fields are queried as-is, i.e. as un-analyzed keyword fields.
     */
    private static QueryBuilder buildKeywordQuery(String keyword) {
        // Escape the user input so that '*' and '?' typed by the user are matched literally
        // instead of acting as wildcard operators.
        String pattern = "*" + escapeWildcard(keyword) + "*";
        // A leading wildcard cannot use the term index efficiently; acceptable for small
        // indices, but worth revisiting if this query becomes slow.
        return QueryBuilders.boolQuery()
                .should(QueryBuilders.wildcardQuery("title", pattern))
                .should(QueryBuilders.wildcardQuery("content", pattern));
    }

    /** Builds "webCla = v1 OR webCla = v2 ..." for the values ticked in the UI. */
    private static QueryBuilder buildWebClassQuery(List<String> webClasses) {
        BoolQueryBuilder anyOf = QueryBuilders.boolQuery();
        for (String webClass : webClasses) {
            anyOf.should(QueryBuilders.matchQuery("webCla", webClass));
        }
        return anyOf;
    }

    /**
     * Builds an inclusive range on {@code releaseDateTime} from the start of the day
     * {@code days} days ago up to the current instant.
     */
    private static QueryBuilder buildReleaseDateRange(int days) {
        Calendar from = Calendar.getInstance();
        from.add(Calendar.DAY_OF_YEAR, -days);
        // NOTE(review): both bounds are formatted in the JVM default time zone; confirm this
        // matches how releaseDateTime values are indexed.
        // gte/lte are inclusive; gt/lt are the exclusive variants.
        return QueryBuilders.rangeQuery("releaseDateTime")
                .gte(new SimpleDateFormat("yyyy-MM-dd").format(from.getTime()))
                .lte(new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date()));
    }

    /** Backslash-escapes the wildcard metacharacters {@code *}, {@code ?} and {@code \}. */
    private static String escapeWildcard(String raw) {
        StringBuilder escaped = new StringBuilder(raw.length() + 8);
        for (int i = 0; i < raw.length(); i++) {
            char c = raw.charAt(i);
            if (c == '*' || c == '?' || c == '\\') {
                escaped.append('\\');
            }
            escaped.append(c);
        }
        return escaped.toString();
    }
}

整个嵌套下来，相当于SQL：select * from table where (title like concat('%', keyword, '%') or content like concat('%', keyword, '%')) and (webCla = '1' or webCla = '2' ...) and (releaseDateTime >= 'xx' and releaseDateTime <= 'xx') order by releaseDateTime desc;（其中时间范围条件也可以写成 between ... and ...，是一个意思。）