文章目录
- Elasticsearch 聚合使用
- 前言
- 聚合使用
- 指标聚合
- 统计字段有值的数据总数
- cardinality 去重计数
- stats 统计
- Extended stats 统计
- percentile_ranks
- 桶 聚合
- having 过滤
- 聚合查询
Elasticsearch 聚合使用
前言
通常在业务开发中我们需要统计报表或对一些数据进行统计,在关系型数据库中我们通常使用sql 如max min sum count group by 等配合使用进行数据统计,也可能在业务代码中进行一些数据计算,那么在es中是如何使用的呢?
聚合使用
聚合分析是es中比较重要的功能特性,完成对一个查询数据集中数据的聚合计算,如:需要找出某个字段的最大值,最小值,平均值,计算综合,平均值等。同样的es作为搜索引擎兼数据库,也提供了强大的聚合分析功能。
对于一个数据集求平均值,求和,最大,最小等指标聚合,在es中被称为指标聚合metric,在关系db中还可以使用group by进行分组计算,同样es中group by被称为桶聚合 Bucketing
聚合语法
指标聚合
如min max avg sum
# 聚合查询
POST /elasticsearch_test/_search
{
"size": 0,
"aggs": {
"max_price": {
"max": {
"field": "price"
}
},
"min_pice":{
"avg": {
"field": "price"
}
},
"sum_price":{
"sum": {
"field": "price"
}
},
"avg_price":{
"avg": {
"field": "price"
}
}
}
}
结果如下
统计行数
# 统计行数 统计price 大于等于6的数据
POST /elasticsearch_test/_search
{
"query": {
"range": {
"price": {
"gte": 6
}
}
}
}
如下图 符合条件的3条数据
统计字段有值的数据总数
# 统计字段有值的数据总数
POST /elasticsearch_test/_search
{
"aggs": {
"price_count": {
"value_count": {
"field": "price"
}
}
}
}
这里有7条数据
实际上price价格相同的数据存在好几条,sql中我们使用 disctinct 和 group by 可以去重,es中我们可以使用cardinality 去重计数
cardinality 去重计数
# cardinality 去重计数
POST /elasticsearch_test/_search
{
"aggs": {
"id_count": {
"cardinality": {
"field": "_id"
}
},
"price_count": {
"cardinality": {
"field": "price"
}
}
}
}
去重后的数据
stats 统计
统计 avg max min sum count 这五个统计
# stats 统计 avg max min sum count 这五个统计
POST /elasticsearch_test/_search
{
"aggs": {
"price_stats": {
"stats": {
"field": "price"
}
}
}
}
可以看到不需要额外的每个max min sum count avg 都写一遍 ,直接全部统计出来
Extended stats 统计
Extended stats 统计 比 stats多 平方和 方差 标准差 平均值加/减两个标准差的区间
# Extended stats 统计 比 stats多 平方和 方差 标准差 平均值加/减两个标准差的区间
POST /elasticsearch_test/_search
{
"aggs": {
"price_stats": {
"extended_stats": {
"field": "price"
}
}
}
}
可以看到这个统计更详细,不过我们最常用上面五个就可以了
percents 指定分位值
# 占比百分比对应的统计值 percents 指定分位值 占比百分位对应值的统计
POST /elasticsearch_test/_search
{
"aggs": {
"price_stats": {
"percentiles": {
"field": "price",
"percents": [
0,
5,
90,
100
]
}
}
}
}
percentile_ranks
统计值小于等于指定值的文档占比
# percentile_ranks 统计值小于等于指定值的文档占比
POST /elasticsearch_test/_search
{
"aggs": {
"price_aggs": {
"percentile_ranks": {
"field": "price",
"values": [
0,
5,
5.6,
10,
15,
900,
990
]
}
}
}
}
桶 聚合
桶 聚合 相当于sql中的group by 统计
# 桶 聚合 相当于sql中的group by 统计
POST /elasticsearch_test/_search
{
"aggs": {
"group_by_price": {
"range": {
"field": "price",
"ranges": [
{
"from": 0,
"to": 10
},
{
"from": 10,
"to": 90
}
,
{
"from": 90,
"to": 100
},
{
"from": 100,
"to": 1000
}
]
},
"aggs": {
"price_aggs": {
"max": {
"field": "price"
}
},
"avg_price": {
"avg": {
"field": "price"
}
},
"count_price":{
"value_count": {
"field": "price"
}
}
}
}
}
}
这里可以看到不同价格区间里的数据统计
"aggregations" : {
"group_by_price" : {
"buckets" : [
{
"key" : "0.0-10.0",
"from" : 0.0,
"to" : 10.0,
"doc_count" : 4,
"count_price" : {
"value" : 4
},
"price_aggs" : {
"value" : 5.599999904632568
},
"avg_price" : {
"value" : 5.599999904632568
}
},
{
"key" : "10.0-90.0",
"from" : 10.0,
"to" : 90.0,
"doc_count" : 0,
"count_price" : {
"value" : 0
},
"price_aggs" : {
"value" : null
},
"avg_price" : {
"value" : null
}
},
{
"key" : "90.0-100.0",
"from" : 90.0,
"to" : 100.0,
"doc_count" : 0,
"count_price" : {
"value" : 0
},
"price_aggs" : {
"value" : null
},
"avg_price" : {
"value" : null
}
},
{
"key" : "100.0-1000.0",
"from" : 100.0,
"to" : 1000.0,
"doc_count" : 3,
"count_price" : {
"value" : 3
},
"price_aggs" : {
"value" : 990.0
},
"avg_price" : {
"value" : 396.6666666666667
}
}
]
}
}
}
having 过滤
# having 过滤
# bucket_selector: 过滤条件字段 script:写过滤逻辑
POST /elasticsearch_test/_search
{
"aggs": {
"group_by_price": {
"range": {
"field": "price",
"ranges": [
{
"from": 0,
"to": 10
},
{
"from": 10,
"to": 90
}
,
{
"from": 90,
"to": 100
},
{
"from": 100,
"to": 1000
}
]
},
"aggs": {
"price_aggs": {
"max": {
"field": "price"
}
},
"avg_price": {
"avg": {
"field": "price"
}
},
"count_price":{
"value_count": {
"field": "price"
}
},
"having":{
"bucket_selector": {
"buckets_path": {
"avg_price": "avg_price"
},
"script": "params.avg_price > 6"
}
}
}
}
}
}
这里我们分组统计后 最后使用having过滤平均价格大于6的数据
聚合查询
################################## 聚合 ########################################
DELETE /employees
#创建索引库
PUT /employees
{
"mappings":{
"properties":{
"age":{
"type":"integer"
},
"gender":{
"type":"keyword"
},
"job":{
"type":"text",
"fields":{
"keyword":{
"type":"keyword",
"ignore_above":50
}
}
},
"name":{
"type":"keyword"
},
"salary":{
"type":"integer"
}
}
}
}
PUT /employees/_bulk
{ "index" : { "_id" : "1" } }
{ "name" : "Emma","age":32,"job":"Product Manager","gender":"female","sa lary":35000 }
{ "index" : { "_id" : "2" } }
{ "name" : "Underwood","age":41,"job":"Dev Manager","gender":"male","sal ary": 50000}
{ "index" : { "_id" : "3" } }
{ "name" : "Tran","age":25,"job":"Web Designer","gender":"male","salary":18000 }
{ "index" : { "_id" : "4" } }
{ "name" : "Rivera","age":26,"job":"Web Designer","gender":"female","sal ary": 22000}
{ "index" : { "_id" : "5" } }
{ "name" : "Rose","age":25,"job":"QA","gender":"female","salary":18000 }
{ "index" : { "_id" : "6" } }
{ "name" : "Lucy","age":31,"job":"QA","gender":"female","salary": 25000}
{ "index" : { "_id" : "7" } }
{ "name" : "Byrd","age":27,"job":"QA","gender":"male","salary":20000 }
{ "index" : { "_id" : "8" } }
{ "name" : "Foster","age":27,"job":"Java Programmer","gender":"male","sa lary": 20000}
{ "index" : { "_id" : "9" } }
{ "name" : "Gregory","age":32,"job":"Java Programmer","gender":"male","s alary":22000 }
{ "index" : { "_id" : "10" } }
{ "name" : "Bryant","age":20,"job":"Java Programmer","gender":"male","sa lary": 9000}
{ "index" : { "_id" : "11" } }
{ "name" : "Jenny","age":36,"job":"Java Programmer","gender":"female","s alary":38000 }
{ "index" : { "_id" : "12" } }
{ "name" : "Mcdonald","age":31,"job":"Java Programmer","gender":"male","salary": 32000}
{ "index" : { "_id" : "13" } }
{ "name" : "Jonthna","age":30,"job":"Java Programmer","gender":"female","salary":30000 }
{ "index" : { "_id" : "14" } }
{ "name" : "Marshall","age":32,"job":"Javascript Programmer","gender":"m ale","salary": 25000}
{ "index" : { "_id" : "15" } }
{ "name" : "King","age":33,"job":"Java Programmer","gender":"male","sala ry":28000 }
{ "index" : { "_id" : "16" } }
{ "name" : "Mccarthy","age":21,"job":"Javascript Programmer","gender":"m ale","salary": 16000}
{ "index" : { "_id" : "17" } }
{ "name" : "Goodwin","age":25,"job":"Javascript Programmer","gender":"ma le","salary": 16000}
{ "index" : { "_id" : "18" } }
{ "name" : "Catherine","age":29,"job":"Javascript Programmer","gender":"female","salary":20000}
{ "index" : { "_id" : "19" } }
{ "name" : "Boone","age":30,"job":"DBA","gender":"male","salary": 30000}
{ "index" : { "_id": "20" } }
{ "name" : "Kathy","age":29,"job":"DBA","gender":"female","salary": 2000}
########### agg #############
## 单值分析 只输出一个分析结果
#单值分析︰只输出一个分析结果 min, max, avg, sum Cardinality(类似distinct Count)
GET /employees/_search
{
"size": 0,
"aggs": {
"max_salary": {
"max": {
"field": "salary"
}
},
"max_age": {
"max": {
"field": "age"
}
},
"avg_age": {
"avg": {
"field": "age"
}
},
"sum_age": {
"sum": {
"field": "age"
}
}
}
}
# 一个聚合,输出多值
GET /employees/_search
{
"size": 0,
"aggs": {
"stats_salary": {
"stats": {
"field": "salary"
}
}
}
}
GET /employees/_search
{
"query": {
"match_all": {}
}
}
# cardinate对搜索结果去重
GET /employees/_search
{
"size": 0,
"aggs": {
"job": {
"cardinality": {
"field": "job.keyword"
}
}
}
}
#Bucket Aggregation 按照一定的规则,将文档分配到不同的桶中,从而达到分类的目的。
#ES提供的一些常见的 Bucket Aggregation。
#Terms,需要字段支持filedata keyword 默认支持fielddata text需要在Mapping 中开启fielddata,会按照分词后的结果 进行分桶
#数字类型Range / Data Range
#Histogram(直方图) / Date Histogram
#支持嵌套: 也就在桶里再做分桶
#聚合可配置属性有: field:指定聚合字段 size:指定聚合结果数量 order:指定聚合结果排序方式
#,Bucket聚合会统计Bucket内的文档数量,记为_count,并且按照_count降序 排序。我们可以指定order属性,自定义聚合的排序方式:
GET /employees/_search
{
"size": 0,
"aggs": {
"jobs": {
"terms": {
"field": "job.keyword"
}
},
"salary": {
"terms": {
"field": "salary",
"size": 4,
"order": {
"_key": "asc"
}
}
}
}
}
# 限定聚合范围
GET /employees/_search
{
"size": 0,
"query": {
"range": {
"salary": {
"gte": 10000,
"lte": 20000
}
}
},
"aggs": {
"jobs": {
"terms": {
"field": "job.keyword"
}
},
"salary": {
"terms": {
"field": "salary",
"size": 4,
"order": {
"_key": "asc"
}
}
}
}
}
# 注意:对 Text 字段进行 terms 聚合查询,会失败抛出异常 (已经打开了fielddata )
GET /employees/_search
{
"size": 0,
"aggs": {
"jobs": {
"terms": {
"field": "job"
}
}
}
}
# 对 Text 字段打开 fielddata,支持terms aggregation
PUT /employees/_mapping
{
"properties": {
"job": {
"type": "text",
"fielddata": true
}
}
}
# Range & Histogram聚合
#按照数字的范围,进行分桶 在Range Aggregation中,可以自定义Key
GET /employees/_search
{
"size": 0,
"aggs": {
"salary_range": {
"range": {
"field": "salary",
"ranges": [
{"to":10000},
{"from": 10000,"to": 50000},
{"key":">20000","from": 20000}
]
}
}
}
}
# Histogram示例:按照工资的间隔分桶
# #工资0到10万,以 5000一个区间进行分桶
GET /employees/_search
{
"size": 0,
"aggs": {
"salary_range": {
"histogram": {
"field": "salary",
"interval": 5000,
"extended_bounds": {
"min": 0,
"max": 100000
}
}
}
}
}
# top_hits应用场景: 当获取分桶后,桶内最匹配的顶部文档列表
# 指定job,不同工种job中,年纪最大的3个员工的具体信息
GET /employees/_search
{
"size": 0,
"aggs": {
"jobs": {
"terms": {
"field": "job.keyword"
},
"aggs": {
"old_employee": {
"top_hits": {
"size": 2,
"sort": [
{"age":{"order": "desc"}}
]
}
}
}
}
}
}
# 嵌套聚合示例
# 嵌套聚合1,按照工作类型分桶,并统计工资信息
GET /employees/_search
{
"size": 0,
"aggs": {
"jobs_salary_stats": {
"terms": {
"field": "job.keyword"
},
"aggs": {
"salary": {
"stats": {
"field": "salary"
}
}
}
}
}
}
# # 多次嵌套。根据工作类型分桶,然后按照性别分桶,计算工资的统计信息
GET /employees/_search
{
"size": 0,
"aggs": {
"jobs_salary_stats": {
"terms": {
"field": "job.keyword"
},
"aggs": {
"gender_aggs": {
"terms": {
"field": "gender"
},
"aggs": {
"salary_agg": {
"stats": {
"field": "salary"
}
}
}
}
}
}
}
}
# Pipeline Aggregation
#支持对聚合分析的结果,再次进行聚合分析。 Pipeline 的分析结果会输出到原结果中,根据位置的不同,分为两类: Sibling - 结果和现有分析结果同级 Max,min,Avg & Sum Bucket Stats,Extended Status Bucket Percentiles Bucket Parent -结果内嵌到现有的聚合分析结果之中 Derivative(求导) Cumultive Sum(累计求和) Moving Function(移动平均值 )
# min_bucket示例 在员工数最多的工种里,找出平均工资最低的工种
# min_salary_by_job结果和jobs的聚合同级
#min_bucket求之前结果的最小值
#通过bucket_path关键字指定路径
GET /employees/_search
{
"size": 0,
"aggs": {
"jobs": {
"terms": {
"field": "job.keyword"
},
"aggs": {
"avg_salary": {
"avg": {
"field": "salary"
}
}
}
},
"min_salary_by_job":{
"min_bucket": {
"buckets_path": "jobs>avg_salary"
}
}
}
}
# Stats示例
# # 平均工资的统计分析
# 意思就是这个同理,平均工资,再计算最大最小平均 总数
GET /employees/_search
{
"size": 0,
"aggs": {
"jobs": {
"terms": {
"field": "job.keyword"
},
"aggs": {
"avg_salary": {
"avg": {
"field": "salary"
}
}
}
},
"stats_salary_by_job":{
"stats_bucket": {
"buckets_path": "jobs>avg_salary"
}
}
}
}
# percentiles示例
# # 平均工资的百分位数
GET /employees/_search
{
"size": 0,
"aggs": {
"jobs": {
"terms": {
"field": "job.keyword"
},
"aggs": {
"avg_salary": {
"avg": {
"field": "salary"
}
}
}
},
"percentiles_salary_by_job":{
"percentiles_bucket": {
"buckets_path": "jobs>avg_salary"
}
}
}
}
# Cumulative_sum示例
# #Cumulative_sum 累计求和
# histogram , interval 表示区间数
GET /employees/_search
{
"size": 0,
"aggs": {
"age": {
"histogram": {
"field": "age",
"min_doc_count": 0,
"interval": 1
},
"aggs": {
"avg_salary": {
"avg": {
"field": "salary"
}
},
"cumulative_salary_by_job": {
"cumulative_sum": {
"buckets_path": "avg_salary"
}
}
}
}
}
}
# 聚合的作用范围 ES聚合分析的默认作用范围是query的查询结果集,同时ES还支持以下方式改变聚合的作用 范围: Filter Post Filter Global
# query
GET /employees/_search
{
"size": 0,
"query": {
"range": {
"salary": {
"gte": 10,
"lte": 40000
}
}
},
"aggs": {
"age": {
"histogram": {
"field": "age",
"min_doc_count": 0,
"interval": 1
},
"aggs": {
"avg_salary": {
"avg": {
"field": "salary"
}
},
"cumulative_salary_by_job": {
"cumulative_sum": {
"buckets_path": "avg_salary"
}
}
}
}
}
}
# Filter
GET /employees/_search
{
"size": 0,
"aggs": {
"age_aggs": {
"filter": {
"range": {
"age": {
"lte": 35
}
}
},
"aggs": {
"avg_salary": {
"avg": {
"field": "salary"
}
}
}
}
}
}
#Post field. 一条语句,找出所有的job类型。还能找到聚合后符合条件的结果
GET /employees/_search
{
"size": 0,
"aggs": {
"age_aggs": {
"terms": {
"field": "job.keyword"
}
}
},
"post_filter": {
"term": {
"job.keyword": "QA"
}
}
}
#global
# 分组,统计职位,global 再全部计算所有的平均工资(和上面范围没关系了)
GET /employees/_search
{
"size": 0,
"query": {
"range": {
"age": {
"gte": 30
}
}
},
"aggs": {
"jobs": {
"terms": {
"field": "job.keyword"
}
},
"all":{
"global": {},
"aggs": {
"salary_avg": {
"avg": {
"field": "salary"
}
}
}
}
}
}
#排序 order
GET /employees/_search
{
"size": 0,
"query": {
"range": {
"age": {
"gte": 30
}
}
},
"aggs": {
"jobs": {
"terms": {
"field": "job.keyword",
"order": [{
"avg_salary": "asc"
}]
},
"aggs": {
"avg_salary": {
"avg": {
"field": "salary"
}
}
}
}
}
}
#排序 order
#count and key
GET /employees/_search
{
"size": 0,
"query": {
"range": {
"age": {
"gte": 30
}
}
},
"aggs": {
"jobs": {
"terms": {
"field": "job.keyword",
"order": [
{
"stats_salary.sum": "asc"
},
{
"stats_salary.min": "asc"
}
]
},
"aggs": {
"stats_salary": {
"stats": {
"field": "salary"
}
}
}
}
}
}