1.首先使用默认分词器,对数据进行处理,查看term 与match区别
//默认分词器
POST /_analyze
{
"text": "我爱北京天安门",
"analyzer": "standard"
}
//结果
{
"tokens" : [
{
"token" : "我",
"start_offset" : 0,
"end_offset" : 1,
"type" : "<IDEOGRAPHIC>",
"position" : 0
},
{
"token" : "爱",
"start_offset" : 1,
"end_offset" : 2,
"type" : "<IDEOGRAPHIC>",
"position" : 1
},
...
{
"token" : "门",
"start_offset" : 6,
"end_offset" : 7,
"type" : "<IDEOGRAPHIC>",
"position" : 6
}
]
}
//发现中文是单字分词,那针对默认分词器的中文数据进行查询会是什么结果
PUT /index_1/_doc/1
{
"message":"我爱北京天安门"
}
GET index_1/_search
//结果
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "index_1",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.0,
"_source" : {
"message" : "我爱北京天安门"
}
}
]
}
}
//使用term 进行查询 ,但不使用 xx.keyword
GET index_1/_search
{
"query": {
"term": {
"message": {
"value": "我爱北京天安门"
}
}
}
}
//发现没有查询到数据,为什么呢?首先term查询不会对搜索词进行分词,
//而是把整个搜索词当做一个完整的词去匹配信息。这时候可能又感觉到疑惑,
//我的搜索词与对应信息一致为什么还搜索不到?因为对应的message字段使用了默认分词器,
//默认分词器会将中文按单字分词。所以当用“我爱北京天安门”去匹配时,
//对应信息的分词结果中根本没有这个词
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
}
}
//同理,message默认分词后没有“北京”这个词
GET index_1/_search
{
"query": {
"term": {
"message": {
"value": "北京"
}
}
}
}
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
}
}
//使用term 进行查询 ,使用 xx.keyword
GET index_1/_search
{
"query": {
"term": {
"message.keyword": {
"value": "我爱北京天安门"
}
}
}
}
//发现查询到了信息。为什么加入.keyword 就可以查询到呢,因为message.keyword 是keyword类型的子字段,不会被分词,整个原始值作为一个完整的词存储。所以一旦查询时对应字段加入.keyword ,就代表搜索条件与所查询的字段信息要完全匹配 ,这个时候不会受到分词影响
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 0.2876821,
"hits" : [
{
"_index" : "index_1",
"_type" : "_doc",
"_id" : "1",
"_score" : 0.2876821,
"_source" : {
"message" : "我爱北京天安门"
}
}
]
}
}
GET index_1/_search
{
"query": {
"term": {
"message.keyword": {
"value": "北京"
}
}
}
}
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
}
}
//使用match 查询,match 会对搜索词进行分词,再想一下默认分词器对中文分词是单字分词,所以对搜索词分词也是单字,所以想一下查询结果,match 默认使用的是or关系。
//因为都是单字分词,所以在使用分词后的词去匹配时,都可以匹配上
GET index_1/_search
{
"query": {
"match": {
"message": {
"query": "我爱天安门"
}
}
}
}
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.4384104,
"hits" : [
{
"_index" : "index_1",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.4384104,
"_source" : {
"message" : "我爱北京天安门"
}
}
]
}
}
GET index_1/_search
{
"query": {
"match": {
"message": {
"query": "我爱天门"
}
}
}
}
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.1507283,
"hits" : [
{
"_index" : "index_1",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.1507283,
"_source" : {
"message" : "我爱北京天安门"
}
}
]
}
}
//or和and 区别,or只要信息中包含对应搜索条件分词后的任意词语就可匹配到,
//and需要包含搜索条件分词后所有词语
GET index_1/_search
{
"query": {
"match": {
"message": {
"query": "我爱天安门erer",
"operator": "or"
}
}
}
}
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 1.4384104,
"hits" : [
{
"_index" : "index_1",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.4384104,
"_source" : {
"message" : "我爱北京天安门"
}
}
]
}
}
GET index_1/_search
{
"query": {
"match": {
"message": {
"query": "我爱天安门erer",
"operator": "and"
}
}
}
}
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
}
}
2.使用ik分词器,对数据进行处理,查看term 与match区别
//使用ik分词器
POST _analyze/
{
"analyzer": "ik_max_word",
"text": ["我爱北京天安门"]
}
//分词结果,很明显和默认分词器对中文处理上的巨大差别
{
"tokens" : [
{
"token" : "我",
"start_offset" : 0,
"end_offset" : 1,
"type" : "CN_CHAR",
"position" : 0
},
{
"token" : "爱",
"start_offset" : 1,
"end_offset" : 2,
"type" : "CN_CHAR",
"position" : 1
},
{
"token" : "北京",
"start_offset" : 2,
"end_offset" : 4,
"type" : "CN_WORD",
"position" : 2
},
{
"token" : "天安门",
"start_offset" : 4,
"end_offset" : 7,
"type" : "CN_WORD",
"position" : 3
},
{
"token" : "天安",
"start_offset" : 4,
"end_offset" : 6,
"type" : "CN_WORD",
"position" : 4
},
{
"token" : "门",
"start_offset" : 6,
"end_offset" : 7,
"type" : "CN_CHAR",
"position" : 5
}
]
}
//创建使用ik分词器的数据
PUT index_2/
{
"mappings": {
"properties": {
"message": {
"type": "text",
"analyzer": "ik_max_word",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
GET index_2/_mapping
//这时可以看到 "analyzer" : "ik_max_word" 这就代表使用了ik分词器,如果你没配置ik分词器,请先配置
{
"index_2" : {
"mappings" : {
"properties" : {
"message" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
},
"analyzer" : "ik_max_word"
}
}
}
}
}
PUT /index_2/_doc/1
{
"message":"我爱北京天安门"
}
//再使用上面的条件进行查询就可以看到不同的结果
GET index_2/_search
{
"query": {
"term": {
"message": {
"value": "我爱北京天安门"
}
}
}
}
//这个就是代表用“我爱北京天安门”去匹配时,对应信息分词后没有这个词语,
//觉得迷惑可以往上看一下“我爱北京天安门” 的分词结果
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
}
}
GET index_2/_search
{
"query": {
"term": {
"message": {
"value": "北京"
}
}
}
}
//比对下这两条查询条件,是不是就很明了了
{
"took" : 718,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 0.2876821,
"hits" : [
{
"_index" : "index_2",
"_type" : "_doc",
"_id" : "1",
"_score" : 0.2876821,
"_source" : {
"message" : "我爱北京天安门"
}
}
]
}
}
//对应term 查询message.keyword 的结果和使用默认分词器时是没有区别的,因为keyword子字段不受分词器影响
//match 的结果则与信息的分词结果和搜索词的分词结果都有关,看下面两条查询条件,再看对应的分词结果,你能得出什么结论?对应信息分词后没有‘天’,所以就匹配不上。
GET index_2/_search
{
"query": {
"match": {
"message": {
"query": "我"
}
}
}
}
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 0.2876821,
"hits" : [
{
"_index" : "index_2",
"_type" : "_doc",
"_id" : "1",
"_score" : 0.2876821,
"_source" : {
"message" : "我爱北京天安门"
}
}
]
}
}
GET index_2/_search
{
"query": {
"match": {
"message": {
"query": "天"
}
}
}
}
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
}
}