es match_parser 单字

转载

mob6454cc74c0fc 2024-07-19 06:37:50

文章标签 es match_parser 单字分词器 analyzer 搜索 文章分类 架构后端开发

1.首先使用默认分词器，对数据进行处理，查看term 与match区别

//默认分词器
POST /_analyze
{
  "text": "我爱北京天安门",
  "analyzer": "standard"
}

//结果
{
  "tokens" : [
    {
      "token" : "我",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "<IDEOGRAPHIC>",
      "position" : 0
    },
    {
      "token" : "爱",
      "start_offset" : 1,
      "end_offset" : 2,
      "type" : "<IDEOGRAPHIC>",
      "position" : 1
    },
    ...
    {
      "token" : "门",
      "start_offset" : 6,
      "end_offset" : 7,
      "type" : "<IDEOGRAPHIC>",
      "position" : 6
    }
  ]
}
//发现中文是单字分词，那针对默认分词器的中文数据进行查询会是什么结果

PUT /index_1/_doc/1
{
  "message":"我爱北京天安门"
}

GET index_1/_search
//结果

{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 1.0,
    "hits" : [
      {
        "_index" : "index_1",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 1.0,
        "_source" : {
          "message" : "我爱北京天安门"
        }
      }
    ]
  }
}

//使用term 进行查询 ，但不使用 xx.keyword
GET index_1/_search
{
  "query": {
    "term": {
      "message": {
        "value": "我爱北京天安门"
      }
    }
  }
}

//发现没有查询到数据，为什么呢，首先term查询不会对搜索词进行分词，
把整个搜索词当做一个完整的词去匹配信息。这时候可能又感觉到疑惑，
我的搜索词与对应信息一致为什么还搜索不到，因为对应的message字段使用了默认分词器，
默认分词器会将中文按单字分词。所以当用“我爱北京天安门”去匹配时，
对应信息分词后的结果里根本没有这个完整的词（只有“我”“爱”“北”“京”等单字）
{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 0,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  }
}
//同理，message 字段经默认分词器分词后没有“北京”这个词，所以同样查询不到
GET index_1/_search
{
  "query": {
    "term": {
      "message": {
        "value": "北京"
      }
    }
  }
}

{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 0,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  }
}

//使用term 进行查询 ，使用 xx.keyword
GET index_1/_search
{
  "query": {
    "term": {
      "message.keyword": {
        "value": "我爱北京天安门"
      }
    }
  }
}

//发现查询到了信息。为什么加入 .keyword 就可以查询到呢？因为 message.keyword 是 keyword 类型的子字段，保存的是未经分词的完整原文；查询时对应字段加上 .keyword，就代表搜索条件与所查询的字段信息要完全匹配，这个时候不会受到分词影响
{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 0.2876821,
    "hits" : [
      {
        "_index" : "index_1",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 0.2876821,
        "_source" : {
          "message" : "我爱北京天安门"
        }
      }
    ]
  }
}


GET index_1/_search
{
  "query": {
    "term": {
      "message.keyword": {
        "value": "北京"
      }
    }
  }
}


{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 0,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  }
}

//使用 match 查询，match 会对搜索词进行分词。再想一下，默认分词器对中文是单字分词，所以对搜索词分词的结果也是单字，据此想一下查询结果。match 默认使用的是 or 关系。
//因为都是单字分词所以在使用分词后的词去匹配时，都可以匹配上
GET index_1/_search
{
  "query": {
    "match": {
     "message": {
       "query": "我爱天安门"
     }
    }
  }
}

{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 1.4384104,
    "hits" : [
      {
        "_index" : "index_1",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 1.4384104,
        "_source" : {
          "message" : "我爱北京天安门"
        }
      }
    ]
  }
}

GET index_1/_search
{
  "query": {
    "match": {
     "message": {
       "query": "我爱天门"
     }
    }
  }
}

{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 1.1507283,
    "hits" : [
      {
        "_index" : "index_1",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 1.1507283,
        "_source" : {
          "message" : "我爱北京天安门"
        }
      }
    ]
  }
}
//or和and 区别，or只要信息中包含对应搜索条件分词后的任意词语就可匹配到，
and需要包含搜索条件分词后所有词语

GET index_1/_search
{
  "query": {
    "match": {
     "message": {
       "query": "我爱天安门erer",
       "operator": "or"
     }
    }
  }
}

{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 1.4384104,
    "hits" : [
      {
        "_index" : "index_1",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 1.4384104,
        "_source" : {
          "message" : "我爱北京天安门"
        }
      }
    ]
  }
}

GET index_1/_search
{
  "query": {
    "match": {
     "message": {
       "query": "我爱天安门erer",
       "operator": "and"
     }
    }
  }
}

{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 0,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  }
}

2.使用ik分词器，对数据进行处理，查看term 与match区别

//使用ik分词器
POST _analyze/
{
  "analyzer": "ik_max_word",
  "text": ["我爱北京天安门"]
}
//分词结果，很明显和默认分词器对中文处理上的巨大差别
{
  "tokens" : [
    {
      "token" : "我",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "CN_CHAR",
      "position" : 0
    },
    {
      "token" : "爱",
      "start_offset" : 1,
      "end_offset" : 2,
      "type" : "CN_CHAR",
      "position" : 1
    },
    {
      "token" : "北京",
      "start_offset" : 2,
      "end_offset" : 4,
      "type" : "CN_WORD",
      "position" : 2
    },
    {
      "token" : "天安门",
      "start_offset" : 4,
      "end_offset" : 7,
      "type" : "CN_WORD",
      "position" : 3
    },
    {
      "token" : "天安",
      "start_offset" : 4,
      "end_offset" : 6,
      "type" : "CN_WORD",
      "position" : 4
    },
    {
      "token" : "门",
      "start_offset" : 6,
      "end_offset" : 7,
      "type" : "CN_CHAR",
      "position" : 5
    }
  ]
}
//创建使用ik分词器的数据
PUT index_2/
{
  "mappings": {
    "properties": {
      "message": {
        "type": "text",
        "analyzer": "ik_max_word",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      }
    }
  }
}
GET index_2/_mapping
//这时可以看到  "analyzer" : "ik_max_word" 这就代表使用了ik分词器，如果你没配置ik分词器，请先配置
{
  "index_2" : {
    "mappings" : {
      "properties" : {
        "message" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          },
          "analyzer" : "ik_max_word"
        }
      }
    }
  }
}

PUT /index_2/_doc/1
{
  "message":"我爱北京天安门"
}

//再使用上面的条件进行查询，就可以看到不同的结果
GET index_2/_search
{
  "query": {
    "term": {
      "message": {
        "value": "我爱北京天安门"
      }
    }
  }
}
//这个就是代表用“我爱北京天安门”去匹配时，对应信息分词后没有这个词语，
觉得迷惑可以往上看一下“我爱北京天安门” 的分词结果
{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 0,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  }
}

GET index_2/_search
{
  "query": {
    "term": {
      "message": {
        "value": "北京"
      }
    }
  }
}
//比对一下这两条查询条件（“我爱北京天安门”与“北京”）及其结果，是不是就很明了了
{
  "took" : 718,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 0.2876821,
    "hits" : [
      {
        "_index" : "index_2",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 0.2876821,
        "_source" : {
          "message" : "我爱北京天安门"
        }
      }
    ]
  }
}

//term 查询 .keyword 子字段的结果与使用默认分词器时没有区别，因为 keyword 子字段不经过分词器处理
//match 的查询结果则取决于信息的分词结果与搜索词的分词结果。看下面两条查询，再对照上面的分词结果，你能得出什么结论？对应信息分词后没有‘天’这个词，所以就匹配不上。

GET index_2/_search
{
  "query": {
    "match": {
     "message": {
       "query": "我"
     }
    }
  }
}

{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 0.2876821,
    "hits" : [
      {
        "_index" : "index_2",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 0.2876821,
        "_source" : {
          "message" : "我爱北京天安门"
        }
      }
    ]
  }
}

GET index_2/_search
{
  "query": {
    "match": {
     "message": {
       "query": "天"
     }
    }
  }
}


{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 0,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  }
}

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。