真实开发中,我们往往需要对一段内容既进行中文分词,又进行拼音分词,此时需要自定义 ik + pinyin 分词器。

  1. 创建自定义分词器:
    在创建索引时自定义分词器
// Create the index and define the custom analyzer in its settings.
// The index is named `people` so that it matches the mapping, indexing,
// and search requests in this walkthrough, which all target `people`.
PUT /people
{
  "settings": {
    "analysis": {
      "analyzer": {
        "ik_pinyin": {                  // name of the custom analyzer
          "tokenizer": "ik_max_word",   // base tokenizer
          "filter": ["pinyin_filter"]   // token filter(s) applied after the tokenizer
        }
      },
      "filter": {                       // the filter step runs a second analysis pass, so both are effectively used together
        "pinyin_filter": {
          "type": "pinyin",             // the second analysis component
          // pinyin options
          "keep_separate_first_letter": false, // whether to emit each character's first letter as its own term
          "keep_full_pinyin": true,            // whether to emit the full pinyin
          "keep_original": true,               // whether to keep the original input
          "remove_duplicated_term": true       // whether to remove duplicated terms
        }
      }
    }
  }
}
// Define the field mapping for the index.
// `name` is analyzed with the custom ik_pinyin analyzer; `desc` uses ik_max_word.
POST /people/_mapping
{
  "properties": {
    "id":   { "type": "integer", "store": true, "index": true },
    "name": { "type": "text",    "store": true, "index": true, "analyzer": "ik_pinyin" },
    "desc": { "type": "text",    "store": true, "index": true, "analyzer": "ik_max_word" }
  }
}
// Index one sample document (document id 1).
POST /people/_doc/1
{
  "id":   1001,
  "name": "湖人俱乐部的科比",
  "desc": "科比是NBA最伟大的运动员"
}
  2. 测试自定义分词器
// Analyze sample text with the custom ik_pinyin analyzer.
// Targets the `people` index, the same index the other requests in this walkthrough use.
GET /people/_analyze
{
  "text": "科比是NBA最伟大的运动员",
  "analyzer": "ik_pinyin"
}
//测试结果
{
"tokens" : [
{
"token" : "shi",
"start_offset" : 2,
"end_offset" : 3,
"type" : "CN_CHAR",
"position" : 0
},
{
"token" : "是",
"start_offset" : 2,
"end_offset" : 3,
"type" : "CN_CHAR",
"position" : 0
},
{
"token" : "s",
"start_offset" : 2,
"end_offset" : 3,
"type" : "CN_CHAR",
"position" : 0
},
{
"token" : "n",
"start_offset" : 3,
"end_offset" : 6,
"type" : "ENGLISH",
"position" : 1
},
{
"token" : "ba",
"start_offset" : 3,
"end_offset" : 6,
"type" : "ENGLISH",
"position" : 2
},
{
"token" : "nba",
"start_offset" : 3,
"end_offset" : 6,
"type" : "ENGLISH",
"position" : 2
},
{
"token" : "zui",
"start_offset" : 6,
"end_offset" : 7,
"type" : "CN_CHAR",
"position" : 3
},
{
"token" : "最",
"start_offset" : 6,
"end_offset" : 7,
"type" : "CN_CHAR",
"position" : 3
},
{
"token" : "z",
"start_offset" : 6,
"end_offset" : 7,
"type" : "CN_CHAR",
"position" : 3
},
{
"token" : "wei",
"start_offset" : 7,
"end_offset" : 9,
"type" : "CN_WORD",
"position" : 4
},
{
"token" : "da",
"start_offset" : 7,
"end_offset" : 9,
"type" : "CN_WORD",
"position" : 5
},
{
"token" : "伟大",
"start_offset" : 7,
"end_offset" : 9,
"type" : "CN_WORD",
"position" : 5
},
{
"token" : "wd",
"start_offset" : 7,
"end_offset" : 9,
"type" : "CN_WORD",
"position" : 5
},
{
"token" : "de",
"start_offset" : 9,
"end_offset" : 10,
"type" : "CN_CHAR",
"position" : 6
},
{
"token" : "的",
"start_offset" : 9,
"end_offset" : 10,
"type" : "CN_CHAR",
"position" : 6
},
{
"token" : "d",
"start_offset" : 9,
"end_offset" : 10,
"type" : "CN_CHAR",
"position" : 6
},
{
"token" : "yun",
"start_offset" : 10,
"end_offset" : 13,
"type" : "CN_WORD",
"position" : 7
},
{
"token" : "dong",
"start_offset" : 10,
"end_offset" : 13,
"type" : "CN_WORD",
"position" : 8
},
{
"token" : "yuan",
"start_offset" : 10,
"end_offset" : 13,
"type" : "CN_WORD",
"position" : 9
},
{
"token" : "运动员",
"start_offset" : 10,
"end_offset" : 13,
"type" : "CN_WORD",
"position" : 9
},
{
"token" : "ydy",
"start_offset" : 10,
"end_offset" : 13,
"type" : "CN_WORD",
"position" : 9
},
{
"token" : "yun",
"start_offset" : 10,
"end_offset" : 12,
"type" : "CN_WORD",
"position" : 10
},
{
"token" : "dong",
"start_offset" : 10,
"end_offset" : 12,
"type" : "CN_WORD",
"position" : 11
},
{
"token" : "运动",
"start_offset" : 10,
"end_offset" : 12,
"type" : "CN_WORD",
"position" : 11
},
{
"token" : "yd",
"start_offset" : 10,
"end_offset" : 12,
"type" : "CN_WORD",
"position" : 11
},
{
"token" : "dong",
"start_offset" : 11,
"end_offset" : 13,
"type" : "CN_WORD",
"position" : 12
},
{
"token" : "yuan",
"start_offset" : 11,
"end_offset" : 13,
"type" : "CN_WORD",
"position" : 13
},
{
"token" : "动员",
"start_offset" : 11,
"end_offset" : 13,
"type" : "CN_WORD",
"position" : 13
},
{
"token" : "dy",
"start_offset" : 11,
"end_offset" : 13,
"type" : "CN_WORD",
"position" : 13
}
]
}

按照中文分词检索:

// Term query on `name` using a Chinese word.
// NOTE(review): a term query is expected to match the value as-is against indexed terms — confirm against the term query docs.
GET /people/_search
{
  "query": {
    "term": {
      "name": {
        "value": "俱乐部"
      }
    }
  }
}
//结果
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 0.3754495,
"hits" : [
{
"_index" : "people",
"_type" : "_doc",
"_id" : "1",
"_score" : 0.3754495,
"_source" : {
"id" : 1001,
"name" : "湖人俱乐部的科比",
"desc" : "科比是NBA最伟大的运动员"
}
}
]
}
}

按照拼音检索:

// Term query on `name` using pinyin initials ("jlb").
// The recorded result below shows one hit; presumably the analyzer emitted a matching first-letter term — verify with _analyze.
GET /people/_search
{
  "query": {
    "term": {
      "name": {
        "value": "jlb"
      }
    }
  }
}
//结果
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 0.3754495,
"hits" : [
{
"_index" : "people",
"_type" : "_doc",
"_id" : "1",
"_score" : 0.3754495,
"_source" : {
"id" : 1001,
"name" : "湖人俱乐部的科比",
"desc" : "科比是NBA最伟大的运动员"
}
}
]
}
}