前言
不同版本Elasticsearch语法不同
本文使用的Elasticsearch是6.8.0
原理
使用Elasticsearch自带的ngram分词器,在建立索引时把句子按长度从min_gram到max_gram逐字拆分成所有可能的子串,并全部加入倒排索引;搜索时无需再对关键词分词,直接用关键词整体去匹配这些子串即可。
顺带举例下分词器ngram的分词效果
例如句子:你要购买什么
结果:见本文下方的"查看某个id的文档的字段拆分出的实际结果"
新建索引
PUT http://localhost:9200/fc_test/
{
"settings": {
"index": {
"number_of_shards": "1",
"number_of_replicas": "1"
},
"index.max_ngram_diff": 10,
"analysis": {
"analyzer": {
"twelvet_index_analyzer": {
"type": "custom",
"tokenizer": "code_index_tokenizer",
"filter": [
"lowercase"
]
},
"twelvet_search_analyzer": {
"tokenizer": "keyword",
"filter": [
"lowercase"
]
}
},
"tokenizer": {
"code_index_tokenizer": {
"type": "ngram",
"min_gram": 1,
"max_gram": 10,
"token_chars": []
}
}
}
},
"mappings": {
"wjhtype": {
"properties": {
"name": {
"type": "text"
},
"age": {
"type": "long",
"index": true
},
"gender": {
"type": "keyword"
},
"fileText": {
"type": "text",
"analyzer": "twelvet_index_analyzer",
"search_analyzer": "twelvet_search_analyzer"
},
"fileTexttwo": {
"type": "text",
"analyzer": "twelvet_index_analyzer",
"search_analyzer": "twelvet_search_analyzer"
}
}
}
}
}
其中fileText这个字段,分词加入索引的时候使用twelvet_index_analyzer这个自定义分词器,这个分词器内部由ES自带分词器ngram实现,可以将文本逐字按递增的长度拆开加入倒排索引中;
搜索的时候则不需要进行分词,因此采用keyword这个不分词的分词器。
min_gram是拆分出来的token最小字符数
max_gram是拆分出来的token最大字符数(搜索关键词的长度如果超过max_gram,索引中不存在对应的token,将无法匹配)
max_ngram_diff的值必须大于或等于max_gram-min_gram,不然会报错
批量插入测试数据
POST
http://localhost:9200/fc_test/wjhtype/_bulk
{"index": {"_id": 1}}
{"name": "tom","age": 5,"fileText":"你要购买什么","fileTexttwo":""}
{"index": {"_id": 2}}
{"name": "tom","age": 5,"fileText":"购物和买卖可以让人兴奋","fileTexttwo":""}
{"index": {"_id": 3}}
{"name": "tom","age": 5,"fileText":"【买卖合同定义】买卖合同是出卖人转移标的物的所有权于买受人,买受人支付价款的合同。第五百九十六条 【买卖合同条款】买卖合同的内容一般包括标的物的名称、数量、质量、价款、履行期限、履行地点和方式、包装方式、检验标准和方法、结算方式、合同使用的文字及其效力等条款。","fileTexttwo":""}
{"index": {"_id": 4}}
{"name": "tom","age": 5,"fileText":"是不是购物可以让人精神放松,据了解,偶尔逛街对身体健康有促进作用。其能够调节人的情绪,舒缓压力。现在在高强度工作的影响下,很多都市白领都有压力过大的情况。如果没能及时释放压力,大脑长期处于紧绷的状态,有可能会导致身心健康受损","fileTexttwo":""}
{"index": {"_id": 5}}
{"name": "tom","age": 5,"fileText":"基督教认为耶稣是上帝的儿子,降生为人,是为了拯救世人。后对于拯救黎民百姓于水火之中的时代英雄,人们往往也称之为救世主。无论疾苦悲伤,他都肩负着拯救人类的重任","fileTexttwo":""}
{"index": {"_id": 6}}
{"name": "tom","age": 5,"fileText":"购文字物","fileTexttwo":""}
{"index": {"_id": 7}}
{"name": "tom","age": 5,"fileText":"我不会购物的","fileTexttwo":""}
{"index": {"_id": 8}}
{"name": "tom","age": 5,"fileText":"111111","fileTexttwo":""}
{"index": {"_id": 9}}
{"name": "tom","age": 5,"fileText":"11111111111","fileTexttwo":""}
查看某个id的文档的字段拆分出的实际结果
GET
http://localhost:9200/fc_test/wjhtype/1/_termvectors?fields=fileText
结果
{
"_index": "fc_test",
"_type": "wjhtype",
"_id": "1",
"_version": 4,
"found": true,
"took": 5,
"term_vectors": {
"fileText": {
"field_statistics": {
"sum_doc_freq": 305101746,
"doc_count": 11550009,
"sum_ttf": 328148948
},
"terms": {
"么": {
"term_freq": 1,
"tokens": [
{
"position": 20,
"start_offset": 5,
"end_offset": 6
}
]
},
"买": {
"term_freq": 1,
"tokens": [
{
"position": 15,
"start_offset": 3,
"end_offset": 4
}
]
},
"买什": {
"term_freq": 1,
"tokens": [
{
"position": 16,
"start_offset": 3,
"end_offset": 5
}
]
},
"买什么": {
"term_freq": 1,
"tokens": [
{
"position": 17,
"start_offset": 3,
"end_offset": 6
}
]
},
"什": {
"term_freq": 1,
"tokens": [
{
"position": 18,
"start_offset": 4,
"end_offset": 5
}
]
},
"什么": {
"term_freq": 1,
"tokens": [
{
"position": 19,
"start_offset": 4,
"end_offset": 6
}
]
},
"你": {
"term_freq": 1,
"tokens": [
{
"position": 0,
"start_offset": 0,
"end_offset": 1
}
]
},
"你要": {
"term_freq": 1,
"tokens": [
{
"position": 1,
"start_offset": 0,
"end_offset": 2
}
]
},
"你要购": {
"term_freq": 1,
"tokens": [
{
"position": 2,
"start_offset": 0,
"end_offset": 3
}
]
},
"你要购买": {
"term_freq": 1,
"tokens": [
{
"position": 3,
"start_offset": 0,
"end_offset": 4
}
]
},
"你要购买什": {
"term_freq": 1,
"tokens": [
{
"position": 4,
"start_offset": 0,
"end_offset": 5
}
]
},
"你要购买什么": {
"term_freq": 1,
"tokens": [
{
"position": 5,
"start_offset": 0,
"end_offset": 6
}
]
},
"要": {
"term_freq": 1,
"tokens": [
{
"position": 6,
"start_offset": 1,
"end_offset": 2
}
]
},
"要购": {
"term_freq": 1,
"tokens": [
{
"position": 7,
"start_offset": 1,
"end_offset": 3
}
]
},
"要购买": {
"term_freq": 1,
"tokens": [
{
"position": 8,
"start_offset": 1,
"end_offset": 4
}
]
},
"要购买什": {
"term_freq": 1,
"tokens": [
{
"position": 9,
"start_offset": 1,
"end_offset": 5
}
]
},
"要购买什么": {
"term_freq": 1,
"tokens": [
{
"position": 10,
"start_offset": 1,
"end_offset": 6
}
]
},
"购": {
"term_freq": 1,
"tokens": [
{
"position": 11,
"start_offset": 2,
"end_offset": 3
}
]
},
"购买": {
"term_freq": 1,
"tokens": [
{
"position": 12,
"start_offset": 2,
"end_offset": 4
}
]
},
"购买什": {
"term_freq": 1,
"tokens": [
{
"position": 13,
"start_offset": 2,
"end_offset": 5
}
]
},
"购买什么": {
"term_freq": 1,
"tokens": [
{
"position": 14,
"start_offset": 2,
"end_offset": 6
}
]
}
}
}
}
}
查询语法
POST http://localhost:9200/fc_test/_search
{
"query": {
"term":{
"fileText":"购物"
}
},
"from": 0,
"size": 1000
}
查询结果如下
{
"took": 32,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 16.802471,
"hits": [
{
"_index": "fc_test",
"_type": "wjhtype",
"_id": "7",
"_score": 16.802471,
"_source": {
"name": "tom",
"age": 5,
"fileText": "我不会购物的",
"fileTexttwo": ""
}
},
{
"_index": "fc_test",
"_type": "wjhtype",
"_id": "2",
"_score": 9.923969,
"_source": {
"name": "tom",
"age": 5,
"fileText": "购物和买卖可以让人兴奋",
"fileTexttwo": ""
}
},
{
"_index": "fc_test",
"_type": "wjhtype",
"_id": "4",
"_score": 0.95717204,
"_source": {
"name": "tom",
"age": 5,
"fileText": "是不是购物可以让人精神放松,据了解,偶尔逛街对身体健康有促进作用。其能够调节人的情绪,舒缓压力。现在在高强度工作的影响下,很多都市白领都有压力过大的情况。如果没能及时释放压力,大脑长期处于紧绷的状态,有可能会导致身心健康受损",
"fileTexttwo": ""
}
}
]
}
}
性能测试
新增约1千万条数据,只有id和fileText字段有数据,fileText字段的值与id相同,都是从1开始递增的数字,例如1,2,3,4,5,6……一直递增到10000000左右,共1千多万个文档。
查询字段fileText包含“111111”这个字符串的文档,可查出122条记录,第一次查询耗时38毫秒,性能预计是MySQL的100倍以上。
















