用Python实现ElasticSearch的智能搜索

原创

wx61924d058416f 2021-11-16 15:17:53 博主文章分类：python ©著作权

文章标签 ElasticSearch python elasticsearch elastic 存储文件 文章分类 代码人生

©著作权归作者所有：来自51CTO博客作者wx61924d058416f的原创作品，请联系作者获取转载授权，否则将追究法律责任

一、ElasticSearch的安装

针对安装ElasticSearch以及专门处理中文的IK分词，网上一搜一大把，这里就不说明了。

二、文本预处理

本次的文本是word文档，所以我们先把它转换成TXT文本。转换代码如下：

def docx_to_text():
    """Convert every ``*.docx`` file under ``PATH_DATA`` to a plain-text file.

    ``PATH_DATA`` is walked recursively. Each ``.docx`` file is opened in Word
    through COM automation and saved next to the original with a ``.txt``
    extension, using the ``wdFormatText`` file format. Word is always quit at
    the end, even if a conversion fails.
    """
    wordapp = win32com.client.gencache.EnsureDispatch("Word.Application")
    try:
        for root, dirs, files in os.walk(PATH_DATA):
            # root: current directory, dirs: its sub-directories,
            # files: the file names directly inside it.
            print(root, dirs, files)
            for _file in files:
                # Only .docx files are converted. Names starting with "~$"
                # are the lock files Word leaves next to an open document;
                # they match *.docx but are not real documents.
                if _file.startswith('~$') or not fnmatch.fnmatch(_file, '*.docx'):
                    continue
                word_file = os.path.join(root, _file)
                # Same base name, .txt extension.
                docastxt = os.path.splitext(word_file)[0] + '.txt'
                # Work on the document object that Open() returns rather than
                # ActiveDocument, which may point at a different document.
                doc = wordapp.Documents.Open(word_file)
                try:
                    doc.SaveAs(
                        docastxt,
                        FileFormat=win32com.client.constants.wdFormatText)
                finally:
                    # Close the document even when SaveAs fails, so a bad
                    # file does not stay open inside Word.
                    doc.Close()
    finally:
        wordapp.Quit()
    print("well done!")

三、创建ElasticSearch库及其相关信息

class ElasticWy:
    """Small helper around one Elasticsearch index.

    It creates the index with an IK-analyzed ``title`` field, inserts
    documents of the form ``{'category', 'name', 'title'}`` and runs a
    ``match`` query against ``title``.
    """

    def __init__(self, index_name, index_type, ip="localhost"):
        """Remember the index name/type and connect to ``ip`` on port 9200."""
        self.index_name = index_name
        self.index_type = index_type
        self.es = Elasticsearch([ip],
                                http_auth=('elastic', 'password'),
                                port=9200)

    def _build_index_mappings(self):
        """Return the request body used to create the index.

        The analyzed field is ``title`` because that is the field
        ``Index_Data`` stores the text in and ``Search_data`` queries.
        Mapping any other field name would leave the searched text without
        the IK analyzer.
        """
        return {
            "mappings": {
                self.index_type: {
                    "properties": {
                        "title": {
                            "type": "text",
                            "index": True,
                            "analyzer": "ik_max_word",
                            "search_analyzer": "ik_max_word",
                        }
                    }
                }
            }
        }

    def create_index(self, index_name, index_type):
        """Create the index if it does not exist yet.

        :param index_name: accepted for call compatibility; the index that is
            actually created is ``self.index_name``.
        :param index_type: accepted for call compatibility; the mapping type
            that is actually used is ``self.index_type``.
        :return: None
        """
        if not self.es.indices.exists(index=self.index_name):
            res = self.es.indices.create(index=self.index_name,
                                         body=self._build_index_mappings())
            print('1', res)

    def Index_Data(self, category, docname, content, cishu):
        """Insert one document into the index.

        :param category: value stored in the ``category`` field.
        :param docname: value stored in the ``name`` field.
        :param content: text stored in the ``title`` field; must be a ``str``.
        :param cishu: running counter, only used in the progress message.
        :return: None
        """
        # Only string content is indexed; anything else is reported and skipped.
        if isinstance(content, str):
            action = {
                'category': category,
                'name': docname,
                "title": content}
            # NOTE(review): doc_type relies on mapping types, which newer
            # Elasticsearch releases no longer accept - confirm server version.
            self.es.index(index=self.index_name,
                          doc_type=self.index_type,
                          body=action)
            print('已经插入', cishu, '条记录')
        else:
            print("错误：文件尚未成功从目录中写入库")

    def Search_data(self, input_text):
        """Search ``title`` for ``input_text`` and print the best hit.

        Prints the matched text, then the list of categories and the list of
        document names of the reported hits. Nothing is returned.

        :param input_text: the query string.
        """
        doc = {
            "query": {
                "match": {
                    "title": {
                        "query": input_text,
                        "operator": "and"
                    }
                }
            }
        }
        _searched = self.es.search(
            index=self.index_name,
            doc_type=self.index_type,
            body=doc)
        i = 0
        last_category = []
        last_docxname = []
        last_sentence = []
        for hit in _searched['hits']['hits']:
            source = hit['_source']
            last_category.append(source['category'])
            last_docxname.append(source['name'])
            last_sentence.append(source['title'])
            i = i + 1
            # Number of hits to report; raise the 1 to show more results.
            if i == 1:
                break
        for temp in last_sentence:
            print(temp)
        print(last_category)
        print(last_docxname)

代码注释也写得比较清楚，大家自己看吧

四、读取文件并且逐条插入

def readfile():
    """Index every file found below the category folders of ``PATH_DATA``.

    ``PATH_DATA`` is expected to hold one sub-folder per category. Files that
    sit directly in ``PATH_DATA`` are not indexed. Every file below a
    category folder (at any depth) is decoded as GBK, falling back to UTF-8,
    and handed to ``wy.Index_Data`` together with the name of its first-level
    folder. ``wy`` is the module-level ``ElasticWy`` instance created by the
    script entry point.

    Files that decode in neither encoding are reported and skipped instead of
    aborting the whole run.
    """
    print('文件开始读取')
    try:
        for root, dirs, files in os.walk(PATH_DATA):
            relative_root = os.path.relpath(root, PATH_DATA)
            if relative_root == os.curdir:
                # The top level only provides the category folders.
                continue
            # Derive the category from the path itself, not from a running
            # counter: a counter drifts as soon as a category folder contains
            # sub-folders of its own.
            category = relative_root.split(os.sep)[0]
            print('已经读取到', category, '文件夹')
            cishu = 1  # per-folder counter shown in the progress message
            for every_docx_name in files:
                every_file_path = os.path.join(root, every_docx_name)
                # The data set mixes GBK and UTF-8 files: try GBK first.
                for encoding in ('gbk', 'utf-8'):
                    try:
                        with codecs.open(every_file_path,
                                         encoding=encoding) as f:
                            temp = f.read()
                        break
                    except UnicodeDecodeError:
                        continue
                else:
                    # Neither encoding worked (e.g. a binary file).
                    print('错误：文件编码无法识别，已跳过', every_file_path)
                    continue
                wy.Index_Data(category, every_docx_name, temp, cishu)
                cishu = cishu + 1
    finally:
        print("文件读取完成")

文件读取之所以要这样循环，是因为数据目录下还存在二级目录。

用Python实现ElasticSearch的智能搜索_python

用Python实现ElasticSearch的智能搜索_elasticsearch_02

这段代码中有捕获异常的操作，这是因为我的文档中不仅有GBK编码的，还有UTF-8编码的，所以我用这种方式解决了。如果大家没有这些问题，就不用加。

用Python实现ElasticSearch的智能搜索_ElasticSearch_03

上图是插入截图

五、搜索

搜索的代码其实在“三、创建ElasticSearch库及其相关信息”中已经贴上去了，就是def Search_data()那个函数。这里简单地说一下IK分词。首先，IK分词器是在创建库的时候就已经写进映射里了：

用Python实现ElasticSearch的智能搜索_ElasticSearch_04

然后我们在搜索的时候是直接匹配我插入的一个参数 'title'

用Python实现ElasticSearch的智能搜索_elastic_05

第一条线是插入的title，里面的content在readfile（）函数里传参，第二个title是在title里搜索 input_text。

第三个是 operator 参数。and 模式表示查询词经过分词之后，所有分出来的词都必须出现在文档中才算匹配。比如：“五位一体”，如果词库中没有这个词，假如它被分成“五位”和“一体”，那么只有同时包含这两个词的文档才会被匹配到。（注意：and 只要求所有词都出现，并不要求它们相邻；如果需要按相邻顺序匹配，可以使用 match_phrase。）

这里可以参考官方文档：多词查询|ElasticSearch

查询截图：

用Python实现ElasticSearch的智能搜索_elasticsearch_06

用Python实现ElasticSearch的智能搜索_elastic_07

这里只反馈一条结果，大家可以根据实际情况更改：反馈条数由Search_data（）函数中的变量 i 控制，修改 if i==1 中的数值即可。

用Python实现ElasticSearch的智能搜索_elasticsearch_08

六、完整代码

#!D:/workplace/python
# -*- coding: utf-8 -*-
# @File  : DangJian_prepare.py
# @Author: WangYe
# @Date  : 2018/8/27
# @Software: PyCharm
# coding: utf-8
import numpy
import jieba
import os
from threading import Thread
import time
from os import walk
#import CSVOP
import codecs
from datetime import datetime
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from time import time
import os
import sys
import fnmatch
import win32com.client
# Absolute path of the directory that contains the launched script
# (derived from sys.argv[0]).
PATH = os.path.abspath(os.path.dirname(sys.argv[0]))
#print(PATH)
# Root data folder that docx_to_text() and readfile() walk with os.walk().
PATH_DATA=r'C:\Users\wy\Desktop\data\DangJian_prepare'
# 主要执行函数
def docx_to_text():
    """Convert every ``*.docx`` file under ``PATH_DATA`` to a plain-text file.

    ``PATH_DATA`` is walked recursively. Each ``.docx`` file is opened in Word
    through COM automation and saved next to the original with a ``.txt``
    extension, using the ``wdFormatText`` file format. Word is always quit at
    the end, even if a conversion fails.
    """
    wordapp = win32com.client.gencache.EnsureDispatch("Word.Application")
    try:
        for root, dirs, files in os.walk(PATH_DATA):
            # root: current directory, dirs: its sub-directories,
            # files: the file names directly inside it.
            print(root, dirs, files)
            for _file in files:
                # Only .docx files are converted. Names starting with "~$"
                # are the lock files Word leaves next to an open document;
                # they match *.docx but are not real documents.
                if _file.startswith('~$') or not fnmatch.fnmatch(_file, '*.docx'):
                    continue
                word_file = os.path.join(root, _file)
                # Same base name, .txt extension.
                docastxt = os.path.splitext(word_file)[0] + '.txt'
                # Work on the document object that Open() returns rather than
                # ActiveDocument, which may point at a different document.
                doc = wordapp.Documents.Open(word_file)
                try:
                    doc.SaveAs(
                        docastxt,
                        FileFormat=win32com.client.constants.wdFormatText)
                finally:
                    # Close the document even when SaveAs fails, so a bad
                    # file does not stay open inside Word.
                    doc.Close()
    finally:
        wordapp.Quit()
    print("well done!")
def readfile():
    """Index every file found below the category folders of ``PATH_DATA``.

    ``PATH_DATA`` is expected to hold one sub-folder per category. Files that
    sit directly in ``PATH_DATA`` are not indexed. Every file below a
    category folder (at any depth) is decoded as GBK, falling back to UTF-8,
    and handed to ``wy.Index_Data`` together with the name of its first-level
    folder. ``wy`` is the module-level ``ElasticWy`` instance created by the
    script entry point.

    Files that decode in neither encoding are reported and skipped instead of
    aborting the whole run.
    """
    print('文件开始读取')
    try:
        for root, dirs, files in os.walk(PATH_DATA):
            relative_root = os.path.relpath(root, PATH_DATA)
            if relative_root == os.curdir:
                # The top level only provides the category folders.
                continue
            # Derive the category from the path itself, not from a running
            # counter: a counter drifts as soon as a category folder contains
            # sub-folders of its own.
            category = relative_root.split(os.sep)[0]
            print('已经读取到', category, '文件夹')
            cishu = 1  # per-folder counter shown in the progress message
            for every_docx_name in files:
                every_file_path = os.path.join(root, every_docx_name)
                # The data set mixes GBK and UTF-8 files: try GBK first.
                for encoding in ('gbk', 'utf-8'):
                    try:
                        with codecs.open(every_file_path,
                                         encoding=encoding) as f:
                            temp = f.read()
                        break
                    except UnicodeDecodeError:
                        continue
                else:
                    # Neither encoding worked (e.g. a binary file).
                    print('错误：文件编码无法识别，已跳过', every_file_path)
                    continue
                wy.Index_Data(category, every_docx_name, temp, cishu)
                cishu = cishu + 1
    finally:
        print("文件读取完成")
# ---- End of the file-handling helpers; Elasticsearch code starts here ----

class ElasticWy:
    """Small helper around one Elasticsearch index.

    It creates the index with an IK-analyzed ``title`` field, inserts
    documents of the form ``{'category', 'name', 'title'}`` and runs a
    ``match`` query against ``title``.
    """

    def __init__(self, index_name, index_type, ip="localhost"):
        """Remember the index name/type and connect to ``ip`` on port 9200."""
        self.index_name = index_name
        self.index_type = index_type
        self.es = Elasticsearch([ip],
                                http_auth=('elastic', 'password'),
                                port=9200)

    def _build_index_mappings(self):
        """Return the request body used to create the index.

        The analyzed field is ``title`` because that is the field
        ``Index_Data`` stores the text in and ``Search_data`` queries.
        Mapping any other field name would leave the searched text without
        the IK analyzer.
        """
        return {
            "mappings": {
                self.index_type: {
                    "properties": {
                        "title": {
                            "type": "text",
                            "index": True,
                            "analyzer": "ik_max_word",
                            "search_analyzer": "ik_max_word",
                        }
                    }
                }
            }
        }

    def create_index(self, index_name, index_type):
        """Create the index if it does not exist yet.

        :param index_name: accepted for call compatibility; the index that is
            actually created is ``self.index_name``.
        :param index_type: accepted for call compatibility; the mapping type
            that is actually used is ``self.index_type``.
        :return: None
        """
        if not self.es.indices.exists(index=self.index_name):
            res = self.es.indices.create(index=self.index_name,
                                         body=self._build_index_mappings())
            print('1', res)

    def ReadFile(self, filepath):
        """Read ``filepath`` and return its lines as a list of strings.

        Line endings are kept. If ``filepath`` does not exist or is not a
        regular file, an error message is printed and ``None`` is returned.

        :param filepath: path of the text file to read.
        :return: list of lines, or ``None`` when the file is missing.
        """
        if not (os.path.exists(filepath) and os.path.isfile(filepath)):
            print("错误：文件目录不存在或文件不存在")
            return None
        with open(filepath) as f:
            temp_list = [str(line) for line in f]
        # Report success only once the file has actually been read.
        print("*********文件成功读取完毕*********")
        return temp_list

    def Index_Data(self, category, docname, content, cishu):
        """Insert one document into the index.

        :param category: value stored in the ``category`` field.
        :param docname: value stored in the ``name`` field.
        :param content: text stored in the ``title`` field; must be a ``str``.
        :param cishu: running counter, only used in the progress message.
        :return: None
        """
        # Only string content is indexed; anything else is reported and skipped.
        if isinstance(content, str):
            action = {
                'category': category,
                'name': docname,
                "title": content}
            # NOTE(review): doc_type relies on mapping types, which newer
            # Elasticsearch releases no longer accept - confirm server version.
            self.es.index(index=self.index_name,
                          doc_type=self.index_type,
                          body=action)
            print('已经插入', cishu, '条记录')
        else:
            print("错误：文件尚未成功从目录中写入库")

    def Search_data(self, input_text):
        """Search ``title`` for ``input_text`` and print the best hit.

        Prints the matched text, then the list of categories and the list of
        document names of the reported hits. Nothing is returned.

        :param input_text: the query string.
        """
        doc = {
            "query": {
                "match": {
                    "title": {
                        "query": input_text,
                        "operator": "and"
                    }
                }
            }
        }
        _searched = self.es.search(
            index=self.index_name,
            doc_type=self.index_type,
            body=doc)
        i = 0
        last_category = []
        last_docxname = []
        last_sentence = []
        for hit in _searched['hits']['hits']:
            source = hit['_source']
            last_category.append(source['category'])
            last_docxname.append(source['name'])
            last_sentence.append(source['title'])
            i = i + 1
            # Number of hits to report; raise the 1 to show more results.
            if i == 1:
                break
        for temp in last_sentence:
            print(temp)
        print(last_category)
        print(last_docxname)


if __name__ == '__main__':
    # `wy` has to be a module-level name: readfile() looks it up as a global.
    wy = ElasticWy("3", "wy", ip="localhost")
    wy.create_index(index_name="3", index_type="wy")
    readfile()
    # Interactive query loop; entering 0 ends the session.
    while True:
        print("请输入与要匹配的字符串,输入 0 终止查询")
        input_text = input()
        if input_text == '0':
            print("查询终止")
            break
        print('查询结果如下')
        wy.Search_data(input_text=input_text)