为了可以方便地实时观测插入过程中有无差错,选择使用自己独立开发的ESClient来实现数据的批量导入。其中关键方法为基于elasticsearch库中的helpers.bulk方法的add_date_bulk函数,只需要通过json库读取json文件形成列表,再把该列表传入该方法中,就可以不断将数据传入es中,并且可以观察到进度状态和异常信息。
插入脚本
使用方法
- 创建一个es.ini配置文件,路径为conf/es.ini(可自定义,在main函数中更改),格式如下
[Elasticsearch]
host = xxxx
port = 9200
# 没有账号密码可不写
user = elastic
pass = xxxxxx
- 准备好自己要插入的json数据(json文件格式)
- 在下面这段脚本中的main函数根据自己的需求配置好mapping和setting
- 运行脚本,开始插入
import configparser
import functools
import json
import time

from elasticsearch import Elasticsearch
from elasticsearch import helpers
from elasticsearch.helpers import bulk
import tqdm
# 运行时间装饰器 语法糖
def timer(func):
    """Decorator that prints a function's wall-clock run time.

    The elapsed seconds are printed after the wrapped call returns;
    the wrapped function's return value is passed through unchanged.

    :param func: the callable to wrap.
    :return: the wrapped callable.
    """
    # functools.wraps preserves the wrapped function's __name__/__doc__,
    # which the original decorator lost.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        res = func(*args, **kwargs)
        print('共耗时约 {:.2f} 秒'.format(time.time() - start))
        return res
    return wrapper
# 索引类
class BaseEsClient(object):
    """Connect to Elasticsearch and create/delete indices.

    Connection settings are read from an ini file with an
    ``[Elasticsearch]`` section containing ``host``/``port`` and
    (optionally) ``user``/``pass``.
    """

    def __init__(self, filepath="./conf/es.ini"):
        """Build a client for a single node from the ini config.

        To connect to multiple nodes this class would need refactoring.

        :param filepath: path of the ini configuration file.
        """
        self.conf = configparser.ConfigParser()
        self.conf.read(filepath, encoding='utf-8')
        # configparser option names are case-insensitive, so 'Host'
        # matches a 'host =' line in the file.
        self.es_servers = [{
            "host": self.conf.get('Elasticsearch', 'Host'),
            "port": self.conf.get('Elasticsearch', 'Port')
        }]
        # BUG FIX: user/pass are documented as optional, but the original
        # conf.get() raised NoOptionError when they were missing.
        user = self.conf.get('Elasticsearch', 'User', fallback=None)
        password = self.conf.get('Elasticsearch', 'Pass', fallback=None)
        self.es_auth = (user, password)
        try:
            if user is None:
                # no credentials configured: connect anonymously
                self.es_client = Elasticsearch(hosts=self.es_servers)
            else:
                # http_auth is only needed when security is enabled on the cluster
                self.es_client = Elasticsearch(hosts=self.es_servers, http_auth=self.es_auth)
            # BUG FIX: the original printed a stray literal 'f' inside the f-string.
            print(f'连接成功,目标url为: {self.es_servers}')
        except Exception as e:
            # NOTE(review): Elasticsearch() does not ping the server on
            # construction, so this mostly catches bad arguments.
            print('连接失败')
            print(e)

    def create_index(self, index_name):
        """Create an index with default settings.

        :param index_name: name of the index to create.
        """
        self.es_client.indices.create(index=index_name)

    def create_index_by_body(self, index_name: str, body: dict):
        """Create an index with an explicit body.

        :param index_name: name of the index to create.
        :param body: the configuration for the index
            (``settings`` and ``mappings``).
        """
        self.es_client.indices.create(index=index_name, body=body)

    def delete_index(self, index_name: str):
        """Delete an index.

        :param index_name: name of the index to delete.
        """
        self.es_client.indices.delete(index=index_name)
# 数据库不用进入,也不用退出。
class MyEsClient(BaseEsClient):
    """CRUD operations against a single Elasticsearch index."""

    # Default index body used when the caller supplies none:
    # one shard, no replicas, no explicit mapping.
    default_body = {
        "mappings": {},
        "settings": {
            "index": {
                "number_of_shards": "1",
                "number_of_replicas": "0",
            }
        }
    }

    def __init__(self, filepath="./conf/es.ini", index='index01', doc_type='_doc', body=None):
        """Connect and make sure *index* exists, creating it if needed.

        :param filepath: path of the es.ini configuration file.
        :param index: name of the index to operate on.
        :param doc_type: document type (legacy ``_doc`` parameter).
        :param body: optional index body (settings + mappings); falls
            back to :data:`default_body` when omitted.
        """
        super().__init__(filepath=filepath)
        self.index = index
        self.doc_type = doc_type
        # Always set self.body (the original left it undefined when the
        # index already existed).
        self.body = body if body else self.default_body
        if not self.es_client.indices.exists(index=index):
            # BUG FIX: the original passed the raw ``body`` argument here,
            # so a missing body created the index with body=None instead
            # of the advertised default_body.
            self.create_index_by_body(index_name=index, body=self.body)
            print(f'提示:创建了index,body结构如下')
            print(self.body)

    def set_index_mapping(self, set_mappings):
        """Set the index mapping (similar to a table schema).

        NOTE: only the ``properties`` part of the mapping is handled —
        the mapping API has many more parameters. Prerequisite: the
        index already exists and any custom analyzer is already defined.
        Example input::

            set_mappings = {
                "answer": {
                    "type": "string",
                    "index": "not_analyzed"
                },
                "answerAuthor": {
                    "type": "string"
                },
                "answerDate": {
                    "type": "date",
                    # compound date format
                    "format": "strict_date_optional_time||epoch_millis"
                },
                ...
            }

        :param set_mappings: dict mapping field name -> field definition.
        """
        mapping = {
            self.doc_type: {
                "properties": set_mappings
            }
        }
        self.es_client.indices.put_mapping(index=self.index, doc_type=self.doc_type, body=mapping)

    def add_date(self, row_obj):
        """Insert a single document into the index.

        :param row_obj: the document to index (dict).
        """
        self.es_client.index(index=self.index, doc_type=self.doc_type, body=row_obj)

    def add_date_bulk(self, row_obj_list):
        """Bulk-insert documents; input is a list of dicts (one per doc).

        :param row_obj_list: list of source documents.
        """
        load_data = []
        batch_no = 0
        bulk_num = 2000  # flush every 2000 documents
        # BUG FIX: ``tqdm`` is imported as a module, so the progress-bar
        # callable is ``tqdm.tqdm`` (the original called the module itself,
        # raising TypeError).
        for row_obj in tqdm.tqdm(row_obj_list):
            load_data.append({
                "_index": self.index,
                "_type": self.doc_type,
                "_source": row_obj
            })
            # flush a full batch
            if len(load_data) == bulk_num:
                batch_no += 1
                # BUG FIX: the original printed a fractional batch number
                # computed from the running row counter.
                print('插入', batch_no, '批bulk')
                success, failed = helpers.bulk(self.es_client, load_data, index=self.index, raise_on_error=True)
                print(success, failed)
                load_data.clear()
        # flush the final partial batch (< bulk_num docs)
        if load_data:
            success, failed = bulk(self.es_client, load_data, index=self.index, raise_on_error=True)
            load_data.clear()
            print(success, failed)

    def update_by_id(self, row_obj):
        """Update the document identified by the ``_id`` key of *row_obj*.

        :param row_obj: dict containing ``_id`` plus the fields to update.
        :return: None
        """
        # BUG FIX: pop with a default so a missing "_id" no longer raises
        # KeyError (the original get()-then-pop() pair did).
        _id = row_obj.pop("_id", 1)
        self.es_client.update(index=self.index, doc_type=self.doc_type, body={"doc": row_obj}, id=_id)

    def delete_by_id(self, _id):
        """Delete the document with the given id.

        :param _id: document id to delete.
        """
        self.es_client.delete(index=self.index, doc_type=self.doc_type, id=_id)

    def search_by_query(self, body):
        """Run a search query against the index.

        :param body: the query DSL body.
        :return: the raw search response.
        """
        return self.es_client.search(index=self.index, doc_type=self.doc_type, body=body)

    def clear_doc(self):
        """Delete every document in the index, after user confirmation."""
        choice = input('即将删除该index下的所有数据,是否继续?(Y/N):').strip()
        if choice in ('y', 'Y'):
            # match-all delete-by-query removes all documents
            delete_by_all = {"query": {"match_all": {}}}
            result = self.es_client.delete_by_query(index=self.index, body=delete_by_all, doc_type=self.doc_type)
            print(result)
        else:
            print('取消操作')

    @timer
    def add_data_by_bulk(self, row_obj_list):
        """Bulk-insert using a generator of actions (memory friendly).

        :param row_obj_list: iterable of source documents (dicts).
        :return: None
        """
        # Generator lets helpers.bulk stream actions without building the
        # whole list in memory. (Removed the unused local bulk_num.)
        action = ({
            "_index": self.index,
            "_type": self.doc_type,
            "_source": one_row,
        } for one_row in row_obj_list)
        helpers.bulk(self.es_client, action)
if __name__ == '__main__':
    # Index body (mappings + settings) -- adjust to your own needs.
    my_body = {
        "settings": {
            # number of replica shards
            "number_of_replicas": 0,
            # number of primary shards
            "number_of_shards": 3,
            # analysis configuration
            "analysis": {
                # custom token filters
                "filter": {
                    "pinyin_max_word_filter": {
                        "type": "pinyin",
                        "keep_full_pinyin": "true",  # full pinyin per char, e.g. 雪花 -> xue,hua
                        "keep_separate_first_letter": "true",  # initials, e.g. 雪花 -> xh
                        "keep_joined_full_pinyin": True  # joined full pinyin, e.g. 雪花 -> xuehua
                    },
                    "full_pinyin_filter": {
                        "type": "pinyin",
                        "keep_first_letter": False,
                        "keep_separate_first_letter": False,
                        "keep_full_pinyin": False,
                        "none_chinese_pinyin_tokenize": False,
                        "keep_original": False,
                        "limit_first_letter_length": 50,
                        "lowercase": False
                    }
                },
                # custom tokenizers
                "tokenizer": {
                    "my_pinyin_01": {
                        "type": "pinyin",
                        "keep_separate_first_letter": False,
                        "keep_full_pinyin": True,
                        "keep_original": True,
                        "limit_first_letter_length": 16,
                        "lowercase": True,
                        "remove_duplicated_term": True
                    },
                    "my_ngram_tokenizer": {
                        "token_chars": [
                            "letter",
                            "digit"
                        ],
                        "type": "ngram",
                        "min_gram": "2",
                        "max_gram": "3"
                    }
                },
                # character filters
                "char_filter": {
                    "my_char_filter_01": {
                        "type": "mapping",
                        "mappings": [
                            "/n => <br>",
                            "/t => ",
                        ]
                    }
                },
                # custom analyzers
                "analyzer": {
                    # example from the elasticsearch-analysis-pinyin docs
                    "pinyin_analyzer": {
                        "tokenizer": "my_pinyin_01"
                    },
                    # ngram analyzer: helps match Chinese fields that
                    # full-text analysis alone cannot match precisely
                    "my_ngram_analyzer": {
                        "tokenizer": "my_ngram_tokenizer"
                    },
                },
            }
        },
        "mappings": {
            "properties": {
                # platform entity
                "Platform": {
                    "type": "nested",
                    "properties": {
                        # platform home page
                        "platformIndexUrl": {
                            "index": False,
                            "type": "keyword"
                        },
                        # platform introduction
                        "platformIntroduction": {
                            "index": False,
                            "type": "keyword"
                        },
                        # platform name
                        "platformName": {
                            "type": "keyword"
                        },
                        # platform id
                        "platformId": {
                            "type": "keyword"
                        }
                    }
                },
                # school entity
                "School": {
                    "type": "nested",
                    "properties": {
                        "schoolIntroduction": {
                            "index": False,
                            "type": "keyword"
                        },
                        "schoolLogoUrl": {
                            "index": False,
                            "type": "keyword"
                        },
                        "schoolName": {
                            "type": "text",
                            "analyzer": "ik_max_word"
                        }
                    }
                },
                # semester entity
                "Semester": {
                    "type": "nested",
                    "properties": {
                        # teacher entity
                        "Teacher": {
                            "type": "nested",
                            "properties": {
                                "teacherIntroduction": {
                                    "index": False,
                                    "type": "keyword"
                                },
                                "teacherName": {
                                    "index": False,
                                    "type": "keyword"
                                },
                                "teacherPhotoUrl": {
                                    "index": False,
                                    "type": "keyword"
                                }
                            }
                        },
                        "semesterEndTime": {
                            "type": "date",
                            "format": "yyyy-MM-dd"
                        },
                        "semesterReference": {
                            "index": False,
                            "type": "keyword"
                        },
                        "semesterStartTime": {
                            "type": "date",
                            "format": "yyyy-MM-dd"
                        },
                        "semesterStatus": {
                            "type": "byte"
                        },
                        "semesterNo": {
                            "type": "long"
                        }
                    }
                },
                "courseCategory": {
                    "type": "keyword"
                },
                "courseApplicant": {
                    "type": "long"
                },
                "courseCoverUrl": {
                    "index": False,
                    "type": "keyword"
                },
                "courseDetailUrl": {
                    "index": False,
                    "type": "keyword"
                },
                "courseGraderNum": {
                    "type": "long"
                },
                "courseIntroduction": {
                    "type": "text",
                    "analyzer": "ik_max_word"
                },
                "courseIsFree": {
                    "type": "byte"
                },
                "courseName": {
                    "type": "text",
                    "analyzer": "ik_max_word",
                    "search_analyzer": "ik_max_word",
                    "fields": {
                        "pinyin": {
                            "type": "text",
                            "store": False,
                            "term_vector": "with_offsets",
                            "analyzer": "pinyin_analyzer",
                            "boost": 10
                        },
                        "completion": {
                            "type": "completion",
                            "analyzer": "ik_max_word"
                        },
                        "keyword": {
                            "type": "keyword",
                        }
                    }
                },
                "courseScore": {
                    "type": "float"
                },
                "courseSyllabus": {
                    "index": False,
                    "type": "keyword"
                },
                "courseId": {
                    "type": "keyword",
                }
            }
        }
    }
    # ES client object -- remember to create an es.ini with your settings.
    es01 = MyEsClient(filepath='./conf/es.ini', index='moocgle_02', body=my_body)
    # Load the JSON data file to insert.
    try:
        with open('./all01.json', 'r', encoding='utf-8') as fp:
            data_list_01 = json.load(fp=fp)
    except Exception as e:
        print("解析json文件时出错")
        print(e)
        # Abort with a failure code: nothing to insert without parsable
        # data (the original exit() reported success, code 0).
        raise SystemExit(1)
    print(f'json解析完毕,有{len(data_list_01)}条正常数据')
    # rows that failed to insert
    error_list_01 = []
    print("开始插入数据")
    print("插入第1批bulk")
    for a_data in tqdm.tqdm(data_list_01):
        try:
            es01.add_date(row_obj=a_data)
        except Exception as e:
            print(e)
            error_list_01.append(a_data)
    print('---' * 50)
    # report and persist the failed rows
    print(len(error_list_01))
    with open("error_list.txt", "w", encoding='utf-8') as err_fp:
        # BUG FIX: file.write() requires a str; the original passed the
        # list object itself and raised TypeError. Serialize as JSON.
        json.dump(error_list_01, err_fp, ensure_ascii=False, indent=2)
    print('ok')