为了可以方便地实时观测插入过程中有无差错,选择使用自己独立开发的ESClient来实现数据的批量导入。其中关键方法是基于elasticsearch库中helpers.bulk方法实现的add_date_bulk函数:只需要通过json库读取json文件形成列表,再把该列表传入该方法,就可以不断将数据写入es,并且可以观察到进度状态和异常信息。

python 连接 es集群 python编写es脚本_elasticsearch

python 连接 es集群 python编写es脚本_elasticsearch_02

插入脚本

使用方法

  • 创建一个es.ini配置文件,路径为conf/es.ini(可自定义,在main函数中更改),格式如下
[Elasticsearch]
host = xxxx
port = 9200

# 没有账号密码可不写
user = elastic
pass = xxxxxx
  • 准备好自己要插入的json数据(json文件格式)
  • 在下面这段脚本中的main函数根据自己的需求配置好mapping和setting
  • 运行脚本,开始插入
import configparser
import time
import json
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from elasticsearch import helpers
import tqdm

# 运行时间装饰器  语法糖
def timer(func):
    """Decorate *func* so that each call prints its elapsed wall-clock time.

    The wrapped function's return value is passed through unchanged and its
    metadata (``__name__``, ``__doc__``, ...) is preserved.

    :param func: the callable to time
    :return: a wrapper with the same call signature as *func*
    """
    # Local import keeps this block self-contained.
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the interval cannot go negative if
        # the system clock is adjusted while func runs.
        start = time.perf_counter()
        res = func(*args, **kwargs)
        print('共耗时约 {:.2f} 秒'.format(time.perf_counter() - start))
        return res

    return wrapper


# 索引类
class BaseEsClient(object):
    """Connect to Elasticsearch and create or delete indices.

    Connection settings come from the ``[Elasticsearch]`` section of an INI
    file: ``host`` and ``port`` are required, ``user`` and ``pass`` are
    optional.
    """

    def __init__(self, filepath="./conf/es.ini"):
        """Read the configuration and build a client for a single node.

        Only one node is supported; multiple nodes would need a refactor.

        :param filepath: path of the INI configuration file
        :raises FileNotFoundError: if the configuration file cannot be read
        :raises configparser.Error: if the section or a required option is
            missing
        """
        self._load_settings(filepath)

        # Only pass http_auth when credentials were configured, so clusters
        # without security enabled work with a config that omits user/pass.
        client_kwargs = {"hosts": self.es_servers}
        if self.es_auth is not None:
            client_kwargs["http_auth"] = self.es_auth

        try:
            self.es_client = Elasticsearch(**client_kwargs)
        except Exception:
            # Report and re-raise: swallowing the error would leave the
            # instance without an es_client attribute.
            print('连接失败')
            raise

        # NOTE(review): building the client presumably does not contact the
        # cluster yet; use es_client.ping() if a real liveness check is needed.
        print(f'连接成功,目标url为: {self.es_servers}')

    def _load_settings(self, filepath):
        """Parse the INI file into ``conf``, ``es_servers`` and ``es_auth``.

        ``es_auth`` is a ``(user, password)`` tuple, or ``None`` when no user
        is configured.

        :param filepath: path of the INI configuration file
        :raises FileNotFoundError: if the file cannot be read
        """
        self.conf = configparser.ConfigParser()
        # ConfigParser.read() silently skips unreadable files and returns the
        # list of files it parsed, so an empty result means nothing was loaded.
        if not self.conf.read(filepath, encoding='utf-8'):
            raise FileNotFoundError(
                f'cannot read Elasticsearch config file: {filepath}')

        section = 'Elasticsearch'
        self.es_servers = [{
            "host": self.conf.get(section, 'Host'),
            # getint: the port is numeric, not the raw string from the file.
            "port": self.conf.getint(section, 'Port'),
        }]

        user = self.conf.get(section, 'User', fallback=None)
        password = self.conf.get(section, 'Pass', fallback=None)
        self.es_auth = (user, password or '') if user else None

    def create_index(self, index_name):
        """Create an index with the server's default settings.

        :param index_name: name of the index to create
        :return: None
        """
        self.es_client.indices.create(index=index_name)

    def create_index_by_body(self, index_name: str, body: dict):
        """Create an index from an explicit body.

        :param index_name: name of the index to create
        :param body: the configuration for the index (``settings`` and
            ``mappings``)
        :return: None
        """
        self.es_client.indices.create(index=index_name, body=body)

    def delete_index(self, index_name: str):
        """Delete an index.

        :param index_name: name of the index to delete
        :return: None
        """
        self.es_client.indices.delete(index=index_name)

    # There is no "enter"/"exit" step for an index, unlike a database session.


class MyEsClient(BaseEsClient):
    """Create, read, update and delete documents in one index."""

    # Index body used when the caller does not supply one.
    default_body = {
        "mappings": {},
        "settings": {
            "index": {
                "number_of_shards": "1",
                "number_of_replicas": "0",
            }
        }
    }

    def __init__(self, filepath="./conf/es.ini", index='index01', doc_type='_doc', body=None):
        """Connect and make sure the target index exists.

        If the index does not exist it is created from *body*, or from
        ``default_body`` when *body* is empty.

        :param filepath: path of the INI configuration file
        :param index: name of the index this client operates on
        :param doc_type: document type passed to the document APIs
        :param body: index body (``settings`` and ``mappings``) used on creation
        """
        # Local import keeps this block self-contained.
        import copy

        super().__init__(filepath=filepath)

        self.index = index
        self.doc_type = doc_type
        # Deep copy so the instance never shares the class-level default dict.
        self.body = body if body else copy.deepcopy(self.default_body)

        if not self.es_client.indices.exists(index=index):
            # Pass the resolved body: the raw argument may be None, in which
            # case the default body would be ignored.
            self.create_index_by_body(index_name=index, body=self.body)
            print('提示:创建了index,body结构如下')
            print(self.body)

    def set_index_mapping(self, set_mappings):
        """Set the ``properties`` part of the index mapping.

        Only ``properties`` is handled; other mapping parameters are not.
        The index (and any custom analyzers the fields refer to) must already
        exist.

        Example argument::

            set_mappings = {
                "answer": {"type": "string", "index": "not_analyzed"},
                "answerAuthor": {"type": "string"},
                "answerDate": {
                    "type": "date",
                    # composite format
                    "format": "strict_date_optional_time||epoch_millis"
                },
            }

        NOTE(review): the body nests ``properties`` under the doc type, which
        looks like the typed-mapping layout; confirm it matches the target
        Elasticsearch version.

        :param set_mappings: dict mapping field names to field definitions
        """
        mapping = {
            self.doc_type: {
                "properties": set_mappings
            }
        }
        self.es_client.indices.put_mapping(index=self.index, doc_type=self.doc_type, body=mapping)

    def add_date(self, row_obj):
        """Index a single document.

        :param row_obj: the document, as a dict
        """
        self.es_client.index(index=self.index, doc_type=self.doc_type, body=row_obj)

    def add_date_bulk(self, row_obj_list):
        """Index documents in batches of 2000, showing a progress bar.

        :param row_obj_list: list of documents (same shape as ``add_date``)
        """
        bulk_num = 2000  # documents per batch
        load_data = []
        batch_no = 0
        # The file imports the module (`import tqdm`), so the progress-bar
        # class must be reached as tqdm.tqdm.
        for row_obj in tqdm.tqdm(row_obj_list):
            load_data.append({
                "_index": self.index,
                "_type": self.doc_type,
                "_source": row_obj
            })

            if len(load_data) == bulk_num:
                batch_no += 1
                print('插入', batch_no, '批bulk')
                self._flush_bulk(load_data)

        # Send whatever is left over (fewer than bulk_num documents).
        if load_data:
            self._flush_bulk(load_data)

    def _flush_bulk(self, load_data):
        """Send the buffered actions in one bulk request and empty the buffer."""
        success, failed = helpers.bulk(self.es_client, load_data, index=self.index, raise_on_error=True)
        print(success, failed)
        load_data.clear()

    def update_by_id(self, row_obj):
        """Partially update the document identified by ``row_obj["_id"]``.

        The remaining keys of *row_obj* are sent as the partial document.
        When ``_id`` is absent the id defaults to 1. *row_obj* is not modified.

        :param row_obj: dict with the new field values and an ``_id`` key
        :return: None
        """
        # Work on a copy so the caller's dict keeps its "_id"; pop with a
        # default so a missing "_id" does not raise KeyError.
        doc = dict(row_obj)
        _id = doc.pop("_id", 1)
        self.es_client.update(index=self.index, doc_type=self.doc_type, body={"doc": doc}, id=_id)

    def delete_by_id(self, _id):
        """Delete the document with the given id.

        :param _id: id of the document to delete
        :return: None
        """
        self.es_client.delete(index=self.index, doc_type=self.doc_type, id=_id)

    def search_by_query(self, body):
        """Run a search against the index.

        :param body: the search request body
        :return: the response from the client's ``search`` call
        """
        search_result = self.es_client.search(index=self.index, doc_type=self.doc_type, body=body)
        return search_result

    def clear_doc(self):
        """Delete every document in the index after interactive confirmation.

        :return: None
        """
        choic = input('即将删除该index下的所有数据,是否继续?(Y/N):').strip()

        if choic in ('y', 'Y'):
            # match_all selects every document in the index.
            delete_by_all = {"query": {"match_all": {}}}
            result = self.es_client.delete_by_query(index=self.index, body=delete_by_all, doc_type=self.doc_type)
            print(result)
        else:
            print('取消操作')

    @timer
    def add_data_by_bulk(self, row_obj_list):
        """Bulk-index documents lazily from a generator and print the time taken.

        :param row_obj_list: iterable of documents
        :return: None
        """
        bulk_num = 2000  # documents per bulk request

        # Generator expression: actions are built on demand, not all at once.
        action = ({
            "_index": self.index,
            "_type": self.doc_type,
            "_source": one_row,
        } for one_row in row_obj_list)

        # chunk_size applies the intended batch size; the constant was unused.
        helpers.bulk(self.es_client, action, chunk_size=bulk_num)


if __name__ == '__main__':
    # Script entry point: create the index (if missing) from my_body, load
    # documents from a JSON file, index them one by one, and dump the
    # documents that failed to error_list.txt.

    # Index body (settings + mappings); adjust to your own needs.
    my_body = {
        "settings": {
            # number of replicas
            "number_of_replicas": 0,

            # number of shards
            "number_of_shards": 3,

            # analysis configuration
            "analysis": {

                # custom token filters
                "filter": {
                    "pinyin_max_word_filter": {
                        "type": "pinyin",
                        "keep_full_pinyin": "true",  # full pinyin per character, e.g. 雪花 -> xue, hua
                        "keep_separate_first_letter": "true",  # first letters, e.g. 雪花 -> xh
                        "keep_joined_full_pinyin": True  # joined full pinyin, e.g. 雪花 -> xuehua
                    },
                    "full_pinyin_filter": {
                        "type": "pinyin",
                        "keep_first_letter": False,
                        "keep_separate_first_letter": False,
                        "keep_full_pinyin": False,
                        "none_chinese_pinyin_tokenize": False,
                        "keep_original": False,
                        "limit_first_letter_length": 50,
                        "lowercase": False
                    }

                },

                # custom tokenizers
                "tokenizer": {
                    "my_pinyin_01": {
                        "type": "pinyin",
                        "keep_separate_first_letter": False,
                        "keep_full_pinyin": True,
                        "keep_original": True,
                        "limit_first_letter_length": 16,
                        "lowercase": True,
                        "remove_duplicated_term": True
                    },
                    "my_ngram_tokenizer": {
                        "token_chars": [
                            "letter",
                            "digit"
                        ],
                        "type": "ngram",
                        "min_gram": "2",
                        "max_gram": "3"
                    }
                },

                # character filters
                # NOTE(review): "/n" and "/t" look like they were meant to be
                # the newline/tab escapes - confirm before relying on them.
                "char_filter": {
                    "my_char_filter_01": {
                        "type": "mapping",
                        "mappings": [
                            "/n => <br>",
                            "/t =>   ",
                        ]
                    }
                },

                # custom analyzers
                "analyzer": {
                    # taken from the pinyin plugin's GitHub documentation example
                    "pinyin_analyzer": {
                        "tokenizer": "my_pinyin_01"
                    },

                    # used where Chinese fields cannot be matched exactly
                    "my_ngram_analyzer": {
                        "tokenizer": "my_ngram_tokenizer"
                    },

                },

            }
        },
        "mappings": {
            "properties": {
                # platform
                "Platform": {
                    "type": "nested",
                    "properties": {

                        # platform home page
                        "platformIndexUrl": {
                            "index": False,
                            "type": "keyword"
                        },

                        # platform introduction
                        "platformIntroduction": {
                            "index": False,
                            "type": "keyword"
                        },

                        # platform name
                        "platformName": {
                            "type": "keyword"
                        },
                        # platform id
                        "platformId": {
                            "type": "keyword"
                        }
                    }
                },

                # school
                "School": {
                    "type": "nested",
                    "properties": {
                        "schoolIntroduction": {
                            "index": False,
                            "type": "keyword"
                        },
                        "schoolLogoUrl": {
                            "index": False,
                            "type": "keyword"
                        },
                        "schoolName": {
                            "type": "text",
                            "analyzer": "ik_max_word"
                        }
                    }
                },

                # semester entity
                "Semester": {
                    "type": "nested",
                    "properties": {
                        # teacher entity
                        "Teacher": {
                            "type": "nested",
                            "properties": {
                                "teacherIntroduction": {
                                    "index": False,
                                    "type": "keyword"
                                },
                                "teacherName": {
                                    "index": False,
                                    "type": "keyword"
                                },
                                "teacherPhotoUrl": {
                                    "index": False,
                                    "type": "keyword"
                                }
                            }
                        },
                        "semesterEndTime": {
                            "type": "date",
                            "format": "yyyy-MM-dd"
                        },
                        "semesterReference": {
                            "index": False,
                            "type": "keyword"
                        },
                        "semesterStartTime": {
                            "type": "date",
                            "format": "yyyy-MM-dd"
                        },
                        "semesterStatus": {
                            "type": "byte"
                        },
                        "semesterNo": {
                            "type": "long"
                        }
                    }
                },
                "courseCategory": {
                    "type": "keyword"
                },
                "courseApplicant": {
                    "type": "long"
                },
                "courseCoverUrl": {
                    "index": False,
                    "type": "keyword"
                },
                "courseDetailUrl": {
                    "index": False,
                    "type": "keyword"
                },
                "courseGraderNum": {
                    "type": "long"
                },
                "courseIntroduction": {
                    "type": "text",
                    "analyzer": "ik_max_word"
                },
                "courseIsFree": {
                    "type": "byte"
                },
                "courseName": {
                    "type": "text",
                    "analyzer": "ik_max_word",
                    "search_analyzer": "ik_max_word",
                    "fields": {
                        "pinyin": {
                            "type": "text",
                            "store": False,
                            "term_vector": "with_offsets",
                            "analyzer": "pinyin_analyzer",
                            "boost": 10
                        },
                        "completion": {
                            "type": "completion",
                            "analyzer": "ik_max_word"
                        },
                        "keyword": {
                            "type": "keyword",
                        }
                    }
                },
                "courseScore": {
                    "type": "float"
                },
                "courseSyllabus": {
                    "index": False,
                    "type": "keyword"
                },
                "courseId": {
                    "type": "keyword",
                }
            }
        }
    }

    # Client for the target index; remember to create es.ini with your own
    # Elasticsearch settings first.
    es01 = MyEsClient(filepath='./conf/es.ini', index='moocgle_02', body=my_body)

    # Your own data file in JSON format.
    try:
        with open('./all01.json', 'r', encoding='utf-8') as fp:
            data_list_01 = json.load(fp=fp)
    except Exception as e:
        print("解析json文件时出错")
        print(e)
        # Exit with a non-zero status so callers can detect the failure;
        # SystemExit does not depend on the interactive exit() helper.
        raise SystemExit(1)

    print(f'json解析完毕,有{len(data_list_01)}条正常数据')

    # Documents that failed to index.
    error_list_01 = []
    print("开始插入数据")

    print("插入第1批bulk")
    # Index one document at a time so each failure can be recorded separately.
    for a_data in tqdm.tqdm(data_list_01):
        try:
            es01.add_date(row_obj=a_data)
        except Exception as e:
            print(e)
            error_list_01.append(a_data)

    print('---' * 50)

    # Report and persist the documents that failed.
    print(len(error_list_01))
    with open("error_list.txt", "w", encoding='utf-8') as err_fp:
        # write() accepts only str; serialise the list as JSON so the failed
        # documents can be reloaded and retried later.
        json.dump(error_list_01, err_fp, ensure_ascii=False, indent=2)

    print('ok')