python实现对es的大数据分页

原创

langy1990 2023-08-18 09:47:22 ©著作权

文章标签 分页 python Code 文章分类 Python 后端开发

©著作权归作者所有：来自51CTO博客作者langy1990的原创作品，请联系作者获取转载授权，否则将追究法律责任

scroll分页

scroll分页可以实现es中的大量数据分页，但是不能实现大跨度的跳转分页。

例如从第1页直接跳转到第100页，就会导致超时；最好是一页一页地向下翻页，这样就可以无限往下分页。

查询es的时候，最好指定要查找的索引名称或者索引名称的通配符表达式（例如 app-dzswj-business-*），这样可以大大提高es的查询效率。

def hostLogsdata(self,appname,startTime,endTime,hostname,pageIndex,pageSize,sortColumn,sortType):
        """Return one page of log entries for a host and application via the scroll API.

        The query selects documents that match the ``appname:<appname>`` query
        string, whose ``hostname`` equals ``hostname``, whose ``success`` field
        matches ``"false"`` and whose ``@timestamp`` lies in the requested range.

        Args:
            appname: Value appended to the ``appname:`` query-string clause.
            startTime: Range start; normalised by ``self.formartStartTime``.
            endTime: Range end; normalised by ``self.formartEndTime``.
            hostname: Host name used in the ``term`` filter.
            pageIndex: 1-based page number (int or numeric string).
            pageSize: Number of hits per page (int or numeric string).
            sortColumn: Field to sort on. Empty means ``@timestamp`` descending;
                ``"logtime"`` is treated as an alias of ``@timestamp``.
            sortType: Value placed in the sort clause's ``"order"`` key when
                ``sortColumn`` is given.

        Returns:
            dict: ``{'code': 'SUCCESS', 'message': '', 'data': {'total': ..., 'list': [...]}}``
            where ``list`` holds the formatted hits of the requested page.
        """
        pageIndex = int(pageIndex)
        pageSize = int(pageSize)
        userChoiceTime_start = self.formartStartTime(startTime)
        userChoiceTime_end = self.formartEndTime(endTime)

        if not sortColumn:
            sort_dict = {"@timestamp": {"order": "desc"}}
        else:
            if sortColumn == "logtime":
                sortColumn = "@timestamp"
            sort_dict = {sortColumn: {"order": sortType}}

        body = {
            # NOTE(review): each page is sized by the size=pageSize request
            # parameter below; this body-level value appears to be overridden
            # by it -- confirm before relying on it.
            "size": 0,
            "query": {
                "filtered": {
                    "query": {
                        "query_string": {
                            "analyze_wildcard": True,
                            "query": "appname:" + appname
                        }
                    },
                    "filter": {
                        "bool": {
                            "must": [
                                {"term": {"hostname": hostname}},
                                {"match": {"success": "false"}},
                                {
                                    "range": {
                                        "@timestamp": {
                                            "gte": tools.strtime_to_timestamp(userChoiceTime_start),
                                            "lte": tools.strtime_to_timestamp(userChoiceTime_end)
                                        }
                                    }
                                }
                            ]
                        }
                    }
                }
            },
            "sort": sort_dict
        }

        start = datetime.datetime.now()
        scroll_id = None
        try:
            # The first request opens the scroll and already returns page 1.
            res = self.es.search(body=body, scroll='1m', size=pageSize)
            scroll_size = res['hits']['total']
            scroll_id = res['_scroll_id']
            # A scroll cursor only moves forward, so page N is reached by
            # advancing N-1 times from page 1.
            for _ in range(pageIndex - 1):
                if not res['hits']['hits']:
                    # Past the last page: further scrolling returns nothing new.
                    break
                res = self.es.scroll(scroll_id=scroll_id, scroll='1m')
                # Always continue with the most recent scroll id.
                scroll_id = res.get('_scroll_id', scroll_id)
        finally:
            # Release the server-side scroll context instead of letting it
            # linger until the 1m keep-alive expires.
            if scroll_id is not None:
                self.es.clear_scroll(scroll_id=scroll_id)

        reslist = []
        for hit in res["hits"]["hits"]:
            source = hit["_source"]
            reslist.append({
                "logtime": self.formatDate(source["logtime"]),
                "interface": source.get("interface", "无数据"),
                "appname": source["appname"],
                "waste_time": str(source.get("waste_time", "无数据")) + "ms",
                "jylsh": source.get("jylsh", "无数据"),
            })
        data = {'code': 'SUCCESS', 'message': '', 'data': {"total": scroll_size, "list": reslist}}
        end = datetime.datetime.now()
        print('Running time: %s Seconds' % (end - start))
        return data

分页函数

python实现对es的大数据分页_Code

打印函数执行的耗时时间

# Excerpt from the paging method above: measure how long fetching and
# formatting one page takes. It relies on self, body, pageIndex and pageSize
# from the enclosing method.
start = datetime.datetime.now()
scroll_id = None
try:
    # The first request opens the scroll and already returns page 1.
    res = self.es.search(body=body, scroll='1m', size=pageSize)
    scroll_size = res['hits']['total']
    scroll_id = res['_scroll_id']
    # Advance the forward-only cursor until the requested page is reached.
    for _ in range(pageIndex - 1):
        if not res['hits']['hits']:
            break
        res = self.es.scroll(scroll_id=scroll_id, scroll='1m')
        # Always continue with the most recent scroll id.
        scroll_id = res.get('_scroll_id', scroll_id)
finally:
    # Release the server-side scroll context once the page has been read.
    if scroll_id is not None:
        self.es.clear_scroll(scroll_id=scroll_id)

reslist = []
for hit in res["hits"]["hits"]:
    source = hit["_source"]
    reslist.append({
        "logtime": self.formatDate(source["logtime"]),
        "interface": source.get("interface", "无数据"),
        "appname": source["appname"],
        "waste_time": str(source.get("waste_time", "无数据")) + "ms",
        "jylsh": source.get("jylsh", "无数据"),
    })
data = {'code': 'SUCCESS', 'message': '', 'data': {"total": scroll_size, "list": reslist}}
end = datetime.datetime.now()
# end - start is a datetime.timedelta, printed in its default string form.
print('Running time: %s Seconds' % (end - start))

View Code

python正则匹配字符串替换

def geteslogs(self,startTime,endTime,querycondition,pageIndex,pageSize):
        """Return one page of log hits plus a date histogram for the same query.

        Hits are read from the ``app-dzswj-business-*`` indices with the scroll
        API, newest first. Each hit gets a top-level ``logtime`` copied from its
        ``_source``, and its ``_source`` is replaced by its string form with the
        search term wrapped in a yellow-background ``<span>``.

        Args:
            startTime: Range start; normalised by ``self.formartStartTime``.
            endTime: Range end; normalised by ``self.formartEndTime``.
            querycondition: Query-string expression; ``""`` means ``"*"``.
            pageIndex: 1-based page number (int or numeric string).
            pageSize: Number of hits per page (int or numeric string).

        Returns:
            dict: ``{'code': 'SUCCESS', 'message': '', 'data': {'total': ...,
            'list': [...], 'bars': [...]}}`` where ``bars`` are the histogram
            buckets, each extended with a ``localtime`` entry.
        """
        userChoiceTime_start = self.formartStartTime(startTime)
        userChoiceTime_end = self.formartEndTime(endTime)
        res_time = self.calc_time(startTime, endTime)
        interval = tools.set_interval(res_time[0], res_time[1])

        pageIndex = int(pageIndex)
        pageSize = int(pageSize)

        if querycondition == "":
            querycondition = "*"

        index_pattern = "app-dzswj-business-*"
        range_start = tools.strtime_to_timestamp(userChoiceTime_start)
        range_end = tools.strtime_to_timestamp(userChoiceTime_end)

        # The hit query and the histogram query share the same sort and filter.
        sort_clause = [{"@timestamp": {"order": "desc", "unmapped_type": "boolean"}}]
        query_clause = {
            "filtered": {
                "query": {
                    "query_string": {"analyze_wildcard": True, "query": querycondition}
                },
                "filter": {
                    "bool": {
                        "must": [{
                            "range": {
                                "@timestamp": {
                                    "gte": range_start,
                                    "lte": range_end,
                                    "format": "epoch_millis"
                                }
                            }
                        }],
                        "must_not": []
                    }
                }
            }
        }
        tbody = {
            "sort": sort_clause,
            "query": query_clause,
            "fields": ["*", "_source"]
        }
        cbody = {
            "sort": sort_clause,
            "query": query_clause,
            "aggs": {
                "2": {
                    "date_histogram": {
                        "field": "@timestamp",
                        "interval": interval,
                        "time_zone": "Asia/Shanghai",
                        "min_doc_count": 0,
                        "extended_bounds": {"min": range_start, "max": range_end}
                    }
                }
            },
            "fields": ["*", "_source"]
        }

        scroll_id = None
        try:
            # The first request opens the scroll and already returns page 1.
            res = self.es.search(index=index_pattern, body=tbody, scroll='1m', size=pageSize)
            scroll_id = res['_scroll_id']
            # A scroll cursor only moves forward, so page N is reached by
            # advancing N-1 times from page 1.
            for _ in range(pageIndex - 1):
                if not res["hits"]["hits"]:
                    # Past the last page: further scrolling returns nothing new.
                    break
                res = self.es.scroll(scroll_id=scroll_id, scroll='1m')
                # Always continue with the most recent scroll id.
                scroll_id = res.get('_scroll_id', scroll_id)
        finally:
            # Release the server-side scroll context once the page has been read.
            if scroll_id is not None:
                self.es.clear_scroll(scroll_id=scroll_id)

        reslist = res["hits"]["hits"]
        total = res["hits"]["total"]

        span_open = "<span style='background-color:yellow'>"
        span_close = "</span>"
        if ":" in querycondition:
            # "field:value" style query: highlight the part after the first colon.
            term = querycondition.split(":")[1]
            pattern = None
        else:
            term = querycondition
            pattern = re.compile(re.escape(term), re.IGNORECASE)

        # NOTE(review): the stringified _source is embedded in HTML without
        # escaping; confirm that the consumer sanitises it.
        for hit in reslist:
            hit["logtime"] = hit["_source"]["logtime"]
            text = str(hit["_source"])
            if pattern is not None:
                # A function replacement keeps every match's own casing and is
                # not confused by backslashes in the matched text.
                text = pattern.sub(lambda m: span_open + m.group(0) + span_close, text)
            elif term:
                # An empty term would make str.replace wrap every character gap.
                text = text.replace(term, span_open + term + span_close)
            hit["_source"] = text

        # Histogram (bar chart) query over the same filter.
        res2 = self.es.search(index=index_pattern, body=cbody)
        bars = res2["aggregations"]["2"]["buckets"]
        for bar in bars:
            bar["localtime"] = self.timestamp13_to_date(bar["key"])
        data = {'code': 'SUCCESS', 'message': '', 'data': {"total": total, "list": reslist, "bars": bars}}
        return data

View Code

python一次性查询多个索引

class esLogAPI(object):
    """Holds the Elasticsearch client, exposed as ``self.es``."""

    def __init__(self, url):
        """Create the client for ``url``; ``timeout=120`` is passed to its constructor."""
        client = Elasticsearch(url, timeout=120)
        self.es = client



# Search several indices in a single request by passing a list of index names.
res = self.es.search(index=["app-dzswj-business-2019.09.20","app-dzswj-business-2019.09.12"],body=tbody, scroll='1m', size=pageSize)


查看官方api
 def search(self, index=None, doc_type=None, body=None, params=None):
        """
        Execute a search query and get back search hits that match the query.
        `<http://www.elastic.co/guide/en/elasticsearch/reference/current/search-search.html>`_

        :arg index: A comma-separated list of index names to search; use `_all`
            or empty string to perform the operation on all indices

View Code

python查询es的时候动态匹配索引名称

def getindexes(self,startTime,endTime):
        """Build one ``*-YYYY.MM.DD`` index pattern per day in the time range.

        Both timestamps are parsed as ``%Y-%m-%dT%H:%M:%S.%fZ`` and shifted by
        +8 hours before their calendar dates are taken. Every day from the
        start date to the end date (inclusive) yields one pattern, e.g.
        ``*-2019.09.05``.

        Args:
            startTime: Range start, e.g. ``"2019-09-04T12:00:00.000Z"``.
            endTime: Range end in the same format.

        Returns:
            list[str]: Index patterns in chronological order; empty when the
            end date precedes the start date.

        Raises:
            ValueError: If either timestamp does not match the expected format.
        """
        time_format = "%Y-%m-%dT%H:%M:%S.%fZ"
        local_offset = datetime.timedelta(hours=8)
        # Compare calendar dates, not datetimes: otherwise the last day is
        # dropped whenever the start's time of day is later than the end's.
        first_day = (datetime.datetime.strptime(startTime, time_format) + local_offset).date()
        last_day = (datetime.datetime.strptime(endTime, time_format) + local_offset).date()
        day_count = (last_day - first_day).days + 1
        return [
            "*-%s" % (first_day + datetime.timedelta(days=offset)).strftime("%Y.%m.%d")
            for offset in range(day_count)
        ]


    def geteslogs(self,startTime,endTime,querycondition,pageIndex,pageSize):
        """Return one page of log hits plus a date histogram for the same query.

        The indices to search are the per-day patterns returned by
        ``self.getindexes(startTime, endTime)``. Hits are read with the scroll
        API, newest first. Each hit gets a top-level ``logtime`` copied from its
        ``_source``, and its ``_source`` is replaced by its string form with the
        search term wrapped in a yellow-background ``<span>``.

        Args:
            startTime: Range start; normalised by ``self.formartStartTime``.
            endTime: Range end; normalised by ``self.formartEndTime``.
            querycondition: Query-string expression; ``""`` means ``"*"``.
            pageIndex: 1-based page number (int or numeric string).
            pageSize: Number of hits per page (int or numeric string).

        Returns:
            dict: ``{'code': 'SUCCESS', 'message': '', 'data': {'total': ...,
            'list': [...], 'bars': [...]}}`` where ``bars`` are the histogram
            buckets, each extended with a ``localtime`` entry.
        """
        userChoiceTime_start = self.formartStartTime(startTime)
        userChoiceTime_end = self.formartEndTime(endTime)
        res_time = self.calc_time(startTime, endTime)
        interval = tools.set_interval(res_time[0], res_time[1])

        pageIndex = int(pageIndex)
        pageSize = int(pageSize)

        if querycondition == "":
            querycondition = "*"

        range_start = tools.strtime_to_timestamp(userChoiceTime_start)
        range_end = tools.strtime_to_timestamp(userChoiceTime_end)

        # The hit query and the histogram query share the same sort and filter.
        sort_clause = [{"@timestamp": {"order": "desc", "unmapped_type": "boolean"}}]
        query_clause = {
            "filtered": {
                "query": {
                    "query_string": {"analyze_wildcard": True, "query": querycondition}
                },
                "filter": {
                    "bool": {
                        "must": [{
                            "range": {
                                "@timestamp": {
                                    "gte": range_start,
                                    "lte": range_end,
                                    "format": "epoch_millis"
                                }
                            }
                        }],
                        "must_not": []
                    }
                }
            }
        }
        tbody = {
            "sort": sort_clause,
            "query": query_clause,
            "fields": ["*", "_source"]
        }
        cbody = {
            "sort": sort_clause,
            "query": query_clause,
            "aggs": {
                "2": {
                    "date_histogram": {
                        "field": "@timestamp",
                        "interval": interval,
                        "time_zone": "Asia/Shanghai",
                        "min_doc_count": 0,
                        "extended_bounds": {"min": range_start, "max": range_end}
                    }
                }
            },
            "fields": ["*", "_source"]
        }

        # Restrict the search to the daily indices covered by the time range.
        index_list_pattern = self.getindexes(startTime, endTime)

        scroll_id = None
        try:
            # The first request opens the scroll and already returns page 1.
            res = self.es.search(index=index_list_pattern, body=tbody, scroll='1m', size=pageSize)
            scroll_id = res['_scroll_id']
            # A scroll cursor only moves forward, so page N is reached by
            # advancing N-1 times from page 1.
            for _ in range(pageIndex - 1):
                if not res["hits"]["hits"]:
                    # Past the last page: further scrolling returns nothing new.
                    break
                res = self.es.scroll(scroll_id=scroll_id, scroll='1m')
                # Always continue with the most recent scroll id.
                scroll_id = res.get('_scroll_id', scroll_id)
        finally:
            # Release the server-side scroll context once the page has been read.
            if scroll_id is not None:
                self.es.clear_scroll(scroll_id=scroll_id)

        reslist = res["hits"]["hits"]
        total = res["hits"]["total"]

        span_open = "<span style='background-color:yellow'>"
        span_close = "</span>"
        if ":" in querycondition:
            # "field:value" style query: highlight the part after the first colon.
            term = querycondition.split(":")[1]
            pattern = None
        else:
            term = querycondition
            pattern = re.compile(re.escape(term), re.IGNORECASE)

        # NOTE(review): the stringified _source is embedded in HTML without
        # escaping; confirm that the consumer sanitises it.
        for hit in reslist:
            hit["logtime"] = hit["_source"]["logtime"]
            text = str(hit["_source"])
            if pattern is not None:
                # A function replacement keeps every match's own casing and is
                # not confused by backslashes in the matched text.
                text = pattern.sub(lambda m: span_open + m.group(0) + span_close, text)
            elif term:
                # An empty term would make str.replace wrap every character gap.
                text = text.replace(term, span_open + term + span_close)
            hit["_source"] = text

        # Histogram (bar chart) query over the same indices and filter.
        res2 = self.es.search(index=index_list_pattern, body=cbody)
        bars = res2["aggregations"]["2"]["buckets"]
        for bar in bars:
            bar["localtime"] = self.timestamp13_to_date(bar["key"])
        data = {'code': 'SUCCESS', 'message': '', 'data': {"total": total, "list": reslist, "bars": bars}}
        return data

匹配索引后缀

实现分组查询匹配

# Walk a six-level nested aggregation result. The aggregations are keyed
# "2" through "7", and each level's buckets contain the next level.
for i2 in res["aggregations"]["2"]["buckets"]:
    for i3 in i2["3"]["buckets"]:
        for i4 in i3["4"]["buckets"]:
            for i5 in i4["5"]["buckets"]:
                for i6 in i5["6"]["buckets"]:
                    for i7 in i6["7"]["buckets"]:
                        # Drop the last six characters of the bucket's
                        # key_as_string -- presumably a UTC-offset suffix
                        # such as "+08:00"; confirm against the real data.
                        timestr = i7["key_as_string"][:-6]

View Code