Several ways to implement database pagination

Broadly speaking, there are three ways to paginate database results:

  • limit/offset
  • cursor
  • scroll

SQL

-- Via LIMIT/OFFSET
-- MySQL
SELECT * FROM <table_name> ORDER BY <field> LIMIT <offset>,<page_size>;
-- PostgreSQL
SELECT * FROM <table_name> ORDER BY <field> LIMIT <page_size> OFFSET <offset>;


-- Via the ROW_NUMBER() window function (Hive; MySQL >= 8.0)
WITH t AS (
	SELECT *, ROW_NUMBER() OVER(ORDER BY <field> ASC) AS row_id FROM <table_name> 
)
SELECT * FROM t WHERE row_id BETWEEN <offset> + 1 AND <offset> + <page_size>;

-- Via a user variable (for MySQL versions without window functions)
SELECT * FROM (
	SELECT *, @row_id := @row_id + 1 AS row_id
    FROM <table_name>, (SELECT @row_id := 0) AS t
    ORDER BY <field>
) AS tt 
WHERE row_id BETWEEN <offset> + 1 AND <offset> + <page_size>;
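
-- Keyset (seek) pagination is another common way to implement the "cursor" style in plain SQL:
-- remember the sort-key value of the last row of the current page (<last_value>) and start the
-- next page right after it. A minimal sketch, assuming <field> is indexed and unique:
SELECT * FROM <table_name>
WHERE <field> > <last_value>
ORDER BY <field> ASC
LIMIT <page_size>;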


-- Cursor: available inside stored procedures / functions / triggers / events
DELIMITER // -- change the statement delimiter
CREATE PROCEDURE p2() -- create a stored procedure named p2
BEGIN
    DECLARE id, total INT; -- variables that receive the cursor values
    DECLARE name VARCHAR(20) CHARACTER SET utf8; -- note: specify a character set when the cursor value contains Chinese text
    DECLARE done INT DEFAULT 0;  -- flag marking the end of the cursor

    DECLARE mycur CURSOR FOR SELECT stuId,stuName FROM student WHERE stuAge > 19; -- declare the cursor
    DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = 1; -- set the flag when the cursor has no more rows
    OPEN mycur;  -- open the cursor
    SET total = 0; -- initialize the counter

    myCursorLabel:LOOP  -- LOOP over the cursor
        FETCH mycur INTO id,name;  -- fetch the row the cursor currently points to
        IF done = 1 THEN  -- when the flag is set to 1, leave the loop
            LEAVE myCursorLabel;
        END IF;
        -- DO YOUR THING ...
        SET total = total + 1;
    END LOOP;

    CLOSE mycur;  -- close the cursor

    SELECT total;  -- return the result
END //
DELIMITER ;
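
-- A minimal way to run the procedure defined above and clean it up afterwards
CALL p2();                     -- runs the cursor loop and returns total
DROP PROCEDURE IF EXISTS p2;   -- remove the procedure when it is no longer needed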

MongoDB

// limit/offset
db.myCollection
    .find(<filter_condition>)
    .sort({"<field>":1})
    .skip(<offset>).limit(<page_size>);

// hasNext
var datas = db.myCollection.find(<filter_condition>).sort({"<field>":1});
while (datas.hasNext()) {
  var item = datas.next();
  print(
    'INSERT INTO <mytable>(`field1`, `field2`) VALUES'
    + ' ('
    + "'" + item.property1 + "',"
    + "'" + item.property2 + "'"
    + ');'
  );
}

Elasticsearch

# The from/size approach
POST <index>/_search
{
  "query": {
    "bool": {
      "filter": [
        {
          "term": {
            "age": 28
          }
        }
      ]
    }
  },
  "from": <offset>,
  "size": <page_size>,
  "sort": [
    {
      "timestamp": {
        "order": "desc"
      },
      "_id": {
        "order": "desc"
      }
    }
  ]
}
# Once from + size exceeds 10000, the from/size approach fails with the error below
{
  "error": {
    "root_cause": [
      {
        "type": "query_phase_execution_exception",
        "reason": "Result window is too large, from + size must be less than or equal to: [10000] but was [10005]. See the scroll api for a more efficient way to request large data sets. This limit can be set by changing the [index.max_result_window] index level setting."
      }
    ]
  }
}

## Why the limit exists
# For each query, ES has to collect (from + size) matching documents on every shard, i.e. shard_count * (from + size) in total, sort them globally, and then return only the last `size` documents of that window
# On a very large index this makes deep paging with from + size impractical: the deeper the page, the easier it is to run Out Of Memory, and even when that does not happen the query burns a lot of CPU and memory
# To protect the cluster, ES enforces the index.max_result_window setting (default 10000), i.e. from + size must not exceed 10000 (the setting can be changed dynamically or in the config file, but doing so is discouraged because the query pattern itself is inefficient)
# For deeper paging, switch to the scroll API that ES provides
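
# If you really have to go past the default window with from/size, index.max_result_window can be raised per index via the settings API
# (as noted above this is discouraged: it only moves the limit, the query stays inefficient; 20000 below is just an illustrative value)
PUT <index>/_settings
{
  "index": {
    "max_result_window": 20000
  }
}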

## How scroll works
# scroll behaves like a cursor in a relational database, so it is not suited to real-time search; it is meant for background batch jobs
# scroll is faster for deep paging because from/size has to re-sort the whole result set on every request, which is wasteful; scroll sorts the data once and then hands it out batch by batch

## How to use scroll
# (1) Initialize: run the query and get the first scroll_id
# Initialization caches every hit that matches the search condition, like taking a snapshot
# (2) Iterate: query with the scroll_id from the previous request
# Each iteration reads from that snapshot (so inserts, deletes and updates made after initialization do not affect the results)
POST <index>/_search?scroll=5m
{
  "query": {
    "bool": {
      "filter": [
        {
          "term": {
            "age": 28
          }
        }
      ]
    }
  },
  "size": <page_size>,
  "from": 0,
  "sort": [
    {
      "timestamp": {
        "order": "desc"
      },
      "_id": {
        "order": "desc"
      }
    }
  ]
}

# Response
# the returned scroll_id is used for the next iteration
{
  "_scroll_id" : "DnF1ZXJ5VG.....................................hlbkZlR",
  "took" : 606,
  "timed_out" : false,
  "_shards" : {
    "total" : 20,
    "successful" : 20,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 694465,
    "max_score" : null,
    "hits" : [  # 第一页结果
      {
        "_index" : "...",
        "_type" : "doc",
        "_id" : "D-Gf43IB_wsSZvpG4VgT",
        "_score" : null,
        "_source" : {
          "message" : "...."
        },
        "sort" : [
          1592956801000
        ]
      }
      # ... ...
    ]
  }
}

# Request the second page with the scroll_id from the previous response
POST _search/scroll
{
    "scroll": "5m",
    "scroll_id":"DnF1ZXJ5VG.....................................hlbkZlR"
}

# Response
{
  "_scroll_id" : "DnF1ZXJ5VG.....................................hlbkZlR",
  "took" : 606,
  "timed_out" : false,
  "_shards" : {
    "total" : 20,
    "successful" : 20,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 694465,
    "max_score" : null,
    "hits" : [  # 第二页结果
      {
        "_index" : "....",
        "_type" : "doc",
        "_id" : "D-Gf43IB_wsSZvpG4Vg",
        "_score" : null,
        "_source" : {
          "message" : "......"
        },
        "sort" : [
          1592956801000
        ]
      }
    ]
  }
}

# The third, fourth, ... pages work the same way: each request uses the scroll_id returned by the previous one

# When finished, release the resources (using the last scroll_id)
DELETE _search/scroll
{
    "scroll_id": "DnF1ZXJ5VG.....................................hlbkZlR"
}
// Node.js example: pull all pages with the scroll API
// npm i sync-request --save

let request = require('sync-request');

const API_HOST = 'http://<host>:<port>'
let indexes = '<indexes>'
let API_FIRST = API_HOST + '/' + indexes + '/_search?scroll=5m'
let API_SECOND = API_HOST + '/_search/scroll'

let PAGE_SIZE = 1000
let first_query = {
  "size": PAGE_SIZE,
  "from": 0,
  "sort": [
    {
      "_id": {
        "order": "asc"
      }
    }
  ],
  "_source": ["<field1>", "field2"],
  "query": <query>
}

function requestData(last_scroll_id) {
  if (last_scroll_id === '') {
    var api = API_FIRST
    var body = first_query
  } else {
    var api = API_SECOND
    var body = {
      scroll: "5m", # 缓存时间
      scroll_id: last_scroll_id
    }
  }

  var data = request('POST', api, {
    json: body,
    headers: {
      "Content-type": "application/json",
      "Authorization": "Basic <Authorization>",
    }
  })

  if (data && data.statusCode == 200 && data.body) {
    return JSON.parse(data.body.toString())
  } else {
    console.log(data, last_scroll_id, data.toString())
  }
}

var last_scroll_id = ''
var count = 0
while (true) {
  var data = requestData(last_scroll_id)
  if (data && data.hits.hits.length > 0) {
    last_scroll_id = data._scroll_id

    // ... DO YOUR THING
    data.hits.hits.forEach(function (v) {
      var field1 = v._source.field1
      var field2 = v._source.field2
      // ... ...
    })

    count += data.hits.hits.length
    console.log(count)
    // stop querying once more than 1,500,000 rows have been fetched
    if (data.hits.hits.length < PAGE_SIZE || count > 1500000) {
      break;
    }
  } else {
    break
  }
}
console.log(last_scroll_id)
console.log(count)

## search_after
# search_after pages by passing the sort values of the last hit of the previous page in the next request, which avoids the deep-paging cost of from/size
# Unlike scroll it does not keep a snapshot, so it also works for live, user-facing paging; the sort must include a unique tiebreaker field (here _id) so that pages never overlap
POST twitter/_search
{
    "size": 10,
    "query": {
        "match" : {
            "title" : "es"
        }
    },
    "sort": [
        {"date": "asc"},
        {"_id": "desc"}
    ]
}

# Response: note the "sort" values of the last hit; they are used as search_after in the next request
{
  "took" : 29,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 5,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [
      {
        ...
        "sort" : [
          ...
        ]
      },
      {
        ...
        "sort" : [
          124648691,
          "624812"
        ]
      }
    ]
  }
}

# Next page: pass the sort values of the last hit via search_after
GET twitter/_search
{
    "size": 10,
    "query": {
        "match" : {
            "title" : "es"
        }
    },
    "search_after": [124648691, "624812"],
    "sort": [
        {"date": "asc"},
        {"_id": "desc"}
    ]
}
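
// Scripted deep paging with search_after, in the same style as the scroll script above:
// after each page, take the sort values of the last hit and send them back as search_after.
// A minimal sketch; host, index, <query> and the date/_id sort are placeholders, as in the earlier examples.
let request = require('sync-request');

const API = 'http://<host>:<port>/<index>/_search'
const PAGE_SIZE = 1000

let search_after = null  // sort values of the last hit of the previous page
let count = 0

while (true) {
  let body = {
    "size": PAGE_SIZE,
    "query": <query>,
    "sort": [
      { "date": "asc" },
      { "_id": "desc" }
    ]
  }
  if (search_after) {
    body["search_after"] = search_after  // resume right after the last hit we saw
  }

  let res = request('POST', API, {
    json: body,
    headers: {
      "Content-type": "application/json"
    }
  })
  let data = JSON.parse(res.body.toString())

  let hits = data.hits.hits
  if (!hits || hits.length === 0) {
    break
  }

  // ... DO YOUR THING ...

  count += hits.length
  search_after = hits[hits.length - 1].sort  // feeds the next request
}
console.log(count)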
