PostgreSQL数据库TableAM——Table Access Method_postgresql

PostgreSQL数据库TableAM——Table Access Method_big data_02

TableAmRoutine

src/include/access/tableam.h

TableAmRoutine是表访问方法的API结构体。作为在服务端生命周期内始终有效的数据结构,它需要被定义为静态常量结构体,并通过FormData_pg_am.amhandler获取。如果需要实现其他TableAM,只需要定义自己的TableAmRoutine结构体,并实现TableAmRoutine里面所有的API。

PostgreSQL数据库TableAM——Table Access Method_数据库_03

  1. 为新的表访问方式创建TableAmRoutine表访问方法的API结构体
  2. 实现新访问方式TableAmRoutine的API结构体中的方法
  3. 创建GetXXXamTableAMRoutine和XXX_tableam_handler函数用于获取TableAmRoutine表访问方法的API结构体
  4. 向fmgr中注册相应TableAmRoutine结构体,用于通过amhandler的Oid查询相应的TableAmRoutine结构体(PostgreSQL数据库TableAM——GetTableAmRoutine函数)

TableAmRoutine接口使用

以PostgreSQL数据库TableAM——HeapAM为例,如下为其TableAmRoutine的slot_callbacks成员的值。

static const TableAmRoutine heapam_methods = {
.slot_callbacks = heapam_slot_callbacks,
}
TTSOpsHeapTuple = {
.getsomeattrs = tts_heap_getsomeattrs,
.materizalize = tts_heao_materizalize,
.copyslot = tts_heap_copyslot,
.get_heap_tuple = tts_heap_get_heap_tuple,
.copy_minimal_tuple = tts_heap_copy_minimal_tuple
}

顺序扫描相关接口
.scan_begin = heap_beginscan
.scan_end = heap_endscan
.scan_rescan = heap_rescan
.scan_getnextslot = heap_getnextslot
.parallelscan_estimate = table_block_parallelscan_estimate
.parallelscan_initialize = table_block_parallelscan_initialize
.parallelscan_reinitialize = table_block_parallelscan_reinitialize

索引扫描相关接口
.index_fetch_begin = heapam_index_fetch_begin
.index_fetch_reset = heapam_index_fetch_reset
.index_fetch_end = heapam_index_fetch_end
.index_fetch_tuple = heapam_index_fetch_tuple
.scan_bitmap_next_block = heapam_scan_bitmap_next_block
.scan_bitmap_next_tuple = heapam_scan_bitmap_next_tuple

DML/DDL相关接口
.tuple_insert = heapam_tuple_insert
.multi_insert = heapam_multi_insert
.tuple_delete = heapam_tuple_delete
.tuple_update = heapam_tuple_update
.relation_vacuum = heap_vacuum_rel

统计相关接口
.scan_analyze_next_block = heapam_scan_analyze_next_block
.scan_analyze_next_tuple = heapam_scan_analyze_next_tuple
.relation_size = table_block_relation_size
.relation_estimate_size = heapam_estimate_rel_size

TTSOpsHeapTuple是最重要的一个接口。TupleTableSlot结构是执行器操作和传输的结构,执行器只关心逻辑上的行定义,即行由几列组成,每列的值存放在slot->values[]数组中;执行器不关心底层的tuple是怎么存储的,所以Access Method需要自己定义函数对底层物理存储进行转换,将其转换为执行器想得到的数据。例如,执行器想得到第n列的值时,它会调用getsomeattrs接口,实际上调用的是Heap的tts_heap_getsomeattrs。

TableAm相关Generic routine

src/backend/access/table/tableamapi.c包含如下函数
GetTableAmRoutine函数调用指定access method handler routine以获取其TableAmRoutine结构体,该结构体会在调用者的内存上下文中分配。主要是通过调用OidFunctionCall0(amhandler)函数获取TableAmRoutine结构体。

/*
 * GetTableAmRoutine
 *		Call the handler function identified by amhandler (with no arguments)
 *		and return the TableAmRoutine pointer carried in its Datum result.
 *
 * Reports ERROR if the handler's result is a NULL pointer or is not a
 * TableAmRoutine node.  The callbacks checked by the Assert()s below are
 * expected to be non-NULL.
 */
const TableAmRoutine *GetTableAmRoutine(Oid amhandler) {
Datum datum;
const TableAmRoutine *routine;
/* Invoke the handler; the returned Datum wraps a pointer to the routine. */
datum = OidFunctionCall0(amhandler);
routine = (TableAmRoutine *) DatumGetPointer(datum);
/* Reject a missing result or a node of the wrong type. */
if (routine == NULL || !IsA(routine, TableAmRoutine))
elog(ERROR, "table access method handler %u did not return a TableAmRoutine struct", amhandler);
/* Sanity-check callbacks; this listing elides the checks marked by "...". */
Assert(routine->scan_begin != NULL);
Assert(routine->scan_end != NULL);
Assert(routine->scan_rescan != NULL);
Assert(routine->scan_getnextslot != NULL);
...
Assert(routine->scan_sample_next_tuple != NULL);
return routine;
}

将宏OidFunctionCall0(amhandler)展开就是OidFunctionCall0Coll(amhandler, InvalidOid),OidFunctionCall0Coll函数详情如下:

/*
 * OidFunctionCall0Coll
 *		Fill in an FmgrInfo for functionId via fmgr_info(), then call the
 *		function with zero arguments under the given collation and return
 *		whatever FunctionCall0Coll() returns.
 */
Datum
OidFunctionCall0Coll(Oid functionId, Oid collation)
{
	FmgrInfo	lookup;

	fmgr_info(functionId, &lookup);

	return FunctionCall0Coll(&lookup, collation);
}
/*
 * FunctionCall0Coll
 *		Invoke the function described by flinfo with no arguments, using the
 *		supplied collation, and return its Datum result.
 *
 * Reports ERROR if the call flags its result as NULL, since the caller is
 * clearly not expecting one.
 */
Datum
FunctionCall0Coll(FmgrInfo *flinfo, Oid collation)
{
	LOCAL_FCINFO(callinfo, 0);
	Datum		retval;

	InitFunctionCallInfoData(*callinfo, flinfo, 0, collation, NULL, NULL);

	retval = FunctionCallInvoke(callinfo);

	/* A NULL result is not acceptable here. */
	if (callinfo->isnull)
	{
		elog(ERROR, "function %u returned NULL", flinfo->fn_oid);
	}

	return retval;
}

check_default_table_access_method函数用于校验新的default_table_access_method是否合法。

/*
 * check_default_table_access_method
 *		Validate a proposed value for default_table_access_method.
 *
 * Returns false (after setting a GUC error detail) when the value is empty,
 * is NAMEDATALEN characters or longer, or names a table access method that
 * cannot be found and source is not PGC_S_TEST.  Returns true otherwise.
 * The extra argument is not used.
 */
bool
check_default_table_access_method(char **newval, void **extra, GucSource source)
{
	if ((*newval)[0] == '\0')
	{
		GUC_check_errdetail("%s cannot be empty.", "default_table_access_method");
		return false;
	}

	if (strlen(*newval) >= NAMEDATALEN)
	{
		GUC_check_errdetail("%s is too long (maximum %d characters).", "default_table_access_method", NAMEDATALEN - 1);
		return false;
	}

	/*
	 * Outside a transaction, or when not connected to a database, the
	 * catalog lookup needed to verify the method is not possible, so the
	 * value has to be accepted on faith.
	 */
	if (!IsTransactionState() || MyDatabaseId == InvalidOid)
		return true;

	/* The named access method exists: nothing more to check. */
	if (OidIsValid(get_table_am_oid(*newval, true)))
		return true;

	/*
	 * The method was not found.  With source == PGC_S_TEST this is only
	 * worth a NOTICE, not a hard error; see comments in guc.h.
	 */
	if (source != PGC_S_TEST)
	{
		GUC_check_errdetail("Table access method \"%s\" does not exist.", *newval);
		return false;
	}

	ereport(NOTICE, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("table access method \"%s\" does not exist", *newval)));

	return true;
}

src/include/access/tableam.h包含如下函数,主要逻辑就是通过Relation的rd_tableam(TableAmRoutine结构体)调用相应AM的成员函数。

static inline TableScanDesc table_beginscan(Relation rel, Snapshot snapshot, int nkeys, struct ScanKeyData *key)
static inline TableScanDesc table_beginscan_strat(Relation rel, Snapshot snapshot, int nkeys, struct ScanKeyData *key, bool allow_strat, bool allow_sync)
static inline TableScanDesc table_beginscan_bm(Relation rel, Snapshot snapshot, int nkeys, struct ScanKeyData *key)
static inline TableScanDesc table_beginscan_sampling(Relation rel, Snapshot snapshot, int nkeys, struct ScanKeyData *key, bool allow_strat, bool allow_sync, bool allow_pagemode)
static inline TableScanDesc table_beginscan_tid(Relation rel, Snapshot snapshot)
static inline TableScanDesc table_beginscan_analyze(Relation rel)
static inline void table_endscan(TableScanDesc scan)
static inline void table_rescan(TableScanDesc scan, struct ScanKeyData *key)
static inline bool table_scan_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
static inline void table_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
static inline IndexFetchTableData * table_index_fetch_begin(Relation rel)
static inline void table_index_fetch_reset(struct IndexFetchTableData *scan)
static inline void table_index_fetch_end(struct IndexFetchTableData *scan)
static inline bool table_index_fetch_tuple(struct IndexFetchTableData *scan, ItemPointer tid, Snapshot snapshot, TupleTableSlot *slot, bool *call_again, bool *all_dead)
static inline bool table_tuple_fetch_row_version(Relation rel, ItemPointer tid, Snapshot snapshot, TupleTableSlot *slot)
static inline bool table_tuple_tid_valid(TableScanDesc scan, ItemPointer tid)
static inline bool table_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot, Snapshot snapshot)
static inline TransactionId table_compute_xid_horizon_for_tuples(Relation rel, ItemPointerData *items, int nitems)
static inline void table_tuple_insert(Relation rel, TupleTableSlot *slot, CommandId cid, int options, struct BulkInsertStateData *bistate)
static inline void table_tuple_insert_speculative(Relation rel, TupleTableSlot *slot, CommandId cid, int options, struct BulkInsertStateData *bistate, uint32 specToken)
static inline void table_tuple_complete_speculative(Relation rel, TupleTableSlot *slot, uint32 specToken, bool succeeded)
static inline void table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots, CommandId cid, int options, struct BulkInsertStateData *bistate)
static inline TM_Result table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid, Snapshot snapshot, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, bool changingPart)
static inline TM_Result table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, CommandId cid, Snapshot snapshot, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode, bool *update_indexes)
static inline TM_Result table_tuple_lock(Relation rel, ItemPointer tid, Snapshot snapshot, TupleTableSlot *slot, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, uint8 flags, TM_FailureData *tmfd)
static inline void table_finish_bulk_insert(Relation rel, int options)
static inline void table_relation_set_new_filenode(Relation rel, const RelFileNode *newrnode, char persistence, TransactionId *freezeXid, MultiXactId *minmulti)
static inline void table_relation_nontransactional_truncate(Relation rel)
static inline void table_relation_copy_data(Relation rel, const RelFileNode *newrnode)
static inline void table_relation_copy_for_cluster(Relation OldTable, Relation NewTable, Relation OldIndex, bool use_sort, TransactionId OldestXmin, TransactionId *xid_cutoff, MultiXactId *multi_cutoff, double *num_tuples, double *tups_vacuumed, double *tups_recently_dead)
static inline void table_relation_vacuum(Relation rel, struct VacuumParams *params, BufferAccessStrategy bstrategy)
static inline bool table_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno, BufferAccessStrategy bstrategy)
static inline bool table_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, double *liverows, double *deadrows, TupleTableSlot *slot)
static inline double table_index_build_scan(Relation table_rel, Relation index_rel, struct IndexInfo *index_info, bool allow_sync, bool progress, IndexBuildCallback callback, void *callback_state, TableScanDesc scan)
static inline void table_index_validate_scan(Relation table_rel, Relation index_rel, struct IndexInfo *index_info, Snapshot snapshot, struct ValidateIndexState *state)
static inline uint64 table_relation_size(Relation rel, ForkNumber forkNumber)
static inline bool table_relation_needs_toast_table(Relation rel)
static inline void table_relation_estimate_size(Relation rel, int32 *attr_widths, BlockNumber *pages, double *tuples, double *allvisfrac)
static inline bool table_scan_bitmap_next_block(TableScanDesc scan, struct TBMIterateResult *tbmres)
static inline bool table_scan_bitmap_next_tuple(TableScanDesc scan, struct TBMIterateResult *tbmres, TupleTableSlot *slot)
static inline bool table_scan_sample_next_block(TableScanDesc scan, struct SampleScanState *scanstate)
static inline bool table_scan_sample_next_tuple(TableScanDesc scan, struct SampleScanState *scanstate, TupleTableSlot *slot)

table相关Generic routine(打开关闭table)

src/backend/access/table/table.c 主要是通过调用relation.c中函数获取RelationData结构体,PostgreSQL数据库RelationAM——relation related routines。
table_open函数通过relation OID打开表relation,函数检查relation不是索引或复合类型,调用者需要检查relation不是一个视图或外部表。table_open函数直接调用relation_open函数。

/*
 * table_open
 *		Open the relation identified by relationId with the given lock mode
 *		via relation_open(), and report ERROR if it turns out to be an index
 *		(plain or partitioned) or a composite type.
 */
Relation
table_open(Oid relationId, LOCKMODE lockmode)
{
	Relation	rel = relation_open(relationId, lockmode);

	switch (rel->rd_rel->relkind)
	{
		case RELKIND_INDEX:
		case RELKIND_PARTITIONED_INDEX:
			ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is an index", RelationGetRelationName(rel))));
			break;

		case RELKIND_COMPOSITE_TYPE:
			ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is a composite type", RelationGetRelationName(rel))));
			break;

		default:
			/* Any other relkind is passed back to the caller. */
			break;
	}

	return rel;
}

table_openrv函数打开由RangeVar节点指定的表relation

/*
 * table_openrv
 *		Open the relation named by the RangeVar with the given lock mode via
 *		relation_openrv(), and report ERROR if it is an index (plain or
 *		partitioned) or a composite type.
 */
Relation
table_openrv(const RangeVar *relation, LOCKMODE lockmode)
{
	Relation	rel = relation_openrv(relation, lockmode);
	bool		is_index;
	bool		is_composite;

	is_index = (rel->rd_rel->relkind == RELKIND_INDEX ||
				rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX);
	is_composite = (rel->rd_rel->relkind == RELKIND_COMPOSITE_TYPE);

	if (is_index)
	{
		ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is an index", RelationGetRelationName(rel))));
	}
	else if (is_composite)
	{
		ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is a composite type", RelationGetRelationName(rel))));
	}

	return rel;
}

table_openrv_extended函数打开由RangeVar节点指定的表relation,但是可以通过missing_ok标志来指定在relation-not-found时返回NULL而不是直接失败。

/*
 * table_openrv_extended
 *		Like table_openrv, but delegates to relation_openrv_extended() with
 *		missing_ok.  When that call yields no relation, the empty result is
 *		returned unchanged; otherwise the relation is rejected with ERROR if
 *		it is an index (plain or partitioned) or a composite type.
 */
Relation
table_openrv_extended(const RangeVar *relation, LOCKMODE lockmode, bool missing_ok)
{
	Relation	rel = relation_openrv_extended(relation, lockmode, missing_ok);

	/* Nothing was opened: hand the empty result straight back. */
	if (!rel)
		return rel;

	if (rel->rd_rel->relkind == RELKIND_INDEX ||
		rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
	{
		ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is an index", RelationGetRelationName(rel))));
	}
	else if (rel->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
	{
		ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is a composite type", RelationGetRelationName(rel))));
	}

	return rel;
}

table_close关闭table

/*
 * table_close
 *		Close a table relation; this simply forwards both arguments to
 *		relation_close().
 */
void
table_close(Relation relation, LOCKMODE lockmode)
{
	relation_close(relation, lockmode);
}

table相关AM routine(操作table)

src/backend/access/table/tableam.c

/* Slot functions. */
const TupleTableSlotOps *table_slot_callbacks(Relation relation)
TupleTableSlot *table_slot_create(Relation relation, List **reglist)
/* Table scan functions. */
TableScanDesc table_beginscan_catalog(Relation relation, int nkeys, struct ScanKeyData *key)
void table_scan_update_snapshot(TableScanDesc scan, Snapshot snapshot)
/* Parallel table scan related functions. */
Size table_parallelscan_estimate(Relation rel, Snapshot snapshot)
void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, Snapshot snapshot)
TableScanDesc table_beginscan_parallel(Relation relation, ParallelTableScanDesc parallel_scan)
/* Index scan related functions. */
bool table_index_fetch_tuple_check(Relation rel, ItemPointer tid, Snapshot snapshot, bool *all_dead)
/* Functions for non-modifying operations on individual tuples */
void table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid)
/* Functions to make modifications a bit simpler. */
void simple_table_tuple_insert(Relation rel, TupleTableSlot *slot)
void simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot)
void simple_table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot, Snapshot snapshot, bool *update_indexes)
/* Helper functions to implement parallel scans for block oriented AMs. */
Size table_block_parallelscan_estimate(Relation rel)
Size table_block_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan)
void table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
void table_block_parallelscan_startblock_init(Relation rel, ParallelBlockTableScanDesc pbscan)
BlockNumber table_block_parallelscan_nextpage(Relation rel, ParallelBlockTableScanDesc pbscan)

Table AM当前的一些限制:

  • AMs期望是基于块组织的,代码中有很多这样的假设,像统计信息,analyzes取样都是基于块的
  • 现有的redo(WAL)记录类型由PG内核支持,而AM自定义的新WAL记录类型不在核心代码中,所以很难在AM中使用新的WAL类型,必须改动核心代码
  • AMs与planner和执行器需要更好的集成和优化,例如对列存的支持,cost估算需要更加精确,scan需要知道所选列的信息等
  • 代码中还有很多代码是基于heap的假设,例如通过xmin来进行cache的可见性判断