查询
概述
查询是数据库中最常用也是最复杂的操作之一,关于查询涉及很多方面:SQL解析、成本评估、索引选择…等等,本文只讨论最简单的部分,即PostgreSQL如何执行全表遍历。考虑下面的SQL语句:
-- 建表
create table t1(id int);
-- 插入
insert into t1 values(1),(2),(3);
-- 查询
select * from t1 where id = 1;
对于select语句,由于我们并没有为t1表创建索引,所以只能通过全表遍历的方式来执行查询。全表遍历会遍历表的所有块,逐条获取块中的元组,判断元组是否符合条件,如果符合条件则放入结果集。下面我们详细阐述查询的相关内容。
查询框架
在介绍全表遍历前,我们先来看看PostgreSQL的查询框架。ExecutePlan是查询的主循环,该函数获取一条合法的元组(合法是指元组可见且满足where条件)。然后放入结果集(这部分不太清楚)。具体代码如下:
static void
ExecutePlan(EState *estate,
PlanState *planstate,
bool use_parallel_mode,
CmdType operation,
bool sendTuples,
uint64 numberTuples,
ScanDirection direction,
DestReceiver *dest)
{
TupleTableSlot *slot;
uint64 current_tuple_count;
/*
* initialize local variables
*/
current_tuple_count = 0;
/*
* Set the direction.
*/
estate->es_direction = direction;
/*
* If a tuple count was supplied, we must force the plan to run without
* parallelism, because we might exit early. Also disable parallelism
* when writing into a relation, because no database changes are allowed
* in parallel mode.
*/
if (numberTuples || dest->mydest == DestIntoRel)
use_parallel_mode = false;
estate->es_use_parallel_mode = use_parallel_mode;
if (use_parallel_mode)
EnterParallelMode();
/*
* Loop until we've processed the proper number of tuples from the plan.
*/
for (;;)
{
/* Reset the per-output-tuple exprcontext */
ResetPerTupleExprContext(estate);
/*
* Execute the plan and obtain a tuple
* 获取一条合法元组
*/
slot = ExecProcNode(planstate);
/*
* if the tuple is null, then we assume there is nothing more to
* process so we just end the loop...
* 如果solt为空,说明已经不存在合法元组,则结束循环(也就是结束查询)
*/
if (TupIsNull(slot))
{
/* Allow nodes to release or shut down resources. */
(void) ExecShutdownNode(planstate);
break;
}
/*
* If we have a junk filter, then project a new tuple with the junk
* removed.
*
* Store this new "clean" tuple in the junkfilter's resultSlot.
* (Formerly, we stored it back over the "dirty" tuple, which is WRONG
* because that tuple slot has the wrong descriptor.)
*/
if (estate->es_junkFilter != NULL)
slot = ExecFilterJunk(estate->es_junkFilter, slot);
/*
* If we are supposed to send the tuple somewhere, do so. (In
* practice, this is probably always the case at this point.)
*/
if (sendTuples)
{
/*
* If we are not able to send the tuple, we assume the destination
* has closed and no more tuples can be sent. If that's the case,
* end the loop.
*/
if (!((*dest->receiveSlot) (slot, dest)))
break;
}
/*
* Count tuples processed, if this is a SELECT. (For other operation
* types, the ModifyTable plan node must count the appropriate
* events.)
*/
if (operation == CMD_SELECT)
(estate->es_processed)++;
/*
* check our tuple count.. if we've processed the proper number then
* quit, else loop again and process more tuples. Zero numberTuples
* means no limit.
*/
current_tuple_count++;
if (numberTuples && numberTuples == current_tuple_count)
break;
}
if (use_parallel_mode)
ExitParallelMode();
}
上述代码中关键函数为ExecProcNode,ExecProcNode负责获取一条合法元组。我来看看ExecProcNode的具体实现:
TupleTableSlot *
ExecProcNode(PlanState *node)
{
TupleTableSlot *result;
CHECK_FOR_INTERRUPTS();
if (node->chgParam != NULL) /* something changed */
ExecReScan(node); /* let ReScan handle this */
if (node->instrument)
InstrStartNode(node->instrument);
switch (nodeTag(node))
{
/*
* control nodes
*/
case T_ResultState:
result = ExecResult((ResultState *) node);
break;
case T_ModifyTableState:
result = ExecModifyTable((ModifyTableState *) node);
break;
case T_AppendState:
result = ExecAppend((AppendState *) node);
break;
case T_MergeAppendState:
result = ExecMergeAppend((MergeAppendState *) node);
break;
case T_RecursiveUnionState:
result = ExecRecursiveUnion((RecursiveUnionState *) node);
break;
/* BitmapAndState does not yield tuples */
/* BitmapOrState does not yield tuples */
/*
* scan nodes
*/
case T_SeqScanState:
result = ExecSeqScan((SeqScanState *) node);
break;
case T_SampleScanState:
result = ExecSampleScan((SampleScanState *) node);
break;
case T_IndexScanState:
result = ExecIndexScan((IndexScanState *) node);
break;
case T_IndexOnlyScanState:
result = ExecIndexOnlyScan((IndexOnlyScanState *) node);
break;
/* BitmapIndexScanState does not yield tuples */
case T_BitmapHeapScanState:
result = ExecBitmapHeapScan((BitmapHeapScanState *) node);
break;
case T_TidScanState:
result = ExecTidScan((TidScanState *) node);
break;
case T_SubqueryScanState:
result = ExecSubqueryScan((SubqueryScanState *) node);
break;
case T_FunctionScanState:
result = ExecFunctionScan((FunctionScanState *) node);
break;
case T_ValuesScanState:
result = ExecValuesScan((ValuesScanState *) node);
break;
case T_CteScanState:
result = ExecCteScan((CteScanState *) node);
break;
case T_WorkTableScanState:
result = ExecWorkTableScan((WorkTableScanState *) node);
break;
case T_ForeignScanState:
result = ExecForeignScan((ForeignScanState *) node);
break;
case T_CustomScanState:
result = ExecCustomScan((CustomScanState *) node);
break;
/*
* join nodes
*/
case T_NestLoopState:
result = ExecNestLoop((NestLoopState *) node);
break;
case T_MergeJoinState:
result = ExecMergeJoin((MergeJoinState *) node);
break;
case T_HashJoinState:
result = ExecHashJoin((HashJoinState *) node);
break;
/*
* materialization nodes
*/
case T_MaterialState:
result = ExecMaterial((MaterialState *) node);
break;
case T_SortState:
result = ExecSort((SortState *) node);
break;
case T_GroupState:
result = ExecGroup((GroupState *) node);
break;
case T_AggState:
result = ExecAgg((AggState *) node);
break;
case T_WindowAggState:
result = ExecWindowAgg((WindowAggState *) node);
break;
case T_UniqueState:
result = ExecUnique((UniqueState *) node);
break;
case T_GatherState:
result = ExecGather((GatherState *) node);
break;
case T_HashState:
result = ExecHash((HashState *) node);
break;
case T_SetOpState:
result = ExecSetOp((SetOpState *) node);
break;
case T_LockRowsState:
result = ExecLockRows((LockRowsState *) node);
break;
case T_LimitState:
result = ExecLimit((LimitState *) node);
break;
default:
elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node));
result = NULL;
break;
}
if (node->instrument)
InstrStopNode(node->instrument, TupIsNull(result) ? 0.0 : 1.0);
return result;
}
ExecProcNode接收一个执行计划PlanState(目前并未明确PlanState是什么东西,看起来可能是执行计划),根据执行计划的类型调用对应函数,执行相关查询。例如,本文重点介绍的全表遍历就属于T_SeqScanState类型,会调用ExecSeqScan来执行。而ExecSeqScan会调用ExecScan,并将SeqNext作为参数,SeqNext是实际执行全表遍历的函数,后面我们会重点讲解,现在先来看看ExecSeqScan的实现:
TupleTableSlot *
ExecSeqScan(SeqScanState *node)
{
return ExecScan((ScanState *) node,
(ExecScanAccessMtd) SeqNext,
(ExecScanRecheckMtd) SeqRecheck);
}
下面我们就来重点看下ExecScan的实现。
ExecScan
实际上不只是ExecSeqScan,几乎所有的单表查询操作都会调用ExecScan,例如使用索引查询的实现如下:
TupleTableSlot *
ExecIndexScan(IndexScanState *node)
{
/*
* If we have runtime keys and they've not already been set up, do it now.
*/
if (node->iss_NumRuntimeKeys != 0 && !node->iss_RuntimeKeysReady)
ExecReScan((PlanState *) node);
if (node->iss_NumOrderByKeys > 0)
return ExecScan(&node->ss,
(ExecScanAccessMtd) IndexNextWithReorder,
(ExecScanRecheckMtd) IndexRecheck);
else
return ExecScan(&node->ss,
(ExecScanAccessMtd) IndexNext,
(ExecScanRecheckMtd) IndexRecheck);
}
只不过在ExecIndexScan中,根据情况将IndexNext和IndexNextWithReorder作为实际执行查询的函数。
ExecScan根据不同的查询方式(比如:全表遍历,索引遍历等)调用对应的函数(如前面提到的SeqNext)获取一条可见的元组(由ExecScanFetch实现),判断元组的合法性(由ExecQual实现)。如果合法则返回该元组,否则获取下一条可见元组,直到获取完所有的可见元组为止,我们来看看具体的代码:
TupleTableSlot *
ExecScan(ScanState *node,
ExecScanAccessMtd accessMtd, /* function returning a tuple */
ExecScanRecheckMtd recheckMtd)
{
ExprContext *econtext;
List *qual;
ProjectionInfo *projInfo;
ExprDoneCond isDone;
TupleTableSlot *resultSlot;
/*
* Fetch data from node
*/
qual = node->ps.qual;
projInfo = node->ps.ps_ProjInfo;
econtext = node->ps.ps_ExprContext;
/*
* If we have neither a qual to check nor a projection to do, just skip
* all the overhead and return the raw scan tuple.
* 如果不需要判断合法性(比如:没有查询条件)则直接调用该函数。
*/
if (!qual && !projInfo)
{
ResetExprContext(econtext);
return ExecScanFetch(node, accessMtd, recheckMtd);
}
/*
* Check to see if we're still projecting out tuples from a previous scan
* tuple (because there is a function-returning-set in the projection
* expressions). If so, try to project another one.
*/
if (node->ps.ps_TupFromTlist)
{
Assert(projInfo); /* can't get here if not projecting */
resultSlot = ExecProject(projInfo, &isDone);
if (isDone == ExprMultipleResult)
return resultSlot;
/* Done with that source tuple... */
node->ps.ps_TupFromTlist = false;
}
/*
* Reset per-tuple memory context to free any expression evaluation
* storage allocated in the previous tuple cycle. Note this can't happen
* until we're done projecting out tuples from a scan tuple.
*/
ResetExprContext(econtext);
/*
* get a tuple from the access method. Loop until we obtain a tuple that
* passes the qualification.
* 主循环,不断获取可见的元组
*/
for (;;)
{
TupleTableSlot *slot;
CHECK_FOR_INTERRUPTS();
/* 根据不同的查询方式,获取一条可见元组 */
slot = ExecScanFetch(node, accessMtd, recheckMtd);
/*
* if the slot returned by the accessMtd contains NULL, then it means
* there is nothing more to scan so we just return an empty slot,
* being careful to use the projection result slot so it has correct
* tupleDesc.
*/
if (TupIsNull(slot))
{
if (projInfo)
return ExecClearTuple(projInfo->pi_slot);
else
return slot;
}
/*
* place the current tuple into the expr context
*/
econtext->ecxt_scantuple = slot;
/*
* check that the current tuple satisfies the qual-clause
*
* check for non-nil qual here to avoid a function call to ExecQual()
* when the qual is nil ... saves only a few cycles, but they add up
* ...
* 判断元组的合法性
*/
if (!qual || ExecQual(qual, econtext, false))
{
/*
* Found a satisfactory scan tuple.
*/
if (projInfo)
{
/*
* Form a projection tuple, store it in the result tuple slot
* and return it --- unless we find we can project no tuples
* from this scan tuple, in which case continue scan.
*/
resultSlot = ExecProject(projInfo, &isDone);
if (isDone != ExprEndResult)
{
node->ps.ps_TupFromTlist = (isDone == ExprMultipleResult);
return resultSlot;
}
}
else
{
/*
* Here, we aren't projecting, so just return scan tuple.
*/
return slot;
}
}
else
InstrCountFiltered1(node, 1);
/*
* Tuple fails qual, so free per-tuple memory and try again.
*/
ResetExprContext(econtext);
}
}
下面,我们来看看ExecScanFetch函数。
ExecScanFetch
static inline TupleTableSlot *
ExecScanFetch(ScanState *node,
ExecScanAccessMtd accessMtd,
ExecScanRecheckMtd recheckMtd)
{
EState *estate = node->ps.state;
if (estate->es_epqTuple != NULL)
{
/*
* We are inside an EvalPlanQual recheck. Return the test tuple if
* one is available, after rechecking any access-method-specific
* conditions.
*/
Index scanrelid = ((Scan *) node->ps.plan)->scanrelid;
if (scanrelid == 0)
{
TupleTableSlot *slot = node->ss_ScanTupleSlot;
/*
* This is a ForeignScan or CustomScan which has pushed down a
* join to the remote side. The recheck method is responsible not
* only for rechecking the scan/join quals but also for storing
* the correct tuple in the slot.
*/
if (!(*recheckMtd) (node, slot))
ExecClearTuple(slot); /* would not be returned by scan */
return slot;
}
else if (estate->es_epqTupleSet[scanrelid - 1])
{
TupleTableSlot *slot = node->ss_ScanTupleSlot;
/* Return empty slot if we already returned a tuple */
if (estate->es_epqScanDone[scanrelid - 1])
return ExecClearTuple(slot);
/* Else mark to remember that we shouldn't return more */
estate->es_epqScanDone[scanrelid - 1] = true;
/* Return empty slot if we haven't got a test tuple */
if (estate->es_epqTuple[scanrelid - 1] == NULL)
return ExecClearTuple(slot);
/* Store test tuple in the plan node's scan slot */
ExecStoreTuple(estate->es_epqTuple[scanrelid - 1],
slot, InvalidBuffer, false);
/* Check if it meets the access-method conditions */
if (!(*recheckMtd) (node, slot))
ExecClearTuple(slot); /* would not be returned by scan */
return slot;
}
}
/*
* Run the node-type-specific access method function to get the next tuple
*/
return (*accessMtd) (node);
}
ExecScanFetch的核心是最后一行代码,调用具体的函数来获取元组。所谓具体函数也就是前面调用ExecScan时的第二个参数,对于全表遍历就是SeqNext函数,下面我们来具体看看SeqNext的实现。
SeqNext
SeqNext是实际执行全表遍历的函数,该函数涉及相当多的细节,放在专门的文档《PostgreSQL流程—全表遍历》进行讲解。
ExecQual
该函数主要用于判断元组的合法性,即元组是否符合where条件
bool
ExecQual(List *qual, ExprContext *econtext, bool resultForNull)
{
bool result;
MemoryContext oldContext;
ListCell *l;
/*
* debugging stuff
*/
EV_printf("ExecQual: qual is ");
EV_nodeDisplay(qual);
EV_printf("\n");
/*
* Run in short-lived per-tuple context while computing expressions.
*/
oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
/*
* Evaluate the qual conditions one at a time. If we find a FALSE result,
* we can stop evaluating and return FALSE --- the AND result must be
* FALSE. Also, if we find a NULL result when resultForNull is FALSE, we
* can stop and return FALSE --- the AND result must be FALSE or NULL in
* that case, and the caller doesn't care which.
*
* If we get to the end of the list, we can return TRUE. This will happen
* when the AND result is indeed TRUE, or when the AND result is NULL (one
* or more NULL subresult, with all the rest TRUE) and the caller has
* specified resultForNull = TRUE.
*/
result = true;
foreach(l, qual)
{
ExprState *clause = (ExprState *) lfirst(l);
Datum expr_value;
bool isNull;
/* 关键函数,获取表达式值 */
expr_value = ExecEvalExpr(clause, econtext, &isNull, NULL);
if (isNull)
{
if (resultForNull == false)
{
result = false; /* treat NULL as FALSE */
break;
}
}
else
{
if (!DatumGetBool(expr_value))
{
result = false; /* definitely FALSE */
break;
}
}
}
MemoryContextSwitchTo(oldContext);
return result;
}
static Datum
ExecMakeFunctionResultNoSets(FuncExprState *fcache,
ExprContext *econtext,
bool *isNull,
ExprDoneCond *isDone)
{
ListCell *arg;
Datum result;
FunctionCallInfo fcinfo;
PgStat_FunctionCallUsage fcusage;
int i;
/* Guard against stack overflow due to overly complex expressions */
check_stack_depth();
if (isDone)
*isDone = ExprSingleResult;
/* inlined, simplified version of ExecEvalFuncArgs */
fcinfo = &fcache->fcinfo_data;
i = 0;
foreach(arg, fcache->args)
{
ExprState *argstate = (ExprState *) lfirst(arg);
/*
* 关键函数,获取参数
* 比如:sname = 'pgsql'
* 该函数返回结果为:fcinfo->arg[0] 为 sname的值
fcinfo->arg[1] 为 'pgsql'
*/
fcinfo->arg[i] = ExecEvalExpr(argstate,
econtext,
&fcinfo->argnull[i],
NULL);
i++;
}
/*
* If function is strict, and there are any NULL arguments, skip calling
* the function and return NULL.
*/
if (fcache->func.fn_strict)
{
while (--i >= 0)
{
if (fcinfo->argnull[i])
{
*isNull = true;
return (Datum) 0;
}
}
}
pgstat_init_function_usage(fcinfo, &fcusage);
fcinfo->isnull = false;
/* 真正的判断函数 */
result = FunctionCallInvoke(fcinfo);
*isNull = fcinfo->isnull;
pgstat_end_function_usage(&fcusage, true);
return result;
}
遗留问题
- ExecutePlan中获取到一条合法记录后是如何处理的?放入结果集还是其他?
- PlanState是什么?深入研究其相关细节。
补充
查询时需要将物理记录中的字段抽出来,流程:
ExecProject->slot_getsomeattrs->slot_deform_tuple
最后存放在Datum中