external_getnext函数用于解析一个数据文件,并以heap tuple格式返回其中包含的行。FileScanDescData结构体存放了目标表Relation结构体、文件协议URL_FILE、CopyState、表列属性、values值、nulls值标志等external表相关属性。该函数的流程很清晰:如果当前segment没有需要扫描的文件(fs_noop为真),则直接返回NULL;否则在首次调用时(fs_file尚未打开)通过open_external_readable_source打开获取数据的客户端,然后调用externalgettup获取数据,最后返回元组。

/*
 * external_getnext
 *
 * Fetch the next tuple of an external-table scan.
 *
 * Returns NULL when this segdb has nothing to scan (scan->fs_noop) or when
 * externalgettup() yields no tuple; otherwise returns the tuple and bumps
 * the heap-getnext statistics counter for the scanned relation.
 */
HeapTuple external_getnext(FileScanDesc scan, ScanDirection direction, ExternalSelectDesc desc)
{
    HeapTuple next;

    /* This segdb has no file to scan. */
    if (scan->fs_noop)
    {
        return NULL;
    }

    /*
     * Open the external source (local file or http) lazily, on the first
     * call only.
     *
     * external_beginscan() looks like the natural place for this, but in
     * queries with more than one gang every gang initializes all plan nodes
     * while executing only the nodes of its local slice. Opening there (and
     * closing in external_endscan()) would therefore run more often than
     * needed and open unexpected http connections (see MPP-1261).
     */
    if (scan->fs_file == NULL)
    {
        open_external_readable_source(scan, desc);
    }

    next = externalgettup(scan, direction);

    /* Only count tuples that were actually produced. */
    if (next != NULL)
    {
        pgstat_count_heap_getnext(scan->fs_rd);
    }

    return next;
}

open_external_readable_source

open_external_readable_source函数定义在src/backend/access/external/fileam.c中,其源代码如下所示,主要流程就是根据FileScanDesc的成员创建extvar_t参数,然后调用url_fopen函数创建获取数据的客户端【数据源可以是a local file (requested by ‘file’)、 a remote http server、a remote gpfdist server、a command to execute】。

/*
 * open_external_readable_source
 *
 * Open the external source for scanning (RET only) and store the resulting
 * handle in scan->fs_file.
 *
 * An external source is one of the following:
 *   1) a local file (requested by 'file')
 *   2) a remote http server
 *   3) a remote gpfdist server
 *   4) a command to execute
 */
static void open_external_readable_source(FileScanDesc scan, ExternalSelectDesc desc)
{
    extvar_t extvar;

    /* Start from a zeroed extvar, then fill it from the scan's parse state. */
    memset(&extvar, 0, sizeof(extvar));
    external_set_env_vars_ext(&extvar,
                              scan->fs_uri,
                              scan->fs_pstate->csv_mode,
                              scan->fs_pstate->escape,
                              scan->fs_pstate->quote,
                              scan->fs_pstate->eol_type,
                              scan->fs_pstate->header_line,
                              scan->fs_scancounter,
                              scan->fs_custom_formatter_params);

    /* Actually open the external source. */
    scan->fs_file = url_fopen(scan->fs_uri,
                              false /* for read */ ,
                              &extvar,
                              scan->fs_pstate,
                              desc);
}

url_fopen函数定义在src/backend/access/external/url.c文件中,根据不同的数据源,创建不同的客户端。

/*
 * url_fopen
 *
 * Inspect 'url' and dispatch to the opener that matches its kind:
 *   - "execute:" prefix (case-insensitive) -> url_execute_fopen()
 *   - file URI                             -> url_file_fopen()
 *   - http / gpfdist / gpfdists URI        -> url_curl_fopen()
 *   - anything else                        -> url_custom_fopen()
 *
 * Per the original notes, standard files basically go through the real
 * fopen() and commands are run via popen.
 *
 * On error, ereport()s.
 */
URL_FILE *url_fopen(char *url, bool forwrite, extvar_t *ev, CopyState pstate, ExternalSelectDesc desc)
{
    /*
     * If 'url' starts with "execute:" then it is a command to execute and
     * not a url (the command specified in CREATE EXTERNAL TABLE .. EXECUTE).
     */
    bool is_command = (pg_strncasecmp(url, EXEC_URL_PREFIX, strlen(EXEC_URL_PREFIX)) == 0);

    if (is_command)
    {
        return url_execute_fopen(url, forwrite, ev, pstate);
    }

    if (IS_FILE_URI(url))
    {
        return url_file_fopen(url, forwrite, ev, pstate);
    }

    if (IS_HTTP_URI(url) || IS_GPFDIST_URI(url) || IS_GPFDISTS_URI(url))
    {
        return url_curl_fopen(url, forwrite, ev, pstate);
    }

    /* No built-in kind matched: hand the url to the custom opener. */
    return url_custom_fopen(url, forwrite, ev, pstate, desc);
}

从下面图片可以看出,open_external_readable_source和externalgettup函数位于中间层,对接Scan执行节点和底层数据API。底层数据API封装了不同的数据源,对上隐藏数据源各自不同的特点。

Greenplum数据库外部表——external_getnext获取元组_java

externalgettup

externalgettup函数主要是对text/csv和custom格式数据获取的抽象和封装,代码如下所示。

/*
 * externalgettup
 *
 * Fetch the next tuple of an external scan, choosing the reader by data
 * format: externalgettup_custom() when the scan has a custom formatter
 * function, externalgettup_defined() (text/csv) otherwise.
 *
 * An error context callback is installed around the fetch and the previous
 * one is restored before returning. 'dir' is unused.
 */
static HeapTuple externalgettup(FileScanDesc scan, ScanDirection dir __attribute__((unused)))
{
    HeapTuple result;
    ErrorContextCallback errctx;

    /* Push our callback onto the error context stack. */
    errctx.callback = external_scan_error_callback;
    errctx.arg = (void *) scan;
    errctx.previous = error_context_stack;
    error_context_stack = &errctx;

    /*
     * First call: mark the scan as initialized. Subsequent calls simply
     * continue from the previously returned tuple; neither case currently
     * needs any additional state setup.
     */
    if (!scan->fs_inited)
    {
        scan->fs_inited = true;
    }

    if (scan->fs_custom_formatter_func != NULL)
    {
        result = externalgettup_custom(scan);   /* custom */
    }
    else
    {
        result = externalgettup_defined(scan);  /* text/csv */
    }

    /* Restore the previous error callback. */
    error_context_stack = errctx.previous;

    return result;
}