概述

pg_control文件是PostgreSQL的控制文件(control file)。它到底起什么作用呢?先做个实验:把pg_control文件改名,然后启动 Postgres,启动时得到如下报错信息:

[postgres@pg101 bin]$ postgres: could not find the database system
Expected to find it in the directory "/usr/local/pgsql/bin/../data",
but could not open file "/usr/local/pgsql/bin/../data/global/pg_control": No such file or directory

对应的源代码,在postmaster.c的 checkDataDir 函数中:

/*
 * Probe for the control file: build "<DataDir>/global/pg_control", try to
 * open it with AllocateFile() in mode PG_BINARY_R, and if that fails print
 * the explanatory message to stderr and leave via ExitPostmaster(2).
 */
snprintf(path, sizeof(path), "%s/global/pg_control", DataDir);
fp = AllocateFile(path, PG_BINARY_R);
if (fp == NULL) {
/* strerror(errno) supplies the reason the open failed */
write_stderr("%s: could not find the database system\nExpected to find it in the directory \"%s\",\nbut could not open file \"%s\": %s\n", progname, DataDir, path, strerror(errno)); ExitPostmaster(2);
}
/* Nothing is read here; the file is released again right after the successful open. */
FreeFile(fp);

将 pg_control文件改回原来的名字后,重新启动PostgreSQL数据库,没有问题。
而在main.c中,有如下代码。
从其注释中可以看到,数据库初始化之后,LC_CTYPE/LC_COLLATE等信息已经写入pg_control文件中;对于已初始化的数据库,启动时这两项会被pg_control中的值覆盖。

/*
 * Set up locale information from environment.  Note that LC_CTYPE and
 * LC_COLLATE will be overridden later from pg_control if we are in an
 * already-initialized database. We set them here so that they will be
 * available to fill pg_control during initdb. LC_MESSAGES will get set
 * later during GUC option processing, but we set it here to allow startup
 * error messages to be localized.
 *
 * NOTE(review): the comment talks about setting LC_CTYPE, LC_COLLATE and
 * LC_MESSAGES, but the only statement shown below is the
 * set_pglocale_pgservice() call -- presumably the statements that actually
 * set those categories were left out of this excerpt; confirm against main.c.
 */
set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("postgres"));

在 src/backend/access/transam/xlog.c 中,有如下代码:

/*
 * We maintain an image of pg_control in shared memory.
 *
 * File-local pointer to that image; it is NULL until something assigns it
 * (the assignment is not part of this excerpt).
 */
static ControlFileData *ControlFile = NULL;

可见,与pg_control文件相对应,在共享内存中保留着它的一份映像,其类型为ControlFileData结构。该结构的定义如下:

/*
 * Contents of pg_control.
 *
 * NOTE: try to keep this under 512 bytes so that it will fit on one physical
 * sector of typical disk drives. This reduces the odds of corruption due to
 * power failure midway through a write.
 *
 * Field order matters: the version fields must stay at a fixed offset and
 * the CRC must remain the last member (see the comments on those fields).
 */

typedef struct ControlFileData
{
/*
 * Unique system identifier --- to ensure we match up xlog files with the
 * installation that produced them.
 */
uint64 system_identifier;

/*
 * Version identifier information. Keep these fields at the same offset,
 * especially pg_control_version; they won't be real useful if they move
 * around. (For historical reasons they must be 8 bytes into the file
 * rather than immediately at the front.)
 *
 * pg_control_version identifies the format of pg_control itself.
 * catalog_version_no identifies the format of the system catalogs.
 *
 * There are additional version identifiers in individual files; for
 * example, WAL logs contain per-page magic numbers that can serve as
 * version cues for the WAL log.
 */
uint32 pg_control_version; /* PG_CONTROL_VERSION */
uint32 catalog_version_no; /* see catversion.h */

/*
 * System status data
 */
DBState state; /* see enum above */
pg_time_t time; /* time stamp of last pg_control update */
XLogRecPtr checkPoint; /* last check point record ptr */
XLogRecPtr prevCheckPoint; /* previous check point record ptr */

CheckPoint checkPointCopy; /* copy of last check point record */

/*
 * These two values determine the minimum point we must recover up to
 * before starting up:
 *
 * minRecoveryPoint is updated to the latest replayed LSN whenever we
 * flush a data change during archive recovery. That guards against
 * starting archive recovery, aborting it, and restarting with an earlier
 * stop location. If we've already flushed data changes from WAL record X
 * to disk, we mustn't start up until we reach X again. Zero when not
 * doing archive recovery.
 *
 * backupStartPoint is the redo pointer of the backup start checkpoint, if
 * we are recovering from an online backup and haven't reached the end of
 * backup yet. It is reset to zero when the end of backup is reached, and
 * we mustn't start up before that. A boolean would suffice otherwise, but
 * we use the redo pointer as a cross-check when we see an end-of-backup
 * record, to make sure the end-of-backup record corresponds the base
 * backup we're recovering from.
 */
XLogRecPtr minRecoveryPoint; /* zero when not doing archive recovery */
XLogRecPtr backupStartPoint; /* zero once end of backup is reached */

/*
 * Parameter settings that determine if the WAL can be used for archival
 * or hot standby.
 */
int wal_level;
int MaxConnections;
int max_prepared_xacts;
int max_locks_per_xact;

/*
 * This data is used to check for hardware-architecture compatibility of
 * the database and the backend executable. We need not check endianness
 * explicitly, since the pg_control version will surely look wrong to a
 * machine of different endianness, but we do need to worry about MAXALIGN
 * and floating-point format. (Note: storage layout nominally also
 * depends on SHORTALIGN and INTALIGN, but in practice these are the same
 * on all architectures of interest.)
 *
 * Testing just one double value is not a very bulletproof test for
 * floating-point compatibility, but it will catch most cases.
 */
uint32 maxAlign; /* alignment requirement for tuples */
double floatFormat; /* constant 1234567.0 */
/* the reference value that floatFormat is documented to hold */
#define FLOATFORMAT_VALUE 1234567.0

/*
 * This data is used to make sure that configuration of this database is
 * compatible with the backend executable.
 */
uint32 blcksz; /* data block size for this DB */
uint32 relseg_size; /* blocks per segment of large relation */

uint32 xlog_blcksz; /* block size within WAL files */
uint32 xlog_seg_size; /* size of each WAL segment */

uint32 nameDataLen; /* catalog name field width */
uint32 indexMaxKeys; /* max number of columns in an index */

uint32 toast_max_chunk_size; /* chunk size in TOAST tables */

/* flag indicating internal format of timestamp, interval, time */
bool enableIntTimes; /* int64 storage enabled? */

/* flags indicating pass-by-value status of various types */
bool float4ByVal; /* float4 pass-by-value? */
bool float8ByVal; /* float8, int8, etc pass-by-value? */

/* CRC of all above ... MUST BE LAST! */
pg_crc32 crc;
} ControlFileData;

初始化pg_control

/*
 * This func must be called ONCE on system install.  It creates pg_control
 * and the initial XLOG segment.
 *
 * In the part shown here it derives a system identifier from the current
 * time, zeroes the in-memory ControlFile image, fills in the status and
 * parameter fields, and calls WriteControlFile().
 *
 * NOTE(review): the bare "..." lines below are not C; they presumably mark
 * code that was cut from this listing (for example, whatever fills in
 * checkPoint and allocates buffer). Confirm against the full xlog.c.
 */
void BootStrapXLOG(void) {
CheckPoint checkPoint;
char *buffer;
XLogPageHeader page;
XLogLongPageHeader longpage;
XLogRecord *record;
bool use_existent;
uint64 sysidentifier;
struct timeval tv;
pg_crc32 crc;
/*
 * Select a hopefully-unique system identifier code for this installation.
 * We use the result of gettimeofday(), including the fractional seconds
 * field, as being about as unique as we can easily get. (Think not to
 * use random(), since it hasn't been seeded and there's no portable way
 * to seed it other than the system clock value...) The upper half of the
 * uint64 value is just the tv_sec part, while the lower half is the XOR
 * of tv_sec and tv_usec. This is to ensure that we don't lose uniqueness
 * unnecessarily if "uint64" is really only 32 bits wide. A person
 * knowing this encoding can determine the initialization time of the
 * installation, which could perhaps be useful sometimes.
 */
gettimeofday(&tv, NULL);
sysidentifier = ((uint64) tv.tv_sec) << 32;
/*
 * NOTE(review): the comment above describes the lower half as the XOR of
 * tv_sec and tv_usec, but the statement below combines them with bitwise
 * OR (|), not XOR (^). Confirm which one is intended.
 */
sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);
...
/* Now create pg_control */
/* Start from an all-zero image so every field not assigned below is zero. */
memset(ControlFile, 0, sizeof(ControlFileData));
/* Initialize pg_control status fields */
ControlFile->system_identifier = sysidentifier;
ControlFile->state = DB_SHUTDOWNED;
ControlFile->time = checkPoint.time;
ControlFile->checkPoint = checkPoint.redo;
ControlFile->checkPointCopy = checkPoint;
/* Set important parameter values for use when replaying WAL */
ControlFile->MaxConnections = MaxConnections;
ControlFile->max_prepared_xacts = max_prepared_xacts;
ControlFile->max_locks_per_xact = max_locks_per_xact;
ControlFile->wal_level = wal_level;
/* some additional ControlFile fields are set in WriteControlFile() */
WriteControlFile();
/* Bootstrap the commit log, too */
...
pfree(buffer);
}

system_identifier 并不是真正的随机数,而是根据 gettimeofday() 取得的当前时间(tv_sec 和 tv_usec)计算出来的、尽量保证唯一的一个值。state 在初始化的时候被设为 DB_SHUTDOWNED。其他可能的取值都可以在pg_control.h中看到:

/*
 * System status indicator.
 *
 * Note this is stored in pg_control; if you change it, you must bump
 * PG_CONTROL_VERSION.  The numeric value of every state is written out
 * explicitly below; the values are the same ones the implicit numbering
 * produced.
 */
typedef enum DBState
{
    DB_STARTUP = 0,
    DB_SHUTDOWNED = 1,              /* the state BootStrapXLOG() stores */
    DB_SHUTDOWNED_IN_RECOVERY = 2,
    DB_SHUTDOWNING = 3,
    DB_IN_CRASH_RECOVERY = 4,
    DB_IN_ARCHIVE_RECOVERY = 5,
    DB_IN_PRODUCTION = 6            /* the state StartupXLOG() stores at its end */
} DBState;

DB_IN_PRODUCTION

研究 DBState,先从 DB_IN_PRODUCTION 入手,看它是在哪里被设置的:它出现在启动Postmaster时运行的 StartupXLOG 函数中:

/*
 * This must be called ONCE during postmaster or standalone-backend startup.
 *
 * In the part shown here the function:
 *   1. reads pg_control and rejects it if state or checkPoint look invalid;
 *   2. logs one message describing the state found in the control file;
 *   3. decides whether recovery is required (InRecovery);
 *   4. records the kind of recovery in ControlFile->state;
 *   5. finally marks the control file DB_IN_PRODUCTION.
 *
 * NOTE(review): this listing appears to be abridged -- checkPoint and RecPtr
 * are used without any visible declaration or assignment, and the recovery
 * branches carry no replay code. Confirm against the complete xlog.c.
 */
void StartupXLOG(void) {

    /*
     * Read control file and check XLOG status looks valid. Note: in most
     * control paths, *ControlFile is already valid and we need not do
     * ReadControlFile() here, but might as well do it to be sure.
     */
    ReadControlFile();

    /* Any state below DB_SHUTDOWNED (i.e. DB_STARTUP) or above DB_IN_PRODUCTION is rejected. */
    if (ControlFile->state < DB_SHUTDOWNED ||
        ControlFile->state > DB_IN_PRODUCTION ||
        !XRecOffIsValid(ControlFile->checkPoint.xrecoff))
        ereport(FATAL,(errmsg("control file contains invalid data")));

    /* Report what the control file says about how the previous run ended. */
    if (ControlFile->state == DB_SHUTDOWNED)
        ereport(LOG, (errmsg("database system was shut down at %s", str_time(ControlFile->time))));
    else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
        ereport(LOG,(errmsg("database system was shut down in recovery at %s", str_time(ControlFile->time))));
    else if (ControlFile->state == DB_SHUTDOWNING)
        ereport(LOG,(errmsg("database system shutdown was interrupted; last known up at %s", str_time(ControlFile->time))));
    else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
        ereport(LOG,(errmsg("database system was interrupted while in recovery at %s", str_time(ControlFile->time)),
                     errhint("This probably means that some data is corrupted and"
                             " you will have to use the last backup for recovery.")));
    else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
        ereport(LOG, (errmsg("database system was interrupted while in recovery at log time %s", str_time(ControlFile->checkPointCopy.time)), errhint("If this has occurred more than once some data might be corrupted and you might need to choose an earlier recovery target.")));
    else if (ControlFile->state == DB_IN_PRODUCTION)
        ereport(LOG,(errmsg("database system was interrupted; last known up at %s", str_time(ControlFile->time))));

    /* This is just to allow attaching to startup process with a debugger */
#ifdef XLOG_REPLAY_DELAY
    if (ControlFile->state != DB_SHUTDOWNED) pg_usleep(60000000L);
#endif


    /*
     * Check whether we need to force recovery from WAL. If it appears to
     * have been a clean shutdown and we did not have a recovery.conf file,
     * then assume no recovery needed.
     */
    if (XLByteLT(checkPoint.redo, RecPtr))
    {
        /* body is empty in this listing */
    }
    else if (ControlFile->state != DB_SHUTDOWNED)
        InRecovery = true;
    else if (InArchiveRecovery) {
        /* force recovery due to presence of recovery.conf */
        InRecovery = true;
    }

    /* REDO */
    if (InRecovery) {

        /*
         * Update pg_control to show that we are recovering and to show the
         * selected checkpoint as the place we are starting from. We also
         * mark pg_control with any minimum recovery stop point obtained
         * from a backup history file.
         */
        if (InArchiveRecovery)
            ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
        else {
            ereport(LOG, (errmsg("database system was not properly shut down; "
                                 "automatic recovery in progress")));
            ControlFile->state = DB_IN_CRASH_RECOVERY;
        }

    }

    /* Okay, we're officially UP. */
    InRecovery = false;

    /* State and timestamp are changed, and UpdateControlFile() is called, while holding ControlFileLock exclusively. */
    LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    ControlFile->state = DB_IN_PRODUCTION;
    ControlFile->time = (pg_time_t) time(NULL);
    UpdateControlFile();
    LWLockRelease(ControlFileLock);

}

可以说,只要是正常启动了,那么就是DB_IN_PRODUCTION状态。