数据结构

WaitEventSet结构体用于记录一组被等待的事件:nevents是已注册事件的数量,nevents_space是该集合最多可容纳的事件数量,events是指向WaitEvent结构体数组的指针。WL_EXIT_ON_PM_DEATH会被转换为WL_POSTMASTER_DEATH,同时设置exit_on_postmaster_death标志,这样在探测到postmaster死亡时进程会立即退出,而不是从等待函数返回。

/*
 * WaitEventSet: a collection of events that can be waited on together.
 *
 * The events array and the primitive-specific arrays (epoll_ret_events or
 * pollfds) are sized for nevents_space entries when the set is created.
 */
struct WaitEventSet {
int nevents; /* number of events currently registered in the set */
int nevents_space; /* maximum number of events this set can hold */
WaitEvent *events; /* Array of nevents_space entries holding the definitions of the events this set is waiting for. */
/* If WL_LATCH_SET is specified in any wait event, latch is a pointer to said latch, and latch_pos is the index of that event in the ->events array. This is useful because the state of the latch is checked before performing any syscalls related to waiting. */
Latch *latch;
int latch_pos;
/* WL_EXIT_ON_PM_DEATH is converted to WL_POSTMASTER_DEATH when the event is added, but this flag is set so that the process exits immediately if postmaster death is detected, instead of returning to the caller. */
bool exit_on_postmaster_death;

#if defined(WAIT_USE_EPOLL)
int epoll_fd; /* descriptor of the epoll instance backing this set */
struct epoll_event *epoll_ret_events; /* epoll_wait returns events in a user-provided array; allocate it once */
#elif defined(WAIT_USE_POLL)
struct pollfd *pollfds; /* poll expects the full list of descriptors on every poll() call; prepare it once */
#endif
};

WaitEvent结构体中,pos成员指明该事件在event data结构体中的位置,events是被触发的事件,fd是和事件关联的socket fd,user_data是调用AddWaitEventToSet函数时提供的指针。

/*
 * WaitEvent: one event of a WaitEventSet.
 *
 * The same structure is used both for the definitions stored inside the set
 * and for the events reported back to the caller of WaitEventSetWait().
 */
typedef struct WaitEvent {
int pos; /* position in the event data structure (index into WaitEventSet->events) */
uint32 events; /* event mask: the WL_* events waited for, or, in a reported event, the ones that triggered */
pgsocket fd; /* socket fd associated with event */
void *user_data; /* opaque pointer provided in AddWaitEventToSet, handed back unchanged */
} WaitEvent;

CreateWaitEventSet

CreateWaitEventSet函数为nevents个事件创建WaitEventSet,并在指定的内存上下文中申请空间。

/*
 * Create a WaitEventSet with space for up to nevents events.
 *
 * The set header, the WaitEvent array and the array used by the underlying
 * wait primitive are carved out of one zeroed allocation made in 'context'.
 *
 * context: memory context the set is allocated in.
 * nevents: maximum number of events that can be added to the set.
 *
 * Returns the new, empty set.  Raises ERROR via elog() if the epoll
 * descriptor cannot be created or configured.
 */
WaitEventSet *CreateWaitEventSet(MemoryContext context, int nevents)
{
	WaitEventSet *set;
	char	   *data;
	Size		sz = 0;

	/*
	 * Use MAXALIGN size/alignment to guarantee that later uses of memory are
	 * aligned correctly. E.g. epoll_event might need 8 byte alignment on some
	 * platforms, but earlier allocations like WaitEventSet and WaitEvent
	 * might not be sized to guarantee that when purely using sizeof().
	 */
	sz += MAXALIGN(sizeof(WaitEventSet));
	sz += MAXALIGN(sizeof(WaitEvent) * nevents);
	/* The directive and the statement it guards must be on separate lines. */
#if defined(WAIT_USE_EPOLL)
	sz += MAXALIGN(sizeof(struct epoll_event) * nevents);
#elif defined(WAIT_USE_POLL)
	sz += MAXALIGN(sizeof(struct pollfd) * nevents);
#endif

	data = (char *) MemoryContextAllocZero(context, sz);

	/* Hand out the pieces of the allocation in the order they were sized. */
	set = (WaitEventSet *) data;
	data += MAXALIGN(sizeof(WaitEventSet));

	set->events = (WaitEvent *) data;
	data += MAXALIGN(sizeof(WaitEvent) * nevents);

#if defined(WAIT_USE_EPOLL)
	set->epoll_ret_events = (struct epoll_event *) data;
	data += MAXALIGN(sizeof(struct epoll_event) * nevents);
#elif defined(WAIT_USE_POLL)
	set->pollfds = (struct pollfd *) data;
	data += MAXALIGN(sizeof(struct pollfd) * nevents);
#endif

	set->latch = NULL;
	set->nevents_space = nevents;
	set->exit_on_postmaster_death = false;

	/* epoll_fd only exists in the struct when epoll is the wait primitive. */
#if defined(WAIT_USE_EPOLL)
#ifdef EPOLL_CLOEXEC
	set->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
	if (set->epoll_fd < 0)
		elog(ERROR, "epoll_create1 failed: %m");
#else
	/* cope with ancient glibc lacking epoll_create1 (e.g., RHEL5) */
	set->epoll_fd = epoll_create(nevents);
	if (set->epoll_fd < 0)
		elog(ERROR, "epoll_create failed: %m");
	if (fcntl(set->epoll_fd, F_SETFD, FD_CLOEXEC) == -1)
	{
		/*
		 * Don't leak the descriptor we just created; keep errno intact so
		 * that %m still reports the fcntl() failure.
		 */
		int			save_errno = errno;

		close(set->epoll_fd);
		errno = save_errno;
		elog(ERROR, "fcntl(F_SETFD) failed on epoll descriptor: %m");
	}
#endif							/* EPOLL_CLOEXEC */
#endif							/* WAIT_USE_EPOLL */

	return set;
}

FreeWaitEventSet

FreeWaitEventSet函数free之前创建的WaitEventSet。

/*
 * Free a WaitEventSet previously created by CreateWaitEventSet().
 *
 * Closes the epoll descriptor owned by the set (when epoll is the wait
 * primitive) and releases the set's memory.  'set' must not be used
 * afterwards.
 */
void FreeWaitEventSet(WaitEventSet *set)
{
	/* epoll_fd is only a member of the struct under WAIT_USE_EPOLL. */
#if defined(WAIT_USE_EPOLL)
	close(set->epoll_fd);
#endif

	pfree(set);
}

AddWaitEventToSet

AddWaitEventToSet函数向WaitEventSet中添加事件。

  • WL_LATCH_SET: Wait for the latch to be set
  • WL_POSTMASTER_DEATH: Wait for postmaster to die
  • WL_SOCKET_READABLE: Wait for socket to become readable, can be combined in one event with other WL_SOCKET_* events
  • WL_SOCKET_WRITEABLE: Wait for socket to become writeable, can be combined with other WL_SOCKET_* events
  • WL_SOCKET_CONNECTED: Wait for socket connection to be established, can be combined with other WL_SOCKET_* events (on non-Windows platforms, this is the same as WL_SOCKET_WRITEABLE)
  • WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies

返回 WaitEventSet->events 中的偏移量(从 0 开始),可用于使用 ModifyWaitEvent() 修改先前添加的等待事件。在 WL_LATCH_SET 情况下,latch 必须由当前进程拥有,即它必须是一个用 InitLatch 初始化的进程本地latch,或者是一个通过调用 OwnLatch 与当前进程关联的共享latch。在 WL_SOCKET_READABLE/WRITEABLE/CONNECTED 情况下,EOF 和错误条件会导致套接字报告为可读/可写/已连接,以便调用者可以处理条件。此处指定的 user_data 指针将为 WaitEventSetWait() 返回的事件设置,从而可以轻松地将附加数据与事件相关联。

/*
 * Add an event to a WaitEventSet.
 *
 * set:       set to add the event to.
 * events:    WL_* mask describing what to wait for.
 * fd:        socket to wait on; must be valid if any WL_SOCKET_MASK bit is
 *            requested.
 * latch:     latch to wait on; required for, and only allowed with,
 *            WL_LATCH_SET.  It must be owned by the current process.
 * user_data: opaque pointer stored with the event.
 *
 * Returns the position of the new event in set->events (starting at 0).
 * Raises ERROR via elog() if the set is already full or the arguments are
 * inconsistent.
 */
int AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch, void *user_data)
{
	WaitEvent  *event;

	/*
	 * Refuse to write past the end of the events array; check this before
	 * touching any state of the set.
	 */
	if (set->nevents >= set->nevents_space)
		elog(ERROR, "cannot add wait event: wait event set is full");

	/* WL_EXIT_ON_PM_DEATH is stored as WL_POSTMASTER_DEATH plus a flag. */
	if (events == WL_EXIT_ON_PM_DEATH)
	{
		events = WL_POSTMASTER_DEATH;
		set->exit_on_postmaster_death = true;
	}

	if (latch)
	{
		if (latch->owner_pid != MyProcPid)
			elog(ERROR, "cannot wait on a latch owned by another process");
		if (set->latch)
			elog(ERROR, "cannot wait on more than one latch");
		if ((events & WL_LATCH_SET) != WL_LATCH_SET)
			elog(ERROR, "latch events only support being set");
	}
	else
	{
		if (events & WL_LATCH_SET)
			elog(ERROR, "cannot wait on latch without a specified latch");
	}

	/* waiting for socket readiness without a socket indicates a bug */
	if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK))
		elog(ERROR, "cannot wait on socket event without a socket");

	/* Claim the next free slot; its index is the event's position. */
	event = &set->events[set->nevents];
	event->pos = set->nevents++;
	event->fd = fd;
	event->events = events;
	event->user_data = user_data;

	if (events == WL_LATCH_SET)
	{
		/* Remember the latch so its state can be checked before blocking. */
		set->latch = latch;
		set->latch_pos = event->pos;
	}
	else if (events == WL_POSTMASTER_DEATH)
	{
		/* Postmaster death is watched through the postmaster-alive fd. */
		event->fd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
	}

	/* perform wait primitive specific initialization, if needed */
#if defined(WAIT_USE_EPOLL)
	WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
#elif defined(WAIT_USE_POLL)
	WaitEventAdjustPoll(set, event);
#endif

	return event->pos;
}

ModifyWaitEvent

ModifyWaitEvent函数改变事件掩码;在WL_LATCH_SET情况下,还会改变与该WaitEvent关联的latch。

/*
 * Change the event mask of the event at position 'pos' and, for a
 * WL_LATCH_SET event, the latch associated with the set.
 *
 * Returns without doing anything when nothing would change.  Raises ERROR
 * via elog() when asked to change the mask of a latch event or to modify a
 * postmaster-death event.
 */
void ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
{
	WaitEvent  *target;
	bool		is_latch_event;

	Assert(pos < set->nevents);

	target = &set->events[pos];
	is_latch_event = (target->events & WL_LATCH_SET) != 0;

	if (events == target->events)
	{
		/*
		 * Same mask.  Unless this is a latch event whose latch is being
		 * replaced, there is nothing to do.  Returning early is an important
		 * optimization for some sockets, where ModifyWaitEvent is frequently
		 * used to switch from waiting for reads to waiting on writes.
		 */
		if (!is_latch_event || set->latch == latch)
			return;
	}
	else if (is_latch_event)
	{
		/* we could allow to disable latch events for a while */
		elog(ERROR, "cannot modify latch event");
	}

	if (target->events & WL_POSTMASTER_DEATH)
		elog(ERROR, "cannot modify postmaster death event");

	/* FIXME: validate event mask */
	target->events = events;

	if (events == WL_LATCH_SET)
		set->latch = latch;

	/* Propagate the change to the underlying wait primitive. */
#if defined(WAIT_USE_EPOLL)
	WaitEventAdjustEpoll(set, target, EPOLL_CTL_MOD);
#elif defined(WAIT_USE_POLL)
	WaitEventAdjustPoll(set, target);
#endif
}

WaitEventSetWait

WaitEventSetWait函数等待添加到WaitEventSet中的事件发生或者等待超时。如果timeout为-1,则会一直阻塞到事件发生;如果为0,检查socket的readiness,但是不阻塞;如果大于0,则阻塞到超时或事件发生。该函数返回已经发生事件的数量,返回0表示超时。

/*
 * Wait for any event in 'set' to occur, or for the timeout to elapse.
 *
 * set:             the events to wait for.
 * timeout:         -1 to wait indefinitely; otherwise the maximum time to
 *                  wait, in milliseconds (0 polls without blocking).
 * occurred_events: output array, filled with the events that occurred.
 * nevents:         capacity of occurred_events; must be at least 1.
 * wait_event_info: passed to pgstat_report_wait_start() for the duration of
 *                  the wait.
 *
 * Returns the number of events stored in occurred_events, or 0 if the
 * timeout was reached first.
 */
int WaitEventSetWait(WaitEventSet *set, long timeout, WaitEvent *occurred_events, int nevents, uint32 wait_event_info)
{
	int			returned_events = 0;
	instr_time	start_time;
	instr_time	cur_time;
	long		cur_timeout = -1;

	/* The latch fast path below always writes occurred_events[0]. */
	Assert(nevents > 0);

	/*
	 * Initialize timeout if requested. We must record the current time so
	 * that we can determine the remaining timeout if interrupted.
	 */
	if (timeout >= 0)
	{
		INSTR_TIME_SET_CURRENT(start_time);
		Assert(timeout >= 0 && timeout <= INT_MAX);
		cur_timeout = timeout;
	}

	pgstat_report_wait_start(wait_event_info);

	/* 'waiting' is defined elsewhere in the file; set for the whole wait. */
	waiting = true;

	while (returned_events == 0)
	{
		int			rc;

		/*
		 * Check if the latch is set already. If so, leave the loop
		 * immediately, avoid blocking again. We don't attempt to report any
		 * other events that might also be satisfied.
		 *
		 * If someone sets the latch between this and the
		 * WaitEventSetWaitBlock() below, the setter will write a byte to the
		 * pipe (or signal us and the signal handler will do that), and the
		 * readiness routine will return immediately.
		 *
		 * On unix, If there's a pending byte in the self pipe, we'll notice
		 * whenever blocking. Only clearing the pipe in that case avoids
		 * having to drain it every time WaitLatchOrSocket() is used. Should
		 * the pipe-buffer fill up we're still ok, because the pipe is in
		 * nonblocking mode. It's unlikely for that to happen, because the
		 * self pipe isn't filled unless we're blocking (waiting = true), or
		 * from inside a signal handler in latch_sigusr1_handler().
		 *
		 * On windows, we'll also notice if there's a pending event for the
		 * latch when blocking, but there's no danger of anything filling up,
		 * as "Setting an event that is already set has no effect.".
		 *
		 * Note: we assume that the kernel calls involved in latch management
		 * will provide adequate synchronization on machines with weak memory
		 * ordering, so that we cannot miss seeing is_set if a notification
		 * has already been queued.
		 */
		if (set->latch && set->latch->is_set)
		{
			occurred_events->fd = PGINVALID_SOCKET;
			occurred_events->pos = set->latch_pos;
			occurred_events->user_data = set->events[set->latch_pos].user_data;
			occurred_events->events = WL_LATCH_SET;
			occurred_events++;
			returned_events++;
			break;
		}

		/*
		 * Wait for events using the readiness primitive chosen at the top of
		 * this file. If -1 is returned, a timeout has occurred, if 0 we have
		 * to retry, everything >= 1 is the number of returned events.
		 */
		rc = WaitEventSetWaitBlock(set, cur_timeout, occurred_events, nevents);

		if (rc == -1)
			break;				/* timeout occurred */
		else
			returned_events = rc;

		/* If we're not done, update cur_timeout for next iteration */
		if (returned_events == 0 && timeout >= 0)
		{
			INSTR_TIME_SET_CURRENT(cur_time);
			INSTR_TIME_SUBTRACT(cur_time, start_time);
			cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
			if (cur_timeout <= 0)
				break;
		}
	}

	waiting = false;
	pgstat_report_wait_end();

	return returned_events;
}

WaitEventSetWaitBlock

WaitEventSetWaitBlock的实现在使用epoll_wait(2)和使用poll(2)时是不同的。下面以epoll版本的代码为例。

/*
 * Block in epoll_wait() until an event of 'set' fires or cur_timeout
 * (milliseconds, -1 for no timeout) elapses, then translate the returned
 * epoll events into WaitEvents.
 *
 * occurred_events: output array with room for nevents entries.
 *
 * Returns -1 if the timeout elapsed, 0 if the caller should retry (the call
 * was interrupted, or nothing reportable happened), otherwise the number of
 * entries stored in occurred_events.  Raises ERROR via ereport() if
 * epoll_wait() fails with anything other than EINTR.
 */
static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, WaitEvent *occurred_events, int nevents)
{
	int			returned_events = 0;
	int			rc;
	int			max_events;
	WaitEvent  *cur_event;
	struct epoll_event *cur_epoll_event;

	/*
	 * The kernel's results must fit in our own buffer (nevents_space
	 * entries) as well as in the caller's output array (nevents entries), so
	 * ask for no more than the smaller of the two.
	 */
	max_events = (nevents < set->nevents_space) ? nevents : set->nevents_space;

	/* Sleep */
	rc = epoll_wait(set->epoll_fd, set->epoll_ret_events, max_events, cur_timeout);

	/* Check return code */
	if (rc < 0)
	{
		/* EINTR is okay, otherwise complain */
		if (errno != EINTR)
		{
			waiting = false;
			ereport(ERROR,
					(errcode_for_socket_access(),
			/* translator: %s is a syscall name, such as "poll()" */
					 errmsg("%s failed: %m","epoll_wait()")));
		}
		return 0;
	}
	else if (rc == 0)
	{
		/* timeout exceeded */
		return -1;
	}

	/*
	 * At least one event occurred, iterate over the returned epoll events
	 * until they're either all processed, or we've returned all the events
	 * the caller desired.
	 */
	for (cur_epoll_event = set->epoll_ret_events;
		 cur_epoll_event < (set->epoll_ret_events + rc) &&
		 returned_events < nevents;
		 cur_epoll_event++)
	{
		/* epoll's data pointer is set to the associated WaitEvent */
		cur_event = (WaitEvent *) cur_epoll_event->data.ptr;

		occurred_events->pos = cur_event->pos;
		occurred_events->user_data = cur_event->user_data;
		occurred_events->events = 0;

		if (cur_event->events == WL_LATCH_SET &&
			cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
		{
			/* There's data in the self-pipe, clear it. */
			drainSelfPipe();

			/* Only report the latch if it really is set. */
			if (set->latch->is_set)
			{
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_LATCH_SET;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events == WL_POSTMASTER_DEATH &&
				 cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
		{
			/*
			 * We expect an EPOLLHUP when the remote end is closed, but
			 * because we don't expect the pipe to become readable or to have
			 * any errors either, treat those cases as postmaster death, too.
			 *
			 * Be paranoid about a spurious event signalling the postmaster as
			 * being dead. There have been reports about that happening with
			 * older primitives (select(2) to be specific), and a spurious
			 * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
			 * cost much.
			 */
			if (!PostmasterIsAliveInternal())
			{
				/* Set when the event was added as WL_EXIT_ON_PM_DEATH. */
				if (set->exit_on_postmaster_death)
					proc_exit(1);
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_POSTMASTER_DEATH;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
		{
			Assert(cur_event->fd != PGINVALID_SOCKET);

			if ((cur_event->events & WL_SOCKET_READABLE) &&
				(cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)))
			{
				/* data available in socket, or EOF */
				occurred_events->events |= WL_SOCKET_READABLE;
			}

			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
				(cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)))
			{
				/* writable, or EOF */
				occurred_events->events |= WL_SOCKET_WRITEABLE;
			}

			/* Report the socket only if something we asked for happened. */
			if (occurred_events->events != 0)
			{
				occurred_events->fd = cur_event->fd;
				occurred_events++;
				returned_events++;
			}
		}
	}

	return returned_events;
}

例子如下所示:

/*
 * Wait on a latch, a socket, postmaster death and/or a timeout, as selected
 * by the WL_* bits in wakeEvents, using a temporary WaitEventSet.
 *
 * timeout is only honoured when WL_TIMEOUT is requested; otherwise the wait
 * has no time limit.
 *
 * Returns WL_TIMEOUT if the timeout elapsed, otherwise the WL_LATCH_SET /
 * WL_POSTMASTER_DEATH / WL_SOCKET_* bits of the single event that woke us.
 */
int WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock, long timeout, uint32 wait_event_info)
{
	WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 3);
	WaitEvent	fired;
	int			nready;
	int			result;

	if (!(wakeEvents & WL_TIMEOUT))
		timeout = -1;
	else
		Assert(timeout >= 0);

	if (wakeEvents & WL_LATCH_SET)
		AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET, latch, NULL);

	/* Postmaster-managed callers must handle postmaster death somehow. */
	Assert(!IsUnderPostmaster || (wakeEvents & WL_EXIT_ON_PM_DEATH) || (wakeEvents & WL_POSTMASTER_DEATH));

	/* Postmaster-death events are only registered under a postmaster. */
	if (IsUnderPostmaster)
	{
		if (wakeEvents & WL_POSTMASTER_DEATH)
			AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET, NULL, NULL);

		if (wakeEvents & WL_EXIT_ON_PM_DEATH)
			AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, NULL, NULL);
	}

	if (wakeEvents & WL_SOCKET_MASK)
	{
		int			socket_events = wakeEvents & WL_SOCKET_MASK;

		AddWaitEventToSet(set, socket_events, sock, NULL, NULL);
	}

	/* Ask for at most one event; 'fired' is only valid when nready != 0. */
	nready = WaitEventSetWait(set, timeout, &fired, 1, wait_event_info);

	if (nready != 0)
		result = fired.events & (WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_MASK);
	else
		result = WL_TIMEOUT;

	FreeWaitEventSet(set);

	return result;
}

sendSelfPipeByte

sendSelfPipeByte函数向selfpipe_writefd管道中写入一个dummy字节。

/*
 * Write a single dummy byte to selfpipe_writefd.
 *
 * The write is retried for as long as it is interrupted by a signal; any
 * other outcome, success or failure, ends the function.
 */
static void sendSelfPipeByte(void)
{
	char		wake_byte = 0;
	int			written;

	do
	{
		written = write(selfpipe_writefd, &wake_byte, 1);
		/* If interrupted by signal, just retry */
	} while (written < 0 && errno == EINTR);

	/*
	 * Remaining failures are deliberately ignored.  If the pipe is full
	 * (EAGAIN/EWOULDBLOCK) we don't need to retry: the data that's there
	 * already is enough to wake up WaitLatch.  For any other error, we might
	 * be in a signal handler, so it's not safe to elog(); we have no choice
	 * but to silently ignore it.
	 */
}

PG服务进程(Postgres)——WaitEventSet_数据库