Nginx的高可用、高可靠性体现在它的平滑升级上——在升级过程中能够保证业务不间断。那它是如何实现的呢?接下来我们一起探讨它的实现原理。

一、升级过程

1.1、查看进程信息

先通过命令行感性地了解一下Nginx是如何进行平滑升级的。首先通过命令行查看进程信息,可知当前有一个master进程、两个worker进程,如下所示:

[root@localhost ~]# ps -ef | grep nginx | grep -v grep
root      2850     1  0 08:24 ?        00:00:00 nginx: master process /usr/local/nginx/sbin/nginx
root      2851  2850  0 08:24 ?        00:00:00 nginx: worker process      
root      2852  2850  0 08:24 ?        00:00:00 nginx: worker process      
[root@localhost ~]#

地址监听信息如下: 

[root@localhost ~]# netstat -apn | grep tcp| grep nginx 
tcp        0      0 0.0.0.0:80                  0.0.0.0:*                   LISTEN      2850/nginx          
[root@localhost ~]#

1.2、升级版本

先把新版本的可执行文件替换到原路径(本例为/usr/local/nginx/sbin/nginx,旧master会按启动时的argv[0]重新执行该路径),然后通过命令行发送SIGUSR2给master进程(Nginx定义SIGUSR2为版本升级信号),进行版本升级。

[root@localhost ~]# kill -USR2 `cat /usr/local/nginx/logs/nginx.pid`
[root@localhost ~]#
[root@localhost ~]# ps -ef | grep nginx | grep -v grep
root      2850     1  0 08:24 ?        00:00:00 nginx: master process /usr/local/nginx/sbin/nginx
root      2851  2850  0 08:24 ?        00:00:00 nginx: worker process      
root      2852  2850  0 08:24 ?        00:00:00 nginx: worker process      
root      2942  2850  0 08:39 ?        00:00:00 nginx: master process /usr/local/nginx/sbin/nginx
root      2943  2942  0 08:39 ?        00:00:00 nginx: worker process      
root      2944  2942  0 08:39 ?        00:00:00 nginx: worker process      
[root@localhost ~]#

变化内容:

1)会发现多出一组master和worker进程(新master进程id是2942,新worker进程id是2943、2944),其中新master进程的父进程是2850,即旧master进程。

2)nginx.pid文件发生改变,变成新的master进程id,这点很重要,后面再操作nginx.pid时,应该警惕,如下:

[root@localhost ~]# 
[root@localhost ~]# cat /usr/local/nginx/logs/nginx.pid
2942
[root@localhost ~]#

地址监听信息,没有变化:

[root@localhost ~]# netstat -apn | grep tcp| grep nginx
tcp        0      0 0.0.0.0:80                  0.0.0.0:*                   LISTEN      2850/nginx          
[root@localhost ~]#

此时我们访问Nginx服务(在浏览器中输入地址),然后查看连接信息:

[root@localhost ~]# 
[root@localhost ~]# netstat -apn | grep tcp| grep nginx
tcp        0      0 0.0.0.0:80                  0.0.0.0:*                   LISTEN      2850/nginx          
tcp        0      0 192.168.12.129:80           192.168.12.1:52980          ESTABLISHED 2944/nginx          
tcp        0      0 192.168.12.129:80           192.168.12.1:52979          ESTABLISHED 2944/nginx          
tcp        0      0 192.168.12.129:80           192.168.12.1:52983          ESTABLISHED 2944/nginx          
tcp        0      0 192.168.12.129:80           192.168.12.1:52981          ESTABLISHED 2944/nginx          
tcp        0      0 192.168.12.129:80           192.168.12.1:52982          ESTABLISHED 2851/nginx          
tcp        0      0 192.168.12.129:80           192.168.12.1:52984          ESTABLISHED 2943/nginx          
[root@localhost ~]#

发现提供服务worker进程分别是2944、2851、2943,其中2851是旧worker进程。

1.3、旧版本服务下线

通过向旧master进程发送WINCH信号,使旧版本worker进程不再处理新请求并退出。注意:此时nginx.pid中记录的已经是新master的进程id(旧pid文件已被重命名,默认为nginx.pid.oldbin),因此发送WINCH时不能使用nginx.pid文件,必须直接使用旧master的进程id。

[root@localhost ~]# 
[root@localhost ~]#  kill -WINCH 2850
[root@localhost ~]# 
[root@localhost ~]# ps -ef|grep nginx |grep -v grep
root      2850     1  0 09:04 ?        00:00:00 nginx: master process /usr/local/nginx/sbin/nginx
root      2942  2850  0 09:06 ?        00:00:00 nginx: master process /usr/local/nginx/sbin/nginx
root      2943  2942  0 09:06 ?        00:00:00 nginx: worker process      
root      2944  2942  0 09:06 ?        00:00:00 nginx: worker process      
[root@localhost ~]#
[root@localhost ~]# netstat -apn | grep nginx |grep tcp
tcp        0      0 0.0.0.0:80                  0.0.0.0:*                   LISTEN      2850/nginx   
[root@localhost ~]#

通过查看命令行可知,旧worker进程都已经退出,但是旧的master进程并没有退出,此时监听地址信息,进程id号仍然是旧master进程id号。

1.4、关闭旧master进程

关闭旧master进程,可以直接通过kill命令行,将其杀死,如下:

[root@localhost ~]# kill 2850
[root@localhost ~]# ps -ef|grep nginx |grep -v grep
root      2942     1  0 09:06 ?        00:00:00 nginx: master process /usr/local/nginx/sbin/nginx
root      2943  2942  0 09:06 ?        00:00:00 nginx: worker process      
root      2944  2942  0 09:06 ?        00:00:00 nginx: worker process      
[root@localhost ~]#

最后查看,监听信息,进程信息变成新master进程id:

[root@localhost ~]# netstat -apn | grep nginx |grep tcp
tcp        0      0 0.0.0.0:80                  0.0.0.0:*                   LISTEN      2942/nginx          
[root@localhost ~]# 
[root@localhost ~]#

二、升级实现原理

上面只是演示如何进行平滑升级,这里再从源码级别深入探讨平滑升级的流程。

通过上面的演示,可知Nginx进行平滑升级采用的是进程间通信方式——信号。通过命令行kill(它本身也是一个进程)发送信号给master进程,master进程再针对不同的信号进行处理。既然是通过信号方式,必然要先注册信号处理函数,否则应用程序无法按自己的方式处理这些信号。

2.1、信号注册

信号注册是在main函数中调用的。信号注册方式一般有两种:一种是signal,另一种是sigaction。nginx采用sigaction,如下:

/*
 * Register a handler for every entry of the signals table.
 *
 * The table is walked until the terminating entry whose signo is 0.
 * For each entry a zeroed struct sigaction with an empty signal mask is
 * installed through sigaction().
 *
 * Returns NGX_OK when the whole table has been processed.  When
 * sigaction() fails the failure is logged; built with NGX_VALGRIND the
 * failure is ignored and the walk continues, otherwise NGX_ERROR is
 * returned immediately.
 */
ngx_int_t
ngx_init_signals(ngx_log_t *log)
{
    ngx_signal_t      *entry;
    struct sigaction   act;

    for (entry = signals; entry->signo != 0; entry++) {

        ngx_memzero(&act, sizeof(struct sigaction));
        sigemptyset(&act.sa_mask);
        act.sa_handler = entry->handler;

        /* installed successfully: move on to the next table entry */
        if (sigaction(entry->signo, &act, NULL) != -1) {
            continue;
        }

#if (NGX_VALGRIND)
        ngx_log_error(NGX_LOG_ALERT, log, ngx_errno,
                      "sigaction(%s) failed, ignored", entry->signame);
#else
        ngx_log_error(NGX_LOG_EMERG, log, ngx_errno,
                      "sigaction(%s) failed", entry->signame);
        return NGX_ERROR;
#endif
    }

    return NGX_OK;
}

信号表signals主要定义信号和处理函数映射关系,具体定义如下:

/*
 * Table of the signals nginx installs a disposition for.
 *
 * Each entry holds, in order: the signal number, a printable signal name
 * used in log messages, a short name string ("reload", "reopen", "stop",
 * "quit" or empty), and the handler to install.
 * NOTE(review): the short name is presumably the `nginx -s <name>` command
 * keyword -- confirm against the ngx_signal_t declaration and its users.
 *
 * The entry with signo == 0 terminates the table; loops over the table
 * stop when they reach it.
 */
ngx_signal_t  signals[] = {
    /* reconfigure ("reload") */
    { ngx_signal_value(NGX_RECONFIGURE_SIGNAL),
      "SIG" ngx_value(NGX_RECONFIGURE_SIGNAL),
      "reload",
      ngx_signal_handler },
    /* reopen log files ("reopen") */
    { ngx_signal_value(NGX_REOPEN_SIGNAL),
      "SIG" ngx_value(NGX_REOPEN_SIGNAL),
      "reopen",
      ngx_signal_handler },
    /* stop accepting connections; no short name */
    { ngx_signal_value(NGX_NOACCEPT_SIGNAL),
      "SIG" ngx_value(NGX_NOACCEPT_SIGNAL),
      "",
      ngx_signal_handler },
    /* terminate ("stop") */
    { ngx_signal_value(NGX_TERMINATE_SIGNAL),
      "SIG" ngx_value(NGX_TERMINATE_SIGNAL),
      "stop",
      ngx_signal_handler },
    /* shutdown ("quit") */
    { ngx_signal_value(NGX_SHUTDOWN_SIGNAL),
      "SIG" ngx_value(NGX_SHUTDOWN_SIGNAL),
      "quit",
      ngx_signal_handler },
    /* change binary (smooth upgrade); no short name */
    { ngx_signal_value(NGX_CHANGEBIN_SIGNAL),
      "SIG" ngx_value(NGX_CHANGEBIN_SIGNAL),
      "",
      ngx_signal_handler },
    { SIGALRM, "SIGALRM", "", ngx_signal_handler },
    { SIGINT, "SIGINT", "", ngx_signal_handler },
    { SIGIO, "SIGIO", "", ngx_signal_handler },
    { SIGCHLD, "SIGCHLD", "", ngx_signal_handler },
    /* SIGSYS and SIGPIPE are ignored (SIG_IGN) rather than handled */
    { SIGSYS, "SIGSYS, SIG_IGN", "", SIG_IGN },
    { SIGPIPE, "SIGPIPE, SIG_IGN", "", SIG_IGN },
    /* terminating entry */
    { 0, NULL, "", NULL }
};

2.2 、信号处理

  由上一小节可知,信号处理函数均是ngx_signal_handler方法:

/*
 * Handler shared by the entries of the signals table that name it.
 *
 * It only records what happened: depending on the role of the current
 * process (ngx_process) and on the signal number it sets one of the global
 * flags (ngx_quit, ngx_terminate, ngx_change_binary, ...), then logs the
 * signal.  For SIGCHLD it additionally calls ngx_process_get_status().
 * errno is saved on entry and restored on exit.
 *
 * signo: number of the delivered signal.
 */
static void
ngx_signal_handler(int signo)
{
    char            *action;
    ngx_int_t        ignore;
    ngx_err_t        err;
    ngx_signal_t    *sig;

    ignore = 0;

    /* remember errno so the interrupted code sees it unchanged */
    err = ngx_errno;

    for (sig = signals; sig->signo != 0; sig++) {// find the table entry for this signal
        if (sig->signo == signo) {
            break;
        }
    }

    ngx_time_sigsafe_update();// update the cached time

    action = "";

    switch (ngx_process) {// ngx_process is the role of the current process
    /* master process (or single-process mode) */
    case NGX_PROCESS_MASTER:
    case NGX_PROCESS_SINGLE:
        switch (signo) {

        case ngx_signal_value(NGX_SHUTDOWN_SIGNAL):
            ngx_quit = 1;
            action = ", shutting down";
            break;

        case ngx_signal_value(NGX_TERMINATE_SIGNAL):
        case SIGINT:
            ngx_terminate = 1;
            action = ", exiting";
            break;

        case ngx_signal_value(NGX_NOACCEPT_SIGNAL):
            /* honoured only when running daemonized */
            if (ngx_daemonized) {
                ngx_noaccept = 1;
                action = ", stop accepting connections";
            }
            break;

        case ngx_signal_value(NGX_RECONFIGURE_SIGNAL):
            ngx_reconfigure = 1;
            action = ", reconfiguring";
            break;

        case ngx_signal_value(NGX_REOPEN_SIGNAL):
            ngx_reopen = 1;
            action = ", reopening logs";
            break;
        /* signal received for a smooth upgrade (change binary) */
        case ngx_signal_value(NGX_CHANGEBIN_SIGNAL):
            if (getppid() > 1 || ngx_new_binary > 0) {

                /*
                 * Ignore the signal in the new binary if its parent is
                 * not the init process, i.e. the old binary's process
                 * is still running.  Or ignore the signal in the old binary's
                 * process if the new binary's process is already running.
                 */

                action = ", ignoring";
                ignore = 1;
                break;
            }

            ngx_change_binary = 1;// only the flag is set here; the work is done outside the handler
            action = ", changing binary"; 
            break;

        case SIGALRM:
            ngx_sigalrm = 1;
            break;

        case SIGIO:
            ngx_sigio = 1;
            break;
        /**
         * SIGCHLD: a child process has changed state; set ngx_reap.
         * NOTE(review): per the article, the master reacts to this flag by
         * restarting worker processes -- that code is not in this function.
         */
        case SIGCHLD:
            ngx_reap = 1;
            break;
        }

        break;
    /* worker (or helper) process */
    case NGX_PROCESS_WORKER:
    case NGX_PROCESS_HELPER:
        switch (signo) {

        case ngx_signal_value(NGX_NOACCEPT_SIGNAL):
            if (!ngx_daemonized) {
                break;
            }
            ngx_debug_quit = 1;
            /* fall through */
        case ngx_signal_value(NGX_SHUTDOWN_SIGNAL):
            ngx_quit = 1;
            action = ", shutting down";
            break;

        case ngx_signal_value(NGX_TERMINATE_SIGNAL):
        case SIGINT:
            ngx_terminate = 1;
            action = ", exiting";
            break;

        case ngx_signal_value(NGX_REOPEN_SIGNAL):
            ngx_reopen = 1;
            action = ", reopening logs";
            break;

        /* these signals only produce a log line in a worker */
        case ngx_signal_value(NGX_RECONFIGURE_SIGNAL):
        case ngx_signal_value(NGX_CHANGEBIN_SIGNAL):
        case SIGIO:
            action = ", ignoring";
            break;
        }

        break;
    }

    ngx_log_error(NGX_LOG_NOTICE, ngx_cycle->log, 0,
                  "signal %d (%s) received%s", signo, sig->signame, action);

    if (ignore) {
        ngx_log_error(NGX_LOG_CRIT, ngx_cycle->log, 0,
                      "the changing binary signal is ignored: "
                      "you should shutdown or terminate "
                      "before either old or new binary's process");
    }

    if (signo == SIGCHLD) {// on SIGCHLD, collect child status via ngx_process_get_status()
        ngx_process_get_status();
    }

    /* restore the errno value saved on entry */
    ngx_set_errno(err);
}

2.3、信号产生方式

信号的产生方式有两种:一种是nginx自身发起信号,一种是通过kill命令行发送。针对平滑升级来说,我们通过kill方式发送USR2信号给master进程,因此上面的流程会进入:

/* signal received for a smooth upgrade (change binary) */
case ngx_signal_value(NGX_CHANGEBIN_SIGNAL):
	if (getppid() > 1 || ngx_new_binary > 0) {

		/*
		 * Ignore the signal in the new binary if its parent is
		 * not the init process, i.e. the old binary's process
		 * is still running.  Or ignore the signal in the old binary's
		 * process if the new binary's process is already running.
		 */

		action = ", ignoring";
		ignore = 1;
		break;
	}

	ngx_change_binary = 1;// only the flag is set here; the work is done outside the handler
	action = ", changing binary"; 
	break;

 此时退出该函数表示,信号中断处理函数已经结束,那么master进程应该如何处理呢?

三、信号事件后续处理

我们在《菜鸟学习Nginx之启动流程2》已经说明了,master进程会阻塞在sigsuspend函数这里,当有信号产生时,进程会被中断,转而调用信号处理函数。当信号处理函数结束后sigsuspend阻塞函数返回,执行后续代码流程。这段流程需要谨记于心。

/**
         * ngx_change_binary is set by the signal handler when the
         * change-binary signal (USR2 in this article) was received:
         * clear the flag, start the new binary and remember its pid
         * in ngx_new_binary.
         */
        if (ngx_change_binary)
        {
            ngx_change_binary = 0;
            ngx_log_error(NGX_LOG_NOTICE, cycle->log, 0, "changing binary");
            ngx_new_binary = ngx_exec_new_binary(cycle, ngx_argv);
        }

进入ngx_exec_new_binary函数:

/*
 * Start a new process running the binary at argv[0], handing it the
 * descriptors of the current listening sockets through the environment.
 *
 * Steps, in order:
 *   1. build the environment array with ngx_set_environment();
 *   2. append a variable of the form NGINX_VAR "=" followed by "fd;" for
 *      every element of cycle->listening;
 *   3. rename the pid file (ccf->pid) to ccf->oldpid;
 *   4. launch the new process with ngx_execute(); if that fails, rename
 *      the pid file back.
 *
 * cycle: current cycle; supplies the log, the listening array and the
 *        core configuration.
 * argv:  argument vector for the new process; argv[0] is also used as the
 *        path to execute.
 *
 * Returns the pid reported by ngx_execute(), or NGX_INVALID_PID when the
 * environment or buffer allocation fails, when the pid file cannot be
 * renamed, or when ngx_execute() itself fails.  env and var are freed
 * before returning on every path that allocated them.
 */
ngx_pid_t
ngx_exec_new_binary(ngx_cycle_t *cycle, char *const *argv)
{
    char             **env, *var;
    u_char            *p;
    ngx_uint_t         i, n;
    ngx_pid_t          pid;
    ngx_exec_ctx_t     ctx;
    ngx_core_conf_t   *ccf;
    ngx_listening_t   *ls;

    ngx_memzero(&ctx, sizeof(ngx_exec_ctx_t));

    ctx.path = argv[0];
    ctx.name = "new binary process";
    ctx.argv = argv;

    /*
     * NOTE(review): n = 2 presumably asks ngx_set_environment() to leave
     * room for two extra entries (the variable below and SPARE) and n is
     * presumably updated to the index of the first free slot -- confirm
     * against ngx_set_environment().
     */
    n = 2;
    env = ngx_set_environment(cycle, &n);
    if (env == NULL) {
        return NGX_INVALID_PID;
    }

    /* room for the name, one "fd;" item per listening socket, and slack */
    var = ngx_alloc(sizeof(NGINX_VAR)
                    + cycle->listening.nelts * (NGX_INT32_LEN + 1) + 2,
                    cycle->log);
    if (var == NULL) {
        ngx_free(env);
        return NGX_INVALID_PID;
    }

    /*
     * sizeof(NGINX_VAR) counts the literal's terminating NUL, which is
     * exactly the length of NGINX_VAR "=" without its own NUL.
     */
    p = ngx_cpymem(var, NGINX_VAR "=", sizeof(NGINX_VAR));

    /* append every listening descriptor as "<fd>;" */
    ls = cycle->listening.elts;
    for (i = 0; i < cycle->listening.nelts; i++) {
        p = ngx_sprintf(p, "%ud;", ls[i].fd);
    }

    *p = '\0';

    env[n++] = var;

#if (NGX_SETPROCTITLE_USES_ENV)

    /* allocate the spare 300 bytes for the new binary process title */

    env[n++] = "SPARE=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
               "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
               "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
               "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
               "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX";

#endif

    /* the environment array is NULL-terminated */
    env[n] = NULL;

#if (NGX_DEBUG)
    {
    char  **e;
    for (e = env; *e; e++) {
        ngx_log_debug1(NGX_LOG_DEBUG_CORE, cycle->log, 0, "env: %s", *e);
    }
    }
#endif

    ctx.envp = (char *const *) env;

    ccf = (ngx_core_conf_t *) ngx_get_conf(cycle->conf_ctx, ngx_core_module);
    // rename the current pid file to the "old" pid file name before starting the new binary
    if (ngx_rename_file(ccf->pid.data, ccf->oldpid.data) == NGX_FILE_ERROR) {
        ngx_log_error(NGX_LOG_ALERT, cycle->log, ngx_errno,
                      ngx_rename_file_n " %s to %s failed "
                      "before executing new binary process \"%s\"",
                      ccf->pid.data, ccf->oldpid.data, argv[0]);

        ngx_free(env);
        ngx_free(var);

        return NGX_INVALID_PID;
    }

    pid = ngx_execute(cycle, &ctx);/* spawn the new process, which calls execve() */

    /* launching failed: put the pid file back under its original name */
    if (pid == NGX_INVALID_PID) {
        if (ngx_rename_file(ccf->oldpid.data, ccf->pid.data)
            == NGX_FILE_ERROR)
        {
            ngx_log_error(NGX_LOG_ALERT, cycle->log, ngx_errno,
                          ngx_rename_file_n " %s back to %s failed after "
                          "an attempt to execute new binary process \"%s\"",
                          ccf->oldpid.data, ccf->pid.data, argv[0]);
        }
    }

    ngx_free(env);
    ngx_free(var);

    return pid;
}
/*
 * Spawn a detached process that runs ngx_execute_proc() with ctx as its
 * data and ctx->name as its name.
 *
 * Returns whatever ngx_spawn_process() returns.
 */
ngx_pid_t
ngx_execute(ngx_cycle_t *cycle, ngx_exec_ctx_t *ctx)
{
    ngx_pid_t  child;

    child = ngx_spawn_process(cycle, ngx_execute_proc, ctx, ctx->name,
                              NGX_PROCESS_DETACHED);

    return child;
}

/*
 * Process body passed to ngx_spawn_process() by ngx_execute(): replace the
 * current process image with the program described by the exec context.
 *
 * data points to an ngx_exec_ctx_t holding path, argv and envp.
 *
 * execve() does not return on success, so reaching the code after it
 * means the call failed; the failure is logged and the process exits
 * with status 1.
 */
static void
ngx_execute_proc(ngx_cycle_t *cycle, void *data)
{
    int              rc;
    ngx_exec_ctx_t  *exec_ctx;

    exec_ctx = data;

    rc = execve(exec_ctx->path, exec_ctx->argv, exec_ctx->envp);

    if (rc == -1) {
        ngx_log_error(NGX_LOG_ALERT, cycle->log, ngx_errno,
                      "execve() failed while executing %s \"%s\"",
                      exec_ctx->name, exec_ctx->path);
    }

    exit(1);
}

四、总结

这里有一个问题:新进程是如何继续监听原先的listening socket的呢?主要通过以下两步:

1、启动平滑升级时,旧master会把监听socket文件句柄(数字)写到环境变量中。

2、旧master在fork出子进程后,子进程执行exec函数,替换为新版本程序;新程序启动后,再从环境变量中读取出listen socket的文件句柄并继续使用。这里需要提示:父子进程的地址空间并不共享,但fork出的子进程会继承父进程已打开的文件描述符,并且只要未设置close-on-exec标志,这些描述符在exec之后仍保持打开,所以新进程可以直接操作同一个listen socket。

Nginx这种升级方式是比较经典的升级方式,为日后的工作提供了指导方向。但是我个人感觉Nginx的升级方式有点不完美,不完美之处在于需要敲三次命令行才能完成升级。