二十一:redis主从复制实现原理

主从复制实现原理

本文使用源码redis 2.2

主从结构有两种结构:

1.一主多从
2. 级联结构

java redis 主从复制 redis的主从复制,怎么实现_java redis 主从复制

复制方式分为:

1.全量复制
2. 增量同步

全量复制: 发生在slave初始阶段及slave断线重新连接到master
增量同步: 当全量复制完成后,主服务每执行一个写命令就会向从服务发送命令。

全量复制

1.当slave启动连接到master发送sync命令
2.master接收到sync后,开始执行bgsave命令生成RDB
3.master后续写入命令存入到缓冲区
4.slave与master建立连接,读取master的rdb数据
5.slave读取数据完成,清空缓存加载rdb
6.准备接受master的后续写命令

java redis 主从复制 redis的主从复制,怎么实现_java redis 主从复制_02

slave端

--------------------------slave-------------------------------------------------------

main

/**
 * file:      redis.c
 * function:  main()
 *
 * Abridged excerpt of main(): only the configuration steps that matter for
 * master/slave replication are shown.
 **/
int main(int argc, char **argv) {
    /* Started as `redis-server /path/redis.conf`: exactly one extra
     * argument, which is handed to loadServerConfig() as the file name. */
    const int has_config_arg = (argc == 2);

    /* Set up the server configuration; no config file has been read yet. */
    initServerConfig();

    if (has_config_arg) {
        loadServerConfig(argv[1]);
    }
}

loadServerConfig

/**
 * file:      config.c
 * function:  loadServerConfig()
 *
 * Abridged excerpt: only the handling of the `slaveof` directive is shown.
 *
 * NOTE(review): `argv` and `argc` are not declared in this excerpt;
 * presumably they hold the tokens of the config line currently being
 * parsed -- confirm against the full config.c.
 **/
void loadServerConfig(char *filename){
    /* `slaveof <host> <port>` (three tokens, directive name matched
     * case-insensitively): remember the master's host and port.
     * The original note says `slaveof` is the pre-5.0 spelling and that
     * 5.0+ uses `replicaof`. */
    if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
        server.masterhost = sdsnew(argv[1]);
        server.masterport = atoi(argv[2]);
        /* Flag the replication state as "must connect"; replicationCron()
         * tests for exactly this value before calling syncWithMaster(). */
        server.replstate = REDIS_REPL_CONNECT;
    }
}

replicationCron

/**
 * file:      replication.c
 * function:  replicationCron()
 *
 * Periodic replication task. According to the original note it is invoked
 * from the serverCron timer, typically when the slave has to (re)connect to
 * its master or a transfer failed.
 **/
void replicationCron(void) {
    /* Nothing to do unless the state is REDIS_REPL_CONNECT, the value that
     * loadServerConfig() stores when it sees a `slaveof` directive. */
    if (server.replstate != REDIS_REPL_CONNECT) {
        return;
    }

    redisLog(REDIS_NOTICE,"Connecting to MASTER...");

    /* syncWithMaster() connects, sends SYNC and registers the handler that
     * receives the RDB payload. */
    if (syncWithMaster() != REDIS_OK) {
        return;
    }
    redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync started: SYNC sent");
}

syncWithMaster

/**
 * file:      replication.c
 * function:  syncWithMaster()
 *
 * Slave side: connect to the configured master, optionally authenticate,
 * send SYNC, create the temp file that will receive the RDB payload and
 * register readSyncBulkPayload() as the readable handler on the socket.
 *
 * Returns REDIS_OK once the transfer has been set up (server.replstate is
 * then REDIS_REPL_TRANSFER), REDIS_ERR otherwise. On every error path the
 * master socket is closed; if the temp file was already created it is
 * closed and removed as well.
 **/
int syncWithMaster(void) {
    char buf[1024], tmpfile[256], authcmd[1024];
    int maxtries = 5;       /* attempts at creating the temp RDB file */
    int saved_errno = 0;    /* errno of the last failed open() */
    int dfd = -1;           /* descriptor of the temp RDB file */
    int fd;                 /* socket connected to the master */

    fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
    if (fd == -1) {
        redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
            strerror(errno));
        return REDIS_ERR;
    }

    /* Authenticate first when a master password is configured. */
    if (server.masterauth) {
        int authlen = snprintf(authcmd,sizeof(authcmd),"AUTH %s\r\n",
            server.masterauth);

        /* Send exactly what was formatted. Deriving the length from the
         * password would read past authcmd if snprintf() truncated. */
        if (authlen < 0 || (size_t)authlen >= sizeof(authcmd)) {
            redisLog(REDIS_WARNING,
                "Unable to AUTH to MASTER: masterauth is too long");
            close(fd);
            return REDIS_ERR;
        }
        /* Write the AUTH command (timeout argument: 5). */
        if (syncWrite(fd,authcmd,authlen,5) == -1) {
            /* Log before close(): close() may overwrite errno. */
            redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
                strerror(errno));
            close(fd);
            return REDIS_ERR;
        }
        /* Read the AUTH reply line (timeout argument: 3600). */
        if (syncReadLine(fd,buf,sizeof(buf),3600) == -1) {
            redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
                strerror(errno));
            close(fd);
            return REDIS_ERR;
        }
        /* Anything that is not a '+' status reply is treated as a failed
         * authentication. */
        if (buf[0] != '+') {
            redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
            close(fd);
            return REDIS_ERR;
        }
    }

    /* Issue the SYNC command. */
    if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
        redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
            strerror(errno));
        close(fd);
        return REDIS_ERR;
    }

    /* Create the temp file for the payload. O_EXCL makes open() fail when
     * the name already exists; the name embeds the current second, so wait
     * one second and retry, up to maxtries times. */
    while (maxtries--) {
        snprintf(tmpfile,sizeof(tmpfile),
            "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
        dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
        if (dfd != -1) break;
        saved_errno = errno;
        sleep(1);
    }
    if (dfd == -1) {
        redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(saved_errno));
        close(fd);
        return REDIS_ERR;
    }

    /* Receive the payload through the event loop: readSyncBulkPayload()
     * is called whenever the master socket becomes readable. */
    if (aeCreateFileEvent(server.el, fd, AE_READABLE, readSyncBulkPayload, NULL)
            == AE_ERR)
    {
        redisLog(REDIS_WARNING,"Can't create readable event for SYNC");
        close(fd);
        /* The temp file is not yet recorded in `server`, so nothing else
         * would ever release it: close and remove it here. */
        close(dfd);
        unlink(tmpfile);
        return REDIS_ERR;
    }

    /* Record the transfer state. repl_transfer_left == -1 tells
     * readSyncBulkPayload() that the bulk length line is still unread. */
    server.replstate = REDIS_REPL_TRANSFER;
    server.repl_transfer_left = -1;
    server.repl_transfer_s = fd;
    server.repl_transfer_fd = dfd;
    server.repl_transfer_lastio = time(NULL);
    server.repl_transfer_tmpfile = zstrdup(tmpfile);
    return REDIS_OK;
}

readSyncBulkPayload

/**
 * file:      replication.c
 * function:  readSyncBulkPayload()
 *
 * Slave side: readable-event handler that receives the RDB payload from the
 * master and stores it in the temp file opened by syncWithMaster().
 *
 * The first successful call parses the "$<length>" header line; later calls
 * each copy up to sizeof(buf) bytes from the socket to the temp file. When
 * the last byte has arrived the temp file is renamed to server.dbfilename,
 * the current dataset is emptied, the RDB is loaded and the socket becomes
 * the `server.master` client.
 *
 * el, privdata and mask are unused; fd is the socket to read from.
 **/
/* Asynchronously read the SYNC payload we receive from a master */
void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) {
    char buf[4096];
    ssize_t nread, readlen;
    REDIS_NOTUSED(el);
    REDIS_NOTUSED(privdata);
    REDIS_NOTUSED(mask);

    /* repl_transfer_left == -1 (set by syncWithMaster) means the bulk
     * length header has not been read yet. */
    if (server.repl_transfer_left == -1) {
        /* Read one line from the master; note that only 1024 bytes of the
         * 4096-byte buffer are offered to syncReadLine(). */
        if (syncReadLine(fd,buf,1024,3600) == -1) {
            redisLog(REDIS_WARNING,
                "I/O error reading bulk count from MASTER: %s",
                strerror(errno));
            replicationAbortSyncTransfer();
            return;
        }
        /* A line starting with '-' is an error reply: abort the transfer. */
        if (buf[0] == '-') {
            redisLog(REDIS_WARNING,
                "MASTER aborted replication with an error: %s",
                buf+1);
            replicationAbortSyncTransfer();
            return;
        /* Empty line: only refresh the last-I/O timestamp and wait for the
         * next readable event (presumably a keep-alive newline sent by the
         * master -- confirm against the master side). */
        } else if (buf[0] == '\0') {
            server.repl_transfer_lastio = time(NULL);
            return;
        } else if (buf[0] != '$') {
            redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
            replicationAbortSyncTransfer();
            return;
        }
        /* "$<n>": n is the number of payload bytes still to receive. */
        server.repl_transfer_left = strtol(buf+1,NULL,10);
        redisLog(REDIS_NOTICE,
            "MASTER <-> SLAVE sync: receiving %ld bytes from master",
            server.repl_transfer_left);
        return;
    }

    /* Read at most sizeof(buf) bytes, and never more than what is left. */
    readlen = (server.repl_transfer_left < (signed)sizeof(buf)) ?
        server.repl_transfer_left : (signed)sizeof(buf);
    /* Pull the next chunk from the master socket. */
    nread = read(fd,buf,readlen);
    /* Both a read error (-1) and end-of-file (0) abort the transfer. */
    if (nread <= 0) {
        redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
            (nread == -1) ? strerror(errno) : "connection lost");
        replicationAbortSyncTransfer();
        return;
    }
    server.repl_transfer_lastio = time(NULL);
    /* Append the chunk to the temp file; a short write counts as failure. */
    if (write(server.repl_transfer_fd,buf,nread) != nread) {
        redisLog(REDIS_WARNING,"Write error or short write writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
        replicationAbortSyncTransfer();
        return;
    }
    server.repl_transfer_left -= nread;
    /* Whole payload received? */
    if (server.repl_transfer_left == 0) {
        /* Move the temp file into place as the slave's own RDB file. */
        if (rename(server.repl_transfer_tmpfile,server.dbfilename) == -1) {
            redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
            replicationAbortSyncTransfer();
            return;
        }
        /* The transfer itself is complete; what follows is the load. */
        redisLog(REDIS_NOTICE, "MASTER <-> SLAVE sync: Loading DB in memory");
        /* Drop the slave's current dataset before loading the master's. */
        emptyDb();
        /* Before loading the DB into memory we need to delete the readable
         * handler, otherwise it will get called recursively since
         * rdbLoad() will call the event loop to process events from time to
         * time for non blocking loading. */
        /* Unregister this handler from the master socket. */
        aeDeleteFileEvent(server.el,server.repl_transfer_s,AE_READABLE);
        /* Load the freshly received RDB file. */
        if (rdbLoad(server.dbfilename) != REDIS_OK) {
            redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
            replicationAbortSyncTransfer();
            return;
        }
        /* Release the temp-file name and descriptor, then wrap the master
         * socket in a client flagged REDIS_MASTER so that the master's
         * subsequent write commands arrive through it. */
        zfree(server.repl_transfer_tmpfile);
        close(server.repl_transfer_fd);
        server.master = createClient(server.repl_transfer_s);
        server.master->flags |= REDIS_MASTER;
        server.master->authenticated = 1;
        server.replstate = REDIS_REPL_CONNECTED;
        redisLog(REDIS_NOTICE, "MASTER <-> SLAVE sync: Finished with success");
        /* Rewrite the AOF file now that the dataset changed. */
        if (server.appendonly) rewriteAppendOnlyFileBackground();
    }
}

master

---------------------master-----------------------------------------------------------------

syncCommand

/**
 * file:      replication.c
 * function:  syncCommand()
 *
 * Master side: handler for the SYNC command. Makes sure an RDB snapshot
 * will be produced for the requesting client, then registers the client in
 * server.slaves.
 **/
void syncCommand(redisClient *c) {
    /* A client already flagged as a slave is ignored. */
    if (c->flags & REDIS_SLAVE) {
        return;
    }

    /* A master host is configured for this instance but the link to that
     * master is not in the CONNECTED state: refuse. */
    if (server.masterhost && server.replstate != REDIS_REPL_CONNECTED) {
        addReplyError(c,"Can't SYNC while not connected with my master");
        return;
    }

    /* The client's reply list must be empty at this point; in the
     * "BGSAVE already running" branch below it is replaced wholesale by a
     * copy of another slave's list. */
    if (listLength(c->reply) != 0) {
        addReplyError(c,"SYNC is invalid with pending input");
        return;
    }

    redisLog(REDIS_NOTICE,"Slave ask for synchronization");

    if (server.bgsavechildpid == -1) {
        /* No background save is running: start one for this slave. */
        redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
        if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
            redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
            addReplyError(c,"Unable to perform background save");
            return;
        }
        c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
    } else {
        /* A background save is already in progress. Look for a slave that
         * is already waiting for it to finish. */
        redisClient *donor = NULL;
        listNode *node;
        listIter iter;

        listRewind(server.slaves,&iter);
        for (node = listNext(&iter); node != NULL; node = listNext(&iter)) {
            donor = node->value;
            if (donor->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
        }

        if (node != NULL) {
            /* Found one: take a copy of its reply list and wait for the
             * same BGSAVE. */
            listRelease(c->reply);
            c->reply = listDup(donor->reply);
            c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
            redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
        } else {
            /* No slave is attached to the running BGSAVE (presumably it was
             * started for another reason, e.g. the `save` configuration),
             * so this slave waits for the next one. */
            c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
            redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
        }
    }

    /* Turn the client into a slave and add it to the master's slave list. */
    c->flags |= REDIS_SLAVE;
    c->slaveseldb = 0;
    c->repldbfd = -1;
    listAddNodeTail(server.slaves,c);
}

updateSlavesWaitingBgsave

/**
 * file:      replication.c
 * function:  updateSlavesWaitingBgsave()
 *
 * Master side. According to the original note this is called from
 * backgroundSaveDoneHandler() once serverCron notices that a BGSAVE child
 * has terminated.
 *
 * bgsaveerr is REDIS_OK when the BGSAVE succeeded.
 *
 * - Slaves in WAIT_BGSAVE_END were waiting for the BGSAVE that just ended:
 *   on success the RDB file is opened and sendBulkToSlave() is installed as
 *   their writable handler; on failure they are disconnected.
 * - Slaves in WAIT_BGSAVE_START were waiting for the *next* BGSAVE: they
 *   are moved to WAIT_BGSAVE_END and a new BGSAVE is started for them.
 **/
void updateSlavesWaitingBgsave(int bgsaveerr) {
    listNode *ln;
    int startbgsave = 0;
    listIter li;

    listRewind(server.slaves,&li);
    while((ln = listNext(&li))) {
        redisClient *slave = ln->value;

        if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
            /* Needs a fresh snapshot: remember to start one below. */
            startbgsave = 1;
            slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
        } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
            struct redis_stat buf;

            /* The BGSAVE this slave waited for failed: drop the slave. */
            if (bgsaveerr != REDIS_OK) {
                redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
                freeClient(slave);
                continue;
            }
            /* Open the RDB file read-only and fetch its size. */
            if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
                redis_fstat(slave->repldbfd,&buf) == -1) {
                /* Log before freeClient(): the calls it makes may
                 * overwrite errno. */
                redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
                freeClient(slave);
                continue;
            }
            slave->repldboff = 0;
            slave->repldbsize = buf.st_size;
            slave->replstate = REDIS_REPL_SEND_BULK;
            /* Replace the slave's writable handler with sendBulkToSlave(),
             * which streams the RDB file to it. */
            aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
            if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
                freeClient(slave);
                continue;
            }
        }
    }
    if (startbgsave) {
        if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
            redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
            /* After the loop above the only slaves left in
             * WAIT_BGSAVE_END are those just promoted from
             * WAIT_BGSAVE_START (all others were moved to SEND_BULK or
             * freed). They wait for a BGSAVE that never started, so
             * disconnect them. Testing for WAIT_BGSAVE_START here, as the
             * code used to, can never match. */
            listRewind(server.slaves,&li);
            while((ln = listNext(&li))) {
                redisClient *slave = ln->value;

                if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END)
                    freeClient(slave);
            }
        }
    }
}

增量复制

增量复制:当全量复制完成后,主服务每执行一个写命令就会向从服务发送命令。

以set时判断key是否过期为例,分析增量复制:setGenericCommand —>expireIfNeeded

java redis 主从复制 redis的主从复制,怎么实现_加载_03

master

propagateExpire

/**
 * file:      db.c
 * function:  propagateExpire()
 *
 * Builds a `DEL <key>` command and feeds it to the AOF (when enabled) and
 * to the connected slaves. According to the original note it is called from
 * expireIfNeeded() once a key has been found to be expired.
 **/
void propagateExpire(redisDb *db, robj *key) {
    enum { DEL_ARGC = 2 };
    robj *delcmd[DEL_ARGC] = { createStringObject("DEL",3), key };
    int i;

    /* The vector holds its own reference to the key until the end. */
    incrRefCount(key);

    /* Append-only file enabled: record the DEL there. */
    if (server.appendonly) {
        feedAppendOnlyFile(server.delCommand,db->id,delcmd,DEL_ARGC);
    }
    /* At least one slave attached: replicate the DEL to all of them. */
    if (listLength(server.slaves)) {
        replicationFeedSlaves(server.slaves,db->id,delcmd,DEL_ARGC);
    }

    /* Drop the references taken above, command name first. */
    for (i = 0; i < DEL_ARGC; i++) {
        decrRefCount(delcmd[i]);
    }
}

replicationFeedSlaves

/**
 * file:      replication.c
 * function:  replicationFeedSlaves()
 *
 * Master side: serialise one command as a multi-bulk request and queue it
 * on every slave in `slaves`, except those still in WAIT_BGSAVE_START.
 *
 * slaves  list of slave clients to feed
 * dictid  id of the database the command applies to; a SELECT is queued
 *         first for any slave whose slaveseldb differs
 * argv    command arguments
 * argc    number of entries in argv
 **/
void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
    listNode *ln;
    listIter li;
    int outc = 0, j;
    robj **outv;
    /* Output vector: one "*<argc>" header plus three entries per argument
     * (length prefix, payload, CRLF). The on-stack array is used for up to
     * REDIS_STATIC_ARGS arguments (8 according to the original note; the
     * constant is defined elsewhere), larger commands use the heap. */
    robj *static_outv[REDIS_STATIC_ARGS*3+1];
    robj *lenobj;

    if (argc <= REDIS_STATIC_ARGS) {
        outv = static_outv;
    } else {
        outv = zmalloc(sizeof(robj*)*(argc*3+1));
    }

    /* Multi-bulk header "*<argc>\r\n". Its refcount is forced to 0 here;
     * the incrRefCount() pass below raises it together with all others. */
    lenobj = createObject(REDIS_STRING,
            sdscatprintf(sdsempty(), "*%d\r\n", argc));
    lenobj->refcount = 0;
    outv[outc++] = lenobj;
    /* For every argument: "$<len>\r\n", the argument object itself, and
     * the shared CRLF object. */
    for (j = 0; j < argc; j++) {
        lenobj = createObject(REDIS_STRING,
            sdscatprintf(sdsempty(),"$%lu\r\n",
                (unsigned long) stringObjectLen(argv[j])));
        lenobj->refcount = 0;
        outv[outc++] = lenobj;
        outv[outc++] = argv[j];
        outv[outc++] = shared.crlf;
    }


    /* Take one reference on every entry for the duration of the feed; the
     * matching decrRefCount() pass is at the end of the function. */
    for (j = 0; j < outc; j++) incrRefCount(outv[j]);
    /* Walk the slave list. */
    listRewind(slaves,&li);
    while((ln = listNext(&li))) {
        redisClient *slave = ln->value;

        /* Slaves in WAIT_BGSAVE_START are skipped: the BGSAVE they are
         * waiting for has not started yet. */
        if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;

        /* If the slave's currently selected DB differs from dictid, queue a
         * SELECT first. DBs 0-9 use preallocated shared objects; any other
         * id gets a freshly formatted command object. */
        if (slave->slaveseldb != dictid) {
            robj *selectcmd;

            switch(dictid) {
            case 0: selectcmd = shared.select0; break;
            case 1: selectcmd = shared.select1; break;
            case 2: selectcmd = shared.select2; break;
            case 3: selectcmd = shared.select3; break;
            case 4: selectcmd = shared.select4; break;
            case 5: selectcmd = shared.select5; break;
            case 6: selectcmd = shared.select6; break;
            case 7: selectcmd = shared.select7; break;
            case 8: selectcmd = shared.select8; break;
            case 9: selectcmd = shared.select9; break;
            default:
                selectcmd = createObject(REDIS_STRING,
                    sdscatprintf(sdsempty(),"select %d\r\n",dictid));
                selectcmd->refcount = 0;
                break;
            }
            addReply(slave,selectcmd);
            slave->slaveseldb = dictid;
        }
        /* Queue every fragment of the command on this slave. */
        for (j = 0; j < outc; j++) addReply(slave,outv[j]);
    }
    /* Drop the references taken before the loop and free the vector if it
     * was heap-allocated. */
    for (j = 0; j < outc; j++) decrRefCount(outv[j]);
    if (outv != static_outv) zfree(outv);
}