一 序 

   之前在整理 Object 的时候,关于 LRU 淘汰的部分没有看代码,本篇补上。

redisobject中除了type、encoding、ptr和refcount属性外,还有一个lru属性用来计算空转时长。OBJECT IDLETIME命令可以打印出给定键的空转时长,是用当前时间减去键的lru时间计算得出的。OBJECT IDLETIME命令是特殊的,这个命令在访问键的对象时,不会修改值对象的lru属性。

键的空转时长还有一个作用,如果服务器打开了maxmemory选项,并且服务器用于回收内存的算法是volatile-lru 或者 allkeys-lru,那么当服务器占用的内存数超过了maxmemory选项所设置的上限值时,空转时长较高的那部分键会优先被服务器释放,从而回收内存。

  这里没有展开,等后面整理淘汰算法的时候再看。
 

二  processCommand

Redis 每次为客户端执行一条命令之前,都会检测已使用的内存是否超额;如果超额,就进行数据淘汰。这个检测发生在函数 processCommand 中,源码在 server.c。

/* If this function gets called we already read a whole
 * command, arguments are in the client argv/argc fields.
 * processCommand() execute the command or prepare the
 * server for a bulk read from the client.
 *
 * The function runs a fixed sequence of admission checks (command lookup,
 * arity, authentication, cluster redirection, maxmemory, persistence errors,
 * replication constraints, Pub/Sub context, loading state, slow Lua script)
 * and only then either queues the command inside MULTI or executes it.
 * The order of the checks matters and must be preserved.
 *
 * If C_OK is returned the client is still alive and valid and
 * other operations can be performed by the caller. Otherwise
 * if C_ERR is returned the client was destroyed (i.e. after QUIT). */
int processCommand(client *c) {
    /* The QUIT command is handled separately. Normal command procs will
     * go through checking for replication and QUIT will cause trouble
     * when FORCE_REPLICATION is enabled and would be implemented in
     * a regular command proc. */
    if (!strcasecmp(c->argv[0]->ptr,"quit")) {
        addReply(c,shared.ok);
        /* Mark the client to be closed once the reply has been sent. */
        c->flags |= CLIENT_CLOSE_AFTER_REPLY;
        return C_ERR;
    }

    /* Now lookup the command and check ASAP about trivial error conditions
     * such as wrong arity, bad command name and so forth. */
    c->cmd = c->lastcmd = lookupCommand(c->argv[0]->ptr);
    if (!c->cmd) {
        /* Unknown command. */
        flagTransaction(c);
        addReplyErrorFormat(c,"unknown command '%s'",
            (char*)c->argv[0]->ptr);
        return C_OK;
    } else if ((c->cmd->arity > 0 && c->cmd->arity != c->argc) ||
               (c->argc < -c->cmd->arity)) {
        /* Wrong number of arguments: a positive arity must match argc
         * exactly, a negative arity is a minimum of -arity arguments. */
        flagTransaction(c); /* If inside a transaction, mark it as failed. */
        addReplyErrorFormat(c,"wrong number of arguments for '%s' command",
            c->cmd->name);
        return C_OK;
    }

    /* Check if the user is authenticated */
    if (server.requirepass && !c->authenticated && c->cmd->proc != authCommand)
    {
        flagTransaction(c); /* If inside a transaction, mark it as failed. */
        addReply(c,shared.noautherr);
        return C_OK;
    }

    /* If cluster is enabled perform the cluster redirection here.
     * However we don't perform the redirection if:
     * 1) The sender of this command is our master.
     * 2) The command has no key arguments. */
    if (server.cluster_enabled &&
        !(c->flags & CLIENT_MASTER) &&
        !(c->flags & CLIENT_LUA &&
          server.lua_caller->flags & CLIENT_MASTER) &&
        !(c->cmd->getkeys_proc == NULL && c->cmd->firstkey == 0 &&
          c->cmd->proc != execCommand))
    {
        int hashslot;
        int error_code;
        /* Ask the cluster layer which node should serve this query. */
        clusterNode *n = getNodeByQuery(c,c->cmd,c->argv,c->argc,
                                        &hashslot,&error_code);
        /* No node returned, or the node is not this instance. */
        if (n == NULL || n != server.cluster->myself) {
            if (c->cmd->proc == execCommand) {
                /* EXEC itself is being redirected: drop the transaction. */
                discardTransaction(c);
            } else {
                /* Otherwise mark the transaction (if any) as failed. */
                flagTransaction(c);
            }
            /* Reply to the client with the redirection / error. */
            clusterRedirectClient(c,n,hashslot,error_code);
            return C_OK;
        }
    }

    /* Handle the maxmemory directive.
     *
     * First we try to free some memory if possible (if there are volatile
     * keys in the dataset). If there are not the only thing we can do
     * is returning an error. */
    if (server.maxmemory) {
        /* If we are over the limit, try to evict keys according to the
         * configured maxmemory policy. */
        int retval = freeMemoryIfNeeded();
        /* freeMemoryIfNeeded may flush slave output buffers. This may result
         * into a slave, that may be the active client, to be freed. */
        if (server.current_client == NULL) return C_ERR;

        /* It was impossible to free enough memory, and the command the client
         * is trying to execute is denied during OOM conditions? Error. */
        if ((c->cmd->flags & CMD_DENYOOM) && retval == C_ERR) {
            flagTransaction(c); /* Mark the transaction (if any) as failed. */
            addReply(c, shared.oomerr);
            return C_OK;
        }
    }

    /* Don't accept write commands if there are problems persisting on disk
     * and if this is a master instance. */
    if (((server.stop_writes_on_bgsave_err &&
          server.saveparamslen > 0 &&
          server.lastbgsave_status == C_ERR) ||
          server.aof_last_write_status == C_ERR) &&
        server.masterhost == NULL &&
        (c->cmd->flags & CMD_WRITE ||
         c->cmd->proc == pingCommand))
    {
        flagTransaction(c); /* Mark the transaction (if any) as failed. */
        /* If the last AOF write succeeded, the condition above can only be
         * true because of the BGSAVE error: report that one. Otherwise
         * report the AOF write error. */
        if (server.aof_last_write_status == C_OK)
            addReply(c, shared.bgsaveerr);
        else
            addReplySds(c,
                sdscatprintf(sdsempty(),
                "-MISCONF Errors writing to the AOF file: %s\r\n",
                strerror(server.aof_last_write_errno)));
        return C_OK;
    }

    /* Don't accept write commands if there are not enough good slaves and
     * user configured the min-slaves-to-write option. */
    if (server.masterhost == NULL &&
        server.repl_min_slaves_to_write &&
        server.repl_min_slaves_max_lag &&
        c->cmd->flags & CMD_WRITE &&
        server.repl_good_slaves_count < server.repl_min_slaves_to_write)
    {
        flagTransaction(c); /* Mark the transaction (if any) as failed. */
        addReply(c, shared.noreplicaserr);
        return C_OK;
    }

    /* Don't accept write commands if this is a read only slave. But
     * accept write commands if this is our master. */
    if (server.masterhost && server.repl_slave_ro &&
        !(c->flags & CLIENT_MASTER) &&
        c->cmd->flags & CMD_WRITE)
    {
        addReply(c, shared.roslaveerr);
        return C_OK;
    }

    /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
    if (c->flags & CLIENT_PUBSUB &&
        c->cmd->proc != pingCommand &&
        c->cmd->proc != subscribeCommand &&
        c->cmd->proc != unsubscribeCommand &&
        c->cmd->proc != psubscribeCommand &&
        c->cmd->proc != punsubscribeCommand) {
        addReplyError(c,"only (P)SUBSCRIBE / (P)UNSUBSCRIBE / PING / QUIT allowed in this context");
        return C_OK;
    }

    /* Only allow INFO and SLAVEOF when slave-serve-stale-data is no and
     * we are a slave with a broken link with master. */
    if (server.masterhost && server.repl_state != REPL_STATE_CONNECTED &&
        server.repl_serve_stale_data == 0 &&
        !(c->cmd->flags & CMD_STALE))
    {
        flagTransaction(c);
        addReply(c, shared.masterdownerr);
        return C_OK;
    }

    /* Loading DB? Return an error if the command has not the
     * CMD_LOADING flag. */
    if (server.loading && !(c->cmd->flags & CMD_LOADING)) {
        addReply(c, shared.loadingerr);
        return C_OK;
    }

    /* Lua script too slow? Only allow a limited number of commands:
     * AUTH, REPLCONF, and two-argument SHUTDOWN / SCRIPT calls whose
     * second argument starts with 'n' / 'k' respectively (e.g.
     * SHUTDOWN NOSAVE and SCRIPT KILL).
     *
     * The byte is cast to unsigned char before tolower(): passing a
     * negative value (a client-supplied byte >= 0x80 where char is signed)
     * to a <ctype.h> function is undefined behavior. */
    if (server.lua_timedout &&
          c->cmd->proc != authCommand &&
          c->cmd->proc != replconfCommand &&
        !(c->cmd->proc == shutdownCommand &&
          c->argc == 2 &&
          tolower((unsigned char)((char*)c->argv[1]->ptr)[0]) == 'n') &&
        !(c->cmd->proc == scriptCommand &&
          c->argc == 2 &&
          tolower((unsigned char)((char*)c->argv[1]->ptr)[0]) == 'k'))
    {
        flagTransaction(c);
        addReply(c, shared.slowscripterr);
        return C_OK;
    }

    /* Exec the command */
    if (c->flags & CLIENT_MULTI &&
        c->cmd->proc != execCommand && c->cmd->proc != discardCommand &&
        c->cmd->proc != multiCommand && c->cmd->proc != watchCommand)
    {
        /* Inside MULTI every command other than EXEC, DISCARD, MULTI and
         * WATCH is queued instead of being executed. */
        queueMultiCommand(c);
        addReply(c,shared.queued);
    } else {
        /* Execute the command right away. */
        call(c,CMD_CALL_FULL);
        /* Remember the global replication offset after this command. */
        c->woff = server.master_repl_offset;
        /* If some key became ready, serve the clients blocked on lists. */
        if (listLength(server.ready_keys))
            handleClientsBlockedOnLists();
    }
    return C_OK;
}

可以看到,在命令处理函数的执行过程中会检测内存使用量;如果检测到内存使用超额,就会触发数据淘汰机制。我们来看看触发淘汰机制的函数 freeMemoryIfNeeded(),源码同样在 server.c。

/* Try to bring memory usage back under server.maxmemory by evicting keys
 * according to server.maxmemory_policy.
 *
 * The memory taken by slave output buffers and by the AOF buffers is not
 * counted as used memory.
 *
 * Returns C_OK if clients are paused, if we are already under the limit, or
 * if enough memory was freed. Returns C_ERR if the policy forbids eviction
 * or if a full pass over all databases could not free any key. */
int freeMemoryIfNeeded(void) {
    size_t mem_used, mem_tofree, mem_freed;
    int slaves = listLength(server.slaves);
    mstime_t latency, eviction_latency;

    /* When clients are paused the dataset should be static not just from the
     * POV of clients not being able to write, but also from the POV of
     * expires and evictions of keys not being performed. */
    if (clientsArePaused()) return C_OK;

    /* Remove the size of slaves output buffers and AOF buffer from the
     * count of used memory. */
    mem_used = zmalloc_used_memory();
    if (slaves) {
        listIter li;
        listNode *ln;

        listRewind(server.slaves,&li);
        /* Walk the list of slaves and deduct each output buffer. */
        while((ln = listNext(&li))) {
            client *slave = listNodeValue(ln);
            unsigned long obuf_bytes = getClientOutputBufferMemoryUsage(slave);
            /* Clamp at zero: mem_used is unsigned and must not wrap. */
            if (obuf_bytes > mem_used)
                mem_used = 0;
            else
                mem_used -= obuf_bytes;
        }
    }
    if (server.aof_state != AOF_OFF) {
        /* Deduct the AOF buffer and the AOF rewrite buffer. As for the
         * slave buffers above, clamp at zero: an unchecked subtraction on
         * size_t would wrap around to a huge value and make the server
         * believe it is far over the limit. */
        size_t aof_bytes = sdslen(server.aof_buf) + aofRewriteBufferSize();
        mem_used = (mem_used > aof_bytes) ? mem_used - aof_bytes : 0;
    }

    /* Check if we are over the memory limit. */
    if (mem_used <= server.maxmemory) return C_OK;

    /* Over the limit, but the policy does not allow evicting anything. */
    if (server.maxmemory_policy == MAXMEMORY_NO_EVICTION)
        return C_ERR; /* We need to free memory, but policy forbids. */

    /* Compute how much memory we need to free. */
    mem_tofree = mem_used - server.maxmemory;
    mem_freed = 0;
    /* Start measuring the latency of the whole eviction cycle. */
    latencyStartMonitor(latency);
    /* Keep evicting until enough memory has been released. Each iteration
     * tries to evict one key from every database. */
    while (mem_freed < mem_tofree) {
        int j, k, keys_freed = 0;

        for (j = 0; j < server.dbnum; j++) {
            long bestval = 0; /* just to prevent warning */
            sds bestkey = NULL;
            dictEntry *de;
            redisDb *db = server.db+j;
            dict *dict;

            if (server.maxmemory_policy == MAXMEMORY_ALLKEYS_LRU ||
                server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM)
            {
                /* allkeys-* policies: candidates come from the whole
                 * key space. */
                dict = server.db[j].dict;
            } else {
                /* Other policies: candidates come from the expires dict,
                 * i.e. only keys with a timeout. */
                dict = server.db[j].expires;
            }
            /* Nothing to sample in this database. */
            if (dictSize(dict) == 0) continue;

            /* volatile-random and allkeys-random policy */
            if (server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM ||
                server.maxmemory_policy == MAXMEMORY_VOLATILE_RANDOM)
            {
                /* Just pick a random key. */
                de = dictGetRandomKey(dict);
                bestkey = dictGetKey(de);
            }

            /* volatile-lru and allkeys-lru policy */
            else if (server.maxmemory_policy == MAXMEMORY_ALLKEYS_LRU ||
                server.maxmemory_policy == MAXMEMORY_VOLATILE_LRU)
            {
                /* Per-database pool of eviction candidates. */
                struct evictionPoolEntry *pool = db->eviction_pool;

                /* Repeat until a key that still exists has been picked. */
                while(bestkey == NULL) {
                    /* Refill the pool with samples taken from 'dict'. */
                    evictionPoolPopulate(dict, db->dict, db->eviction_pool);
                    /* Go backward from best to worst element to evict. */
                    for (k = MAXMEMORY_EVICTION_POOL_SIZE-1; k >= 0; k--) {
                        /* Skip empty slots. */
                        if (pool[k].key == NULL) continue;
                        /* Look the candidate up before its name is freed. */
                        de = dictFind(dict,pool[k].key);

                        /* Remove the entry from the pool. */
                        sdsfree(pool[k].key);
                        /* Shift all elements on its right to left. */
                        memmove(pool+k,pool+k+1,
                            sizeof(pool[0])*(MAXMEMORY_EVICTION_POOL_SIZE-k-1));
                        /* Clear the element on the right which is empty
                         * since we shifted one position to the left.  */
                        pool[MAXMEMORY_EVICTION_POOL_SIZE-1].key = NULL;
                        pool[MAXMEMORY_EVICTION_POOL_SIZE-1].idle = 0;

                        /* If the key exists, is our pick. Otherwise it is
                         * a ghost and we need to try the next element. */
                        if (de) {
                            bestkey = dictGetKey(de);
                            break;
                        } else {
                            /* Ghost... */
                            continue;
                        }
                    }
                    /* If every pooled candidate was a ghost, the outer loop
                     * samples again until an evictable key is found. */
                }
            }

            /* volatile-ttl */
            else if (server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL) {
                /* Sample maxmemory_samples random keys and keep the one
                 * with the smallest stored value. */
                for (k = 0; k < server.maxmemory_samples; k++) {
                    sds thiskey;
                    long thisval;

                    de = dictGetRandomKey(dict);
                    thiskey = dictGetKey(de);
                    thisval = (long) dictGetVal(de);

                    /* Expire sooner (minor expire unix timestamp) is better
                     * candidate for deletion */
                    if (bestkey == NULL || thisval < bestval) {
                        bestkey = thiskey;
                        bestval = thisval;
                    }
                }
            }

            /* Finally remove the selected key. */
            if (bestkey) {
                long long delta;

                robj *keyobj = createStringObject(bestkey,sdslen(bestkey));
                /* Propagate the removal before actually deleting the key. */
                propagateExpire(db,keyobj);
                /* We compute the amount of memory freed by dbDelete() alone.
                 * It is possible that actually the memory needed to propagate
                 * the DEL in AOF and replication link is greater than the one
                 * we are freeing removing the key, but we can't account for
                 * that otherwise we would never exit the loop.
                 *
                 * AOF and Output buffer memory will be freed eventually so
                 * we only care about memory used by the key space. */
                delta = (long long) zmalloc_used_memory();
                /* Time the deletion on its own. */
                latencyStartMonitor(eviction_latency);
                dbDelete(db,keyobj);
                latencyEndMonitor(eviction_latency);
                latencyAddSampleIfNeeded("eviction-del",eviction_latency);
                /* Do not count the nested deletion time twice in the
                 * outer "eviction-cycle" measurement. */
                latencyRemoveNestedEvent(latency,eviction_latency);
                /* delta is now (used before) - (used after). */
                delta -= (long long) zmalloc_used_memory();
                mem_freed += delta;
                server.stat_evictedkeys++;
                notifyKeyspaceEvent(NOTIFY_EVICTED, "evicted",
                    keyobj, db->id);
                decrRefCount(keyobj);
                keys_freed++;

                /* When the memory to free starts to be big enough, we may
                 * start spending so much time here that is impossible to
                 * deliver data to the slaves fast enough, so we force the
                 * transmission here inside the loop. */
                if (slaves) flushSlavesOutputBuffers();
            }
        }
        if (!keys_freed) {
            /* A full pass over all databases evicted nothing: give up. */
            latencyEndMonitor(latency);
            latencyAddSampleIfNeeded("eviction-cycle",latency);
            return C_ERR; /* nothing to free... */
        }
    }
    /* Record the latency of the whole eviction cycle. */
    latencyEndMonitor(latency);
    latencyAddSampleIfNeeded("eviction-cycle",latency);
    return C_OK;
}

其他:

Redis的LRU算法不是一个严格的LRU实现。这意味着Redis不一定能选出最佳的回收候选键,也就是最久未被访问的那个键。相反,Redis 执行的是一个近似的LRU算法:先采样一小部分键,然后在采样到的键中回收最合适(空转时间最长,即最久未被访问)的那个。

然而,从Redis3.0开始,算法被改进为维护一个回收候选键池。这改善了算法的性能,使得更接近于真实的LRU算法的行为。Redis的LRU算法有一点很重要,你可以调整算法的精度,通过改变每次回收时检查的采样数量。

这个参数可以通过如下配置指令设置:

maxmemory-samples 5

总结:借用二代的话,就是从直接删除来看,redis不能当做数据库来用。