一、什么是VM

    virtual memory(VM)虚拟内存,在进程视角看到的内存空间,主要是使用磁盘文件扩展内存,使其实际使用的内存空间超过实际的物理空间。原理是在实际物理内存空间不足的情况下,将内存中最近最久未使用的数据(冷数据)序列化到文件中,然后释放这部分数据占用的空间,以腾出空间给其他数据分配空间使用。

二、redis的VM

    Linux中已经有VM,为什么redis还要自己实现一套VM呢?

    主要还是考虑使用方面,Linux在换出数据时将整个页(Page)的数据换出,整个页上可能有很多数据,可能包含key或者value,不方便管理。(可能不正确,后续继续学习Linux)

只将value换出,key保留在内存中,简单可控。

    redis从2.0.0引入VM,而在2.6.0中已经消失了,真是昙花一现。

三、redis的vm原理

3.1 配置

#开关,是否启用vm
vm-enabled no

#swap文件
vm-swap-file /tmp/redis.swap

#最大内存限制,超过则淘汰某些value到文件
vm-max-memory 102400

#swap文件中每页的大小
vm-page-size 32

#swap文件中页的总数
vm-pages 134217728

3.2 VM初始化

  分配一个page_size * pages 的文件, 以及一个(pages+7)/8大小的位图,用于标记swap文件中哪些page已经使用,哪些没有使用

Redis新的vm机制 redis vm_sed

/* Initialize the VM subsystem.
 *
 * Opens the existing swap file (or creates it), resizes it to
 * vm_pages * vm_page_size bytes, allocates a zeroed bitmap with one bit per
 * swap page, and creates the job queues, mutexes and notification pipe used
 * by the threaded I/O code. Finally registers vmThreadedIOCompletedJob as the
 * readable handler of the pipe.
 *
 * The process exits if the swap file cannot be resized or the pipe cannot be
 * created. Lines shown as "..." are elided in this excerpt. */
static void vmInit(void) {
    off_t totsize;
    int pipefds[2];
    size_t stacksize;
    struct flock fl;

    ...

    /* Try to open the old swap file, otherwise create it */
    if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
        server.vm_fp = fopen(server.vm_swap_file,"w+b");
    }
    ...

    server.vm_fd = fileno(server.vm_fp);
   ...

    /* Initialize */
    server.vm_next_page = 0;
    server.vm_near_pages = 0;
    server.vm_stats_used_pages = 0;
    server.vm_stats_swapped_objects = 0;
    server.vm_stats_swapouts = 0;
    server.vm_stats_swapins = 0;
    totsize = server.vm_pages*server.vm_page_size; /* Total swap file size */

    /* Resize the swap file to the configured size. A failure here would
     * leave the file shorter than the bitmap assumes, so treat it as fatal
     * instead of silently ignoring the return value. */
    if (ftruncate(server.vm_fd,totsize) == -1) {
        redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
            strerror(errno));
        exit(1);
    }

    /* Allocate the page bitmap: one bit per page, rounded up to whole
     * bytes, with every page initially marked as free. */
    server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
    memset(server.vm_bitmap,0,(server.vm_pages+7)/8);

    /* Create the job queues shared with the I/O threads */
    /* Initialize threaded I/O (used by Virtual Memory) */
    server.io_newjobs = listCreate();
    server.io_processing = listCreate();
    server.io_processed = listCreate();
    server.io_ready_clients = listCreate();
    pthread_mutex_init(&server.io_mutex,NULL);
    pthread_mutex_init(&server.obj_freelist_mutex,NULL);
    pthread_mutex_init(&server.io_swapfile_mutex,NULL);
    server.io_active_threads = 0;

    /* Create the pipe the I/O threads use to tell the main thread that a
     * job has completed */
    if (pipe(pipefds) == -1) {
        redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
            ,strerror(errno));
        exit(1);
    }
    server.io_ready_pipe_read = pipefds[0];
    server.io_ready_pipe_write = pipefds[1];
    redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
   ...

    /* Register the completion callback */
    /* Listen for events in the threaded I/O pipe */
    if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
        vmThreadedIOCompletedJob, NULL) == AE_ERR)
        oom("creating file event");
}

3.3 value换出

     将对象序列化到文件对应位置,并且设置位图,标志相应页已经使用

Redis新的vm机制 redis vm_位图_02

 

/* Swap 'val' out to the swap file synchronously and record its location in
 * 'key'. Returns REDIS_OK on success, REDIS_ERR when no contiguous run of
 * pages is found or the write fails (in which case nothing is modified by
 * the code shown here). The key must be in memory with a refcount of 1. */
static int vmSwapObjectBlocking(robj *key, robj *val) {
    off_t pages = rdbSavedObjectPages(val,NULL);
    off_t page;

    assert(key->storage == REDIS_VM_MEMORY);
    assert(key->refcount == 1);
  
 /* Look for 'pages' contiguous free pages */
    if (vmFindContiguousPages(&page,pages) == REDIS_ERR) return REDIS_ERR;
  
 /* Serialize the object into the swap file at that page */
    if (vmWriteObjectOnSwap(val,page) == REDIS_ERR) return REDIS_ERR;
  
  /* Record where the value lives and flag the key as swapped */
    key->vm.page = page;
    key->vm.usedpages = pages;
    key->storage = REDIS_VM_SWAPPED;
    key->vtype = val->type;
    
/* Drop the in-memory copy of the value */
    decrRefCount(val); /* Deallocate the object from memory. */

    /* Mark the pages as used in the bitmap */
    vmMarkPagesUsed(page,pages);
  ...
    server.vm_stats_swapped_objects++;
    server.vm_stats_swapouts++;
    return REDIS_OK;
}

3.4 value换入

     从文件中读取对象后,修改位图清空标志,并未对文件对应页清空

Redis新的vm机制 redis vm_sed_03

/* Excerpt of the swap-in path: read the value back from the swap file, flag
 * the key as resident again and release its pages in the bitmap. The
 * signature and surrounding code are elided in this excerpt. */
vmGenericLoadObject(..)
{
...

        val = vmReadObjectFromSwap(key->vm.page,key->vtype);
        
        /* Update the key state: back in memory, accessed just now */
        key->storage = REDIS_VM_MEMORY;
        key->vm.atime = server.unixtime;

       /* Release the pages in the bitmap (the file content is not erased
        * by the code shown here) */
        vmMarkPagesFree(key->vm.page,key->vm.usedpages);
...
      return val;
}
/* Load the object of the given 'type' stored at swap page 'page' and return
 * it. Error handling is elided in this excerpt. */
static robj *vmReadObjectFromSwap(off_t page, int type) {
    robj *o;

   ...
   /* Seek to the byte offset of the requested page */
    if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
       ...
    }

    /* Deserialize the object from the swap file */
    o = rdbLoadObject(type,server.vm_fp);
    if (o == NULL) {
      ...
    }
    ...
    return o;
}

四、redis的vm的交换过程

4.1 如何选择value进行交换

  • 遍历所有数据库
  • 在每一个数据库中随机选择五个key
  • 计算候选分数,最终选择分数最大的一个作为淘汰对象

4.1.1 如何随机选择key

整个redis数据库就是一个巨大的hash字典,并且通过链表形式进行冲突处理,所以选择key过程如下

step1. random()生成一个随机值,使用随机值对hash大小进行取余,生成一个hash下标

step2. 如果此下标没有对象,则回到step1

step3. 计算此下标下的对象链表长度

step4. 再根据长度进行random(), 随机取一个位置

step5. 最终根据随机到的位置,取此对象

/* Return a random entry of the dictionary, or NULL when it is empty.
 *
 * A random non-empty bucket of ht[0] is picked first, then a random element
 * of that bucket's collision chain. Part of the function is elided in this
 * excerpt. */
dictEntry *dictGetRandomKey(dict *d)
{
    dictEntry *he, *orighe;
    unsigned int h;
    int listlen, listele;

    if (dictSize(d) == 0) return NULL;
    ...

    /* Pick random bucket indexes until a non-empty bucket is hit */
    do {
       h = random() & d->ht[0].sizemask;
       he = d->ht[0].table[h];
    } while(he == NULL);
    
  
    /* Count the entries chained in this bucket */
    listlen = 0;
    orighe = he;
    while(he) {
        he = he->next;
        listlen++;
    }

    /* Choose a random position inside the chain */
    listele = random() % listlen;
   
    /* Walk the chain up to the chosen position */
    he = orighe;
    while(listele--) he = he->next;
    return he;
}

4.1.2 如何计算候选分数

step1. 计算此对象多久未被访问age(当前时间 - 最后一次访问时间)

step2. 计算对象大小asize

step3.  未被访问时间 乘以 对象大小的对数值   age*log(1+asize)

/* Compute the swap-out score of object 'o': age * log(1 + asize), where age
 * is the time since the last access (server.unixtime - o->vm.atime) and
 * asize is an estimate of the object size. Returns 0 when age is not
 * positive. Most of the size estimation is elided in this excerpt. */
static double computeObjectSwappability(robj *o) {
    time_t age = server.unixtime - o->vm.atime; /* Time since last access */
    long asize = 0;
    ...
    
    /* Just accessed: score 0, so it is the least preferred candidate */
    if (age <= 0) return 0;
    
    /* Estimate the object size depending on its type */
    switch(o->type) {
    case REDIS_STRING:
        if (o->encoding != REDIS_ENCODING_RAW) {
            asize = sizeof(*o);
        } else {
            asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
        }
        break;
    case REDIS_LIST:
        l = o->ptr;
        ...
        break;
    case REDIS_SET:
    case REDIS_ZSET:
        ...
        break;
    case REDIS_HASH:
        ...
        break;
    }

    /* Final score */
    return (double)age*log(1+asize);
}

4.2 如何交换

4.2.1 换出

  • 计算需要换出的对象需要页数n
  • 遍历位图,查找连续n个页
  • 将对象写入找到的页
  • 将页位置及页数保存到key中
  • 释放value对象
  • 设置位图
/* Return the number of swap pages needed to store the serialized form of
 * object 'o'. 'fp' is passed through unchanged to rdbSavedObjectLen(). */
static off_t rdbSavedObjectPages(robj *o, FILE *fp) {
    /* Size in bytes of the object once serialized */
    const off_t serialized_len = rdbSavedObjectLen(o,fp);

    /* Round up to a whole number of pages */
    return (serialized_len + server.vm_page_size - 1) / server.vm_page_size;
}
/* Find 'n' contiguous free pages in the swap file.
 *
 * On success stores the index of the first page in '*first', advances
 * server.vm_next_page to the page after the run and returns REDIS_OK.
 * Returns REDIS_ERR when the scan covers server.vm_pages offsets without
 * finding a run.
 *
 * The scan starts at server.vm_next_page; after REDIS_VM_MAX_NEAR_PAGES
 * calls the starting point is reset to page 0. The bitmap is only read
 * here: marking the pages as used is left to the caller. */
static int vmFindContiguousPages(off_t *first, off_t n) {
    off_t base, offset = 0, since_jump = 0, numfree = 0;

    if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
        server.vm_near_pages = 0;
        server.vm_next_page = 0;
    }
    server.vm_near_pages++; /* Yet another try for pages near to the old ones */
    base = server.vm_next_page;

    while(offset < server.vm_pages) {
        off_t this = base+offset;

        /* If we overflow, restart from page zero */
        if (this >= server.vm_pages) {
            this -= server.vm_pages;
            if (this == 0) {
                /* Just overflowed, what we found on tail is no longer
                 * interesting, as it's no longer contiguous. */
                numfree = 0;
            }
        }
        if (vmFreePage(this)) {
            /* This is a free page */
            numfree++;
            /* Already got N free pages? Return to the caller, with success */
            if (numfree == n) {
                *first = this-(n-1);
                server.vm_next_page = this+1;
                redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
                return REDIS_OK;
            }
        } else {
            /* The current one is not a free page */
            numfree = 0;
        }

        /* Fast-forward if the current page is not free and we already
         * searched enough near this place. */
        since_jump++;
        if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
            offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
            since_jump = 0;
            /* Note that even if we rewind after the jump, we are don't need
             * to make sure numfree is set to zero as we only jump *if* it
             * is set to zero. */
        } else {
            /* Otherwise just check the next page */
            offset++;
        }
    }
    return REDIS_ERR;
}
/* Serialize object 'o' into the swap file starting at page 'page'.
 * Returns REDIS_ERR when the seek fails, REDIS_OK otherwise. Parts of the
 * function are elided in this excerpt.
 * NOTE(review): as shown, the results of rdbSaveObject() and fflush() are
 * not checked -- confirm against the full source. */
static int vmWriteObjectOnSwap(robj *o, off_t page) {
    ...
    /* Seek to the byte offset of the target page */
    if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
        ...
        return REDIS_ERR;
    }

    /* Write the serialized object */
    rdbSaveObject(server.vm_fp,o);

    /* Flush the stdio buffer to the file */
    fflush(server.vm_fp);
    ...

    return REDIS_OK;
}
/* Record the swap location and type in the key, then release the value
 * (these statements are the same ones found in vmSwapObjectBlocking) */
  key->vm.page = page;
  key->vm.usedpages = pages;
  key->storage = REDIS_VM_SWAPPED;
  key->vtype = val->type;
decrRefCount(val); /* Deallocate the object from memory. */
/* Mark 'count' consecutive pages starting at 'page' as used, one page at a
 * time via vmMarkPageUsed(). The tail of the function is elided. */
static void vmMarkPagesUsed(off_t page, off_t count) {
    off_t j;

    for (j = 0; j < count; j++)
        vmMarkPageUsed(page+j);
    ...
}

4.2.2 换入

  • 根据页偏移读取对象,反序列化成对象
  • 清理位图
/* Load the object of the given 'type' stored at swap page 'page' and return
 * it. The process is terminated with _exit(1) when the seek or the load
 * fails. Logging and other details are elided in this excerpt. */
static robj *vmReadObjectFromSwap(off_t page, int type) {
    robj *o;
    ...
    /* Seek to the byte offset of the requested page */
    if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
          ...
        _exit(1);
    }

   /* Deserialize the object */
    o = rdbLoadObject(type,server.vm_fp);
    if (o == NULL) {
        ...
        _exit(1);
    }

    ...
    return o;
}
/* Mark 'count' consecutive pages starting at 'page' as free, one page at a
 * time via vmMarkPageFree(). The tail of the function is elided. */
static void vmMarkPagesFree(off_t page, off_t count) {
    off_t j;

    for (j = 0; j < count; j++)
        vmMarkPageFree(page+j);
  ...
}

五、持久化过程如何处理

当发生了swap时,内存的数据是不完整的,如何持久化呢?

在持久化过程中,发现某个value在swap时,将swap中的内容加载到内存,然后再序列化到持久化文件中,所以最终的持久化文件内容是完整的数据。

六、多线程处理

因redis使用单线程进行命令处理,所以在进行swap时,将阻塞其他客户端的请求响应,因此引入了多线程处理,将阻塞所有客户端转为阻塞某些客户端。

6.1 配置

# 0则表示不用线程,阻塞操作
vm-max-threads 4

6.2 整体流程

  • 创建换入、换出任务,加入到任务队列中(只有一个队列,所以需要加锁)
  • 如果线程数没有超过最大限制,则创建新线程
  • 线程从任务队列中获取任务(多个线程操作一个队列,需要加锁),放入正在处理队列中,然后进行处理
  • 每个任务处理完后,将放入完成队列
  • 当队列中所有任务都处理完时,线程将退出
  • 线程每处理完一个任务,将通知主线程,主线程从完成队列中取任务进行后续的处理

Redis新的vm机制 redis vm_sed_04

/* Queue an asynchronous swap-out of 'val' (the value of 'key' in 'db').
 *
 * A REDIS_IOJOB_PREPARE_SWAP job is created and appended to the job queue;
 * the key is flagged as REDIS_VM_SWAPPING in the meantime. The job holds its
 * own reference to the value. Always returns REDIS_OK. The key must be in
 * memory with a refcount of 1. */
static int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
    iojob *job;

    assert(key->storage == REDIS_VM_MEMORY);
    assert(key->refcount == 1);

    /* Build the job descriptor */
    job = zmalloc(sizeof(*job));
    job->type = REDIS_IOJOB_PREPARE_SWAP;
    job->canceled = 0;
    job->thread = (pthread_t) -1;
    job->db = db;
    job->key = key;
    job->val = val;
    incrRefCount(val);
    key->storage = REDIS_VM_SWAPPING;

    /* Hand it over to the I/O threads; the queue is shared, hence the lock */
    lockThreadedIO();
    queueIOJob(job);
    unlockThreadedIO();

    return REDIS_OK;
}

 

/* Entry point of a VM I/O worker thread.
 *
 * The thread detaches itself and then loops: take the first job from
 * server.io_newjobs, move it to server.io_processing, execute it without
 * holding the lock, move it to server.io_processed and write one byte to
 * server.io_ready_pipe_write to wake up the main thread. The thread returns
 * as soon as it finds the new-jobs queue empty.
 *
 * 'arg' is unused. Always returns NULL. */
static void *IOThreadEntryPoint(void *arg) {
    iojob *j;
    listNode *ln;
    REDIS_NOTUSED(arg);

    pthread_detach(pthread_self());
    while(1) {
        /* Get a new job to process */
        lockThreadedIO();
        /* No pending jobs: this thread terminates */
        if (listLength(server.io_newjobs) == 0) {
            ...
            server.io_active_threads--;
            unlockThreadedIO();
            return NULL;
        }
        
        /* Take the oldest job out of the new-jobs queue */
        ln = listFirst(server.io_newjobs);
        j = ln->value;
        listDelNode(server.io_newjobs,ln); 

        /* Add the job in the processing queue */
        j->thread = pthread_self();
        listAddNodeTail(server.io_processing,j);
        ln = listLast(server.io_processing); /* We use ln later to remove it */
        unlockThreadedIO();
        redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
            (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);

        /* Process the Job */
        if (j->type == REDIS_IOJOB_LOAD) {
            j->val = vmReadObjectFromSwap(j->page,j->key->vtype);
        } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
            FILE *fp = fopen("/dev/null","w+");
            /* NOTE(review): if fopen() failed, fp is NULL here. The blocking
             * swap path passes NULL to rdbSavedObjectPages() as well, so
             * this is presumably supported -- confirm. */
            j->pages = rdbSavedObjectPages(j->val,fp);
            /* fclose(NULL) is undefined behavior: close only a real stream */
            if (fp != NULL) fclose(fp);
        } else if (j->type == REDIS_IOJOB_DO_SWAP) {
            if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
                j->canceled = 1;
        }

        /* Done: insert the job into the processed queue */
        redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
            (long) pthread_self(), (void*)j, (char*)j->key->ptr);

        /* Move the job from the processing queue to the processed queue */
        lockThreadedIO();
        listDelNode(server.io_processing,ln);
        listAddNodeTail(server.io_processed,j);
        unlockThreadedIO();

        /* Signal the main thread there is new stuff to process.
         * The write() is performed outside of assert(): with NDEBUG defined
         * the assert expression is not evaluated at all, and the main thread
         * would never be notified. */
        {
            ssize_t nwritten = write(server.io_ready_pipe_write,"x",1);
            assert(nwritten == 1);
            (void) nwritten; /* Unused when NDEBUG is defined */
        }
    }
    return NULL; /* never reached */
}

6.3 刚换出又要换入

因为多线程处理后,换入换出过程就成了异步过程,可能ClientA写请求导致key1的value要被换出,然后ClientB读请求刚好读取key1的value,此时value已经被换出,又需要将value换入。

  • 如果换出请求还在任务队列中未处理,则直接取消,并将位图对应位置清理
  • 如果任务正在处理,等待处理完成后,进行重试
  • 如果任务已经完成,则设置canceled标志,主线程的回调函数在处理时,如果是canceled的任务,直接忽略
/* Cancel the pending I/O job that refers to key object 'o', which must be in
 * the REDIS_VM_LOADING or REDIS_VM_SWAPPING state.
 *
 * The three job queues are searched in order (new, processing, processed):
 *  - job still in io_newjobs: it is freed and removed from the queue;
 *  - job in io_processing: the lock is released, the caller sleeps briefly
 *    and the whole search is restarted until the job leaves that queue;
 *  - job in io_processed: it is flagged as canceled.
 * Pages reserved by a DO_SWAP job are released in the bitmap, and the
 * object's storage state is rolled back (LOADING -> SWAPPED,
 * SWAPPING -> MEMORY). Not finding a matching job trips an assertion. */
static void vmCancelThreadedIOJob(robj *o) {
    list *lists[3] = {
        server.io_newjobs,      /* 0 */
        server.io_processing,   /* 1 */
        server.io_processed     /* 2 */
    };
    int i;

    assert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
again:
    lockThreadedIO();
    /* Search for a matching key in one of the queues */
    for (i = 0; i < 3; i++) {
        listNode *ln;
        listIter li;

        listRewind(lists[i],&li);
        while ((ln = listNext(&li)) != NULL) {
            iojob *job = ln->value;

            if (job->canceled) continue; /* Skip this, already canceled. */
            if (job->key == o) {
                 ...
                 /* Release the reserved pages in the bitmap */
                /* Mark the pages as free since the swap didn't happened
                 * or happened but is now discarded. */
                if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
                    vmMarkPagesFree(job->page,job->pages);
                /* Cancel the job. It depends on the list the job is
                 * living in. */
                switch(i) {
                case 0: /* io_newjobs */
                    /* If the job was yet not processed the best thing to do
                     * is to remove it from the queue at all */
                    freeIOJob(job);
                    listDelNode(lists[i],ln);
                    break;
                case 1: /* io_processing */
                    /* Oh Shi- the thread is messing with the Job:
                     *
                     * Probably it's accessing the object if this is a
                     * PREPARE_SWAP or DO_SWAP job.
                     * If it's a LOAD job it may be reading from disk and
                     * if we don't wait for the job to terminate before to
                     * cancel it, maybe in a few microseconds data can be
                     * corrupted in this pages. So the short story is:
                     *
                     * Better to wait for the job to move into the
                     * next queue (processed)... */

                    /* We try again and again until the job is completed. */
                    unlockThreadedIO();
                    /* But let's wait some time for the I/O thread
                     * to finish with this job. After all this condition
                     * should be very rare. */
                    usleep(1);
                    goto again;
                case 2: /* io_processed */
                    /* The job was already processed, that's easy...
                     * just mark it as canceled so that we'll ignore it
                     * when processing completed jobs. */
                    job->canceled = 1;
                    break;
                }

                /* Roll back the storage state */
                /* Finally we have to adjust the storage type of the object
                 * in order to "UNDO" the operaiton. */
                if (o->storage == REDIS_VM_LOADING)
                    o->storage = REDIS_VM_SWAPPED;
                else if (o->storage == REDIS_VM_SWAPPING)
                    o->storage = REDIS_VM_MEMORY;
                unlockThreadedIO();
                return;
            }
        }
    }
    unlockThreadedIO();
    assert(1 != 1); /* We should never reach this */
}
/* Event-loop handler invoked when the read side of the I/O notification
 * pipe ('fd') is readable. Each byte read corresponds to one completed job,
 * which is taken from server.io_processed and post-processed here:
 *  - LOAD: install the loaded value, free the swap pages, wake up the
 *    clients blocked on the key;
 *  - PREPARE_SWAP: reserve pages and re-queue the job as DO_SWAP;
 *  - DO_SWAP: record the swap location in the key and release the value.
 * Canceled jobs are just freed. At most 'toprocess' jobs are handled per
 * call. A non-NULL 'privdata' clears the local 'trytoswap' flag; its use is
 * in the elided part. Several sections are elided in this excerpt. */
static void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
            int mask)
{
    char buf[1];
    int retval, processed = 0, toprocess = -1, trytoswap = 1;
    REDIS_NOTUSED(el);
    REDIS_NOTUSED(mask);
    REDIS_NOTUSED(privdata);

    if (privdata != NULL) trytoswap = 0; /* check the comments above... */

    /* For every byte we read in the read side of the pipe, there is one
     * I/O job completed to process. */
    while((retval = read(fd,buf,1)) == 1) {
        iojob *j;
        listNode *ln;
        robj *key;
        struct dictEntry *de;

        redisLog(REDIS_DEBUG,"Processing I/O completed job");

        /* Get the processed element (the oldest one) */
        lockThreadedIO();
        assert(listLength(server.io_processed) != 0);

         /* Decide, on the first iteration, how many jobs to handle in this
          * call: a percentage (REDIS_MAX_COMPLETED_JOBS_PROCESSED) of the
          * processed queue, at least one. This callback runs in the main
          * thread, so it does not drain the whole queue at once. */
        if (toprocess == -1) {
            toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
            if (toprocess <= 0) toprocess = 1;
        }
        ln = listFirst(server.io_processed);
        j = ln->value;
        listDelNode(server.io_processed,ln);
        unlockThreadedIO();

        /* Canceled jobs get no post-processing */
        /* If this job is marked as canceled, just ignore it */
        if (j->canceled) {
            freeIOJob(j);
            continue;
        }
        /* Post process it in the main thread, as there are things we
         * can do just here to avoid race conditions and/or invasive locks */
        ...
        de = dictFind(j->db->dict,j->key);
        assert(de != NULL);
        key = dictGetEntryKey(de);
        if (j->type == REDIS_IOJOB_LOAD) { /* Load job */
            redisDb *db;

            /* Key loaded, bring it at home */
            key->storage = REDIS_VM_MEMORY;
            key->vm.atime = server.unixtime;
            vmMarkPagesFree(key->vm.page,key->vm.usedpages);
            redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
                (unsigned char*) key->ptr);
            server.vm_stats_swapped_objects--;
            server.vm_stats_swapins++;
            dictGetEntryVal(de) = j->val;
            incrRefCount(j->val);
            db = j->db;
            freeIOJob(j);

          /* Notify the clients waiting for this key */
            /* Handle clients waiting for this key to be loaded. */
            handleClientsBlockedOnSwappedKey(db,key);
        } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {/* Prepare-swap job */
            /* Now we know the amount of pages required to swap this object.
             * Let's find some space for it, and queue this task again
             * rebranded as REDIS_IOJOB_DO_SWAP. */
          ...
                /* Note that we need to mark this pages as used now,
                 * if the job will be canceled, we'll mark them as freed
                 * again. */
                vmMarkPagesUsed(j->page,j->pages);
                j->type = REDIS_IOJOB_DO_SWAP;
                lockThreadedIO();
                queueIOJob(j);
                unlockThreadedIO();
          ...
        } else if (j->type == REDIS_IOJOB_DO_SWAP) { /* Swap-out job */
            robj *val;

            /* Key swapped. We can finally free some memory. */
            if (key->storage != REDIS_VM_SWAPPING) {
                printf("key->storage: %d\n",key->storage);
                printf("key->name: %s\n",(char*)key->ptr);
                printf("key->refcount: %d\n",key->refcount);
                printf("val: %p\n",(void*)j->val);
                printf("val->type: %d\n",j->val->type);
                printf("val->ptr: %s\n",(char*)j->val->ptr);
            }
            redisAssert(key->storage == REDIS_VM_SWAPPING);
            val = dictGetEntryVal(de);
            key->vm.page = j->page;
            key->vm.usedpages = j->pages;
            key->storage = REDIS_VM_SWAPPED;
            key->vtype = j->val->type;

            /* Only now, after a successful swap-out, is the in-memory value
             * actually released */
            decrRefCount(val); /* Deallocate the object from memory. */
            dictGetEntryVal(de) = NULL;
            ...
            server.vm_stats_swapped_objects++;
            server.vm_stats_swapouts++;
            freeIOJob(j);

            ...
        }
        processed++;
        if (processed == toprocess) return; /* Quota reached: give control back to the main thread */
    }

   ...
}

6.4 如何知道哪些client在等待哪些key的值

Redis新的vm机制 redis vm_Redis新的vm机制_05

 当某个client访问某个key时,此key的value已经交换出去了,则将key,client加入到io_keys字典中,并且多个client访问相同key时,将串成一个链表。

每个client自己维护了自己正在等待key的一个链表。

当某个client的等待队列空了的时候,将恢复读。

/* Excerpt of the command dispatcher: with VM enabled and at least one I/O
 * thread configured, the client is blocked (and 1 is returned) when
 * blockClientOnSwappedKeys() reports that a key it needs is not in memory;
 * otherwise the command is executed. The rest is elided. */
static int processCommand(redisClient *c) {
...
   /* VM enabled and threaded: check whether some key is not in memory */
   if (server.vm_enabled && server.vm_max_threads > 0 &&
            blockClientOnSwappedKeys(c,cmd)) return 1;
        call(c,cmd);
...
}

/* Make sure every key used by command 'cmd' is (or is being) loaded in
 * memory, blocking client 'c' if that is not yet the case.
 *
 * Returns 1 when the client has been blocked (it is waiting for at least one
 * key), 0 when the command can be executed right away. */
static int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
    /* Commands with a dedicated preload procedure use it; all the others go
     * through the generic first/last/step key scan. */
    if (cmd->vm_preload_proc == NULL)
        waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
    else
        cmd->vm_preload_proc(c,cmd,c->argc,c->argv);

    /* Nothing to wait for: the caller can run the command. */
    if (listLength(c->io_keys) == 0) return 0;

    /* The client waits for at least one key: flag it as blocked and stop
     * reading from its socket. */
    c->flags |= REDIS_IO_WAIT;
    aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
    server.vm_blocked_clients++;
    return 1;
}

/* Call waitForSwappedKey() on every key argument of the command, as
 * described by cmd->vm_firstkey, cmd->vm_lastkey and cmd->vm_keystep.
 * A negative vm_lastkey is counted from the end of the argument vector.
 * Nothing is done when vm_firstkey is 0. */
static void waitForMultipleSwappedKeys(redisClient *c,
 struct redisCommand *cmd,int argc, robj **argv) {
    int idx, last_idx;

    if (cmd->vm_firstkey == 0) return;

    last_idx = cmd->vm_lastkey;
    if (last_idx < 0) last_idx = argc+last_idx;

    idx = cmd->vm_firstkey;
    while (idx <= last_idx) {
        redisAssert(idx < argc);
        waitForSwappedKey(c,argv[idx]);
        idx += cmd->vm_keystep;
    }
}

/* Register client 'c' as waiting for 'key' if the key's value is not in
 * memory, scheduling a load job when none is in progress yet.
 *
 * Returns 0 when there is nothing to wait for: the key does not exist, is
 * already in memory, or was being swapped out (the swap is canceled).
 * Returns 1 when the client has been added to the waiters of the key. */
static int waitForSwappedKey(redisClient *c, robj *key) {
    struct dictEntry *entry;
    robj *stored_key;
    list *waiters;

    /* Missing keys and keys already in RAM never block the client. */
    entry = dictFind(c->db->dict,key);
    if (entry == NULL) return 0;
    stored_key = dictGetEntryKey(entry);
    if (stored_key->storage == REDIS_VM_MEMORY) return 0;
    if (stored_key->storage == REDIS_VM_SWAPPING) {
        /* A swap-out was in progress: undo it. */
        vmCancelThreadedIOJob(stored_key);
        return 0;
    }

    /* From here on the key is either swapped or currently being loaded. */

    /* Client side of the mapping: the keys this client is waiting for. */
    listAddNodeTail(c->io_keys,key);
    incrRefCount(key);

    /* Database side of the mapping: key => list of waiting clients. The
     * list is created the first time somebody waits for the key. */
    entry = dictFind(c->db->io_keys,key);
    if (entry != NULL) {
        waiters = dictGetEntryVal(entry);
    } else {
        int added;

        waiters = listCreate();
        added = dictAdd(c->db->io_keys,key,waiters);
        incrRefCount(key);
        assert(added == DICT_OK);
    }
    listAddNodeTail(waiters,c);

    /* Queue a load job unless the key is already being loaded. */
    if (stored_key->storage == REDIS_VM_SWAPPED) {
        iojob *job;

        stored_key->storage = REDIS_VM_LOADING;
        job = zmalloc(sizeof(*job));
        job->type = REDIS_IOJOB_LOAD;
        job->db = c->db;
        job->key = stored_key;
        job->key->vtype = stored_key->vtype;
        job->page = stored_key->vm.page;
        job->val = NULL;
        job->canceled = 0;
        job->thread = (pthread_t) -1;

        lockThreadedIO();
        queueIOJob(job);
        unlockThreadedIO();
    }
    return 1;
}

 

//当key已经加载到内存,通知client
/* Called once 'key' of 'db' has been loaded in memory: every client waiting
 * for it stops waiting for this key, and the clients left with no other key
 * to wait for are appended to server.io_ready_clients. */
static void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
    struct dictEntry *entry;
    list *waiters;
    int remaining;

    entry = dictFind(db->io_keys,key);
    if (entry == NULL) return;
    waiters = dictGetEntryVal(entry);

    /* The number of waiters is read once, up front: the list can be freed
     * by the called function when its last element is removed, so it must
     * not be inspected again after the final iteration. */
    for (remaining = listLength(waiters); remaining != 0; remaining--) {
        listNode *head = listFirst(waiters);
        redisClient *c = head->value;

        /* A zero return means the client still waits for other keys. */
        if (!dontWaitForSwappedKey(c,key)) continue;

        /* All the keys of this client are loaded: it is ready to go. */
        listAddNodeTail(server.io_ready_clients,c);
    }
}

/* Remove the "client 'c' waits for 'key'" relation from both sides: the
 * client's own list of awaited keys and the per-database key => clients map
 * (whose entry is deleted when no client is left).
 *
 * Returns 1 if the client is not waiting for any other key, 0 otherwise. */
static int dontWaitForSwappedKey(redisClient *c, robj *key) {
    struct dictEntry *entry;
    list *waiters;
    listNode *node;
    listIter it;

    /* 'key' may physically be the object stored in c->io_keys, which can be
     * destroyed when removed from that list: hold a reference meanwhile. */
    incrRefCount(key);

    /* Client side: drop the first matching key from the awaited keys. */
    listRewind(c->io_keys,&it);
    for (node = listNext(&it); node != NULL; node = listNext(&it)) {
        if (!equalStringObjects(node->value,key)) continue;
        listDelNode(c->io_keys,node);
        break;
    }
    redisAssert(node != NULL);

    /* Database side: drop the client from the waiters of this key. */
    entry = dictFind(c->db->io_keys,key);
    redisAssert(entry != NULL);
    waiters = dictGetEntryVal(entry);
    node = listSearchKey(waiters,c);
    redisAssert(node != NULL);
    listDelNode(waiters,node);

    /* Nobody waits for this key anymore: remove the map entry too. */
    if (listLength(waiters) == 0)
        dictDelete(c->db->io_keys,key);

    decrRefCount(key);
    return listLength(c->io_keys) == 0;
}

七、其他

7.1 当某个key的值被换出后,后续再次给此key设置新值

  • 如果已经换出成功,则直接将位图对应位置清空
  • 如果正在换出、换入,将任务取消,取消成功后,将位图对应位置清空