内核中提供了等待队列,作用是实现阻塞操作。比如,当一个应用程序去读取设备上的数据时,可能设备驱动中暂时没有数据,那么此时可以把当前进程suspend,等到有数据输入、即条件满足时,再将此进程唤醒继续执行。

    1. 创建一个等待队列

在Linux内核中,wait_queue_head_t代表一个等待队列,只需要定义一个wait_queue_head_t类型的变量,就表示创建一个等待队列,还需要调用如下接口来初始化此队列:

staitc wait_queue_head_t prod_wq;
init_waitqueue_head(&prod_wq);
staitc wait_queue_head_t prod_wq;
init_waitqueue_head(&prod_wq);

    具体看一下wait_queue_head_t数据类型:

/*
 * Head of a wait queue: a spinlock plus the list of waiters queued on it.
 */
struct __wait_queue_head {
       spinlock_t lock;             /* serialises operations on task_list */
       struct list_head task_list;  /* list of wait_queue_t entries waiting on this queue */
};
typedef struct __wait_queue_head wait_queue_head_t;
/*
 * Head of a wait queue: a spinlock plus the list of waiters queued on it.
 */
struct __wait_queue_head {
       spinlock_t lock;             /* serialises operations on task_list */
       struct list_head task_list;  /* list of wait_queue_t entries waiting on this queue */
};
typedef struct __wait_queue_head wait_queue_head_t;

    就是一个链表和一把自旋锁,链表是用于保存等待该队列的wait_queue_t类型waiter对象(此类型对象内部的private成员保存了当前的任务对象task_struct *),自旋锁是为了保证对链表操作的原子性。这里简单的看一下wait_queue_t数据类型:

/*
 * One waiter entry; it is linked onto a wait_queue_head_t's task_list.
 */
typedef struct __wait_queue wait_queue_t;
struct __wait_queue {
    unsigned int flags;
#define WQ_FLAG_EXCLUSIVE   0x01
    void *private; // address of the waiting task's task_struct
    wait_queue_func_t func; // callback used to wake the suspended task
    struct list_head task_list; // links this entry into wait_queue_head_t's task_list
};
/*
 * One waiter entry; it is linked onto a wait_queue_head_t's task_list.
 */
typedef struct __wait_queue wait_queue_t;
struct __wait_queue {
    unsigned int flags;
#define WQ_FLAG_EXCLUSIVE   0x01
    void *private; // address of the waiting task's task_struct
    wait_queue_func_t func; // callback used to wake the suspended task
    struct list_head task_list; // links this entry into wait_queue_head_t's task_list
};

2. 让当前进程开始等待

    内核提供了如下的接口来让当前进程在条件不满足的情况下,阻塞等待:

wait_event(wq, condition)
wait_event_timeout(wq, condition, timeout)
wait_event_interruptible(wq, condition)
wait_event_interruptible_timeout(wq, condition, timeout)
wait_event(wq, condition)
wait_event_timeout(wq, condition, timeout)
wait_event_interruptible(wq, condition)
wait_event_interruptible_timeout(wq, condition, timeout)

   返回值如下(其中 > 0 和 = 0 两种情况仅适用于带timeout的接口;wait_event_interruptible()在condition满足时返回0,wait_event()没有返回值):

    1)    -ERESTARTSYS: 表示被信号激活唤醒

    2)    > 0: 表示condition满足,返回值表示距离设定超时还有多久

    3)    = 0: 表示超时发生


    其内部实现源码都很类似,只是有些细节不太一样,这里以wait_event_interruptible()为例子,看看其源码:

/*
 * Sleep interruptibly on @wq until @condition becomes true.
 * @ret is set to -ERESTARTSYS when a signal is pending; otherwise it is
 * left untouched.
 *
 * Comments inside the macro body use the block-comment form and every line
 * ends with a backslash: a line without the continuation would end the
 * macro definition early, and a line comment joined by a continuation would
 * swallow the following line.
 */
#define __wait_event_interruptible(wq, condition, ret)          \
do {                                    \
    /* define a waiter object */                    \
    DEFINE_WAIT(__wait);                        \
                                    \
    for (;;) {                          \
        /* queue the waiter and set the task state to TASK_INTERRUPTIBLE */ \
        prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE);  \
        if (condition)                      \
            break;                      \
        if (!signal_pending(current)) {             \
            /* let the scheduler run another task */    \
            schedule();                 \
            continue;                   \
        }                           \
        ret = -ERESTARTSYS;                 \
        break;                          \
    }                               \
    /* remove the waiter from the wait list */      \
    finish_wait(&wq, &__wait);                  \
} while (0)
/*
 * Sleep interruptibly on @wq until @condition becomes true.
 * @ret is set to -ERESTARTSYS when a signal is pending; otherwise it is
 * left untouched.
 *
 * Comments inside the macro body use the block-comment form and every line
 * ends with a backslash: a line without the continuation would end the
 * macro definition early, and a line comment joined by a continuation would
 * swallow the following line.
 */
#define __wait_event_interruptible(wq, condition, ret)          \
do {                                    \
    /* define a waiter object */                    \
    DEFINE_WAIT(__wait);                        \
                                    \
    for (;;) {                          \
        /* queue the waiter and set the task state to TASK_INTERRUPTIBLE */ \
        prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE);  \
        if (condition)                      \
            break;                      \
        if (!signal_pending(current)) {             \
            /* let the scheduler run another task */    \
            schedule();                 \
            continue;                   \
        }                           \
        ret = -ERESTARTSYS;                 \
        break;                          \
    }                               \
    /* remove the waiter from the wait list */      \
    finish_wait(&wq, &__wait);                  \
} while (0)

当我们调用wait_event_interruptible()接口时,会先判断condition是否满足,如果不满足,则会suspend当前task。

这里再看一下DEFINE_WAIT宏的源码,可以发现其private成员总是保存着当前task对象的地址current,还有一个成员func也是非常重要的,保存着task被唤醒前的操作方法,这里暂不说明,待下面的wake_up唤醒等待队列时再进行分析:

/*
 * Declare a wait_queue_t called @name whose private field is the current
 * task and whose wake-up callback is autoremove_wake_function.
 */
#define DEFINE_WAIT(name)                       \
    wait_queue_t name = {                       \
        .private    = current,              \
        .func       = autoremove_wake_function,     \
        .task_list  = LIST_HEAD_INIT((name).task_list), \
    }
/*
 * Declare a wait_queue_t called @name whose private field is the current
 * task and whose wake-up callback is autoremove_wake_function.
 */
#define DEFINE_WAIT(name)                       \
    wait_queue_t name = {                       \
        .private    = current,              \
        .func       = autoremove_wake_function,     \
        .task_list  = LIST_HEAD_INIT((name).task_list), \
    }

3. 唤醒此等待队列上的进程:

    内核提供了如下的接口:

void wake_up(wait_queue_head_t *q);
void wake_up_interruptible(wait_queue_head_t *q);
void wake_up_interruptible_all(wait_queue_head_t *q);
void wake_up(wait_queue_head_t *q);
void wake_up_interruptible(wait_queue_head_t *q);
void wake_up_interruptible_all(wait_queue_head_t *q);

    这里以分析wake_up_interruptible()函数的源码进行说明唤醒task的原理,因为其他的唤醒过程都是类似的。最后都会调用到__wake_up_common()这个函数:

/*
 * Walk q->task_list and invoke each waiter's func callback.
 *
 * nr_exclusive is decremented for every waiter that carries
 * WQ_FLAG_EXCLUSIVE and whose callback returned non-zero; the walk stops
 * when that counter reaches zero.
 */
void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
            int nr_exclusive, int sync, void *key)
{
    wait_queue_t *waiter, *tmp;

    list_for_each_entry_safe(waiter, tmp, &q->task_list, task_list) {
        /* sample the flags before the callback, which may unlink the entry */
        const unsigned exclusive = waiter->flags & WQ_FLAG_EXCLUSIVE;

        if (!waiter->func(waiter, mode, sync, key))
            continue;
        if (!exclusive)
            continue;
        if (--nr_exclusive == 0)
            break;
    }
}
/*
 * Walk q->task_list and invoke each waiter's func callback.
 *
 * nr_exclusive is decremented for every waiter that carries
 * WQ_FLAG_EXCLUSIVE and whose callback returned non-zero; the walk stops
 * when that counter reaches zero.
 */
void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
            int nr_exclusive, int sync, void *key)
{
    wait_queue_t *waiter, *tmp;

    list_for_each_entry_safe(waiter, tmp, &q->task_list, task_list) {
        /* sample the flags before the callback, which may unlink the entry */
        const unsigned exclusive = waiter->flags & WQ_FLAG_EXCLUSIVE;

        if (!waiter->func(waiter, mode, sync, key))
            continue;
        if (!exclusive)
            continue;
        if (--nr_exclusive == 0)
            break;
    }
}

    从上面的源码可以看出最终就是调用了等待队列q上的task_list链表上的waiter对象的func方法,在前面又提到过这个方法就是autoremove_wake_function():

/*
 * Wake callback: wakes the waiter through default_wake_function() and, when
 * that reports success, unlinks the entry from its wait list.
 *
 * Returns whatever default_wake_function() returned.
 */
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    const int woken = default_wake_function(wait, mode, sync, key);

    if (!woken)
        return woken;

    /* wake-up succeeded: take the entry off the list */
    list_del_init(&wait->task_list);
    return woken;
}
/*
 * Wake callback: wakes the waiter through default_wake_function() and, when
 * that reports success, unlinks the entry from its wait list.
 *
 * Returns whatever default_wake_function() returned.
 */
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    const int woken = default_wake_function(wait, mode, sync, key);

    if (!woken)
        return woken;

    /* wake-up succeeded: take the entry off the list */
    list_del_init(&wait->task_list);
    return woken;
}


default_wake_function()的源码如下,又看到我们熟悉的private成员

/*
 * Wake callback that passes the task stored in curr->private to
 * try_to_wake_up() and returns its result; key is not used.
 */
int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
              void *key)
{
    struct task_struct *task = curr->private;

    return try_to_wake_up(task, mode, sync);
}
/*
 * Wake callback that passes the task stored in curr->private to
 * try_to_wake_up() and returns its result; key is not used.
 */
int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
              void *key)
{
    struct task_struct *task = curr->private;

    return try_to_wake_up(task, mode, sync);
}

try_to_wake_up()函数的源码比较长,这里就截取能体现其大致逻辑的代码进行说明:

/*
 * Abridged excerpt of try_to_wake_up(): the local declarations and part of
 * the body are elided (marked "......"), so it does not compile as shown.
 * It returns success, which this excerpt sets to 1 on the out_activate path.
 */
static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
{
    old_state = p->state;
    if (!(old_state & state)) // skip tasks whose state is not in the requested mask
        goto out;
 
    // If task_running(rq, p) is true (marked unlikely), jump to out_activate.
    // NOTE(review): the original note described this as "the task is not in
    // the running state", but the test as written is not negated - confirm
    // against the full kernel source.
    if (unlikely(task_running(rq, p)))
        goto out_activate;
        
    ......
 
out_activate:
    schedstat_inc(p, se.nr_wakeups);
    if (sync)
        schedstat_inc(p, se.nr_wakeups_sync);
    if (orig_cpu != cpu)
        schedstat_inc(p, se.nr_wakeups_migrate);
    if (cpu == this_cpu)
        schedstat_inc(p, se.nr_wakeups_local);
    else
        schedstat_inc(p, se.nr_wakeups_remote);
    update_rq_clock(rq);
    activate_task(rq, p, 1); // put the task on the run queue
    success = 1;
 
out_running:
    trace_sched_wakeup(rq, p);
    check_preempt_curr(rq, p, sync);
 
    p->state = TASK_RUNNING; // mark the task as TASK_RUNNING
    if (p->sched_class->task_wake_up)
        p->sched_class->task_wake_up(rq, p);
out:
    current->se.last_wakeup = current->se.sum_exec_runtime;
 
    task_rq_unlock(rq, &flags);
 
    return success;
}
 
/*
 * Put task p on run queue rq: enqueue it and bump the running count.
 * When task_contributes_to_load(p) is true, rq->nr_uninterruptible is
 * decremented first.
 */
static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
{
    const int counted_in_load = task_contributes_to_load(p);

    if (counted_in_load) {
        rq->nr_uninterruptible--;
    }

    enqueue_task(rq, p, wakeup);
    inc_nr_running(rq);
}
/*
 * Abridged excerpt of try_to_wake_up(): the local declarations and part of
 * the body are elided (marked "......"), so it does not compile as shown.
 * It returns success, which this excerpt sets to 1 on the out_activate path.
 */
static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
{
    old_state = p->state;
    if (!(old_state & state)) // skip tasks whose state is not in the requested mask
        goto out;
 
    // If task_running(rq, p) is true (marked unlikely), jump to out_activate.
    // NOTE(review): the original note described this as "the task is not in
    // the running state", but the test as written is not negated - confirm
    // against the full kernel source.
    if (unlikely(task_running(rq, p)))
        goto out_activate;
        
    ......
 
out_activate:
    schedstat_inc(p, se.nr_wakeups);
    if (sync)
        schedstat_inc(p, se.nr_wakeups_sync);
    if (orig_cpu != cpu)
        schedstat_inc(p, se.nr_wakeups_migrate);
    if (cpu == this_cpu)
        schedstat_inc(p, se.nr_wakeups_local);
    else
        schedstat_inc(p, se.nr_wakeups_remote);
    update_rq_clock(rq);
    activate_task(rq, p, 1); // put the task on the run queue
    success = 1;
 
out_running:
    trace_sched_wakeup(rq, p);
    check_preempt_curr(rq, p, sync);
 
    p->state = TASK_RUNNING; // mark the task as TASK_RUNNING
    if (p->sched_class->task_wake_up)
        p->sched_class->task_wake_up(rq, p);
out:
    current->se.last_wakeup = current->se.sum_exec_runtime;
 
    task_rq_unlock(rq, &flags);
 
    return success;
}
 
/*
 * Put task p on run queue rq: enqueue it and bump the running count.
 * When task_contributes_to_load(p) is true, rq->nr_uninterruptible is
 * decremented first.
 */
static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
{
    const int counted_in_load = task_contributes_to_load(p);

    if (counted_in_load) {
        rq->nr_uninterruptible--;
    }

    enqueue_task(rq, p, wakeup);
    inc_nr_running(rq);
}

    内核调度任务,总是从就绪列表run queue中选择优先级最高的任务来运行。等待队列的唤醒操作,实际上就是把阻塞在此等待队列上的进程,加入到run queue中,等待调度器在下次调度时对其继续运行。

4. 例子:

一个简单的例子,我们常见的生产者-消费者模型:生产者每生产一个任务,就等待消费者将此任务处理掉,然后再生产下一个任务;消费者每接收到一个任务,就将其消耗掉,并通知生产者继续生产;

#include <linux/delay.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/wait.h>
 
/*
 * Function entry/exit trace helpers.  Like ERR()/DBG() below, each message
 * ends with a newline so consecutive printk records are not run together.
 */
#define ENTER() printk(KERN_DEBUG "%s() Enter\n", __func__)
#define EXIT() printk(KERN_DEBUG "%s() Exit\n", __func__)
/* Error/debug logging prefixed with the calling function and line number. */
#define ERR(fmt, args...) printk(KERN_ERR "%s()-%d: " fmt "\n", __func__, __LINE__, ##args)
#define DBG(fmt, args...) printk(KERN_DEBUG "%s()-%d: " fmt "\n", __func__, __LINE__, ##args)
 
MODULE_LICENSE("GPL");
 
 
/* One unit of work handed from the producer thread to the consumer thread. */
struct work {
    char name[64];                  /* label; the consumer logs it before running the item */
    void (*work_func)(void *data);  /* handler the consumer calls */
    void *data;                     /* argument passed to work_func */
};
 
/*
 * Work handler for the demo: logs the number carried in @data, then sleeps
 * for up to one second to simulate processing.
 *
 * @data: an integer stored directly in the pointer value, not an address.
 */
static void do_work(void *data)
{
    /*
     * Convert through long so the pointer-to-integer cast keeps the same
     * width; casting straight to int truncates on 64-bit builds and makes
     * the compiler warn.
     */
    int num = (int)(long)data;

    DBG("work num is %d", num);
    msleep_interruptible(1000);
}
 
/* Kernel threads started by wq_demo_init() and stopped by wq_demo_exit(). */
static struct task_struct *producer = NULL;
static struct task_struct *consumer = NULL;
/* The producer sleeps on prod_wq until work == NULL. */
static wait_queue_head_t prod_wq;
/* The consumer sleeps on cons_wq until work != NULL. */
static wait_queue_head_t cons_wq;
 
/* Single-slot hand-off between the two threads; NULL means the slot is empty. */
static struct work *work = NULL;
 
static int producer_thr(void *arg)
{
    int num = 0;
    ENTER();
    while (!kthread_should_stop()) {
        int ret = wait_event_interruptible(prod_wq, (work == NULL));
        if (ret == -ERESTARTSYS) {
            DBG("wake up by signal");
            continue;
        }
 
        // DBG("ret = %d", ret);
        work = kzalloc(sizeof(struct work), GFP_KERNEL);
        if (!work) {
            ERR("kzalloc fail");
            break;
        }
        num++;
        snprintf(work->name, sizeof(work->name), "debug-work");
        work->work_func = do_work;
        work->data = (void *)num;
       
        wake_up_interruptible(&cons_wq);
    }
 
    EXIT();
    return 0;
}
 
static int consumer_thr(void *arg)
{
    ENTER();
    wake_up_interruptible(&prod_wq);
   
    while (!kthread_should_stop()) {
        int ret = wait_event_interruptible(cons_wq, (work != NULL));
        if (ret == -ERESTARTSYS) {
            DBG("wait_up by signal");
            continue;
        }
        // DBG("ret = %d", ret);
        DBG("excute work: %s", work->name);
        work->work_func(work->data);
        kfree(work);
        work = NULL;
        wake_up_interruptible(&prod_wq);
    }
 
    EXIT();
    return 0;
}
 
 
/*
 * Module init: initialise both wait queues, create the producer and
 * consumer threads, and start them only when both were created.
 *
 * Thread creation reports failure through an error pointer, not NULL, so the
 * results are tested with IS_ERR().  Because neither thread has run yet on
 * the failure path, stopping the producer there cannot block on the demo's
 * own wait conditions.
 *
 * Returns 0 on success or the negative errno from the failed creation.
 */
static __init int wq_demo_init(void)
{
    int err;

    ENTER();
    init_waitqueue_head(&prod_wq);
    init_waitqueue_head(&cons_wq);

    producer = kthread_create(producer_thr, NULL, "producer-thr");
    if (IS_ERR(producer)) {
        ERR("kthread_run fail");
        err = PTR_ERR(producer);
        goto _fail;
    }

    consumer = kthread_create(consumer_thr, NULL, "consumer-thr");
    if (IS_ERR(consumer)) {
        ERR("kthread_run fail");
        err = PTR_ERR(consumer);
        goto _fail;
    }

    wake_up_process(producer);
    wake_up_process(consumer);

    EXIT();
    return 0;

_fail:
    /* only the producer can exist at this point */
    if (!IS_ERR(producer))
        kthread_stop(producer);
    producer = NULL;
    consumer = NULL;
    return err;
}
 
static __exit void wq_demo_exit(void)
{
    ENTER();
    if (producer)
        kthread_stop(producer);
    if (consumer)
        kthread_stop(consumer);
   
    EXIT();
}
 
module_init(wq_demo_init);
module_exit(wq_demo_exit);
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/wait.h>
 
/*
 * Function entry/exit trace helpers.  Like ERR()/DBG() below, each message
 * ends with a newline so consecutive printk records are not run together.
 */
#define ENTER() printk(KERN_DEBUG "%s() Enter\n", __func__)
#define EXIT() printk(KERN_DEBUG "%s() Exit\n", __func__)
/* Error/debug logging prefixed with the calling function and line number. */
#define ERR(fmt, args...) printk(KERN_ERR "%s()-%d: " fmt "\n", __func__, __LINE__, ##args)
#define DBG(fmt, args...) printk(KERN_DEBUG "%s()-%d: " fmt "\n", __func__, __LINE__, ##args)
 
MODULE_LICENSE("GPL");
 
 
/* One unit of work handed from the producer thread to the consumer thread. */
struct work {
    char name[64];                  /* label; the consumer logs it before running the item */
    void (*work_func)(void *data);  /* handler the consumer calls */
    void *data;                     /* argument passed to work_func */
};
 
/*
 * Work handler for the demo: logs the number carried in @data, then sleeps
 * for up to one second to simulate processing.
 *
 * @data: an integer stored directly in the pointer value, not an address.
 */
static void do_work(void *data)
{
    /*
     * Convert through long so the pointer-to-integer cast keeps the same
     * width; casting straight to int truncates on 64-bit builds and makes
     * the compiler warn.
     */
    int num = (int)(long)data;

    DBG("work num is %d", num);
    msleep_interruptible(1000);
}
 
/* Kernel threads started by wq_demo_init() and stopped by wq_demo_exit(). */
static struct task_struct *producer = NULL;
static struct task_struct *consumer = NULL;
/* The producer sleeps on prod_wq until work == NULL. */
static wait_queue_head_t prod_wq;
/* The consumer sleeps on cons_wq until work != NULL. */
static wait_queue_head_t cons_wq;
 
/* Single-slot hand-off between the two threads; NULL means the slot is empty. */
static struct work *work = NULL;
 
static int producer_thr(void *arg)
{
    int num = 0;
    ENTER();
    while (!kthread_should_stop()) {
        int ret = wait_event_interruptible(prod_wq, (work == NULL));
        if (ret == -ERESTARTSYS) {
            DBG("wake up by signal");
            continue;
        }
 
        // DBG("ret = %d", ret);
        work = kzalloc(sizeof(struct work), GFP_KERNEL);
        if (!work) {
            ERR("kzalloc fail");
            break;
        }
        num++;
        snprintf(work->name, sizeof(work->name), "debug-work");
        work->work_func = do_work;
        work->data = (void *)num;
       
        wake_up_interruptible(&cons_wq);
    }
 
    EXIT();
    return 0;
}
 
static int consumer_thr(void *arg)
{
    ENTER();
    wake_up_interruptible(&prod_wq);
   
    while (!kthread_should_stop()) {
        int ret = wait_event_interruptible(cons_wq, (work != NULL));
        if (ret == -ERESTARTSYS) {
            DBG("wait_up by signal");
            continue;
        }
        // DBG("ret = %d", ret);
        DBG("excute work: %s", work->name);
        work->work_func(work->data);
        kfree(work);
        work = NULL;
        wake_up_interruptible(&prod_wq);
    }
 
    EXIT();
    return 0;
}
 
 
/*
 * Module init: initialise both wait queues, create the producer and
 * consumer threads, and start them only when both were created.
 *
 * Thread creation reports failure through an error pointer, not NULL, so the
 * results are tested with IS_ERR().  Because neither thread has run yet on
 * the failure path, stopping the producer there cannot block on the demo's
 * own wait conditions.
 *
 * Returns 0 on success or the negative errno from the failed creation.
 */
static __init int wq_demo_init(void)
{
    int err;

    ENTER();
    init_waitqueue_head(&prod_wq);
    init_waitqueue_head(&cons_wq);

    producer = kthread_create(producer_thr, NULL, "producer-thr");
    if (IS_ERR(producer)) {
        ERR("kthread_run fail");
        err = PTR_ERR(producer);
        goto _fail;
    }

    consumer = kthread_create(consumer_thr, NULL, "consumer-thr");
    if (IS_ERR(consumer)) {
        ERR("kthread_run fail");
        err = PTR_ERR(consumer);
        goto _fail;
    }

    wake_up_process(producer);
    wake_up_process(consumer);

    EXIT();
    return 0;

_fail:
    /* only the producer can exist at this point */
    if (!IS_ERR(producer))
        kthread_stop(producer);
    producer = NULL;
    consumer = NULL;
    return err;
}
 
static __exit void wq_demo_exit(void)
{
    ENTER();
    if (producer)
        kthread_stop(producer);
    if (consumer)
        kthread_stop(consumer);
   
    EXIT();
}
 
module_init(wq_demo_init);
module_exit(wq_demo_exit);