In the previous section we analyzed the periodic scheduler and the logic that runs after fork() creates a new process. Notice that both paths only set TIF_NEED_RESCHED; neither actually switches processes. The real switch is done by the main scheduler. If you need a refresher, see 重学计算机(十七、linux调度器和调度器类), which introduces the main scheduler.

20.1 How the main scheduler switches processes

I won't walk through the main scheduler itself here; for that, go back to 重学计算机(十七、linux调度器和调度器类).

Reviewing the main scheduler, though, two pieces are still unanalyzed: pick_next_task(), which selects the next process, and deactivate_task(), which removes the node from the red-black tree.
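
As a reminder of where these two calls sit, here is a heavily simplified sketch of the relevant part of __schedule() (based on a 4.x-era kernel/sched/core.c; locking, statistics and many details are omitted, so treat it as an outline rather than the real code):

static void __sched notrace __schedule(bool preempt)
{
	struct task_struct *prev, *next;
	struct rq *rq = cpu_rq(smp_processor_id());

	prev = rq->curr;

	/* A task that went to sleep voluntarily leaves the ready rbtree here. */
	if (!preempt && prev->state) {
		if (unlikely(signal_pending_state(prev->state, prev)))
			prev->state = TASK_RUNNING;
		else
			deactivate_task(rq, prev, DEQUEUE_SLEEP);
	}

	/* Ask the scheduler classes (CFS included) for the successor. */
	next = pick_next_task(rq, prev);
	clear_tsk_need_resched(prev);

	if (likely(prev != next)) {
		rq->curr = next;
		rq = context_switch(rq, prev, next);	/* switch mm and registers */
	}
}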

20.1.1 Picking the next process: pick_next_task_fair()

static struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev)
{
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
struct task_struct *p;
int new_tasks;

again:
#ifdef CONFIG_FAIR_GROUP_SCHED // group-scheduling path; can be skipped on a first read
if (!cfs_rq->nr_running)
goto idle;

if (prev->sched_class != &fair_sched_class)
goto simple;

/*
 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
 * likely that a next task is from the same cgroup as the current.
 *
 * Therefore attempt to avoid putting and setting the entire cgroup
 * hierarchy, only change the part that actually changes.
 */

do {
struct sched_entity *curr = cfs_rq->curr;

/*
* Since we got here without doing put_prev_entity() we also
* have to consider cfs_rq->curr. If it is still a runnable
* entity, update_curr() will update its vruntime, otherwise
* forget we've ever seen it.
*/
if (curr) {
if (curr->on_rq)
update_curr(cfs_rq);
else
curr = NULL;

/*
* This call to check_cfs_rq_runtime() will do the
* throttle and dequeue its entity in the parent(s).
* Therefore the 'simple' nr_running test will indeed
* be correct.
*/
if (unlikely(check_cfs_rq_runtime(cfs_rq)))
goto simple;
}

se = pick_next_entity(cfs_rq, curr);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);

p = task_of(se);

/*
* Since we haven't yet done put_prev_entity and if the selected task
* is a different task than we started out with, try and touch the
* least amount of cfs_rqs.
*/
if (prev != p) {
struct sched_entity *pse = &prev->se;

while (!(cfs_rq = is_same_group(se, pse))) {
int se_depth = se->depth;
int pse_depth = pse->depth;

if (se_depth <= pse_depth) {
put_prev_entity(cfs_rq_of(pse), pse);
pse = parent_entity(pse);
}
if (se_depth >= pse_depth) {
set_next_entity(cfs_rq_of(se), se);
se = parent_entity(se);
}
}

put_prev_entity(cfs_rq, pse);
set_next_entity(cfs_rq, se);
}

if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);

return p;
simple:
cfs_rq = &rq->cfs;
#endif

if (!cfs_rq->nr_running) // nothing runnable, go try idle_balance()
goto idle;

put_prev_task(rq, prev); // put prev back into its class's bookkeeping (re-inserted into the rbtree if still runnable)

do {
se = pick_next_entity(cfs_rq, NULL); // pick the next scheduling entity
set_next_entity(cfs_rq, se); // make it the current entity of this cfs_rq
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);

p = task_of(se); // get the task_struct that embeds se

if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);

return p;

idle:
/*
* This is OK, because current is on_cpu, which avoids it being picked
* for load-balance and preemption/IRQs are still disabled avoiding
* further scheduler activity on it and we're being very careful to
* re-start the picking loop.
*/
lockdep_unpin_lock(&rq->lock);
new_tasks = idle_balance(rq);
lockdep_pin_lock(&rq->lock);
/*
* Because idle_balance() releases (and re-acquires) rq->lock, it is
* possible for any higher priority task to appear. In that case we
* must re-start the pick_next_entity() loop.
*/
if (new_tasks < 0)
return RETRY_TASK;

if (new_tasks > 0)
goto again;

return NULL;
}

Stripped of the group-scheduling fast path, the function is straightforward: if nothing is runnable it falls through to idle_balance(); otherwise it puts prev back with put_prev_task(), picks the next scheduling entity with pick_next_entity(), marks it as current with set_next_entity(), and returns the corresponding task.

20.1.2 Picking the next scheduling entity: pick_next_entity()

/*
 * Pick the next process, keeping these things in mind, in this order:
 * 1) keep things fair between processes/task groups
 * 2) pick the "next" process, since someone really wants that to run
 * 3) pick the "last" process, for cache locality
 * 4) do not run the "skip" process, if something else is available
 */
static struct sched_entity *
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
struct sched_entity *left = __pick_first_entity(cfs_rq); // leftmost node, i.e. the smallest vruntime
struct sched_entity *se;

/*
 * If curr is set we have to see if its left of the leftmost entity
 * still in the tree, provided there was anything in the tree at all.
 */
if (!left || (curr && entity_before(curr, left))) // no leftmost node, or curr's vruntime is even smaller
left = curr;

se = left; /* ideally we run the leftmost entity */

/*
 * Avoid running the skip buddy, if running something else can
 * be done without getting too unfair.
 */
if (cfs_rq->skip == se) {
struct sched_entity *second;

if (se == curr) {
second = __pick_first_entity(cfs_rq);
} else {
second = __pick_next_entity(se); // the in-order successor in the rbtree, i.e. the second-leftmost entity
if (!second || (curr && entity_before(curr, second)))
second = curr;
}

if (second && wakeup_preempt_entity(second, left) < 1) // running 'second' instead is acceptable if 'left' is not ahead of it by too much
se = second;
}

/*
 * Prefer last buddy, try to return the CPU to a preempted task.
 */
if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
se = cfs_rq->last;

/*
 * Someone really wants this to run. If it's not unfair, run it.
 */
if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
se = cfs_rq->next;

clear_buddies(cfs_rq, se);

return se;
}

At first this confused me: the first step already grabbed the leftmost (smallest-vruntime) node, so why fetch another one with __pick_next_entity()? The answer is the skip buddy: if the leftmost entity happens to be the one marked as skip (for example after sched_yield()), CFS tries to run the second-leftmost entity instead, but only if wakeup_preempt_entity() says doing so is not too unfair to the leftmost one. The same fairness check also guards the last and next buddies below.

20.1.3 Setting the next scheduling entity: set_next_entity()

// se: the scheduling entity that is about to run
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/* 'current' is not kept within the tree. */
if (se->on_rq) { // still on the ready rbtree (normally true for a freshly picked entity)
/*
 * Any task has to be enqueued before it get to execute on
 * a CPU. So account for the time it spent waiting on the
 * runqueue.
 */
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se); // remove it from the rbtree
update_load_avg(se, 1);
}

update_stats_curr_start(cfs_rq, se);
cfs_rq->curr = se; // the new se is now the currently running entity
#ifdef CONFIG_SCHEDSTATS
/*
* Track our maximum slice length, if the CPU's load is at
* least twice that of our own weight (i.e. dont track it
* when there are only lesser-weight tasks around):
*/
if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
se->statistics.slice_max = max(se->statistics.slice_max,
se->sum_exec_runtime - se->prev_sum_exec_runtime);
}
#endif
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}

set_next_entity() is blunt: it removes the entity from the red-black tree and points cfs_rq->curr at it. The running entity is deliberately kept out of the tree; its vruntime keeps growing while it runs, so leaving it in the tree would mean constantly re-sorting it.

20.1.4 Preparing to leave the queue: deactivate_task()

If the process has already gone to sleep by the time the scheduler runs, this function is called to remove its node from the red-black tree (the tree only holds runnable processes).

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_contributes_to_load(p))
rq->nr_uninterruptible++;

dequeue_task(rq, p, flags);
}

Which in turn just calls dequeue_task().

20.1.5 Continuing the dequeue: dequeue_task()

static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
if (!(flags & DEQUEUE_SAVE))
sched_info_dequeued(rq, p);
p->sched_class->dequeue_task(rq, p, flags);
}

This one is also blunt: it simply dispatches to the scheduling class's dequeue_task callback.
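
For a CFS task, p->sched_class points at fair_sched_class, so the indirect call lands in dequeue_task_fair(). A trimmed excerpt of that ops table (kernel/sched/fair.c; most callbacks omitted) looks roughly like this:

const struct sched_class fair_sched_class = {
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,	/* reached from dequeue_task() above */
	.pick_next_task		= pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,
	.task_tick		= task_tick_fair,
	/* ... many more callbacks omitted ... */
};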

20.1.6 The CFS dequeue callback: dequeue_task_fair()

/*
 * The dequeue_task method is called before nr_running is
 * decreased. We remove the task from the rbtree and
 * update the fair scheduling stats:
 */
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;

for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags); // remove se's node from the rbtree

/*
* end evaluation on encountering a throttled cfs_rq
*
* note: in the case of encountering a throttled cfs_rq we will
* post the final h_nr_running decrement below.
*/
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running--;

/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
/*
* Bias pick_next to pick a task from this cfs_rq, as
* p is sleeping when it is within its sched_slice.
*/
if (task_sleep && parent_entity(se))
set_next_buddy(parent_entity(se));

/* avoid re-evaluating load for this entity */
se = parent_entity(se);
break;
}
flags |= DEQUEUE_SLEEP;
}

for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running--;

if (cfs_rq_throttled(cfs_rq))
break;

update_load_avg(se, 1);
update_cfs_shares(cfs_rq);
}

if (!se)
sub_nr_running(rq, 1);

hrtick_update(rq);
}

This code mirrors the enqueue path almost exactly, just in reverse.

20.1.7 Removing the node: dequeue_entity()

static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
dequeue_entity_load_avg(cfs_rq, se);

update_stats_dequeue(cfs_rq, se);
if (flags & DEQUEUE_SLEEP) {
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
struct task_struct *tsk = task_of(se);

if (tsk->state & TASK_INTERRUPTIBLE)
se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
if (tsk->state & TASK_UNINTERRUPTIBLE)
se->statistics.block_start = rq_clock(rq_of(cfs_rq));
}
#endif
}

clear_buddies(cfs_rq, se);

if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se); // the actual rbtree removal
se->on_rq = 0; // mark it as no longer on the ready rbtree
account_entity_dequeue(cfs_rq, se);

/*
* Normalize the entity after updating the min_vruntime because the
* update can refer to the ->curr item and we need to reflect this
* movement in our normalized position.
*/
if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;

/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);

update_min_vruntime(cfs_rq);
update_cfs_shares(cfs_rq);
}

Also fairly simple: update the statistics, remove the node from the red-black tree (unless se is the currently running entity, which is not in the tree), clear on_rq and account for the removal. Note the vruntime handling at the end: a task that is merely going to sleep keeps its absolute vruntime, while a task taken off for other reasons (such as migration) gets min_vruntime subtracted so it can be re-based on another runqueue.

20.1.8 Summary

(Figure in the original post: summary diagram of the process-switch path.)

20.2 Process sleep

From 20.1 we know that when a process goes to sleep it eventually calls schedule(), and schedule() notices the sleeping state and removes the process from the ready red-black tree. So where is the sleeping process kept?

20.2.1 Wait queues

Strictly speaking, sleeping is not part of CFS itself: the wait queue is maintained by whatever subsystem needs it, and the kernel only provides the interfaces. Still, to keep the scheduling picture complete, here is a brief look at wait queues.

The wait queue header is include/linux/wait.h.

The implementation lives in kernel/sched/wait.c.

20.2.2 Queue head definition

struct __wait_queue_head {
spinlock_t lock;
struct list_head task_list; // head of the list of waiters
};
typedef struct __wait_queue_head wait_queue_head_t;

20.2.3 Queue entry definition

struct __wait_queue {
unsigned int flags;
void *private; // normally points to the waiting task_struct
wait_queue_func_t func;
struct list_head task_list; // links this entry into the queue head's list
};

20.2.4 Initializing the queue head

// initialize the head at run time with init_waitqueue_head()
extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *);

#define init_waitqueue_head(q) \
do { \
static struct lock_class_key __key; \
\
__init_waitqueue_head((q), #q, &__key); \
} while (0)

void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
{
spin_lock_init(&q->lock);
lockdep_set_class_and_name(&q->lock, key, name);
INIT_LIST_HEAD(&q->task_list);
}

// or initialize the head statically with the DECLARE_WAIT_QUEUE_HEAD macro
#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \
.lock = __SPIN_LOCK_UNLOCKED(name.lock), \
.task_list = { &(name).task_list, &(name).task_list } }

#define DECLARE_WAIT_QUEUE_HEAD(name) \
wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)

20.2.5 Initializing a queue entry

static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
{
q->flags = 0;
q->private = p;
q->func = default_wake_function; // generic wakeup callback
}


#define __WAITQUEUE_INITIALIZER(name, tsk) { \
.private = tsk, \
.func = default_wake_function, /* generic wakeup callback */ \
.task_list = { NULL, NULL } }

#define DECLARE_WAITQUEUE(name, tsk) \
wait_queue_t name = __WAITQUEUE_INITIALIZER(name, tsk)

20.2.6 Adding an entry to the wait queue

void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;

wait->flags &= ~WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&q->lock, flags);
__add_wait_queue(q, wait); // add at the head of the queue
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(add_wait_queue);

void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;

wait->flags |= WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&q->lock, flags);
__add_wait_queue_tail(q, wait); // add at the tail of the queue
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(add_wait_queue_exclusive);

WQ_FLAG_EXCLUSIVE marks a waiter as exclusive: sometimes you want a wakeup to wake every waiter, and sometimes you only want to wake one. Exclusive waiters are added at the tail of the queue, so the non-exclusive waiters at the front are always woken first.

20.2.7 Putting a process to sleep

After all this groundwork we finally get to putting a process to sleep. The kernel wraps it all up in a family of macros; call one of them and the process sleeps until it is woken.

wait_event(wq, condition)   // sleep in TASK_UNINTERRUPTIBLE until @condition becomes true
io_wait_event(wq, condition) // like wait_event(), but uses io_schedule(); also uninterruptible
wait_event_freezable(wq, condition) // sleep in TASK_INTERRUPTIBLE (so it does not count towards the load average) until @condition becomes true; interruptible
wait_event_timeout(wq, condition, timeout) // sleep in TASK_UNINTERRUPTIBLE until @condition becomes true; uninterruptible, but with a timeout
wait_event_freezable_timeout(wq, condition, timeout) // interruptible, with a timeout
wait_event_exclusive_cmd(wq, condition, cmd1, cmd2) // like wait_event_cmd(), but sets the exclusive flag; uninterruptible
wait_event_cmd(wq, condition, cmd1, cmd2) // sleep in TASK_UNINTERRUPTIBLE until @condition becomes true; uninterruptible
wait_event_interruptible(wq, condition) // sleep in TASK_INTERRUPTIBLE until @condition becomes true or a signal arrives; interruptible
wait_event_interruptible_timeout(wq, condition, timeout) // sleep in TASK_INTERRUPTIBLE until @condition becomes true or a signal arrives; with a timeout
wait_event_hrtimeout(wq, condition, timeout) // sleep in TASK_UNINTERRUPTIBLE until @condition becomes true or the hrtimer timeout expires; uninterruptible
wait_event_interruptible_hrtimeout(wq, condition, timeout) // sleep in TASK_INTERRUPTIBLE until @condition becomes true, a signal arrives, or the hrtimer timeout expires; interruptible
wait_event_interruptible_exclusive(wq, condition) // interruptible, exclusive waiter
wait_event_freezable_exclusive(wq, condition) // interruptible (freezable), exclusive waiter
wait_event_interruptible_locked(wq, condition) // the wait-queue lock is taken with spin_lock()/spin_unlock(), which must match how it is locked outside the macro; interruptible
wait_event_interruptible_locked_irq(wq, condition) // the wait-queue lock is taken with spin_lock_irq()/spin_unlock_irq(), which must match how it is locked outside the macro; interruptible
wait_event_interruptible_exclusive_locked(wq, condition) // spin_lock variant, exclusive; interruptible
wait_event_interruptible_exclusive_locked_irq(wq, condition) // spin_lock_irq variant, exclusive; interruptible
wait_event_killable(wq, condition) // sleep in TASK_KILLABLE until @condition becomes true or a fatal signal arrives; TASK_KILLABLE is a relatively new task state
wait_event_lock_irq_cmd(wq, condition, lock, cmd) // uninterruptible
wait_event_lock_irq(wq, condition, lock) // uninterruptible
wait_event_interruptible_lock_irq_cmd(wq, condition, lock, cmd) // interruptible
wait_event_interruptible_lock_irq(wq, condition, lock) // interruptible
wait_event_interruptible_lock_irq_timeout(wq, condition, lock, timeout) // interruptible, with a timeout

May I be excused from explaining every single one? The header file is basically filled with these sleep variants.
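
Before digging into the implementation, here is a minimal usage sketch (the names my_wq and data_ready are made up for illustration): one side sleeps with wait_event_interruptible() until a flag becomes true, the other side sets the flag and calls wake_up_interruptible().

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wq);	/* hypothetical wait queue head */
static int data_ready;			/* hypothetical condition */

/* consumer side: sleep in TASK_INTERRUPTIBLE until data_ready becomes true */
static int wait_for_data(void)
{
	if (wait_event_interruptible(my_wq, data_ready))
		return -ERESTARTSYS;	/* woken by a signal instead */
	/* ... data_ready is true here, consume it ... */
	return 0;
}

/* producer side: publish the condition, then wake the sleepers */
static void publish_data(void)
{
	data_ready = 1;
	wake_up_interruptible(&my_wq);
}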

20.2.8 The sleep macro wait_event()

Analyzing this one is enough; the others follow the same pattern.

// wait_event
#define wait_event(wq, condition) \
do { \
might_sleep(); \
if (condition) \
break; \
__wait_event(wq, condition); \
} while (0)

// __wait_event
#define __wait_event(wq, condition) \
(void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
schedule())

// ___wait_event
#define ___wait_event(wq, condition, state, exclusive, ret, cmd) \
({ \
__label__ __out; \
wait_queue_t __wait; \
long __ret = ret; /* explicit shadow */ \
\
INIT_LIST_HEAD(&__wait.task_list); /* init the entry's list node */ \
if (exclusive) \
__wait.flags = WQ_FLAG_EXCLUSIVE; /* mark the waiter as exclusive */ \
else \
__wait.flags = 0; \
\
for (;;) { \
long __int = prepare_to_wait_event(&wq, &__wait, state); /* add the entry to the wait queue and set the task state */ \
\
if (condition) \
break; \
\
if (___wait_is_interruptible(state) && __int) { \
__ret = __int; \
if (exclusive) { \
abort_exclusive_wait(&wq, &__wait, \
state, NULL); \
goto __out; \
} \
break; \
} \
\
cmd; /* for wait_event() this is schedule() */ \
} \
finish_wait(&wq, &__wait); \
__out: __ret; \
})
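
Stripped of the macro plumbing, ___wait_event() boils down to the classic open-coded wait loop below (a hand-written sketch, not the literal expansion): put the entry on the queue and set the task state, re-check the condition, call schedule() if it still does not hold, and finally restore TASK_RUNNING and dequeue via finish_wait().

DEFINE_WAIT(__wait);				/* wait_queue_t bound to current */

for (;;) {
	/* enqueue on wq (if not already queued) and set TASK_UNINTERRUPTIBLE */
	prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE);
	if (condition)
		break;
	schedule();				/* give up the CPU until woken */
}
finish_wait(&wq, &__wait);			/* back to TASK_RUNNING, entry removed */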

20.2.9 Adding to the wait queue: prepare_to_wait_event()

long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
unsigned long flags;

if (signal_pending_state(state, current))
return -ERESTARTSYS;

wait->private = current;
wait->func = autoremove_wake_function; // the wakeup callback bound to this entry

spin_lock_irqsave(&q->lock, flags);
if (list_empty(&wait->task_list)) {
if (wait->flags & WQ_FLAG_EXCLUSIVE) // exclusive waiters go to the tail, the rest to the head
__add_wait_queue_tail(q, wait);
else
__add_wait_queue(q, wait);
}
set_current_state(state);
spin_unlock_irqrestore(&q->lock, flags);

return 0;
}

20.2.10 Waking a process

Since there is a whole pile of sleep macros, there is of course a whole pile of wakeup macros too.

#define wake_up(x)      __wake_up(x, TASK_NORMAL, 1, NULL)
#define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL)
#define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL)
#define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL, 1)
#define wake_up_all_locked(x) __wake_up_locked((x), TASK_NORMAL, 0)

#define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
#define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL)
#define wake_up_interruptible_all(x) __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL)
#define wake_up_interruptible_sync(x) __wake_up_sync((x), TASK_INTERRUPTIBLE, 1)

/*
* Wakeup macros to be used to report events to the targets.
*/
#define wake_up_poll(x, m) \
__wake_up(x, TASK_NORMAL, 1, (void *) (m))
#define wake_up_locked_poll(x, m) \
__wake_up_locked_key((x), TASK_NORMAL, (void *) (m))
#define wake_up_interruptible_poll(x, m) \
__wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m))
#define wake_up_interruptible_sync_poll(x, m) \
__wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m))

The _interruptible variants only wake processes sleeping in the interruptible state; the variants without it use TASK_NORMAL and wake both interruptible and uninterruptible sleepers.

Some wake_up variants carry an _nr or _all suffix: the plain ones wake at most one waiter marked with WQ_FLAG_EXCLUSIVE, the _nr ones wake up to nr exclusive waiters, and the _all ones wake every process on the queue. (Waiters without the exclusive flag are always woken, whatever the variant.)
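
To see the WQ_FLAG_EXCLUSIVE / nr_exclusive interplay in action, here is a hypothetical sketch (the names work_wq, work_pending, worker_fn and submit_work are invented for illustration): several worker threads wait exclusively on the same queue, so each plain wake_up() wakes exactly one of them instead of the whole herd.

#include <linux/wait.h>
#include <linux/kthread.h>

static DECLARE_WAIT_QUEUE_HEAD(work_wq);	/* hypothetical queue */
static int work_pending;			/* hypothetical condition */

static int worker_fn(void *unused)		/* run by several kthreads */
{
	DEFINE_WAIT(wait);

	while (!kthread_should_stop()) {
		/* queued at the tail with WQ_FLAG_EXCLUSIVE set */
		prepare_to_wait_exclusive(&work_wq, &wait, TASK_INTERRUPTIBLE);
		if (!work_pending)
			schedule();
		finish_wait(&work_wq, &wait);
		/* ... grab one unit of work ... */
	}
	return 0;
}

static void submit_work(void)
{
	work_pending = 1;
	wake_up(&work_wq);	/* nr_exclusive == 1: only one worker is woken */
}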

Looking through them, the wakeup macros almost all end up calling __wake_up(), so let's analyze that one.

20.2.11 The wakeup function __wake_up()

void __wake_up(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, void *key)
{
unsigned long flags;

spin_lock_irqsave(&q->lock, flags);
__wake_up_common(q, mode, nr_exclusive, 0, key); // the real work happens here
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(__wake_up);

20.2.12 The common wakeup helper __wake_up_common()

static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, int wake_flags, void *key)
{
wait_queue_t *curr, *next;

// walk the doubly linked list hanging off the queue head
list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
unsigned flags = curr->flags;
// wake at most nr_exclusive waiters that set the exclusive flag, to avoid a thundering herd
if (curr->func(curr, mode, wake_flags, key) &&
(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
}
}

It walks the whole list and wakes waiters according to the conditions above. For entries queued by prepare_to_wait_event() the callback invoked here is the bound autoremove_wake_function().

20.2.13 The bound callback autoremove_wake_function()

Let's take a look at that bound callback.

int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
int ret = default_wake_function(wait, mode, sync, key);

if (ret)
list_del_init(&wait->task_list);
return ret;
}

20.2.14 The default wakeup function default_wake_function()

After all the indirection we are back at the default wakeup function.

int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
void *key)
{
return try_to_wake_up(curr->private, mode, wake_flags);
}

try_to_wake_up() belongs to the core scheduler code; we analyze it next.

20.3 Waking a sleeping process

The analysis above covered sleeping and waking in broad strokes, and everything funnels into try_to_wake_up().

20.3.1 try_to_wake_up()

/**
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * Put it on the run-queue if it's not already there. The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * Return: %true if @p was woken up, %false if it was already running.
 * or @state didn't match @p's state.
 */
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
unsigned long flags;
int cpu, success = 0;

/*
* If we are going to wake up a thread waiting for CONDITION we
* need to ensure that CONDITION=1 done by the caller can not be
* reordered with p->state check below. This pairs with mb() in
* set_current_state() the waiting thread does.
*/
smp_mb__before_spinlock();
raw_spin_lock_irqsave(&p->pi_lock, flags);
if (!(p->state & state))
goto out;

trace_sched_waking(p);

success = 1; /* we're going to change ->state */
cpu = task_cpu(p);

if (p->on_rq && ttwu_remote(p, wake_flags))
goto stat;

#ifdef CONFIG_SMP
/*
* Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
* possible to, falsely, observe p->on_cpu == 0.
*
* One must be running (->on_cpu == 1) in order to remove oneself
* from the runqueue.
*
* [S] ->on_cpu = 1; [L] ->on_rq
* UNLOCK rq->lock
* RMB
* LOCK rq->lock
* [S] ->on_rq = 0; [L] ->on_cpu
*
* Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock
* from the consecutive calls to schedule(); the first switching to our
* task, the second putting it to sleep.
*/
smp_rmb();

/*
* If the owning (remote) cpu is still in the middle of schedule() with
* this task as prev, wait until its done referencing the task.
*/
while (p->on_cpu)
cpu_relax();
/*
* Combined with the control dependency above, we have an effective
* smp_load_acquire() without the need for full barriers.
*
* Pairs with the smp_store_release() in finish_lock_switch().
*
* This ensures that tasks getting woken will be fully ordered against
* their previous state and preserve Program Order.
*/
smp_rmb();

p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;

if (p->sched_class->task_waking)
p->sched_class->task_waking(p);

cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
}
#endif /* CONFIG_SMP */

ttwu_queue(p, cpu); // put the newly woken process on the chosen CPU's runqueue
stat:
ttwu_stat(p, cpu, wake_flags);
out:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);

return success;
}

Boiled down, the interesting work is at the end: select_task_rq() chooses a CPU for the woken task, and ttwu_queue() puts the task on that CPU's runqueue. ttwu_queue() is the part we follow next.

20.3.2 Queueing the woken task: ttwu_queue()

static void ttwu_queue(struct task_struct *p, int cpu)
{
struct rq *rq = cpu_rq(cpu);

#if defined(CONFIG_SMP)
if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
sched_clock_cpu(cpu); /* sync clocks x-cpu */
ttwu_queue_remote(p, cpu);
return;
}
#endif

raw_spin_lock(&rq->lock);
lockdep_pin_lock(&rq->lock);
ttwu_do_activate(rq, p, 0); // this does the real work
lockdep_unpin_lock(&rq->lock);
raw_spin_unlock(&rq->lock);
}

Once again the real work is delegated, this time to ttwu_do_activate().

20.3.3 Enqueueing the task: ttwu_do_activate()

static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
{
lockdep_assert_held(&rq->lock);

#ifdef CONFIG_SMP
if (p->sched_contributes_to_load)
rq->nr_uninterruptible--;
#endif

ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
ttwu_do_wakeup(rq, p, wake_flags);
}

Two calls matter here: ttwu_activate() and ttwu_do_wakeup().

20.3.4 ttwu_activate()

static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
activate_task(rq, p, en_flags); // the familiar enqueue path
p->on_rq = TASK_ON_RQ_QUEUED; // mark the task as queued on a runqueue

/* if a worker is waking up, notify workqueue */
if (p->flags & PF_WQ_WORKER)
wq_worker_waking_up(p, cpu_of(rq));
}

activate_task() is the enqueue function we already analyzed for newly created processes, so I won't repeat it here; after it returns, the task is marked as queued on the runqueue.

One thing is different, though: inside enqueue_entity(), when the enqueue happens because of a wakeup, place_entity() is called to compensate the woken task's virtual runtime. Let's see how that compensation works.
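
For reference, the relevant branch of enqueue_entity() (kernel/sched/fair.c, trimmed to the wakeup path; everything else omitted) looks roughly like this: only an ENQUEUE_WAKEUP enqueue goes through place_entity() with initial = 0.

static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/* ... update_curr(), load tracking, etc. ... */

	if (flags & ENQUEUE_WAKEUP) {
		place_entity(cfs_rq, se, 0);	/* initial = 0: a woken task */
		enqueue_sleeper(cfs_rq, se);	/* sleep-time statistics */
	}

	/* ... __enqueue_entity() puts se back into the rbtree ... */
}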

20.3.5 Compensating virtual runtime: place_entity()

This function is an old friend: it adjusted the virtual runtime of a newly created task, and it is used again here to adjust the virtual runtime of a woken task.

// initial=1: a newly created task; initial=0: a task being woken up
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
u64 vruntime = cfs_rq->min_vruntime;

/*
 * The 'current' period is already promised to the current tasks,
 * however the extra weight of the new task will slow them down a
 * little, place the new task so that it fits in the slot that
 * stays open at the end.
 */
// START_DEBIT slightly penalizes newly created tasks
if (initial && sched_feat(START_DEBIT))
vruntime += sched_vslice(cfs_rq, se);

/* sleeps up to a single latency don't count. */
if (!initial) {
unsigned long thresh = sysctl_sched_latency; // one scheduling-latency period

/*
* Halve their sleep time's effect, to allow
* for a gentler effect of sleepers:
*/
if (sched_feat(GENTLE_FAIR_SLEEPERS)) // if this feature is enabled
thresh >>= 1; // halve the compensation (shift right by one == divide by 2)

vruntime -= thresh;
}

/* ensure we never gain time by being placed backwards. */

se->vruntime = max_vruntime(se->vruntime, vruntime);
}

  • If GENTLE_FAIR_SLEEPERS is not set: compensate by one full scheduling latency.
  • If GENTLE_FAIR_SLEEPERS is set: compensate by half a scheduling latency.

But if the process only slept very briefly, its own virtual runtime may well already be larger than the compensated value, which is exactly why the final max_vruntime() check is there.

If the process slept for a long time, its virtual runtime is set to the runqueue's minimum virtual runtime minus the compensation.

So a process waking from a long sleep ends up with a virtual runtime slightly below the queue minimum and therefore gets picked quickly, which is what gives interactive processes their responsive feel.
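
A small worked example with made-up numbers: suppose cfs_rq->min_vruntime is 100 ms, sysctl_sched_latency is 24 ms and GENTLE_FAIR_SLEEPERS is set, so thresh becomes 12 ms and the candidate vruntime is 100 - 12 = 88 ms. A task that slept for ages might have se->vruntime = 40 ms; max_vruntime(40, 88) picks 88 ms, so it is placed a little ahead of everyone else but by no more than half a latency. A task that only dozed off briefly might still have se->vruntime = 95 ms; max_vruntime(95, 88) keeps 95 ms, so a short nap earns no bonus at all.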

20.3.6 The wakeup proper: ttwu_do_wakeup()

/*
 * Mark the task runnable and perform wakeup-preemption.
 */
static void
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
check_preempt_curr(rq, p, wake_flags); // may mark the current task for preemption
p->state = TASK_RUNNING;
trace_sched_wakeup(p);

#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
* Our task @p is fully woken up and running; so its safe to
* drop the rq->lock, hereafter rq is only used for statistics.
*/
lockdep_unpin_lock(&rq->lock);
p->sched_class->task_woken(rq, p);
lockdep_pin_lock(&rq->lock);
}

if (rq->idle_stamp) {
u64 delta = rq_clock(rq) - rq->idle_stamp;
u64 max = 2*rq->max_idle_balance_cost;

update_avg(&rq->avg_idle, delta);

if (rq->avg_idle > max)
rq->avg_idle = max;

rq->idle_stamp = 0;
}
#endif
}

check_preempt_curr() is exactly the function we analyzed in the previous section for a freshly woken new task: it decides whether the woken task should preempt the currently running one, so refer back there for the details.

20.3.7 Summary

(Figure in the original post: summary diagram of process sleep and wakeup.)

20.4 A few questions

With the analysis done, here are a few questions worth thinking through together.

20.4.1 How do high-priority processes avoid losing out?

If everything is balanced on virtual runtime, don't high-priority processes get short-changed? How does CFS make sure they are not?

The key is that virtual runtime is weighted: for the same amount of real CPU time, a higher-priority (heavier-weight) task accumulates vruntime more slowly, so it keeps looking like it is behind and gets picked more often. In other words, equal vruntime does not mean equal CPU time; the heavier task has actually run longer.
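
A rough illustration of the weighting (a user-space sketch mirroring what calc_delta_fair() does in kernel/sched/fair.c; the weights 1024 and 3121 come from the standard nice-to-weight table, and the 10 ms figure is made up):

#include <stdio.h>

#define NICE_0_LOAD 1024ULL

/* vruntime advances by delta_exec scaled by NICE_0_LOAD / weight */
static unsigned long long vruntime_delta(unsigned long long delta_exec_ns,
					 unsigned long long weight)
{
	return delta_exec_ns * NICE_0_LOAD / weight;
}

int main(void)
{
	unsigned long long delta = 10ULL * 1000 * 1000;	/* 10 ms of real runtime */

	/* nice 0 (weight 1024): vruntime grows by the full 10 ms */
	printf("nice  0: +%llu ns\n", vruntime_delta(delta, 1024));
	/* nice -5 (weight 3121): vruntime grows by only ~3.3 ms */
	printf("nice -5: +%llu ns\n", vruntime_delta(delta, 3121));
	return 0;
}

So when both tasks sit at the same vruntime, the nice -5 task has in fact received roughly three times as much CPU.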

20.4.2 A child process's virtual runtime

As the previous article showed, a child's virtual runtime is copied from its parent, and if the START_DEBIT feature is enabled, place_entity() pushes it back a little further.

20.4.3 Does the parent or the child run first?

The kernel does expose a knob for whether the parent or the child runs first (the sched_child_runs_first sysctl), but it is not a hard guarantee, so in practice you should still write code as if the order were unknown, ha ha.

20.4.4 A woken process's virtual runtime

A woken process gets a bounded compensation: its vruntime is pulled down to at most one scheduling latency (or half of one) below the queue minimum, so it is usually scheduled promptly, but it cannot bank an arbitrarily large bonus by sleeping for a long time.

20.4.5 The big picture

(Figure in the original post: overall diagram of CFS process switching, sleep and wakeup.)

Reference: 从几个问题开始理解CFS调度器