重学计算机（二十一·、实时进程调度）

原创

酱油师兄 2022-03-30 11:45:12 博主文章分类：重学计算机 ©著作权

文章标签 实时进程调度进程调度时间片优先级链表 文章分类 后端开发

©著作权归作者所有：来自51CTO博客作者酱油师兄的原创作品，请联系作者获取转载授权，否则将追究法律责任

这一篇我们开始实时进程调度。实时系统分为两大类：硬实时和软实时。我们都知道Linux系统是软实时，并没有支持硬实时，一些rtos可能支持硬实时。不过我们分析的都是Linux系统。

21.1 实时调度队列

老规矩，我们还是从实时调度队列开始看起，之前我们分析过，在rq队列中，分别包含了cfs_rq和rt_rq。

这次我们就分析rt_rq。

21.1.1 就绪队列struct rt_rq

/*
 * Real-Time classes' related field in a runqueue.
 * Trimmed excerpt: the multi-CPU and group-scheduling members were removed
 * by the article's author.
 */
struct rt_rq {
  struct rt_prio_array active;    // per-priority lists plus the bitmap of occupied priorities
  unsigned int rt_nr_running;     // count of running entries; passed to add_nr_running()/sub_nr_running()

  int rt_queued;                  // set to 1 by enqueue_top_rt_rq(), cleared by dequeue_top_rt_rq()

  int rt_throttled;               // NOTE(review): presumably what rt_rq_throttled() reports — confirm
  u64 rt_time;                    // accumulated runtime; update_curr_rt() adds delta_exec here
  u64 rt_runtime;                 // NOTE(review): presumably the budget rt_time is checked against — confirm
  /* Nests inside the rq lock: */
  raw_spinlock_t rt_runtime_lock; // taken by update_curr_rt() while it updates rt_time
};

我把多cpu和组调度的删掉了，组调度以后有机会再分析了。

这里面最重要的是struct rt_prio_array这个结构体，下面我们来分析。

21.1.2 保存结构体struct rt_prio_array

/*
 * Declare an array of unsigned long called "name" with
 * BITS_TO_LONGS(bits) elements, to be used as a bitmap.
 */
#define DECLARE_BITMAP(name,bits) \
  unsigned long name[BITS_TO_LONGS(bits)]

/*
 * This is the priority-queue data structure of the RT scheduling class:
 *
 * bitmap - one bit per priority level (MAX_RT_PRIO bits) plus one extra
 *          delimiter bit; __enqueue_rt_entity() sets the bit of the
 *          priority it queues on and __dequeue_rt_entity() clears it once
 *          that priority's list is empty.
 * queue  - one list head per priority level, indexed by priority.
 */
struct rt_prio_array {
  DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
  struct list_head queue[MAX_RT_PRIO];
};

这个结构就有点意思，bitmap是用DECLARE_BITMAP定义的一个unsigned long数组，一共有MAX_RT_PRIO+1个位（多出来的1位是分隔符）。MAX_RT_PRIO这个想必大家都忘记了吧，其实我也忘记了，哈哈。MAX_RT_PRIO的值是100，实时优先级的取值范围是0~99，所以这个位图一共有101位。然后又定义了一个链表数组queue，大小是MAX_RT_PRIO，也就是100，每个优先级对应一个链表。

bitmap就是标志位，如果有相应优先级的进程存在，就会把对应的位置1，然后再挂载在queue[prio]链表上。可以看下图：

重学计算机（二十一·、实时进程调度）_链表

这个图已经很明确了，每一个优先级都有一个链表，到查询的时候，只需要在bitmap中找到第一个被置位的优先级，再取出对应链表上的第一个结点即可。

21.1.3 调度实体struct sched_rt_entity

/* Scheduling entity of the RT class; embedded in a task as its "rt" member. */
struct sched_rt_entity {
  struct list_head run_list;    // node linked into rt_prio_array.queue[prio]
  unsigned long timeout;        // reset to 0 by enqueue_task_rt() on ENQUEUE_WAKEUP
  unsigned long watchdog_stamp; // NOTE(review): presumably maintained by watchdog() — confirm
  unsigned int time_slice;      // remaining SCHED_RR slice; decremented in task_tick_rt()

  struct sched_rt_entity *back; // written by dequeue_rt_stack(): the entity visited just before this one
#ifdef CONFIG_RT_GROUP_SCHED
  struct sched_rt_entity  *parent;
  /* rq on which this entity is (to be) queued: */
  struct rt_rq    *rt_rq;
  /* rq "owned" by this entity/group: */
  struct rt_rq    *my_q;
#endif
};

这个结构体就是我们调度实体，run_list就是我们上面挂载在链表的结点。

21.1.4 进队操作__enqueue_rt_entity()

/*
 * Link a single RT scheduling entity into the priority array of the rt_rq
 * it belongs to.
 *
 * @rt_se: entity to queue
 * @head:  true to insert at the front of its priority list, false to
 *         append at the back
 *
 * Nothing is queued when group_rt_rq() returns a queue for rt_se and that
 * queue is throttled or has no running entries.
 */
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
  struct rt_rq *owner = rt_rq_of_se(rt_se);
  struct rt_prio_array *prio_array = &owner->active;
  struct rt_rq *child = group_rt_rq(rt_se);
  struct list_head *slot = &prio_array->queue[rt_se_prio(rt_se)];  /* list for this priority */

  /*
   * Don't enqueue the group if its throttled, or when empty.
   * The latter is a consequence of the former when a child group
   * get throttled and the current group doesn't have any other
   * active members.
   */
  if (child) {
    if (rt_rq_throttled(child))
      return;
    if (!child->rt_nr_running)
      return;
  }

  if (!head)
    list_add_tail(&rt_se->run_list, slot);  /* append at the back */
  else
    list_add(&rt_se->run_list, slot);       /* insert at the front */

  /* Mark this priority level as occupied. */
  __set_bit(rt_se_prio(rt_se), prio_array->bitmap);

  inc_rt_tasks(rt_se, owner);
}

21.1.5 出队操作__dequeue_rt_entity

/*
 * Unlink a single RT scheduling entity from the priority array of its
 * rt_rq.  The list node is only unlinked and re-initialised; nothing is
 * freed here.  When the priority list becomes empty, the matching bit in
 * the bitmap is cleared as well.
 */
static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
  struct rt_rq *owner = rt_rq_of_se(rt_se);
  struct rt_prio_array *prio_array = &owner->active;
  struct list_head *slot;

  list_del_init(&rt_se->run_list);

  slot = &prio_array->queue[rt_se_prio(rt_se)];
  if (list_empty(slot)) {
    /* last entry at this priority is gone: drop the occupancy bit */
    __clear_bit(rt_se_prio(rt_se), prio_array->bitmap);
  }

  dec_rt_tasks(rt_se, owner);
}

21.2 实时调度类实现

接下来我们看看实时调度类的实现，通过分析cfs，就明白这个结构体的重要。

/*
 * Method table of the RT scheduling class: every hook points at the
 * *_rt implementation of that operation.
 */
const struct sched_class rt_sched_class = {
  .next     = &fair_sched_class,        // class linked after the RT class
  .enqueue_task   = enqueue_task_rt,    // put a task on the RT run queue
  .dequeue_task   = dequeue_task_rt,    // take a task off the RT run queue
  .yield_task   = yield_task_rt,      

  .check_preempt_curr = check_preempt_curr_rt,    // decide whether the running task must be preempted

  .pick_next_task   = pick_next_task_rt,  // choose the next task to run
  .put_prev_task    = put_prev_task_rt,

#ifdef CONFIG_SMP
  .select_task_rq   = select_task_rq_rt,

  .set_cpus_allowed       = set_cpus_allowed_common,
  .rq_online              = rq_online_rt,
  .rq_offline             = rq_offline_rt,
  .task_woken   = task_woken_rt,
  .switched_from    = switched_from_rt,
#endif

  .set_curr_task          = set_curr_task_rt,
  .task_tick    = task_tick_rt,       // periodic (tick) scheduler hook

  .get_rr_interval  = get_rr_interval_rt,

  .prio_changed   = prio_changed_rt,
  .switched_to    = switched_to_rt,

  .update_curr    = update_curr_rt,   // runtime statistics update
};

21.2.1 统计函数update_curr_rt()

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 *
 * The time elapsed since curr->se.exec_start is added to the task's
 * sum_exec_runtime and, when RT bandwidth control is enabled, to rt_time
 * of every rt_rq visited by for_each_sched_rt_entity().
 */
static void update_curr_rt(struct rq *rq)
{
  struct task_struct *curr = rq->curr;      // task currently running on this rq
  struct sched_rt_entity *rt_se = &curr->rt;    // its RT scheduling entity
  u64 delta_exec;

  if (curr->sched_class != &rt_sched_class)   // not an RT-class task: nothing to account
    return;

  delta_exec = rq_clock_task(rq) - curr->se.exec_start; // time run since exec_start
  if (unlikely((s64)delta_exec <= 0))   // ignore a zero or negative interval
    return;

  schedstat_set(curr->se.statistics.exec_max,
          max(curr->se.statistics.exec_max, delta_exec));   // keep the longest interval seen

  curr->se.sum_exec_runtime += delta_exec;     // the running total is kept in the generic se, not in rt
  account_group_exec_runtime(curr, delta_exec);

  curr->se.exec_start = rq_clock_task(rq);   // start a new accounting interval
  cpuacct_charge(curr, delta_exec);

  sched_rt_avg_update(rq, delta_exec);   // NOTE(review): presumably feeds the rq's RT average — confirm

  if (!rt_bandwidth_enabled())   // bandwidth accounting is off: done
    return;

  for_each_sched_rt_entity(rt_se) {
    struct rt_rq *rt_rq = rt_rq_of_se(rt_se);

    if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
      raw_spin_lock(&rt_rq->rt_runtime_lock);
      rt_rq->rt_time += delta_exec;       // charge the elapsed time to this queue
      if (sched_rt_runtime_exceeded(rt_rq))  // helper reports the budget as exceeded: request a reschedule
        resched_curr(rq); 
      raw_spin_unlock(&rt_rq->rt_runtime_lock);
    }
  }
}

21.3 周期调度器

刚开始还是分析周期调度器吧，这个比较简单，哈哈。

21.3.1 周期调度函数task_tick_rt()

/*
 * Periodic tick handler of the RT class.
 *
 * Runtime statistics are refreshed on every tick.  Only SCHED_RR tasks are
 * time-sliced: once the slice is used up it is reloaded from
 * sched_rr_timeslice and, if the task shares its priority list with other
 * entries, the task is moved to the tail and a reschedule is requested.
 * Tasks with any other policy return right after the bookkeeping.
 */
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
  struct sched_rt_entity *rt_se = &p->rt;

  update_curr_rt(rq);   /* runtime accounting */
  watchdog(rq, p);      /* NOTE(review): purpose not visible here — confirm against watchdog() */

  /*
   * RR tasks need a special form of timeslice management.
   * FIFO tasks have no timeslices.
   */
  if (p->policy != SCHED_RR)
    return;

  /* Consume one tick; keep running while some of the slice is left. */
  p->rt.time_slice -= 1;
  if (p->rt.time_slice != 0)
    return;

  /* Slice exhausted: hand out a fresh default slice. */
  p->rt.time_slice = sched_rr_timeslice;

  /*
   * Requeue to the end of queue if we (and all of our ancestors) are not
   * the only element on the queue
   */
  for_each_sched_rt_entity(rt_se) {
    struct list_head *node = &rt_se->run_list;

    /* prev == next means this node is alone on its list */
    if (node->prev == node->next)
      continue;

    requeue_task_rt(rq, p, 0);
    resched_curr(rq);   /* flag the running task for rescheduling */
    return;
  }
}

通过上面的代码看出，FIFO是没有时间片的概念的，SCHED_RR是有时间片，当时间片还没到的时候，会返回，如果时间片到了，就会重新设置时间片，并且判断优先级链表，如果有其他进程，就会把这个进程移动到队列的后面（这个代码还没开始分析，哈哈）。

21.3.2 调整优先级队列准备requeue_task_rt()

static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
{
  struct sched_rt_entity *rt_se = &p->rt;
  struct rt_rq *rt_rq;

  for_each_sched_rt_entity(rt_se) {
    rt_rq = rt_rq_of_se(rt_se);   // 获取rt_rq
    requeue_rt_entity(rt_rq, rt_se, head);  // 这个才是干活函数
  }
}

这个函数其实没做啥，主要是调用了requeue_rt_entity()这个函数来处理。

21.3.3 调整优先级队列requeue_rt_entity()

/*
 * Put task to the head or the end of the run list without the overhead of
 * dequeue followed by enqueue.
 *
 * Entities that are not currently on an rt_rq are left untouched.
 */
static void
requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
{
  struct list_head *prio_list;

  if (!on_rt_rq(rt_se))
    return;

  /* list that holds every entry of this entity's priority */
  prio_list = &rt_rq->active.queue[rt_se_prio(rt_se)];

  if (!head)
    list_move_tail(&rt_se->run_list, prio_list);  /* to the back of the list */
  else
    list_move(&rt_se->run_list, prio_list);       /* to the front of the list */
}

分析到这里就能看到，时间片到达了，就把这个进程移动到队列的尾部。关于链表的操作，我们这一节就不分析了。

21.3.4 总结

重学计算机（二十一·、实时进程调度）_时间片_02

21.4 新进程加入

周期性调度器已经分析完了，那我们就看看新进程加入的时候，实时调度器是怎么实现的。

21.4.1 (*task_fork)()初始化

实时调度类并没有实现新进程加入的初始化，没有就没有吧，我们继续往后看。

21.4.2 进队操作enqueue_task_rt()

这个进队操作，实时调度类还是支持的，我们来看看：

/*
 * Adding/removing a task to/from a priority array:
 *
 * sched_class enqueue hook for RT tasks.  A wake-up enqueue resets the
 * entity's timeout; the real work is done by enqueue_rt_entity().
 */
static void
enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
  struct sched_rt_entity *entity = &p->rt;

  if ((flags & ENQUEUE_WAKEUP) != 0)
    entity->timeout = 0;

  enqueue_rt_entity(entity, flags & ENQUEUE_HEAD);

  /* The running task is never put on the pushable list. */
  if (task_current(rq, p))
    return;

  /* multi-CPU: only tasks allowed on more than one CPU can be pushed */
  if (p->nr_cpus_allowed > 1)
    enqueue_pushable_task(rq, p);
}

21.4.3 真正进队函数enqueue_rt_entity()

/*
 * Enqueue rt_se.  dequeue_rt_stack() first takes the queued entities of
 * this stack off their lists, then every entity visited by
 * for_each_sched_rt_entity() is inserted, and finally the top-level rt_rq
 * is accounted again.
 *
 * @head: true to insert at the front of the priority list.
 */
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
  struct rq *rq = rq_of_rt_se(rt_se);

  dequeue_rt_stack(rt_se);  // unlink entities of this stack that are currently queued and clear rt_queued
  for_each_sched_rt_entity(rt_se)
    __enqueue_rt_entity(rt_se, head);   // link into the per-priority list
  enqueue_top_rt_rq(&rq->rt);  // set rt_queued again and add rt_nr_running back via add_nr_running()
}

21.4.4 rt_rq就绪队列的标记清除dequeue_rt_stack()

/*
 * Because the prio of an upper entry depends on the lower
 * entries, we must remove entries top - down.
 */
static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
{
  struct sched_rt_entity *back = NULL;

  /*
   * First pass: walk the entities with for_each_sched_rt_entity() and make
   * each one's ->back point at the entity visited just before it, so the
   * walk can be replayed in reverse.  On exit "back" is the last entity
   * visited.
   */
  for_each_sched_rt_entity(rt_se) {
    rt_se->back = back;
    back = rt_se;
  }

  // Clear rt_queued on the rt_rq that holds the last-visited entity.
  dequeue_top_rt_rq(rt_rq_of_se(back));   // rt_rq_of_se(back) yields that entity's queue

  /*
   * Second pass: follow ->back from the last-visited entity to the
   * starting one and unlink every entity that is currently queued.
   * The loop does run; it only skips entities for which on_rt_rq()
   * is false.
   */
  for (rt_se = back; rt_se; rt_se = rt_se->back) {
    if (on_rt_rq(rt_se))
      __dequeue_rt_entity(rt_se);
  }
}

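rt_se->back是代表什么？从上面的代码可以看出：第一个循环从当前实体开始逐层向上遍历，并把每个实体的back指向上一次遍历到的实体（也就是它的下一层）；这样第二个循环就可以从最后遍历到的（最顶层的）实体出发，沿着back指针自上而下依次出队。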

21.4.5 清除了一些标记dequeue_top_rt_rq()

/*
 * Take the top-level rt_rq out of the rq's accounting: its rt_nr_running
 * is handed to sub_nr_running() and rt_queued is cleared.  Does nothing
 * when rt_queued is already 0.  Must be called with the rq's own rt_rq,
 * otherwise BUG_ON() fires.
 */
static void
dequeue_top_rt_rq(struct rt_rq *rt_rq)
{
  struct rq *owner = rq_of_rt_rq(rt_rq);

  BUG_ON(rt_rq != &owner->rt);

  if (rt_rq->rt_queued) {
    BUG_ON(owner->nr_running == 0);

    sub_nr_running(owner, rt_rq->rt_nr_running);
    rt_rq->rt_queued = 0;
  }
}

这个函数把rt_rq->rt_nr_running从rq的运行进程数中减掉（sub_nr_running），并把rt_queued标记清0；如果rt_queued本来就是0，则直接返回。

21.4.6 恢复rt_rq的标记enqueue_top_rt_rq()

/*
 * Put the top-level rt_rq back into the rq's accounting: its rt_nr_running
 * is handed to add_nr_running() and rt_queued is set.  Skipped when the
 * queue is already accounted, is throttled, or has no running entries.
 * Must be called with the rq's own rt_rq, otherwise BUG_ON() fires.
 */
static void
enqueue_top_rt_rq(struct rt_rq *rt_rq)
{
  struct rq *owner = rq_of_rt_rq(rt_rq);

  BUG_ON(rt_rq != &owner->rt);

  if (rt_rq->rt_queued)
    return;   /* already accounted */

  if (rt_rq_throttled(rt_rq))
    return;

  if (!rt_rq->rt_nr_running)
    return;   /* nothing to account */

  add_nr_running(owner, rt_rq->rt_nr_running);
  rt_rq->rt_queued = 1;
}

21.4.7 判断是否需要抢占check_preempt_curr_rt()

前面已经添加到队列了，现在就可以判断是否需要抢占

/*
 * Preempt the current task with a newly woken task if needed:
 *
 * A numerically smaller prio than the running task's requests a
 * reschedule straight away.  On SMP an equal prio is passed on to
 * check_preempt_equal_prio(), unless a reschedule is already pending.
 */
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
{
  struct task_struct *running = rq->curr;

  if (p->prio < running->prio) {
    resched_curr(rq);   /* flag the running task for rescheduling */
    return;
  }

#ifdef CONFIG_SMP
  /*
   * If:
   *
   * - the newly woken task is of equal priority to the current task
   * - the newly woken task is non-migratable while current is migratable
   * - current will be preempted on the next reschedule
   *
   * we should check to see if current can readily move to a different
   * cpu.  If so, we will reschedule to allow the push logic to try
   * to move current somewhere else, making room for our non-migratable
   * task.
   */
  if (p->prio != running->prio)
    return;
  if (test_tsk_need_resched(running))
    return;

  check_preempt_equal_prio(rq, p);
#endif
}

21.4.8 总结

重学计算机（二十一·、实时进程调度）_实时进程调度_03

21.5 主调度器

接下来我们看看主调度器。

21.5.1 退出队列dequeue_task_rt()

/*
 * sched_class dequeue hook for RT tasks: bring the runtime statistics up
 * to date, then take p's RT entity off its queue.
 */
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
  struct sched_rt_entity *rt_se = &p->rt;

  update_curr_rt(rq);         // account runtime before the queue changes
  dequeue_rt_entity(rt_se);   // does the actual unlinking

  dequeue_pushable_task(rq, p);   // multi-CPU (SMP) bookkeeping
}

21.5.2 真正退出队列函数dequeue_rt_entity()

/*
 * Remove rt_se from its queue.  The whole entity stack is dequeued first;
 * afterwards every entity visited by for_each_sched_rt_entity() whose own
 * group queue (group_rt_rq()) still has running entries is queued again
 * at the tail, and the top-level rt_rq accounting is restored.
 */
static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
  struct rq *rq = rq_of_rt_se(rt_se);

  dequeue_rt_stack(rt_se);  // unlink the queued entities of this stack and clear rt_queued

  for_each_sched_rt_entity(rt_se) {
    struct rt_rq *rt_rq = group_rt_rq(rt_se);

    if (rt_rq && rt_rq->rt_nr_running)
      __enqueue_rt_entity(rt_se, false);   // head == false: append at the tail
  }
  enqueue_top_rt_rq(&rq->rt);   // set rt_queued again if running entries remain
}

21.5.3 找到下一个进程pick_next_task_rt()

/*
 * sched_class hook that selects the next RT task to run on rq.
 *
 * Returns the chosen task, NULL when the rq's rt_rq is not queued
 * (rt_queued == 0), or RETRY_TASK when a stop or deadline task became
 * runnable while pull_rt_task() was running.
 */
static struct task_struct *
pick_next_task_rt(struct rq *rq, struct task_struct *prev)
{
  struct task_struct *p;
  struct rt_rq *rt_rq = &rq->rt;

  if (need_pull_rt_task(rq, prev)) {  // multi-CPU (SMP) path
    /*
     * This is OK, because current is on_cpu, which avoids it being
     * picked for load-balance and preemption/IRQs are still
     * disabled avoiding further scheduler activity on it and we're
     * being very careful to re-start the picking loop.
     */
    lockdep_unpin_lock(&rq->lock);
    pull_rt_task(rq);
    lockdep_pin_lock(&rq->lock);
    /*
     * pull_rt_task() can drop (and re-acquire) rq->lock; this
     * means a dl or stop task can slip in, in which case we need
     * to re-start task selection.
     */
    if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
           rq->dl.dl_nr_running))
      return RETRY_TASK;
  }

  /*
   * We may dequeue prev's rt_rq in put_prev_task().
   * So, we update time before rt_nr_running check.
   */
  if (prev->sched_class == &rt_sched_class)
    update_curr_rt(rq);   // bring prev's runtime statistics up to date

  if (!rt_rq->rt_queued)   // RT queue is not accounted on this rq: nothing to pick
    return NULL;

  put_prev_task(rq, prev);  // hand prev back before choosing the next task

  p = _pick_next_task_rt(rq);   // the actual selection happens here

  /* The running task is never eligible for pushing */
  dequeue_pushable_task(rq, p); // multi-CPU (SMP)

  queue_push_tasks(rq);     // multi-CPU (SMP)

  return p;
}

21.5.4 查找下一个进程_pick_next_task_rt()

/*
 * Pick the task to run next from rq's RT queue.
 *
 * Starting at the rq's own rt_rq, the best entity of each level is taken
 * and, while group_rt_rq() yields a queue for it, the search continues
 * inside that queue.  The entity found last is converted to its task and
 * the task's exec_start is stamped with the current rq task clock.
 */
static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
  struct rt_rq *level = &rq->rt;
  struct sched_rt_entity *chosen;
  struct task_struct *task;

  for (;;) {
    chosen = pick_next_rt_entity(rq, level);
    BUG_ON(!chosen);

    level = group_rt_rq(chosen);
    if (!level)
      break;    /* no queue below this entity: stop descending */
  }

  task = rt_task_of(chosen);
  task->se.exec_start = rq_clock_task(rq);  /* start of this execution interval */

  return task;
}

21.5.5 挑选合适的pick_next_rt_entity()

/*
 * Return the first entity on the list of the first priority whose bit is
 * set in rt_rq's bitmap.  BUG_ON() fires when the index found is not
 * below MAX_RT_PRIO.
 */
static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
               struct rt_rq *rt_rq)
{
  struct rt_prio_array *prio_array = &rt_rq->active;
  int first = sched_find_first_bit(prio_array->bitmap);  /* first set bit in the bitmap */

  BUG_ON(first >= MAX_RT_PRIO);

  /* the head's ->next is the first entry queued at that priority */
  return list_entry(prio_array->queue[first].next,
        struct sched_rt_entity, run_list);
}