Recently I wanted to build a kernel module that tracks per-process IO latency, because iostat and blktrace both fall short: iostat only reports aggregate IO statistics for a whole block device, and the data blktrace produces is far too noisy. The module hooks the three key functions at the points where an IO request is inserted into the IO queue, dispatched, and completed -- blk_mq_sched_request_inserted, blk_mq_start_request and blk_account_io_done -- and measures how long each process's IO requests spend in the IO queue and in the disk driver layer. The principle is simple, but during development and debugging the kernel kept hanging for no apparent reason. The final culprits turned out to be a corrupted spin lock and an out-of-bounds kernel memory write -- bizarre and exasperating.

The tests in this article were done on CentOS 8.3 with kernel 4.18.0-240.el8. The main topics:

1: What to do when the kernel hangs without rebooting automatically, and how to get a crash dump generated

2: Analysis of a spin lock that inexplicably ends up in the native_queued_spin_lock_slowpath path

3: A pitfall of iterating a kernel linked list with list_for_each_entry

1: The added source code

This section describes the code inserted into blk_mq_sched_request_inserted, blk_mq_start_request and blk_account_io_done, the key functions at the three points where an IO request is inserted into the IO queue, dispatched, and completed. You don't need to understand every detail; a rough picture is enough, because the kernel hangs discussed below are closely tied to this code.

First, the structures needed this time are added to include/linux/blk_types.h:

#define COMM_LEN 16
struct process_io_control{
    //enable flag
    int enable;
    spinlock_t process_lock;
    //each process's process_io_info is linked, via its process_io_info_list member, into this structure's process_io_control_head list
    struct list_head process_io_control_head;
    //kernel thread that periodically prints every process's IO latency info
    struct task_struct *kernel_thread;
    //slab pool for process_rq_stat structures
    struct kmem_cache *process_rq_stat_cachep;
    //slab pool for process_io_info structures
    struct kmem_cache *process_io_info_cachep;
};
struct process_io_info{
    int pid;//process PID
    char comm[COMM_LEN];//process name
    //number of IO requests the process currently has in flight
    int rq_count;
    //how many sampling rounds the process has had no IO requests
    int rq_empty_count;
    //longest time one of this process's IO requests stayed in the IO queue
    u32 max_id_time;
    //longest time one of this process's IO requests spent in the disk driver layer
    u32 max_dc_time;
    //longest total time from queue insertion to completion for this process's IO requests
    u32 max_idc_time;
    //the IO request that stayed longest in the IO queue (per process)
    struct request *max_id_time_rq;
    //the IO request that spent longest in the disk driver layer (per process)
    struct request *max_dc_time_rq;
    //the IO request with the longest total transfer time (per process)
    struct request *max_idc_time_rq;
    //links this process's process_io_info into process_io_control's process_io_control_head list
    struct list_head process_io_info_list;
};
struct process_rq_stat{
    //points to the IO request
    struct request *rq;
    //time the IO request was inserted into the IO queue
    u64 rq_inset_time;
    //time the IO request was dispatched
    u64 rq_issue_time;
    //time the IO request stayed in the IO queue
    u32 id_time;
    //time the IO request spent in the disk driver layer
    u32 dc_time;
    //total time from queue insertion to completion
    u32 idc_time;
    //points to the process_io_info this IO request belongs to
    struct process_io_info *p_process_io_info;
};

struct process_rq_stat represents a single IO request; it mainly records the time points at which the request was inserted into the IO queue, dispatched, and completed. struct process_io_info represents a process doing IO; it mainly records the request that stayed longest in the IO queue together with that stay time, and the request that spent longest in the disk driver layer together with that time. struct process_io_control is the overall control structure: the process_io_info of every process doing IO is added to its process_io_control_head list, and it also starts a kernel thread which by default prints each IO-issuing process's latency information once per second (mainly the maximum time a request stayed in the IO queue and the maximum time a request spent in the disk driver).

How do these structures connect to the block layer? Take a look:

struct request {
    ............
    struct process_rq_stat *p_process_rq_stat;
    ............
}
struct gendisk {
    ...........
    struct process_io_control process_io;
    ...........
}

All that is added is a struct process_rq_stat *p_process_rq_stat pointer in struct request, which represents an IO request, and a struct process_io_control process_io member in struct gendisk, which represents the block device.

The code inserted into blk_mq_sched_request_inserted, blk_mq_start_request and blk_account_io_done -- the key functions at IO queue insertion, IO dispatch and IO completion -- is as follows:

//runs when an IO request is inserted into the IO queue
void blk_mq_sched_request_inserted(struct request *rq)
{

    trace_block_rq_insert(rq->q, rq);

    if(rq->rq_disk && rq->rq_disk->process_io.enable){
            struct process_rq_stat *p_process_rq_stat_tmp = NULL;
            struct process_io_info *p_process_io_info_tmp = NULL;
            int find = 0;

            spin_lock_irq(&(rq->rq_disk->process_io.process_lock));
            //first walk the process_io_control_head list to see whether the current process already has a process_io_info
            list_for_each_entry(p_process_io_info_tmp, &(rq->rq_disk->process_io.process_io_control_head), process_io_info_list){
                    if(p_process_io_info_tmp->pid == current->pid){
                          p_process_io_info_tmp->rq_count ++;
                          find = 1;
                    }
            }
            spin_unlock_irq(&(rq->rq_disk->process_io.process_lock));

            if(0 == find){
                    //not found: allocate a new process_io_info for the current process
                    p_process_io_info_tmp = kmem_cache_alloc(rq->rq_disk->process_io.process_io_info_cachep,GFP_KERNEL);
                    if(!p_process_io_info_tmp)
                            goto fail;

                    memset(p_process_io_info_tmp,0,sizeof(struct process_io_info));
                    spin_lock_irq(&(rq->rq_disk->process_io.process_lock));
                    p_process_io_info_tmp->rq_count ++;
                    //add the process_io_info to the process_io.process_io_control_head list
                    list_add(&p_process_io_info_tmp->process_io_info_list,&(rq->rq_disk->process_io.process_io_control_head));
                    spin_unlock_irq(&(rq->rq_disk->process_io.process_lock));
            }
            //allocate a process_rq_stat for the IO request being transferred
            p_process_rq_stat_tmp = kmem_cache_alloc(rq->rq_disk->process_io.process_rq_stat_cachep,GFP_KERNEL);
            if(!p_process_rq_stat_tmp)
                    goto fail;
            memset(p_process_rq_stat_tmp,0,sizeof(struct process_rq_stat));
            //record the current process's PID and name in process_io_info
            p_process_io_info_tmp->pid = current->pid;
            strncpy(p_process_io_info_tmp->comm,current->comm,COMM_LEN-1);
            //process_rq_stat_tmp->p_process_io_info points to p_process_io_info
            p_process_rq_stat_tmp->p_process_io_info = p_process_io_info_tmp;
            //record the time the IO request was inserted into the IO queue
            p_process_rq_stat_tmp->rq_inset_time = ktime_to_us(ktime_get());
            //the IO request's rq->p_process_rq_stat points to the process_rq_stat
            rq->p_process_rq_stat = p_process_rq_stat_tmp;

            return;
    fail:
            if(p_process_rq_stat_tmp)
                kmem_cache_free(rq->rq_disk->process_io.process_rq_stat_cachep, p_process_rq_stat_tmp);
            if(p_process_io_info_tmp)
                kmem_cache_free(rq->rq_disk->process_io.process_io_info_cachep, p_process_io_info_tmp);
    }
}

//runs when an IO request is dispatched
void blk_mq_start_request(struct request *rq)
{
    ....................
    if(rq->rq_disk && rq->rq_disk->process_io.enable && rq->p_process_rq_stat){
        struct process_rq_stat *p_process_rq_stat_tmp = rq->p_process_rq_stat;
        struct process_io_info *p_process_io_info_tmp = rq->p_process_rq_stat->p_process_io_info;
        //process_rq_stat_tmp->rq_issue_time records the time the IO request was dispatched
        p_process_rq_stat_tmp->rq_issue_time = ktime_to_us(ktime_get());
        //process_rq_stat_tmp->id_time records how long the IO request stayed in the IO queue
        p_process_rq_stat_tmp->id_time = p_process_rq_stat_tmp->rq_issue_time - p_process_rq_stat_tmp->rq_inset_time;
        if(p_process_rq_stat_tmp->id_time > p_process_io_info_tmp->max_id_time){
            //process_io_info_tmp->max_id_time records the longest time a request stayed in the IO queue
            p_process_io_info_tmp->max_id_time = p_process_rq_stat_tmp->id_time;
            p_process_io_info_tmp->max_id_time_rq = rq;
        }
    }
}

//runs when an IO request completes
void blk_account_io_done(struct request *req, u64 now)
{
    ....................
    if(req->rq_disk && req->rq_disk->process_io.enable && req->p_process_rq_stat){
            struct process_rq_stat *p_process_rq_stat_tmp = req->p_process_rq_stat;
            struct process_io_info *p_process_io_info_tmp = req->p_process_rq_stat->p_process_io_info;
            //process_rq_stat_tmp->dc_time records the time the IO request spent in the disk driver
            p_process_rq_stat_tmp->dc_time = ktime_to_us(ktime_get()) - p_process_rq_stat_tmp->rq_issue_time;
            //process_rq_stat->idc_time records the total time from queue insertion to completion
            req->p_process_rq_stat->idc_time = p_process_rq_stat_tmp->dc_time + p_process_rq_stat_tmp->id_time;

            if(p_process_rq_stat_tmp->dc_time > p_process_io_info_tmp->max_dc_time){
                //process_io_info_tmp->max_dc_time records the longest time a request spent in the disk driver layer (per process)
                p_process_io_info_tmp->max_dc_time = p_process_rq_stat_tmp->dc_time;
                p_process_io_info_tmp->max_dc_time_rq = req;
            }

            if(p_process_rq_stat_tmp->idc_time > p_process_io_info_tmp->max_idc_time){
                //process_io_info_tmp->max_idc_time records the longest total time from insertion to completion (per process)
                p_process_io_info_tmp->max_idc_time = p_process_rq_stat_tmp->idc_time;
                p_process_io_info_tmp->max_idc_time_rq = req;
            }
            p_process_io_info_tmp->rq_count --;
            if(p_process_io_info_tmp->rq_count < 0){
                    printk("%s error:%d\n",__func__,p_process_io_info_tmp->rq_count);
            }
            kmem_cache_free(req->rq_disk->process_io.process_rq_stat_cachep, p_process_rq_stat_tmp);
            req->p_process_rq_stat = NULL;
    }
}

In addition, the following code is added to block/genhd.c:

#include <linux/delay.h>
extern void print_process_io_info(struct process_io_control *p_process_io_tmp);
//extern void free_all_process_io_info(struct process_io_control *p_process_io_tmp);
static int process_rq_stat_thread(void *arg)
{
    struct process_io_control *p_process_io_tmp = (struct process_io_control *)arg;
    while (!kthread_should_stop()) {
        print_process_io_info(p_process_io_tmp);
        msleep(1000);
    }
    p_process_io_tmp->kernel_thread = NULL;
    return 0;
}
static ssize_t disk_process_rq_stat_show(struct device *dev,
                                         struct device_attribute *attr,
                                         char *buf)
{
        struct gendisk *disk = dev_to_disk(dev);

        return sprintf(buf, "%d\n", disk->process_io.enable);
}
static ssize_t disk_process_rq_stat_store(struct device *dev,
                                          struct device_attribute *attr,
                                          const char *buf, size_t count)
{
        struct gendisk *disk = dev_to_disk(dev);
        long intv;

        if (!count || !sscanf(buf, "%ld", &intv))
                return -EINVAL;

        if (!(intv == 0 || intv == 1))
                return -EINVAL;

        if(disk->process_io.enable != intv){
                if(intv)
                {
                    INIT_LIST_HEAD(&(disk->process_io.process_io_control_head));
                    spin_lock_init(&(disk->process_io.process_lock));
                    atomic_set(&(disk->process_io.lock_count),0);
                    disk->process_io.process_rq_stat_cachep = KMEM_CACHE(process_rq_stat,0);
                    disk->process_io.process_io_info_cachep = KMEM_CACHE(process_io_info,0);

                    disk->process_io.kernel_thread = kthread_create(process_rq_stat_thread,(void *)&disk->process_io,"process_rq_stat");
                    if (IS_ERR(disk->process_io.kernel_thread)) {
                            printk("%s kthread_create fail\n",__func__);
                    }else{
                            wake_up_process(disk->process_io.kernel_thread);
                    }
                }
                disk->process_io.enable = intv;
       }
       return count;
}
static const DEVICE_ATTR(process_rq_stat, 0644,
                         disk_process_rq_stat_show,
                         disk_process_rq_stat_store);
static const struct attribute *disk_events_attrs[] = {
        &dev_attr_events.attr,
        &dev_attr_events_async.attr,
        &dev_attr_events_poll_msecs.attr,
        //creates the /sys/block/sd*/process_rq_stat node
        &dev_attr_process_rq_stat.attr,
        NULL,
};

This code creates the /sys/block/sd*/process_rq_stat sysfs node, so that echo 1 > /sys/block/sd*/process_rq_stat enables the per-process IO latency statistics. It also performs the necessary initialization of the process_io_control members (the process_lock spin lock, the process_rq_stat_cachep and process_io_info_cachep slab pools, the process_io_control_head list, and so on) and creates the kernel thread. The thread mainly calls print_process_io_info(), which prints each process's IO latency information; its source:

block/blk-mq.c
void print_process_io_info(struct process_io_control *p_process_io_tmp)
{
    struct process_io_info *p_process_io_info_tmp = NULL;
    struct process_io_info *p_process_io_info_del = NULL;

        spin_lock_irq(&(p_process_io_tmp->process_lock));
        //walk every process's process_io_info on process_io_control's process_io_control_head list and print its IO latency data
        list_for_each_entry(p_process_io_info_tmp, &(p_process_io_tmp->process_io_control_head), process_io_info_list){

                if(p_process_io_info_del)//free the process_io_info of a process that has had no IO for a while
                {
                    list_del(&p_process_io_info_del->process_io_info_list);
                    kmem_cache_free(p_process_io_tmp->process_io_info_cachep, p_process_io_info_del);
                    p_process_io_info_del = NULL;
                }
                if(p_process_io_info_tmp->rq_count == 0)
                {
                        p_process_io_info_tmp->rq_empty_count ++;
                        //free the process_io_info of a process that has no IO requests
                        if(p_process_io_info_tmp->rq_empty_count == 3)
                        {
                             p_process_io_info_del = p_process_io_info_tmp;
                        }
                }
                else if(p_process_io_info_tmp->rq_empty_count != 0)
                {
                    p_process_io_info_tmp->rq_empty_count = 0;
                }
        }
        spin_unlock_irq(&(p_process_io_tmp->process_lock));
        ………………//more code here, omitted
}
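As an aside (my own illustration, not the author's code): the deferred p_process_io_info_del dance above exists because an entry must not be freed while list_for_each_entry() still needs its next pointer. The kernel already provides list_for_each_entry_safe() for exactly this pattern; a hedged sketch of the same cleanup logic, reusing the names from the listing above:

struct process_io_info *p_process_io_info_tmp = NULL;
struct process_io_info *p_next = NULL;

spin_lock_irq(&(p_process_io_tmp->process_lock));
list_for_each_entry_safe(p_process_io_info_tmp, p_next,
                         &(p_process_io_tmp->process_io_control_head),
                         process_io_info_list){
        if(p_process_io_info_tmp->rq_count == 0){
                p_process_io_info_tmp->rq_empty_count ++;
                if(p_process_io_info_tmp->rq_empty_count == 3){
                        //safe to delete here: the iterator already cached the next entry in p_next
                        list_del(&p_process_io_info_tmp->process_io_info_list);
                        kmem_cache_free(p_process_io_tmp->process_io_info_cachep, p_process_io_info_tmp);
                }
        }
        else if(p_process_io_info_tmp->rq_empty_count != 0){
                p_process_io_info_tmp->rq_empty_count = 0;
        }
}
spin_unlock_irq(&(p_process_io_tmp->process_lock));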

That covers the source code. Some functionality is still incomplete, but I wanted to build it and see the effect first. That turned out to be the beginning of the pain.

2: What to do when the kernel hangs for no apparent reason

After compiling the kernel and rebooting, I couldn't wait to run echo 1 > /sys/block/sda/process_rq_stat (sda holds the system disk). A few seconds later the test VM hung and would not reboot on its own. I forced a reboot and tested a few more times; it hung every time. My gut said an illegal memory access had crashed the kernel. Let's see where the problem is.

2.1 kdump not working properly

2.1.1 initramfs-*kdump.img not generated

systemctl status kdump showed that kdump was not working: initramfs-*kdump.img had not been generated, with a complaint about insufficient space. It turned out the VM's /boot directory is mounted on sda3, which had been given less than 300M. Annoying -- the only option was to delete unnecessary files. Files like initramfs-*kdump.img can be deleted because kdump can regenerate them. After systemctl restart kdump, kdump started normally. But another echo 1 > /sys/block/sda/process_rq_stat test still hung the machine without an automatic reboot. What could the reason be?

2.1.2 Not enough memory reserved for the kdump kernel

As a test I ran echo c > /proc/sysrq-trigger to trigger a kernel crash deliberately and rebooted; afterwards there was still no crash dump under /var/crash. Why? I switched back to the stock kernel, ran echo c > /proc/sysrq-trigger again, rebooted, and /var/crash finally contained a crash dump.

Going by experience: could it be that with my self-built kernel the crashkernel reservation is too small, so no crash dump can be written when the kernel crashes?

So I edited /etc/default/grub, changed auto to 512M in GRUB_CMDLINE_LINUX="crashkernel=auto", and ran grub2-mkconfig -o /boot/grub2/grub.cfg to update the grub configuration. Back on my kernel, after a reboot, echo c > /proc/sysrq-trigger finally produced a crash dump on the next boot.

But after echo 1 > /sys/block/sda/process_rq_stat the machine still just hung: no reboot, no crash dump. Speechless; my heart shattered all over the floor.

2.2 Kernel crash related parameters not configured

cat /proc/sys/kernel/panic* showed that most of these were 0. These parameters control whether the kernel panics proactively on soft lockups, hard lockups, RCU stalls and similar conditions. So I set them: for panic in `ls /proc/sys/kernel/panic*`;do echo 1 > $panic;done; echo 1 >/proc/sys/kernel/softlockup_panic. After echo 1 > /sys/block/sda/process_rq_stat the machine hung again as expected, and a few minutes later a crash dump was finally generated and the machine rebooted. The crash log:

[  554.838125] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
[  554.838130] rcu:     2-...0: (0 ticks this GP) idle=c0a/1/0x4000000000000000 softirq=15277/15278 fqs=11504
[  554.838136]  (detected by 3, t=60003 jiffies, g=16785, q=2)
[  554.838138] Sending NMI from CPU 3 to CPUs 2:
[  554.839150] NMI backtrace for cpu 2
[  554.839151] CPU: 2 PID: 35 Comm: kworker/u256:2 Kdump: loaded Tainted: G                 ---------r-  - 4.18.0 #33
....................
[  554.839152] RIP: 0010:native_queued_spin_lock_slowpath+0x20/0x1d0
....................
[  554.839154] Call Trace:
[  554.839155]  _raw_spin_lock_irq+0x23/0x26
[  554.839155]  blk_mq_sched_request_inserted+0x44/0x280
[  554.839155]  dd_insert_requests+0x9e/0x220
[  554.839155]  blk_mq_sched_insert_requests+0x65/0xf0
[  554.839155]  blk_mq_flush_plug_list+0x196/0x2c0
[  554.839155]  blk_flush_plug_list+0xd7/0x100
[  554.839156]  blk_finish_plug+0x21/0x2e
[  554.839156]  wb_writeback+0x175/0x2f0
[  554.839156]  ? btf_generic_check_kflag_member+0x30/0x40
[  554.839156]  ? cpumask_next+0x17/0x20
[  554.839156]  wb_workfn+0x333/0x3f0
[  554.839156]  ? __switch_to_asm+0x35/0x70
[  554.839156]  process_one_work+0x1a7/0x360
[  554.839157]  worker_thread+0x30/0x390
[  554.839157]  ? create_worker+0x1a0/0x1a0
[  554.839157]  kthread+0x112/0x130
[  554.839157]  ? kthread_flush_work_fn+0x10/0x10
[  554.839157]  ret_from_fork+0x35/0x40
[  554.839196] Kernel panic - not syncing: RCU Stall

So the culprit was the spin_lock_irq(&(p_process_io_tmp->process_lock)) spin lock: the lock misbehaved, a CPU spent far too long in the native_queued_spin_lock_slowpath path, and that triggered an RCU stall and the panic. Incredible -- an apparent deadlock!

Further testing showed that even with the settings above, echo 1 > /sys/block/sda/process_rq_stat sometimes still hung the machine outright without producing a crash dump. Puzzling; my guess is that those cases were only half hung, not hung hard enough to trigger the panic. Fine -- now that there was a crash log, time to analyze the code properly.

3: Analyzing the hang caused by the spin lock anomaly

First of all, the code I added is quite simple; no matter how I looked at it, it should not be able to deadlock on the spin lock. Even considering that blk_account_io_done(), which runs at IO completion, executes in softirq context, that still shouldn't produce a spin lock deadlock: I use spin_lock_irq/spin_unlock_irq, and there is only one lock. How could it possibly deadlock?

3.1 Initial look at the crash scene

Start the crash utility and analyze the vmcore generated at crash time:

//bt -a: dump the stack backtrace of every task
crash> bt -a
PID: 2493   TASK: ffff8e28425c17c0  CPU: 0   COMMAND: "process_rq_stat"
#0 [fffffe0000007e50] crash_nmi_callback at ffffffffaee4eee3
#1 [fffffe0000007e58] nmi_handle at ffffffffaee23703
#2 [fffffe0000007eb0] default_do_nmi at ffffffffaee23ade
#3 [fffffe0000007ed0] do_nmi at ffffffffaee23cb8
#4 [fffffe0000007ef0] end_repeat_nmi at ffffffffaf8016d4
    [exception RIP: native_queued_spin_lock_slowpath+32]
....................
#5 [ffffa7c1c2007eb8] native_queued_spin_lock_slowpath at ffffffffaef11180
#6 [ffffa7c1c2007eb8] _raw_spin_lock_irq at ffffffffaf6cde43
#7 [ffffa7c1c2007ec0] print_process_io_info at ffffffffaf20cd09
#8 [ffffa7c1c2007f00] process_rq_stat_thread at ffffffffaf211598
#9 [ffffa7c1c2007f10] kthread at ffffffffaeed9502
#10 [ffffa7c1c2007f50] ret_from_fork at ffffffffaf800255

PID: 0      TASK: ffff8e27cf0197c0  CPU: 1   COMMAND: "swapper/1"
#0 [fffffe0000032e50] crash_nmi_callback at ffffffffaee4eee3
#1 [fffffe0000032e58] nmi_handle at ffffffffaee23703
#2 [fffffe0000032eb0] default_do_nmi at ffffffffaee23ade
#3 [fffffe0000032ed0] do_nmi at ffffffffaee23cb8
#4 [fffffe0000032ef0] end_repeat_nmi at ffffffffaf8016d4
    [exception RIP: native_safe_halt+14]
....................
--- <NMI exception stack> ---
#5 [ffffa7c1c06b3ea0] native_safe_halt at ffffffffaf6cd67e
#6 [ffffa7c1c06b3ea0] default_idle at ffffffffaf6cd2ec
#7 [ffffa7c1c06b3ec8] do_idle at ffffffffaeeebbb7
#8 [ffffa7c1c06b3f10] cpu_startup_entry at ffffffffaeeebe0f
#9 [ffffa7c1c06b3f30] start_secondary at ffffffffaee50e51
#10 [ffffa7c1c06b3f50] secondary_startup_64 at ffffffffaee000e7

PID: 35     TASK: ffff8e27da3baf80  CPU: 2   COMMAND: "kworker/u256:2"
#0 [fffffe000005de50] crash_nmi_callback at ffffffffaee4eee3
#1 [fffffe000005de58] nmi_handle at ffffffffaee23703
#2 [fffffe000005deb0] default_do_nmi at ffffffffaee23ade
#3 [fffffe000005ded0] do_nmi at ffffffffaee23cb8
#4 [fffffe000005def0] end_repeat_nmi at ffffffffaf8016d4
    [exception RIP: native_queued_spin_lock_slowpath+32]
....................
--- <NMI exception stack> ---
#5 [ffffa7c1c0b53bd8] native_queued_spin_lock_slowpath at ffffffffaef11180
#6 [ffffa7c1c0b53bd8] _raw_spin_lock_irq at ffffffffaf6cde43
#7 [ffffa7c1c0b53be0] blk_mq_sched_request_inserted at ffffffffaf20f224
#8 [ffffa7c1c0b53c18] dd_insert_requests at ffffffffaf225aae
#9 [ffffa7c1c0b53c60] blk_mq_sched_insert_requests at ffffffffaf20f9d5
#10 [ffffa7c1c0b53c98] blk_mq_flush_plug_list at ffffffffaf20acb6
#11 [ffffa7c1c0b53d18] blk_flush_plug_list at ffffffffaf1ffb67
#12 [ffffa7c1c0b53d68] blk_finish_plug at ffffffffaf1ffbb1
#13 [ffffa7c1c0b53d78] wb_writeback at ffffffffaf10b9d5
#14 [ffffa7c1c0b53e08] wb_workfn at ffffffffaf10ce43
#15 [ffffa7c1c0b53e98] process_one_work at ffffffffaeed3477
#16 [ffffa7c1c0b53ed8] worker_thread at ffffffffaeed3b40
#17 [ffffa7c1c0b53f10] kthread at ffffffffaeed9502
#18 [ffffa7c1c0b53f50] ret_from_fork at ffffffffaf800255

PID: 0      TASK: ffff8e27cf038000  CPU: 3   COMMAND: "swapper/3"
#0 [ffff8e285a0c3ce8] machine_kexec at ffffffffaee5bf3e
#1 [ffff8e285a0c3d40] __crash_kexec at ffffffffaef6071d
#2 [ffff8e285a0c3e08] panic at ffffffffaeeb5dc7
#3 [ffff8e285a0c3e88] rcu_sched_clock_irq.cold.70 at ffffffffaef33dbd
#4 [ffff8e285a0c3ed0] update_process_times at ffffffffaef3fbe4
#5 [ffff8e285a0c3ee0] tick_sched_handle at ffffffffaef508d2
#6 [ffff8e285a0c3ef8] tick_sched_timer at ffffffffaef50be7
#7 [ffff8e285a0c3f18] __hrtimer_run_queues at ffffffffaef40890
#8 [ffff8e285a0c3f78] hrtimer_interrupt at ffffffffaef41060
#9 [ffff8e285a0c3fd8] smp_apic_timer_interrupt at ffffffffaf8027da
#10 [ffff8e285a0c3ff0] apic_timer_interrupt at ffffffffaf801d6f
--- <IRQ stack> ---
#11 [ffffa7c1c06c3df8] apic_timer_interrupt at ffffffffaf801d6f
    [exception RIP: native_safe_halt+14]
....................
#12 [ffffa7c1c06c3ea0] default_idle at ffffffffaf6cd2ec
#13 [ffffa7c1c06c3ec8] do_idle at ffffffffaeeebbb7
#14 [ffffa7c1c06c3f10] cpu_startup_entry at ffffffffaeeebe0f
#15 [ffffa7c1c06c3f30] start_secondary at ffffffffaee50e51
#16 [ffffa7c1c06c3f50] secondary_startup_64 at ffffffffaee000e7
Then use bt -c to look at each of the 4 CPUs' backtraces:
crash> bt -c 0
PID: 2493   TASK: ffff8e28425c17c0  CPU: 0   COMMAND: "process_rq_stat"
#0 [fffffe0000007e50] crash_nmi_callback at ffffffffaee4eee3
#1 [fffffe0000007e58] nmi_handle at ffffffffaee23703
#2 [fffffe0000007eb0] default_do_nmi at ffffffffaee23ade
#3 [fffffe0000007ed0] do_nmi at ffffffffaee23cb8
#4 [fffffe0000007ef0] end_repeat_nmi at ffffffffaf8016d4
    [exception RIP: native_queued_spin_lock_slowpath+32]
....................
--- <NMI exception stack> ---
#5 [ffffa7c1c2007eb8] native_queued_spin_lock_slowpath at ffffffffaef11180
#6 [ffffa7c1c2007eb8] _raw_spin_lock_irq at ffffffffaf6cde43
#7 [ffffa7c1c2007ec0] print_process_io_info at ffffffffaf20cd09
#8 [ffffa7c1c2007f00] process_rq_stat_thread at ffffffffaf211598
#9 [ffffa7c1c2007f10] kthread at ffffffffaeed9502
#10 [ffffa7c1c2007f50] ret_from_fork at ffffffffaf800255
crash> bt -c 1
PID: 0      TASK: ffff8e27cf0197c0  CPU: 1   COMMAND: "swapper/1"
#0 [fffffe0000032e50] crash_nmi_callback at ffffffffaee4eee3
#1 [fffffe0000032e58] nmi_handle at ffffffffaee23703
#2 [fffffe0000032eb0] default_do_nmi at ffffffffaee23ade
#3 [fffffe0000032ed0] do_nmi at ffffffffaee23cb8
#4 [fffffe0000032ef0] end_repeat_nmi at ffffffffaf8016d4
    [exception RIP: native_safe_halt+14]
....................
--- <NMI exception stack> ---
#5 [ffffa7c1c06b3ea0] native_safe_halt at ffffffffaf6cd67e
#6 [ffffa7c1c06b3ea0] default_idle at ffffffffaf6cd2ec
#7 [ffffa7c1c06b3ec8] do_idle at ffffffffaeeebbb7
#8 [ffffa7c1c06b3f10] cpu_startup_entry at ffffffffaeeebe0f
#9 [ffffa7c1c06b3f30] start_secondary at ffffffffaee50e51
#10 [ffffa7c1c06b3f50] secondary_startup_64 at ffffffffaee000e7
crash> bt -c 2
PID: 35     TASK: ffff8e27da3baf80  CPU: 2   COMMAND: "kworker/u256:2"
#0 [fffffe000005de50] crash_nmi_callback at ffffffffaee4eee3
#1 [fffffe000005de58] nmi_handle at ffffffffaee23703
#2 [fffffe000005deb0] default_do_nmi at ffffffffaee23ade
#3 [fffffe000005ded0] do_nmi at ffffffffaee23cb8
#4 [fffffe000005def0] end_repeat_nmi at ffffffffaf8016d4
    [exception RIP: native_queued_spin_lock_slowpath+32]
....................
--- <NMI exception stack> ---
#5 [ffffa7c1c0b53bd8] native_queued_spin_lock_slowpath at ffffffffaef11180
#6 [ffffa7c1c0b53bd8] _raw_spin_lock_irq at ffffffffaf6cde43
#7 [ffffa7c1c0b53be0] blk_mq_sched_request_inserted at ffffffffaf20f224
#8 [ffffa7c1c0b53c18] dd_insert_requests at ffffffffaf225aae
#9 [ffffa7c1c0b53c60] blk_mq_sched_insert_requests at ffffffffaf20f9d5
#10 [ffffa7c1c0b53c98] blk_mq_flush_plug_list at ffffffffaf20acb6
#11 [ffffa7c1c0b53d18] blk_flush_plug_list at ffffffffaf1ffb67
#12 [ffffa7c1c0b53d68] blk_finish_plug at ffffffffaf1ffbb1
#13 [ffffa7c1c0b53d78] wb_writeback at ffffffffaf10b9d5
#14 [ffffa7c1c0b53e08] wb_workfn at ffffffffaf10ce43
#15 [ffffa7c1c0b53e98] process_one_work at ffffffffaeed3477
#16 [ffffa7c1c0b53ed8] worker_thread at ffffffffaeed3b40
#17 [ffffa7c1c0b53f10] kthread at ffffffffaeed9502
#18 [ffffa7c1c0b53f50] ret_from_fork at ffffffffaf800255
crash> bt -c 3
PID: 0      TASK: ffff8e27cf038000  CPU: 3   COMMAND: "swapper/3"
#0 [ffff8e285a0c3ce8] machine_kexec at ffffffffaee5bf3e
#1 [ffff8e285a0c3d40] __crash_kexec at ffffffffaef6071d
#2 [ffff8e285a0c3e08] panic at ffffffffaeeb5dc7
#3 [ffff8e285a0c3e88] rcu_sched_clock_irq.cold.70 at ffffffffaef33dbd
#4 [ffff8e285a0c3ed0] update_process_times at ffffffffaef3fbe4
#5 [ffff8e285a0c3ee0] tick_sched_handle at ffffffffaef508d2
#6 [ffff8e285a0c3ef8] tick_sched_timer at ffffffffaef50be7
#7 [ffff8e285a0c3f18] __hrtimer_run_queues at ffffffffaef40890
#8 [ffff8e285a0c3f78] hrtimer_interrupt at ffffffffaef41060
#9 [ffff8e285a0c3fd8] smp_apic_timer_interrupt at ffffffffaf8027da
#10 [ffff8e285a0c3ff0] apic_timer_interrupt at ffffffffaf801d6f
--- <IRQ stack> ---
#11 [ffffa7c1c06c3df8] apic_timer_interrupt at ffffffffaf801d6f
    [exception RIP: native_safe_halt+14]
    ....................
#12 [ffffa7c1c06c3ea0] default_idle at ffffffffaf6cd2ec
#13 [ffffa7c1c06c3ec8] do_idle at ffffffffaeeebbb7
#14 [ffffa7c1c06c3f10] cpu_startup_entry at ffffffffaeeebe0f
#15 [ffffa7c1c06c3f30] start_secondary at ffffffffaee50e51
#16 [ffffa7c1c06c3f50] secondary_startup_64 at ffffffffaee000e7

Looking at these backtraces alone, this is simply not what a spin lock deadlock scene should look like! The only thing that stands out is that two tasks are stuck in the _raw_spin_lock_irq->native_queued_spin_lock_slowpath path, i.e. failing to take the lock:

  • 1: blk_mq_sched_request_inserted->_raw_spin_lock_irq->native_queued_spin_lock_slowpath
  • 2: print_process_io_info->_raw_spin_lock_irq->native_queued_spin_lock_slowpath

But that by itself proves nothing! Everything suggested this problem could not be cracked with the usual deadlock-hunting approach, let alone by simply turning on spin lock debugging.

One thing was clear: the deadlock had to be related to my code. So I used a very primitive method: in the functions I had modified -- blk_mq_sched_request_inserted, blk_mq_start_request, blk_account_io_done and print_process_io_info -- I commented out my code piece by piece. To my surprise, commenting out p_process_io_info_tmp->max_idc_time_rq = req in blk_account_io_done() made the hang disappear! Astonishing.

My initial hypotheses for the spin lock hang:

1: The kernel itself has a bug that my code happens to trigger. Or my code hides a very well-buried spin lock bug.

2: A memory overwrite: something writes out of bounds into the spin_lock_irq(&(p_process_io_tmp->process_lock)) lock variable and eventually causes the deadlock. If so, the problem most likely lies in the traversal of process_io_control's process_io_control_head list in blk_mq_sched_request_inserted or print_process_io_info. But in blk_mq_sched_request_inserted, even if a process_io_info found during the traversal were bad, it is a process_io_info allocated by kmem_cache_alloc -- how could that touch the spin lock inside process_io_control? And print_process_io_info seemed essentially beyond suspicion.

3: Some unknown issue? A ghost in the machine? I've certainly seen my share of bizarre problems.

Too many directions to suspect, and no idea which one to chase first.

3.2 Further spin lock analysis

In later tests a new deadlock log showed up:

[  216.259727] watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [kworker/0:1H:490]
[  216.259753]  mptbase serio_raw fuse
[  216.259765] CPU: 0 PID: 490 Comm: kworker/0:1H Kdump: loaded Tainted: G                 ---------r-  - 4.18.0 #42
............
[  216.259800]  _raw_spin_lock+0x1c/0x20
[  216.259802]  dd_dispatch_request+0x31/0x230
[  216.259804]  ? __switch_to_asm+0x35/0x70
[  216.259806]  blk_mq_do_dispatch_sched+0xc5/0x160
[  216.259807]  ? __switch_to_asm+0x35/0x70
[  216.259809]  __blk_mq_sched_dispatch_requests+0xfe/0x160
[  216.259810]  blk_mq_sched_dispatch_requests+0x30/0x60
[  216.259811]  __blk_mq_run_hw_queue+0x51/0xd0
[  216.259813]  process_one_work+0x1a7/0x360
[  216.259814]  worker_thread+0x30/0x390
[  216.259815]  ? create_worker+0x1a0/0x1a0
[  216.259816]  kthread+0x112/0x130
[  216.259817]  ? kthread_flush_work_fn+0x10/0x10

This is a new lockup scene, but my read was the same: some IO-issuing process is still stuck in blk_mq_sched_request_inserted, and it must be related to the code I added there.

So I added printk debug output at every step of blk_mq_sched_request_inserted, blk_mq_start_request, blk_account_io_done and print_process_io_info (I was debugging in a VM, so I wasn't worried about printk sleeping -- normally you must not sleep while holding a spin lock). The point was to verify that every process running my spin lock code locks and unlocks normally and that no process holds the lock forever. I built the new kernel and rebooted; echo 1 > /sys/block/sda/process_rq_stat quickly triggered the deadlock and a crash dump. The log showed that the processes executing blk_mq_sched_request_inserted, blk_mq_start_request, blk_account_io_done and print_process_io_info all took and released the spin lock normally -- until suddenly some process calling spin_lock_irq in blk_mq_sched_request_inserted or print_process_io_info dropped straight into native_queued_spin_lock_slowpath and never returned. In short, my use of the spin lock itself was fine; it should not deadlock.

Next I suspected that list_del(&p_process_io_info_del->process_io_info_list) and kmem_cache_free(p_process_io_tmp->process_io_info_cachep, p_process_io_info_del) in print_process_io_info() -- the code that deletes a process_io_info -- were causing the hang. I commented them out and built a new kernel; it still hung.

Speechless again. But combined with the earlier tests, the conclusion was a kernel memory overwrite. It had already been verified that commenting out p_process_io_info_tmp->max_idc_time_rq = req in blk_account_io_done() makes the hang go away. So p_process_io_info_tmp->max_idc_time_rq = req must be writing out of bounds into the process_lock spin lock variable and corrupting it, after which later acquisitions fail and spin forever. How to verify that guess? Print the value of the spin lock variable!

3.3 Printing the spin lock variable to confirm the memory overwrite

I had never used this method before and didn't even know whether a spin lock variable can simply be printed. It turns out it can! Look at its definition:

typedef struct spinlock {
    union {
        struct raw_spinlock rlock;
    }
}
struct raw_spinlock {
    arch_spinlock_t raw_lock;
}
crash> struct arch_spinlock_t
typedef struct qspinlock {
    union {
        atomic_t val;
        struct {
            u8 locked;
            u8 pending;
        };
        struct {
            u16 locked_pending;
            u16 tail;
        };
    };
} arch_spinlock_t;

Right: just print the val member of the spin lock's raw_spinlock.arch_spinlock_t. I also added an atomic_t lock_count member to struct process_io_control; it is incremented and decremented together with process_lock, to prove that my code contains no unbalanced calls that could corrupt the process_lock variable and cause the deadlock.

Then debug output was added to blk_mq_sched_request_inserted, blk_mq_start_request, blk_account_io_done and print_process_io_info:

void blk_mq_sched_request_inserted(struct request *rq)
{
        .........................
        if(rq->rq_disk && rq->rq_disk->process_io.enable){
                struct process_rq_stat *p_process_rq_stat_tmp = NULL;
                struct process_io_info *p_process_io_info_tmp = NULL;
                int find = 0;
                //print the process_io.lock_count atomic and the process_io.process_lock.rlock.raw_lock.val atomic
                printk("%s %s %d process_io_count:%d lock_count:%d spin_lock:%d  1\n",__func__,current->comm,current->pid,process_io_count,atomic_read(&(rq->rq_disk->process_io.lock_count)),atomic_read(&(rq->rq_disk->process_io.process_lock.rlock.raw_lock.val)));
                spin_lock_irq(&(rq->rq_disk->process_io.process_lock));
                atomic_inc(&(rq->rq_disk->process_io.lock_count));
                list_for_each_entry(p_process_io_info_tmp, &(rq->rq_disk->process_io.process_io_control_head), process_io_info_list){
                        if(p_process_io_info_tmp->pid == current->pid){
                              p_process_io_info_tmp->rq_count ++;
                              find = 1;
                        }
                }
                spin_unlock_irq(&(rq->rq_disk->process_io.process_lock));

                if(0 == find){
                    ....................
                    p_process_io_info_tmp = kmem_cache_alloc(rq->rq_disk->process_io.process_io_info_cachep,GFP_ATOMIC);
                    ....................
                }
                p_process_rq_stat_tmp = kmem_cache_alloc(rq->rq_disk->process_io.process_rq_stat_cachep,GFP_ATOMIC);
                ....................
                p_process_io_info_tmp->pid = current->pid;
                strncpy(p_process_io_info_tmp->comm,current->comm,COMM_LEN-1);

                p_process_rq_stat_tmp->p_process_io_info = p_process_io_info_tmp;
                p_process_rq_stat_tmp->rq_inset_time = ktime_to_us(ktime_get());
                rq->p_process_rq_stat = p_process_rq_stat_tmp;
                printk("%s %s %d process_io_count:%d 3\n",__func__,current->comm,current->pid,process_io_count);
                return;
                ...................
        }

}
void print_process_io_info(struct process_io_control *p_process_io_tmp)
{
        struct process_io_info *p_process_io_info_tmp = NULL;
        struct process_io_info *p_process_io_info_del = NULL;

        //print the process_io.lock_count atomic and the process_io.process_lock.rlock.raw_lock.val atomic
        printk("%s %s %d process_io_count:%d lock_count:%d spin_lock:%d in\n",__func__,current->comm,current->pid,process_io_count,atomic_read(&(p_process_io_tmp->lock_count)),atomic_read(&(p_process_io_tmp->process_lock.rlock.raw_lock.val)));

        spin_lock_irq(&(p_process_io_tmp->process_lock));
        atomic_inc(&(p_process_io_tmp->lock_count));
        list_for_each_entry(p_process_io_info_tmp, &(p_process_io_tmp->process_io_control_head), process_io_info_list){
            ..........................
        }
        ..........................
        spin_unlock_irq(&(p_process_io_tmp->process_lock));
        atomic_dec(&(p_process_io_tmp->lock_count));
        printk("%s %s %d process_io_count:%d out\n",__func__,current->comm,current->pid,process_io_count);
}
void blk_mq_start_request(struct request *rq)
{
    if(rq->rq_disk && rq->rq_disk->process_io.enable && rq->p_process_rq_stat){
        struct process_rq_stat *p_process_rq_stat_tmp = rq->p_process_rq_stat;
        struct process_io_info *p_process_io_info_tmp = rq->p_process_rq_stat->p_process_io_info;

        printk("%s %s %d\n",__func__,current->comm,current->pid);
        p_process_rq_stat_tmp->rq_issue_time = ktime_to_us(ktime_get());
        ..........................
    }
}
void blk_account_io_done(struct request *req, u64 now)
{
    if(req->rq_disk && req->rq_disk->process_io.enable && req->p_process_rq_stat){
            struct process_rq_stat *p_process_rq_stat_tmp = req->p_process_rq_stat;
            struct process_io_info *p_process_io_info_tmp = req->p_process_rq_stat->p_process_io_info;
            ..........................
            if(p_process_rq_stat_tmp->idc_time > p_process_io_info_tmp->max_idc_time){
                    p_process_io_info_tmp->max_idc_time = p_process_rq_stat_tmp->idc_time;
                    //print the process_io.process_lock.rlock.raw_lock.val atomic
                    printk("%s spin_lock:%d 1\n",__func__,atomic_read(&(req->rq_disk->process_io.process_lock.rlock.raw_lock.val)));
                    p_process_io_info_tmp->max_idc_time_rq = req;
                    //print the process_io.process_lock.rlock.raw_lock.val atomic again
                    printk("%s spin_lock:%d 2\n",__func__,atomic_read(&(req->rq_disk->process_io.process_lock.rlock.raw_lock.val)));
            }
            ..........................
            kmem_cache_free(req->rq_disk->process_io.process_rq_stat_cachep, p_process_rq_stat_tmp);
            req->p_process_rq_stat = NULL;
            //print the process_io.process_lock.rlock.raw_lock.val atomic one more time
            printk("%s spin_lock:%d 3\n",__func__,atomic_read(&(req->rq_disk->process_io.process_lock.rlock.raw_lock.val)));
    }
}

With the debug output in place I built the new kernel and rebooted. echo 1 > /sys/block/sda/process_rq_stat soon deadlocked the kernel again and produced a crash dump; the key lines:

[  315.237578] blk_mq_sched_request_inserted systemd-journal 631 process_io_count:1 lock_count:0 spin_lock:0  1
[  315.237579] blk_mq_sched_request_inserted systemd-journal 631 process_io_count:1 2
[  315.237580] blk_mq_sched_request_inserted systemd-journal 631 process_io_count:1 3
[  315.237587] blk_mq_start_request kworker/0:1H 488
[  315.237768] blk_account_io_done spin_lock:0 1
//sure enough, right after the p_process_io_info_tmp->max_idc_time_rq = req assignment in blk_account_io_done(), the spin lock variable jumped!
[  315.237769] blk_account_io_done spin_lock:-26476 2
[  315.237769] blk_account_io_done spin_lock:-26476 3
[  315.237798] blk_mq_sched_request_inserted systemd-journal 631 process_io_count:1 lock_count:0 spin_lock:-26476  1
[  316.287974] print_process_io_info process_rq_stat 3257 process_io_count:1 lock_count:0 spin_lock:-26476 in
[  344.423606] watchdog: BUG: soft lockup - CPU#2 stuck for 22s! [kworker/u256:1:34]
...............
[  344.423669] Call Trace:
[  344.423674]  _raw_spin_lock+0x1c/0x20
[  344.423676]  dd_bio_merge+0x43/0x90
[  344.423679]  blk_mq_make_request+0xe0/0x5d0
[  344.423681]  generic_make_request+0xcf/0x310
[  344.423683]  submit_bio+0x3c/0x160
[  344.423708]  ? xfs_prepare_ioend+0xc2/0xd0 [xfs]
There it was: the key log line showing the spin lock variable being corrupted. But I had an even better idea: print the req pointer that p_process_io_info_tmp->max_idc_time_rq = req assigns; it might reveal something interesting. So the first printk in blk_account_io_done, printk("%s spin_lock:%d 1\n",__func__,atomic_read(&(req->rq_disk->process_io.process_lock.rlock.raw_lock.val))), was changed to printk("%s req:0x%llx  spin_lock:%d 1\n",__func__,(u64)req,atomic_read(&(req->rq_disk->process_io.process_lock.rlock.raw_lock.val))).

I built the new kernel again and triggered the deadlock once more; the new crash log:

blk_account_io_done req:0xffff95cfdce87810  spin_lock:0 1
blk_account_io_done spin_lock:-27185 2
blk_account_io_done spin_lock:-27185 3

So the req pointer is 0xffff95cfdce87810 and the spin lock value is -27185. Printed in hex, -27185 is 0xffff95cf, which is exactly the upper 32 bits of the 64-bit value req:0xffff95cfdce87810 (0xffff95cf dce87810). That nails it 100%: p_process_io_info_tmp->max_idc_time_rq = req really does write out of bounds into the spin lock variable -- specifically into the spinlock_t process_lock member of the process_io_control structure.
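A quick sanity check of that arithmetic (a userspace snippet of my own, not kernel code; the two constants are simply copied from the crash log above):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* values copied from the crash log above */
        uint64_t req = 0xffff95cfdce87810ULL;
        int32_t spin_lock_val = -27185;

        /* the spinlock's atomic val, reinterpreted as an unsigned 32-bit word */
        printf("spin_lock val in hex : 0x%x\n", (uint32_t)spin_lock_val);   /* 0xffff95cf */
        /* the upper 32 bits of the req pointer that was written over it */
        printf("upper 32 bits of req : 0x%x\n", (uint32_t)(req >> 32));     /* 0xffff95cf */
        return 0;
}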

Continuing to debug: I still could not understand how the assignment p_process_io_info_tmp->max_idc_time_rq = req could possibly land on the spinlock_t process_lock member of process_io_control. Time for more debug output.

The first printk in blk_mq_sched_request_inserted was changed to: printk("%s %s %d process_io_count:%d lock_count:%d spin_lock:%d enable:%d  1\n",__func__,current->comm,current->pid,process_io_count,atomic_read(&(rq->rq_disk->process_io.lock_count)),atomic_read(&(rq->rq_disk->process_io.process_lock.rlock.raw_lock.val)),rq->rq_disk->process_io.enable)

The third printk in blk_mq_sched_request_inserted was changed to: printk("%s %s %d process_io_count:%d process_io:0x%llx  rq:0x%llx rq->p_process_rq_stat:0x%llx process_io_info:0x%llx 3\n",__func__,current->comm,current->pid,process_io_count,(u64)(&rq->rq_disk->process_io),(u64)rq,(u64)rq->p_process_rq_stat,(u64)p_process_rq_stat_tmp->p_process_io_info)

The first printk in blk_account_io_done was changed to: printk("%s req:0x%llx process_rq_stat:0x%llx process_io_info:0x%llx max_idc_time_rq:0x%llx   spin_lock:%d 1\n",__func__,(u64)req,(u64)req->p_process_rq_stat,(u64)req->p_process_rq_stat->p_process_io_info,(u64)(&(p_process_io_info_tmp->max_idc_time_rq)),atomic_read(&(req->rq_disk->process_io.process_lock.rlock.raw_lock.val)))

The goal was to print every pointer involved in the IO: req, process_rq_stat, process_io and process_io_info. I built the new kernel, rebooted, and ran echo 1 > /sys/block/sda/process_rq_stat; it soon hung again and produced a crash dump:

[ 2255.500850] blk_account_io_done spin_lock:0 2
[ 2255.500850] blk_account_io_done spin_lock:0 3
[ 2255.501102] blk_mq_sched_request_inserted systemd-journal 632 process_io_count:3 lock_count:0 spin_lock:0 enable:1  1
[ 2255.501103] blk_mq_sched_request_inserted systemd-journal 632 process_io_count:3 2
//surprisingly, the process_io_info pointer here is already 0xffff928cb38f1800
[ 2255.501104] blk_mq_sched_request_inserted systemd-journal 632 process_io_count:3 process_io:0xffff928cb38f1838  rq:0xffff928cb0add990 rq->p_process_rq_stat:0xffff928cb0089390 process_io_info:0xffff928cb38f1800 3
[ 2255.501107] blk_mq_start_request systemd-journal 632
//the process_io_info pointer 0xffff928cb38f1800 is only 0x38 away from the process_io pointer 0xffff928cb38f1838, and the address of process_io_info's max_idc_time_rq member is exactly the process_io address
[ 2255.501686] blk_account_io_done req:0xffff928cb0add990 process_rq_stat:0xffff928cb0089390 process_io_info:0xffff928cb38f1800 max_idc_time_rq:0xffff928cb38f1838   spin_lock:0 1
[ 2255.501686] blk_account_io_done spin_lock:-28020 2
[ 2255.501687] blk_account_io_done spin_lock:-28020 3
[ 2256.511605] print_process_io_info process_rq_stat 6627 process_io_count:3 lock_count:0 spin_lock:-28020 in
//not only was process_io's spin_lock corrupted, its enable member was corrupted too; both are obviously related to req:0xffff928cb0add990
[ 2267.191646] blk_mq_sched_request_inserted kworker/u256:0 8 process_io_count:3 lock_count:0 spin_lock:-28020 enable:-1330783856  1
[ 2292.484150] watchdog: BUG: soft lockup - CPU#0 stuck for 23s! [kworker/0:0:6455]

So, according to these tests, in blk_account_io_done's p_process_io_info_tmp->max_idc_time_rq = req (where p_process_io_info_tmp is req->p_process_rq_stat->p_process_io_info), the printed req->p_process_rq_stat->p_process_io_info pointer differs from the process_io_control pointer by exactly 0x38. In other words, req->p_process_rq_stat->p_process_io_info points right up against the process_io_control structure (process_io_control is 0x38 bytes), and the address of its max_idc_time_rq member is precisely the base address of process_io_control. No wonder the assignment p_process_io_info_tmp->max_idc_time_rq = req in blk_account_io_done overwrites the enable member and the process_lock spin lock at the start of process_io_control.

The process_io_control structure is embedded in struct gendisk -- effectively a single global instance -- while the process_io_info that req->p_process_rq_stat->p_process_io_info points to is dynamically allocated with kmem_cache_alloc. Normally their addresses should be far apart. Why do the two overlap?

req->p_process_rq_stat->p_process_io_info is assigned in blk_mq_sched_request_inserted(). Look at the last printk from blk_mq_sched_request_inserted: the req->p_process_rq_stat->p_process_io_info it printed is already 0xffff928cb38f1800! By that point req->p_process_rq_stat->p_process_io_info is already bogus and already points at the process_io_control structure.

Reviewing the blk_mq_sched_request_inserted() code at this point revealed a problem:

void blk_mq_sched_request_inserted(struct request *rq)
{
    ..................
    list_for_each_entry(p_process_io_info_tmp, &(rq->rq_disk->process_io.process_io_control_head), process_io_info_list){
            if(p_process_io_info_tmp->pid == current->pid){
                  p_process_io_info_tmp->rq_count ++;
                  find = 1;
            }
    }
    ..................
}

This list_for_each_entry() walks every process's process_io_info on the process_io.process_io_control_head list and only stops after it has gone past the tail; in other words, when list_for_each_entry() exits, p_process_io_info_tmp ends up at the final position of the traversal (at the time I assumed that simply meant the last member of the list). That is not what I intended. If the current process inserts an IO request and a process_io_info whose pid equals the current process's pid is found on process_io.process_io_control_head -- i.e. if(p_process_io_info_tmp->pid == current->pid) holds -- then the process already allocated a process_io_info during an earlier IO and it is already on the list, so no new process_io_info needs to be allocated.

When if(p_process_io_info_tmp->pid == current->pid) holds, the loop should break; there is no reason to keep walking the list. But here is the puzzle: even if this code is badly designed and p_process_io_info_tmp always ends up as the last process_io_info on the process_io.process_io_control_head list, how could the assignments p_process_rq_stat_tmp->p_process_io_info = p_process_io_info_tmp and rq->p_process_rq_stat = p_process_rq_stat_tmp leave p_process_rq_stat_tmp->p_process_io_info pointing at an address inside the process_io_control structure?

As a test I added a break at the end of the if(p_process_io_info_tmp->pid == current->pid) branch, rebuilt the kernel, rebooted, and ran echo 1 > /sys/block/sda/process_rq_stat; no amount of testing could make it hang any more. The root cause clearly lies right here, in the list_for_each_entry(p_process_io_info_tmp, &(rq->rq_disk->process_io.process_io_control_head), process_io_info_list) loop. Time to read the list_for_each_entry source!

#define list_for_each_entry(pos, head, member)              \
    for (pos = list_first_entry(head, typeof(*pos), member);    \
         &pos->member != (head);                    \
         pos = list_next_entry(pos, member))

Reading the list_for_each_entry source raised a doubt. The loop does not terminate by checking whether the last member's next pointer equals the head while pos still points at that last member -- I had that wrong. What actually happens is that once pos points at the last list member and pos = list_next_entry(pos, member) runs one more time, pos no longer points at the last member at all; it points at the process_io_control structure that contains the head. Only then is &pos->member != (head) evaluated: an offset is computed inside the process_io_control structure and compared against the head address.

In other words, when list_for_each_entry() runs to completion, p_process_io_info_tmp ends up pointing at the list head -- that is, into the process_io_control structure -- and not at a process_io_info. No wonder subsequent writes through p_process_io_info_tmp's members corrupt the process_io_control structure. The problem does not show up every time, though: whenever if(p_process_io_info_tmp->pid == current->pid) never matches, a fresh process_io_info is allocated right below and assigned to p_process_io_info_tmp, and then everything is fine.
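To make that concrete, here is a minimal userspace sketch (my own illustration, not the author's code) with a stripped-down list_head/container_of and two simplified structs; the exact offsets differ from the kernel layout, but it shows exactly where pos lands after a full traversal:

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))
#define list_entry(ptr, type, member) container_of(ptr, type, member)
#define list_first_entry(head, type, member) list_entry((head)->next, type, member)
#define list_next_entry(pos, member) \
        list_entry((pos)->member.next, typeof(*(pos)), member)
#define list_for_each_entry(pos, head, member)                          \
        for (pos = list_first_entry(head, typeof(*pos), member);        \
             &pos->member != (head);                                    \
             pos = list_next_entry(pos, member))

/* stripped-down stand-ins for the two kernel structs */
struct process_io_info {
        int pid;
        struct list_head process_io_info_list;
};
struct process_io_control {
        int enable;
        struct list_head process_io_control_head;
};

int main(void)
{
        struct process_io_control ctl = { .enable = 1 };
        struct process_io_info a = { .pid = 100 }, b = { .pid = 200 };
        struct process_io_info *pos;

        /* hand-rolled INIT_LIST_HEAD + list_add: head -> a -> b -> head */
        ctl.process_io_control_head.next = &a.process_io_info_list;
        ctl.process_io_control_head.prev = &b.process_io_info_list;
        a.process_io_info_list.next = &b.process_io_info_list;
        a.process_io_info_list.prev = &ctl.process_io_control_head;
        b.process_io_info_list.next = &ctl.process_io_control_head;
        b.process_io_info_list.prev = &a.process_io_info_list;

        /* pid 999 never matches, so the loop always runs off the end,
         * just like the missing-break case in blk_mq_sched_request_inserted() */
        list_for_each_entry(pos, &ctl.process_io_control_head, process_io_info_list) {
                if (pos->pid == 999)
                        break;
        }

        /* pos is NOT the last real entry; it is the head re-interpreted as an
         * entry, i.e. it points into struct process_io_control. Writing through
         * it corrupts ctl, which is exactly how process_lock got overwritten. */
        printf("&ctl             = %p\n", (void *)&ctl);
        printf("&b (last entry)  = %p\n", (void *)&b);
        printf("pos after loop   = %p\n", (void *)pos);
        return 0;
}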

3.4 Root cause of the spin lock anomaly

Here is a walkthrough of how p_process_io_info_tmp ends up pointing at the process_io_control structure. In the normal case, pos = list_next_entry(pos, member) works like this: pos->process_io_info_list.next points at the struct list_head process_io_info_list of the next process_io_info; taking that process_io_info_list member's address as the base and subtracting 0x40 moves the address back to the start of that struct process_io_info, and that is the address returned:

crash> struct process_io_info -o -x
struct process_io_info {
   [0x0] int pid;
   [0x4] char comm[16];
  [0x14] int rq_count;
  [0x18] int rq_empty_count;
  [0x1c] u32 max_id_time;
  [0x20] u32 max_dc_time;
  [0x24] u32 max_idc_time;
  [0x28] struct request *max_id_time_rq;
  [0x30] struct request *max_dc_time_rq;
  [0x38] struct request *max_idc_time_rq;
  [0x40] struct list_head process_io_info_list;
}
SIZE: 0x50

-(0x40-0x8) + 0x38 = 0
crash> struct process_io_control -o -x
[-0x38] .........
struct process_io_control {
   [0x0] int enable;
   [0x4] spinlock_t process_lock;
   [0x8] struct list_head process_io_control_head;
  [0x18] struct task_struct *kernel_thread;
  [0x20] struct kmem_cache *process_rq_stat_cachep;
  [0x28] struct kmem_cache *process_io_info_cachep;
  [0x30] atomic_t lock_count;
}
SIZE: 0x38
#define list_for_each_entry(pos, head, member)              \
    for (pos = list_first_entry(head, typeof(*pos), member);    \
         &pos->member != (head);                    \
         pos = list_next_entry(pos, member))

When pos is the last list member, pos = list_next_entry(pos, member) does the following: pos->process_io_info_list.next points at the list head, i.e. at the struct list_head process_io_control_head member of process_io_control. Taking that process_io_control_head address as the base and subtracting 0x40 -- while process_io_control_head sits at offset 0x8 inside struct process_io_control -- leaves pos (that is, p_process_io_info_tmp) pointing 0x40-0x8 = 0x38 bytes before the process_io_control structure. Then the list_for_each_entry loop ends.

Now consider p_process_io_info_tmp->max_idc_time_rq: it is essentially p_process_io_info_tmp plus an offset of 0x38. Since p_process_io_info_tmp points at process_io_control minus 0x38, the address of p_process_io_info_tmp->max_idc_time_rq is -0x38 + 0x38 = 0 relative to process_io_control, i.e. exactly the base address of struct process_io_control. So p_process_io_info_tmp->max_idc_time_rq aliases the start of struct process_io_control, and assigning p_process_io_info_tmp->max_idc_time_rq = req overwrites the first 8 bytes of struct process_io_control -- the enable member and the process_lock spin lock.
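The same arithmetic can be checked in userspace with offsetof, using stand-in types that reproduce the layout crash printed above (an illustration under that layout assumption, not kernel code):

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

/* userspace stand-ins mirroring the crash output above:
 * u32 -> unsigned int, spinlock_t/atomic_t -> int, struct pointers -> void* */
struct process_io_info {
        int pid;
        char comm[16];
        int rq_count;
        int rq_empty_count;
        unsigned int max_id_time;
        unsigned int max_dc_time;
        unsigned int max_idc_time;
        void *max_id_time_rq;
        void *max_dc_time_rq;
        void *max_idc_time_rq;
        struct list_head process_io_info_list;
};

struct process_io_control {
        int enable;
        int process_lock;
        struct list_head process_io_control_head;
        void *kernel_thread;
        void *process_rq_stat_cachep;
        void *process_io_info_cachep;
        int lock_count;
};

int main(void)
{
        long off_list = offsetof(struct process_io_info, process_io_info_list);        /* 0x40 */
        long off_rq   = offsetof(struct process_io_info, max_idc_time_rq);             /* 0x38 */
        long off_head = offsetof(struct process_io_control, process_io_control_head);  /* 0x08 */

        /* where pos lands after the loop, relative to the process_io_control base */
        long pos_off   = off_head - off_list;     /* 0x8 - 0x40 = -0x38 */
        /* where pos->max_idc_time_rq lands: exactly the process_io_control base */
        long write_off = pos_off + off_rq;        /* -0x38 + 0x38 = 0    */

        printf("offsetof(process_io_info, process_io_info_list)       = %#lx\n", off_list);
        printf("offsetof(process_io_info, max_idc_time_rq)            = %#lx\n", off_rq);
        printf("offsetof(process_io_control, process_io_control_head) = %#lx\n", off_head);
        printf("pos lands at process_io_control %+ld bytes\n", pos_off);
        printf("pos->max_idc_time_rq lands at process_io_control %+ld bytes\n", write_off);
        return 0;
}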

When I first chased this problem I had too many ideas and never reviewed the code carefully. In my memory, the list_for_each_entry() block in blk_mq_sched_request_inserted() had a break after if(p_process_io_info_tmp->pid == current->pid) matched. I later read the function many times and still never noticed the missing break -- preconceptions are poison. This happens all the time: the code is plainly wrong, but because of a fixed impression the brain keeps insisting it is right. Don't trust your own head 100%; sleep on it and re-read the code, and you may see something new.

The break after if(p_process_io_info_tmp->pid == current->pid) did exist originally, but I removed it temporarily to test the list traversal and then forgot to put it back. Because of other work, this code was written and tested on and off over a week; the train of thought kept getting interrupted.

4: Yet another kernel lockup

No sooner had that problem been solved than the next one arrived: I retested, full of joy, and it locked up again!

[  125.044681] blk_mq_sched_request_inserted make 8332 process_io_count:336 1
[  125.044684] blk_mq_sched_request_inserted make 8332 process_io_count:336 2
[  125.044685] blk_mq_sched_request_inserted make 8332 process_io_count:336 3
......................
//the make process with pid 8316 is stuck now: it never printed "blk_mq_sched_request_inserted make 8316 process_io_count:336 3", so it is clearly hung
[  125.061609] blk_mq_sched_request_inserted make 8316 process_io_count:336 2
[  148.553926] watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [make:8332]
.............
[  148.553956] CPU: 0 PID: 8332 Comm: make Kdump: loaded Tainted: G                 ---------r-  - 4.18.0 #43
[  148.553957] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 02/27/2020
[  148.553960] RIP: 0010:native_queued_spin_lock_slowpath+0x20/0x1d0
.............
[  148.553984] Call Trace:
[  148.553988]  _raw_spin_lock+0x1c/0x20
[  148.553991]  dd_bio_merge+0x43/0x90
[  148.553994]  blk_mq_make_request+0xe0/0x5d0
[  148.553997]  ? iomap_ioend_try_merge+0xe0/0xe0
[  148.553999]  generic_make_request+0xcf/0x310
[  148.554000]  submit_bio+0x3c/0x160
.............
crash> bt -a
PID: 8332   TASK: ffff9d1d43f9c740  CPU: 0   COMMAND: "make"
#0 [ffff9d1d9a003d48] machine_kexec at ffffffff8225bf3e
#1 [ffff9d1d9a003da0] __crash_kexec at ffffffff8236071d
#2 [ffff9d1d9a003e68] panic at ffffffff822b5dc7
#3 [ffff9d1d9a003ee8] watchdog_timer_fn.cold.8 at ffffffff82396db7
#4 [ffff9d1d9a003f18] __hrtimer_run_queues at ffffffff82340890
.............
#9 [ffffb725c34779e0] _raw_spin_lock at ffffffff82acde1c
#10 [ffffb725c34779e8] dd_bio_merge at ffffffff82625a93
#11 [ffffb725c3477a18] blk_mq_make_request at ffffffff8260a380
#12 [ffffb725c3477aa8] generic_make_request at ffffffff825fe92f
#13 [ffffb725c3477b00] submit_bio at ffffffff825febac
.............

PID: 8081   TASK: ffff9d1d43c24740  CPU: 1   COMMAND: "make"
#0 [fffffe0000032e50] crash_nmi_callback at ffffffff8224eee3
#1 [fffffe0000032e58] nmi_handle at ffffffff82223703
#2 [fffffe0000032eb0] default_do_nmi at ffffffff82223ade
#3 [fffffe0000032ed0] do_nmi at ffffffff82223cb8
.............
#5 [ffffb725c31b79e0] native_queued_spin_lock_slowpath at ffffffff82311180
#6 [ffffb725c31b79e0] _raw_spin_lock at ffffffff82acde1c
#7 [ffffb725c31b79e8] dd_bio_merge at ffffffff82625a93
#8 [ffffb725c31b7a18] blk_mq_make_request at ffffffff8260a380
#9 [ffffb725c31b7aa8] generic_make_request at ffffffff825fe92f
#10 [ffffb725c31b7b00] submit_bio at ffffffff825febac
.............
PID: 8326   TASK: ffff9d1d35f297c0  CPU: 2   COMMAND: "make"
#0 [fffffe000005de50] crash_nmi_callback at ffffffff8224eee3
#1 [fffffe000005de58] nmi_handle at ffffffff82223703
#2 [fffffe000005deb0] default_do_nmi at ffffffff82223ade
#3 [fffffe000005ded0] do_nmi at ffffffff82223cb8
.............
#5 [ffffb725c34279e0] native_queued_spin_lock_slowpath at ffffffff82311180
#6 [ffffb725c34279e0] _raw_spin_lock at ffffffff82acde1c
#7 [ffffb725c34279e8] dd_bio_merge at ffffffff82625a93
#8 [ffffb725c3427a18] blk_mq_make_request at ffffffff8260a380
#9 [ffffb725c3427aa8] generic_make_request at ffffffff825fe92f
#10 [ffffb725c3427b00] submit_bio at ffffffff825febac
.............
PID: 8306   TASK: ffff9d1d43f60000  CPU: 3   COMMAND: "make"
.............
#5 [ffffb725c34379e0] native_queued_spin_lock_slowpath at ffffffff82311180
#6 [ffffb725c34379e0] _raw_spin_lock at ffffffff82acde1c
#7 [ffffb725c34379e8] dd_bio_merge at ffffffff82625a93
#8 [ffffb725c3437a18] blk_mq_make_request at ffffffff8260a380
#9 [ffffb725c3437aa8] generic_make_request at ffffffff825fe92f
#10 [ffffb725c3437b00] submit_bio at ffffffff825febac
No abnormal task is visible here, but the make process with pid 8316 is clearly stuck
crash> bt 8316
PID: 8316   TASK: ffff9d1d569c5f00  CPU: 1   COMMAND: "make"
#0 [ffffb725c3457930] __schedule at ffffffff82ac8826
#1 [ffffb725c34579c8] preempt_schedule_common at ffffffff82ac8d6a
#2 [ffffb725c34579d0] _cond_resched at ffffffff82ac8d9d
#3 [ffffb725c34579d8] kmem_cache_alloc at ffffffff824a6a07
#4 [ffffb725c3457a10] blk_mq_sched_request_inserted.cold.14 at ffffffff8260fd3d
#5 [ffffb725c3457a48] dd_insert_requests at ffffffff82625b7e
#6 [ffffb725c3457a90] blk_mq_sched_insert_requests at ffffffff8260f845
#7 [ffffb725c3457ac8] blk_mq_flush_plug_list at ffffffff8260ac96
#8 [ffffb725c3457b48] blk_flush_plug_list at ffffffff825ffb57
#9 [ffffb725c3457b98] blk_finish_plug at ffffffff825ffba1
.............

So: after dd_insert_requests->spin_lock(&dd->lock) succeeds, blk_mq_sched_request_inserted->kmem_cache_alloc->_cond_resched->preempt_schedule_common->__schedule puts the task to sleep, and the other processes calling dd_insert_requests->spin_lock(&dd->lock)->_raw_spin_lock fail to get the lock and spin forever in native_queued_spin_lock_slowpath. What still puzzles me is the duration: can a task that went to sleep via dd_insert_requests->spin_lock(&dd->lock) followed by blk_mq_sched_request_inserted->kmem_cache_alloc->_cond_resched->preempt_schedule_common->__schedule really stay asleep for 22 s? That is an awfully long time!

The fix is simple: change kmem_cache_alloc(......,GFP_KERNEL) to kmem_cache_alloc(......,GFP_ATOMIC) so the allocation can never sleep. A few takeaways:

1: bt -a only prints the backtraces of tasks in D or R state, not S state -- and the problematic task may well be in S state.

2: kmem_cache_alloc() (with GFP_KERNEL) can sleep, so it must not be used inside a spin_lock-protected region; allocation functions such as kmem_cache_alloc and get_free_pages are off-limits there.

3: dd_insert_requests takes spin_lock(&dd->lock) first and then calls dd_insert_request->blk_mq_sched_request_inserted, so the kmem_cache_alloc() sleep inside blk_mq_sched_request_inserted amounts to sleeping while holding spin_lock(&dd->lock). It is not enough to know that your own code must not sleep after spin_lock; you also have to know whether the callers above you already hold a spin lock.

5: The rq_count anomaly

In blk_mq_sched_request_inserted() I increment p_process_io_info_tmp->rq_count each time a process inserts an IO request, and decrement it in blk_account_io_done() when the request completes. rq_count should therefore always be greater than or equal to 0, yet in testing p_process_io_info_tmp->rq_count actually turned out to be -1!

void blk_mq_sched_request_inserted(struct request *rq)
{
    .............
    if(rq->rq_disk && rq->rq_disk->process_io.enable){
            struct process_rq_stat *p_process_rq_stat_tmp = NULL;
            struct process_io_info *p_process_io_info_tmp = NULL;
            int find = 0;

            spin_lock_irq(&(rq->rq_disk->process_io.process_lock));
            //first walk the process_io_control_head list to see whether the current process already has a process_io_info
            list_for_each_entry(p_process_io_info_tmp, &(rq->rq_disk->process_io.process_io_control_head), process_io_info_list){
                    if(p_process_io_info_tmp->pid == current->pid){
                          p_process_io_info_tmp->rq_count ++;
                          find = 1;
                          break;
                    }
            }
            spin_unlock_irq(&(rq->rq_disk->process_io.process_lock));

            if(0 == find){
                    //not found: allocate a new process_io_info for the current process
                    p_process_io_info_tmp = kmem_cache_alloc(rq->rq_disk->process_io.process_io_info_cachep,GFP_KERNEL);
                    ....................
                    spin_lock_irq(&(rq->rq_disk->process_io.process_lock));
                    p_process_io_info_tmp->rq_count ++;
                    //add the process_io_info to the process_io.process_io_control_head list
                    list_add(&p_process_io_info_tmp->process_io_info_list,&(rq->rq_disk->process_io.process_io_control_head));
                    spin_unlock_irq(&(rq->rq_disk->process_io.process_lock));
            }
            ....................
    }
}
void blk_account_io_done(struct request *req, u64 now)
{
    ....................
    if(req->rq_disk && req->rq_disk->process_io.enable && req->p_process_rq_stat){
            struct process_rq_stat *p_process_rq_stat_tmp = req->p_process_rq_stat;
            struct process_io_info *p_process_io_info_tmp = req->p_process_rq_stat->p_process_io_info;
            .................
            p_process_io_info_tmp->rq_count --;
            if(p_process_io_info_tmp->rq_count < 0){
                    printk("%s error:%d\n",__func__,p_process_io_info_tmp->rq_count);
            }
            .................
    }
}

The cause: while a process is running blk_mq_sched_request_inserted to insert an IO request and incrementing p_process_io_info_tmp->rq_count, one of that same process's IO requests can complete at the same moment and run blk_account_io_done, decrementing p_process_io_info_tmp->rq_count. The increment and the decrement race with each other -- no wonder the count goes wrong! The fix is simple: take the spin lock before the p_process_io_info_tmp->rq_count -- in blk_account_io_done(); see the sketch below.
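A hedged sketch of that fix (my own reconstruction, not the author's final code); because blk_account_io_done() can run in interrupt context, the sketch uses spin_lock_irqsave/spin_unlock_irqrestore rather than spin_lock_irq:

void blk_account_io_done(struct request *req, u64 now)
{
    ....................
    if(req->rq_disk && req->rq_disk->process_io.enable && req->p_process_rq_stat){
            struct process_rq_stat *p_process_rq_stat_tmp = req->p_process_rq_stat;
            struct process_io_info *p_process_io_info_tmp = req->p_process_rq_stat->p_process_io_info;
            unsigned long flags;
            .................
            //serialize the decrement against the rq_count ++ done under process_lock
            //in blk_mq_sched_request_inserted()
            spin_lock_irqsave(&(req->rq_disk->process_io.process_lock), flags);
            p_process_io_info_tmp->rq_count --;
            if(p_process_io_info_tmp->rq_count < 0){
                    printk("%s error:%d\n",__func__,p_process_io_info_tmp->rq_count);
            }
            spin_unlock_irqrestore(&(req->rq_disk->process_io.process_lock), flags);
            .................
    }
}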

Why did I make this mistake? Because I hadn't considered that while a process is inserting an IO request via blk_mq_sched_request_inserted, one of that process's IO requests can complete via blk_account_io_done at the same time. I hadn't fully understood when the code I inserted into the kernel runs; I hadn't understood the context.

The big problems are finally solved; plenty of details remain to be polished.

6: Summary

This debugging experience left a deep impression. A chain of causes produced a whole series of mistakes, and it drove home how hard kernel development is and how many pitfalls there are. The main lessons:

1: I hadn't fully understood how list_for_each_entry() iterates a kernel list. I vaguely and wrongly believed that when the traversal ends, pos points at the last member of the list; in reality it points at the list head (more precisely, at the head's containing structure reinterpreted as an entry).

2: In the dd_insert_requests->spin_lock(&dd->lock) -> blk_mq_sched_request_inserted->kmem_cache_alloc path, I had not realized that blk_mq_sched_request_inserted's caller, dd_insert_requests, invokes it with spin_lock(&dd->lock) already held, so blk_mq_sched_request_inserted must not sleep at all. Inside blk_mq_sched_request_inserted I was careful not to call kmem_cache_alloc while holding spin_lock_irq(&(rq->rq_disk->process_io.process_lock)), but I never considered that the caller had already taken spin_lock(&dd->lock), which forbids sleeping for the whole function. I didn't know the calling context. And of course, printk is also best avoided while holding a spin lock, since printk may sleep.

3: I hadn't expected that while a process is inserting an IO request via blk_mq_sched_request_inserted, one of its IO requests can complete via blk_account_io_done at the same time, so p_process_io_info_tmp->rq_count gets incremented and decremented concurrently. Again: not understanding when the inserted code runs, not knowing the context.

Some other lessons:

1: About the break in the list_for_each_entry loop in blk_mq_sched_request_inserted: because I remembered adding a break, I kept subconsciously assuming it was there every time I read the code afterwards. When you hit a bizarre problem, learn to question everything and doubt every single line of code.

2: When facing a hard problem, don't rush to blame the hardware, flipped bits in memory, native kernel bugs, or the compiler. Suspect your own code first, and think about the context: softirq and hardirq, multithreading and high concurrency. While I had no clue on this one, I even tried moving struct process_io_control process_io inside struct gendisk from the last member to the first, and moved the definitions of struct process_io_control, struct process_io_info and struct process_rq_stat in include/linux/blk_types.h from the very top of the file to the middle. Why? Because I suspected a compiler problem was behind the spin lock corruption and the hang.