I recently set out to build a kernel feature that measures per-process IO latency. iostat and blktrace both fall short here: iostat only reports aggregate IO statistics for a whole block device, and the data blktrace produces is too noisy to work with. The idea is to hook the three key points in an IO request's life cycle — insertion into the IO queue, dispatch, and completion — by adding code to blk_mq_sched_request_inserted, blk_mq_start_request and blk_account_io_done, and to measure how long each process's IO requests spend in the IO queue and in the disk driver layer. The principle is simple enough, but during development and debugging I ran into baffling kernel hangs that, after a long chase, turned out to be caused by a corrupted spin lock and an out-of-bounds kernel memory write. Strange and exasperating.
The tests in this article were done on CentOS 8.3 with kernel 4.18.0-240.el8. The main topics are:
1: What to do when the kernel hangs mysteriously and does not reboot automatically? How do you get a crash dump?
2: Analyzing why a spin lock inexplicably ends up in native_queued_spin_lock_slowpath
3: A pitfall of iterating a kernel linked list with list_for_each_entry
1: The added source code
This section describes the code inserted into blk_mq_sched_request_inserted, blk_mq_start_request and blk_account_io_done, the key functions at the three points where an IO request is inserted into the IO queue, dispatched, and completed. You do not need to understand every detail; a rough picture is enough. The kernel hangs discussed later are closely tied to this code.
First, add the structures we need to include/linux/blk_types.h:
#define COMM_LEN 16
struct process_io_control{
    //enable flag
    int enable;
    spinlock_t process_lock;
    //each process's process_io_info is linked, via its process_io_info_list member, into this structure's process_io_control_head list
    struct list_head process_io_control_head;
    //kernel thread that periodically prints each process's IO latency information
    struct task_struct *kernel_thread;
    //slab cache for process_rq_stat structures
    struct kmem_cache *process_rq_stat_cachep;
    //slab cache for process_io_info structures
    struct kmem_cache *process_io_info_cachep;
};
struct process_io_info{
    int pid;//process PID
    char comm[COMM_LEN];//process name
    //number of IO requests the process currently has in flight
    int rq_count;
    //number of consecutive sampling periods in which the process had no IO requests
    int rq_empty_count;
    //longest time one of the process's IO requests stayed in the IO queue
    u32 max_id_time;
    //longest time one of the process's IO requests spent in the disk driver layer
    u32 max_dc_time;
    //longest total time from queue insertion to completion among the process's IO requests
    u32 max_idc_time;
    //the process's IO request that stayed longest in the IO queue
    struct request *max_id_time_rq;
    //the process's IO request that spent longest in the disk driver layer
    struct request *max_dc_time_rq;
    //the process's IO request with the longest total transfer time
    struct request *max_idc_time_rq;
    //linked, via this member, into the process_io_control_head list of process_io_control
    struct list_head process_io_info_list;
};
struct process_rq_stat{
    //the IO request this structure describes
    struct request *rq;
    //time the IO request was inserted into the IO queue
    u64 rq_inset_time;
    //time the IO request was dispatched
    u64 rq_issue_time;
    //time the IO request stayed in the IO queue
    u32 id_time;
    //time the IO request spent in the disk driver layer
    u32 dc_time;
    //total time from queue insertion to completion
    u32 idc_time;
    //the process_io_info structure this IO request belongs to
    struct process_io_info *p_process_io_info;
};
struct process_rq_stat represents one IO request; it mainly records the times at which the request was inserted into the IO queue, dispatched, and completed. struct process_io_info represents one process doing IO; it mainly records the request that stayed longest in the IO queue and that dwell time, and the request that spent longest in the disk driver layer and that time. struct process_io_control is the top-level control structure: the process_io_info bound to every process doing IO is added to its process_io_control_head list, and it also starts a kernel thread that, by default, prints each IO process's latency information once per second (mainly the maximum time a request spent in the IO queue and the maximum time a request spent in the disk driver).
How do these structures hook into the block layer? Take a look:
struct request {
    ............
    struct process_rq_stat *p_process_rq_stat;
    ............
}
struct gendisk {
    ...........
    struct process_io_control process_io;
    ...........
}
All that changes is that a struct process_rq_stat *p_process_rq_stat pointer is added to struct request, which represents an IO request, and a struct process_io_control process_io member is added to struct gendisk, which represents a block device.
The code inserted into blk_mq_sched_request_inserted, blk_mq_start_request and blk_account_io_done — the key functions at the three points where an IO request is inserted into the IO queue, dispatched, and completed — is as follows:
//executed when an IO request is inserted into the IO queue
void blk_mq_sched_request_inserted(struct request *rq)
{

    trace_block_rq_insert(rq->q, rq);

    if(rq->rq_disk && rq->rq_disk->process_io.enable){
        struct process_rq_stat *p_process_rq_stat_tmp = NULL;
        struct process_io_info *p_process_io_info_tmp = NULL;
        int find = 0;

        spin_lock_irq(&(rq->rq_disk->process_io.process_lock));
        //first walk the process_io_control_head list to see whether the current process already has a process_io_info
        list_for_each_entry(p_process_io_info_tmp, &(rq->rq_disk->process_io.process_io_control_head), process_io_info_list){
            if(p_process_io_info_tmp->pid == current->pid){
                p_process_io_info_tmp->rq_count ++;
                find = 1;
            }
        }
        spin_unlock_irq(&(rq->rq_disk->process_io.process_lock));

        if(0 == find){
            //not found, so allocate a new process_io_info for the current process
            p_process_io_info_tmp = kmem_cache_alloc(rq->rq_disk->process_io.process_io_info_cachep,GFP_KERNEL);
            if(!p_process_io_info_tmp)
                goto fail;

            memset(p_process_io_info_tmp,0,sizeof(struct process_io_info));
            spin_lock_irq(&(rq->rq_disk->process_io.process_lock));
            p_process_io_info_tmp->rq_count ++;
            //add the process_io_info to the process_io.process_io_control_head list
            list_add(&p_process_io_info_tmp->process_io_info_list,&(rq->rq_disk->process_io.process_io_control_head));
            spin_unlock_irq(&(rq->rq_disk->process_io.process_lock));
        }
        //allocate a process_rq_stat for the IO request being transferred
        p_process_rq_stat_tmp = kmem_cache_alloc(rq->rq_disk->process_io.process_rq_stat_cachep,GFP_KERNEL);
        if(!p_process_rq_stat_tmp)
            goto fail;
        memset(p_process_rq_stat_tmp,0,sizeof(struct process_rq_stat));
        //record the current process's PID and name in process_io_info
        p_process_io_info_tmp->pid = current->pid;
        strncpy(p_process_io_info_tmp->comm,current->comm,COMM_LEN-1);
        //point process_rq_stat_tmp->p_process_io_info at p_process_io_info
        p_process_rq_stat_tmp->p_process_io_info = p_process_io_info_tmp;
        //record the time the IO request was inserted into the IO queue
        p_process_rq_stat_tmp->rq_inset_time = ktime_to_us(ktime_get());
        //point the request's rq->p_process_rq_stat at the process_rq_stat
        rq->p_process_rq_stat = p_process_rq_stat_tmp;

        return;
fail:
        if(p_process_rq_stat_tmp)
            kmem_cache_free(rq->rq_disk->process_io.process_rq_stat_cachep, p_process_rq_stat_tmp);
        if(p_process_io_info_tmp)
            kmem_cache_free(rq->rq_disk->process_io.process_io_info_cachep, p_process_io_info_tmp);
    }
}

//executed when an IO request is dispatched
void blk_mq_start_request(struct request *rq)
{
    ....................
    if(rq->rq_disk && rq->rq_disk->process_io.enable && rq->p_process_rq_stat){
        struct process_rq_stat *p_process_rq_stat_tmp = rq->p_process_rq_stat;
        struct process_io_info *p_process_io_info_tmp = rq->p_process_rq_stat->p_process_io_info;
        //process_rq_stat_tmp->rq_issue_time records the time the IO request was dispatched
        p_process_rq_stat_tmp->rq_issue_time = ktime_to_us(ktime_get());
        //process_rq_stat_tmp->id_time records how long the IO request stayed in the IO queue
        p_process_rq_stat_tmp->id_time = p_process_rq_stat_tmp->rq_issue_time - p_process_rq_stat_tmp->rq_inset_time;
        if(p_process_rq_stat_tmp->id_time > p_process_io_info_tmp->max_id_time){
            //process_io_info_tmp->max_id_time records the longest time one of the process's IO requests stayed in the IO queue
            p_process_io_info_tmp->max_id_time = p_process_rq_stat_tmp->id_time;
            p_process_io_info_tmp->max_id_time_rq = rq;
        }
    }
}

//executed when an IO request completes
void blk_account_io_done(struct request *req, u64 now)
{
    ....................
    if(req->rq_disk && req->rq_disk->process_io.enable && req->p_process_rq_stat){
        struct process_rq_stat *p_process_rq_stat_tmp = req->p_process_rq_stat;
        struct process_io_info *p_process_io_info_tmp = req->p_process_rq_stat->p_process_io_info;
        //process_rq_stat_tmp->dc_time records the time the IO request spent in the disk driver
        p_process_rq_stat_tmp->dc_time = ktime_to_us(ktime_get()) - p_process_rq_stat_tmp->rq_issue_time;
        //process_rq_stat->idc_time records the total time from queue insertion to completion
        req->p_process_rq_stat->idc_time = p_process_rq_stat_tmp->dc_time + p_process_rq_stat_tmp->id_time;

        if(p_process_rq_stat_tmp->dc_time > p_process_io_info_tmp->max_dc_time){
            //process_io_info_tmp->max_dc_time records the process's longest time in the disk driver layer
            p_process_io_info_tmp->max_dc_time = p_process_rq_stat_tmp->dc_time;
            p_process_io_info_tmp->max_dc_time_rq = req;
        }

        if(p_process_rq_stat_tmp->idc_time > p_process_io_info_tmp->max_idc_time){
            //process_io_info_tmp->max_idc_time records the process's longest total time from insertion to completion
            p_process_io_info_tmp->max_idc_time = p_process_rq_stat_tmp->idc_time;
            p_process_io_info_tmp->max_idc_time_rq = req;
        }
        p_process_io_info_tmp->rq_count --;
        if(p_process_io_info_tmp->rq_count < 0){
            printk("%s error:%d\n",__func__,p_process_io_info_tmp->rq_count);
        }
        kmem_cache_free(req->rq_disk->process_io.process_rq_stat_cachep, p_process_rq_stat_tmp);
        req->p_process_rq_stat = NULL;
    }
}
In addition, the following code is added to block/genhd.c:
#include <linux/delay.h>
extern void print_process_io_info(struct process_io_control *p_process_io_tmp);
//extern void free_all_process_io_info(struct process_io_control *p_process_io_tmp);
static int process_rq_stat_thread(void *arg)
{
    struct process_io_control *p_process_io_tmp = (struct process_io_control *)arg;
    while (!kthread_should_stop()) {
        print_process_io_info(p_process_io_tmp);
        msleep(1000);
    }
    p_process_io_tmp->kernel_thread = NULL;
    return 0;
}
static ssize_t disk_process_rq_stat_show(struct device *dev,
                                         struct device_attribute *attr,
                                         char *buf)
{
    struct gendisk *disk = dev_to_disk(dev);

    return sprintf(buf, "%d\n", disk->process_io.enable);
}
static ssize_t disk_process_rq_stat_store(struct device *dev,
                                          struct device_attribute *attr,
                                          const char *buf, size_t count)
{
    struct gendisk *disk = dev_to_disk(dev);
    long intv;

    if (!count || !sscanf(buf, "%ld", &intv))
        return -EINVAL;

    if (!(intv == 0 || intv == 1))
        return -EINVAL;

    if(disk->process_io.enable != intv){
        if(intv)
        {
            INIT_LIST_HEAD(&(disk->process_io.process_io_control_head));
            spin_lock_init(&(disk->process_io.process_lock));
            atomic_set(&(disk->process_io.lock_count),0);//lock_count is a debug-only counter, see section 3.3
            disk->process_io.process_rq_stat_cachep = KMEM_CACHE(process_rq_stat,0);
            disk->process_io.process_io_info_cachep = KMEM_CACHE(process_io_info,0);

            disk->process_io.kernel_thread = kthread_create(process_rq_stat_thread,(void *)&disk->process_io,"process_rq_stat");
            if (IS_ERR(disk->process_io.kernel_thread )) {
                printk("%s kthread_create fail\n",__func__);
            }else{
                wake_up_process(disk->process_io.kernel_thread);
            }
        }
        disk->process_io.enable = intv;
    }
    return count;
}
static const DEVICE_ATTR(process_rq_stat, 0644,
                         disk_process_rq_stat_show,
                         disk_process_rq_stat_store);
static const struct attribute *disk_events_attrs[] = {
    &dev_attr_events.attr,
    &dev_attr_events_async.attr,
    &dev_attr_events_poll_msecs.attr,
    //creates the /sys/block/sd*/process_rq_stat file node
    &dev_attr_process_rq_stat.attr,
    NULL,
};
This code creates the /sys/block/sd*/process_rq_stat sysfs node, so that echo 1 >/sys/block/sd*/process_rq_stat enables the per-process IO latency statistics. It also performs the necessary initialization of the process_io_control members (the process_lock spin lock, the process_rq_stat_cachep and process_io_info_cachep slab caches, the process_io_control_head list, and so on) and creates the kernel thread. The thread function mainly calls print_process_io_info(), which prints each process's IO latency information; its source is:
block/blk-mq.c
void print_process_io_info(struct process_io_control *p_process_io_tmp)
{
    struct process_io_info *p_process_io_info_tmp = NULL;
    struct process_io_info *p_process_io_info_del = NULL;

    spin_lock_irq(&(p_process_io_tmp->process_lock));
    //walk the process_io_info of every process on the process_io_control_head list and print each process's IO latency data
    list_for_each_entry(p_process_io_info_tmp, &(p_process_io_tmp->process_io_control_head), process_io_info_list){

        if(p_process_io_info_del)//free the process_io_info of a process that has had no IO for a long time
        {
            list_del(&p_process_io_info_del->process_io_info_list);
            kmem_cache_free(p_process_io_tmp->process_io_info_cachep, p_process_io_info_del);
            p_process_io_info_del = NULL;
        }
        if(p_process_io_info_tmp->rq_count == 0)
        {
            p_process_io_info_tmp->rq_empty_count ++;
            //mark the process_io_info of a process with no IO requests for release
            if(p_process_io_info_tmp->rq_empty_count == 3)
            {
                p_process_io_info_del = p_process_io_info_tmp;
            }
        }
        else if(p_process_io_info_tmp->rq_empty_count != 0)
        {
            p_process_io_info_tmp->rq_empty_count = 0;
        }
    }
    spin_unlock_irq(&(p_process_io_tmp->process_lock));
    ………………//more code here, omitted
}
That covers the source code. Some features are still incomplete, but I wanted to try it out and see the effect first — and that is where the pain began.
2: What to do when the kernel hangs for no apparent reason
After building the kernel and rebooting, I eagerly ran echo 1 >/sys/block/sda/process_rq_stat to see the effect; sda is the system disk. A few seconds later the test VM hung and did not reboot automatically. I force-rebooted and tried a few more times; it hung every time. My gut said an illegal memory access had crashed the kernel. Time to find out where the problem was.
2.1 kdump was not working
2.1.1 initramfs-*kdump.img was not generated
systemctl status kdump showed that kdump was not working: initramfs-*kdump.img had not been generated because of insufficient space. It turned out the VM's /boot partition on sda3 had been given less than 300MB. Annoying — the only option was to delete unnecessary files. Files like initramfs-*kdump.img can be deleted because kdump can regenerate them. After systemctl restart kdump, kdump started normally. But echo 1 >/sys/block/sda/process_rq_stat still hung the machine without an automatic reboot. What could be the cause?
2.1.2 The kdump kernel's reserved memory was too small
As a test, I ran echo c >/proc/sysrq-trigger to deliberately crash the kernel. After the reboot there was still no crash log under /var/crash. Why? I switched back to the stock kernel, ran echo c >/proc/sysrq-trigger again, and after that reboot a crash log finally appeared under /var/crash.
Based on experience, could it be that with my self-built kernel the crashkernel reservation was too small for a crash dump to be generated?
So I edited /etc/default/grub, changed auto in GRUB_CMDLINE_LINUX="crashkernel=auto" to 512M, and ran grub2-mkconfig -o /boot/grub2/grub.cfg to update the grub configuration. Back on my kernel, after a reboot, echo c >/proc/sysrq-trigger finally produced a crash log on the next boot.
But echo 1 > /sys/block/sda/process_rq_stat still hung the machine — no reboot, no crash log. I was crushed.
2.2 The kernel crash sysctls were not configured
cat /proc/sys/kernel/panic* showed that most of them were 0. These parameters control whether the kernel panics on soft lockups, hard lockups, RCU stalls and similar anomalies. So I set them: for panic in `ls /proc/sys/kernel/panic*`;do echo 1 > $panic;done; echo 1 >/proc/sys/kernel/softlockup_panic. After echo 1 > /sys/block/sda/process_rq_stat the machine hung again as expected, and a few minutes later it finally generated a crash log and rebooted. The crash log:
1. [ 554.838125] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
2. [ 554.838130] rcu: 2-...0: (0 ticks this GP) idle=c0a/1/0x4000000000000000 softirq=15277/15278 fqs=11504
3. [ 554.838136] (detected by 3, t=60003 jiffies, g=16785, q=2)
4. [ 554.838138] Sending NMI from CPU 3 to CPUs 2:
5. [ 554.839150] NMI backtrace for cpu 2
6. [ 554.839151] CPU: 2 PID: 35 Comm: kworker/u256:2 Kdump: loaded Tainted: G ---------r- - 4.18.0 #33
7. ....................
8. [ 554.839152] RIP: 0010:native_queued_spin_lock_slowpath+0x20/0x1d0
9. ....................
10. [ 554.839154] Call Trace:
11. [ 554.839155] _raw_spin_lock_irq+0x23/0x26
12. [ 554.839155] blk_mq_sched_request_inserted+0x44/0x280
13. [ 554.839155] dd_insert_requests+0x9e/0x220
14. [ 554.839155] blk_mq_sched_insert_requests+0x65/0xf0
15. [ 554.839155] blk_mq_flush_plug_list+0x196/0x2c0
16. [ 554.839155] blk_flush_plug_list+0xd7/0x100
17. [ 554.839156] blk_finish_plug+0x21/0x2e
18. [ 554.839156] wb_writeback+0x175/0x2f0
19. [ 554.839156] ? btf_generic_check_kflag_member+0x30/0x40
20. [ 554.839156] ? cpumask_next+0x17/0x20
21. [ 554.839156] wb_workfn+0x333/0x3f0
22. [ 554.839156] ? __switch_to_asm+0x35/0x70
23. [ 554.839156] process_one_work+0x1a7/0x360
24. [ 554.839157] worker_thread+0x30/0x390
25. [ 554.839157] ? create_worker+0x1a0/0x1a0
26. [ 554.839157] kthread+0x112/0x130
27. [ 554.839157] ? kthread_flush_work_fn+0x10/0x10
28. [ 554.839157] ret_from_fork+0x35/0x40
29. [ 554.839196] Kernel panic - not syncing: RCU Stall
So the spin lock taken by spin_lock_irq(&(p_process_io_tmp->process_lock)) had gone wrong, causing a long spin in the native_queued_spin_lock_slowpath branch, which in turn triggered an RCU stall and a panic. Remarkable — an apparent deadlock!
Testing also showed that even with the configuration above, echo 1 > /sys/block/sda/process_rq_stat sometimes still hung the machine outright without producing a crash log. Puzzling; my guess is the system was only half hung, not hung badly enough to trigger a panic. Fine — now that I had a crash log, time to analyze the code properly.
3: Analyzing the hang caused by the abnormal spin lock
First, the code I added is quite simple; no matter how I looked at it, it should not be able to deadlock on the spin lock. Even taking into account that blk_account_io_done(), which runs at IO completion, executes in softirq context, that still should not cause a spin lock deadlock: I used spin_lock_irq/spin_unlock_irq, and there is only one lock. How could it possibly deadlock?
3.1 A first look at the crash scene
Start the crash utility on the vmcore generated when the kernel crashed:
1. //bt -a: look at the stack backtraces
2. crash> bt -a
3. PID: 2493 TASK: ffff8e28425c17c0 CPU: 0 COMMAND: "process_rq_stat"
4. #0 [fffffe0000007e50] crash_nmi_callback at ffffffffaee4eee3
5. #1 [fffffe0000007e58] nmi_handle at ffffffffaee23703
6. #2 [fffffe0000007eb0] default_do_nmi at ffffffffaee23ade
7. #3 [fffffe0000007ed0] do_nmi at ffffffffaee23cb8
8. #4 [fffffe0000007ef0] end_repeat_nmi at ffffffffaf8016d4
9. [exception RIP: native_queued_spin_lock_slowpath+32]
10. ....................
11. #5 [ffffa7c1c2007eb8] native_queued_spin_lock_slowpath at ffffffffaef11180
12. #6 [ffffa7c1c2007eb8] _raw_spin_lock_irq at ffffffffaf6cde43
13. #7 [ffffa7c1c2007ec0] print_process_io_info at ffffffffaf20cd09
14. #8 [ffffa7c1c2007f00] process_rq_stat_thread at ffffffffaf211598
15. #9 [ffffa7c1c2007f10] kthread at ffffffffaeed9502
16. #10 [ffffa7c1c2007f50] ret_from_fork at ffffffffaf800255
17.
18. PID: 0 TASK: ffff8e27cf0197c0 CPU: 1 COMMAND: "swapper/1"
19. #0 [fffffe0000032e50] crash_nmi_callback at ffffffffaee4eee3
20. #1 [fffffe0000032e58] nmi_handle at ffffffffaee23703
21. #2 [fffffe0000032eb0] default_do_nmi at ffffffffaee23ade
22. #3 [fffffe0000032ed0] do_nmi at ffffffffaee23cb8
23. #4 [fffffe0000032ef0] end_repeat_nmi at ffffffffaf8016d4
24. [exception RIP: native_safe_halt+14]
25. ....................
26. --- <NMI exception stack> ---
27. #5 [ffffa7c1c06b3ea0] native_safe_halt at ffffffffaf6cd67e
28. #6 [ffffa7c1c06b3ea0] default_idle at ffffffffaf6cd2ec
29. #7 [ffffa7c1c06b3ec8] do_idle at ffffffffaeeebbb7
30. #8 [ffffa7c1c06b3f10] cpu_startup_entry at ffffffffaeeebe0f
31. #9 [ffffa7c1c06b3f30] start_secondary at ffffffffaee50e51
32. #10 [ffffa7c1c06b3f50] secondary_startup_64 at ffffffffaee000e7
33.
34. PID: 35 TASK: ffff8e27da3baf80 CPU: 2 COMMAND: "kworker/u256:2"
35. #0 [fffffe000005de50] crash_nmi_callback at ffffffffaee4eee3
36. #1 [fffffe000005de58] nmi_handle at ffffffffaee23703
37. #2 [fffffe000005deb0] default_do_nmi at ffffffffaee23ade
38. #3 [fffffe000005ded0] do_nmi at ffffffffaee23cb8
39. #4 [fffffe000005def0] end_repeat_nmi at ffffffffaf8016d4
40. [exception RIP: native_queued_spin_lock_slowpath+32]
41. ....................
42. --- <NMI exception stack> ---
43. #5 [ffffa7c1c0b53bd8] native_queued_spin_lock_slowpath at ffffffffaef11180
44. #6 [ffffa7c1c0b53bd8] _raw_spin_lock_irq at ffffffffaf6cde43
45. #7 [ffffa7c1c0b53be0] blk_mq_sched_request_inserted at ffffffffaf20f224
46. #8 [ffffa7c1c0b53c18] dd_insert_requests at ffffffffaf225aae
47. #9 [ffffa7c1c0b53c60] blk_mq_sched_insert_requests at ffffffffaf20f9d5
48. #10 [ffffa7c1c0b53c98] blk_mq_flush_plug_list at ffffffffaf20acb6
49. #11 [ffffa7c1c0b53d18] blk_flush_plug_list at ffffffffaf1ffb67
50. #12 [ffffa7c1c0b53d68] blk_finish_plug at ffffffffaf1ffbb1
51. #13 [ffffa7c1c0b53d78] wb_writeback at ffffffffaf10b9d5
52. #14 [ffffa7c1c0b53e08] wb_workfn at ffffffffaf10ce43
53. #15 [ffffa7c1c0b53e98] process_one_work at ffffffffaeed3477
54. #16 [ffffa7c1c0b53ed8] worker_thread at ffffffffaeed3b40
55. #17 [ffffa7c1c0b53f10] kthread at ffffffffaeed9502
56. #18 [ffffa7c1c0b53f50] ret_from_fork at ffffffffaf800255
57.
58. PID: 0 TASK: ffff8e27cf038000 CPU: 3 COMMAND: "swapper/3"
59. #0 [ffff8e285a0c3ce8] machine_kexec at ffffffffaee5bf3e
60. #1 [ffff8e285a0c3d40] __crash_kexec at ffffffffaef6071d
61. #2 [ffff8e285a0c3e08] panic at ffffffffaeeb5dc7
62. #3 [ffff8e285a0c3e88] rcu_sched_clock_irq.cold.70 at ffffffffaef33dbd
63. #4 [ffff8e285a0c3ed0] update_process_times at ffffffffaef3fbe4
64. #5 [ffff8e285a0c3ee0] tick_sched_handle at ffffffffaef508d2
65. #6 [ffff8e285a0c3ef8] tick_sched_timer at ffffffffaef50be7
66. #7 [ffff8e285a0c3f18] __hrtimer_run_queues at ffffffffaef40890
67. #8 [ffff8e285a0c3f78] hrtimer_interrupt at ffffffffaef41060
68. #9 [ffff8e285a0c3fd8] smp_apic_timer_interrupt at ffffffffaf8027da
69. #10 [ffff8e285a0c3ff0] apic_timer_interrupt at ffffffffaf801d6f
70. --- <IRQ stack> ---
71. #11 [ffffa7c1c06c3df8] apic_timer_interrupt at ffffffffaf801d6f
72. [exception RIP: native_safe_halt+14]
73. ....................
74. #12 [ffffa7c1c06c3ea0] default_idle at ffffffffaf6cd2ec
75. #13 [ffffa7c1c06c3ec8] do_idle at ffffffffaeeebbb7
76. #14 [ffffa7c1c06c3f10] cpu_startup_entry at ffffffffaeeebe0f
77. #15 [ffffa7c1c06c3f30] start_secondary at ffffffffaee50e51
78. #16 [ffffa7c1c06c3f50] secondary_startup_64 at ffffffffaee000e7
Use bt -c 0/1/2/3 to look at each of the four CPUs' backtraces:
1. crash> bt -c 0
2. PID: 2493 TASK: ffff8e28425c17c0 CPU: 0 COMMAND: "process_rq_stat"
3. #0 [fffffe0000007e50] crash_nmi_callback at ffffffffaee4eee3
4. #1 [fffffe0000007e58] nmi_handle at ffffffffaee23703
5. #2 [fffffe0000007eb0] default_do_nmi at ffffffffaee23ade
6. #3 [fffffe0000007ed0] do_nmi at ffffffffaee23cb8
7. #4 [fffffe0000007ef0] end_repeat_nmi at ffffffffaf8016d4
8. [exception RIP: native_queued_spin_lock_slowpath+32]
9. ....................
10. --- <NMI exception stack> ---
11. #5 [ffffa7c1c2007eb8] native_queued_spin_lock_slowpath at ffffffffaef11180
12. #6 [ffffa7c1c2007eb8] _raw_spin_lock_irq at ffffffffaf6cde43
13. #7 [ffffa7c1c2007ec0] print_process_io_info at ffffffffaf20cd09
14. #8 [ffffa7c1c2007f00] process_rq_stat_thread at ffffffffaf211598
15. #9 [ffffa7c1c2007f10] kthread at ffffffffaeed9502
16. #10 [ffffa7c1c2007f50] ret_from_fork at ffffffffaf800255
17. crash> bt -c 1
18. PID: 0 TASK: ffff8e27cf0197c0 CPU: 1 COMMAND: "swapper/1"
19. #0 [fffffe0000032e50] crash_nmi_callback at ffffffffaee4eee3
20. #1 [fffffe0000032e58] nmi_handle at ffffffffaee23703
21. #2 [fffffe0000032eb0] default_do_nmi at ffffffffaee23ade
22. #3 [fffffe0000032ed0] do_nmi at ffffffffaee23cb8
23. #4 [fffffe0000032ef0] end_repeat_nmi at ffffffffaf8016d4
24. [exception RIP: native_safe_halt+14]
25. ....................
26. --- <NMI exception stack> ---
27. #5 [ffffa7c1c06b3ea0] native_safe_halt at ffffffffaf6cd67e
28. #6 [ffffa7c1c06b3ea0] default_idle at ffffffffaf6cd2ec
29. #7 [ffffa7c1c06b3ec8] do_idle at ffffffffaeeebbb7
30. #8 [ffffa7c1c06b3f10] cpu_startup_entry at ffffffffaeeebe0f
31. #9 [ffffa7c1c06b3f30] start_secondary at ffffffffaee50e51
32. #10 [ffffa7c1c06b3f50] secondary_startup_64 at ffffffffaee000e7
33. crash> bt -c 2
34. PID: 35 TASK: ffff8e27da3baf80 CPU: 2 COMMAND: "kworker/u256:2"
35. #0 [fffffe000005de50] crash_nmi_callback at ffffffffaee4eee3
36. #1 [fffffe000005de58] nmi_handle at ffffffffaee23703
37. #2 [fffffe000005deb0] default_do_nmi at ffffffffaee23ade
38. #3 [fffffe000005ded0] do_nmi at ffffffffaee23cb8
39. #4 [fffffe000005def0] end_repeat_nmi at ffffffffaf8016d4
40. [exception RIP: native_queued_spin_lock_slowpath+32]
41. ....................
42. --- <NMI exception stack> ---
43. #5 [ffffa7c1c0b53bd8] native_queued_spin_lock_slowpath at ffffffffaef11180
44. #6 [ffffa7c1c0b53bd8] _raw_spin_lock_irq at ffffffffaf6cde43
45. #7 [ffffa7c1c0b53be0] blk_mq_sched_request_inserted at ffffffffaf20f224
46. #8 [ffffa7c1c0b53c18] dd_insert_requests at ffffffffaf225aae
47. #9 [ffffa7c1c0b53c60] blk_mq_sched_insert_requests at ffffffffaf20f9d5
48. #10 [ffffa7c1c0b53c98] blk_mq_flush_plug_list at ffffffffaf20acb6
49. #11 [ffffa7c1c0b53d18] blk_flush_plug_list at ffffffffaf1ffb67
50. #12 [ffffa7c1c0b53d68] blk_finish_plug at ffffffffaf1ffbb1
51. #13 [ffffa7c1c0b53d78] wb_writeback at ffffffffaf10b9d5
52. #14 [ffffa7c1c0b53e08] wb_workfn at ffffffffaf10ce43
53. #15 [ffffa7c1c0b53e98] process_one_work at ffffffffaeed3477
54. #16 [ffffa7c1c0b53ed8] worker_thread at ffffffffaeed3b40
55. #17 [ffffa7c1c0b53f10] kthread at ffffffffaeed9502
56. #18 [ffffa7c1c0b53f50] ret_from_fork at ffffffffaf800255
57. crash> bt -c 3
58. PID: 0 TASK: ffff8e27cf038000 CPU: 3 COMMAND: "swapper/3"
59. #0 [ffff8e285a0c3ce8] machine_kexec at ffffffffaee5bf3e
60. #1 [ffff8e285a0c3d40] __crash_kexec at ffffffffaef6071d
61. #2 [ffff8e285a0c3e08] panic at ffffffffaeeb5dc7
62. #3 [ffff8e285a0c3e88] rcu_sched_clock_irq.cold.70 at ffffffffaef33dbd
63. #4 [ffff8e285a0c3ed0] update_process_times at ffffffffaef3fbe4
64. #5 [ffff8e285a0c3ee0] tick_sched_handle at ffffffffaef508d2
65. #6 [ffff8e285a0c3ef8] tick_sched_timer at ffffffffaef50be7
66. #7 [ffff8e285a0c3f18] __hrtimer_run_queues at ffffffffaef40890
67. #8 [ffff8e285a0c3f78] hrtimer_interrupt at ffffffffaef41060
68. #9 [ffff8e285a0c3fd8] smp_apic_timer_interrupt at ffffffffaf8027da
69. #10 [ffff8e285a0c3ff0] apic_timer_interrupt at ffffffffaf801d6f
70. --- <IRQ stack> ---
71. #11 [ffffa7c1c06c3df8] apic_timer_interrupt at ffffffffaf801d6f
72. [exception RIP: native_safe_halt+14]
73. ....................
74. #12 [ffffa7c1c06c3ea0] default_idle at ffffffffaf6cd2ec
75. #13 [ffffa7c1c06c3ec8] do_idle at ffffffffaeeebbb7
76. #14 [ffffa7c1c06c3f10] cpu_startup_entry at ffffffffaeeebe0f
77. #15 [ffffa7c1c06c3f30] start_secondary at ffffffffaee50e51
78. #16 [ffffa7c1c06c3f50] secondary_startup_64 at ffffffffaee000e7
Looking at the stack backtraces alone, this is not at all what a spin lock deadlock scene should look like! The only point of suspicion is that two threads are both stuck in the lock-acquisition slow path after failing to take the lock:
- 1: blk_mq_sched_request_inserted->_raw_spin_lock_irq->native_queued_spin_lock_slowpath
- 2: print_process_io_info->_raw_spin_lock_irq->native_queued_spin_lock_slowpath
But that alone proves nothing! Everything suggested this problem could not be tracked down with the usual deadlock reasoning, let alone by simply turning on spin lock debugging.
First of all, this deadlock was clearly related to my code. So I fell back on a very crude method: in the functions I had modified — blk_mq_sched_request_inserted, blk_mq_start_request, blk_account_io_done, print_process_io_info — I commented out my code bit by bit. To my surprise, once I commented out p_process_io_info_tmp->max_idc_time_rq = req in blk_account_io_done(), the hang disappeared! Astonishing.
My initial hypotheses for the spin lock hang were:
1: The kernel itself has a bug that my code happens to trigger; or my code hides a very deep spin lock bug.
2: A memory overwrite — something writes past its bounds into the spin lock variable used by spin_lock_irq(&(p_process_io_tmp->process_lock)), and that corruption eventually causes the deadlock. If so, the problem most likely lies in the traversal of the process_io_control_head list in blk_mq_sched_request_inserted or print_process_io_info. But in blk_mq_sched_request_inserted, even if the process_io_info found during the traversal were bad, it is a process_io_info allocated with kmem_cache_alloc — how could that affect the spin lock inside process_io_control? And print_process_io_info barely seemed suspicious at all.
3: Some unknown problem? A ghost in the machine? I have seen plenty of bizarre issues before!
Too many directions to suspect; I did not even know which way to start digging.
3.2 Digging further into the spin lock
In later tests a new deadlock log appeared:
1. [ 216.259727] watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [kworker/0:1H:490]
2. [ 216.259753] mptbase serio_raw fuse
3. [ 216.259765] CPU: 0 PID: 490 Comm: kworker/0:1H Kdump: loaded Tainted: G ---------r- - 4.18.0 #42
4. ............
5. [ 216.259800] _raw_spin_lock+0x1c/0x20
6. [ 216.259802] dd_dispatch_request+0x31/0x230
7. [ 216.259804] ? __switch_to_asm+0x35/0x70
8. [ 216.259806] blk_mq_do_dispatch_sched+0xc5/0x160
9. [ 216.259807] ? __switch_to_asm+0x35/0x70
10. [ 216.259809] __blk_mq_sched_dispatch_requests+0xfe/0x160
11. [ 216.259810] blk_mq_sched_dispatch_requests+0x30/0x60
12. [ 216.259811] __blk_mq_run_hw_queue+0x51/0xd0
13. [ 216.259813] process_one_work+0x1a7/0x360
14. [ 216.259814] worker_thread+0x30/0x390
15. [ 216.259815] ? create_worker+0x1a0/0x1a0
16. [ 216.259816] kthread+0x112/0x130
17. [ 216.259817] ? kthread_flush_work_fn+0x10/0x10
This is a new lockup scene, but my read was: some IO-submitting process was probably still stuck in blk_mq_sched_request_inserted, and it had to be related to the code I added in blk_mq_sched_request_inserted().
So I added printk debug output at every step in blk_mq_sched_request_inserted, blk_mq_start_request, blk_account_io_done and print_process_io_info (I was debugging in a VM, so I was not worried about printk sleeping — normally you must not sleep while holding a spin lock). The point was to verify that every process executing my spin lock code locked and unlocked normally and that no process was holding the lock forever. After building the new kernel and rebooting, echo 1 > /sys/block/sda/process_rq_stat quickly triggered the deadlock and a kernel crash. The crash log showed that the processes running blk_mq_sched_request_inserted, blk_mq_start_request, blk_account_io_done and print_process_io_info all acquired and released the spin lock normally — and then, suddenly, some process calling spin_lock_irq in blk_mq_sched_request_inserted or print_process_io_info dropped straight into native_queued_spin_lock_slowpath and deadlocked. In short, my use of the spin lock itself was fine; it should not deadlock!
Next I suspected that the hang was caused by the list_del(&p_process_io_info_del->process_io_info_list) and kmem_cache_free(p_process_io_tmp->process_io_info_cachep, p_process_io_info_del) in print_process_io_info() that remove the process_io_info. So I commented that code out and built a new kernel — it still hung.
Exasperating! But combined with the earlier tests, I concluded that kernel memory was being overwritten. We had already verified that commenting out p_process_io_info_tmp->max_idc_time_rq = req in blk_account_io_done() makes the hang go away. So that assignment must be writing out of bounds, right into the process_lock spin lock variable, corrupting it so that later acquisitions spin forever. How to verify the guess? Print the spin lock variable's value!
3.3 Printing the spin lock variable to confirm the memory overwrite
I had never used this method before and did not know whether the spin lock variable could be printed directly. I tried, and it works! Here is its definition:
typedef struct spinlock {
    union {
        struct raw_spinlock rlock;
    }
}
struct raw_spinlock {
    arch_spinlock_t raw_lock;
}
crash> struct arch_spinlock_t
typedef struct qspinlock {
    union {
        atomic_t val;
        struct {
            u8 locked;
            u8 pending;
        };
        struct {
            u16 locked_pending;
            u16 tail;
        };
    };
} arch_spinlock_t;
Right — just print the val inside the spin lock's raw_spinlock/arch_spinlock_t. I also added an atomic_t lock_count member to struct process_io_control that is incremented and decremented in lockstep with process_lock, to prove that my code makes no unbalanced lock calls that could corrupt the process_lock variable and trigger the deadlock.
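For completeness, this is what the process_io_control definition looks like with that debug counter added (it matches the crash> struct layout shown later in section 3.4; the field is for debugging only and can be dropped once the problem is solved):
struct process_io_control{
    int enable;
    spinlock_t process_lock;
    struct list_head process_io_control_head;
    struct task_struct *kernel_thread;
    struct kmem_cache *process_rq_stat_cachep;
    struct kmem_cache *process_io_info_cachep;
    //debug only: incremented after every spin_lock and decremented after every spin_unlock
    //on process_lock, to prove the lock calls stay balanced
    atomic_t lock_count;
};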
Then add the debug output to blk_mq_sched_request_inserted, blk_mq_start_request, blk_account_io_done and print_process_io_info:
void blk_mq_sched_request_inserted(struct request *rq)
{
    .........................
    if(rq->rq_disk && rq->rq_disk->process_io.enable){
        struct process_rq_stat *p_process_rq_stat_tmp = NULL;
        struct process_io_info *p_process_io_info_tmp = NULL;
        int find = 0;
        ///print the process_io.lock_count atomic and the process_io.process_lock.rlock.raw_lock.val atomic
        printk("%s %s %d process_io_count:%d lock_count:%d spin_lock:%d 1\n",__func__,current->comm,current->pid,process_io_count,atomic_read(&(rq->rq_disk->process_io.lock_count)),atomic_read(&(rq->rq_disk->process_io.process_lock.rlock.raw_lock.val)));
        spin_lock_irq(&(rq->rq_disk->process_io.process_lock));
        atomic_inc(&(rq->rq_disk->process_io.lock_count));
        list_for_each_entry(p_process_io_info_tmp, &(rq->rq_disk->process_io.process_io_control_head), process_io_info_list){
            if(p_process_io_info_tmp->pid == current->pid){
                p_process_io_info_tmp->rq_count ++;
                find = 1;
            }
        }
        spin_unlock_irq(&(rq->rq_disk->process_io.process_lock));

        if(0 == find){
            ....................
            p_process_io_info_tmp = kmem_cache_alloc(rq->rq_disk->process_io.process_io_info_cachep,GFP_ATOMIC);
            ....................
        }
        p_process_rq_stat_tmp = kmem_cache_alloc(rq->rq_disk->process_io.process_rq_stat_cachep,GFP_ATOMIC);
        ....................
        p_process_io_info_tmp->pid = current->pid;
        strncpy(p_process_io_info_tmp->comm,current->comm,COMM_LEN-1);

        p_process_rq_stat_tmp->p_process_io_info = p_process_io_info_tmp;
        p_process_rq_stat_tmp->rq_inset_time = ktime_to_us(ktime_get());
        rq->p_process_rq_stat = p_process_rq_stat_tmp;
        printk("%s %s %d process_io_count:%d 3\n",__func__,current->comm,current->pid,process_io_count);
        return;
        ...................
    }

}
void print_process_io_info(struct process_io_control *p_process_io_tmp)
{
    struct process_io_info *p_process_io_info_tmp = NULL;
    struct process_io_info *p_process_io_info_del = NULL;

    ///print the process_io.lock_count atomic and the process_io.process_lock.rlock.raw_lock.val atomic
    printk("%s %s %d process_io_count:%d lock_count:%d spin_lock:%d in\n",__func__,current->comm,current->pid,process_io_count,atomic_read(&(p_process_io_tmp->lock_count)),atomic_read(&(p_process_io_tmp->process_lock.rlock.raw_lock.val)));

    spin_lock_irq(&(p_process_io_tmp->process_lock));
    atomic_inc(&(p_process_io_tmp->lock_count));
    list_for_each_entry(p_process_io_info_tmp, &(p_process_io_tmp->process_io_control_head), process_io_info_list){
        ..........................
    }
    ..........................
    spin_unlock_irq(&(p_process_io_tmp->process_lock));
    atomic_dec(&(p_process_io_tmp->lock_count));
    printk("%s %s %d process_io_count:%d out\n",__func__,current->comm,current->pid,process_io_count);
}
void blk_mq_start_request(struct request *rq)
{
    if(rq->rq_disk && rq->rq_disk->process_io.enable && rq->p_process_rq_stat){
        struct process_rq_stat *p_process_rq_stat_tmp = rq->p_process_rq_stat;
        struct process_io_info *p_process_io_info_tmp = rq->p_process_rq_stat->p_process_io_info;

        printk("%s %s %d\n",__func__,current->comm,current->pid);
        p_process_rq_stat_tmp->rq_issue_time = ktime_to_us(ktime_get());
        ..........................
    }
}
void blk_account_io_done(struct request *req, u64 now)
{
    if(req->rq_disk && req->rq_disk->process_io.enable && req->p_process_rq_stat){
        struct process_rq_stat *p_process_rq_stat_tmp = req->p_process_rq_stat;
        struct process_io_info *p_process_io_info_tmp = req->p_process_rq_stat->p_process_io_info;
        ..........................
        if(p_process_rq_stat_tmp->idc_time > p_process_io_info_tmp->max_idc_time){
            p_process_io_info_tmp->max_idc_time = p_process_rq_stat_tmp->idc_time;
            ///print the process_io.process_lock.rlock.raw_lock.val atomic
            printk("%s spin_lock:%d 1\n",__func__,atomic_read(&(req->rq_disk->process_io.process_lock.rlock.raw_lock.val)));
            p_process_io_info_tmp->max_idc_time_rq = req;
            ///print it again right after the assignment
            printk("%s spin_lock:%d 2\n",__func__,atomic_read(&(req->rq_disk->process_io.process_lock.rlock.raw_lock.val)));
        }
        ..........................
        kmem_cache_free(req->rq_disk->process_io.process_rq_stat_cachep, p_process_rq_stat_tmp);
        req->p_process_rq_stat = NULL;
        ///print it once more
        printk("%s spin_lock:%d 3\n",__func__,atomic_read(&(req->rq_disk->process_io.process_lock.rlock.raw_lock.val)));
    }
}
With the debug output in place, I built the new kernel and rebooted. After echo 1 > /sys/block/sda/process_rq_stat the kernel soon deadlocked and generated a crash log; the key lines are:
1. [ 315.237578] blk_mq_sched_request_inserted systemd-journal 631 process_io_count:1 lock_count:0 spin_lock:0 1
2. [ 315.237579] blk_mq_sched_request_inserted systemd-journal 631 process_io_count:1 2
3. [ 315.237580] blk_mq_sched_request_inserted systemd-journal 631 process_io_count:1 3
4. [ 315.237587] blk_mq_start_request kworker/0:1H 488
5. [ 315.237768] blk_account_io_done spin_lock:0 1
6. //sure enough, right after the assignment p_process_io_info_tmp->max_idc_time_rq = req in blk_account_io_done(), the spin lock value jumps!
7. [ 315.237769] blk_account_io_done spin_lock:-26476 2
8. [ 315.237769] blk_account_io_done spin_lock:-26476 3
9. [ 315.237798] blk_mq_sched_request_inserted systemd-journal 631 process_io_count:1 lock_count:0 spin_lock:-26476 1
10. [ 316.287974] print_process_io_info process_rq_stat 3257 process_io_count:1 lock_count:0 spin_lock:-26476 in
11. [ 344.423606] watchdog: BUG: soft lockup - CPU#2 stuck for 22s! [kworker/u256:1:34]
12. ...............
13. [ 344.423669] Call Trace:
14. [ 344.423674] _raw_spin_lock+0x1c/0x20
15. [ 344.423676] dd_bio_merge+0x43/0x90
16. [ 344.423679] blk_mq_make_request+0xe0/0x5d0
17. [ 344.423681] generic_make_request+0xcf/0x310
18. [ 344.423683] submit_bio+0x3c/0x160
19. [ 344.423708] ? xfs_prepare_ioend+0xc2/0xd0 [xfs]
At last, the key log showing the spin lock variable being corrupted. But I had an even better idea: print the req pointer being assigned in p_process_io_info_tmp->max_idc_time_rq = req — there might be something interesting there. So I changed the first printk in blk_account_io_done from printk("%s spin_lock:%d 1\n",__func__,atomic_read(&(req->rq_disk->process_io.process_lock.rlock.raw_lock.val))) to printk("%s req:0x%llx spin_lock:%d 1\n",__func__,(u64)req,atomic_read(&(req->rq_disk->process_io.process_lock.rlock.raw_lock.val))).
I rebuilt the kernel and triggered the deadlock again; the new crash log:
- blk_account_io_done req:0xffff95cfdce87810 spin_lock:0 1
- blk_account_io_done spin_lock:-27185 2
- blk_account_io_done spin_lock:-27185 3
So the req pointer is 0xffff95cfdce87810 and the spin lock value is -27185. Printed in hex, -27185 is 0xffff95cf — exactly the upper 32 bits of the 64-bit value req:0xffff95cf dce87810. That confirms it 100%: p_process_io_info_tmp->max_idc_time_rq = req is writing out of bounds into the spin lock, specifically into the spinlock_t process_lock member of the process_io_control structure.
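As a quick sanity check of that arithmetic, a tiny user-space program reproduces the corruption pattern (a sketch, not kernel code; the struct below is a made-up stand-in that only mirrors the first 8 bytes of process_io_control): on little-endian x86-64, storing the 8-byte pointer at the struct base leaves the pointer's low 32 bits in enable and its high 32 bits in the lock word, and 0xffff95cf read as a signed 32-bit integer is exactly -27185.
#include <stdio.h>
#include <stdint.h>
#include <string.h>

//stand-in for the first 8 bytes of struct process_io_control:
//int enable at offset 0x0, the 32-bit spin lock word at offset 0x4
struct fake_process_io_control {
    int32_t enable;
    int32_t lock_val;
};

int main(void)
{
    uint64_t req = 0xffff95cfdce87810ULL;  //the req pointer from the crash log
    struct fake_process_io_control pio = {0, 0};

    //the out-of-bounds max_idc_time_rq store lands on the struct base,
    //so the 8-byte pointer overwrites enable and lock_val in one go
    memcpy(&pio, &req, sizeof(req));

    //on little-endian x86-64 the low 32 bits land in enable and the high 32 bits
    //land in lock_val; this prints lock_val = -27185 (0xffff95cf)
    printf("enable   = %d (0x%x)\n", pio.enable, (uint32_t)pio.enable);
    printf("lock_val = %d (0x%x)\n", pio.lock_val, (uint32_t)pio.lock_val);
    return 0;
}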
I kept debugging, because I simply could not understand how the assignment p_process_io_info_tmp->max_idc_time_rq = req could end up writing to the spinlock_t process_lock member of process_io_control. So I added more debug output.
The first printk in blk_mq_sched_request_inserted became: printk("%s %s %d process_io_count:%d lock_count:%d spin_lock:%d enable:%d 1\n",__func__,current->comm,current->pid,process_io_count,atomic_read(&(rq->rq_disk->process_io.lock_count)),atomic_read(&(rq->rq_disk->process_io.process_lock.rlock.raw_lock.val)),rq->rq_disk->process_io.enable)
The third printk in blk_mq_sched_request_inserted became: printk("%s %s %d process_io_count:%d process_io:0x%llx rq:0x%llx rq->p_process_rq_stat:0x%llx process_io_info:0x%llx 3\n",__func__,current->comm,current->pid,process_io_count,(u64)(&rq->rq_disk->process_io),(u64)rq,(u64)rq->p_process_rq_stat,(u64)p_process_rq_stat_tmp->p_process_io_info)
The first printk in blk_account_io_done became: printk("%s req:0x%llx process_rq_stat:0x%llx process_io_info:0x%llx max_idc_time_rq:0x%llx spin_lock:%d 1\n",__func__,(u64)req,(u64)req->p_process_rq_stat,(u64)req->p_process_rq_stat->p_process_io_info,(u64)(&(p_process_io_info_tmp->max_idc_time_rq)),atomic_read(&(req->rq_disk->process_io.process_lock.rlock.raw_lock.val)))
The point was to print every pointer involved in the IO: req, process_rq_stat, process_io and process_io_info. After building the new kernel and rebooting, echo 1 > /sys/block/sda/process_rq_stat soon hung the machine again and produced a crash log:
- [ 2255.500850] blk_account_io_done spin_lock:0 2
- [ 2255.500850] blk_account_io_done spin_lock:0 3
- [ 2255.501102] blk_mq_sched_request_inserted systemd-journal 632 process_io_count:3 lock_count:0 spin_lock:0 enable:1 1
- [ 2255.501103] blk_mq_sched_request_inserted systemd-journal 632 process_io_count:3 2
- //note that the process_io_info pointer here is already 0xffff928cb38f1800
- [ 2255.501104] blk_mq_sched_request_inserted systemd-journal 632 process_io_count:3 process_io:0xffff928cb38f1838 rq:0xffff928cb0add990 rq->p_process_rq_stat:0xffff928cb0089390 process_io_info:0xffff928cb38f1800 3
- [ 2255.501107] blk_mq_start_request systemd-journal 632
- //the process_io_info pointer 0xffff928cb38f1800 and the process_io pointer 0xffff928cb38f1838 differ by only 0x38, and the address of process_io_info's max_idc_time_rq member is the same as process_io
- [ 2255.501686] blk_account_io_done req:0xffff928cb0add990 process_rq_stat:0xffff928cb0089390 process_io_info:0xffff928cb38f1800 max_idc_time_rq:0xffff928cb38f1838 spin_lock:0 1
- [ 2255.501686] blk_account_io_done spin_lock:-28020 2
- [ 2255.501687] blk_account_io_done spin_lock:-28020 3
- [ 2256.511605] print_process_io_info process_rq_stat 6627 process_io_count:3 lock_count:0 spin_lock:-28020 in
- //not only has process_io's spin lock been corrupted, its enable member has been corrupted too; needless to say, both are tied to req:0xffff928cb0add990
- [ 2267.191646] blk_mq_sched_request_inserted kworker/u256:0 8 process_io_count:3 lock_count:0 spin_lock:-28020 enable:-1330783856 1
- [ 2292.484150] watchdog: BUG: soft lockup - CPU#0 stuck for 23s! [kworker/0:0:6455]
Clearly, based on these tests, the culprit is p_process_io_info_tmp->max_idc_time_rq = req in blk_account_io_done (p_process_io_info_tmp is req->p_process_rq_stat->p_process_io_info).
According to the printed addresses, the req->p_process_rq_stat->p_process_io_info pointer differs from the address of the process_io_control structure by exactly 0x38: it points 0x38 bytes before process_io_control (which is itself 0x38 bytes in size), so the two overlap, and the address of its max_idc_time_rq member is exactly the base address of process_io_control. No wonder the assignment p_process_io_info_tmp->max_idc_time_rq = req in blk_account_io_done overwrites the members at the start of process_io_control — the enable variable and the process_lock spin lock.
process_io_control is embedded in struct gendisk — effectively a single global object — whereas the process_io_info that req->p_process_rq_stat->p_process_io_info points to is allocated dynamically with kmem_cache_alloc. Normally their addresses should be far apart. Why do they overlap?
The req->p_process_rq_stat->p_process_io_info pointer is assigned in blk_mq_sched_request_inserted(). Look at the last printk from blk_mq_sched_request_inserted(): the req->p_process_rq_stat->p_process_io_info it prints is already 0xffff928cb38f1800! The pointer is already wrong at that point — it already points at the process_io_control structure!
Reviewing the blk_mq_sched_request_inserted() code at this point, there is a problem in it:
void blk_mq_sched_request_inserted(struct request *rq)
{
    ..................
    list_for_each_entry(p_process_io_info_tmp, &(rq->rq_disk->process_io.process_io_control_head), process_io_info_list){
        if(p_process_io_info_tmp->pid == current->pid){
            p_process_io_info_tmp->rq_count ++;
            find = 1;
        }
    }
    ..................
}
This list_for_each_entry() walks the process_io_info of every process on the process_io.process_io_control_head list and only exits after reaching the tail of the list — which means that after list_for_each_entry() finishes, p_process_io_info_tmp (or so I assumed) always refers to the last member on the list. That is not what I designed it to do: when the current process inserts an IO request and a process_io_info with a matching pid is found on the list, i.e. if(p_process_io_info_tmp->pid == current->pid) holds, it means the process already allocated a process_io_info during an earlier IO and it is already on the list, so no new process_io_info needs to be allocated.
So when if(p_process_io_info_tmp->pid == current->pid) holds, the loop should break and stop walking the list. But here is the puzzle: even with this design flaw, if p_process_io_info_tmp were simply the last process_io_info on the process_io.process_io_control_head list each time, then after the assignments p_process_rq_stat_tmp->p_process_io_info = p_process_io_info_tmp and rq->p_process_rq_stat = p_process_rq_stat_tmp, how could p_process_rq_stat_tmp->p_process_io_info end up being an address inside the process_io_control structure?
As a test, I added a break at the end of the if(p_process_io_info_tmp->pid == current->pid) branch, rebuilt the kernel, rebooted, and ran echo 1 > /sys/block/sda/process_rq_stat — no matter how hard I tested, it would not hang. Clearly the root cause lay here, in the list_for_each_entry(p_process_io_info_tmp, &(rq->rq_disk->process_io.process_io_control_head), process_io_info_list) loop. Time to look at the list_for_each_entry source!
#define list_for_each_entry(pos, head, member)                \
    for (pos = list_first_entry(head, typeof(*pos), member);  \
         &pos->member != (head);                              \
         pos = list_next_entry(pos, member))
Reading the list_for_each_entry source raised the key realization: the loop does not finish with pos left pointing at the last list member, as I had assumed — I had it wrong! What actually happens is that once pos points at the last member and pos = list_next_entry(pos, member) runs one more time, pos no longer points at a list member at all; it points at the structure that contains the list head, i.e. at an offset from process_io_control. Only then does &pos->member != (head) fail: the check computes the member address inside that bogus "entry" and compares it against the head node.
In other words, after list_for_each_entry() runs to completion, p_process_io_info_tmp refers to the list head's container — memory in and around the process_io_control structure — not to a process_io_info. No wonder writing to p_process_io_info_tmp's members afterwards corrupts process_io_control. The problem does not show up every time, though: when if(p_process_io_info_tmp->pid == current->pid) never matches, a new process_io_info is allocated below and assigned to p_process_io_info_tmp, and everything is fine in that case.
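For reference, a minimal sketch of the corrected lookup (the author's actual fix is just the break; the comments explain why it matters):
spin_lock_irq(&(rq->rq_disk->process_io.process_lock));
list_for_each_entry(p_process_io_info_tmp, &(rq->rq_disk->process_io.process_io_control_head), process_io_info_list){
    if(p_process_io_info_tmp->pid == current->pid){
        p_process_io_info_tmp->rq_count ++;
        find = 1;
        //stop at the matching entry: p_process_io_info_tmp still points at a real
        //process_io_info, not at the list head's container
        break;
    }
}
spin_unlock_irq(&(rq->rq_disk->process_io.process_lock));
//when find == 1, p_process_io_info_tmp is only safe to use because of the break above;
//without it the loop runs to the end and leaves the iterator pointing just before process_io_control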
3.4 Root cause of the spin lock corruption
Here is how p_process_io_info_tmp ends up pointing at the process_io_control structure. In the normal case, in pos = list_next_entry(pos, member), the next pointer of pos's list node points at the struct list_head process_io_info_list of the next process_io_info; taking that process_io_info_list address as the base and subtracting 0x40 moves back to the start of that process_io_info, and that is the address returned:
1. crash> struct process_io_info -o -x
2. struct process_io_info {
3. [0x0] int pid;
4. [0x4] char comm[16];
5. [0x14] int rq_count;
6. [0x18] int rq_empty_count;
7. [0x1c] u32 max_id_time;
8. [0x20] u32 max_dc_time;
9. [0x24] u32 max_idc_time;
10. [0x28] struct request *max_id_time_rq;
11. [0x30] struct request *max_dc_time_rq;
12. [0x38] struct request *max_idc_time_rq;
13. [0x40] struct list_head process_io_info_list;
14. }
15. SIZE: 0x50
16.
17. -(0x40-0x8) + 0x38 = 0
18. crash> struct process_io_control -o -x
19. [-0x38] .........
20. struct process_io_control {
21. [0x0] int enable;
22. [0x4] spinlock_t process_lock;
23. [0x8] struct list_head process_io_control_head;
24. [0x18] struct task_struct *kernel_thread;
25. [0x20] struct kmem_cache *process_rq_stat_cachep;
26. [0x28] struct kmem_cache *process_io_info_cachep;
27. [0x30] atomic_t lock_count;
28. }
29. SIZE: 0x38
30. #define list_for_each_entry(pos, head, member) \
31. for (pos = list_first_entry(head, typeof(*pos), member); \
32. &pos->member != (head); \
33. pos = list_next_entry(pos, member))
When pos is the last list member, pos = list_next_entry(pos, member) works like this: the next pointer of pos's list node points at the list head, i.e. at the struct list_head process_io_control_head member of process_io_control. Taking that process_io_control_head address as the base, it again subtracts 0x40; but process_io_control_head sits at offset 0x8 inside struct process_io_control, so the resulting pos (that is, p_process_io_info_tmp) points 0x40-0x8 = 0x38 bytes before the process_io_control structure. The list_for_each_entry loop then terminates.
Now, p_process_io_info_tmp->max_idc_time_rq is simply p_process_io_info_tmp plus an offset of 0x38. Since p_process_io_info_tmp points -(0x40-0x8) = -0x38 bytes from struct process_io_control, the address of p_process_io_info_tmp->max_idc_time_rq is 0x38 + (-0x38) = 0, i.e. exactly the base address of struct process_io_control. So p_process_io_info_tmp->max_idc_time_rq points at the base of struct process_io_control, and the assignment p_process_io_info_tmp->max_idc_time_rq = req writes out of bounds over the first 8 bytes of struct process_io_control.
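The offset arithmetic can also be reproduced in user space (a sketch: list_head and container_of are re-declared locally, plain pointers stand in for the kernel types, and the field layout is assumed to match the crash> output above):
#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

//same field order as the crash> output above
struct process_io_info {
    int pid;
    char comm[16];
    int rq_count;
    int rq_empty_count;
    unsigned int max_id_time, max_dc_time, max_idc_time;
    void *max_id_time_rq, *max_dc_time_rq, *max_idc_time_rq;
    struct list_head process_io_info_list;          //offset 0x40
};

struct process_io_control {
    int enable;                                     //offset 0x0
    int process_lock;                               //offset 0x4, stands in for spinlock_t
    struct list_head process_io_control_head;       //offset 0x8
    void *kernel_thread, *process_rq_stat_cachep, *process_io_info_cachep;
    int lock_count;                                 //offset 0x30, total size 0x38
};

int main(void)
{
    struct process_io_control ctl;

    //what the final pos = list_next_entry(pos, member) computes when pos->next is the
    //list head: treat the head as if it were a process_io_info_list member
    struct process_io_info *bogus =
        container_of(&ctl.process_io_control_head, struct process_io_info, process_io_info_list);

    //bogus sits 0x38 bytes below process_io_control, and &bogus->max_idc_time_rq
    //is exactly the base address of process_io_control (enable + process_lock)
    printf("process_io_control - bogus pos = 0x%lx\n",
           (unsigned long)((char *)&ctl - (char *)bogus));
    printf("&bogus->max_idc_time_rq == &ctl ? %d\n",
           (void *)&bogus->max_idc_time_rq == (void *)&ctl);
    return 0;
}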
While chasing the problem I had too many theories and never reviewed the code carefully. In my memory, the if(p_process_io_info_tmp->pid == current->pid) branch inside the list_for_each_entry in blk_mq_sched_request_inserted() had a break. I read that function many times afterwards and still failed to notice the missing break — preconceptions are dangerous. This happens a lot in daily work: the code is plainly wrong, but because of a preconceived picture the brain keeps insisting it is right. Don't trust your brain 100%; sleep on it and re-read the code, and you may see something new.
Originally the break after if(p_process_io_info_tmp->pid == current->pid) was there, but I removed it temporarily to test the list traversal and then forgot to put it back. Because of work interruptions, this code was written and tested on and off over a week, and the train of thought kept getting broken.
4: Yet another kernel lockup
No sooner had one wave subsided than another rose: after fixing the previous problem I ran the tests again full of optimism — and the kernel locked up again!
1. [ 125.044681] blk_mq_sched_request_inserted make 8332 process_io_count:336 1
2. [ 125.044684] blk_mq_sched_request_inserted make 8332 process_io_count:336 2
3. [ 125.044685] blk_mq_sched_request_inserted make 8332 process_io_count:336 3
4. ......................
5. //the make process with pid 8316 is stuck: it never printed "blk_mq_sched_request_inserted make 8316 process_io_count:336 3"
6. [ 125.061609] blk_mq_sched_request_inserted make 8316 process_io_count:336 2
7. [ 148.553926] watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [make:8332]
8. .............
9. [ 148.553956] CPU: 0 PID: 8332 Comm: make Kdump: loaded Tainted: G ---------r- - 4.18.0 #43
10. [ 148.553957] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 02/27/2020
11. [ 148.553960] RIP: 0010:native_queued_spin_lock_slowpath+0x20/0x1d0
12. .............
13. [ 148.553984] Call Trace:
14. [ 148.553988] _raw_spin_lock+0x1c/0x20
15. [ 148.553991] dd_bio_merge+0x43/0x90
16. [ 148.553994] blk_mq_make_request+0xe0/0x5d0
17. [ 148.553997] ? iomap_ioend_try_merge+0xe0/0xe0
18. [ 148.553999] generic_make_request+0xcf/0x310
19. [ 148.554000] submit_bio+0x3c/0x160
20. .............
21. crash> bt -a
22. PID: 8332 TASK: ffff9d1d43f9c740 CPU: 0 COMMAND: "make"
23. #0 [ffff9d1d9a003d48] machine_kexec at ffffffff8225bf3e
24. #1 [ffff9d1d9a003da0] __crash_kexec at ffffffff8236071d
25. #2 [ffff9d1d9a003e68] panic at ffffffff822b5dc7
26. #3 [ffff9d1d9a003ee8] watchdog_timer_fn.cold.8 at ffffffff82396db7
27. #4 [ffff9d1d9a003f18] __hrtimer_run_queues at ffffffff82340890
28. .............
29. #9 [ffffb725c34779e0] _raw_spin_lock at ffffffff82acde1c
30. #10 [ffffb725c34779e8] dd_bio_merge at ffffffff82625a93
31. #11 [ffffb725c3477a18] blk_mq_make_request at ffffffff8260a380
32. #12 [ffffb725c3477aa8] generic_make_request at ffffffff825fe92f
33. #13 [ffffb725c3477b00] submit_bio at ffffffff825febac
34. .............
35.
36. PID: 8081 TASK: ffff9d1d43c24740 CPU: 1 COMMAND: "make"
37. #0 [fffffe0000032e50] crash_nmi_callback at ffffffff8224eee3
38. #1 [fffffe0000032e58] nmi_handle at ffffffff82223703
39. #2 [fffffe0000032eb0] default_do_nmi at ffffffff82223ade
40. #3 [fffffe0000032ed0] do_nmi at ffffffff82223cb8
41. .............
42. #5 [ffffb725c31b79e0] native_queued_spin_lock_slowpath at ffffffff82311180
43. #6 [ffffb725c31b79e0] _raw_spin_lock at ffffffff82acde1c
44. #7 [ffffb725c31b79e8] dd_bio_merge at ffffffff82625a93
45. #8 [ffffb725c31b7a18] blk_mq_make_request at ffffffff8260a380
46. #9 [ffffb725c31b7aa8] generic_make_request at ffffffff825fe92f
47. #10 [ffffb725c31b7b00] submit_bio at ffffffff825febac
48. .............
49. PID: 8326 TASK: ffff9d1d35f297c0 CPU: 2 COMMAND: "make"
50. #0 [fffffe000005de50] crash_nmi_callback at ffffffff8224eee3
51. #1 [fffffe000005de58] nmi_handle at ffffffff82223703
52. #2 [fffffe000005deb0] default_do_nmi at ffffffff82223ade
53. #3 [fffffe000005ded0] do_nmi at ffffffff82223cb8
54. .............
55. #5 [ffffb725c34279e0] native_queued_spin_lock_slowpath at ffffffff82311180
56. #6 [ffffb725c34279e0] _raw_spin_lock at ffffffff82acde1c
57. #7 [ffffb725c34279e8] dd_bio_merge at ffffffff82625a93
58. #8 [ffffb725c3427a18] blk_mq_make_request at ffffffff8260a380
59. #9 [ffffb725c3427aa8] generic_make_request at ffffffff825fe92f
60. #10 [ffffb725c3427b00] submit_bio at ffffffff825febac
61. .............
62. PID: 8306 TASK: ffff9d1d43f60000 CPU: 3 COMMAND: "make"
63. .............
64. #5 [ffffb725c34379e0] native_queued_spin_lock_slowpath at ffffffff82311180
65. #6 [ffffb725c34379e0] _raw_spin_lock at ffffffff82acde1c
66. #7 [ffffb725c34379e8] dd_bio_merge at ffffffff82625a93
67. #8 [ffffb725c3437a18] blk_mq_make_request at ffffffff8260a380
68. #9 [ffffb725c3437aa8] generic_make_request at ffffffff825fe92f
69. #10 [ffffb725c3437b00] submit_bio at ffffffff825febac
70. No obviously abnormal process so far, but the make process with pid 8316 is clearly stuck:
71. crash> bt 8316
72. PID: 8316 TASK: ffff9d1d569c5f00 CPU: 1 COMMAND: "make"
73. #0 [ffffb725c3457930] __schedule at ffffffff82ac8826
74. #1 [ffffb725c34579c8] preempt_schedule_common at ffffffff82ac8d6a
75. #2 [ffffb725c34579d0] _cond_resched at ffffffff82ac8d9d
76. #3 [ffffb725c34579d8] kmem_cache_alloc at ffffffff824a6a07
77. #4 [ffffb725c3457a10] blk_mq_sched_request_inserted.cold.14 at ffffffff8260fd3d
78. #5 [ffffb725c3457a48] dd_insert_requests at ffffffff82625b7e
79. #6 [ffffb725c3457a90] blk_mq_sched_insert_requests at ffffffff8260f845
80. #7 [ffffb725c3457ac8] blk_mq_flush_plug_list at ffffffff8260ac96
81. #8 [ffffb725c3457b48] blk_flush_plug_list at ffffffff825ffb57
82. #9 [ffffb725c3457b98] blk_finish_plug at ffffffff825ffba1
83. .............
Unexpectedly, after dd_insert_requests->spin_lock(&dd->lock) successfully acquired the lock, the call chain blk_mq_sched_request_inserted->kmem_cache_alloc->_cond_resched->preempt_schedule_common->__schedule went to sleep. Other processes then failed to take the lock in dd_insert_requests->spin_lock(&dd->lock)->_raw_spin_lock and spun forever in native_queued_spin_lock_slowpath waiting for it. What I still find curious is that a process which slept in kmem_cache_alloc after taking dd->lock apparently stayed asleep for 22 seconds — that is a remarkably long sleep!
The fix is simple: change kmem_cache_alloc(......,GFP_KERNEL) to kmem_cache_alloc(......,GFP_ATOMIC) so the allocation can never sleep (see the sketch after this list). A few takeaways:
1: bt -a only prints the stack backtraces of processes in D or R state, not S state — and the problematic process may well be in S state.
2: kmem_cache_alloc() may sleep, so inside spin_lock-protected sections you must not call kmem_cache_alloc, get_free_pages and similar (sleeping) allocation functions.
3: dd_insert_requests takes spin_lock(&dd->lock) first and then calls dd_insert_request->blk_mq_sched_request_inserted, and blk_mq_sched_request_inserted slept inside kmem_cache_alloc() — effectively sleeping while holding dd->lock. So it is not enough to know that your own code must not sleep after spin_lock; you also need to know whether your callers have already forbidden sleeping!
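A sketch of the fix described above (only the allocation flags change; the existing goto fail path already handles allocation failure):
//in blk_mq_sched_request_inserted(): dd_insert_requests() already holds dd->lock when it
//calls us, so the slab allocations must not sleep; GFP_ATOMIC never sleeps, it may simply fail
p_process_io_info_tmp = kmem_cache_alloc(rq->rq_disk->process_io.process_io_info_cachep, GFP_ATOMIC);
if(!p_process_io_info_tmp)
    goto fail;
......
p_process_rq_stat_tmp = kmem_cache_alloc(rq->rq_disk->process_io.process_rq_stat_cachep, GFP_ATOMIC);
if(!p_process_rq_stat_tmp)
    goto fail;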
5: The rq_count anomaly
In blk_mq_sched_request_inserted() I increment p_process_io_info_tmp->rq_count every time a process inserts an IO request, and in blk_account_io_done(), when the request completes, I decrement p_process_io_info_tmp->rq_count. rq_count should therefore never drop below 0 — but in testing it actually reached -1!
void blk_mq_sched_request_inserted(struct request *rq)
{
    .............
    if(rq->rq_disk && rq->rq_disk->process_io.enable){
        struct process_rq_stat *p_process_rq_stat_tmp = NULL;
        struct process_io_info *p_process_io_info_tmp = NULL;
        int find = 0;

        spin_lock_irq(&(rq->rq_disk->process_io.process_lock));
        //first walk the process_io_control_head list to see whether the current process already has a process_io_info
        list_for_each_entry(p_process_io_info_tmp, &(rq->rq_disk->process_io.process_io_control_head), process_io_info_list){
            if(p_process_io_info_tmp->pid == current->pid){
                p_process_io_info_tmp->rq_count ++;
                find = 1;
                break;
            }
        }
        spin_unlock_irq(&(rq->rq_disk->process_io.process_lock));

        if(0 == find){
            //not found, so allocate a new process_io_info for the current process
            p_process_io_info_tmp = kmem_cache_alloc(rq->rq_disk->process_io.process_io_info_cachep,GFP_ATOMIC);
            ....................
            spin_lock_irq(&(rq->rq_disk->process_io.process_lock));
            p_process_io_info_tmp->rq_count ++;
            //add the process_io_info to the process_io.process_io_control_head list
            list_add(&p_process_io_info_tmp->process_io_info_list,&(rq->rq_disk->process_io.process_io_control_head));
            spin_unlock_irq(&(rq->rq_disk->process_io.process_lock));
        }
        ....................
    }
}
void blk_account_io_done(struct request *req, u64 now)
{
    ....................
    if(req->rq_disk && req->rq_disk->process_io.enable && req->p_process_rq_stat){
        struct process_rq_stat *p_process_rq_stat_tmp = req->p_process_rq_stat;
        struct process_io_info *p_process_io_info_tmp = req->p_process_rq_stat->p_process_io_info;
        .................
        p_process_io_info_tmp->rq_count --;
        if(p_process_io_info_tmp->rq_count < 0){
            printk("%s error:%d\n",__func__,p_process_io_info_tmp->rq_count);
        }
        .................
    }
}
The reason: a process can be executing blk_mq_sched_request_inserted, incrementing p_process_io_info_tmp->rq_count as it inserts a request into the IO queue, at the very moment one of its requests completes and blk_account_io_done decrements p_process_io_info_tmp->rq_count. The increment and decrement race with each other — no wonder it goes wrong! The fix is simple: take the spin lock before the p_process_io_info_tmp->rq_count -- in blk_account_io_done() (a sketch follows below).
Why did I make this mistake? Because I had not considered that while a process is inserting an IO request in blk_mq_sched_request_inserted, another of its requests can complete and run blk_account_io_done at the same time. I had not fully understood when the code I inserted into the kernel actually runs — I had not understood the context.
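A minimal sketch of the fix. One judgment call in the sketch: since blk_account_io_done() can run in softirq context, it uses the irqsave variant rather than spin_lock_irq(), which would unconditionally re-enable interrupts on unlock; making rq_count an atomic_t would be another option.
//in blk_account_io_done(), replacing the unlocked rq_count decrement
unsigned long flags;

//rq_count is incremented under process_lock in blk_mq_sched_request_inserted(),
//so the decrement must take the same lock to avoid the lost-update race
spin_lock_irqsave(&(req->rq_disk->process_io.process_lock), flags);
p_process_io_info_tmp->rq_count --;
if(p_process_io_info_tmp->rq_count < 0){
    printk("%s error:%d\n",__func__,p_process_io_info_tmp->rq_count);
}
spin_unlock_irqrestore(&(req->rq_disk->process_io.process_lock), flags);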
With the big problems finally resolved, plenty of details still remain to be polished.
6: Summary
This debugging experience left a deep impression. A series of mistakes compounded one another, and it drove home how hard kernel development is and how many traps it hides. The main lessons:
1: I had not fully understood how list_for_each_entry() iterates a kernel list. I vaguely and wrongly believed that when the traversal finishes, pos points at the last list member; in fact it ends up pointing at the container of the list head.
2: In the dd_insert_requests->spin_lock(&dd->lock) path, I had not realized that blk_mq_sched_request_inserted is called from dd_insert_requests only after it has taken spin_lock(&dd->lock), so blk_mq_sched_request_inserted must not sleep at all. Inside blk_mq_sched_request_inserted I had been careful not to call kmem_cache_alloc while holding spin_lock_irq(&(rq->rq_disk->process_io.process_lock)), but I had not expected the caller to be holding spin_lock(&dd->lock) as well, which forbids sleeping for the entire function. I did not know the calling context! And of course, printk should also be avoided while holding a spin lock, because printk may sleep.
3: I had not anticipated that while a process is inserting an IO request via blk_mq_sched_request_inserted, one of its requests can complete and run blk_account_io_done at the same time, so the rq_count increment and decrement race with each other. Again, I had not fully understood when the inserted code runs — I did not understand the context.
A few more reflections:
1: About the break in the list_for_each_entry in blk_mq_sched_request_inserted: because I remembered there being a break, I subconsciously kept assuming it was there every time I re-read the code. When you hit a bizarre problem, learn to question everything and suspect every single line of code.
2: When facing a hard problem, don't rush to blame the hardware, random memory corruption, native kernel bugs, or the compiler. Suspect your own code first, and think about the context: softirq versus hardirq, multiple threads, high concurrency. While I had no leads on this problem I even tried moving struct process_io_control process_io from the last member of struct gendisk to the first, and moving the definitions of struct process_io_control, struct process_io_info and struct process_rq_stat in include/linux/blk_types.h from the top of the file to the middle. Why? Because I suspected a compiler problem was behind the corrupted spin lock and the hang.