注:本文分析基于3.10.0-693.el7内核版本,即CentOS 7.4

1、关于drop_caches

通常在内存不足时,我们习惯通过echo 3 > /proc/sys/vm/drop_caches 的方式手动清理系统缓存,

[root@localhost  ~]# free -m
              total        used        free      shared  buff/cache   available
Mem:           7822        3436        2068          40        2317        3997
Swap:             0           0           0
[root@localhost  ~]# echo 3 > /proc/sys/vm/drop_caches 
[root@localhost ~]# free -m
              total        used        free      shared  buff/cache   available
Mem:           7822        3433        4036          40         352        4037
Swap:             0           0           0

对于数字3的具体含义,可以参考内核文档中的说明:

To free pagecache: 
	echo 1 > /proc/sys/vm/drop_caches 
To free reclaimable slab objects (includes dentries and inodes): 
	echo 2 > /proc/sys/vm/drop_caches 
To free slab objects and pagecache: 
	echo 3 > /proc/sys/vm/drop_caches

2、释放pagecache

之前我们了解到,当内存低于某个阈值时,会触发脏页回写,提交回写work到对应BDI设备上,由BDI writeback进程回写脏页释放内存。这和drop_caches中的echo 1类似,都是释放脏页,因此其最后路径是一致的。

/*
 * sysctl handler for /proc/sys/vm/drop_caches.
 *
 * The request is first passed to proc_dointvec_minmax(); if that fails its
 * error code is returned unchanged.  On a successful write the bits of
 * sysctl_drop_caches select the work to do:
 *   bit 0 (value 1) - run drop_pagecache_sb on every superblock
 *   bit 1 (value 2) - run drop_slab()
 *   bit 2 (value 4) - suppress the informational log line from now on
 *
 * Returns 0 on success.
 */
int drop_caches_sysctl_handler(ctl_table *table, int write,
	void __user *buffer, size_t *length, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (ret)
		return ret;
	if (write) {
		/* Static, so once set it stays set across later writes. */
		static int stfu;

		/* echo 1 > drop_caches: drop page cache, superblock by superblock */
		if (sysctl_drop_caches & 1) {
			iterate_supers(drop_pagecache_sb, NULL);
			count_vm_event(DROP_PAGECACHE);
		}
		/* echo 2 > drop_caches: shrink the slab caches */
		if (sysctl_drop_caches & 2) {
			drop_slab();
			count_vm_event(DROP_SLAB);
		}
		/* Log who triggered the drop, unless logging has been silenced. */
		if (!stfu) {
			pr_info("%s (%d): drop_caches: %d\n",
				current->comm, task_pid_nr(current),
				sysctl_drop_caches);
		}
		/*
		 * Writing a value with bit 2 (4) set latches stfu.  It only
		 * gates the pr_info() above; it does not affect reclaim.
		 */
		stfu |= sysctl_drop_caches & 4;
	}
	return 0;
}

可见,echo 1时,会调用drop_pagecache_sb去释放pagecache,我们继续往下查,

drop_pagecache_sb ->
	iput ->
		iput_final->
			write_inode_now -> #提交writeback_control,立即回写
				writeback_single_inode ->
					__writeback_single_inode ->
						do_writepages #调用对应文件系统的writepage写回磁盘

在BDI回写里,一开始提交的是wb_writeback_work,等到实际要执行回写操作时,都会转换为writeback_control,再去执行回写。

因此,echo 1的操作就是,遍历每个超级块,调用drop_pagecache_sb,drop_pagecache_sb中会遍历该超级块所有的inode,对其关联的pagecache进行回写。与BDI不同的是,该操作是立马执行,不需要等待周期执行或者inode过期。

3、释放slab cache

而对于echo 2的情况,就比较复杂一点,

/*
 * Repeatedly call shrink_slab() with a GFP_KERNEL shrink_control until a
 * pass reports 10 or fewer objects.
 */
static void drop_slab(void)
{
	int nr_objects;
	struct shrink_control shrink = {
		.gfp_mask = GFP_KERNEL,
	};
    // Run another pass whenever the previous pass returned more than 10.
    // NOTE(review): this bounds what the final pass reclaimed; it does not by
    // itself prove that 10 or fewer reclaimable objects remain system-wide.
	do {
		nr_objects = shrink_slab(&shrink, 1000, 1000);
	} while (nr_objects > 10);
}
unsigned long shrink_slab(struct shrink_control shrink,
unsigned long nr_pages_scanned,
unsigned long lru_pages)
{
struct shrinker shrinker;
unsigned long ret = 0;
...
//遍历系统中所有的shrinker,回收各个slab管理区的空闲缓存
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
long total_scan;
long max_pass;
int shrink_ret = 0;
long nr;
long new_nr;
//获取批处理数量,默认每次回收128,对于超级块而言是1024
long batch_size = shrinker->batch ? shrinker->batch
: SHRINK_BATCH;
//获取该slab管理区可回收的缓存数量

max_pass = do_shrinker_shrink(shrinker, shrink, 0);
if (max_pass <= 0)
continue;
	nr <span >=</span> <span >atomic_long_xchg</span><span >(</span><span >&</span>shrinker<span >-></span>nr_in_batch<span >,</span> <span >0</span><span >)</span><span >;</span>

	total_scan <span >=</span> nr<span >;</span>
    <span >//计算该slab管理区此次缓存回收额度,一堆操作</span>
    <span >//针对手动释放缓存的场景,基本上是两倍的max_pass,也就是尽可能去释放</span>
    <span >//对于kswap或其他路径上,不会超过一倍的max_pass</span>
	delta <span >=</span> <span >(</span><span >4</span> <span >*</span> nr_pages_scanned<span >)</span> <span >/</span> shrinker<span >-></span>seeks<span >;</span>
	delta <span >*</span><span >=</span> max_pass<span >;</span>
	<span >do_div</span><span >(</span>delta<span >,</span> lru_pages <span >+</span> <span >1</span><span >)</span><span >;</span>
	total_scan <span >+</span><span >=</span> delta<span >;</span>
	<span >if</span> <span >(</span>total_scan <span ><</span> <span >0</span><span >)</span> <span >{<!-- --></span>
		<span >printk</span><span >(</span>KERN_ERR <span class="token string">"shrink_slab: %pF negative objects to "</span>
		       <span class="token string">"delete nr=%ld\n"</span><span >,</span>
		       shrinker<span >-></span>shrink<span >,</span> total_scan<span >)</span><span >;</span>
		total_scan <span >=</span> max_pass<span >;</span>
	<span >}</span>
    <span >//如果delta偏小,意味着系统中inactive的缓存偏少,我们回收的额度也不能设置太大</span>
	<span >if</span> <span >(</span>delta <span ><</span> max_pass <span >/</span> <span >4</span><span >)</span>
		total_scan <span >=</span> <span >min</span><span >(</span>total_scan<span >,</span> max_pass <span >/</span> <span >2</span><span >)</span><span >;</span>

    <span >//控制回收总额上限,避免死循环</span>
	<span >if</span> <span >(</span>total_scan <span >></span> max_pass <span >*</span> <span >2</span><span >)</span>
		total_scan <span >=</span> max_pass <span >*</span> <span >2</span><span >;</span>

	<span >trace_mm_shrink_slab_start</span><span >(</span>shrinker<span >,</span> shrink<span >,</span> nr<span >,</span>
				nr_pages_scanned<span >,</span> lru_pages<span >,</span>
				max_pass<span >,</span> delta<span >,</span> total_scan<span >)</span><span >;</span>
    <span >//循环回收缓存</span>
	<span >while</span> <span >(</span>total_scan <span >>=</span> batch_size<span >)</span> <span >{<!-- --></span>
		<span >int</span> nr_before<span >;</span>
        <span >//记录处理前缓存数量</span>
		nr_before <span >=</span> <span >do_shrinker_shrink</span><span >(</span>shrinker<span >,</span> shrink<span >,</span> <span >0</span><span >)</span><span >;</span>
        <span >//回收后缓存数量</span>
		shrink_ret <span >=</span> <span >do_shrinker_shrink</span><span >(</span>shrinker<span >,</span> shrink<span >,</span>
						batch_size<span >)</span><span >;</span>
		<span >if</span> <span >(</span>shrink_ret <span >==</span> <span >-</span><span >1</span><span >)</span>
			<span >break</span><span >;</span>
        <span >//统计此次回收的缓存数量</span>
		<span >if</span> <span >(</span>shrink_ret <span ><</span> nr_before<span >)</span>
			ret <span >+</span><span >=</span> nr_before <span >-</span> shrink_ret<span >;</span>
		<span >count_vm_events</span><span >(</span>SLABS_SCANNED<span >,</span> batch_size<span >)</span><span >;</span>
        <span >//减少扫描总额</span>
		total_scan <span >-</span><span >=</span> batch_size<span >;</span>

		<span >cond_resched</span><span >(</span><span >)</span><span >;</span>
	<span >}</span>
    <span >//如果剩下的额度不够一个batch_size,留着下次使用,记录在nr_in_batch</span>
	<span >if</span> <span >(</span>total_scan <span >></span> <span >0</span><span >)</span>
		new_nr <span >=</span> <span >atomic_long_add_return</span><span >(</span>total_scan<span >,</span>
				<span >&</span>shrinker<span >-></span>nr_in_batch<span >)</span><span >;</span>
	<span >else</span>
		new_nr <span >=</span> <span >atomic_long_read</span><span >(</span><span >&</span>shrinker<span >-></span>nr_in_batch<span >)</span><span >;</span>

	<span >trace_mm_shrink_slab_end</span><span >(</span>shrinker<span >,</span> shrink_ret<span >,</span> nr<span >,</span> new_nr<span >)</span><span >;</span>
<span >}</span>
<span >up_read</span><span >(</span><span >&</span>shrinker_rwsem<span >)</span><span >;</span>

out:
cond_resched();
return ret;
}

	nr <span >=</span> <span >atomic_long_xchg</span><span >(</span><span >&</span>shrinker<span >-></span>nr_in_batch<span >,</span> <span >0</span><span >)</span><span >;</span>

	total_scan <span >=</span> nr<span >;</span>
    <span >//计算该slab管理区此次缓存回收额度,一堆操作</span>
    <span >//针对手动释放缓存的场景,基本上是两倍的max_pass,也就是尽可能去释放</span>
    <span >//对于kswap或其他路径上,不会超过一倍的max_pass</span>
	delta <span >=</span> <span >(</span><span >4</span> <span >*</span> nr_pages_scanned<span >)</span> <span >/</span> shrinker<span >-></span>seeks<span >;</span>
	delta <span >*</span><span >=</span> max_pass<span >;</span>
	<span >do_div</span><span >(</span>delta<span >,</span> lru_pages <span >+</span> <span >1</span><span >)</span><span >;</span>
	total_scan <span >+</span><span >=</span> delta<span >;</span>
	<span >if</span> <span >(</span>total_scan <span ><</span> <span >0</span><span >)</span> <span >{<!-- --></span>
		<span >printk</span><span >(</span>KERN_ERR <span class="token string">"shrink_slab: %pF negative objects to "</span>
		       <span class="token string">"delete nr=%ld\n"</span><span >,</span>
		       shrinker<span >-></span>shrink<span >,</span> total_scan<span >)</span><span >;</span>
		total_scan <span >=</span> max_pass<span >;</span>
	<span >}</span>
    <span >//如果delta偏小,意味着系统中inactive的缓存偏少,我们回收的额度也不能设置太大</span>
	<span >if</span> <span >(</span>delta <span ><</span> max_pass <span >/</span> <span >4</span><span >)</span>
		total_scan <span >=</span> <span >min</span><span >(</span>total_scan<span >,</span> max_pass <span >/</span> <span >2</span><span >)</span><span >;</span>

    <span >//控制回收总额上限,避免死循环</span>
	<span >if</span> <span >(</span>total_scan <span >></span> max_pass <span >*</span> <span >2</span><span >)</span>
		total_scan <span >=</span> max_pass <span >*</span> <span >2</span><span >;</span>

	<span >trace_mm_shrink_slab_start</span><span >(</span>shrinker<span >,</span> shrink<span >,</span> nr<span >,</span>
				nr_pages_scanned<span >,</span> lru_pages<span >,</span>
				max_pass<span >,</span> delta<span >,</span> total_scan<span >)</span><span >;</span>
    <span >//循环回收缓存</span>
	<span >while</span> <span >(</span>total_scan <span >>=</span> batch_size<span >)</span> <span >{<!-- --></span>
		<span >int</span> nr_before<span >;</span>
        <span >//记录处理前缓存数量</span>
		nr_before <span >=</span> <span >do_shrinker_shrink</span><span >(</span>shrinker<span >,</span> shrink<span >,</span> <span >0</span><span >)</span><span >;</span>
        <span >//回收后缓存数量</span>
		shrink_ret <span >=</span> <span >do_shrinker_shrink</span><span >(</span>shrinker<span >,</span> shrink<span >,</span>
						batch_size<span >)</span><span >;</span>
		<span >if</span> <span >(</span>shrink_ret <span >==</span> <span >-</span><span >1</span><span >)</span>
			<span >break</span><span >;</span>
        <span >//统计此次回收的缓存数量</span>
		<span >if</span> <span >(</span>shrink_ret <span ><</span> nr_before<span >)</span>
			ret <span >+</span><span >=</span> nr_before <span >-</span> shrink_ret<span >;</span>
		<span >count_vm_events</span><span >(</span>SLABS_SCANNED<span >,</span> batch_size<span >)</span><span >;</span>
        <span >//减少扫描总额</span>
		total_scan <span >-</span><span >=</span> batch_size<span >;</span>

		<span >cond_resched</span><span >(</span><span >)</span><span >;</span>
	<span >}</span>
    <span >//如果剩下的额度不够一个batch_size,留着下次使用,记录在nr_in_batch</span>
	<span >if</span> <span >(</span>total_scan <span >></span> <span >0</span><span >)</span>
		new_nr <span >=</span> <span >atomic_long_add_return</span><span >(</span>total_scan<span >,</span>
				<span >&</span>shrinker<span >-></span>nr_in_batch<span >)</span><span >;</span>
	<span >else</span>
		new_nr <span >=</span> <span >atomic_long_read</span><span >(</span><span >&</span>shrinker<span >-></span>nr_in_batch<span >)</span><span >;</span>

	<span >trace_mm_shrink_slab_end</span><span >(</span>shrinker<span >,</span> shrink_ret<span >,</span> nr<span >,</span> new_nr<span >)</span><span >;</span>
<span >}</span>
<span >up_read</span><span >(</span><span >&</span>shrinker_rwsem<span >)</span><span >;</span>

空闲slab缓存的计算和回收都在do_shrinker_shrink中完成,它其实调用的是一个函数指针,不同slab管理区有自己定义的shrink函数。第三个入参nr_to_scan为0时,是计算空闲slab缓存;不为0时,表示扫描和回收缓存的数量。

/*
 * Invoke a shrinker's ->shrink callback with the requested scan count.
 *
 * @shrinker:   shrinker whose callback is invoked
 * @sc:         shrink_control handed to the callback; its nr_to_scan field
 *              is overwritten with @nr_to_scan
 * @nr_to_scan: value stored in sc->nr_to_scan before the call
 *
 * Returns the callback's result, except that any value below -1 is
 * reported as INT_MAX.
 */
static inline int do_shrinker_shrink(struct shrinker *shrinker,
				     struct shrink_control *sc,
				     unsigned long nr_to_scan)
{
	int objects;

	sc->nr_to_scan = nr_to_scan;
	objects = (*shrinker->shrink)(shrinker, sc);
	/* -1 is passed through unchanged; anything lower becomes INT_MAX. */
	if (objects < -1)
		return INT_MAX;

	return objects;
}

总的来说,drop_slab就是调用每个slab管理区定义的shrink函数,先计算出可回收的slab缓存数量,然后确定扫描数量,最后调用shrink函数执行缓存扫描和回收。