前言
本篇文章主要是为了对ltp内MM模块的测试用例之一的max_map_count进行源码分析,作为对内核VMA(虚拟内存区域)的专项测试,其蕴含的技术知识,还是很值得学习一下的。
MM是内核顶级的子系统,也是内核最复杂的模块之一。不过作为一名内核开发者来说,虽然内核学习之路布满荆棘,但了解其背后的设计原理对自身也是裨益极大的,我们有理由相信,道阻且长,行则将至!
言归正传,本次博客的重点的linux kernel参数max_map_count,官方原文为:
“This file contains the maximum number of memory map areas a process may have. Memory map areas are used as a side-effect of calling malloc, directly by mmap and mprotect, and also when loading shared libraries. While most applications need less than a thousand maps, certain programs, particularly malloc debuggers, may consume lots of them, e.g., up to one or two maps per allocation. The default value is 65536.”
对其进行翻译后,可知,参数max_map_count限制了一个进程可以拥有的VMA(虚拟内存区域)的数量。虚拟内存区域是一个连续的虚拟地址空间区域。在进程的生命周期中,每当程序尝试在内存中映射文件,链接到共享内存段,或者分配堆空间的时候,这些区域将被创建,这个参数默认值为65536。
值得注意的是,调优这个值将限制进程可拥有VMA的数量,这有可能导致应用程序跑不起来。
1.源码分析
1.1函数调用关系图
1.1.1单文件调用图
该图只描述max_map_count.c内的函数调用关系,比较明了,但是深入度不够。
1.1.2多文件调用图
该图描述了max_map_count.c与mem.c两个文件之间的函数关系调用图,深入度足够但是不够清晰,可以结合单文件图了解函数调用关系。
1.2源码分析
因为max_map_count的主要测试逻辑都分布在max_map_count.c内,我们就不拆分函数块来看了,直接看源码!
/*
* Copyright (C) 2012-2017 Red Hat, Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
* the GNU General Public License for more details.
*
* Description:
*
* The program is designed to test max_map_count tunable file
*
* The kernel Documentation say that:
* /proc/sys/vm/max_map_count contains the maximum number of memory map
* areas a process may have. Memory map areas are used as a side-effect
* of calling malloc, directly by mmap and mprotect, and also when
* loading shared libraries.
*
* Each process has his own maps file: /proc/[pid]/maps, and each line
* indicates a map entry, so it can calculate the amount of maps by reading
* the file lines' number to check the tunable performance.
*
* The program tries to invoke mmap() endlessly until it triggers MAP_FAILED,
* then reads the process's maps file /proc/[pid]/maps, save the line number to
* map_count variable, and compare it with /proc/sys/vm/max_map_count,
* map_count should be greater than max_map_count by 1;
*
* Note: On some architectures there is a special vma VSYSCALL, which
* is allocated without incrementing mm->map_count variable. On these
* architectures each /proc/<pid>/maps has at the end:
* ...
* ...
* ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
*
* so we ignore this line during /proc/[pid]/maps reading.
*/
#define _GNU_SOURCE
#include <sys/wait.h>
#include <errno.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/utsname.h>
#include "mem.h"
/* Smallest max_map_count value the test loop starts from */
#define MAP_COUNT_DEFAULT 1024
/* Upper bound for tested values, kept low to avoid triggering the OOM killer */
#define MAX_MAP_COUNT 65536L
/* Sysctl values saved in setup() and restored in cleanup();
 * -1 means "not read yet, nothing to restore". */
static long old_max_map_count = -1;
static long old_overcommit = -1;
/*
 * setup() - save the current VM sysctl values and pin the overcommit policy.
 */
static void setup(void)
{
	/* Bail out early if this kernel does not expose the tunable at all */
	SAFE_ACCESS(PATH_SYSVM "max_map_count", F_OK);
	/*
	 * Save /proc/sys/vm/max_map_count, which caps the number of VMAs
	 * (virtual memory areas) a single process may own.
	 */
	old_max_map_count = get_sys_tune("max_map_count");
	/*
	 * Save /proc/sys/vm/overcommit_memory, which selects the kernel's
	 * memory overcommit policy (0 is the common default, e.g. on Ubuntu):
	 *   0 - heuristic: the kernel checks whether enough memory appears
	 *       to be available; if not, the allocation fails with an error;
	 *   1 - always overcommit: grant any allocation regardless of the
	 *       current memory state;
	 *   2 - strict: allow commitments only up to swap plus a configured
	 *       fraction of physical memory.
	 */
	old_overcommit = get_sys_tune("overcommit_memory");
	/* Switch policy to heuristic (0); the final '1' asks the helper to
	 * verify the value was actually written. */
	set_sys_tune("overcommit_memory", 0, 1);
}
/*
 * cleanup() - restore the sysctl values saved in setup().
 * The -1 guards keep this safe even if setup() failed before reading them.
 */
static void cleanup(void)
{
	if (old_overcommit != -1)
		set_sys_tune("overcommit_memory", old_overcommit, 0);
	if (old_max_map_count != -1)
		set_sys_tune("max_map_count", old_max_map_count, 0);
}
/* This is a filter to exclude map entries which aren't accounted
* for in the vm_area_struct's map_count.
*/
/*
 * filter_map() - decide whether a /proc/<pid>/maps line must be skipped.
 *
 * Returns true for map entries the kernel creates without incrementing
 * mm->map_count (arch-specific vsyscall/vdso pages), false for entries
 * that do count toward max_map_count.
 */
static bool filter_map(const char *line)
{
	char label[BUFSIZ];

	/* Pull out the trailing pathname/label field of the maps line;
	 * lines without one (anonymous mappings) are always counted. */
	if (sscanf(line, "%*p-%*p %*4s %*p %*2d:%*2d %*d %s", label) != 1)
		return false;

	if (tst_arch.type == TST_X86 || tst_arch.type == TST_X86_64) {
		/* On x86, there's an old compat vsyscall page */
		return strcmp(label, "[vsyscall]") == 0;
	}

	if (tst_arch.type == TST_IA64) {
		/* On ia64, the vdso is not a proper mapping */
		return strcmp(label, "[vdso]") == 0;
	}

	if (tst_arch.type == TST_ARM) {
		/* Skip nothing when running on an aarch64 kernel */
		if (tst_kernel_bits() == 64)
			return false;
		/* Older arm kernels didn't label their vdso maps */
		return strncmp(line, "ffff0000-ffff1000", 17) == 0;
	}

	return false;
}
/*
 * count_maps() - count the map entries of process @pid.
 *
 * Reads /proc/<pid>/maps line by line, skipping the entries
 * (vsyscall/vdso) that the kernel does not account in mm->map_count.
 * Returns the number of accounted entries; aborts the test (TBROK)
 * if the maps file cannot be opened.
 */
static long count_maps(pid_t pid)
{
	FILE *fp;
	/* getline() requires *n to be defined; 0 + NULL means "allocate" */
	size_t len = 0;
	char *line = NULL;
	char buf[BUFSIZ];
	long map_count = 0;

	snprintf(buf, BUFSIZ, "/proc/%d/maps", pid);
	fp = fopen(buf, "r");
	if (fp == NULL)
		tst_brk(TBROK | TERRNO, "fopen %s", buf);
	while (getline(&line, &len, fp) != -1) {
		/* exclude vdso and vsyscall */
		if (filter_map(line))
			continue;
		map_count++;
	}
	/* getline() allocated the buffer; don't leak it on every call */
	free(line);
	fclose(fp);
	return map_count;
}
/*
 * max_map_count_test() - main test loop.
 *
 * For a doubling sequence of max_map_count values (1024, 2048, ... up to
 * a safe upper bound), set the tunable, let a forked child mmap() one
 * page at a time until mmap() fails, then count the child's
 * /proc/<pid>/maps entries and check they match the limit (+1, see the
 * comment at the comparison below).
 */
static void max_map_count_test(void)
{
	int status;
	pid_t pid;
	long max_maps;
	long map_count;
	long max_iters;
	long memfree;

	/*
	 * XXX Due to a possible kernel bug, oom-killer can be easily
	 * triggered when doing small piece mmaps in huge amount even if
	 * enough free memory available. Also it has been observed that
	 * oom-killer often kill wrong victims in this situation, we
	 * decided to do following steps to make sure no oom happen:
	 * 1) use a safe maximum max_map_count value as upper-bound,
	 * we set it 65536 in this case, i.e., we don't test too big
	 * value;
	 * 2) make sure total mapping isn't larger than
	 * CommitLimit - Committed_AS
	 */
	/*
	 * Both values come from /proc/meminfo (reported in kB):
	 *   CommitLimit  - total memory the kernel is willing to commit.
	 *                  With overcommit_memory=2 and overcommit_kbytes=0
	 *                  it is SwapTotal + MemTotal * overcommit_ratio / 100;
	 *                  a non-zero overcommit_kbytes overrides the ratio.
	 *   Committed_AS - memory already committed to running processes.
	 * Their difference is the headroom we may still safely map.
	 */
	memfree = SAFE_READ_MEMINFO("CommitLimit:") - SAFE_READ_MEMINFO("Committed_AS:");
	/* 64 used as a bias to make sure no overflow happen */
	/* memfree is in kB; memfree * 1024 / pagesize is the number of free
	 * pages, minus the 64-page safety bias. */
	max_iters = memfree / sysconf(_SC_PAGESIZE) * 1024 - 64;
	if (max_iters > MAX_MAP_COUNT)
		max_iters = MAX_MAP_COUNT;

	max_maps = MAP_COUNT_DEFAULT;
	/* The smallest tested limit is 1024 maps; bail out if even that
	 * many pages cannot be mapped safely. */
	if (max_iters < max_maps)
		tst_brk(TCONF, "test requires more free memory");

	while (max_maps <= max_iters) {
		/* Cap the number of VMAs a process may own at max_maps
		 * (the '1' asks for write verification). */
		set_sys_tune("max_map_count", max_maps, 1);

		switch (pid = SAFE_FORK()) {
		/* child process */
		case 0:
			/* Each process has its own virtual address space.
			 * Map anonymous one-page regions (mmap rounds the
			 * 1-byte length up to a page) until the kernel
			 * refuses with MAP_FAILED, i.e. the limit is hit. */
			while (mmap(NULL, 1, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0) != MAP_FAILED)
				;
			/* Stop ourselves so the parent can inspect our maps */
			if (raise(SIGSTOP) != 0)
				tst_brk(TBROK | TERRNO, "raise");
			exit(0);
		default:
			break;
		}
		/* wait child done mmap and stop */
		SAFE_WAITPID(pid, &status, WUNTRACED);
		if (!WIFSTOPPED(status))
			tst_brk(TBROK, "child did not stopped");

		/* Count the VMA entries in /proc/<pid>/maps */
		map_count = count_maps(pid);
		/* Note max_maps will be exceeded by one for
		 * the sysctl setting of max_map_count. This
		 * is the mm failure point at the time of
		 * writing this COMMENT!
		 */
		if (map_count == (max_maps + 1))
			tst_res(TPASS, "%ld map entries in total " "as expected.", max_maps);
		else
			tst_res(TFAIL, "%ld map entries in total, but " "expected %ld entries", map_count, max_maps);

		/* make child continue to exit */
		SAFE_KILL(pid, SIGCONT);
		SAFE_WAITPID(pid, &status, 0);

		/* Double max_maps each round until it exceeds max_iters */
		max_maps = max_maps << 1;
	}
}
/*
 * LTP test descriptor: sysctl writes need root, and the test body
 * forks a child per iteration.
 */
static struct tst_test test = {
	.test_all = max_map_count_test,
	.setup = setup,
	.cleanup = cleanup,
	.forks_child = 1,
	.needs_root = 1,
};
1.3综述
max_map_count.c整个文件都在测试/proc/sys/vm/max_map_count这一内核参数是否正常起作用,限制进程拥有的VMA区域数量,具体的测试方法跟着我的注释走就可以。
博文还牵扯了两个概念:vsyscall和vdso,我们来看一个例子:
@liuding-HP-288-Pro-G2-MT:~$ sudo cat /proc/2055402/maps
55a95dac0000-55a95dac1000 r--p 00000000 08:02 56626900 /home/liudinghu/chuyongjia/fun
55a95dac1000-55a95dac2000 r-xp 00001000 08:02 56626900 /home/liudinghu/chuyongjia/fun
55a95dac2000-55a95dac3000 r--p 00002000 08:02 56626900 /home/liudinghu/chuyongjia/fun
55a95dac3000-55a95dac4000 r--p 00002000 08:02 56626900 /home/liudinghu/chuyongjia/fun
55a95dac4000-55a95dac5000 rw-p 00003000 08:02 56626900 /home/liudinghu/chuyongjia/fun
55a95f6d5000-55a95f6f6000 rw-p 00000000 00:00 0 [heap]
7f587e984000-7f58be985000 rw-p 00000000 00:00 0
7f58be985000-7f58be9a7000 r--p 00000000 08:02 43780246 /usr/lib/x86_64-linux-gnu/libc-2.31.so
7f58be9a7000-7f58beb1f000 r-xp 00022000 08:02 43780246 /usr/lib/x86_64-linux-gnu/libc-2.31.so
7f58beb1f000-7f58beb6d000 r--p 0019a000 08:02 43780246 /usr/lib/x86_64-linux-gnu/libc-2.31.so
7f58beb6d000-7f58beb71000 r--p 001e7000 08:02 43780246 /usr/lib/x86_64-linux-gnu/libc-2.31.so
7f58beb71000-7f58beb73000 rw-p 001eb000 08:02 43780246 /usr/lib/x86_64-linux-gnu/libc-2.31.so
7f58beb73000-7f58beb79000 rw-p 00000000 00:00 0
7f58beb8d000-7f58beb8e000 r--p 00000000 08:02 43780238 /usr/lib/x86_64-linux-gnu/ld-2.31.so
7f58beb8e000-7f58bebb1000 r-xp 00001000 08:02 43780238 /usr/lib/x86_64-linux-gnu/ld-2.31.so
7f58bebb1000-7f58bebb9000 r--p 00024000 08:02 43780238 /usr/lib/x86_64-linux-gnu/ld-2.31.so
7f58bebba000-7f58bebbb000 r--p 0002c000 08:02 43780238 /usr/lib/x86_64-linux-gnu/ld-2.31.so
7f58bebbb000-7f58bebbc000 rw-p 0002d000 08:02 43780238 /usr/lib/x86_64-linux-gnu/ld-2.31.so
7f58bebbc000-7f58bebbd000 rw-p 00000000 00:00 0
7ffd57045000-7ffd57066000 rw-p 00000000 00:00 0 [stack]
7ffd571da000-7ffd571de000 r--p 00000000 00:00 0 [vvar]
7ffd571de000-7ffd571e0000 r-xp 00000000 00:00 0 [vdso]
ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0 [vsyscall]
这个表就是2055402号进程的VMA区域信息,最底部就是vdso和vsyscall,它俩都是通过直接映射物理内存完成用户态到内核态访问的,不过区别是一个为动态的,一个是静态的。关于这两个,大家有兴趣的,可以参考如下的帖子,讲解的非常详细:
Linux内核深入理解系统调用(2):vsyscall 和 vDSO 以及程序是如何运行的(execve)_rtoax的博客-CSDN博客
尾言
温故知新,岁岁常新!:) 因博主水平能力有限,如果有大佬在阅读过程中发现其中的谬误,希望可以不吝赐教,3Q。