vmemmap是内核中存放struct page数组的一段虚拟地址区域,用于sparse(SPARSEMEM_VMEMMAP)内存模型。内核申请page时得到的struct page地址都落在这段区域内。

vmemm Vmemmap_vmemm

start_kernel:自上而下

setup_arch
	arm64_memblock_init
	paging_init
		map_kernel
		map_mem   //线性映射物理内存
	bootmem_init
		sparse_init
			//内存模型,sparse等。建立所有page并映射 vmemmap_start。供后续buddy使用。
		zone_sizes_init
			free_area_init   //初始化zone下面的free_area[MAX_ORDER]
				free_area_init_node
		
build_all_zonelists   //建立各node的zonelist

mm_init
	mem_init
		memblock_free_all //memblock将权利交给buddy
		
build_all_zonelists   //建立buddy分配器使用的zonelist

SPARSEMEM原理:

  • section的概念:

SPARSEMEM内存模型引入了section的概念,可以简单将它理解为struct page的集合(数组)。内核使用struct mem_section去描述section,定义如下:

/* Per-section descriptor used by the SPARSEMEM memory model. */
struct mem_section {
        /*
         * Encoded location of this section's struct page array, stored
         * together with flag bits (sparse_init_one_section() ORs in
         * SECTION_HAS_MEM_MAP and the caller's flags).
         */
        unsigned long section_mem_map;
        /* See declaration of similar field in struct zone */
        unsigned long *pageblock_flags;
};

其中的section_mem_map成员存放的是(编码后的)struct page数组的地址,每个section可容纳PAGES_PER_SECTION(即 1 << PFN_SECTION_SHIFT)个struct page,arm64地址位宽为48bit时定义了每个section可囊括的地址范围是1GB。

  • 全局变量**mem_section

内核中用了一个二级指针struct mem_section **mem_section去管理section,我们可以简单理解为一个动态的二维数组。所谓二维即内核又将SECTIONS_PER_ROOT个section划分为一个ROOT,ROOT的个数不是固定的,根据系统实际的物理地址大小来分配。

  • 物理页帧号PFN

SPARSEMEM将PFN拆分成了三个level,每个level分别对应:ROOT编号、ROOT内的section偏移、section内的page偏移。(可以类比多级页表来理解)

  • vmemmap区域

vmemmap区域是一块起始地址是VMEMMAP_START,范围是2TB的虚拟地址区域,位于kernel space。它是以section为单位来存放struct page结构的虚拟地址空间,然后通过页表映射到实际的物理内存。

vmemm Vmemmap_arm开发_02

  • PFN和struct page的转换:

SPARSEMEM中__pfn_to_page和__page_to_pfn的实现如下:

/* pfn -> struct page: index into the struct page array based at vmemmap. */
#define __pfn_to_page(pfn)      (vmemmap + (pfn))
/* struct page -> pfn: element distance of the page from the vmemmap base. */
#define __page_to_pfn(page)     (unsigned long)((page) - vmemmap)      
/*
 * Base used by the two conversions above: VMEMMAP_START moved back by
 * (memstart_addr >> PAGE_SHIFT) struct page entries, so that the pfn
 * (memstart_addr >> PAGE_SHIFT) resolves exactly to VMEMMAP_START.
 */
#define vmemmap        ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT))

其中vmemmap指针等于VMEMMAP_START向低地址偏移 (memstart_addr >> PAGE_SHIFT) 个struct page后的地址,这样物理内存起始页帧对应的 vmemmap + pfn 恰好落在VMEMMAP_START处;memstart_addr则是物理内存的起始地址(即PHYS_OFFSET)。

arm64:setup_arch(arch/arm64/kernel/setup.c)

                ->bootmem_init->sparse_init

/*
 * Boot-time memory setup step called from setup_arch(): takes the DRAM pfn
 * bounds from memblock, initialises NUMA, registers present sections, runs
 * sparse_init() and finally sizes the zones.
 */
void __init bootmem_init(void)
{
        unsigned long min, max;

        /* pfn bounds of DRAM as reported by memblock */
        min = PFN_UP(memblock_start_of_DRAM());
        max = PFN_DOWN(memblock_end_of_DRAM());

        /* early_memtest() takes physical addresses, hence the shift back */
        early_memtest(min << PAGE_SHIFT, max << PAGE_SHIFT);

        max_pfn = max_low_pfn = max;

        arm64_numa_init();
        /*
         * Sparsemem tries to allocate bootmem in memory_present(), so must be
         * done after the fixed reservations.
         */
        arm64_memory_present();

        /* must follow arm64_memory_present(): it walks the present sections */
        sparse_init();
        zone_sizes_init(min, max);

        memblock_dump_all();
}


/*
 * Instrumented copy of sparse_init(): walks the present sections in order,
 * groups consecutive sections that belong to the same node and calls
 * sparse_init_nid() once per group, printing a trace line for each call.
 */
void __init sparse_init(void)
{
        unsigned long pnum_begin = first_present_section_nr();
        int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
        /*
         * pnum_end is only assigned by for_each_present_section_nr() below.
         * Start it at 0 so the first trace line does not read an
         * uninitialised variable; the value printed there is a placeholder.
         */
        unsigned long pnum_end = 0, map_count = 1;

        /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
        set_pageblock_order();
        /* unsigned long arguments are printed with %lu, not %llu / %d */
        printk("===sparse_init nid_begin %d pnum_begin %lu  pnum_end %lu \n",nid_begin,pnum_begin,pnum_end);
        for_each_present_section_nr(pnum_begin + 1, pnum_end) {
                int nid = sparse_early_nid(__nr_to_section(pnum_end));

                /* still inside the same node: just count the section */
                if (nid == nid_begin) {
                        map_count++;
                        continue;
                }
                /* Init node with sections in range [pnum_begin, pnum_end) */
                printk("===sparse_init::sparse_init_nid 0 nid_begin %d pnum_begin %lu pnum_end %lu map_count %lu\n",nid_begin, pnum_begin, pnum_end, map_count);
                sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
                /* start a new group at the first section of the next node */
                nid_begin = nid;
                pnum_begin = pnum_end;
                map_count = 1;
        }
        /* cover the last node */
        printk("===sparse_init::sparse_init_nid 1  nid_begin %d pnum_begin %lu pnum_end %lu map_count %lu\n",nid_begin, pnum_begin, pnum_end, map_count);
        sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
        vmemmap_populate_print_last();
}

sparse_init 参考:https://zhuanlan.zhihu.com/p/555478708 

先求出物理地址对应的pfn,再根据vmemmap即可计算出其struct page所在的虚拟地址(vmemmap + pfn)。

sparse_buffer_init 通过memblock申请实际物理内存。然后通过vmemmap_pxx_populate  映射页表,完成struct page 虚拟地址 到物理内存的映射。 建立pfn,page,pa的关系。原文引用:

/*
 * Allocate the accumulated non-linear sections, allocate a mem_map
 * for each and record the physical to section mapping.
 *
 * Present sections are visited in order; each run of sections sharing one
 * node id is handed to sparse_init_nid() together with its section count.
 */
void __init sparse_init(void)
{
	/* first present section; presumably registered earlier by memory_present() */
	unsigned long run_start = first_present_section_nr();
	unsigned long nr_present = 1;
	unsigned long pnum;
	int run_nid = sparse_early_nid(__nr_to_section(run_start));

	/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
	set_pageblock_order();

	/* walk every remaining present section */
	for_each_present_section_nr(run_start + 1, pnum) {
		int nid = sparse_early_nid(__nr_to_section(pnum));

		if (nid != run_nid) {
			/* node changed: set up sections in [run_start, pnum) */
			sparse_init_nid(run_nid, run_start, pnum, nr_present);
			run_nid = nid;
			run_start = pnum;
			nr_present = 1;
		} else {
			nr_present++;
		}
	}
	/* the final run has not been handed over yet */
	sparse_init_nid(run_nid, run_start, pnum, nr_present);
	vmemmap_populate_print_last();
}

/*
 * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
 * And number of present sections in this node is map_count.
 *
 * On any allocation failure the remaining present sections of the range get
 * their section_mem_map cleared.
 */
static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
				   unsigned long pnum_end,
				   unsigned long map_count)
{
	struct mem_section_usage *usage;
	unsigned long pnum;
	struct page *map;

	/* one mem_section_usage record per present section, allocated in one go */
	usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
			mem_section_usage_size() * map_count);
	if (!usage) {
		pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
		goto failed;
	}
	/*
	 * Reserve a buffer sized for map_count section memmaps.
	 * NOTE(review): which configurations actually consume this buffer is
	 * not visible here - confirm against sparse_buffer_alloc() callers.
	 */
	sparse_buffer_init(map_count * section_map_size(), nid);
	/* walk the present sections starting at pnum_begin */
	for_each_present_section_nr(pnum_begin, pnum) {
		unsigned long pfn = section_nr_to_pfn(pnum);

		/* the iterator does not know the node's end; stop manually */
		if (pnum >= pnum_end)
			break;

		/* back this section's struct page array (see __populate_section_memmap) */
		map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
				nid, NULL);
		if (!map) {
			pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
			       __func__, nid);
			/* the failure path starts clearing from the failing section */
			pnum_begin = pnum;
			goto failed;
		}
		check_usemap_section_nr(nid, usage);
		/* record the struct page array and usage record in the mem_section */
		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
				SECTION_IS_EARLY);
		/* step to the next usage record in the block allocated above */
		usage = (void *) usage + mem_section_usage_size();
	}
	sparse_buffer_fini();
	return;
failed:
	/* We failed to allocate, mark all the following pnums as not present */
	for_each_present_section_nr(pnum_begin, pnum) {
		struct mem_section *ms;

		if (pnum >= pnum_end)
			break;
		ms = __nr_to_section(pnum);
		ms->section_mem_map = 0;
	}
}

/*
 * Back the struct page entries covering [pfn, pfn + nr_pages) in the
 * virtual memory map. Returns the struct page of the (adjusted) first pfn,
 * or NULL when vmemmap_populate() reports an error.
 */
struct page * __meminit __populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	/*
	 * The minimum granularity of memmap extensions is
	 * PAGES_PER_SUBSECTION as allocations are tracked in the
	 * 'subsection_map' bitmap of the section.
	 *
	 * The end is aligned up to a subsection multiple, while the start pfn
	 * is masked with PAGE_SUBSECTION_MASK (presumably rounding it down to
	 * a subsection boundary - confirm against the mask definition).
	 */
	unsigned long pfn_end = ALIGN(pfn + nr_pages, PAGES_PER_SUBSECTION);
	unsigned long va_start;
	unsigned long va_end;

	pfn &= PAGE_SUBSECTION_MASK;
	nr_pages = pfn_end - pfn;

	/* address range of the struct page entries inside the memory map */
	va_start = (unsigned long) pfn_to_page(pfn);
	va_end = va_start + nr_pages * sizeof(struct page);

	/* build page tables for that range and back it with memory */
	return vmemmap_populate(va_start, va_end, nid, altmap) ?
		NULL : pfn_to_page(pfn);
}

/*
 * Populate page tables for the memory-map range [start, end) on @node.
 * This variant tests X86_FEATURE_PSE, i.e. it is the x86 implementation.
 * Returns 0 on success or a negative errno.
 */
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
		struct vmem_altmap *altmap)
{
	int err;

	if (end - start < PAGES_PER_SECTION * sizeof(struct page)) {
		/* range smaller than one section's memmap: use base pages */
		err = vmemmap_populate_basepages(start, end, node);
	} else if (boot_cpu_has(X86_FEATURE_PSE)) {
		err = vmemmap_populate_hugepages(start, end, node, altmap);
	} else if (!altmap) {
		/* generic fallback, the path followed in the text below */
		err = vmemmap_populate_basepages(start, end, node);
	} else {
		pr_err_once("%s: no cpu support for altmap allocations\n",
				__func__);
		err = -ENOMEM;
	}

	if (err)
		return err;

	sync_global_pgds(start, end - 1);
	return 0;
}
/*
 * Populate the kernel page tables for the memory-map range [start, end),
 * one PAGE_SIZE step at a time. Each level is looked up or created through
 * its vmemmap_*_populate() helper; any failure aborts with -ENOMEM.
 *
 * Original author's note (translated): a page is allocated for each missing
 * table entry and the entry is initialised with it; entries at every level
 * hold the physical address of a page frame. Author's own reasoning for why
 * entries are not virtual addresses: the MMU resolves a virtual address by
 * walking these entries, so a virtual entry would itself need translating
 * first, and so on without end. Virtual addresses are therefore only the
 * MMU's input, and everything it follows afterwards is physical.
 */
int __meminit vmemmap_populate_basepages(unsigned long start,
					 unsigned long end, int node)
{
	unsigned long va = start;

	while (va < end) {
		pgd_t *pgd;
		p4d_t *p4d;
		pud_t *pud;
		pmd_t *pmd;
		pte_t *pte;

		pgd = vmemmap_pgd_populate(va, node);
		if (!pgd)
			return -ENOMEM;

		p4d = vmemmap_p4d_populate(pgd, va, node);
		if (!p4d)
			return -ENOMEM;

		pud = vmemmap_pud_populate(p4d, va, node);
		if (!pud)
			return -ENOMEM;

		pmd = vmemmap_pmd_populate(pud, va, node);
		if (!pmd)
			return -ENOMEM;

		pte = vmemmap_pte_populate(pmd, va, node);
		if (!pte)
			return -ENOMEM;

		vmemmap_verify(pte, node, va, va + PAGE_SIZE);
		va += PAGE_SIZE;
	}

	return 0;
}

/*
 * Return the kernel pgd entry covering @addr, first filling it with a freshly
 * allocated zeroed page when the entry is empty. Returns NULL if that
 * allocation fails.
 */
pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
{
	pgd_t *pgd = pgd_offset_k(addr);
	void *table;

	/* entry already populated: nothing to allocate */
	if (!pgd_none(*pgd))
		return pgd;

	/*
	 * NOTE(review): the original author asks whether this comes from the
	 * buddy allocator, which is not initialised yet at this point; it is
	 * presumably served by memblock during early boot - confirm in
	 * vmemmap_alloc_block().
	 */
	table = vmemmap_alloc_block_zero(PAGE_SIZE, node);
	if (!table)
		return NULL;

	/* install the new page into the pgd entry */
	pgd_populate(&init_mm, pgd, table);
	return pgd;
}

/*
 * Record @mem_map (encoded relative to section @pnum) and @usage in @ms,
 * and flag the section as having a mem_map plus the caller-supplied @flags.
 */
static void __meminit sparse_init_one_section(struct mem_section *ms,
		unsigned long pnum, struct page *mem_map,
		struct mem_section_usage *usage, unsigned long flags)
{
	unsigned long coded;

	/* clear the bits covered by SECTION_MAP_MASK before storing the new map */
	ms->section_mem_map &= ~SECTION_MAP_MASK;

	/* encoded struct page array pointer, see sparse_encode_mem_map() */
	coded = sparse_encode_mem_map(mem_map, pnum);
	ms->section_mem_map |= coded | SECTION_HAS_MEM_MAP | flags;

	ms->usage = usage;
}

 * Subtle, we encode the real pfn into the mem_map such that
 * the identity pfn - section_mem_map will return the actual
 * physical page frame number.
 */
static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
{
	unsigned long coded_mem_map =
		(unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
	BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
	BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
	return coded_mem_map;
}

这里对 mem_map 编码做一下计算,从 struct page *p 得到real_pfn:
real_pfn = p - section_mem_map = p - mem_map + section_nr_to_pfn(pnum) = section_offset + pfn_section_start

在S2500服务器上,16个NODE节点的情况下,日志如下,可以看出每个node的map_count为32,也就是每个NODE内存32G,因为每个mem_section代表1G内存。

[    0.000000] ===sparse_init nid_begin 0 pnum_begin 2  pnum_end 0 
 [    0.000000] ===sparse_init::sparse_init_nid 0 nid_begin 0 pnum_begin 2 pnum_end 1088 map_count 32
 [    0.000000] ===sparse_init::sparse_init_nid 0 nid_begin 1 pnum_begin 1088 pnum_end 1152 map_count 32
 [    0.000000] ===sparse_init::sparse_init_nid 0 nid_begin 2 pnum_begin 1152 pnum_end 1216 map_count 32
 [    0.000000] ===sparse_init::sparse_init_nid 0 nid_begin 3 pnum_begin 1216 pnum_end 1280 map_count 32
 [    0.000000] ===sparse_init::sparse_init_nid 0 nid_begin 4 pnum_begin 1280 pnum_end 1344 map_count 32
 [    0.000000] ===sparse_init::sparse_init_nid 0 nid_begin 5 pnum_begin 1344 pnum_end 1408 map_count 32
 [    0.000000] ===sparse_init::sparse_init_nid 0 nid_begin 6 pnum_begin 1408 pnum_end 1472 map_count 32
 [    0.000000] ===sparse_init::sparse_init_nid 0 nid_begin 7 pnum_begin 1472 pnum_end 2050 map_count 32
 [    0.000000] ===sparse_init::sparse_init_nid 0 nid_begin 8 pnum_begin 2050 pnum_end 3136 map_count 32
 [    0.000000] ===sparse_init::sparse_init_nid 0 nid_begin 9 pnum_begin 3136 pnum_end 3200 map_count 32
 [    0.000000] ===sparse_init::sparse_init_nid 0 nid_begin 10 pnum_begin 3200 pnum_end 3264 map_count 32
 [    0.000000] ===sparse_init::sparse_init_nid 0 nid_begin 11 pnum_begin 3264 pnum_end 3328 map_count 32
 [    0.000000] ===sparse_init::sparse_init_nid 0 nid_begin 12 pnum_begin 3328 pnum_end 3392 map_count 32
 [    0.000000] ===sparse_init::sparse_init_nid 0 nid_begin 13 pnum_begin 3392 pnum_end 3456 map_count 32
 [    0.000000] ===sparse_init::sparse_init_nid 0 nid_begin 14 pnum_begin 3456 pnum_end 3520 map_count 32
 [    0.000000] ===sparse_init::sparse_init_nid 1  nid_begin 15 pnum_begin 3520 pnum_end 18446744073709551615 map_count 32 //最后的pnum_end是上面循环的原因导致到了最后一个section。不过其中大多数是不存在实际物理内存。通过for_each_present_section_nr 可以循环其中存在的section。确定section的内容参考函数arm64_memory_present。

sparse_init_nid解析一个node节点的物理内存:

/*
 * Instrumented copy of sparse_init_nid(): sets up the present sections of
 * node @nid inside [pnum_begin, pnum_end), of which @map_count are present,
 * and prints a trace line per node and per section. On allocation failure
 * the remaining present sections get their section_mem_map cleared.
 */
static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
                                   unsigned long pnum_end,
                                   unsigned long map_count)
{
        unsigned long pnum, usemap_longs, *usemap;
        struct page *map;

        /* number of longs making up one section's usemap */
        usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS);
        usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
                                                          usemap_size() *
                                                          map_count);
        if (!usemap) {
                pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
                goto failed;
        }
        /* section numbers are unsigned long, so print them with %lu */
        printk("===sparse_init_nid nid %d pnum_begin  %lu  pnum_end %lu \n",nid,pnum_begin,pnum_end);
        sparse_buffer_init(map_count * section_map_size(), nid);
        for_each_present_section_nr(pnum_begin, pnum) {
                /* the iterator does not know the node's end; stop manually */
                if (pnum >= pnum_end)
                        break;
                printk("===sparse_init_nid loop nid %d pnum %lu \n",nid,pnum);
                map = sparse_mem_map_populate(pnum, nid, NULL);
                if (!map) {
                        pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
                               __func__, nid);
                        /* the failure path starts clearing from the failing section */
                        pnum_begin = pnum;
                        goto failed;
                }
                check_usemap_section_nr(nid, usemap);
                sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap);
                /* step to the next section's usemap in the block allocated above */
                usemap += usemap_longs;
        }
        sparse_buffer_fini();
        return;
failed:
        /* We failed to allocate, mark all the following pnums as not present */
        for_each_present_section_nr(pnum_begin, pnum) {
                struct mem_section *ms;

                if (pnum >= pnum_end)
                        break;
                ms = __nr_to_section(pnum);
                ms->section_mem_map = 0;
        }
}

[    0.000000] ===sparse_init_nid nid 0 pnum_begin  2  pnum_end 1088 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 2 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 3 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1024 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1025 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1028 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1029 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1030 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1031 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1032 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1033 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1034 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1035 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1036 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1037 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1038 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1039 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1056 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1057 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1058 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1059 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1060 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1061 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1062 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1063 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1064 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1065 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1066 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1067 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1068 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1069 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1070 
[    0.000000] ===sparse_init_nid loop nid 0 pnum 1071 
[    0.000000] ===sparse_init_nid nid 1 pnum_begin  1088  pnum_end 1152 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1088 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1089 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1090 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1091 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1092 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1093 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1094 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1095 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1096 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1097 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1098 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1099 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1100 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1101 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1102 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1103 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1120 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1121 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1122 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1123 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1124 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1125 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1126 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1127 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1128 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1129 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1130 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1131 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1132 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1133 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1134 
[    0.000000] ===sparse_init_nid loop nid 1 pnum 1135 
[    0.000000] ===sparse_init_nid nid 2 pnum_begin  1152  pnum_end 1216

[    0.000000] ===sparse_init_nid loop nid 2 pnum 1152

。。。

省略

。。。

[    0.000000] ===sparse_init_nid nid 3 pnum_begin  1216  pnum_end 1280 

。。。
[    0.000000] ===sparse_init_nid nid 4 pnum_begin  1280  pnum_end 1344 

。。。
[    0.000000] ===sparse_init_nid nid 5 pnum_begin  1344  pnum_end 1408 

。。。
[    0.000000] ===sparse_init_nid nid 6 pnum_begin  1408  pnum_end 1472 

。。。
[    0.000000] ===sparse_init_nid nid 7 pnum_begin  1472  pnum_end 2050 

。。。
[    0.000000] ===sparse_init_nid nid 8 pnum_begin  2050  pnum_end 3136 

。。。
[    0.000000] ===sparse_init_nid nid 9 pnum_begin  3136  pnum_end 3200 

。。。
[    0.000000] ===sparse_init_nid nid 10 pnum_begin  3200  pnum_end 3264 

。。。
[    0.000000] ===sparse_init_nid nid 11 pnum_begin  3264  pnum_end 3328 

。。。
[    0.000000] ===sparse_init_nid nid 12 pnum_begin  3328  pnum_end 3392 

。。。
[    0.000000] ===sparse_init_nid nid 13 pnum_begin  3392  pnum_end 3456 

。。。
[    0.000000] ===sparse_init_nid nid 14 pnum_begin  3456  pnum_end 3520 

。。。
[    0.000000] ===sparse_init_nid nid 15 pnum_begin  3520  pnum_end

18446744073709551615
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3520 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3521 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3522 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3523 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3524 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3525 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3526 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3527 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3528 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3529 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3530 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3531 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3532 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3533 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3534 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3535 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3552 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3553 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3554 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3555 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3556 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3557 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3558 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3559 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3560 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3561 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3562 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3563 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3564 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3565 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3566 
[    0.000000] ===sparse_init_nid loop nid 15 pnum 3567

从上述日志可以看到:在一个确定的nid中,依次遍历每个存在(present)的section,循环的次数是32。

前面已经得出每个node下的物理内存,且已经完成**mem_section 的创建。虽然此时page的地址已经确定,但具体的内容还是空的。一个mem_section对应1G的物理内存,所有page的内容要根据实际物理内存来初始化。

bootmem_init

        zone_sizes_init

                free_area_init_nodes
                    free_area_init_node
                        calculate_node_totalpages
                        free_area_init_core
                            memmap_init(memmap_init_zone)
                                __init_single_page
    

在memmap_init_zone函数中,会根据实际的物理内存进行page的初始化,调用__init_single_page函数。下面在该函数中加入打印,统计实际初始化的page数量:

void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                unsigned long start_pfn, enum memmap_context context,
                struct vmem_altmap *altmap)
{
        unsigned long realcount = 0 ;
        unsigned long end_pfn = start_pfn + size;
        pg_data_t *pgdat = NODE_DATA(nid);
        unsigned long pfn;
        unsigned long nr_initialised = 0;
        struct page *page;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
        struct memblock_region *r = NULL, *tmp;
#endif

        if (highest_memmap_pfn < end_pfn - 1)
                highest_memmap_pfn = end_pfn - 1;

        /*
         * Honor reservation requested by the driver for this ZONE_DEVICE
         * memory
         */
        if (altmap && start_pfn == altmap->base_pfn)
                start_pfn += altmap->reserve;

        for (pfn = start_pfn; pfn < end_pfn; pfn++) {
                /*
                 * There can be holes in boot-time mem_map[]s handed to this
                 * function.  They do not exist on hotplugged memory.
                 */
                if (context != MEMMAP_EARLY)
                        goto not_early;

                if (!early_pfn_valid(pfn))
                        continue;
                if (!early_pfn_in_nid(pfn, nid))
                        continue;
                if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
                        break;
。。。。。省略。。。。。。

not_early:
                realcount++;
                page = pfn_to_page(pfn);
                __init_single_page(page, pfn, zone, nid);
                if (context == MEMMAP_HOTPLUG)
                        SetPageReserved(page);

                if (!(pfn & (pageblock_nr_pages - 1))) {
                        set_pageblock_migratetype(page, MIGRATE_MOVABLE);
                        cond_resched();
                }
        }
        printk("===memmap_init_zone nid %d size %llu zone %llu start_pfn %llu readlcount  %llu\n",nid,size,zone,start_pfn,realcount);
}

下面的日志打印出了每个node下实际的page数量(realcount),同时page结构的内容也在此完成初始化。


[    0.000000] ===memmap_init_zone nid 0 size 32768 zone 0 start_pfn 32768 readlcount  31661
[    0.000000] ===memmap_init_zone nid 0 size 17498112 zone 1 start_pfn 65536 readlcount  490496
[    0.000000] ===memmap_init_zone nid 1 size 786432 zone 1 start_pfn 17825792 readlcount  524288
[    0.000000] ===memmap_init_zone nid 2 size 786432 zone 1 start_pfn 18874368 readlcount  524288
[    0.000000] ===memmap_init_zone nid 3 size 786432 zone 1 start_pfn 19922944 readlcount  524288
[    0.000000] ===memmap_init_zone nid 4 size 786432 zone 1 start_pfn 20971520 readlcount  524288
[    0.000000] ===memmap_init_zone nid 5 size 786432 zone 1 start_pfn 22020096 readlcount  524288
[    0.000000] ===memmap_init_zone nid 6 size 786432 zone 1 start_pfn 23068672 readlcount  524288
[    0.000000] ===memmap_init_zone nid 7 size 786432 zone 1 start_pfn 24117248 readlcount  524288
[    0.000000] ===memmap_init_zone nid 8 size 17530880 zone 1 start_pfn 33587200 readlcount  523264
[    0.000000] ===memmap_init_zone nid 9 size 786432 zone 1 start_pfn 51380224 readlcount  524288
[    0.000000] ===memmap_init_zone nid 10 size 786432 zone 1 start_pfn 52428800 readlcount  524288
[    0.000000] ===memmap_init_zone nid 11 size 786432 zone 1 start_pfn 53477376 readlcount  524288
[    0.000000] ===memmap_init_zone nid 12 size 786432 zone 1 start_pfn 54525952 readlcount  524288
[    0.000000] ===memmap_init_zone nid 13 size 786432 zone 1 start_pfn 55574528 readlcount  524288
[    0.000000] ===memmap_init_zone nid 14 size 786432 zone 1 start_pfn 56623104 readlcount  524288
[    0.000000] ===memmap_init_zone nid 15 size 786432 zone 1 start_pfn 57671680 readlcount  524288

测试page地址是否在VMEMMAP定义的地址范围:

#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/device.h>
#include <linux/io.h>


/*
 * Module init: allocate one page and print its struct page address, pfn,
 * physical address and linear-map virtual address, so the struct page
 * address can be compared with VMEMMAP_START / vmemmap.
 *
 * Returns 0 on success, -ENOMEM if the page cannot be allocated.
 */
static int __init test_init(void)
{
	struct page *page;
	unsigned long vaddr;

	/*
	 * Addresses are cast to unsigned long and printed with %lx so the
	 * format matches the argument type and the raw value is shown.
	 */
	pr_info("VMEMMAP_START %lx vmemmap %lx \n",
		(unsigned long)VMEMMAP_START, (unsigned long)vmemmap);

	/* module init runs in process context, so a sleeping allocation is fine */
	page = alloc_pages(GFP_KERNEL, 0);
	if (!page)
		return -ENOMEM;

	pr_info("page_to_pfn(%lx):%lu  pa  %llx \n", (unsigned long)page,
		(unsigned long)page_to_pfn(page),
		(unsigned long long)PFN_PHYS(page_to_pfn(page)));
	vaddr = (unsigned long)page_address(page);
	pr_info("page %lx vaddr %lx \n", (unsigned long)page, vaddr);
	pr_info("test_init\n");

	/* the page was only needed for inspection; do not leak it */
	__free_pages(page, 0);
	return 0;
}

/* Module exit hook: there is nothing to release, so only log the unload. */
static void __exit test_exit(void)
{
	pr_info("test_exit\n");
}

/* Register the module's load and unload entry points. */
module_init(test_init);
module_exit(test_exit);


/* Declare the module's license as GPL. */
MODULE_LICENSE("GPL");

输出:

[239935.492355] VMEMMAP_START ffff7fe000000000 vmemmap ffff7fdfffe00000 
                 
[239935.492359] page_to_pfn(ffff7fe008b6f100):2317252  pa  235bc40000 
[239935.518223] page ffff7fe008b6f100 vaddr ffff8022dbc40000 

page地址落在VMEMMAP_START开始的区域内。最终的虚拟地址是通过 page->pfn->pa->va 换算得到的,该虚拟地址自然位于内核之前映射的线性地址区间。
 

物理地址获取numa id 函数:


pfn_to_nid page_to_nid