在Linux内核启动过程中,内核根据系统配置来设置虚拟地址空间的布局,如PAGE_OFFSET的起始地址,PHYS_OFFSET等。对于宏PHYS_OFFSET来说,其描述的是物理内存的起始地址,一般由硬件给出。如下面一些设置:

ARM:
arch/arm/include/asm/memory.h
/* Physical start of RAM as fixed at build time through CONFIG_PHYS_OFFSET. */
#define PLAT_PHYS_OFFSET    UL(CONFIG_PHYS_OFFSET)
/*
 * First case: __virt_to_phys is already defined as a macro at this point
 * (NOTE(review): presumably by a platform header -- confirm).  The
 * build-time PLAT_PHYS_OFFSET is used directly and virt_to_pfn() is
 * expressed through __pa().
 */
#if defined(__virt_to_phys)
 #define PHYS_OFFSET    PLAT_PHYS_OFFSET
 /* Page frame number of PHYS_OFFSET. */
 #define PHYS_PFN_OFFSET    ((unsigned long)(PHYS_OFFSET >> PAGE_SHIFT))

 /* Page frame number of the physical address that __pa() yields for kaddr. */
 #define virt_to_pfn(kaddr) (__pa(kaddr) >> PAGE_SHIFT)

 #elif defined(CONFIG_ARM_PATCH_PHYS_VIRT)

 /*
  * Constants used to force the right instruction encodings and shifts
  * so that all we need to do is modify the 8-bit constant field.
  */
 /* Placeholder immediates: the byte 0x81 in bits 31..24 and in bits 7..0. */
 #define __PV_BITS_31_24    0x81000000
 #define __PV_BITS_7_0    0x81

 /* Page frame number from which PHYS_OFFSET and PHYS_PFN_OFFSET are derived. */
 extern unsigned long __pv_phys_pfn_offset;
 /* NOTE(review): presumably the phys/virt delta used when patching -- confirm. */
 extern u64 __pv_offset;
 /* NOTE(review): presumably rewrites the sites listed in .pv_table -- confirm. */
 extern void fixup_pv_table(const void *, unsigned long);
 /* NOTE(review): presumably the bounds of the .pv_table section -- confirm. */
 extern const void *__pv_table_begin, *__pv_table_end;

 /*
  * In this configuration the physical offset is read from a variable:
  * the PFN offset is widened to phys_addr_t before shifting.
  */
 #define PHYS_OFFSET    ((phys_addr_t)__pv_phys_pfn_offset << PAGE_SHIFT)
 #define PHYS_PFN_OFFSET    (__pv_phys_pfn_offset)

 /*
  * Page frame number for a kernel virtual address: the page index of
  * kaddr relative to PAGE_OFFSET, added to PHYS_PFN_OFFSET.
  */
 #define virt_to_pfn(kaddr) \
     (PHYS_PFN_OFFSET + \
      (((unsigned long)(kaddr) - PAGE_OFFSET) >> PAGE_SHIFT))

 /*
  * Emit a single "<instr> to, from, #type" instruction and record its
  * address (local label 1) as a .long entry in the .pv_table section.
  *
  * from:  input register operand
  * to:    output register operand
  * instr: mnemonic given as a string literal (callers here pass "add"/"sub")
  * type:  immediate operand ("I" constraint), one of the __PV_BITS_* values
  */
 #define __pv_stub(from,to,instr,type)            \
     __asm__("@ __pv_stub\n"                \
     "1:    " instr "    %0, %1, %2\n"        \
     "    .pushsection .pv_table,\"a\"\n"        \
     "    .long    1b\n"                \
     "    .popsection\n"                \
     : "=r" (to)                    \
     : "r" (from), "I" (type))

 /*
  * Emit "mov %R0, #__PV_BITS_7_0" for the output operand t and record the
  * instruction address in .pv_table.  %R0 uses the GCC ARM operand
  * modifier "R" (NOTE(review): the most significant register of a 64-bit
  * operand according to the GCC manual -- confirm).
  */
 #define __pv_stub_mov_hi(t)                \
     __asm__ volatile("@ __pv_stub_mov\n"        \
     "1:    mov    %R0, %1\n"            \
     "    .pushsection .pv_table,\"a\"\n"        \
     "    .long    1b\n"                \
     "    .popsection\n"                \
     : "=r" (t)                    \
     : "I" (__PV_BITS_7_0))

 /*
  * Two-instruction sequence on the read-write operand y:
  *   adds %Q0, x, #__PV_BITS_31_24   - writes the %Q half of y, sets flags
  *   adc  %R0, %R0, #0               - adds the carry into the %R half of y
  * Only the "adds" (local label 1) is recorded in .pv_table.  The "cc"
  * clobber is declared because the sequence modifies the condition flags.
  */
 #define __pv_add_carry_stub(x, y)            \
     __asm__ volatile("@ __pv_add_carry_stub\n"    \
     "1:    adds    %Q0, %1, %2\n"            \
     "    adc    %R0, %R0, #0\n"            \
     "    .pushsection .pv_table,\"a\"\n"        \
     "    .long    1b\n"                \
     "    .popsection\n"                \
     : "+r" (y)                    \
     : "r" (x), "I" (__PV_BITS_31_24)        \
     : "cc")

 /*
  * Translate kernel virtual address x into a physical address using the
  * stubs recorded in .pv_table.
  *
  * With a 4-byte phys_addr_t a single "add" stub produces the result.
  * Otherwise __pv_stub_mov_hi() first initialises the %R half of t and
  * __pv_add_carry_stub() then computes the %Q half from x and propagates
  * the carry; the order of the two calls therefore matters.
  *
  * NOTE(review): the immediates are placeholders, presumably rewritten by
  * fixup_pv_table() before first use -- confirm against the patching code.
  */
 static inline phys_addr_t __virt_to_phys(unsigned long x)
 {
     phys_addr_t t;

     /* sizeof is a compile-time constant, so only one branch is emitted. */
     if (sizeof(phys_addr_t) == 4) {
         __pv_stub(x, t, "add", __PV_BITS_31_24);
     } else {
         __pv_stub_mov_hi(t);
         __pv_add_carry_stub(x, t);
     }
     return t;
 }

 /*
  * Translate physical address x into a kernel virtual address with a
  * single "sub" stub recorded in .pv_table.  Only the low 'unsigned long'
  * part of x takes part in the computation.
  */
 static inline unsigned long __phys_to_virt(phys_addr_t x)
 {
     unsigned long t;

     /*
      * 'unsigned long' cast discard upper word when
      * phys_addr_t is 64 bit, and makes sure that inline
      * assembler expression receives 32 bit argument
      * in place where 'r' 32 bit operand is expected.
      */
     __pv_stub((unsigned long) x, t, "sub", __PV_BITS_31_24);
     return t;
 }

 #else

 /*
  * Fallback case: neither a predefined __virt_to_phys nor
  * CONFIG_ARM_PATCH_PHYS_VIRT.  The physical offset is the build-time
  * PLAT_PHYS_OFFSET and PHYS_PFN_OFFSET is its page frame number.
  */
 #define PHYS_OFFSET    PLAT_PHYS_OFFSET
 #define PHYS_PFN_OFFSET    ((unsigned long)(PHYS_OFFSET >> PAGE_SHIFT))

 static inline phys_addr_t __virt_to_phys(unsigned long x)
 {
     return (phys_addr_t)x - PAGE_OFFSET + PHYS_OFFSET;
 }

 /*
  * Constant-offset translation of physical address x to a kernel virtual
  * address: remove PHYS_OFFSET, then add PAGE_OFFSET.  The result is
  * reduced to 'unsigned long'.
  */
 static inline unsigned long __phys_to_virt(phys_addr_t x)
 {
     unsigned long va = (unsigned long)x;

     va -= PHYS_OFFSET;
     va += PAGE_OFFSET;
     return va;
 }

 /*
  * Page frame number for a kernel virtual address: the page index of
  * kaddr relative to PAGE_OFFSET, added to PHYS_PFN_OFFSET.
  */
 #define virt_to_pfn(kaddr) \
     (PHYS_PFN_OFFSET + \
      (((unsigned long)(kaddr) - PAGE_OFFSET) >> PAGE_SHIFT))

 #endif



在没有启用CONFIG_ARM_PATCH_PHYS_VIRT时,可以认为PHYS_OFFSET是由CONFIG_PHYS_OFFSET在编译时设置的;启用该选项时,PHYS_OFFSET则由变量__pv_phys_pfn_offset在运行时给出。上面几种不同的定义,影响的是物理地址与虚拟地址之间转换的具体实现方式,但并无本质的区别。

另外一个地址PAGE_OFFSET也是非常重要的,其也在arch/arm/include/asm/memory.h中给出,如下:

/*
 * Wrap a constant with _AC(x, UL).
 * NOTE(review): _AC is not shown here; presumably it appends the UL
 * suffix in C code -- confirm.
 */
#define UL(x) _AC(x, UL)

/* PAGE_OFFSET - the virtual address of the start of the kernel image */
#define PAGE_OFFSET        UL(CONFIG_PAGE_OFFSET)

注意PAGE_OFFSET是内核虚拟地址空间开始的地方。通常来说,PAGE_OFFSET会被直接映射到PHYS_OFFSET处。

本篇主要描述虚拟地址空间的布局。可以看到,PAGE_OFFSET之上是内核虚拟地址,而PAGE_OFFSET之下是进程的用户空间地址。用户空间较为重要的一个宏是TASK_SIZE,如下:

/* Upper bound of user space: 16 MiB below CONFIG_PAGE_OFFSET. */
#define TASK_SIZE        (UL(CONFIG_PAGE_OFFSET) - UL(SZ_16M))
/*
 * One third of TASK_SIZE, passed through ALIGN() with SZ_16M.
 * NOTE(review): ALIGN is not shown here; presumably it rounds up to the
 * next 16 MiB boundary -- confirm.
 */
#define TASK_UNMAPPED_BASE    ALIGN(TASK_SIZE / 3, SZ_16M)

也就是说,用户空间到内核空间有个SZ_16M大小的洞。

需要注意的是TASK_UNMAPPED_BASE,其给出了mmap类函数(未指定映射地址时)对虚拟地址空间映射时使用的最小虚拟地址。如果CONFIG_PAGE_OFFSET为0xC0000000(3G),那么TASK_SIZE为0xC0000000 - 0x01000000 = 0xBF000000。

TASK_SIZE / 3 = 0xBF000000 / 3 = 0x3FAAAAAA

再按SZ_16M(0x01000000)向上对齐,所以TASK_UNMAPPED_BASE为0x40000000(1G)。

所以,使用mmap类函数,虚拟地址最小为0x40000000,最大为TASK_SIZE。


通常PAGE_OFFSET被映射到物理地址起始处,即PHYS_OFFSET给出的地址。物理内存以PHYS_OFFSET为开始地址,至于结束地址,不同的平台定义并不相同,但是有一点是相同的,即有一块连续的物理内存被直接映射到虚拟地址空间,也就是我们说的低端内存直接映射。至于这块低端内存Linux内核是如何计算的,我们下文会详细论述。与低端内存对应的是高端内存,二者的分界由high_memory给出(high_memory是低端内存直接映射区结束处的虚拟地址)。从PHYS_OFFSET开始,到高端内存,我们用LOW_BOUNCE_HIGH做边界。max_low_pfn给出的是低端内存的最大页帧号,而min_low_pfn则是最小的页帧号,由于我们从PHYS_OFFSET开始,其一般为0,所以min_low_pfn一般为0。如果我们把低端内存限制在512M,则max_low_pfn为512M/4K = 0x20000 = 131072。内核还有一个max_pfn,其对应整个内存的最大页帧号,例如:

mem= 1G, max_pfn= 2^18 =262144
mem= 2G, max_pfn=2^19 =524288
mem= 3G, max_pfn=0xC0000 = 786432
mem= 4G, max_pfn=2^20 =1048576
mem= 8G, max_pfn=2^21 =2097152
 mem=16G, max_pfn=2^22= 4194304



低端内存区和高端内存区的大小由平台来配置。在high_memory(低端内存直接映射区的末尾)之上对应的是vmalloc区域,Linux中主要配置如下:

arch/arm/include/asm/pgtable.h
/* 8 MiB: offset added to high_memory and alignment unit of VMALLOC_START. */
#define VMALLOC_OFFSET        (8*1024*1024)
 /*
  * high_memory plus VMALLOC_OFFSET, rounded down to a VMALLOC_OFFSET
  * boundary.  The gap between high_memory and VMALLOC_START is therefore
  * at most 8 MiB.
  */
 #define VMALLOC_START        (((unsigned long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1))
 /* Fixed end of the vmalloc area; 16 MiB remain above it up to 0xffffffff. */
 #define VMALLOC_END        0xff000000UL // 4G - 16M == 4080M

可以看到,VMALLOC区域从high_memory(低端内存直接映射区的末尾)附近开始,两者之间留有一个最多8M的空洞;该区域一直延伸到VMALLOC_END处。VMALLOC_END到0xFFFFFFFF还有16M的空间,这些一般被特殊的设备使用。

一般来说,VMALLOC区大小可以由内核的命令行参数来指定,如下函数分析命令行参数,设置VMALLOC区大小:

arch/arm/mm/mmu.c
/*
 * Lowest virtual address of the vmalloc area.  The default leaves 240 MiB
 * plus VMALLOC_OFFSET below VMALLOC_END; with VMALLOC_END 0xff000000 and
 * an 8 MiB VMALLOC_OFFSET that is 0xef800000.  early_vmalloc() overwrites
 * it when "vmalloc=" is given on the command line.
 */
static void *  vmalloc_min =
     (void *)(VMALLOC_END - (240 << 20) - VMALLOC_OFFSET);

 /*
  * Handler for the "vmalloc=<size>" early parameter.
  *
  * The requested size is parsed with memparse() and then clamped, with a
  * warning in either case, to the range
  *     [SZ_16M, VMALLOC_END - (PAGE_OFFSET + SZ_32M)].
  * vmalloc_min is set so that exactly that many bytes lie between it and
  * VMALLOC_END.  Always returns 0.
  */
 static int __init early_vmalloc(char *arg)
 {
     const unsigned long smallest = SZ_16M;
     const unsigned long largest = VMALLOC_END - (PAGE_OFFSET + SZ_32M);
     unsigned long vmalloc_reserve = memparse(arg, NULL);

     /* Enforce the lower bound first ... */
     if (vmalloc_reserve < smallest) {
         vmalloc_reserve = smallest;
         pr_warn("vmalloc area too small, limiting to %luMB\n",
             vmalloc_reserve >> 20);
     }

     /* ... then the upper bound, checked independently of the first. */
     if (vmalloc_reserve > largest) {
         vmalloc_reserve = largest;
         pr_warn("vmalloc area is too big, limiting to %luMB\n",
             vmalloc_reserve >> 20);
     }

     vmalloc_min = (void *)(VMALLOC_END - vmalloc_reserve);
     return 0;
 }


early_param("vmalloc", early_vmalloc);

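这段代码接受的VMALLOC区大小在16M和976M之间(上限为VMALLOC_END - (PAGE_OFFSET + SZ_32M),当PAGE_OFFSET为0xC0000000时即0xFF000000 - 0xC2000000 = 0x3D000000 = 976M)。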

如果命令行提供了vmalloc大小,假若为240M,则vmalloc_min为0xFF000000 - 240M = 0xF0000000。

如果命令行没有提供,内核采用默认配置,则vmalloc_min为0xFF000000 - 240M - 8M = 0xEF800000。


在了解上面内存布局主要参数的定义后,我们再来详细地看看内核启动过程中主要函数的处理。函数sanity_check_meminfo()对主要的内存区进行检查,其实现如下:

/*
 * sanity_check_meminfo() - derive the lowmem limit from the memblock map.
 *
 * Walks every region of memblock's "memory" list and compares it with
 * vmalloc_limit, the physical address corresponding to vmalloc_min.
 * From that walk it
 *   - removes or truncates memory at/above vmalloc_limit when
 *     CONFIG_HIGHMEM is disabled or cache_is_vipt_aliasing() is true,
 *   - raises arm_lowmem_limit to the end of the low memory it found,
 *   - sets high_memory to the virtual address matching arm_lowmem_limit,
 *   - sets the memblock current allocation limit.
 */
void __init sanity_check_meminfo(void)
 {
     phys_addr_t memblock_limit = 0;
     /* Set to 1 by the first region that starts at/above vmalloc_limit; never cleared here. */
     int highmem = 0;
     /*
      * Physical address matching vmalloc_min, computed on vmalloc_min - 1
      * and then incremented.  NOTE(review): presumably so that __pa() is
      * applied to an address inside the linear mapping -- confirm.
      */
     phys_addr_t vmalloc_limit = __pa(vmalloc_min - 1) + 1;
     struct memblock_region *reg;
     /* Becomes true when memory had to be dropped; triggers the hint below. */
     bool should_use_highmem = false;
 // Examine every registered memory region
     for_each_memblock(memory, reg) {
         phys_addr_t block_start = reg->base;
         phys_addr_t block_end = reg->base + reg->size;
         /* Number of bytes of this region that may lie below vmalloc_limit. */
         phys_addr_t size_limit = reg->size;

         if (reg->base >= vmalloc_limit)
             highmem = 1;
         else
             size_limit = vmalloc_limit - reg->base;


         /* Memory above vmalloc_limit cannot be kept in this configuration. */
         if (!IS_ENABLED(CONFIG_HIGHMEM) || cache_is_vipt_aliasing()) {

             /*
              * Region lies entirely above the limit: drop it.  The message
              * names !CONFIG_HIGHMEM even when this branch was entered
              * because of cache_is_vipt_aliasing().
              */
             if (highmem) {
                 pr_notice("Ignoring RAM at %pa-%pa (!CONFIG_HIGHMEM)\n",
                       &block_start, &block_end);
                 memblock_remove(reg->base, reg->size);
                 should_use_highmem = true;
                 continue;
             }

             /* Region straddles the limit: drop only the part above it. */
             if (reg->size > size_limit) {
                 phys_addr_t overlap_size = reg->size - size_limit;

                 /* NOTE(review): unlike the other messages, this one has no trailing newline. */
                 pr_notice("Truncating RAM at %pa-%pa to -%pa",
                       &block_start, &block_end, &vmalloc_limit);
                 memblock_remove(vmalloc_limit, overlap_size);
                 block_end = vmalloc_limit;
                 should_use_highmem = true;
             }
         }

         if (!highmem) {
             /*
              * Extend the lowmem limit to the end of this region, capped
              * at vmalloc_limit when the region reaches beyond it.
              */
             if (block_end > arm_lowmem_limit) {
                 if (reg->size > size_limit)
                     arm_lowmem_limit = vmalloc_limit;
                 else
                     arm_lowmem_limit = block_end;
             }

             /*
              * Find the first non-pmd-aligned page, and point
              * memblock_limit at it. This relies on rounding the
              * limit down to be pmd-aligned, which happens at the
              * end of this function.
              *
              * With this algorithm, the start or end of almost any
              * bank can be non-pmd-aligned. The only exception is
              * that the start of the bank 0 must be section-
              * aligned, since otherwise memory would need to be
              * allocated when mapping the start of bank 0, which
              * occurs before any free memory is mapped.
              */
             if (!memblock_limit) {
                 if (!IS_ALIGNED(block_start, PMD_SIZE))
                     memblock_limit = block_start;
                 else if (!IS_ALIGNED(block_end, PMD_SIZE))
                     memblock_limit = arm_lowmem_limit;
             }

         }
     }

     if (should_use_highmem)
         pr_notice("Consider using a HIGHMEM enabled kernel.\n");

     /* Virtual address just past the last lowmem byte (same -1/+1 pattern as above). */
     high_memory = __va(arm_lowmem_limit - 1) + 1;

     /*
      * Round the memblock limit down to a pmd size.  This
      * helps to ensure that we will allocate memory from the
      * last full pmd, which should be mapped.
      */
     if (memblock_limit)
         memblock_limit = round_down(memblock_limit, PMD_SIZE);
     /* Never set, or rounded down to zero: fall back to the lowmem limit. */
     if (!memblock_limit)
         memblock_limit = arm_lowmem_limit;

     memblock_set_current_limit(memblock_limit);
 }

在了解此函数之前,我们先熟悉下面几个函数:

int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)

int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
                       int nid)

在系统启动过程中,内核会通过上面函数添加有效内存。内核有几处添加处理:

一. early_init_dt_add_memory_arch() --> memblock_add()

二. arm_add_memory() --> memblock_add()

三. 体系决定增加,直接调用函数memblock_add()


函数sanity_check_meminfo()作用很简单,对内存大小分析后设置high_memory和arm_lowmem_limit。