Table of Contents

  • Preface
  • 1. Memory Model
  • 2. (N)UMA
  • 2.1 Overview
  • 2.2 Nodes
  • 2.3 UMA Nodes and the Flat Memory Model
  • 2.4 zone
  • 2.4.1 struct zone
  • 2.4.2 zone_type
  • 2.5 Memory Layout
  • 3. Node-related Functions
  • 3.1 for_each_online_node
  • 3.2 NODE_DATA
  • References


Preface

1. Memory Model

A memory model describes, from the CPU's point of view, how physical memory is laid out and how the Linux kernel manages it. Some architectures support more than one memory model, but only one can be selected when the kernel is built.
The Linux kernel currently supports three memory models (three physical memory layouts): the Flat Memory Model, the Discontiguous Memory Model, and the Sparse Memory Model.

Kernel configuration options on my x86_64 system:

CONFIG_SPARSEMEM_MANUAL=y
CONFIG_SPARSEMEM=y
CONFIG_NEED_MULTIPLE_NODES=y
CONFIG_HAVE_MEMORY_PRESENT=y
CONFIG_SPARSEMEM_EXTREME=y
CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y
CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER=y
CONFIG_SPARSEMEM_VMEMMAP=y

As you can see, it uses the Sparse Memory Model, which also supports memory hot-plugging.

Kernel configuration options of Kylin Linux Desktop V10 (SP1) on aarch64 (kernel version 5.4.18):

#
# Memory Management options
#
CONFIG_SELECT_MEMORY_MODEL=y
CONFIG_SPARSEMEM_MANUAL=y
CONFIG_SPARSEMEM=y
CONFIG_NEED_MULTIPLE_NODES=y
CONFIG_HAVE_MEMORY_PRESENT=y
CONFIG_SPARSEMEM_EXTREME=y
CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y
CONFIG_SPARSEMEM_VMEMMAP=y
CONFIG_HAVE_MEMBLOCK_NODE_MAP=y
CONFIG_HAVE_FAST_GUP=y
CONFIG_ARCH_KEEP_MEMBLOCK=y
CONFIG_MEMORY_ISOLATION=y
CONFIG_MEMORY_HOTPLUG=y
CONFIG_MEMORY_HOTPLUG_SPARSE=y
CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y
CONFIG_SPLIT_PTLOCK_CPUS=4
CONFIG_MEMORY_BALLOON=y
CONFIG_BALLOON_COMPACTION=y
CONFIG_COMPACTION=y

As you can see, its memory model is also the Sparse Memory Model.

See also:
http://www.wowotech.net/memory_management/memory_model.html
https://zhuanlan.zhihu.com/p/452891440

2. (N)UMA

2.1 Overview

When we discussed the x86 working mode, we saw that the CPU accesses memory over the bus.



In this mode there may be multiple CPUs, all on one side of the bus, while all the memory modules together form one large pool of memory on the other side. Every CPU's access to memory goes over the bus, and the distance is the same for all CPUs. This mode is called SMP (Symmetric Multiprocessing). Its obvious drawback is that the bus becomes the bottleneck, since all data must travel across it.

To improve performance and scalability, a more advanced mode appeared later: NUMA (Non-Uniform Memory Access). Here memory is no longer one monolithic block. Each CPU has its own local memory, which it can access without crossing the shared bus, so access is much faster; a CPU together with its local memory is called a NUMA node. When local memory runs short, a CPU can still allocate memory from another NUMA node, but the access latency is noticeably higher.
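In kernel code, NUMA awareness shows up as node-qualified allocations. Below is a minimal kernel-module sketch (my own illustration against the 4.18-era API, not from the original text) that asks the buddy allocator for one page on the current CPU's node:

#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/mm.h>        /* page_to_nid() */
#include <linux/topology.h>  /* numa_node_id() */

static int __init numa_alloc_demo_init(void)
{
	int nid = numa_node_id();	/* node of the CPU we are running on */
	struct page *page;

	/* Allocate one page (order 0) from the local node's zones. */
	page = alloc_pages_node(nid, GFP_KERNEL, 0);
	if (!page)
		return -ENOMEM;

	pr_info("asked node %d, page ended up on node %d\n",
		nid, page_to_nid(page));

	__free_pages(page, 0);
	return 0;
}

static void __exit numa_alloc_demo_exit(void) { }

module_init(numa_alloc_demo_init);
module_exit(numa_alloc_demo_exit);
MODULE_LICENSE("GPL");

If the local node is short on memory, the allocator falls back to other nodes according to the zonelists described in section 2.2.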


Kernel configuration options on my x86_64 system:

CONFIG_NUMA=y
CONFIG_AMD_NUMA=y
CONFIG_X86_64_ACPI_NUMA=y
CONFIG_NODES_SPAN_OTHER_NODES=y

Although NUMA is configured, a machine like this usually still has only one node (as tools such as numactl --hardware or /proc/zoneinfo will show).


2.2 Nodes

The core definitions below (enum zone_type, struct zonelist, and struct pglist_data) live in include/linux/mmzone.h (Linux 4.18):

enum zone_type {
#ifdef CONFIG_ZONE_DMA
	/*
	 * ZONE_DMA is used when there are devices that are not able
	 * to do DMA to all of addressable memory (ZONE_NORMAL). Then we
	 * carve out the portion of memory that is needed for these devices.
	 * The range is arch specific.
	 *
	 * Some examples
	 *
	 * Architecture		Limit
	 * ---------------------------
	 * parisc, ia64, sparc	<4G
	 * s390			<2G
	 * arm			Various
	 * alpha		Unlimited or 0-16MB.
	 *
	 * i386, x86_64 and multiple other arches
	 * 			<16M.
	 */
	ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
	/*
	 * x86_64 needs two ZONE_DMAs because it supports devices that are
	 * only able to do DMA to the lower 16M but also 32 bit devices that
	 * can only do DMA areas below 4G.
	 */
	ZONE_DMA32,
#endif
	/*
	 * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
	 * performed on pages in ZONE_NORMAL if the DMA devices support
	 * transfers to all addressable memory.
	 */
	ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
	/*
	 * A memory area that is only addressable by the kernel through
	 * mapping portions into its own address space. This is for example
	 * used by i386 to allow the kernel to address the memory beyond
	 * 900MB. The kernel will set up special mappings (page
	 * table entries on i386) for each page that the kernel needs to
	 * access.
	 */
	ZONE_HIGHMEM,
#endif
	ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
	ZONE_DEVICE,
#endif
	__MAX_NR_ZONES

};
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
/*
 * One allocation request operates on a zonelist. A zonelist
 * is a list of zones, the first one is the 'goal' of the
 * allocation, the other zones are fallback zones, in decreasing
 * priority.
 *
 * To speed the reading of the zonelist, the zonerefs contain the zone index
 * of the entry being read. Helper functions to access information given
 * a struct zoneref are
 *
 * zonelist_zone()	- Return the struct zone * for an entry in _zonerefs
 * zonelist_zone_idx()	- Return the index of the zone for an entry
 * zonelist_node_idx()	- Return the index of the node for an entry
 */
struct zonelist {
	struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
};
/*
 * On NUMA machines, each NUMA node would have a pg_data_t to describe
 * it's memory layout. On UMA machines there is a single pglist_data which
 * describes the whole memory.
 *
 * Memory statistics and page replacement data structures are maintained on a
 * per-zone basis.
 */

typedef struct pglist_data {
	struct zone node_zones[MAX_NR_ZONES];
	struct zonelist node_zonelists[MAX_ZONELISTS];
	int nr_zones;
	
	......
	
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
	/*
	 * Must be held any time you expect node_start_pfn, node_present_pages
	 * or node_spanned_pages stay constant.  Holding this will also
	 * guarantee that any pfn_valid() stays that way.
	 *
	 * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
	 * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
	 * or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
	 *
	 * Nests above zone->lock and zone->span_seqlock
	 */
	spinlock_t node_size_lock;
#endif
	unsigned long node_start_pfn;
	unsigned long node_present_pages; /* total number of physical pages */
	unsigned long node_spanned_pages; /* total size of physical page
					     range, including holes */
	int node_id;
	wait_queue_head_t kswapd_wait;
	wait_queue_head_t pfmemalloc_wait;
	struct task_struct *kswapd;	/* Protected by
					   mem_hotplug_begin/end() */
	int kswapd_order;
	enum zone_type kswapd_classzone_idx;

	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */

#ifdef CONFIG_COMPACTION
	int kcompactd_max_order;
	enum zone_type kcompactd_classzone_idx;
	wait_queue_head_t kcompactd_wait;
	struct task_struct *kcompactd;
#endif
#ifdef CONFIG_NUMA_BALANCING
	/* Lock serializing the migrate rate limiting window */
	spinlock_t numabalancing_migrate_lock;

	/* Rate limiting time interval */
	unsigned long numabalancing_migrate_next_window;

	/* Number of pages migrated during the rate limiting time interval */
	unsigned long numabalancing_migrate_nr_pages;
#endif
	/*
	 * This is a per-node reserve of pages that are not available
	 * to userspace allocations.
	 */
	unsigned long		totalreserve_pages;

#ifdef CONFIG_NUMA
	/*
	 * zone reclaim becomes active if more unmapped pages exist.
	 */
	unsigned long		min_unmapped_pages;
	unsigned long		min_slab_pages;
#endif /* CONFIG_NUMA */

	/* Write-intensive fields used by page reclaim */
	ZONE_PADDING(_pad1_)
	spinlock_t		lru_lock;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
	/*
	 * If memory initialisation on large machines is deferred then this
	 * is the first PFN that needs to be initialised.
	 */
	unsigned long first_deferred_pfn;
	/* Number of non-deferred pages */
	unsigned long static_init_pgcnt;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	spinlock_t split_queue_lock;
	struct list_head split_queue;
	unsigned long split_queue_len;
#endif

	/* Fields commonly accessed by the page reclaim scanner */
	struct lruvec		lruvec;

	unsigned long		flags;

	ZONE_PADDING(_pad2_)

	/* Per-node vmstats */
	struct per_cpu_nodestat __percpu *per_cpu_nodestats;
	atomic_long_t		vm_stat[NR_VM_NODE_STAT_ITEMS];
} pg_data_t;

Each NUMA node has a pglist_data describing its memory layout (a UMA machine has only one memory node, so a single pglist_data describes all of memory).

(1) node_zones: an array holding the data structures of the zones in this node.

(2) node_zonelists: zonelists specifying the order in which zones are tried during allocation. One zonelist refers to this node's own zones; the other lists the backup zones on other nodes, to be used when the current node has no free space left. So when a page cannot be allocated from this node, it is taken from another node instead. On a non-NUMA machine, Linux simply creates a single node.

(3) nr_zones: the number of zones in this node.

(4) node_start_pfn: the starting page frame number of this node, i.e. the index of its first page.

(5) node_spanned_pages: the number of pages spanned by this node's physical address range; the range may contain holes.

(6) node_present_pages: the number of physical pages actually present (excluding holes).

(7) node_id: the node's ID, a global identifier; NUMA nodes are numbered starting from 0.

// linux-4.18/arch/x86/include/asm/mmzone_64.h

#ifdef CONFIG_NUMA

#include <linux/mmdebug.h>
#include <asm/smp.h>

extern struct pglist_data *node_data[];

#define NODE_DATA(nid)		(node_data[nid])
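
With NODE_DATA in hand, a minimal module sketch (again my own illustration, using the for_each_online_node helper covered in section 3.1) can print the pglist_data fields described above for every online node:

#include <linux/module.h>
#include <linux/mmzone.h>    /* NODE_DATA(), pg_data_t */
#include <linux/nodemask.h>  /* for_each_online_node() */

static int __init pgdat_dump_init(void)
{
	int nid;

	for_each_online_node(nid) {
		pg_data_t *pgdat = NODE_DATA(nid);

		pr_info("node %d: start_pfn=%lu spanned=%lu present=%lu nr_zones=%d\n",
			pgdat->node_id, pgdat->node_start_pfn,
			pgdat->node_spanned_pages, pgdat->node_present_pages,
			pgdat->nr_zones);
	}
	return 0;
}

static void __exit pgdat_dump_exit(void) { }

module_init(pgdat_dump_init);
module_exit(pgdat_dump_exit);
MODULE_LICENSE("GPL");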

pglist_data ties zones and zonelists together: node_zones holds the zones themselves, while node_zonelists records the order (local zones first, then fallback zones on other nodes) in which they are tried during an allocation.

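To see that fallback order at runtime, here is a small sketch (my own example; it relies on the ZONELIST_FALLBACK index and the zoneref helpers quoted in the comment above, all 4.18-era APIs) that walks node 0's fallback zonelist:

#include <linux/module.h>
#include <linux/mmzone.h>

static int __init zonelist_dump_init(void)
{
	/* Fallback zonelist of node 0, highest-priority zone first. */
	struct zonelist *zl = &NODE_DATA(0)->node_zonelists[ZONELIST_FALLBACK];
	struct zoneref *z;
	struct zone *zone;

	for_each_zone_zonelist(zone, z, zl, MAX_NR_ZONES - 1)
		pr_info("try node %d, zone %s\n",
			zonelist_node_idx(z), zone->name);

	return 0;
}

static void __exit zonelist_dump_exit(void) { }

module_init(zonelist_dump_init);
module_exit(zonelist_dump_exit);
MODULE_LICENSE("GPL");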

2.3 UMA Nodes and the Flat Memory Model

This section can be skipped: current x86_64 and aarch64 kernels do not use the configuration below.

(1)
Although Linux on an x86_64 machine usually has only one node, the CONFIG_NEED_MULTIPLE_NODES and CONFIG_NUMA options are still enabled:

CONFIG_NEED_MULTIPLE_NODES=y

// linux-4.18/mm/memory.c

#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif

mem_map points to a global array of struct page that holds the struct page entries for all physical pages in the system.

// linux-4.18/include/linux/mmzone.h

#ifndef CONFIG_NEED_MULTIPLE_NODES

extern struct pglist_data contig_page_data;
#define NODE_DATA(nid)		(&contig_page_data)
#define NODE_MEM_MAP(nid)	mem_map

#else /* CONFIG_NEED_MULTIPLE_NODES */

On a UMA machine there is only a single node, contig_page_data.

(2)
Under CONFIG_FLAT_NODE_MEM_MAP (i.e. !SPARSEMEM), pglist_data also embeds the node's page array:

typedef struct pglist_data {
	......
#ifdef CONFIG_FLAT_NODE_MEM_MAP	/* means !SPARSEMEM */
	struct page *node_mem_map;
	......
}

node_mem_map is this node's array of struct page, describing all of the node's pages.
On a UMA system, node_mem_map is the same as the global mem_map.

Since Linux on x86_64 now uses the Sparse Memory Model, CONFIG_FLAT_NODE_MEM_MAP is not set, so struct page *node_mem_map can be ignored.
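For reference, under the flat memory model the pfn <-> struct page conversion is plain pointer arithmetic on mem_map:

// linux-4.18/include/asm-generic/memory_model.h

#if defined(CONFIG_FLATMEM)

#define __pfn_to_page(pfn)	(mem_map + ((pfn) - ARCH_PFN_OFFSET))
#define __page_to_pfn(page)	((unsigned long)((page) - mem_map) + \
				 ARCH_PFN_OFFSET)
......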

2.4 zone

2.4.1 struct zone

Each node is divided into zones, kept in the node_zones array, whose size is MAX_NR_ZONES.

struct zone {
	/* Read-mostly fields */

	/* zone watermarks, access with *_wmark_pages(zone) macros */
	unsigned long watermark[NR_WMARK];

	unsigned long nr_reserved_highatomic;

	/*
	 * We don't know if the memory that we're going to allocate will be
	 * freeable or/and it will be released eventually, so to avoid totally
	 * wasting several GB of ram we must reserve some of the lower zone
	 * memory (otherwise we risk to run OOM on the lower zones despite
	 * there being tons of freeable ram on the higher zones).  This array is
	 * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
	 * changes.
	 */
	long lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
	int node;
#endif
	struct pglist_data	*zone_pgdat;
	struct per_cpu_pageset __percpu *pageset;

#ifndef CONFIG_SPARSEMEM
	/*
	 * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
	 * In SPARSEMEM, this map is stored in struct mem_section
	 */
	unsigned long		*pageblock_flags;
#endif /* CONFIG_SPARSEMEM */

	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
	unsigned long		zone_start_pfn;

	/*
	 * spanned_pages is the total pages spanned by the zone, including
	 * holes, which is calculated as:
	 * 	spanned_pages = zone_end_pfn - zone_start_pfn;
	 *
	 * present_pages is physical pages existing within the zone, which
	 * is calculated as:
	 *	present_pages = spanned_pages - absent_pages(pages in holes);
	 *
	 * managed_pages is present pages managed by the buddy system, which
	 * is calculated as (reserved_pages includes pages allocated by the
	 * bootmem allocator):
	 *	managed_pages = present_pages - reserved_pages;
	 *
	 * So present_pages may be used by memory hotplug or memory power
	 * management logic to figure out unmanaged pages by checking
	 * (present_pages - managed_pages). And managed_pages should be used
	 * by page allocator and vm scanner to calculate all kinds of watermarks
	 * and thresholds.
	 *
	 * Locking rules:
	 *
	 * zone_start_pfn and spanned_pages are protected by span_seqlock.
	 * It is a seqlock because it has to be read outside of zone->lock,
	 * and it is done in the main allocator path.  But, it is written
	 * quite infrequently.
	 *
	 * The span_seq lock is declared along with zone->lock because it is
	 * frequently read in proximity to zone->lock.  It's good to
	 * give them a chance of being in the same cacheline.
	 *
	 * Write access to present_pages at runtime should be protected by
	 * mem_hotplug_begin/end(). Any reader who can't tolerant drift of
	 * present_pages should get_online_mems() to get a stable value.
	 *
	 * Read access to managed_pages should be safe because it's unsigned
	 * long. Write access to zone->managed_pages and totalram_pages are
	 * protected by managed_page_count_lock at runtime. Idealy only
	 * adjust_managed_page_count() should be used instead of directly
	 * touching zone->managed_pages and totalram_pages.
	 */
	unsigned long		managed_pages;
	unsigned long		spanned_pages;
	unsigned long		present_pages;

	const char		*name;

#ifdef CONFIG_MEMORY_ISOLATION
	/*
	 * Number of isolated pageblock. It is used to solve incorrect
	 * freepage counting problem due to racy retrieving migratetype
	 * of pageblock. Protected by zone->lock.
	 */
	unsigned long		nr_isolate_pageblock;
#endif

#ifdef CONFIG_MEMORY_HOTPLUG
	/* see spanned/present_pages for more description */
	seqlock_t		span_seqlock;
#endif

	int initialized;

	/* Write-intensive fields used from the page allocator */
	ZONE_PADDING(_pad1_)

	/* free areas of different sizes */
	struct free_area	free_area[MAX_ORDER];

	/* zone flags, see below */
	unsigned long		flags;

	/* Primarily protects free_area */
	spinlock_t		lock;

	/* Write-intensive fields used by compaction and vmstats. */
	ZONE_PADDING(_pad2_)

	/*
	 * When free pages are below this point, additional steps are taken
	 * when reading the number of free pages to avoid per-cpu counter
	 * drift allowing watermarks to be breached
	 */
	unsigned long percpu_drift_mark;

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
	/* pfn where compaction free scanner should start */
	unsigned long		compact_cached_free_pfn;
	/* pfn where async and sync compaction migration scanner should start */
	unsigned long		compact_cached_migrate_pfn[2];
#endif

#ifdef CONFIG_COMPACTION
	/*
	 * On compaction failure, 1<<compact_defer_shift compactions
	 * are skipped before trying again. The number attempted since
	 * last failure is tracked with compact_considered.
	 */
	unsigned int		compact_considered;
	unsigned int		compact_defer_shift;
	int			compact_order_failed;
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
	/* Set to true when the PG_migrate_skip bits should be cleared */
	bool			compact_blockskip_flush;
#endif

	bool			contiguous;

	ZONE_PADDING(_pad3_)
	/* Zone statistics */
	atomic_long_t		vm_stat[NR_VM_ZONE_STAT_ITEMS];
	atomic_long_t		vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;

watermark: at boot, each zone computes three watermarks: the minimum, low, and high watermark. They are used by the page allocator and by kswapd during page reclaim.

enum zone_watermarks {
	WMARK_MIN,
	WMARK_LOW,
	WMARK_HIGH,
	NR_WMARK
};
#define min_wmark_pages(z) (z->watermark[WMARK_MIN])
#define low_wmark_pages(z) (z->watermark[WMARK_LOW])
#define high_wmark_pages(z) (z->watermark[WMARK_HIGH])

lowmem_reserve: reserves, per zone, a number of pages for critical allocations that must not fail, with each zone's share weighted by its importance. It keeps the page allocator from over-using the memory of the lower zones: since we cannot know whether memory about to be allocated will be freeable or eventually released, some lower-zone memory must be held back (otherwise the lower zones risk going OOM even though the higher zones still have plenty of freeable memory). The array is recomputed at runtime whenever the sysctl_lowmem_reserve_ratio sysctl changes.

zone_pgdat: points back to the memory node (pglist_data) this zone belongs to.

pageset: maintains per-CPU lists of pages to reduce spinlock contention, and distinguishes hot and cold pages. A page that is still in a CPU's hardware cache is a hot page and is much faster for that CPU to access; otherwise it is a cold page. Because every CPU has its own hardware cache, there is one per_cpu_pageset per CPU.

struct per_cpu_pages {
	int count;		/* number of pages in the list */
	int high;		/* high watermark, emptying needed */
	int batch;		/* chunk size for buddy add/remove */

	/* Lists of pages, one per migrate type stored on the pcp-lists */
	struct list_head lists[MIGRATE_PCPTYPES];
};

struct per_cpu_pageset {
	struct per_cpu_pages pcp;
	......
};
The per-CPU pcp values can be seen in /proc/zoneinfo:

cat /proc/zoneinfo
Node 0, zone      DMA
  ... (per-zone counters and per-CPU pageset values omitted)
Node 0, zone   Normal
  ... (per-zone counters and per-CPU pageset values omitted)

zone_start_pfn: the page frame number at which this zone starts.

spanned_pages = zone_end_pfn - zone_start_pfn: simply the last page frame number minus the first, regardless of whether there are holes in between; that is, spanned_pages includes pages in memory holes.

present_pages = spanned_pages - absent_pages (pages in holes): the number of pages that physically exist in this zone.

managed_pages = present_pages - reserved_pages: the number of pages in this zone that are managed by the buddy system.

The relationship between the three is: spanned_pages ≥ present_pages ≥ managed_pages.

Memory hotplug and memory power management logic can use present_pages to find unmanaged pages by checking (present_pages - managed_pages),
while the page allocator and VM scanner should use managed_pages to compute the various watermarks and thresholds.

free_area: the core data structure of the buddy system, an array of free-page-block lists.

lock: a spinlock that mainly protects the free_area array.
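Tying these fields together, the following sketch (my own illustration; in 4.18 managed_pages is still a plain unsigned long, and the *_wmark_pages() macros are quoted above) prints the page counts and watermarks of every populated zone:

#include <linux/module.h>
#include <linux/mmzone.h>

static int __init zone_dump_init(void)
{
	struct zone *zone;

	/* Iterate over every zone, in every node, that contains pages. */
	for_each_populated_zone(zone) {
		pr_info("node %d zone %-8s: spanned=%lu present=%lu managed=%lu\n",
			zone_to_nid(zone), zone->name,
			zone->spanned_pages, zone->present_pages,
			zone->managed_pages);
		pr_info("  watermarks: min=%lu low=%lu high=%lu\n",
			min_wmark_pages(zone), low_wmark_pages(zone),
			high_wmark_pages(zone));
	}
	return 0;
}

static void __exit zone_dump_exit(void) { }

module_init(zone_dump_init);
module_exit(zone_dump_exit);
MODULE_LICENSE("GPL");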

2.4.2 zone_type

enum zone_type {
#ifdef CONFIG_ZONE_DMA
	/*
	 * ZONE_DMA is used when there are devices that are not able
	 * to do DMA to all of addressable memory (ZONE_NORMAL). Then we
	 * carve out the portion of memory that is needed for these devices.
	 * The range is arch specific.
	 *
	 * Some examples
	 *
	 * Architecture		Limit
	 * ---------------------------
	 * parisc, ia64, sparc	<4G
	 * s390			<2G
	 * arm			Various
	 * alpha		Unlimited or 0-16MB.
	 *
	 * i386, x86_64 and multiple other arches
	 * 			<16M.
	 */
	ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
	/*
	 * x86_64 needs two ZONE_DMAs because it supports devices that are
	 * only able to do DMA to the lower 16M but also 32 bit devices that
	 * can only do DMA areas below 4G.
	 */
	ZONE_DMA32,
#endif
	/*
	 * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
	 * performed on pages in ZONE_NORMAL if the DMA devices support
	 * transfers to all addressable memory.
	 */
	ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
	/*
	 * A memory area that is only addressable by the kernel through
	 * mapping portions into its own address space. This is for example
	 * used by i386 to allow the kernel to address the memory beyond
	 * 900MB. The kernel will set up special mappings (page
	 * table entries on i386) for each page that the kernel needs to
	 * access.
	 */
	ZONE_HIGHMEM,
#endif
	ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
	ZONE_DEVICE,
#endif
	__MAX_NR_ZONES

};

(1) My x86_64 configuration:

CONFIG_ZONE_DMA=y
CONFIG_ZONE_DMA32=y
CONFIG_ZONE_DEVICE=y

CentOS 7 – 3.10.0 – x86_64:

# cat /proc/zoneinfo | grep Node
Node 0, zone      DMA
Node 0, zone    DMA32
Node 0, zone   Normal

(2) aarch64:

CONFIG_ZONE_DMA32=y

Kylin V10 SP1–5.4.18–aarch64:

# cat /proc/zoneinfo | grep Node
Node 0, zone    DMA32
Node 0, zone   Normal
Node 0, zone  Movable

Note: the ARM kernel here has no ZONE_DMA zone.

All of these zone divisions refer to physical memory:

ZONE_DMA is memory usable for DMA (Direct Memory Access). The DMA zone manages memory for devices that can only address physical memory at or below 16 MB, typically legacy hardware such as ISA-bus devices, old graphics cards, and sound cards.

DMA works like this: originally, every transfer between a peripheral and memory had to be driven by the CPU, which tied the CPU up and kept it from other work, so the DMA mode was introduced. The CPU merely issues a command to the DMA controller, which performs the transfer itself and notifies the CPU when it is done, freeing the CPU. On 64-bit systems there are two DMA zones.

ZONE_DMA32: for devices that can only access physical memory below 4 GB, such as DMA devices limited to 32-bit addressing.
The ARM configuration above has only ZONE_DMA32, not ZONE_DMA.

ZONE_NORMAL is the directly mapped zone: physical memory from 4 GB upward (in the layout above), mapped from physical addresses into the kernel's virtual address range simply by adding a constant offset.

ZONE_HIGHMEM is the high-memory zone: on 32-bit systems, physical memory above roughly 896 MB; it does not exist on 64-bit systems.

ZONE_MOVABLE is the movable zone: by dividing physical memory into movable and non-movable allocation areas, it helps avoid memory fragmentation.

ZONE_DEVICE (CONFIG_ZONE_DEVICE) is a zone for non-volatile memory on hot-pluggable devices.
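Which zone an allocation comes from is selected with GFP zone modifiers. Here is a hedged sketch of my own (GFP_DMA, GFP_DMA32 and alloc_pages() are standard gfp.h APIs) that allocates one page with and without the DMA32 modifier:

#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/mm.h>	/* page_to_pfn() */

static int __init gfp_zone_demo_init(void)
{
	struct page *normal, *dma32;

	/* No zone modifier: may come from ZONE_NORMAL, falling back
	 * to lower zones if necessary. (GFP_DMA would force ZONE_DMA.) */
	normal = alloc_pages(GFP_KERNEL, 0);

	/* GFP_DMA32 restricts the allocation to ZONE_DMA32,
	 * i.e. physical memory below 4 GB. */
	dma32 = alloc_pages(GFP_KERNEL | GFP_DMA32, 0);

	if (normal) {
		pr_info("normal page at pfn 0x%lx\n", page_to_pfn(normal));
		__free_pages(normal, 0);
	}
	if (dma32) {
		pr_info("dma32 page at pfn 0x%lx\n", page_to_pfn(dma32));
		__free_pages(dma32, 0);
	}
	return 0;
}

static void __exit gfp_zone_demo_exit(void) { }

module_init(gfp_zone_demo_init);
module_exit(gfp_zone_demo_exit);
MODULE_LICENSE("GPL");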

Zone information can be inspected in the /proc/zoneinfo file.


2.5 Memory Layout

The node- and zone-related messages in the kernel boot log:

[    0.007276] NUMA: Node 0 [mem 0x00000000-0x0009ffff] + [mem 0x00100000-0xbfffffff] -> [mem 0x00000000-0xbfffffff]
[    0.007278] NUMA: Node 0 [mem 0x00000000-0xbfffffff] + [mem 0x100000000-0x13fffffff] -> [mem 0x00000000-0x13fffffff]
[    0.007288] NODE_DATA(0) allocated [mem 0x13ffd4000-0x13fffdfff]
[    0.007740] Zone ranges:
[    0.007742]   DMA      [mem 0x0000000000001000-0x0000000000ffffff]
[    0.007745]   DMA32    [mem 0x0000000001000000-0x00000000ffffffff]
[    0.007748]   Normal   [mem 0x0000000100000000-0x000000013fffffff]
[    0.007749]   Device   empty
[    0.007751] Movable zone start for each node
[    0.007755] Early memory node ranges
[    0.007755]   node   0: [mem 0x0000000000001000-0x000000000009dfff]
[    0.007757]   node   0: [mem 0x0000000000100000-0x00000000bfecffff]
[    0.007759]   node   0: [mem 0x00000000bff00000-0x00000000bfffffff]
[    0.007760]   node   0: [mem 0x0000000100000000-0x000000013fffffff]
[    0.007761] Initmem setup node 0 [mem 0x0000000000001000-0x000000013fffffff]
[    0.007771] On node 0, zone DMA: 1 pages in unavailable ranges
[    0.007864] On node 0, zone DMA: 98 pages in unavailable ranges
[    0.027778] On node 0, zone DMA32: 48 pages in unavailable ranges

3. Node-related Functions

3.1 for_each_online_node

/*
 * Bitmasks that are kept for all the nodes.
 */
enum node_states {
	N_POSSIBLE,		/* The node could become online at some point */
	N_ONLINE,		/* The node is online */
	N_NORMAL_MEMORY,	/* The node has regular memory */
#ifdef CONFIG_HIGHMEM
	N_HIGH_MEMORY,		/* The node has regular or high memory */
#else
	N_HIGH_MEMORY = N_NORMAL_MEMORY,
#endif
	N_MEMORY,		/* The node has memory(regular, high, movable) */
	N_CPU,		/* The node has one or more cpus */
	NR_NODE_STATES
};

This enum defines a set of bitmasks describing node states; they are used throughout the kernel to characterize the nodes in the system.
The states are:
(1) N_POSSIBLE: the node could come online at some point; even if it is not online now, it has the potential to be.

(2) N_ONLINE: the node is currently online.

(3) N_NORMAL_MEMORY: the node has regular memory.

(4) N_HIGH_MEMORY: the node has regular or high memory. If high-memory support is not enabled, this is the same bit as N_NORMAL_MEMORY.

(5) N_MEMORY: the node has memory (regular, high, or movable).

(6) N_CPU: the node has one or more CPUs.
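These bits live in the global node_states[] bitmask array and can be queried with small nodemask.h helpers. A short sketch of my own using node_state(), num_possible_nodes() and num_online_nodes():

#include <linux/module.h>
#include <linux/nodemask.h>

static int __init node_state_demo_init(void)
{
	pr_info("possible nodes: %d, online nodes: %d\n",
		num_possible_nodes(), num_online_nodes());

	/* Query individual state bits of node 0. */
	pr_info("node 0: online=%d has_memory=%d has_cpu=%d\n",
		node_state(0, N_ONLINE),
		node_state(0, N_MEMORY),
		node_state(0, N_CPU));
	return 0;
}

static void __exit node_state_demo_exit(void) { }

module_init(node_state_demo_init);
module_exit(node_state_demo_exit);
MODULE_LICENSE("GPL");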

#ifdef CONFIG_NODES_SHIFT
#define NODES_SHIFT     CONFIG_NODES_SHIFT

#define MAX_NUMNODES    (1 << NODES_SHIFT)

For CONFIG_NODES_SHIFT, x86_64 typically has:

CONFIG_NODES_SHIFT=10

giving MAX_NUMNODES = 1 << 10 = 1024, while aarch64 typically has:

CONFIG_NODES_SHIFT=6

giving MAX_NUMNODES = 1 << 6 = 64.

/* FIXME: better would be to fix all architectures to never return
          > MAX_NUMNODES, then the silly min_ts could be dropped. */

#define first_node(src) __first_node(&(src))
static inline int __first_node(const nodemask_t *srcp)
{
	return min_t(int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES));
}

#define next_node(n, src) __next_node((n), &(src))
static inline int __next_node(int n, const nodemask_t *srcp)
{
	return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
}

#if MAX_NUMNODES > 1
#define for_each_node_mask(node, mask)			\
	for ((node) = first_node(mask);			\
		(node) < MAX_NUMNODES;			\
		(node) = next_node((node), (mask)))

#if MAX_NUMNODES > 1
#define for_each_node_state(__node, __state) \
	for_each_node_mask((__node), node_states[__state])

#define for_each_online_node(node) for_each_node_state(node, N_ONLINE)

The for_each_online_node(node) macro iterates over all online nodes. It is implemented via the for_each_node_state() macro, with N_ONLINE as the node state.
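Expanding the macro chain by hand makes the loop explicit:

/*
 * for_each_online_node(node)
 *   => for_each_node_state(node, N_ONLINE)
 *   => for_each_node_mask(node, node_states[N_ONLINE])
 *   => for ((node) = first_node(node_states[N_ONLINE]);
 *           (node) < MAX_NUMNODES;
 *           (node) = next_node((node), node_states[N_ONLINE]))
 */

Each iteration yields the ID of the next node whose bit is set in the node_states[N_ONLINE] bitmask.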

3.2 NODE_DATA

// linux-4.18/arch/arm64/mm/numa.c

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

// linux-4.18/arch/arm64/include/asm/mmzone.h

#ifdef CONFIG_NUMA

#include <asm/numa.h>

extern struct pglist_data *node_data[];
#define NODE_DATA(nid)		(node_data[(nid)])

// linux-4.18/include/linux/mmzone.h

#define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
#define node_spanned_pages(nid)	(NODE_DATA(nid)->node_spanned_pages)

#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
#define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))

static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
{
	return pgdat->node_start_pfn + pgdat->node_spanned_pages;
}

The node_start_pfn and node_end_pfn macros return the start and end page frame numbers of node nid.
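A short sketch of my own showing these accessors in use:

#include <linux/module.h>
#include <linux/mmzone.h>    /* node_start_pfn(), node_end_pfn() */
#include <linux/nodemask.h>  /* for_each_online_node() */

static int __init node_range_demo_init(void)
{
	int nid;

	/* Print the PFN range [start, end) and present pages of each node. */
	for_each_online_node(nid)
		pr_info("node %d: pfn [%lu, %lu), present pages %lu\n",
			nid, node_start_pfn(nid), node_end_pfn(nid),
			node_present_pages(nid));

	return 0;
}

static void __exit node_range_demo_exit(void) { }

module_init(node_range_demo_init);
module_exit(node_range_demo_exit);
MODULE_LICENSE("GPL");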

References

Linux 4.18 source code
《深入Linux内核架构》(Professional Linux Kernel Architecture)
极客时间:趣谈操作系统
《奔跑吧Linux内核》