Initialization

Most of the kernel's management data structures get their memory from kmalloc. That creates a chicken-and-egg problem for the slab structures themselves: slab metadata is far smaller than a whole page, so it is best served by kmalloc, yet kmalloc only works once slab initialization has finished. For this reason kmem_cache_init statically defines two kmem_cache instances:

kmem_cache and kmem_cache_node.

kmem_cache is the slub cache that allocates struct kmem_cache objects; kmem_cache_node is used to allocate kmem_cache->node. kmem_cache_init first initializes the kmem_cache_node cache, but creating that cache itself requires allocating node structures, so a second chicken-and-egg problem appears. It is resolved with the slab_state enum: while the state is still DOWN, the first nodes are allocated by early_kmem_cache_node_alloc() straight from the buddy allocator (see init_kmem_cache_nodes below).

Reference: "Linux 内存管理之 SLUB分配器(5): slub初始化过程" (And乔的博客).
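Before walking through the real code, here is a minimal userspace model of that bootstrap (not kernel code: the names mirror the kernel's, but the bodies, and calloc standing in for the buddy allocator, are invented for illustration):

#include <stdio.h>
#include <stdlib.h>

/* Model: allocating the allocator's own metadata needs an early path
 * gated by the slab_state enum. */
enum slab_state { DOWN, PARTIAL, UP };
static enum slab_state slab_state = DOWN;

struct kmem_cache_node { int nr_partial; };

struct kmem_cache {
	const char *name;
	struct kmem_cache_node *node;	/* one node here; the kernel has MAX_NUMNODES */
};

/* Boot caches live in static storage, so no allocator is needed for them. */
static struct kmem_cache boot_kmem_cache_node = { "kmem_cache_node", NULL };

/* Stand-in for the buddy-page path of early_kmem_cache_node_alloc(). */
static struct kmem_cache_node *early_kmem_cache_node_alloc(void)
{
	return calloc(1, sizeof(struct kmem_cache_node));
}

/* Normal path: would carve an object out of a slab of the given cache. */
static struct kmem_cache_node *kmem_cache_alloc_node(struct kmem_cache *s)
{
	printf("allocating a node from cache \"%s\"\n", s->name);
	return calloc(1, sizeof(struct kmem_cache_node));
}

static void init_kmem_cache_nodes(struct kmem_cache *s)
{
	if (slab_state == DOWN)		/* kmem_cache_node itself: no cache yet */
		s->node = early_kmem_cache_node_alloc();
	else				/* every later cache can use the boot cache */
		s->node = kmem_cache_alloc_node(&boot_kmem_cache_node);
}

int main(void)
{
	struct kmem_cache boot_kmem_cache = { "kmem_cache", NULL };

	init_kmem_cache_nodes(&boot_kmem_cache_node);	/* early path */
	slab_state = PARTIAL;		/* node structures can now be allocated */
	init_kmem_cache_nodes(&boot_kmem_cache);	/* normal path */
	return 0;
}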

Initializing the statically defined kmem_cache instances:
create_boot_cache    // called for the static boot_kmem_cache and boot_kmem_cache_node
	__kmem_cache_create
		kmem_cache_open
			init_kmem_cache_nodes // allocates the per-node structures
				1: early_kmem_cache_node_alloc    // while boot_kmem_cache_node itself is not ready, its own nodes come from this path, straight from the buddy allocator
				2: kmem_cache_alloc_node    // every later cache gets its nodes directly from boot_kmem_cache_node
					slab_alloc_node   // touches the node structures, which therefore must already exist
			alloc_kmem_cache_cpus
		sysfs_slab_add

Creating the fixed-size slub caches that back kmalloc:
create_kmalloc_caches
	new_kmalloc_cache
		create_kmalloc_cache
			kmem_cache_zalloc
			create_boot_cache
			
Caches of other, dynamically requested sizes:
kmem_cache_create
	kmem_cache_create_usercopy
		create_cache
			__kmem_cache_create

 

start_kernel->mm_init->kmem_cache_init

void __init kmem_cache_init(void)
{
	static __initdata struct kmem_cache boot_kmem_cache,
		boot_kmem_cache_node;

	if (debug_guardpage_minorder())
		slub_max_order = 0;

	kmem_cache_node = &boot_kmem_cache_node;
	kmem_cache = &boot_kmem_cache;

	create_boot_cache(kmem_cache_node, "kmem_cache_node",
		sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);

	register_hotmemory_notifier(&slab_memory_callback_nb);

	/* Able to allocate the per node structures */
	slab_state = PARTIAL;

	create_boot_cache(kmem_cache, "kmem_cache",
			offsetof(struct kmem_cache, node) +
				nr_node_ids * sizeof(struct kmem_cache_node *),
		       SLAB_HWCACHE_ALIGN, 0, 0);

	kmem_cache = bootstrap(&boot_kmem_cache);
	kmem_cache_node = bootstrap(&boot_kmem_cache_node);

	/* Now we can use the kmem_cache to allocate kmalloc slabs */
	setup_kmalloc_cache_index_table();
	create_kmalloc_caches(0);

	/* Setup random freelists for each cache */
	init_freelist_randomization();

	cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
				  slub_cpu_dead);

	pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
		cache_line_size(),
		slub_min_order, slub_max_order, slub_min_objects,
		nr_cpu_ids, nr_node_ids);
}

After both boot caches work, the two bootstrap() calls copy the static structures into objects freshly allocated from the new kmem_cache cache, so the final kmem_cache and kmem_cache_node live in slab-managed memory like every other cache.

create_kmalloc_caches then loops over the standard sizes, creating each kmem_cache through new_kmalloc_cache.

struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
EXPORT_SYMBOL(kmalloc_caches);

#ifdef CONFIG_ZONE_DMA
struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
EXPORT_SYMBOL(kmalloc_dma_caches);
#endif


void __init create_kmalloc_caches(slab_flags_t flags)
{
	int i;

	for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
		if (!kmalloc_caches[i])
			new_kmalloc_cache(i, flags); /* populate the kmalloc_caches array */

		/*
		 * Caches that are not of the two-to-the-power-of size.
		 * These have to be created immediately after the
		 * earlier power of two caches
		 */
		if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)
			new_kmalloc_cache(1, flags); /* kmalloc-96 */
		if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
			new_kmalloc_cache(2, flags); /* kmalloc-192 */
	}

	/* Kmalloc array is now usable */
	slab_state = UP;

#ifdef CONFIG_ZONE_DMA
	for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
		struct kmem_cache *s = kmalloc_caches[i];

		if (s) {
			unsigned int size = kmalloc_size(i);
			char *n = kasprintf(GFP_NOWAIT,
					    "dma-kmalloc-%u", size);

			BUG_ON(!n);
			kmalloc_dma_caches[i] = create_kmalloc_cache(n,
				size, SLAB_CACHE_DMA | flags, 0, 0);
		}
	}
#endif
}

new_kmalloc_cache creates one of the fixed-size caches the kernel prepares in advance; the sizes and names come from the kmalloc_info table.

static void __init new_kmalloc_cache(int idx, slab_flags_t flags)
{
	kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name,
					kmalloc_info[idx].size, flags, 0,
					kmalloc_info[idx].size);
}

/*
 * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
 * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
 * kmalloc-67108864.
 */
const struct kmalloc_info_struct kmalloc_info[] __initconst = {
	{NULL,                      0},		{"kmalloc-96",             96},
	{"kmalloc-192",           192},		{"kmalloc-8",               8},
	{"kmalloc-16",             16},		{"kmalloc-32",             32},
	{"kmalloc-64",             64},		{"kmalloc-128",           128},
	{"kmalloc-256",           256},		{"kmalloc-512",           512},
	{"kmalloc-1024",         1024},		{"kmalloc-2048",         2048},
	{"kmalloc-4096",         4096},		{"kmalloc-8192",         8192},
	{"kmalloc-16384",       16384},		{"kmalloc-32768",       32768},
	{"kmalloc-65536",       65536},		{"kmalloc-131072",     131072},
	{"kmalloc-262144",     262144},		{"kmalloc-524288",     524288},
	{"kmalloc-1048576",   1048576},		{"kmalloc-2097152",   2097152},
	{"kmalloc-4194304",   4194304},		{"kmalloc-8388608",   8388608},
	{"kmalloc-16777216", 16777216},		{"kmalloc-33554432", 33554432},
	{"kmalloc-67108864", 67108864}
};
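The odd ordering of the table (96 and 192 ahead of 8) mirrors kmalloc_index(): sizes up to 192 are special-cased so that requests between the powers of two can fall into kmalloc-96 and kmalloc-192, and anything larger is rounded up to the next power of two. A userspace sketch of that mapping (kmalloc_index_of and fls_ are hypothetical stand-ins for the kernel's kmalloc_index()/size_index[] and fls(); KMALLOC_MIN_SIZE == 8 assumed):

#include <stdio.h>

static unsigned int fls_(unsigned int x)	/* index of the highest set bit */
{
	unsigned int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

static unsigned int kmalloc_index_of(unsigned int size)
{
	if (size == 0)   return 0;	/* ZERO_SIZE_PTR in the kernel */
	if (size <= 8)   return 3;	/* kmalloc-8   */
	if (size <= 16)  return 4;	/* kmalloc-16  */
	if (size <= 32)  return 5;
	if (size <= 64)  return 6;
	if (size <= 96)  return 1;	/* kmalloc-96, the odd slot   */
	if (size <= 128) return 7;
	if (size <= 192) return 2;	/* kmalloc-192, the other one */
	return fls_(size - 1);		/* next power of two */
}

int main(void)
{
	unsigned int sizes[] = { 8, 100, 192, 200, 4097 };
	unsigned int i;

	for (i = 0; i < 5; i++) {
		unsigned int idx = kmalloc_index_of(sizes[i]);
		unsigned int cache = (idx == 1) ? 96 :
				     (idx == 2) ? 192 : 1u << idx;

		printf("kmalloc(%u)\t-> kmalloc-%u (%u bytes wasted)\n",
		       sizes[i], cache, cache - sizes[i]);
	}
	return 0;
}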

kmem_cache_create

The kernel creates a slab cache through the kmem_cache_create() interface, which takes five parameters:

name: name of the slab cache to create
size: size of one object in the cache
align: required alignment of the objects
flags: allocator flags, e.g. the common SLAB_HWCACHE_ALIGN, which aligns the cache's objects to hardware cache lines
ctor: object constructor
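Before the implementation, a typical usage pattern, as a sketch (struct foo, foo_cache and the module boilerplate are hypothetical; error handling kept minimal):

#include <linux/module.h>
#include <linux/slab.h>

struct foo {
	int id;
	char payload[60];
};

static struct kmem_cache *foo_cachep;

/* the ctor runs once per object when a new slab page is populated */
static void foo_ctor(void *obj)
{
	struct foo *f = obj;

	f->id = -1;
}

static int __init foo_init(void)
{
	struct foo *f;

	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				       0, SLAB_HWCACHE_ALIGN, foo_ctor);
	if (!foo_cachep)
		return -ENOMEM;

	f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);	/* allocate one object */
	if (f)
		kmem_cache_free(foo_cachep, f);
	return 0;
}

static void __exit foo_exit(void)
{
	kmem_cache_destroy(foo_cachep);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");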

struct kmem_cache *
kmem_cache_create(const char *name, unsigned int size, unsigned int align,
		slab_flags_t flags, void (*ctor)(void *))
{
	return kmem_cache_create_usercopy(name, size, align, flags, 0, 0,
					  ctor);
}

/* abridged */
struct kmem_cache *
kmem_cache_create_usercopy(const char *name,
		  unsigned int size, unsigned int align,
		  slab_flags_t flags,
		  unsigned int useroffset, unsigned int usersize,
		  void (*ctor)(void *))
{
	struct kmem_cache *s = NULL;
	const char *cache_name;
	int err;




	if (!usersize)
		s = __kmem_cache_alias(name, size, align, flags, ctor);
	if (s)
		goto out_unlock;

	cache_name = kstrdup_const(name, GFP_KERNEL);
	if (!cache_name) {
		err = -ENOMEM;
		goto out_unlock;
	}
	/* create the cache via create_cache() */
	s = create_cache(cache_name, size,
			 calculate_alignment(flags, align, size),
			 flags, useroffset, usersize, ctor, NULL, NULL);
	if (IS_ERR(s)) {
		err = PTR_ERR(s);
		kfree_const(cache_name);
	}

out_unlock:
	/* unlocking elided */
	return s;
}

The create_cache function:

static struct kmem_cache *create_cache(const char *name,
		unsigned int object_size, unsigned int align,
		slab_flags_t flags, unsigned int useroffset,
		unsigned int usersize, void (*ctor)(void *),
		struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
	struct kmem_cache *s;
	int err;

	if (WARN_ON(useroffset + usersize > object_size))
		useroffset = usersize = 0;

	err = -ENOMEM;
	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); /* kmem_cache is the boot cache that allocates struct kmem_cache objects */
	if (!s)
		goto out;

	s->name = name;
	s->size = s->object_size = object_size;
	s->align = align;
	s->ctor = ctor;
	s->useroffset = useroffset;
	s->usersize = usersize;

	err = init_memcg_params(s, memcg, root_cache);
	if (err)
		goto out_free_cache;

	err = __kmem_cache_create(s, flags); /* the SLUB-specific setup */
	if (err)
		goto out_free_cache;

	s->refcount = 1;
	list_add(&s->list, &slab_caches);  /* add to the global slab_caches list */
	memcg_link_cache(s);
out:
	if (err)
		return ERR_PTR(err);
	return s;

out_free_cache:
	destroy_memcg_params(s);
	kmem_cache_free(kmem_cache, s);
	goto out;
}

The __kmem_cache_create function:

int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
{
	int err;

	err = kmem_cache_open(s, flags);
	if (err)
		return err;

	/* Mutex is not taken during early boot */
	if (slab_state <= UP)
		return 0;

	memcg_propagate_slab_attrs(s);
	err = sysfs_slab_add(s);
	if (err)
		__kmem_cache_release(s);

	return err;
}

/* abridged */
static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
{
	/* ... */

	s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
#ifdef CONFIG_SLAB_FREELIST_HARDENED
	s->random = get_random_long();
#endif

	if (!calculate_sizes(s, -1))
		goto error;

	if (!init_kmem_cache_nodes(s))	/* allocate s->node: struct kmem_cache_node *node[MAX_NUMNODES] */
		goto error;

	if (alloc_kmem_cache_cpus(s))	/* allocate the per-cpu slabs: struct kmem_cache_cpu __percpu *cpu_slab */
		return 0;

	/* ... error handling elided ... */
}



static int init_kmem_cache_nodes(struct kmem_cache *s)
{
	int node;

	for_each_node_state(node, N_NORMAL_MEMORY) {
		struct kmem_cache_node *n;

		if (slab_state == DOWN) {
			early_kmem_cache_node_alloc(node);
			continue;
		}
		/*
		 * Allocate a struct kmem_cache_node from the kmem_cache_node
		 * cache. When bootstrapping kmem_cache_node itself this is
		 * the chicken-and-egg case, hence the slab_state check
		 * above: while the state is DOWN,
		 * early_kmem_cache_node_alloc() is used instead.
		 */
		n = kmem_cache_alloc_node(kmem_cache_node,
						GFP_KERNEL, node);

		if (!n) {
			free_kmem_cache_nodes(s);
			return 0;
		}

		init_kmem_cache_node(n);
		s->node[node] = n;
	}
	return 1;
}

 

The kmalloc function


Starting from kmalloc, here is how the kernel hands out small blocks of memory:

kmalloc (SLUB):

static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
	if (__builtin_constant_p(size)) {
		if (size > KMALLOC_MAX_CACHE_SIZE)
			return kmalloc_large(size, flags);
#ifndef CONFIG_SLOB
		if (!(flags & GFP_DMA)) {
			unsigned int index = kmalloc_index(size);

			if (!index)
				return ZERO_SIZE_PTR;

			return kmem_cache_alloc_trace(kmalloc_caches[index],
					flags, size);
		}
#endif
	}
	return __kmalloc(size, flags);
}



void *__kmalloc(size_t size, gfp_t flags)
{
	struct kmem_cache *s;
	void *ret;

	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
		return kmalloc_large(size, flags);

	s = kmalloc_slab(size, flags);   /* 1: look up the kmem_cache for this size */

	if (unlikely(ZERO_OR_NULL_PTR(s)))
		return s;

	ret = slab_alloc(s, flags, _RET_IP_); /* 2: allocate an object from it */

	trace_kmalloc(_RET_IP_, ret, size, s->size, flags);

	ret = kasan_kmalloc(s, ret, size, flags);

	return ret;
}



1: kmalloc_slab looks up the kmem_cache

struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
{
	unsigned int index;

	if (size <= 192) {
		if (!size)
			return ZERO_SIZE_PTR;

		index = size_index[size_index_elem(size)];
	} else {
		if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
			WARN_ON(1);
			return NULL;
		}
		index = fls(size - 1);
	}

#ifdef CONFIG_ZONE_DMA
	if (unlikely((flags & GFP_DMA)))
		return kmalloc_dma_caches[index];
#endif

	return kmalloc_caches[index];   /* return the pre-built cache set up at boot */
}

2: The slab_alloc function

static __always_inline void *slab_alloc(struct kmem_cache *s,
		gfp_t gfpflags, unsigned long addr)
{
	return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
}

kmem_cache_alloc 

Compared with kmalloc, kmem_cache_alloc takes an explicit kmem_cache * pointer and returns objects of that cache's fixed size. Because kmalloc is served from a set of predefined block sizes, it often suffers internal fragmentation: kmalloc(100), for instance, is satisfied from kmalloc-128 and wastes 28 bytes per object. kmem_cache_alloc avoids this waste, since the cache's object size is tailor-made for its user.

void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
	void *ret = slab_alloc(s, gfpflags, _RET_IP_);  /* still allocated by slab_alloc */

	trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
				s->size, gfpflags);

	return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc);

static __always_inline void *slab_alloc_node(struct kmem_cache *s,
		gfp_t gfpflags, int node, unsigned long addr)
{
	void *object;
	struct kmem_cache_cpu *c;
	struct page *page;
	unsigned long tid;

	s = slab_pre_alloc_hook(s, gfpflags);
	if (!s)
		return NULL;
redo:
	/*
	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
	 * enabled. We may switch back and forth between cpus while
	 * reading from one cpu area. That does not matter as long
	 * as we end up on the original cpu again when doing the cmpxchg.
	 *
	 * We should guarantee that tid and kmem_cache are retrieved on
	 * the same cpu. It could be different if CONFIG_PREEMPT so we need
	 * to check if it is matched or not.
	 */
	do {
		tid = this_cpu_read(s->cpu_slab->tid);
		c = raw_cpu_ptr(s->cpu_slab);
	} while (IS_ENABLED(CONFIG_PREEMPT) &&
		 unlikely(tid != READ_ONCE(c->tid)));

	/*
	 * Irqless object alloc/free algorithm used here depends on sequence
	 * of fetching cpu_slab's data. tid should be fetched before anything
	 * on c to guarantee that object and page associated with previous tid
	 * won't be used with current tid. If we fetch tid first, object and
	 * page could be one associated with next tid and our alloc/free
	 * request will be failed. In this case, we will retry. So, no problem.
	 */
	barrier();

	/*
	 * The transaction ids are globally unique per cpu and per operation on
	 * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
	 * occurs on the right processor and that there was no operation on the
	 * linked list in between.
	 */

	object = c->freelist;
	page = c->page;
	if (unlikely(!object || !node_match(page, node))) { /* no free object on this cpu, or node mismatch: take the slow path to get a new slab */
		object = __slab_alloc(s, gfpflags, node, addr, c);
		stat(s, ALLOC_SLOWPATH);
	} else {
		void *next_object = get_freepointer_safe(s, object);

		/*
		 * The cmpxchg will only match if there was no additional
		 * operation and if we are on the right processor.
		 *
		 * The cmpxchg does the following atomically (without lock
		 * semantics!)
		 * 1. Relocate first pointer to the current per cpu area.
		 * 2. Verify that tid and freelist have not been changed
		 * 3. If they were not changed replace tid and freelist
		 *
		 * Since this is without lock semantics the protection is only
		 * against code executing on this cpu *not* from access by
		 * other cpus.
		 */
/* swap cpu_slab->freelist to next_object so the freelist points at the object to hand out next */
		if (unlikely(!this_cpu_cmpxchg_double(
				s->cpu_slab->freelist, s->cpu_slab->tid,
				object, tid,
				next_object, next_tid(tid)))) {

			note_cmpxchg_failure("slab_alloc", s, tid);
			goto redo;
		}
		prefetch_freepointer(s, next_object);
		stat(s, ALLOC_FASTPATH);
	}

	if (unlikely(gfpflags & __GFP_ZERO) && object)
		memset(object, 0, s->object_size);

	slab_post_alloc_hook(s, gfpflags, 1, &object);

	return object;
}
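The fastpath above pops the head of the per-cpu freelist without taking a lock. A stripped-down userspace model of that idea, using a single-word C11 compare-and-swap (the kernel's this_cpu_cmpxchg_double additionally pairs the freelist pointer with the tid so that a task preempted and migrated between CPUs cannot complete a stale update; that protection is deliberately omitted here):

#include <stdatomic.h>
#include <stdio.h>

static _Atomic(void *) freelist;

static void *get_freepointer(void *object)
{
	return *(void **)object;	/* successor stored at the start of the object */
}

static void *fastpath_alloc(void)
{
	void *object, *next;

	do {
		object = atomic_load(&freelist);
		if (!object)
			return NULL;	/* the kernel would take the slow path here */
		next = get_freepointer(object);
	} while (!atomic_compare_exchange_weak(&freelist, &object, next));

	return object;
}

int main(void)
{
	void *slots[2] = { &slots[1], NULL };	/* obj0 -> obj1 -> NULL */

	atomic_store(&freelist, &slots[0]);

	printf("%p\n", fastpath_alloc());	/* &slots[0] */
	printf("%p\n", fastpath_alloc());	/* &slots[1] */
	printf("%p\n", fastpath_alloc());	/* NULL: freelist empty */
	return 0;
}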

The slab_alloc_node call chain:

slab_alloc_node
    __slab_alloc
        ___slab_alloc
            new_slab_objects
                new_slab
                    allocate_slab
                        alloc_slab_page
                        shuffle_freelist  // threads the freelist through the page's objects, storing each object's successor address
                c = raw_cpu_ptr
                c->page = page
            c->freelist = get_freepointer
    get_freepointer_safe  // fetch the address of the next object

___slab_alloc (abridged):

static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c)
{
	void *freelist;
	struct page *page;

	page = c->page;
	if (!page)
		goto new_slab;
redo:

	/* ... */

load_freelist:
	/*
	 * freelist is pointing to the list of objects to be used.
	 * page is pointing to the page from which the objects are obtained.
	 * That page must be frozen for per cpu allocations to work.
	 */
	VM_BUG_ON(!c->page->frozen);
	c->freelist = get_freepointer(s, freelist);
	c->tid = next_tid(c->tid);
	return freelist;

new_slab:

	freelist = new_slab_objects(s, gfpflags, node, &c);

	page = c->page;
	if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
		goto load_freelist; /* jump back to set c->freelist */

	/* ... the rest of the slow path elided ... */

}

The shuffle_freelist function:

static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
{
	void *start;
	void *cur;
	void *next;
	unsigned long idx, pos, page_limit, freelist_count;

	if (page->objects < 2 || !s->random_seq)
		return false;

	freelist_count = oo_objects(s->oo);
	pos = get_random_int() % freelist_count;

	page_limit = page->objects * s->size;
	start = fixup_red_left(s, page_address(page));

	/* First entry is used as the base of the freelist */
	cur = next_freelist_entry(s, page, &pos, start, page_limit,
				freelist_count);
	page->freelist = cur;

	for (idx = 1; idx < page->objects; idx++) {
		setup_object(s, page, cur);
		next = next_freelist_entry(s, page, &pos, start, page_limit,
			freelist_count);
		set_freepointer(s, cur, next);   /* write next's address at cur's free-pointer offset */
		cur = next;
	}
	setup_object(s, page, cur);
	set_freepointer(s, cur, NULL);

	return true;
}
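The traversal order comes from s->random_seq, a permutation precomputed by init_freelist_randomization() at boot. A toy model of the effect, with a Fisher-Yates shuffle standing in for the kernel's cache_random_seq_create():

#include <stdio.h>
#include <stdlib.h>

#define NR 8	/* objects per slab in this toy */

int main(void)
{
	unsigned int seq[NR];
	unsigned int i;

	for (i = 0; i < NR; i++)
		seq[i] = i;

	/* Fisher-Yates shuffle: the precomputed "random_seq" */
	srand(42);
	for (i = NR - 1; i > 0; i--) {
		unsigned int j = rand() % (i + 1);
		unsigned int tmp = seq[i];

		seq[i] = seq[j];
		seq[j] = tmp;
	}

	printf("freelist threading order:");
	for (i = 0; i < NR; i++)
		printf(" obj%u", seq[i]);
	printf("\n");
	return 0;
}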


/* Store fp, the address of the next free object, at offset s->offset inside the current object. */
static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
{
	unsigned long freeptr_addr = (unsigned long)object + s->offset;

#ifdef CONFIG_SLAB_FREELIST_HARDENED
	BUG_ON(object == fp); /* naive detection of double free or corruption */
#endif

	*(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
}
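Taken together, set_freepointer and its counterpart get_freepointer show that the freelist lives inside the free objects themselves, with no external metadata. A self-contained userspace sketch of that threading (OBJ_SIZE, NR_OBJECTS and a free-pointer offset of 0 are arbitrary illustrative choices):

#include <stdio.h>
#include <string.h>

#define OBJ_SIZE   32
#define NR_OBJECTS 4

static unsigned char slab_buf[OBJ_SIZE * NR_OBJECTS];
static void *freelist;
static const size_t fp_offset = 0;	/* models s->offset */

static void set_freepointer(void *object, void *fp)
{
	memcpy((unsigned char *)object + fp_offset, &fp, sizeof(fp));
}

static void *get_freepointer(void *object)
{
	void *fp;

	memcpy(&fp, (unsigned char *)object + fp_offset, sizeof(fp));
	return fp;
}

int main(void)
{
	int i;

	/* thread the freelist through the objects; the last one points to NULL */
	for (i = 0; i < NR_OBJECTS; i++) {
		void *cur = slab_buf + i * OBJ_SIZE;
		void *next = (i + 1 < NR_OBJECTS) ?
			(void *)(slab_buf + (i + 1) * OBJ_SIZE) : NULL;

		set_freepointer(cur, next);
	}
	freelist = slab_buf;

	/* "allocate": pop the head, then advance to the stored next pointer */
	while (freelist) {
		printf("object at %p\n", freelist);
		freelist = get_freepointer(freelist);
	}
	return 0;
}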

Object layout within a slab cache:

(figure omitted: object layout within a SLUB slab)

static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
	unsigned long freepointer_addr;
	void *p;

	if (!debug_pagealloc_enabled())
		return get_freepointer(s, object);

	freepointer_addr = (unsigned long)object + s->offset;
	probe_kernel_read(&p, (void **)freepointer_addr, sizeof(p)); /* safely read sizeof(void *) bytes at freepointer_addr into p; this slot holds the next free object's address */
	return freelist_ptr(s, p, freepointer_addr);
}


static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
				 unsigned long ptr_addr)
{
#ifdef CONFIG_SLAB_FREELIST_HARDENED
	return (void *)((unsigned long)ptr ^ s->random ^ ptr_addr);
#else
	return ptr;
#endif
}
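freelist_ptr is the CONFIG_SLAB_FREELIST_HARDENED hardening: the stored next-pointer is XORed with the per-cache secret s->random and with the address it is stored at, so a leaked or attacker-written freelist entry is useless without the secret. Since XOR is its own inverse, the same function encodes on store and decodes on load, as this userspace sketch demonstrates:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

static uintptr_t cache_random;	/* models s->random, set by get_random_long() */

static void *freelist_ptr(void *ptr, uintptr_t ptr_addr)
{
	return (void *)((uintptr_t)ptr ^ cache_random ^ ptr_addr);
}

int main(void)
{
	int object;
	uintptr_t slot = (uintptr_t)&object;	/* where the pointer would be stored */

	cache_random = (uintptr_t)rand();

	void *stored  = freelist_ptr(&object, slot);	/* encode on store */
	void *decoded = freelist_ptr(stored, slot);	/* decode on load */

	printf("real %p, obfuscated %p, decoded %p\n",
	       (void *)&object, stored, decoded);
	return 0;
}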