1. 全局内存分析

1.1 /proc/meminfo

详细参考:《/proc/meminfo》。

while true; do cat /proc/meminfo | grep MemFree; sleep 10; done

1.2 /proc/pagetypeinfo

 

1.3 slab相关问题定位(/proc/slabinfo、/sys/kernel/slab、slabinfo)

在内核中打开slub_debug,相关的工具有slabinfo。

slabinfo的数据来源是/sys/kernel/slab。

slabinfo从/sys/kernel/slab目录获取数据,并格式化输出。每个slab的详细信息可以从/sys/kernel/slab/找出。

关于slab,内核提供CONFIG_DEBUG_SLAB_LEAK用于监测slab的泄漏,但该选项依赖CONFIG_DEBUG_SLAB,对slub不生效。

CONFIG_SLUB_STATS提供每个slab更加详细的统计信息,这些信息用于评估slab分配器的性能,可作为优化分配器的参考。

1.3.1 通过/proc/slabinfo查看slub统计信息

/proc/slabinfo在slab_proc_init()中创建,核心函数是slab_show()。

/*
 * Init-time hook that registers the "slabinfo" proc entry.
 * proc_create() is called with a NULL parent, SLABINFO_RIGHTS as the mode
 * and proc_slabinfo_operations as the file operations.
 * Always returns 0; the result of proc_create() is not checked.
 */
static int __init slab_proc_init(void)
{
    proc_create("slabinfo", SLABINFO_RIGHTS, NULL,
                        &proc_slabinfo_operations);
    return 0;
}

数据结构slabinfo表示每个slab的统计信息,是/proc/slabinfo的数据来源。

/*
 * Per-cache statistics that back one line of /proc/slabinfo.
 * cache_show() zeroes the structure and get_slabinfo() then fills in the
 * object/slab counters, objects_per_slab and cache_order.
 */
struct slabinfo {
    /* Objects currently in use (total objects minus free objects). */
    unsigned long active_objs;
    /* Total number of objects in the cache. */
    unsigned long num_objs;
    /* Number of slabs in use. */
    unsigned long active_slabs;
    /* Total number of slabs. */
    unsigned long num_slabs;
    /* Printed as <sharedavail> in the "slabdata" column group. */
    unsigned long shared_avail;
    /* Printed as <limit> in the "tunables" column group. */
    unsigned int limit;
    /* Printed as <batchcount> in the "tunables" column group. */
    unsigned int batchcount;
    /* Printed as <sharedfactor> in the "tunables" column group. */
    unsigned int shared;
    /* Number of objects held by a single slab. */
    unsigned int objects_per_slab;
    /* Page order of one slab; a slab spans (1 << cache_order) pages. */
    unsigned int cache_order;
};

slab_show()显示/proc/slabinfo头后,遍历所有的struct kmem_cache,然后通过get_slabinfo()获取信息,cache_show()显示信息。

/*
 * seq_file show callback for one entry of the slab cache list.
 * p is a list node embedded in a struct kmem_cache; list_entry() recovers
 * the owning cache from it.
 * Always returns 0.
 */
static int slab_show(struct seq_file *m, void *p)
{
    struct kmem_cache *s = list_entry(p, struct kmem_cache, list);

    /* Emit the column header once, when positioned on the first list entry. */
    if (p == slab_caches.next)
        print_slabinfo_header(m);
    /* Only caches for which is_root_cache() is true get a line of their own. */
    if (is_root_cache(s))
        cache_show(s, m);
    return 0;
}

/*
 * Write the slabinfo header into m: a version line followed by a single
 * line naming every column. The "(statistics)" tag and the globalstat /
 * cpustat column names are emitted only when CONFIG_DEBUG_SLAB is defined.
 */
static void print_slabinfo_header(struct seq_file *m)
{
    /*
     * Output format version, so at least we can change it
     * without _too_ many complaints.
     */
#ifdef CONFIG_DEBUG_SLAB
    seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
#else
    seq_puts(m, "slabinfo - version: 2.1\n");
#endif
    /* The column names are built up piecewise and terminated by one '\n'. */
    seq_puts(m, "# name            <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
    seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
    seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
#ifdef CONFIG_DEBUG_SLAB
    seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> <error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
    seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
#endif
    seq_putc(m, '\n');
}

static void cache_show(struct kmem_cache *s, struct seq_file *m)
{
    struct slabinfo sinfo;

    memset(&sinfo, 0, sizeof(sinfo));
    get_slabinfo(s, &sinfo);------------------------------根据s在所有node中遍历,填充sinfo。

    memcg_accumulate_slabinfo(s, &sinfo);-----------------在未定义CONFIG_MEMCG的时候无效。

    seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
           cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size,
           sinfo.objects_per_slab, (1 << sinfo.cache_order));

    seq_printf(m, " : tunables %4u %4u %4u",
           sinfo.limit, sinfo.batchcount, sinfo.shared);
    seq_printf(m, " : slabdata %6lu %6lu %6lu",
           sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
    slabinfo_show_stats(m, s);
    seq_putc(m, '\n');
}

/*
 * Fill *sinfo with the slab and object counts of cache s.
 * Only active_objs, num_objs, active_slabs, num_slabs, objects_per_slab and
 * cache_order are written; the remaining fields are left as the caller
 * initialised them.
 */
void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
{
    unsigned long nr_slabs = 0;
    unsigned long nr_objs = 0;
    unsigned long nr_free = 0;
    int node;
    struct kmem_cache_node *n;

    /*
     * Use s->node[] to reach the struct kmem_cache_node (n) of each node,
     * then accumulate the slab, object and free-object counts.
     */
    for_each_kmem_cache_node(s, node, n) {
        nr_slabs += node_nr_slabs(n);
        nr_objs += node_nr_objs(n);
        nr_free += count_partial(n, count_free);
    }

    sinfo->active_objs = nr_objs - nr_free;
    sinfo->num_objs = nr_objs;
    /* Both slab counters receive the same total. */
    sinfo->active_slabs = nr_slabs;
    sinfo->num_slabs = nr_slabs;
    /* Slabs of the same struct kmem_cache all use the same page order. */
    sinfo->objects_per_slab = oo_objects(s->oo);
    sinfo->cache_order = oo_order(s->oo);
}

结合上面的代码和下面的输出可知,kmalloc-1024共有78个obj,其中72个在使用中。每个obj占用大小为1248字节,这个大小是包含meta data的。

每个slab包含13个obj,共占用4个页面;一共6个slab,都处于活跃状态。

slabinfo - version: 2.1
# name            <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab> : tunables <limit> <batchcount> <sharedfactor> : slabdata <active_slabs> <num_slabs> <sharedavail>...
kmalloc-2048          27     28   2272   14    8 : tunables    0    0    0 : slabdata      2      2      0
kmalloc-1024          72     78   1248   13    4 : tunables    0    0    0 : slabdata      6      6      0...

/proc/slabinfo和slabinfo工具的objsize不一样,是因为两者差别在是否包含meta data。

两者分别对应struct kmem_cache中的size(/proc/slabinfo显示,包含meta data)和object_size(slabinfo工具显示)。

Name                   Objects Objsize           Space Slabs/Part/Cpu  O/S O %Fr %Ef Flg
...
kmalloc-1024                72    1024           98.3K          6/1/0   13 2  16  75 PZFU
kmalloc-2048                27    2048           65.5K          2/1/0   14 3  50  84 PZFU
...

通过/proc/slabinfo前后对比,可以分析出不同高速缓存的增加或者减少情况。对于分析内存使用,或者高速缓存泄漏问题大有裨益。

1.3.2 slab详细信息接口/sys/kernel/slab

系统initcall阶段,调用slab_sysfs_init()创建/sys/kernel/slab目录。

static int __init slab_sysfs_init(void)
{
    struct kmem_cache *s;
    int err;

    mutex_lock(&slab_mutex);

    slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
...
    mutex_unlock(&slab_mutex);
    resiliency_test();
    return 0;
}

系统在kmem_cache_create()创建高速缓存的时候,会在/sys/kernel/slab目录下创建同名的目录。

在目录下创建一系列节点slab_attr_group,用于设置显示高速缓存的信息。

struct kmem_cache *
kmem_cache_create(const char *name, size_t size, size_t align,
          unsigned long flags, void (*ctor)(void *))
{
...
    s = create_cache(cache_name, size, size,
             calculate_alignment(flags, align, size),
             flags, ctor, NULL, NULL);
...
}

static struct kmem_cache *create_cache(const char *name,
        size_t object_size, size_t size, size_t align,
        unsigned long flags, void (*ctor)(void *),
        struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
...
    err = __kmem_cache_create(s, flags);
...
}

int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
{
...
    err = sysfs_slab_add(s);
    if (err)
        __kmem_cache_release(s);
...
}

static int sysfs_slab_add(struct kmem_cache *s)
{
...
    s->kobj.kset = cache_kset(s);
    err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
    if (err)
        goto out;

    err = sysfs_create_group(&s->kobj, &slab_attr_group);
    if (err)
        goto out_del_kobj;
...
}

/*
 * Attribute group passed to sysfs_create_group() for each cache kobject;
 * its attribute list is the slab_attrs[] array.
 */
static struct attribute_group slab_attr_group = {
    .attrs = slab_attrs,
};

static struct attribute *slab_attrs[] = {
    &slab_size_attr.attr,
    &object_size_attr.attr,
    &objs_per_slab_attr.attr,
    &order_attr.attr,
    &min_partial_attr.attr,
    &cpu_partial_attr.attr,
....
    NULL
};

alloc_calls/free_calls:显示分配/释放者

struct location用于记录slabcache在什么地方分配、释放。还记录了最小/最大age(对应输出中的age字段)以及最小/最大pid。

/*
 * One aggregated call-site record, as printed line by line by
 * list_locations().
 */
struct location {
    /* Printed first on each line: total number of hits for this location. */
    unsigned long count;
    /* Printed with %pS as a symbol; zero is shown as "<not-available>". */
    unsigned long addr;
    /* Divided by count to give the middle (average) value of "age=". */
    long long sum_time;
    /* First value of "age=". */
    long min_time;
    /* Last value of "age=". */
    long max_time;
    /* Lower end of the "pid=" range; printed alone when equal to max_pid. */
    long min_pid;
    /* Upper end of the "pid=" range. */
    long max_pid;
    /* NOTE(review): presumably the CPUs seen for this location; the code that prints it is not shown here. */
    DECLARE_BITMAP(cpus, NR_CPUS);
    /* NOTE(review): presumably the NUMA nodes seen for this location; the code that prints it is not shown here. */
    nodemask_t nodes;
};

alloc_calls_show()用于显示分配者的信息,free_calls_show()用于显示释放者的信息。

两者都通过list_locations()来输出struct location中记录的信息。

/*
 * Show handler for the read-only "alloc_calls" attribute.
 * Returns -ENOSYS unless SLAB_STORE_USER is set in s->flags; otherwise
 * returns the result of list_locations() for TRACK_ALLOC.
 */
static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
{
    if (!(s->flags & SLAB_STORE_USER))
        return -ENOSYS;
    return list_locations(s, buf, TRACK_ALLOC);
}
SLAB_ATTR_RO(alloc_calls);

  /*
   * Show handler for the read-only "free_calls" attribute.
   * Returns -ENOSYS unless SLAB_STORE_USER is set in s->flags; otherwise
   * returns the result of list_locations() for TRACK_FREE.
   */
  static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
  {
      if (!(s->flags & SLAB_STORE_USER))
          return -ENOSYS;
      return list_locations(s, buf, TRACK_FREE);
  }
  SLAB_ATTR_RO(free_calls);

static int list_locations(struct kmem_cache *s, char *buf,
                    enum track_item alloc)
{
...
    for_each_kmem_cache_node(s, node, n) {
        unsigned long flags;
        struct page *page;

        if (!atomic_long_read(&n->nr_slabs))
            continue;

        spin_lock_irqsave(&n->list_lock, flags);
        list_for_each_entry(page, &n->partial, lru)
            process_slab(&t, s, page, alloc, map);
        list_for_each_entry(page, &n->full, lru)
            process_slab(&t, s, page, alloc, map);
        spin_unlock_irqrestore(&n->list_lock, flags);
    }

    for (i = 0; i < t.count; i++) {
        struct location *l = &t.loc[i];

        if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
            break;
        len += sprintf(buf + len, "%7ld ", l->count);------------此location总次数。

        if (l->addr)
            len += sprintf(buf + len, "%pS", (void *)l->addr);---此location对应地址的函数名,否则<not-available>。
        else
            len += sprintf(buf + len, "<not-available>");

        if (l->sum_time != l->min_time) {------------------------age依次显示min/average/max。
            len += sprintf(buf + len, " age=%ld/%ld/%ld",
                l->min_time,
                (long)div_u64(l->sum_time, l->count),
                l->max_time);
        } else
            len += sprintf(buf + len, " age=%ld",
                l->min_time);

        if (l->min_pid != l->max_pid)----------------------------显示pid范围。
            len += sprintf(buf + len, " pid=%ld-%ld",
                l->min_pid, l->max_pid);
        else
            len += sprintf(buf + len, " pid=%ld",
                l->min_pid);
...
        len += sprintf(buf + len, "\n");
    }
...
}

如下分别是kmalloc-32的alloc_calls和free_calls调用者:

      5 register_tracer+0xa2/0x19c age=703110/703970/704186 pid=1
      3 ipc_init_proc_interface+0x2e/0xa4 age=704057/704057/704057 pid=1
...
     21 register_blkdev+0x4c/0xec age=703173/703452/704152 pid=1
     29 disk_expand_part_tbl+0x4a/0xbc age=703072/703748/703896 pid=1-78


   4371 <not-available> age=635262 pid=0
     53 of_clk_init+0x1c0/0x224 age=710262/710262/710262 pid=0
...
      1 led_trigger_set+0x11e/0x1b8 age=709217 pid=1
    154 __of_attach_node_sysfs+0x74/0x114 age=710237/710244/710253 pid=1

通过/proc/slabinfo可以看到不同高速缓存增加或减少的量,通过/sys/kernel/slab/<cache>/下的alloc_calls和free_calls可以更加详细地看到是谁分配或者释放了此高速缓存。

shrink:尽可能释放高速缓存

向shrink节点写入1则会调用kmem_cache_shrink(),尽可能释放高速缓存;写入其他内容返回-EINVAL。

/*
 * Store handler for the "shrink" attribute.
 * When the first character of buf is '1', kmem_cache_shrink(s) is called
 * and length is returned; any other first character yields -EINVAL.
 * The return value of kmem_cache_shrink() is not propagated.
 */
static ssize_t shrink_store(struct kmem_cache *s,
            const char *buf, size_t length)
{
    if (buf[0] == '1')
        kmem_cache_shrink(s);
    else
        return -EINVAL;
    return length;
}
SLAB_ATTR(shrink);

/*
 * Shrink the cache cachep.
 * kasan_cache_shrink() and __kmem_cache_shrink() run between
 * get_online_cpus()/get_online_mems() and the matching put_*() calls,
 * which are released in the reverse order of acquisition.
 * Returns whatever __kmem_cache_shrink() returns.
 */
int kmem_cache_shrink(struct kmem_cache *cachep)
{
    int ret;

    get_online_cpus();
    get_online_mems();
    kasan_cache_shrink(cachep);
    ret = __kmem_cache_shrink(cachep);
    put_online_mems();
    put_online_cpus();
    return ret;
}

store_user:记录调用者信息

使能store_user将调用者信息记录到struct track中。

trace:跟踪高速缓存分配释放

通过写入0/1来关闭/打开对高速缓存的分配释放的跟踪,是通过flags中增加删减SLAB_TRACE来设置的。

/*
 * Show handler for "trace": writes "1\n" into buf when SLAB_TRACE is set
 * in s->flags and "0\n" otherwise. Returns the value returned by sprintf().
 */
static ssize_t trace_show(struct kmem_cache *s, char *buf)
{
    return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
}

/*
 * Store handler for "trace": switch SLAB_TRACE on or off for cache s.
 * Returns -EINVAL when s->refcount > 1, otherwise length.
 * NOTE(review): presumably refcount > 1 means the cache is shared with
 * other users — confirm against the cache-merging code.
 */
static ssize_t trace_store(struct kmem_cache *s, const char *buf,
                            size_t length)
{
    if (s->refcount > 1)
        return -EINVAL;

    /* Always clear the flag first; it is set again only for input '1'. */
    s->flags &= ~SLAB_TRACE;
    if (buf[0] == '1') {
        /* Enabling tracing also clears __CMPXCHG_DOUBLE. */
        s->flags &= ~__CMPXCHG_DOUBLE;
        s->flags |= SLAB_TRACE;
    }
    return length;
}
SLAB_ATTR(trace);

在__slab_alloc()中调用alloc_debug_processing(),在__slab_free()中调用free_debug_processing()。

最终都是通过trace()实现:

/*
 * Log one allocation or free of object when SLAB_TRACE is set on cache s.
 * Does nothing when the flag is clear.
 * alloc selects the "alloc" or "free" label; on a free the object contents
 * (s->object_size bytes) are dumped as well.
 */
static void trace(struct kmem_cache *s, struct page *page, void *object,
                                int alloc)
{
    if (s->flags & SLAB_TRACE) {
        pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
            s->name,
            alloc ? "alloc" : "free",
            object, page->inuse,
            page->freelist);

        if (!alloc)
            print_section(KERN_INFO, "Object ", (void *)object,
                    s->object_size);

        /* Show the current call stack. */
        dump_stack();
    }
}

在shell中ls一下,监控kmalloc-32实际输出如下:

[ 7685.244170] TRACE kmalloc-32 alloc 0xbc99d820 inuse=16 fp=0x  (null)--------------kmalloc-32的alloc,进程是sh。
[ 7685.250561] CPU: 0 PID: 218 Comm: sh Not tainted 4.9.56 #93
[ 7685.256140] 
Call Trace:
[<803006fe>] dump_stack+0x1e/0x3c
[<8012c180>] alloc_debug_processing+0x5c/0x17c
[<8012c46e>] ___slab_alloc.constprop.28+0x1ce/0x22c
[<8012c52c>] __slab_alloc.constprop.27+0x60/0xb0
[<8012c6b0>] __kmalloc+0x134/0x158
[<8018c644>] load_elf_binary+0x254/0x12ec
[<8013ee2a>] search_binary_handler+0x7a/0x1a4
[<8013f8ac>] do_execveat_common+0x4f4/0x6a0
[<8013fd7c>] SyS_execve+0x38/0x4c
[<80046186>] csky_systemcall+0x96/0xe0
[ 7685.300343] TRACE kmalloc-32 free 0xbc99d820 inuse=8 fp=0xbc99d720
[ 7685.306560] Object bc99d820: 2f 6c 69 62 2f 6c 64 2e 73 6f 2e 31 00 6b 6b 6b  /lib/ld.so.1.kkk
[ 7685.315181] Object bc99d830: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b a5  kkkkkkkkkkkkkkk.
[ 7685.323807] CPU: 0 PID: 218 Comm: which Not tainted 4.9.56 #93
[ 7685.329647] 
Call Trace:
[<803006fe>] dump_stack+0x1e/0x3c
[<8012cd34>] free_debug_processing+0x28c/0x3b8
[<8012d05c>] __slab_free+0x1fc/0x310
[<8012d6c8>] kfree+0x148/0x178
[<8018d438>] load_elf_binary+0x1048/0x12ec
[<8013ee2a>] search_binary_handler+0x7a/0x1a4
[<8013f8ac>] do_execveat_common+0x4f4/0x6a0
[<8013fd7c>] SyS_execve+0x38/0x4c
[<80046186>] csky_systemcall+0x96/0xe0

1.3.3 slabinfo

slabinfo从/sys/kernel/slab中获取数据。

直接输入slabinfo可以获得一个统计信息列表:

Name                   Objects Objsize           Space Slabs/Part/Cpu  O/S O %Fr %Ef Flg
anon_vma                    84      32           24.5K          6/4/0   19 0  66  10 PZFU
anon_vma_chain             111      32           28.6K          7/5/0   19 0  71  12 PZFU
bdev_cache                   4     408            8.1K          1/1/0   12 1 100  19 APaZFU
bio-0                       60     132           36.8K          9/4/0   11 0  44  21 APZFU
bio_integrity_payload        2     104            4.0K          1/1/0   12 0 100   5 APZFU
...

slabinfo -r显示所有单个高速缓存的详细统计信息。

slabinfo -r kmalloc-32查看单个高速缓存的详细信息,包括分配者和释放者信息。

Slabcache: kmalloc-32       Aliases:  0 Order :  0 Objects: 5126

Sizes (bytes)     Slabs              Debug                Memory
------------------------------------------------------------------------
Object :      32  Total  :     321   Sanity Checks : On   Total: 1314816
SlabObj:     256  Full   :     319   Redzoning     : On   Used :  164032
SlabSiz:    4096  Partial:       2   Poisoning     : On   Loss : 1150784
Loss   :     224  CpuSlab:       0   Tracking      : On   Lalig: 1148224
Align  :      32  Objects:      16   Tracing       : Off  Lpadd:       0

kmalloc-32 has no kmem_cache operations

kmalloc-32: Kernel object allocation
-----------------------------------------------------------------------
      5 register_tracer+0xa2/0x19c age=1203320/1204180/1204396 pid=1
      3 ipc_init_proc_interface+0x2e/0xa4 age=1204267/1204267/1204267 pid=1
...
      8 blk_mq_realloc_hw_ctxs+0x1b8/0x3f0 age=1204078/1204082/1204086 pid=1
      8 blk_mq_init_allocated_queue+0x3e/0x2cc age=1204078/1204082/1204086 pid=1

kmalloc-32: Kernel object freeing
------------------------------------------------------------------------
   4371 <not-available> age=1129512 pid=0
     53 of_clk_init+0x1c0/0x224 age=1204512/1204512/1204512 pid=0
      1 free_resource+0x62/0x70 age=1204203 pid=1
...
      1 serio_handle_event+0x162/0x248 age=1203596 pid=19
      2 media_entity_graph_walk_cleanup+0x1e/0x30 age=1203347/1203360/1203373 pid=98
      1 led_trigger_set+0x11e/0x1b8 age=1203467 pid=1
    154 __of_attach_node_sysfs+0x74/0x114 age=1204487/1204494/1204503 pid=1

kmalloc-32: No NUMA information available.

slabinfo -s对所有高速缓存尽量进行释放(shrink),slabinfo -s <cache name>则只释放单个高速缓存。

slabinfo -T显示所有高速缓存的整体统计信息。

1.4 /proc/vmallocinfo

1.5 /proc/iomem

1.6 /proc/vmstat

2. 进程内存分析

/proc/xxx/maps简要记录

glibc提供的malloc()的调试工具