目录

一、概述

二、初始化流程

2.1 main之前

2.2 rte_eal_init

2.3 rte_eal_cpu_init

2.4 eal_parse_args

2.4.1 eal_parse_common_option

2.4.2 eal_create_runtime_dir

2.4.3 eal_adjust_config

2.5 eal_plugins_init

2.6 eal_option_device_parse

2.7 rte_config_init

2.7.1 rte_eal_config_create

2.7.2 eal_update_mem_config


一、概述

分析一下dpdk源码,基于dpdk19.02

二、初始化流程

2.1 main之前

main执行之前,会先执行带constructor属性的函数,DPDK通过如下几个宏来定义这类函数,共有四个优先级

[lib/librte_eal/common/include/rte_common.h]

/*
 * Constructor priorities, used as the argument of
 * __attribute__((constructor(N))).  With GCC, a constructor with a smaller
 * priority number runs before one with a larger number, so the resulting
 * order is LOG, BUS, CLASS and finally LAST.
 */
#define RTE_PRIORITY_LOG 101
#define RTE_PRIORITY_BUS 110
#define RTE_PRIORITY_CLASS 120
#define RTE_PRIORITY_LAST 65535

/* Pastes a short priority name (LOG, BUS, CLASS, LAST) onto RTE_PRIORITY_. */
#define RTE_PRIO(prio) \
	RTE_PRIORITY_ ## prio
/*
 * Emits the header of a static constructor function `func` with the given
 * priority; the user of the macro supplies the function body right after it.
 */
#define RTE_INIT_PRIO(func, prio) \
static void __attribute__((constructor(RTE_PRIO(prio)), used)) func(void)

/* Shorthand for RTE_INIT_PRIO(func, LAST). */
#define RTE_INIT(func) \
	RTE_INIT_PRIO(func, LAST)

调用RTE_INIT_PRIO主要集中在如下几处:

[lib/librte_eal/common/eal_common_log.c]

RTE_INIT_PRIO(rte_log_init, LOG)

/*
 * Defines a constructor named businitfn_<nm>, declared through
 * RTE_INIT_PRIO with the BUS priority, that stores RTE_STR(nm) in
 * (bus).name and then calls rte_bus_register(&bus).
 */
#define RTE_REGISTER_BUS(nm, bus) \
RTE_INIT_PRIO(businitfn_ ##nm, BUS) \
{\
	(bus).name = RTE_STR(nm);\
	rte_bus_register(&bus); \
}

/*
 * Defines a constructor named classinitfn_<nm>, declared through
 * RTE_INIT_PRIO with the CLASS priority, that stores RTE_STR(nm) in
 * (cls).name and then calls rte_class_register(&cls).
 */
#define RTE_REGISTER_CLASS(nm, cls) \
RTE_INIT_PRIO(classinitfn_ ##nm, CLASS) \
{\
	(cls).name = RTE_STR(nm); \
	rte_class_register(&cls); \
}

和kernel的驱动模型类似,DPDK抽象了bus,class这些结构,

/*
 * Generic bus object: a tail-queue link, a name, a per-bus configuration
 * and a set of callbacks (scan, probe, plug/unplug, parse, ...) that each
 * concrete bus fills in.
 */
struct rte_bus {
	TAILQ_ENTRY(rte_bus) next;   /**< Next bus object in linked list */
	const char *name;            /**< Name of the bus */
	rte_bus_scan_t scan;         /**< Scan for devices attached to bus */
	rte_bus_probe_t probe;       /**< Probe devices on bus */
	rte_bus_find_device_t find_device; /**< Find a device on the bus */
	rte_bus_plug_t plug;         /**< Probe single device for drivers */
	rte_bus_unplug_t unplug;     /**< Remove single device from driver */
	rte_bus_parse_t parse;       /**< Parse a device name */
	struct rte_bus_conf conf;    /**< Bus configuration */
	rte_bus_get_iommu_class_t get_iommu_class; /**< Get iommu class */
	rte_dev_iterate_t dev_iterate; /**< Device iterator. */
	rte_bus_hot_unplug_handler_t hot_unplug_handler;
				/**< handle hot-unplug failure on the bus */
	rte_bus_sigbus_handler_t sigbus_handler;
					/**< handle sigbus error on the bus */
};
  • bus主要抽象了一组bus接口,是其他bus类型的基类。

bus的注册也很简单,就是将bus结构挂到全局的rte_bus_list上,在我阅读的这个版本中,主要有以下几种bus类型:

Dpaa_bus.c (drivers\bus\dpaa):RTE_REGISTER_BUS(FSL_DPAA_BUS_NAME, rte_dpaa_bus.bus);
Fslmc_bus.c (drivers\bus\fslmc):RTE_REGISTER_BUS(FSLMC_BUS_NAME, rte_fslmc_bus.bus);
Ifpga_bus.c (drivers\bus\ifpga):RTE_REGISTER_BUS(IFPGA_BUS_NAME, rte_ifpga_bus);
Pci_common.c (drivers\bus\pci):RTE_REGISTER_BUS(pci, rte_pci_bus.bus);
Vdev.c (drivers\bus\vdev):RTE_REGISTER_BUS(vdev, rte_vdev_bus);
Vmbus_common.c (drivers\bus\vmbus):RTE_REGISTER_BUS(vmbus, rte_vmbus_bus.bus);

本文关注其中的两种bus:pci bus和vdev bus;RTE_REGISTER_BUS的第一个参数即为bus name

[drivers/bus/pci/pci_common.c]

/*
 * The PCI bus instance.  Its embedded generic bus (.bus) is filled with the
 * PCI-specific callbacks; device_list and driver_list start out as empty
 * tail queues.
 */
struct rte_pci_bus rte_pci_bus = {
	.bus = {
		.scan = rte_pci_scan,
		.probe = rte_pci_probe,
		.find_device = pci_find_device,
		.plug = pci_plug,
		.unplug = pci_unplug,
		.parse = pci_parse,
		.get_iommu_class = rte_pci_get_iommu_class,
		.dev_iterate = rte_pci_dev_iterate,
		.hot_unplug_handler = pci_hot_unplug_handler,
		.sigbus_handler = pci_sigbus_handler,
	},
	.device_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.device_list),
	.driver_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.driver_list),
};

[drivers/bus/vdev/vdev.c]

/*
 * The virtual-device bus instance.  Unlike the PCI bus it is a plain
 * struct rte_bus; get_iommu_class, hot_unplug_handler and sigbus_handler
 * are not set here and are therefore zero-initialised.
 */
static struct rte_bus rte_vdev_bus = {
	.scan = vdev_scan,
	.probe = vdev_probe,
	.find_device = rte_vdev_find_device,
	.plug = vdev_plug,
	.unplug = vdev_unplug,
	.parse = vdev_parse,
	.dev_iterate = rte_vdev_dev_iterate,
};

/* Register the bus under the name "vdev" from a constructor. */
RTE_REGISTER_BUS(vdev, rte_vdev_bus);

rte_pci_bus继承rte_bus,主要还是应该借鉴了内核的实现——一种bus下挂多个设备,设备由基于特定总线的驱动程序来驱动,构成了三者的对应关系,关于bus的接口调用会在初始化的时候展开。

 

RTE_INIT对应最低优先级的注册,RTE_INIT对应的注册很多,主要是驱动的注册,也有对其进行的基本封装,如pci

/*
 * Defines a constructor named pciinitfn_<nm> (via RTE_INIT) that stores
 * RTE_STR(nm) in (pci_drv).driver.name and calls rte_pci_register(&pci_drv).
 * NOTE(review): the last line still ends with a line-continuation backslash,
 * so the macro presumably continues in the original header; that
 * continuation is not part of this excerpt.
 */
#define RTE_PMD_REGISTER_PCI(nm, pci_drv) \
RTE_INIT(pciinitfn_ ##nm) \
{\
	(pci_drv).driver.name = RTE_STR(nm);\
	rte_pci_register(&pci_drv); \
} \

基于pci总线的驱动有很多,列一下intel相关的网卡:

I40e_ethdev.c (drivers\net\i40e):RTE_PMD_REGISTER_PCI(net_i40e, rte_i40e_pmd);
I40e_ethdev_vf.c (drivers\net\i40e):RTE_PMD_REGISTER_PCI(net_i40e_vf, rte_i40evf_pmd);
Igb_ethdev.c (drivers\net\e1000):RTE_PMD_REGISTER_PCI(net_e1000_igb, rte_igb_pmd);
Igb_ethdev.c (drivers\net\e1000):RTE_PMD_REGISTER_PCI(net_e1000_igb_vf, rte_igbvf_pmd);
Ixgbe_ethdev.c (drivers\net\ixgbe):RTE_PMD_REGISTER_PCI(net_ixgbe, rte_ixgbe_pmd);
Ixgbe_ethdev.c (drivers\net\ixgbe):RTE_PMD_REGISTER_PCI(net_ixgbe_vf, rte_ixgbevf_pmd);

2.2 rte_eal_init

 

2.3 rte_eal_cpu_init

 

这个函数的目的就是要确定运行环境中node(socket), lcore, core的数量和他们之间的对应关系。

确定lcore_id和socket_id (node)的对应关系,注释写的很明白,通过判定下面的路径是否可访问

/sys/devices/system/node/nodeX/cpuX

确定cpu的在位情况,通过下面的信息是否可访问确定(每个lcore对应的core_id):

/sys/devices/system/cpu/cpuX/topology/core_id

最终这些信息都会在DPDK启动时输出出来

/* One line per detected lcore: its id, its core id and its socket id.
 * Both ids are read from lcore_config[], the per-lcore table. */
RTE_LOG(DEBUG, EAL, "Detected lcore %u as " "core %u on socket %u\n",
        lcore_id, lcore_config[lcore_id].core_id, lcore_config[lcore_id].socket_id);
/* Summary lines: totals taken from the global configuration. */
RTE_LOG(INFO, EAL, "Detected %u lcore(s)\n", config->lcore_count);
RTE_LOG(INFO, EAL, "Detected %u NUMA nodes\n", config->numa_node_count);

这些信息存储在全局配置(rte_config)中,即下面带注释的几个字段

struct rte_config {
     uint32_t master_lcore;     
     uint32_t lcore_count;                                                                   //逻辑核的数量
     uint32_t numa_node_count;                                                        //node的数量
     uint32_t numa_nodes[RTE_MAX_NUMA_NODES];                  //node对应的node id
     uint32_t service_lcore_count;
     enum rte_lcore_role_t lcore_role[RTE_MAX_LCORE];              //每个逻辑核的在位情况    enum rte_proc_type_t process_type;
    enum rte_iova_mode iova_mode;
    struct rte_mem_config *mem_config;
 } __attribute__((__packed__));

以及每个lcore的内部配置lcore_config中(socket_id、core_id等字段)

/*
 * Per-lcore internal state: detection flag, thread identity, the two pipes
 * shared with the master, the function/argument/result of the current job,
 * and the topology fields (socket_id, core_id) printed in the
 * "Detected lcore ..." log line.
 */
struct lcore_config {
     unsigned detected;         /**< true if lcore was detected */
     pthread_t thread_id;       /**< pthread identifier */
     int pipe_master2slave[2];  /**< communication pipe with master */
     int pipe_slave2master[2];  /**< communication pipe with master */
     lcore_function_t * volatile f;         /**< function to call */
     void * volatile arg;       /**< argument of function */
     volatile int ret;          /**< return value of function */
     volatile enum rte_lcore_state_t state; /**< lcore state */
     unsigned socket_id;        /**< physical socket id for this lcore */
     unsigned core_id;          /**< core number on socket for this lcore */
     int core_index;            /**< relative index, starting from 0 */
     rte_cpuset_t cpuset;       /**< cpu set which the lcore affinity to */
     uint8_t core_role;         /**< role of core eg: OFF, RTE, SERVICE */
 };

2.4 eal_parse_args

eal_parse_args和eal_log_level_parse流程一致,都是解析DPDK基本的参数,具体过程没什么可说的,把参数选项列一下。

  • --log-level在eal_log_level_parse解析

2.4.1 eal_parse_common_option

  • -b 黑名单
  • -w 白名单,不能与-b同时指定 (eg. -w0000:02:01.0,-w0000:02:05.0)

上述两个参数都会将对应的类型及传递的参数加入devopt_list,具体参见eal_option_device_add。这里仅列一下类型:

/* Kind of device option recorded for the -w, -b and --vdev arguments. */
enum rte_devtype {
	RTE_DEVTYPE_WHITELISTED_PCI, /* from -w (whitelist) */
	RTE_DEVTYPE_BLACKLISTED_PCI, /* from -b (blacklist) */
	RTE_DEVTYPE_VIRTUAL,         /* from --vdev */
};

/*
 * One entry of devopt_list: the device type plus the raw argument string,
 * stored inline in a flexible array member.
 */
struct device_option {
	TAILQ_ENTRY(device_option) next; /* tail-queue link */

	enum rte_devtype type;           /* which option produced this entry */
	char arg[];                      /* argument text as passed on the command line */
};
  • -c coremask, 以掩码的形式指定使用哪个lcore,会综合lcore的在位情况,取交集——只将用户指定使用的cpu置为对应的状态
  • -l corelist
  • --lcores,用法在eal_parse_lcores的注释中找到

使用下面的定义描述core指定方式:

/* Identifies which command-line option was used to select the lcores. */
#define LCORE_OPT_LST 1   // -l
#define LCORE_OPT_MSK 2   //-c
#define LCORE_OPT_MAP 3   //--lcores
  • -s 指定service core,和-c用法一致
  • -S指定service corelist,和-l用法一致

在lcore_config的core_role中以下面的定义标记

/*
 * Role recorded per lcore (lcore_config.core_role, rte_config.lcore_role).
 * NOTE(review): presumably RTE = used by the EAL, OFF = not used,
 * SERVICE = reserved as a service core (-s/-S) -- confirm against the header.
 */
enum rte_lcore_role_t {
	ROLE_RTE,
	ROLE_OFF,
	ROLE_SERVICE,
};
  • -m memory      指定memory大小
  • -n  channels    强制channels
  • -r   ranks         强制ranks

 

  • -d 强制加载外部driver,其实就是使用动态库做插件机制,加入solib_list管理,后续分析eal_plugins_init会说明。
/* Definition for shared object drivers. */
struct shared_driver {
	TAILQ_ENTRY(shared_driver) next; /* tail-queue link */

	char    name[PATH_MAX];          /* path of the shared object */
	void*   lib_handle;              /* library handle; presumably the dlopen() result */
};
  • --huge-unlink
  • --no-huge
  • --legacy-mem
  • --hugepage_unlink(即上面的--huge-unlink,对应internal_config中的hugepage_unlink字段)
  • --no-pci
  • --no-hpet
  • --vmware-tsc-map
  • --no-shconf                 不使用shared conf
  • --in-memory               开启后默认打开--huge-unlink和--no-shconf
  • --proc-type

proc-type主要包括如下类型:

/**
 * The type of process in a linuxapp, multi-process setup
 *
 * Selected with the --proc-type option.
 */
enum rte_proc_type_t {
	RTE_PROC_AUTO = -1,   /* allow auto-detection of primary/secondary */
	RTE_PROC_PRIMARY = 0, /* set to zero, so primary is the default */
	RTE_PROC_SECONDARY,

	RTE_PROC_INVALID
};
  • --master-lcore ,指定master lcore,默认0
  • --vdev   和-b/-w 用法类似,只不过 指定类型是RTE_DEVTYPE_VIRTUAL
  • --syslog 
  • --single-file-segments
  • --iova-mode 指定iommu的工作模式,pa/va
/**
 * IOVA mapping mode.
 *
 * IOVA mapping mode is iommu programming mode of a device.
 * That device (for example: IOMMU backed DMA device) based
 * on rte_iova_mode will generate physical or virtual address.
 *
 * PA and VA are distinct bit values (bit 0 and bit 1); DC is 0.
 */
enum rte_iova_mode {
	RTE_IOVA_DC = 0,	/* Don't care mode */
	RTE_IOVA_PA = (1 << 0), /* DMA using physical address */
	RTE_IOVA_VA = (1 << 1)  /* DMA using virtual address */
};
  • --vfio-intr
{ "legacy", RTE_INTR_MODE_LEGACY },
{ "msi", RTE_INTR_MODE_MSI },
{ "msix", RTE_INTR_MODE_MSIX },
  • --huge-dir     指定hugepage的mount目录
  • --file-prefix   hugefile的前缀
  • --socket-mem
  • --socket-limit
  • --base-virtaddr          指定mmap起始地址,16M对齐
  • --create-uio-dev
  • --mbuf-pool-ops-name
  • --match-allocations

2.4.2 eal_create_runtime_dir

函数比较简单,就是为dpdk创建统一存放各种runtime文件的目录,默认路径如下,对应全局变量runtime_dir

/var/run/dpdk/rte

2.4.3 eal_adjust_config

主要是调整internal_config,包括

  1. 如果没有指定-c/-l/--lcores这些指定运行于哪个lcore的参数(core_parsed), 自动检测亲和性关系
  2. 如果没有指定--master-lcore(master_lcore_parsed),自动选择,一般是第一个可用lcore

基本上,参数解析都存储在internal_config这个结构:

/*
 * Internal EAL configuration: where the parsed command-line options end up.
 * Each comment names the option that sets the field.
 *
 * NOTE(review): force_sockets and match_allocations had lost their type in
 * the garbled text; `volatile unsigned` is restored by analogy with the
 * neighbouring flags (e.g. force_socket_limits) -- confirm against the
 * upstream header.
 */
struct internal_config {
     volatile size_t memory;                               // -m memory
     volatile unsigned force_nchannel;                     // -n channels
     volatile unsigned force_nrank;                        // -r ranks
     volatile unsigned no_hugetlbfs;                       // --no-huge
     unsigned hugepage_unlink;                             // --huge-unlink
     volatile unsigned no_pci;                             // --no-pci
     volatile unsigned no_hpet;                            // --no-hpet
     volatile unsigned vmware_tsc_map;                     // --vmware-tsc-map
     volatile unsigned no_shconf;                          // --no-shconf
     volatile unsigned in_memory;                          // --in-memory

     volatile unsigned create_uio_dev;                     // --create-uio-dev
     volatile enum rte_proc_type_t process_type;           // --proc-type
     volatile unsigned force_sockets;                      // --socket-mem
     volatile uint64_t socket_mem[RTE_MAX_NUMA_NODES];     // --socket-mem
     volatile unsigned force_socket_limits;                // --socket-limit
     volatile uint64_t socket_limit[RTE_MAX_NUMA_NODES];   // --socket-limit
     uintptr_t base_virtaddr;                              // --base-virtaddr
     volatile unsigned legacy_mem;                         // --legacy-mem, enabled by default with --no-huge
     volatile unsigned match_allocations;                  // --match-allocations
     volatile unsigned single_file_segments;               // --single-file-segments
     volatile int syslog_facility;                         // --syslog
     volatile enum rte_intr_mode vfio_intr_mode;           // --vfio-intr
     char *hugefile_prefix;                                // --file-prefix
     char *hugepage_dir;                                   // --huge-dir
     char *user_mbuf_pool_ops_name;                        // --mbuf-pool-ops-name

     unsigned num_hugepage_sizes;
     struct hugepage_info hugepage_info[MAX_HUGEPAGE_SIZES];
     enum rte_iova_mode iova_mode ;                        // --iova-mode
     volatile unsigned int init_complete;
};

 

参数解析过后,rte_config赋值情况:

struct rte_config {
     uint32_t master_lcore;                                                                 // --master-lcore
     uint32_t lcore_count;                                                                   //逻辑核的数量
     uint32_t numa_node_count;                                                        //node的数量
     uint32_t numa_nodes[RTE_MAX_NUMA_NODES];                  //node对应的node id
     uint32_t service_lcore_count;
     enum rte_lcore_role_t lcore_role[RTE_MAX_LCORE];              //每个逻辑核的在位情况    enum rte_proc_type_t process_type;
    enum rte_iova_mode iova_mode;
    struct rte_mem_config *mem_config;
 } __attribute__((__packed__));

2.5 eal_plugins_init

插件机制,就是动态库,插件的路径可以由-d参数指定,也可以由RTE_EAL_PMD_PATH(通过CONFIG_RTE_EAL_PMD_PATH)指定,函数会加载-d指定的so或者将指定路径下满足条件的插件加入管理(eal_plugin_add),并通过dlopen加载

2.6 eal_option_device_parse

还记得在参数解析的时候,有三个参数(-b/-w/--vdev),他们分别将设备的特性注册到了devopt_list上,在本节的函数调用时,bus的scan和probe还没有进行,先将用户对device的一些限制或者要求解析出来,这样后面做bus相关处理的时候就可以应用了。

函数的实现很简单,就是将之前devopt_list的项都摘下来,构造新参数rte_devargs并挂到新的全局devargs_list上

/**
  * Structure that stores a device given by the user with its arguments
  *
  * A user device is a physical or a virtual device given by the user to
  * the DPDK application at startup through command line arguments.
  *
  * The structure stores the configuration of the device, its PCI
  * identifier if it's a PCI device or the driver name if it's a virtual
  * device.
  */
 struct rte_devargs {
     /** Link to the next entry in the devargs list. */
     TAILQ_ENTRY(rte_devargs) next;
     /** Device type (whitelisted PCI, blacklisted PCI or virtual). */
     enum rte_devtype type;
     /** Device policy. */
     enum rte_dev_policy policy;
     /** Device name. */
     char name[RTE_DEV_NAME_MAX_LEN];
     RTE_STD_C11
     union {
     /** Arguments string as given by user or "" for no argument. */
         char *args;
         const char *drv_str;
     };
     struct rte_bus *bus; /**< bus handle. */
     struct rte_class *cls; /**< class handle. */
     const char *bus_str; /**< bus-related part of device string. */
     const char *cls_str; /**< class-related part of device string. */
     const char *data; /**< Device string storage. */
 };

rte_devargs才是描述一个device构造参数的真正结构,之前用户传递的参数对应args变量,这里我们重点关注:

__rte_experimental int rte_devargs_parse(struct rte_devargs *da, const char *dev)
{
    ...
	/* Retrieve eventual bus info */
	do {
		devname = dev;
		bus = rte_bus_find(bus, bus_name_cmp, dev);
		if (bus == NULL)
			break;
		devname = dev + strlen(bus->name) + 1;
		if (rte_bus_find_by_device_name(devname) == bus)
			break;
	} while (1);
	/* Store device name */
	i = 0;
	while (devname[i] != '\0' && devname[i] != ',') {
		da->name[i] = devname[i];
		i++;
		if (i == maxlen) {
			RTE_LOG(WARNING, EAL, "Parsing \"%s\": device name should be shorter than %zu\n",
				dev, maxlen);
			da->name[i - 1] = '\0';
			return -EINVAL;
		}
	}
	da->name[i] = '\0';
	if (bus == NULL) {
		bus = rte_bus_find_by_device_name(da->name);
		if (bus == NULL) {
			RTE_LOG(ERR, EAL, "failed to parse device \"%s\"\n",
				da->name);
			return -EFAULT;
		}
	}
	da->bus = bus;
	/* Parse eventual device arguments */
	if (devname[i] == ',')
		da->args = strdup(&devname[i + 1]);
	else
		da->args = strdup("");
	if (da->args == NULL) {
		RTE_LOG(ERR, EAL, "not enough memory to parse arguments\n");
		return -ENOMEM;
	}
	return 0;
  • rte_bus_find匹配dev和bus->name,如果是参数-w/-b dev是DBDF/BDF的格式,bus->name是总线名称,这时候不会匹配
  • 如果不匹配(没有找到bus)就使用DBDF/BDF填充rte_devargs中的name
  • 这时候由rte_bus_find_by_device_name寻找匹配的bus,深入这个函数可以发现它匹配的是DBDF/BDF这种格式,这时候一定会匹配pci bus,因为只有pci bus是这种格式
  • 最后,对rte_devargs中的bus字段进行赋值。

解析过后,就是对其他rte_devargs字段进行赋值,只贴下代码,很简单,主要就是根据指定类型确定scan_mode和policy:

devargs->type = devtype;
	bus = devargs->bus;
	if (devargs->type == RTE_DEVTYPE_BLACKLISTED_PCI)
		devargs->policy = RTE_DEV_BLACKLISTED;
	if (bus->conf.scan_mode == RTE_BUS_SCAN_UNDEFINED) {
		if (devargs->policy == RTE_DEV_WHITELISTED)
			bus->conf.scan_mode = RTE_BUS_SCAN_WHITELIST;
		else if (devargs->policy == RTE_DEV_BLACKLISTED)
			bus->conf.scan_mode = RTE_BUS_SCAN_BLACKLIST;
	}
	TAILQ_INSERT_TAIL(&devargs_list, devargs, next);

2.7 rte_config_init

在这个阶段,默认情况下执行PRIMARY的流程

2.7.1 rte_eal_config_create

以共享方式(MAP_SHARED)映射/var/run/dpdk/rte/config文件(mem_cfg_fd)

rte_mem_cfg_addr = mmap(rte_mem_cfg_addr, sizeof(*rte_config.mem_config),
				   PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);

这块内存对应全局rte_config的mem_config,映射完成后,将基本的内容进行赋值:

memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
	rte_config.mem_config = rte_mem_cfg_addr;

	/* store address of the config in the config itself so that secondary
	 * processes could later map the config into this exact location */
	rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr;

	rte_config.mem_config->dma_maskbits = 0;

2.7.2 eal_update_mem_config

将配置的参数进行赋值

/*
 * Copy the legacy_mem and single_file_segments settings from
 * internal_config into the memory configuration reached through
 * rte_eal_get_configuration()->mem_config.
 */
static void eal_update_mem_config(void)
{
	struct rte_mem_config *shared_cfg;

	shared_cfg = rte_eal_get_configuration()->mem_config;

	shared_cfg->legacy_mem = internal_config.legacy_mem;
	shared_cfg->single_file_segments = internal_config.single_file_segments;
}

最终看下赋值情况:

/**
  * the structure for the memory configuration for the RTE.
  * Used by the rte_config structure. It is separated out, as for multi-process
  * support, the memory details should be shared across instances
  *
  * NOTE(review): the types of mem_cfg_addr, legacy_mem and dma_maskbits were
  * lost in the garbled text and are restored here from the upstream layout
  * as remembered -- confirm against the header before relying on them.
  */
 struct rte_mem_config {
     volatile uint32_t magic;   /**< Magic number - Sanity check. */
     uint32_t nchannel;    /**< Number of channels (0 if unknown). */
     uint32_t nrank;       /**< Number of ranks (0 if unknown). */
     rte_rwlock_t mlock;   /**< only used by memzone LIB for thread-safe. */
     rte_rwlock_t qlock;   /**< used for tailq operation for thread safe. */
     rte_rwlock_t mplock;  /**< only used by mempool LIB for thread-safe. */
     rte_rwlock_t memory_hotplug_lock;
     struct rte_fbarray memzones; /**< Memzone descriptors. */
     struct rte_memseg_list memsegs[RTE_MAX_MEMSEG_LISTS];
     struct rte_tailq_head tailq_head[RTE_MAX_TAILQ]; /**< Tailqs for objects */
     struct malloc_heap malloc_heaps[RTE_MAX_HEAPS];
     int next_socket_id;
     uint64_t mem_cfg_addr;         /* set in rte_eal_config_create */
     uint32_t legacy_mem;           /* set in eal_update_mem_config */
     uint32_t single_file_segments; /* set in eal_update_mem_config */
     uint8_t dma_maskbits;          /* set to 0 in rte_eal_config_create */
 } __attribute__((__packed__));