Fusion 虚拟化虚拟化iommu

转载

mob64ca140088a9 2023-09-19 07:21:29

文章标签 Fusion 虚拟化 linux 运维服务器 ci 文章分类 虚拟化云计算

Linux虚拟化之IOMMU

引言
1 IOMMU初始化

1.1 初始化函数集
1.2 初始化流程

2 Intel-IOMMU

2.1 技术原理
2.2 数据结构
2.3 核心流程

2.3.1 intel_iommu_init
2.3.2 iommu_group_get_for_dev
2.3.3 intel_iommu_add_device
2.3.4 iommu_domain_alloc
2.3.5 iommu_attach_device
2.3.6 iommu_map

引言

DMA操作允许设备直接访问内存，但是也带来了诸多问题：

当设备需要大量连续物理内存时，OS未必能满足；
当某些设备不能访问高端内存时，必须采用反弹缓冲区搬移，影响性能；
在虚拟化环境中，存在多设备访问同一物理内存的冲突；

IOMMU与CPU的MMU类似，给设备提供一套虚拟地址空间：设备发出针对虚拟总线地址的访问请求，该请求被送到IOMMU单元翻译成物理地址，从而间接访问物理内存。

1 IOMMU初始化

1.1 初始化函数集

以x86环境为例，在arch/x86/include/asm/iommu_table.h文件中定义了每种IOMMU的初始化函数集结构体：

/*
 * Set of initialization callbacks describing one IOMMU type.
 * __IOMMU_INIT initializes this struct positionally, so the member order
 * below matters.
 */
struct iommu_table_entry {
	initcall_t	detect; /* IOMMU probe function; pci_iommu_alloc() treats a return value > 0 as "this IOMMU type is enabled". */
	initcall_t	depend; /* detect function of another IOMMU type; per the article, used to order the initialization of multiple IOMMU types. */
	void		(*early_init)(void); /* No memory allocate available. */
	void		(*late_init)(void); /* Yes, can allocate memory. */
/* When set in flags, pci_iommu_alloc() stops scanning the remaining entries once this entry's detect returns > 0. */
#define IOMMU_FINISH_IF_DETECTED (1<<0)
/* Set in flags by pci_iommu_alloc() after detect succeeds; pci_iommu_init() only calls late_init on entries carrying it. */
#define IOMMU_DETECTED		 (1<<1)
	int		flags; 
};

此外，在该文件中还定义了生成iommu_table_entry全局变量的宏：

/*
 * Define a static const struct iommu_table_entry named
 * __iommu_entry_<_detect> and place it in the ".iommu_table" section,
 * aligned to sizeof(void *).
 *
 * The initializer is positional: detect, depend, early_init, late_init,
 * flags. flags is IOMMU_FINISH_IF_DETECTED when _finish is non-zero,
 * otherwise 0.
 */
#define __IOMMU_INIT(_detect, _depend, _early_init, _late_init, _finish)\
	static const struct iommu_table_entry				\
		__iommu_entry_##_detect __used				\
	__attribute__ ((unused, __section__(".iommu_table"),		\
			aligned((sizeof(void *)))))	\
	= {_detect, _depend, _early_init, _late_init,			\
	   _finish ? IOMMU_FINISH_IF_DETECTED : 0}

其他的宏IOMMU_INIT_POST/IOMMU_INIT_POST_FINISH，和IOMMU_INIT_FINISH/IOMMU_INIT都是基于__IOMMU_INIT的封装。

1.2 初始化流程

内核加载后，通过start_kernel->mm_init->mem_init->pci_iommu_alloc，执行IOMMU初始化。

Fusion 虚拟化虚拟化iommu_linux

pci_iommu_alloc函数定义在arch/x86/kernel/pci-dma.c文件中：

/*
 * Walk the IOMMU table (__iommu_table .. __iommu_table_end), run each
 * entry's detect callback and, for every entry whose detect returns > 0,
 * mark it IOMMU_DETECTED and run its early_init callback (if any).
 */
void __init pci_iommu_alloc(void)
{
	struct iommu_table_entry *p;

	/* First, sort the callback sets of all IOMMU types. */
	sort_iommu_table(__iommu_table, __iommu_table_end);
	/*
	 * Per the article (the helper's body is not shown here): an entry whose
	 * depend equals its own detect gets depend set to NULL, and an error is
	 * reported if, after sorting, an entry ends up before the entry it
	 * depends on.
	 */
	check_iommu_entries(__iommu_table, __iommu_table_end);

	for (p = __iommu_table; p < __iommu_table_end; p++) {
		/* Only a strictly positive detect() result counts as detected. */
		if (p && p->detect && p->detect() > 0) {
			p->flags |= IOMMU_DETECTED;
			if (p->early_init)
				p->early_init();
			/* This entry asked to end the scan once it is detected. */
			if (p->flags & IOMMU_FINISH_IF_DETECTED)
				break;
		}
	}
}

最后，在pci_iommu_init函数中调用late_init完成最后的IOMMU初始化操作。该函数也定义在arch/x86/kernel/pci-dma.c文件中：

/*
 * Late IOMMU initialization: invoke the x86_init.iommu.iommu_init callback,
 * then call late_init on every table entry that was flagged IOMMU_DETECTED
 * and provides a late_init callback. Always returns 0.
 */
static int __init pci_iommu_init(void)
{
	struct iommu_table_entry *p;

	/* The return value of the iommu_init callback is not checked here. */
	x86_init.iommu.iommu_init();

	for (p = __iommu_table; p < __iommu_table_end; p++) {
		if (p && (p->flags & IOMMU_DETECTED) && p->late_init)
			p->late_init();
	}

	return 0;
}
/* Must execute after PCI subsystem */
rootfs_initcall(pci_iommu_init);

2 Intel-IOMMU

2.1 技术原理

下图是x86物理服务器视图：

Fusion 虚拟化虚拟化iommu_linux_02

如上图，在主桥中有多个DMA Remapping Unit。每个单元管理相关设备的DMA请求，负责将它们的设备虚拟地址转为设备物理地址。图中，DMA Remapping Unit #1管理PCIe Root Port Dev[14:0]及其下属设备; 依次类推，DMA Remapping Unit #4管理PCIe Root Port Dev[30:0]及其下属设备和南桥设备。

注：BIOS通过在ACPI表中的DMA Remapping Reporting Structure 信息来描述这些管理信息。

2.2 数据结构

linux中关于intel IOMMU的相关数据结构如下：

Fusion 虚拟化虚拟化iommu_服务器_03

linux通过全局变量dmar_drhd_units将所有的struct dmar_drhd_unit串成一个链表，该结构对应于上面的DMA Remapping Unit。每个struct dmar_drhd_unit指向一个struct intel_iommu；
每个struct intel_iommu包含一个root entry，用于建立设备与域的映射关系。通过总线号、设备号、功能号索引IOMMU映射的页表基地址；
每个struct intel_iommu包含多个domain，一般一个虚拟机对应一个域。一个域内共享相同的虚拟IO地址空间；
每个struct intel_iommu也嵌套一个struct iommu_device，一个DMA Remapping Unit也是一个硬件设备，从linux设备模型看属于一个特定功能的设备，其中还包括一个iommu_ops的函数集合，该函数集合用于域的构建、释放、域内设备管理等功能；
每个struct dmar_domain嵌套一个struct iommu_domain，其中也包括一个iommu_ops的函数集合；
每个struct dmar_domain还包括一个IOMMU映射的页表基地址；
一个struct iommu_group代表设备透传的最小单元，其中的设备不能在不同的域，也就是不能透传给不同的guest，也不能一部分在host、一部分在guest。所以，每个struct iommu_group属于一个域；

2.3 核心流程

2.3.1 intel_iommu_init

通过IOMMU_INIT_POST(detect_intel_iommu)定义了用于intel-iommu的detect，同时该函数会在pci_swiotlb_detect_4gb后执行。在detect_intel_iommu函数中对全局回调x86_init.iommu.iommu_init赋值为intel_iommu_init，最后会在pci_iommu_init函数中调用该回调(即intel_iommu_init函数)。

/*
 * Initialize the Intel IOMMU driver (excerpt).
 *
 * Lines of the form "# 略略略... ..." are the article's elision markers
 * ("omitted"), not C code; the declarations of ret, iommu and drhd and the
 * bodies of the cleanup labels are among the omitted parts.
 *
 * Returns 0 on success and -ENOMEM when iommu_init_mempool() fails. On every
 * failure path shown here the function panics instead if force_on is set.
 */
int __init intel_iommu_init(void)
{
    # 略略略... ...

	/* Set up the IOMMU memory pool before anything else. */
	if (iommu_init_mempool()) {
		if (force_on)
			panic("tboot: Failed to initialize iommu memory\n");
		return -ENOMEM;
	}

	/* DMAR table and device-scope initialization run under the write lock. */
	down_write(&dmar_global_lock);
	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		goto out_free_dmar;
	}

	if (dmar_dev_scope_init() < 0) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		goto out_free_dmar;
	}

	/* The lock is dropped while the bus notifier is registered, then re-taken. */
	up_write(&dmar_global_lock);

	dmar_register_bus_notifier();

	down_write(&dmar_global_lock);

    # 略略略... ...

	if (dmar_init_reserved_ranges()) {
		if (force_on)
			panic("tboot: Failed to reserve iommu ranges\n");
		goto out_free_reserved_range;
	}

	if (dmar_map_gfx)
		intel_iommu_gfx_mapped = 1;

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		pr_err("Initialization failed\n");
		goto out_free_reserved_range;
	}
	up_write(&dmar_global_lock);

#if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
	/* Clear swiotlb when there is no untrusted device or intel_no_bounce is set. */
	if (!has_untrusted_dev() || intel_no_bounce)
		swiotlb = 0;
#endif
	/* Install intel_dma_ops as the global dma_ops. */
	dma_ops = &intel_dma_ops;

	init_iommu_pm_ops();

	/*
	 * Under the read lock: add every active IOMMU to sysfs, attach
	 * intel_iommu_ops to it and register it. Return values of these calls
	 * are not checked in this excerpt.
	 */
	down_read(&dmar_global_lock);
	for_each_active_iommu(iommu, drhd) {
		iommu_device_sysfs_add(&iommu->iommu, NULL,
				       intel_iommu_groups,
				       "%s", iommu->name);
		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
		iommu_device_register(&iommu->iommu);
	}
	up_read(&dmar_global_lock);

	/* Make intel_iommu_ops the IOMMU ops of the PCI bus type. */
	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
	/* The memory notifier is only registered when si_domain is set and hw_pass_through is not. */
	if (si_domain && !hw_pass_through)
		register_memory_notifier(&intel_iommu_memory_nb);
	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
			  intel_iommu_cpu_dead);

	down_read(&dmar_global_lock);
	/* A probe failure here only produces a warning; initialization continues. */
	if (probe_acpi_namespace_devices())
		pr_warn("ACPI name space devices didn't probe correctly\n");

	/* Finally, we enable the DMA remapping hardware. */
	for_each_iommu(iommu, drhd) {
		/* Skip units marked ignored and units whose translation was already enabled. */
		if (!drhd->ignored && !translation_pre_enabled(iommu))
			iommu_enable_translation(iommu);

		/* Called for every unit, regardless of the condition above. */
		iommu_disable_protect_mem_regions(iommu);
	}
	up_read(&dmar_global_lock);

	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

	intel_iommu_enabled = 1;

	return 0;

	/* Error-path cleanup; the bodies are omitted by the article. */
out_free_reserved_range:
    # 略略略... ...
out_free_dmar:
    # 略略略... ...
}

核心流程如下：

首先，通过iommu_init_mempool函数创建slab缓存池；
然后，通过dmar_table_init函数解析ACPI表中与DMA Remapping Unit相关的信息；
继续，通过dmar_dev_scope_init函数解析每个PCI设备的bus链，并通过bus链将设备添加到struct dmar_drhd_unit中；
继续，通过dmar_init_reserved_ranges函数保留IOAPIC和每个PCI设备的bar空间，这些区域不能用于内存分配；
继续，通过init_no_remapping_devices函数忽略掉相关的struct dmar_drhd_unit，具体是：若struct dmar_drhd_unit下挂设备只有gfx设备，且dmar_map_gfx为0(因为很多图形设备将内存集成在显卡内部)；或者struct dmar_drhd_unit下没有挂设备；
通过init_dmars对每个struct dmar_drhd_unit做正式初始化，包括设置root entry；
继续，设置全局回调函数集dma_ops为intel_dma_ops，该函数集用于dma_alloc_*系列函数；
继续，设置每个struct iommu_device的ops为intel_iommu_ops；
继续，设置pci_bus_type的iommu_ops为intel_iommu_ops；
继续，通过probe_acpi_namespace_devices将acpi namespace设备建立iommu_group；
最后，使能每个struct dmar_drhd_unit，并将intel_iommu_enabled置1；

2.3.2 iommu_group_get_for_dev

iommu_group_get_for_dev函数负责获取iommu_group，若不存在就创建新的iommu_group并将设备添加到iommu_group之中：

/*
 * Return the iommu_group of @dev, obtaining one through the bus's
 * device_group callback and adding @dev to it when the device has no group
 * yet (excerpt).
 *
 * Lines of the form "# 略略略... ..." are the article's elision markers
 * ("omitted"), not C code; the declarations of group and ret are among the
 * omitted parts.
 *
 * Returns the group, or ERR_PTR(ret) when iommu_group_add_device() fails.
 */
struct iommu_group *iommu_group_get_for_dev(struct device *dev)
{
	const struct iommu_ops *ops = dev->bus->iommu_ops;
    # 略略略... ...

	/* Fast path: the device already has a group. */
	group = iommu_group_get(dev);
	if (group)
		return group;

    # 略略略... ...

	/* Ask the bus-specific IOMMU ops for the group this device belongs to. */
	group = ops->device_group(dev);

    # 略略略... ...

	/* A group without a default domain gets one allocated here. */
	if (!group->default_domain) {
		struct iommu_domain *dom;

		dom = __iommu_domain_alloc(dev->bus, iommu_def_domain_type);
		/* If that failed and the default type is not DMA, retry with IOMMU_DOMAIN_DMA. */
		if (!dom && iommu_def_domain_type != IOMMU_DOMAIN_DMA) {
			dom = __iommu_domain_alloc(dev->bus, IOMMU_DOMAIN_DMA);
            # 略略略... ...
		}

		/* dom may still be NULL here; it is stored either way. */
		group->default_domain = dom;
		if (!group->domain)
			group->domain = dom;

		/* In non-strict mode, set the flush-queue attribute on the new domain. */
		if (dom && !iommu_dma_strict) {
			int attr = 1;
			iommu_domain_set_attr(dom,
					      DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE,
					      &attr);
		}
	}

	ret = iommu_group_add_device(group, dev);
	if (ret) {
		/* Drop the group reference before reporting the failure. */
		iommu_group_put(group);
		return ERR_PTR(ret);
	}

	return group;
}

函数中通过ops->device_group创建iommu_group，并创建默认的struct iommu_domain赋值给iommu_group的default_domain和domain字段。当iommu_group透传到虚拟机的时候，其domain会指向虚拟机的域。
在intel-iommu实现中，ops->device_group回调的实现函数为intel_iommu_device_group。在该函数中，对于PCI设备会执行pci_device_group；反之，则执行generic_device_group。

pci_device_group函数如下：

/*
 * Find an existing iommu_group that the PCI device behind @dev should
 * share, or allocate a new one when none is found (excerpt).
 *
 * Lines of the form "# 略略略... ..." are the article's elision markers
 * ("omitted"), not C code; the declarations of pdev, data, bus and group are
 * among the omitted parts.
 */
struct iommu_group *pci_device_group(struct device *dev)
{
    # 略略略... ...
	/* Passed (cast to unsigned long *) to the alias helpers below; presumably a devfn bitmap — TODO confirm. */
	u64 devfns[4] = { 0 };
	
    # 略略略... ...
    
	/* A non-zero result from the DMA-alias walk means data.group was found. */
	if (pci_for_each_dma_alias(pdev, get_pci_alias_or_group, &data))
		return data.group;

	/* Continue from the pdev recorded in data by the alias walk. */
	pdev = data.pdev;

	/* Walk up the bus hierarchy until the root bus is reached. */
	for (bus = pdev->bus; !pci_is_root_bus(bus); bus = bus->parent) {
		/* Buses without a bridge device (bus->self) are skipped. */
		if (!bus->self)
			continue;

		/* Stop climbing once the ACS path check passes for this bridge. */
		if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
			break;

		pdev = bus->self;

		/* Reuse the bridge's group when it already has one. */
		group = iommu_group_get(&pdev->dev);
		if (group)
			return group;
	}

	group = get_pci_alias_group(pdev, (unsigned long *)devfns);
	if (group)
		return group;

	group = get_pci_function_alias_group(pdev, (unsigned long *)devfns);
	if (group)
		return group;

	/* No shared group found, allocate new */
	return iommu_group_alloc();
}

执行流程如下：

首先，通过pci_for_each_dma_alias函数迭代，若成功迭代到group，则返回；否则，将pdev更新到最后迭代的pdev，继续往后执行；
通过for循环，从pdev开始往上遍历总线。每次，若bus对应的桥设备开启了ACS特性(Access Control Services，该特性会关闭PCIE的点对点传输特性，确保下属设备的访问直通到DMA Remapping Unit)，退出循环；否则，若该桥设备已绑定iommu_group，就将其return；
然后，通过get_pci_alias_group函数先查询pdev是否绑定iommu_group，若绑定就将其返回；否则，查询同一个bus上，被pdev设置为dma alias或将pdev设置为dma alias的直接关联设备以及间接关联设备（间接关联指两个设备和某其他相同设备有直接关联或间接关联）。若查询成功，将其group返回，如果group没有，遍历结束返回NULL；
继续在get_pci_function_alias_group中，查询和pdev在同一总线上，有相同设备号、不同功能号的设备，如果pdev和找到的设备都没有开启ACS特性，则将找到设备的group返回（前提是group存在，不存在继续查询，直到返回NULL）

pci_for_each_dma_alias迭代规则如下：

/*
 * Call @fn for @pdev, for each devfn set in its DMA alias mask, and then
 * for bridges found while walking up the bus hierarchy (excerpt).
 *
 * The walk stops as soon as @fn returns non-zero, and that value is
 * returned. Otherwise the result of the last @fn call is returned.
 *
 * Lines of the form "# 略略略... ..." are the article's elision markers
 * ("omitted"), not C code; the declarations of ret and bus are among the
 * omitted parts.
 */
int pci_for_each_dma_alias(struct pci_dev *pdev,
			   int (*fn)(struct pci_dev *pdev,
				     u16 alias, void *data), void *data)
{
    # 略略略... ...
	/* Start with the device itself, using its own device ID as the alias. */
	ret = fn(pdev, pci_dev_id(pdev), data);
	if (ret)
		return ret;

	/* Then every devfn set in the device's DMA alias mask, on the device's own bus. */
	if (unlikely(pdev->dma_alias_mask)) {
		unsigned int devfn;

		for_each_set_bit(devfn, pdev->dma_alias_mask, MAX_NR_DEVFNS) {
			ret = fn(pdev, PCI_DEVID(pdev->bus->number, devfn),
				 data);
			if (ret)
				return ret;
		}
	}

	/* Finally walk up the bus hierarchy until the root bus is reached. */
	for (bus = pdev->bus; !pci_is_root_bus(bus); bus = bus->parent) {
		struct pci_dev *tmp;

		/* Skip virtual buses */
		if (!bus->self)
			continue;

		tmp = bus->self;

		/* stop at bridge where translation unit is associated */
		if (tmp->dev_flags & PCI_DEV_FLAGS_BRIDGE_XLATE_ROOT)
			return ret;

		if (pci_is_pcie(tmp)) {
			/* Each "continue" below advances the enclosing for loop. */
			switch (pci_pcie_type(tmp)) {
			/* Root, upstream and downstream ports: fn is not called. */
			case PCI_EXP_TYPE_ROOT_PORT:
			case PCI_EXP_TYPE_UPSTREAM:
			case PCI_EXP_TYPE_DOWNSTREAM:
				continue;
			/* Alias is devfn 0.0 on the bridge's subordinate bus. */
			case PCI_EXP_TYPE_PCI_BRIDGE:
				ret = fn(tmp,
					 PCI_DEVID(tmp->subordinate->number,
						   PCI_DEVFN(0, 0)), data);
				if (ret)
					return ret;
				continue;
			/* Alias is the bridge's own device ID. */
			case PCI_EXP_TYPE_PCIE_BRIDGE:
				ret = fn(tmp, pci_dev_id(tmp), data);
				if (ret)
					return ret;
				continue;
			}
		} else {
			/* Non-PCIe bridge: the alias depends on the PCIE_BRIDGE_ALIAS flag. */
			if (tmp->dev_flags & PCI_DEV_FLAG_PCIE_BRIDGE_ALIAS)
				ret = fn(tmp,
					 PCI_DEVID(tmp->subordinate->number,
						   PCI_DEVFN(0, 0)), data);
			else
				ret = fn(tmp, pci_dev_id(tmp), data);
			if (ret)
				return ret;
		}
	}

	return ret;
}

执行流程如下：

首先，通过回调查询pdev是否绑定了iommu_group，存在就将其返回，否则继续下一步；
unlikely的条件可以忽略，因为fn回调为get_pci_alias_or_group，其实现并不使用alias参数，传入的PCI_DEVID(pdev->bus->number, devfn)对结果没有影响；
在for循环的迭代核心思想是：从pdev所属总线往上遍历，如果遇到bus具备PCI_DEV_FLAGS_BRIDGE_XLATE_ROOT特性，迭代结束；否则，若设备是PCIE设备，且是PCIE-PCI桥或PCI-PCIE桥，查询其iommu_group，存在就将其返回。若设备不是PCIE设备，就查询其iommu_group；

注：对于PCIE总线，之所以需要查询PCIE-PCI桥或PCI-PCIE桥，是因为PCI接入到PCIE其下属设备均共享source identifier(使用桥设备的bus、device、func)。

2.3.3 intel_iommu_add_device

2.3.4 iommu_domain_alloc

2.3.5 iommu_attach_device

2.3.6 iommu_map

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。

上一篇：Android水平滑动控件 android水平布局

下一篇：java 对象数组调用方法 java对象数组添加新数据

提问和评论都可以，用心的回复会被更多人看到评论

发布评论

相关文章

官方博客	全部文章	热门标签	班级博客
了解我们	网站地图	意见反馈

鸿蒙开发者社区	51CTO学堂
51CTO	软考资讯