基于OpenStack Queens版
一、创建vGPU虚机流程简述
[nova-api进程]
1、nova/api/openstack/compute/servers.py create()
2、nova/compute/api.py create() 调用同文件中 _create_instance()
3、nova/conductor/api.py
build_instances() --> nova\conductor\rpcapi.py build_instances() rpc远程调用
[nova-conductor进程]
4、nova/conductor/manager.py
build_instances()通过self.compute_rpcapi.build_and_run_instance --> rpc远程调用nova/compute/rpcapi.py的build_and_run_instance()
[nova-compute进程]
5、nova/compute/manager.py
build_and_run_instance() --> _do_build_and_run_instance() --> _build_and_run_instance(),调用_build_resources创建对应(vGPU)资源,然后调spawn()方法,创建libvirt实例。
二、spawn方法创建vGPU实例入口函数
1、传入包含vGPU信息的allocations参数到_allocate_mdevs方法,生成并返回mdev设备(allocations表示分配给此instance的资源,通过placement api获取)
2、_get_guest_xml()方法传入mdevs参数,生成实例xml
3、调用libvirt API创建实例
# nova\virt\libvirt\driver.py
def spawn(self, context, instance, image_meta, injected_files,
          admin_password, allocations, network_info=None,
          block_device_info=None):
    """Create a new libvirt guest, wiring any vGPU (mdev) devices into it.

    NOTE: abridged excerpt — in the real driver ``disk_info``,
    ``gen_confdrive`` and ``power_on`` are computed earlier in this method.

    :param allocations: the placement allocations for this instance
        (may include a VGPU resource request).
    """
    # Pick existing unassigned mdevs (or create new ones) to back the
    # VGPU resources found in the allocations.
    mdevs = self._allocate_mdevs(allocations)
    # Render the chosen mdev UUIDs into the guest XML definition.
    xml = self._get_guest_xml(context, instance, network_info,
                              disk_info, image_meta,
                              block_device_info=block_device_info,
                              mdevs=mdevs)
    # Finally define and start the libvirt domain from that XML.
    self._create_domain_and_network(
        context, xml, instance, network_info,
        block_device_info=block_device_info,
        post_xml_callback=gen_confdrive,
        destroy_disks_on_failure=True,
        power_on=power_on)
三、spawn方法中,vGPU相关变量allocations如何生成
传入vGPU资源变量,通过REST方式,调用placement API获取计算节点对应vGPU资源
1,nova\compute\manager.py
def _build_and_run_instance():
    # NOTE: abridged excerpt — the real method takes (self, context,
    # instance, ...) and the try block has matching except handlers.
    try:
        # _build_resources claims networks, block devices and the
        # placement allocations (which include any VGPU request).
        with self._build_resources(context, instance, requested_networks, security_groups, image_meta, block_device_mapping) as resources:
            # allocs (taken from resources['allocations']) is handed to
            # the virt driver so it can see the VGPU resource request.
            self.driver.spawn(context, instance, image_meta, injected_files, admin_password, allocs, network_info=network_info, block_device_info=block_device_info)
def _build_resources():
    # NOTE: abridged excerpt — fetch this instance's allocations
    # (possibly including VGPU) from placement via the report client.
    try:
        resources['allocations'] = (self.reportclient.get_allocations_for_consumer(context,instance.uuid))
2、 nova/scheduler/client/report.py
# 此处get_allocations_for_consumer调用placement API,通过REST方式,获取allocations资源(其中包含vGPU资源)
def get_allocations_for_consumer(self, context, consumer):
    """Fetch one consumer's allocations from the placement API.

    Issues ``GET /allocations/<consumer>`` over REST and returns the
    ``allocations`` dict from the response body (which may contain VGPU
    resources), or an empty dict when the request failed.
    """
    resp = self.get('/allocations/%s' % consumer,
                    global_request_id=context.global_id)
    if resp:
        return resp.json()['allocations']
    return {}
四、allocate_mdevs方法获取mdev设备
1、allocate_mdevs方法,流程概览
1)_vgpu_allocations方法,过滤只与vGPU相关的allocations信息
2)_get_supported_vgpu_types方法,从nova配置文件读取当前节点支持的vgpu类型,对应参数enabled_vgpu_types
3)_get_existing_mdevs_not_assigned 方法,获取当前节点可用的mdev设备,即已创建成功,但未分配出去的mdev设备列表。若能获取到有效的mdev设备,则返回设备uuid,否则执行下一步创建mdev设备。
4)_create_new_mediated_device方法,根据物理设备,创建新的mdev设备。适用于初始化mdev设备使用,以及后续新增GPU设备的场景。
调用流程图:
代码实现入口:
@utils.synchronized(VGPU_RESOURCE_SEMAPHORE)
def _allocate_mdevs(self, allocations):
    """Return a list of mediated-device UUIDs backing the VGPU resources
    requested in ``allocations``.

    An existing-but-unassigned mdev is reused when available; otherwise
    a new one is created on a physical GPU with spare capacity.

    :raises exception.ComputeResourcesUnavailable: when no mdev can be
        found or created for the request.
    """
    vgpu_allocations = self._vgpu_allocations(allocations)
    if not vgpu_allocations:
        # No VGPU resource class in the allocations: nothing to do.
        return
    if len(vgpu_allocations) > 1:
        LOG.warning('More than one allocation was passed over to libvirt '
                    'while at the moment libvirt only supports one. Only '
                    'the first allocation will be looked up.')
    alloc = six.next(six.itervalues(vgpu_allocations))
    vgpus_asked = alloc['resources'][fields.ResourceClass.VGPU]
    # Only the first CONF.devices.enabled_vgpu_types entry is honoured.
    requested_types = self._get_supported_vgpu_types()
    # Which mediated devices are created but not assigned to a guest ?
    mdevs_available = self._get_existing_mdevs_not_assigned(
        requested_types)
    chosen_mdevs = []
    for _ in six.moves.range(vgpus_asked):
        if mdevs_available:
            # Take the first available mdev
            mdev = mdevs_available.pop()
        else:
            mdev = self._create_new_mediated_device(requested_types)
        if not mdev:
            # Neither an existing mdev nor a pGPU with spare capacity.
            raise exception.ComputeResourcesUnavailable(
                reason='vGPU resource is not available')
        chosen_mdevs.append(mdev)
    return chosen_mdevs
2、_vgpu_allocations方法分析
_vgpu_allocations方法,过滤allocations,只保留vGPU相关的资源请求
@staticmethod
def _vgpu_allocations(allocations):
if not allocations:
# If no allocations, there is no vGPU request.
return {}
RC_VGPU = fields.ResourceClass.VGPU
vgpu_allocations = {}
for rp in allocations:
res = allocations[rp]['resources']
if RC_VGPU in res and res[RC_VGPU] > 0:
vgpu_allocations[rp] = {'resources': {RC_VGPU: res[RC_VGPU]}}
return vgpu_allocations
3、_get_supported_vgpu_types方法分析
_get_supported_vgpu_types方法,读取计算节点nova配置文件中支持的vGPU类型
# 从nova配置文件中,读取CONF.devices.enabled_vgpu_types的第一个值,只支持一种类型,例如nvidia-319
def _get_supported_vgpu_types(self):
    """Return at most one vGPU type from CONF.devices.enabled_vgpu_types.

    libvirt only handles a single vGPU type per compute node, so any
    extra configured types are ignored (with a warning).
    """
    enabled_types = CONF.devices.enabled_vgpu_types
    if not enabled_types:
        return []
    # TODO(sbauza): Move this check up to compute_manager.init_host
    if len(enabled_types) > 1:
        LOG.warning('libvirt only supports one GPU type per compute node,'
                    ' only first type will be used.')
    return enabled_types[:1]
4、_get_existing_mdevs_not_assigned 方法分析
_get_existing_mdevs_not_assigned 方法,调用libvirt接口,获取当前节点可用的mdev设备
获取未分配状态的mdev设备,为下一步创建mdev设备做准备:
第一步,从所有虚机中,查看已分配出去的mdev设备列表;
第二步,查询当前节点上所有mdev设备列表;
第三步,所有的设备列表,减去已分配的设备列表,就是可用的设备列表,available_mdevs。
def _get_existing_mdevs_not_assigned(self, requested_types=None):
    """Return the set of mdev UUIDs that exist on this host but are not
    attached to any guest.

    :param requested_types: optional list of mdev types used to restrict
        the host device lookup.
    """
    assigned = self._get_all_assigned_mediated_devices()
    host_mdevs = {dev["uuid"]
                  for dev in self._get_mediated_devices(requested_types)}
    # Everything on the host minus what guests already use is available.
    return host_mdevs - set(assigned)
4.1 _get_all_assigned_mediated_devices方法,调用libvirt接口,获取计算节点所有虚机中已分配出去的所有mdev设备,以字典格式返回。
def _get_all_assigned_mediated_devices(self, instance=None):
    """Return a dict mapping mdev UUID -> guest UUID for every mediated
    device currently attached to a guest on this host.

    :param instance: if given, only that instance's guest is inspected;
        an unknown instance yields an empty dict.
    """
    if instance:
        try:
            guests = [self._host.get_guest(instance)]
        except exception.InstanceNotFound:
            # The domain is gone, so it holds no mdevs.
            return {}
    else:
        # Ask libvirt for every defined guest, running or not.
        guests = self._host.list_guests(only_running=False)
    assigned = {}
    for guest in guests:
        # Scan each guest's XML config for mdev hostdev devices.
        config = guest.get_config()
        assigned.update(
            {dev.uuid: guest.uuid for dev in config.devices
             if isinstance(dev, vconfig.LibvirtConfigGuestHostdevMDEV)})
    return assigned
4.2 _get_mediated_devices方法,根据nova配置,过滤指定vGPU类型的mdev设备
# 获取主机mdev设备。从libvirt获取与nova配置CONF.devices.enabled_vgpu_types匹配的所有mdev设备信息,并以列表的格式返回。
def _get_mediated_devices(self, types=None):
    """Return information dicts for the host's mediated devices,
    optionally restricted to the given mdev types."""
    if not self._host.has_min_version(MIN_LIBVIRT_MDEV_SUPPORT):
        # libvirt too old for mdev support: report no devices.
        return []
    names = self._host.list_mediated_devices() or []
    infos = [self._get_mediated_device_information(name)
             for name in names]
    return [dev for dev in infos if not types or dev["type"] in types]
5、_create_new_mediated_device方法创建mdev设备
# Find a physical device able to host a new mediated device, and create it.
def _create_new_mediated_device(self, requested_types, uuid=None):
    """Create a new mediated device on a physical GPU with spare capacity.

    :param requested_types: enabled vGPU types; only the first entry is
        used (libvirt supports one type per node).
    :param uuid: optional UUID for the new mdev; autogenerated otherwise.
    :returns: the new mdev UUID, or None (implicitly) when no physical
        device has room left for the requested type.
    """
    # All host devices (pGPUs) able to create mdevs of the wanted types.
    devices = self._get_mdev_capable_devices(requested_types)
    for device in devices:
        asked_type = requested_types[0]
        if device['types'][asked_type]['availableInstances'] > 0:
            # That physical GPU has enough room for a new mdev
            dev_name = device['dev_id']
            # We need the PCI address, not the libvirt name
            # The libvirt name is like 'pci_0000_84_00_0'
            pci_addr = "{}:{}:{}.{}".format(*dev_name[4:].split('_'))
            # privsep writes the UUID into the sysfs 'create' node.
            # NOTE(review): indentation reconstructed from upstream Nova —
            # the first device with capacity wins and we return right away,
            # not after the whole loop.
            chosen_mdev = nova.privsep.libvirt.create_mdev(pci_addr,
                                                           asked_type,
                                                           uuid=uuid)
            return chosen_mdev
5.1 _get_mdev_capable_devices()方法,获取支持mdev类型的主机物理设备(pGPU卡)
def _get_mdev_capable_devices(self, types=None):
    """Return the host physical devices (e.g. pGPUs) that are able to
    create mediated devices, optionally filtered by mdev types."""
    if not self._host.has_min_version(MIN_LIBVIRT_MDEV_SUPPORT):
        return []
    names = self._host.list_mdev_capable_devices() or []
    capabilities = (self._get_mdev_capabilities_for_dev(name, types)
                    for name in names)
    # Drop devices that support none of the requested types.
    return [dev for dev in capabilities if dev["types"]]
#5.1.1 list_mdev_capable_devices()方法,调用libvirt接口,查找支持mdev功能的设备
def list_mdev_capable_devices(self, flags=0):
    """Lookup devices supporting mdev capabilities.

    :param flags: optional libvirt flags forwarded to the lookup.
    :returns: a list of virNodeDevice instance
    """
    capability = "mdev_types"
    return self._list_devices(capability, flags=flags)
def _list_devices(self, cap, flags=0):
    """Lookup devices.

    :returns: a list of virNodeDevice instance
    """
    try:
        return self.get_connection().listDevices(cap, flags)
    except libvirt.libvirtError as ex:
        if ex.get_error_code() != libvirt.VIR_ERR_NO_SUPPORT:
            raise
        # Old libvirt without listDevices support: degrade gracefully.
        LOG.warning("URI %(uri)s does not support "
                    "listDevices: %(error)s",
                    {'uri': self._uri, 'error': ex})
        return []
#5.1.2 _get_mdev_capabilities_for_dev方法,用于遍历GPU卡的过程中,提取有效的设备信息,并组合成列表形式返回
# 最终单个GPU卡返回一个具有MDEV功能设备的dict,device = {"dev_id": cfgdev.name,"types": {.....}}
# 返回一个具有MDEV功能设备的dict,该设备信息的字典中,字典的第一组信息为ID值的kv,字典的第二组信息为受支持类型的列表,每个类型也都是dict字典类型。
def _get_mdev_capabilities_for_dev(self, devname, types=None):
    """Returns a dict of MDEV capable device with the ID as first key
    and then a list of supported types, each of them being a dict.

    :param types: Only return those specific types.
    """
    node_dev = self._host.device_lookup_by_name(devname)
    cfgdev = vconfig.LibvirtConfigNodeDevice()
    cfgdev.parse_str(node_dev.XMLDesc(0))
    supported = {}
    for mdev_cap in cfgdev.pci_capability.mdev_capability:
        for cap in mdev_cap.mdev_types:
            if types and cap['type'] not in types:
                continue
            supported[cap['type']] = {
                'availableInstances': cap['availableInstances'],
                'name': cap['name'],
                'deviceAPI': cap['deviceAPI'],
            }
    return {
        "dev_id": cfgdev.name,
        "types": supported,
    }
def device_lookup_by_name(self, name):
    """Lookup a node device by its name.

    :returns: a virNodeDevice instance
    """
    conn = self.get_connection()
    return conn.nodeDeviceLookupByName(name)
5.2 create_mdev()方法,指定物理设备的mdev设备,写入随机uuid
通过向/sys/class/mdev_bus/<物理设备PCI地址>/mdev_supported_types/<类型>/create写入一个UUID,即可创建一个指定类型的Mediated Device。
@nova.privsep.sys_admin_pctxt.entrypoint
def create_mdev(physical_device, mdev_type, uuid=None):
    """Instantiate a mediated device.

    Writes a UUID into the sysfs 'create' node for the given type under
    the given parent device, which makes the kernel create the mdev.

    :param physical_device: PCI address of the parent physical device.
    :param mdev_type: mdev type name (e.g. 'nvidia-319').
    :param uuid: UUID for the new mdev; generated when omitted.
    :returns: the UUID of the created mdev.
    """
    mdev_uuid = uuidutils.generate_uuid() if uuid is None else uuid
    sysfs_create = ('/sys/class/mdev_bus/{0}/mdev_supported_types/{1}/create'
                    .format(physical_device, mdev_type))
    with open(sysfs_create, 'w') as f:
        f.write(mdev_uuid)
    return mdev_uuid
五、vGPU资源的释放
虚机挂起、关机、删除,都会释放vGPU资源
1、虚机删除操作,直接调用libvirt的destroy方法,之后vGPU资源释放到资源池。
2、vGPU虚机的暂停、挂起操作,mdev设备的处理
pause()方法调用suspend()方法,suspend调用_detach_mediated_devices()方法,执行detach_device,释放mdev设备
# nova\virt\libvirt\driver.py
def pause(self, instance):
    """Pause VM instance."""
    guest = self._host.get_guest(instance)
    guest.pause()
# nova\virt\libvirt\guest.py
def pause(self):
    """Pause the guest by suspending its underlying domain."""
    self._domain.suspend()
# nova\virt\libvirt\driver.py
def suspend(self, context, instance):
    """Suspend the specified instance."""
    guest = self._host.get_guest(instance)
    # Detach PCI, direct-passthrough and mediated devices before the
    # managed save: the mdevs are released back to the host pool here
    # and must be looked up again when the guest is recreated.
    self._detach_pci_devices(guest,pci_manager.get_instance_pci_devs(instance))
    self._detach_direct_passthrough_ports(context, instance, guest)
    self._detach_mediated_devices(guest)
    guest.save_memory_state()
3、rescue方法,先获取原来vGPU虚机对应mdev设备信息,再组装xml创建实例
与spawn先申请mdev不同,rescue方法先调用_get_all_assigned_mediated_devices()方法,获取该实例已分配的mdev列表,再调用_get_guest_xml()方法,把mdev设备信息组装到xml中。
def rescue():
    # NOTE: abridged pseudo-code excerpt. Unlike spawn(), rescue does
    # not allocate new mdevs: it reuses the mdev UUIDs already assigned
    # to this instance's guest before rebuilding the domain.
    mdevs = self._get_all_assigned_mediated_devices(instance)
    mdevs = list(mdevs.keys())
    xml = self._get_guest_xml(context,...,mdevs=mdevs)
    self._destroy(instance)
    self._create_domain(xml, ...)
4、reboot和power_on,都有调用_hard_reboot方法
在_hard_reboot方法中,也是先调用_get_all_assigned_mediated_devices()方法,传入instance参数,获取实例对应的mdev列表,再调用_get_guest_xml()方法,把mdev设备信息组装到xml中。
删除(挂起)实例时,会清除释放mdev设备;重新调用libivrt创建原来实例时,需要重新获取实例销毁之前使用的mdev设备(列表),以便重用原来的mdev设备。
def _hard_reboot():
    # NOTE: abridged pseudo-code excerpt. Fetch the mdev UUIDs the guest
    # was using before destroying it, so the rebuilt domain reuses the
    # very same devices.
    mdevs = self._get_all_assigned_mediated_devices(instance)
    mdevs = list(mdevs.keys())
    self.destroy(...)
    # Rebuild the guest XML with the previously assigned mdevs.
    xml = self._get_guest_xml(context,...,mdevs=mdevs)
    self._create_domain_and_network()
5、对比新建实例流程(先申请mdev设备,再组装xml)
def spawn():
    # NOTE: abridged pseudo-code excerpt for comparison. A brand-new
    # instance first allocates mdevs from the placement allocations,
    # then renders them into the guest XML.
    mdevs = self._allocate_mdevs(allocations)
    xml = self._get_guest_xml(context,....,mdevs=mdevs)
    self._create_domain_and_network(context, xml,...)
六、vGPU实例迁移
当前冷热迁移都不支持,可以先创建镜像再新建实例,模拟冷迁移操作。