Skip to content

Commit bf7254b

Browse files
committed
Update SRIOV port pci_slot when unshelving
There are a few things we need to do to make that work: * Always set the PCIRequest's requester_id. Previously, this was only done for resource requests. The requester_id is the port UUID, so we can use that to correlate which port to update with which pci_slot (in the case of multiple SRIOV ports per instance). This has the side effect of making the fix work only for instances created *after* this patch has been applied. It's not ideal, but there does not appear to be a better way. * Call setup_networks_on_host() within the instance_claim context. This means the instance's pci_devices are updated when we call it, allowing us to get the pci_slot information from them. With the two previous changes in place, we can figure out the port's new pci_slot in _update_port_binding_for_instance(). Closes: bug 1851545 Change-Id: Icfa8c1d6e84eab758af6223a2870078685584aaa (cherry picked from commit 00f1d47)
1 parent 3625d53 commit bf7254b

File tree

5 files changed

+127
-53
lines changed

5 files changed

+127
-53
lines changed

nova/compute/manager.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6556,12 +6556,6 @@ def _unshelve_instance(self, context, instance, image, filter_properties,
65566556
context, self.reportclient, instance.pci_requests.requests,
65576557
provider_mappings)
65586558

6559-
self.network_api.setup_instance_network_on_host(
6560-
context, instance, self.host,
6561-
provider_mappings=provider_mappings)
6562-
network_info = self.network_api.get_instance_nw_info(
6563-
context, instance)
6564-
65656559
accel_info = []
65666560
if accel_uuids:
65676561
try:
@@ -6571,11 +6565,17 @@ def _unshelve_instance(self, context, instance, image, filter_properties,
65716565
LOG.exception('Failure getting accelerator requests '
65726566
'with the exception: %s', exc,
65736567
instance=instance)
6574-
self._build_resources_cleanup(instance, network_info)
6568+
self._build_resources_cleanup(instance, None)
65756569
raise
65766570

65776571
with self.rt.instance_claim(context, instance, node, allocations,
65786572
limits):
6573+
self.network_api.setup_instance_network_on_host(
6574+
context, instance, self.host,
6575+
provider_mappings=provider_mappings)
6576+
network_info = self.network_api.get_instance_nw_info(
6577+
context, instance)
6578+
65796579
self.driver.spawn(context, instance, image_meta,
65806580
injected_files=[],
65816581
admin_password=None,

nova/network/neutron.py

Lines changed: 63 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2106,16 +2106,16 @@ def create_resource_requests(
21062106
port_numa_policy = None
21072107

21082108
if request_net.port_id:
2109+
# InstancePCIRequest.requester_id is semantically linked
2110+
# to a port with a resource_request.
2111+
requester_id = request_net.port_id
21092112
(vnic_type, trusted, network_id, resource_request,
21102113
port_numa_policy) = self._get_port_vnic_info(
21112114
context, neutron, request_net.port_id)
21122115
physnet, tunneled_ = self._get_physnet_tunneled_info(
21132116
context, neutron, network_id)
21142117

21152118
if resource_request:
2116-
# InstancePCIRequest.requester_id is semantically linked
2117-
# to a port with a resource_request.
2118-
requester_id = request_net.port_id
21192119
# NOTE(gibi): explicitly orphan the RequestGroup by setting
21202120
# context=None as we never intended to save it to the DB.
21212121
resource_requests.append(
@@ -3363,6 +3363,37 @@ def _get_pci_mapping_for_migration(self, instance, migration):
33633363
migration.get('status') == 'reverted')
33643364
return instance.migration_context.get_pci_mapping_for_migration(revert)
33653365

3366+
def _get_port_pci_slot(self, context, instance, port):
3367+
"""Find the PCI address of the device corresponding to the port.
3368+
Assumes the port is an SRIOV one.
3369+
3370+
:param context: The request context.
3371+
:param instance: The instance to which the port is attached.
3372+
:param port: The Neutron port, as obtained from the Neutron API
3373+
JSON form.
3374+
:return: The PCI address as a string, or None if unable to find.
3375+
"""
3376+
# Find the port's PCIRequest, or return None
3377+
for r in instance.pci_requests.requests:
3378+
if r.requester_id == port['id']:
3379+
request = r
3380+
break
3381+
else:
3382+
LOG.debug('No PCI request found for port %s', port['id'],
3383+
instance=instance)
3384+
return None
3385+
# Find the request's device, or return None
3386+
for d in instance.pci_devices:
3387+
if d.request_id == request.request_id:
3388+
device = d
3389+
break
3390+
else:
3391+
LOG.debug('No PCI device found for request %s',
3392+
request.request_id, instance=instance)
3393+
return None
3394+
# Return the device's PCI address
3395+
return device.address
3396+
33663397
def _update_port_binding_for_instance(
33673398
self, context, instance, host, migration=None,
33683399
provider_mappings=None):
@@ -3371,7 +3402,6 @@ def _update_port_binding_for_instance(
33713402
search_opts = {'device_id': instance.uuid,
33723403
'tenant_id': instance.project_id}
33733404
data = neutron.list_ports(**search_opts)
3374-
pci_mapping = None
33753405
port_updates = []
33763406
ports = data['ports']
33773407
FAILED_VIF_TYPES = (network_model.VIF_TYPE_UNBOUND,
@@ -3404,25 +3434,36 @@ def _update_port_binding_for_instance(
34043434
# that this function is called without a migration object, such
34053435
# as in an unshelve operation.
34063436
vnic_type = p.get('binding:vnic_type')
3407-
if (vnic_type in network_model.VNIC_TYPES_SRIOV and
3408-
migration is not None and
3409-
not migration.is_live_migration):
3410-
# Note(adrianc): for live migration binding profile was already
3411-
# updated in conductor when calling bind_ports_to_host()
3412-
if not pci_mapping:
3413-
pci_mapping = self._get_pci_mapping_for_migration(
3414-
instance, migration)
3415-
3416-
pci_slot = binding_profile.get('pci_slot')
3417-
new_dev = pci_mapping.get(pci_slot)
3418-
if new_dev:
3419-
binding_profile.update(
3420-
self._get_pci_device_profile(new_dev))
3421-
updates[constants.BINDING_PROFILE] = binding_profile
3437+
if vnic_type in network_model.VNIC_TYPES_SRIOV:
3438+
# NOTE(artom) For migrations, update the binding profile from
3439+
# the migration object...
3440+
if migration is not None:
3441+
# NOTE(artom) ... except for live migrations, because the
3442+
# conductor has already done that whe calling
3443+
# bind_ports_to_host().
3444+
if not migration.is_live_migration:
3445+
pci_mapping = self._get_pci_mapping_for_migration(
3446+
instance, migration)
3447+
3448+
pci_slot = binding_profile.get('pci_slot')
3449+
new_dev = pci_mapping.get(pci_slot)
3450+
if new_dev:
3451+
binding_profile.update(
3452+
self._get_pci_device_profile(new_dev))
3453+
updates[
3454+
constants.BINDING_PROFILE] = binding_profile
3455+
else:
3456+
raise exception.PortUpdateFailed(port_id=p['id'],
3457+
reason=_("Unable to correlate PCI slot %s") %
3458+
pci_slot)
3459+
# NOTE(artom) If migration is None, this is an unshevle, and we
3460+
# need to figure out the pci_slot from the InstancePCIRequest
3461+
# and PciDevice objects.
34223462
else:
3423-
raise exception.PortUpdateFailed(port_id=p['id'],
3424-
reason=_("Unable to correlate PCI slot %s") %
3425-
pci_slot)
3463+
pci_slot = self._get_port_pci_slot(context, instance, p)
3464+
if pci_slot:
3465+
binding_profile.update({'pci_slot': pci_slot})
3466+
updates[constants.BINDING_PROFILE] = binding_profile
34263467

34273468
# NOTE(gibi): during live migration the conductor already sets the
34283469
# allocation key in the port binding. However during resize, cold

nova/tests/functional/libvirt/test_pci_sriov_servers.py

Lines changed: 6 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -434,30 +434,17 @@ def _test_move_operation_with_neutron(self, move_operation,
434434
source_port = self.neutron.show_port(source_port['port']['id'])
435435
same_slot_port = self.neutron.show_port(same_slot_port['port']['id'])
436436

437-
# FIXME(artom) Until bug 1851545 is fixed, unshelve will not update the
438-
# pci_slot.
439-
if expect_fail:
440-
self.assertEqual(
441-
source_port['port']['binding:profile']['pci_slot'],
442-
same_slot_port['port']['binding:profile']['pci_slot'])
443-
else:
444-
self.assertNotEqual(
445-
source_port['port']['binding:profile']['pci_slot'],
446-
same_slot_port['port']['binding:profile']['pci_slot'])
437+
self.assertNotEqual(
438+
source_port['port']['binding:profile']['pci_slot'],
439+
same_slot_port['port']['binding:profile']['pci_slot'])
447440

448441
conn = self.computes['dest'].driver._host.get_connection()
449442
vms = [vm._def for vm in conn._vms.values()]
450443
self.assertEqual(2, len(vms))
451444
for vm in vms:
452445
self.assertEqual(1, len(vm['devices']['nics']))
453-
# FIXME(artom) Until bug 1851545 is fixed, unshelve will not update the
454-
# XML.
455-
if expect_fail:
456-
self.assertEqual(vms[0]['devices']['nics'][0]['source'],
457-
vms[1]['devices']['nics'][0]['source'])
458-
else:
459-
self.assertNotEqual(vms[0]['devices']['nics'][0]['source'],
460-
vms[1]['devices']['nics'][0]['source'])
446+
self.assertNotEqual(vms[0]['devices']['nics'][0]['source'],
447+
vms[1]['devices']['nics'][0]['source'])
461448

462449
def test_unshelve_server_with_neutron(self):
463450
def move_operation(source_server):
@@ -466,10 +453,7 @@ def move_operation(source_server):
466453
self.api.put_service(self.computes['source'].service_ref.uuid,
467454
{'status': 'disabled'})
468455
self._unshelve_server(source_server)
469-
# FIXME(artom) Bug 1851545 means we explain failure here: the pci_slot
470-
# and XML will not get updated.
471-
self._test_move_operation_with_neutron(move_operation,
472-
expect_fail=True)
456+
self._test_move_operation_with_neutron(move_operation)
473457

474458
def test_cold_migrate_server_with_neutron(self):
475459
def move_operation(source_server):

nova/tests/unit/network/test_neutron.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5828,9 +5828,11 @@ def test_create_resource_requests(self, getclient,
58285828
else:
58295829
self.assertNotIn(pci_request.PCI_TRUSTED_TAG, spec)
58305830

5831-
# Only the port with a resource_request will have pci_req.requester_id.
5831+
# Only SRIOV ports and those with a resource_request will have
5832+
# pci_req.requester_id.
58325833
self.assertEqual(
5833-
[None, None, None, None, uuids.trusted_port, None],
5834+
[uuids.portid_1, uuids.portid_3, uuids.portid_4, uuids.portid_5,
5835+
uuids.trusted_port, uuids.portid_vdpa],
58345836
[pci_req.requester_id for pci_req in pci_requests.requests])
58355837

58365838
self.assertCountEqual(
@@ -6381,6 +6383,41 @@ def test_get_segment_id_for_subnet_fails(self, mock_client):
63816383
self.api.get_segment_id_for_subnet,
63826384
self.context, uuids.subnet_id)
63836385

6386+
@mock.patch.object(neutronapi.LOG, 'debug')
6387+
def test_get_port_pci_slot(self, mock_debug):
6388+
fake_port = {'id': uuids.fake_port_id}
6389+
request = objects.InstancePCIRequest(requester_id=uuids.fake_port_id,
6390+
request_id=uuids.pci_request_id)
6391+
bad_request = objects.InstancePCIRequest(
6392+
requester_id=uuids.wrong_port_id)
6393+
device = objects.PciDevice(request_id=uuids.pci_request_id,
6394+
address='fake-pci-address')
6395+
bad_device = objects.PciDevice(request_id=uuids.wrong_request_id)
6396+
# Test the happy path
6397+
instance = objects.Instance(
6398+
pci_requests=objects.InstancePCIRequests(requests=[request]),
6399+
pci_devices=objects.PciDeviceList(objects=[device]))
6400+
self.assertEqual(
6401+
'fake-pci-address',
6402+
self.api._get_port_pci_slot(self.context, instance, fake_port))
6403+
# Test not finding the request
6404+
instance = objects.Instance(
6405+
pci_requests=objects.InstancePCIRequests(
6406+
requests=[objects.InstancePCIRequest(bad_request)]))
6407+
self.assertIsNone(
6408+
self.api._get_port_pci_slot(self.context, instance, fake_port))
6409+
mock_debug.assert_called_with('No PCI request found for port %s',
6410+
uuids.fake_port_id, instance=instance)
6411+
mock_debug.reset_mock()
6412+
# Test not finding the device
6413+
instance = objects.Instance(
6414+
pci_requests=objects.InstancePCIRequests(requests=[request]),
6415+
pci_devices=objects.PciDeviceList(objects=[bad_device]))
6416+
self.assertIsNone(
6417+
self.api._get_port_pci_slot(self.context, instance, fake_port))
6418+
mock_debug.assert_called_with('No PCI device found for request %s',
6419+
uuids.pci_request_id, instance=instance)
6420+
63846421

63856422
class TestAPIModuleMethods(test.NoDBTestCase):
63866423

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
---
2+
fixes:
3+
- |
4+
`Bug 1851545 <https://bugs.launchpad.net/nova/+bug/1851545>`_, wherein
5+
unshelving an instance with SRIOV Neutron ports did not update the port
6+
binding's ``pci_slot`` and could cause libvirt PCI conflicts, has been
7+
fixed.
8+
9+
.. important:: Constraints in the fix's implementation mean that it only
10+
applies to instances booted **after** it has been applied. Existing
11+
instances will still experience bug 1851545 after being shelved and
12+
unshelved, even with the fix applied.

0 commit comments

Comments
 (0)