Skip to content

Commit 5f4ead1

Browse files
Zuulopenstack-gerrit
authored andcommitted
Merge "Remove unavailable but not reported PCI devices at startup" into stable/yoga
2 parents 2d5ae25 + d7bca63 commit 5f4ead1

File tree

4 files changed

+123
-7
lines changed

4 files changed

+123
-7
lines changed

nova/objects/pci_device.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -447,11 +447,30 @@ def allocate(self, instance):
447447
instance.pci_devices.objects.append(copy.copy(self))
448448

449449
def remove(self):
450-
if self.status != fields.PciDeviceStatus.AVAILABLE:
450+
# We allow removal of a device is if it is unused. It can be unused
451+
# either by being in available state or being in a state that shows
452+
# that the parent or child device blocks the consumption of this device
453+
expected_states = [
454+
fields.PciDeviceStatus.AVAILABLE,
455+
fields.PciDeviceStatus.UNAVAILABLE,
456+
fields.PciDeviceStatus.UNCLAIMABLE,
457+
]
458+
if self.status not in expected_states:
451459
raise exception.PciDeviceInvalidStatus(
452460
compute_node_id=self.compute_node_id,
453461
address=self.address, status=self.status,
454-
hopestatus=[fields.PciDeviceStatus.AVAILABLE])
462+
hopestatus=expected_states)
463+
# Just to be on the safe side, do not allow removal of device that has
464+
# an owner even if the state of the device suggests that it is not
465+
# owned.
466+
if 'instance_uuid' in self and self.instance_uuid is not None:
467+
raise exception.PciDeviceInvalidOwner(
468+
compute_node_id=self.compute_node_id,
469+
address=self.address,
470+
owner=self.instance_uuid,
471+
hopeowner=None,
472+
)
473+
455474
self.status = fields.PciDeviceStatus.REMOVED
456475
self.instance_uuid = None
457476
self.request_id = None

nova/pci/manager.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -217,10 +217,13 @@ def _set_hvdevs(self, devices: ty.List[ty.Dict[str, ty.Any]]) -> None:
217217
# from the pci whitelist.
218218
try:
219219
existed.remove()
220-
except exception.PciDeviceInvalidStatus as e:
221-
LOG.warning("Unable to remove device with %(status)s "
222-
"ownership %(instance_uuid)s because of "
223-
"%(pci_exception)s. "
220+
except (
221+
exception.PciDeviceInvalidStatus,
222+
exception.PciDeviceInvalidOwner,
223+
) as e:
224+
LOG.warning("Unable to remove device with status "
225+
"'%(status)s' and ownership %(instance_uuid)s "
226+
"because of %(pci_exception)s. "
224227
"Check your [pci]passthrough_whitelist "
225228
"configuration to make sure this allocated "
226229
"device is whitelisted. If you have removed "
@@ -250,7 +253,10 @@ def _set_hvdevs(self, devices: ty.List[ty.Dict[str, ty.Any]]) -> None:
250253
else:
251254
# Note(yjiang5): no need to update stats if an assigned
252255
# device is hot removed.
253-
self.stats.remove_device(existed)
256+
# NOTE(gibi): only remove the device from the pools if it
257+
# is not already removed
258+
if existed in self.stats.get_free_devs():
259+
self.stats.remove_device(existed)
254260
else:
255261
# Update tracked devices.
256262
new_value: ty.Dict[str, ty.Any]

nova/tests/unit/objects/test_pci_device.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,16 @@ def test_remove_device_fail(self):
467467
devobj.claim(self.inst.uuid)
468468
self.assertRaises(exception.PciDeviceInvalidStatus, devobj.remove)
469469

470+
def test_remove_device_fail_owned_with_unavailable_state(self):
471+
# This test creates an PCI device in an invalid state. This should
472+
# not happen in any known scenario. But we want to be save not to allow
473+
# removing a device that has an owner. See bug 1969496 for more details
474+
self._create_fake_instance()
475+
devobj = pci_device.PciDevice.create(None, dev_dict)
476+
devobj.claim(self.inst.uuid)
477+
devobj.status = fields.PciDeviceStatus.UNAVAILABLE
478+
self.assertRaises(exception.PciDeviceInvalidOwner, devobj.remove)
479+
470480

471481
class TestPciDeviceObject(test_objects._LocalTest,
472482
_TestPciDeviceObject):

nova/tests/unit/pci/test_manager.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,87 @@ def test_set_hvdev_changed_stal(self):
397397
self.assertEqual(len(self.tracker.stale), 1)
398398
self.assertEqual(self.tracker.stale['0000:00:00.2']['vendor_id'], 'v2')
399399

400+
def _get_device_by_address(self, address):
401+
devs = [dev for dev in self.tracker.pci_devs if dev.address == address]
402+
if len(devs) == 1:
403+
return devs[0]
404+
if devs:
405+
raise ValueError('ambiguous address', devs)
406+
else:
407+
raise ValueError('device not found', address)
408+
409+
def test_set_hvdevs_unavailable_vf_removed(self):
410+
# We start with a PF parent and two VF children
411+
self._create_tracker([fake_db_dev_3, fake_db_dev_4, fake_db_dev_5])
412+
pci_requests_obj = self._create_pci_requests_object(
413+
[
414+
{
415+
'count': 1,
416+
'spec': [{'dev_type': fields.PciDeviceType.SRIOV_PF}]
417+
}
418+
],
419+
instance_uuid=uuidsentinel.instance1,
420+
)
421+
# then claim and allocate the PF that makes the VFs unavailable
422+
self.tracker.claim_instance(
423+
mock.sentinel.context, pci_requests_obj, None)
424+
self.tracker.allocate_instance({'uuid': uuidsentinel.instance1})
425+
426+
dev3_pf = self._get_device_by_address(fake_db_dev_3['address'])
427+
self.assertEqual('allocated', dev3_pf.status)
428+
self.assertEqual(uuidsentinel.instance1, dev3_pf.instance_uuid)
429+
dev4_vf = self._get_device_by_address(fake_db_dev_4['address'])
430+
self.assertEqual('unavailable', dev4_vf.status)
431+
dev5_vf = self._get_device_by_address(fake_db_dev_5['address'])
432+
self.assertEqual('unavailable', dev5_vf.status)
433+
434+
# now simulate that one VF (dev_5) is removed from the hypervisor and
435+
# the compute is restarted. As the VF is not claimed or allocated we
436+
# are free to remove it from the tracker.
437+
self.tracker._set_hvdevs(copy.deepcopy([fake_pci_3, fake_pci_4]))
438+
439+
dev3_pf = self._get_device_by_address(fake_db_dev_3['address'])
440+
self.assertEqual('allocated', dev3_pf.status)
441+
self.assertEqual(uuidsentinel.instance1, dev3_pf.instance_uuid)
442+
dev4_vf = self._get_device_by_address(fake_db_dev_4['address'])
443+
self.assertEqual('unavailable', dev4_vf.status)
444+
dev5_vf = self._get_device_by_address(fake_db_dev_5['address'])
445+
self.assertEqual('removed', dev5_vf.status)
446+
447+
def test_set_hvdevs_unavailable_pf_removed(self):
448+
# We start with one PF parent and one child VF
449+
self._create_tracker([fake_db_dev_3, fake_db_dev_4])
450+
pci_requests_obj = self._create_pci_requests_object(
451+
[
452+
{
453+
'count': 1,
454+
'spec': [{'dev_type': fields.PciDeviceType.SRIOV_VF}]
455+
}
456+
],
457+
instance_uuid=uuidsentinel.instance1,
458+
)
459+
# Then we claim and allocate the VF that makes the PF unavailable
460+
self.tracker.claim_instance(
461+
mock.sentinel.context, pci_requests_obj, None)
462+
self.tracker.allocate_instance({'uuid': uuidsentinel.instance1})
463+
464+
dev3_pf = self._get_device_by_address(fake_db_dev_3['address'])
465+
self.assertEqual('unavailable', dev3_pf.status)
466+
dev4_vf = self._get_device_by_address(fake_db_dev_4['address'])
467+
self.assertEqual('allocated', dev4_vf.status)
468+
self.assertEqual(uuidsentinel.instance1, dev4_vf.instance_uuid)
469+
470+
# now simulate that the parent PF is removed from the hypervisor and
471+
# the compute is restarted. As the PF is not claimed or allocated we
472+
# are free to remove it from the tracker.
473+
self.tracker._set_hvdevs(copy.deepcopy([fake_pci_4]))
474+
475+
dev3_pf = self._get_device_by_address(fake_db_dev_3['address'])
476+
self.assertEqual('removed', dev3_pf.status)
477+
dev4_vf = self._get_device_by_address(fake_db_dev_4['address'])
478+
self.assertEqual('allocated', dev4_vf.status)
479+
self.assertEqual(uuidsentinel.instance1, dev4_vf.instance_uuid)
480+
400481
def test_update_pci_for_instance_active(self):
401482
pci_requests_obj = self._create_pci_requests_object(fake_pci_requests)
402483
self.tracker.claim_instance(mock.sentinel.context,

0 commit comments

Comments
 (0)