Skip to content

Commit 4ca4b2e

Browse files
committed
Allow claiming PCI PF if child VF is unavailable
As If9ab424cc7375a1f0d41b03f01c4a823216b3eb8 stated there is a way for the pci_device table to become inconsistent. Parent PF can be in 'available' state while children VFs are still in 'unavailable' state. In this situation the PF is schedulable but the PCI claim will fail when try to mark the dependent VFs unavailable. This patch changes the PCI claim logic to allow claiming the parent PF in the inconsistent situation as we assume that it is safe to do so. This claim also fixed the inconsistency so that when the parent PF is freed the children VFs become available again. Closes-Bug: #1969496 Change-Id: I575ce06bcc913add7db0849f85728371da2032fc (cherry picked from commit 3af2ecc)
1 parent 23c48b6 commit 4ca4b2e

File tree

3 files changed

+160
-24
lines changed

3 files changed

+160
-24
lines changed

nova/objects/pci_device.py

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -346,10 +346,40 @@ def claim(self, instance_uuid):
346346
# Update PF status to CLAIMED if all of it dependants are free
347347
# and set their status to UNCLAIMABLE
348348
vfs_list = self.child_devices
349-
if not all([vf.is_available() for vf in vfs_list]):
350-
raise exception.PciDeviceVFInvalidStatus(
351-
compute_node_id=self.compute_node_id,
352-
address=self.address)
349+
non_free_dependants = [
350+
vf for vf in vfs_list if not vf.is_available()]
351+
if non_free_dependants:
352+
# NOTE(gibi): There should not be any dependent devices that
353+
# are UNCLAIMABLE or UNAVAILABLE as the parent is AVAILABLE,
354+
# but we got reports in bug 1969496 that this inconsistency
355+
# can happen. So check if the only non-free devices are in
356+
# state UNCLAIMABLE or UNAVAILABLE then we log a warning but
357+
# allow to claim the parent.
358+
actual_statuses = {
359+
child.status for child in non_free_dependants}
360+
allowed_non_free_statues = {
361+
fields.PciDeviceStatus.UNCLAIMABLE,
362+
fields.PciDeviceStatus.UNAVAILABLE,
363+
}
364+
if actual_statuses - allowed_non_free_statues == set():
365+
LOG.warning(
366+
"Some child device of parent %s is in an inconsistent "
367+
"state. If you can reproduce this warning then please "
368+
"report a bug at "
369+
"https://bugs.launchpad.net/nova/+filebug with "
370+
"reproduction steps. Inconsistent children with "
371+
"state: %s",
372+
self.address,
373+
",".join(
374+
"%s - %s" % (child.address, child.status)
375+
for child in non_free_dependants
376+
),
377+
)
378+
379+
else:
380+
raise exception.PciDeviceVFInvalidStatus(
381+
compute_node_id=self.compute_node_id,
382+
address=self.address)
353383
self._bulk_update_status(vfs_list,
354384
fields.PciDeviceStatus.UNCLAIMABLE)
355385

nova/pci/stats.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -279,8 +279,12 @@ def _handle_device_dependents(self, pci_dev: 'objects.PciDevice') -> None:
279279
if pci_dev.dev_type == fields.PciDeviceType.SRIOV_PF:
280280
vfs_list = pci_dev.child_devices
281281
if vfs_list:
282+
free_devs = self.get_free_devs()
282283
for vf in vfs_list:
283-
self.remove_device(vf)
284+
# NOTE(gibi): do not try to remove a device that are
285+
# already removed
286+
if vf in free_devs:
287+
self.remove_device(vf)
284288
elif pci_dev.dev_type in (
285289
fields.PciDeviceType.SRIOV_VF,
286290
fields.PciDeviceType.VDPA,

nova/tests/unit/pci/test_manager.py

Lines changed: 121 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -507,32 +507,134 @@ def test_claim_available_pf_while_child_vf_is_unavailable(self):
507507
)
508508
# now try to claim and allocate the PF. It should work as it is
509509
# available
510-
# This is bug 1969496 as the claim fails with exception
511-
ex = self.assertRaises(
512-
exception.PciDevicePoolEmpty,
513-
self.tracker.claim_instance,
514-
mock.sentinel.context,
515-
pci_requests_obj,
516-
None
517-
)
510+
self.tracker.claim_instance(
511+
mock.sentinel.context, pci_requests_obj, None)
512+
self.tracker.allocate_instance({'uuid': uuidsentinel.instance1})
513+
514+
pf_dev = self._get_device_by_address(pf['address'])
515+
self.assertEqual('allocated', pf_dev.status)
516+
vf_dev = self._get_device_by_address(vf['address'])
517+
self.assertEqual('unavailable', vf_dev.status)
518+
518519
self.assertIn(
519-
'Attempt to consume PCI device 1:0000:00:02.1 from empty pool',
520-
str(ex)
520+
'Some child device of parent 0000:00:01.1 is in an inconsistent '
521+
'state. If you can reproduce this warning then please report a '
522+
'bug at https://bugs.launchpad.net/nova/+filebug with '
523+
'reproduction steps. Inconsistent children with state: '
524+
'0000:00:02.1 - unavailable',
525+
self.stdlog.logger.output
521526
)
527+
528+
# Ensure that the claim actually fixes the inconsistency so when the
529+
# parent if freed the children become available too.
530+
self.tracker.free_instance(
531+
mock.sentinel.context, {'uuid': uuidsentinel.instance1})
532+
522533
pf_dev = self._get_device_by_address(pf['address'])
523534
self.assertEqual('available', pf_dev.status)
524535
vf_dev = self._get_device_by_address(vf['address'])
525-
self.assertEqual('unavailable', vf_dev.status)
536+
self.assertEqual('available', vf_dev.status)
537+
538+
def test_claim_available_pf_while_children_vfs_are_in_mixed_state(self):
539+
# We start with a PF parent and two VF children. The PF is available
540+
# and one of the VF is unavailable while the other is available.
541+
pf = copy.deepcopy(fake_db_dev_3)
542+
vf1 = copy.deepcopy(fake_db_dev_4)
543+
vf1['status'] = fields.PciDeviceStatus.UNAVAILABLE
544+
vf2 = copy.deepcopy(fake_db_dev_5)
545+
vf2['status'] = fields.PciDeviceStatus.AVAILABLE
546+
self._create_tracker([pf, vf1, vf2])
547+
548+
pf_dev = self._get_device_by_address(pf['address'])
549+
self.assertEqual('available', pf_dev.status)
550+
vf1_dev = self._get_device_by_address(vf1['address'])
551+
self.assertEqual('unavailable', vf1_dev.status)
552+
vf2_dev = self._get_device_by_address(vf2['address'])
553+
self.assertEqual('available', vf2_dev.status)
554+
555+
pci_requests_obj = self._create_pci_requests_object(
556+
[
557+
{
558+
'count': 1,
559+
'spec': [{'dev_type': fields.PciDeviceType.SRIOV_PF}]
560+
}
561+
],
562+
instance_uuid=uuidsentinel.instance1,
563+
)
564+
# now try to claim and allocate the PF. It should work as it is
565+
# available
566+
self.tracker.claim_instance(
567+
mock.sentinel.context, pci_requests_obj, None)
568+
self.tracker.allocate_instance({'uuid': uuidsentinel.instance1})
569+
570+
pf_dev = self._get_device_by_address(pf['address'])
571+
self.assertEqual('allocated', pf_dev.status)
572+
vf1_dev = self._get_device_by_address(vf1['address'])
573+
self.assertEqual('unavailable', vf1_dev.status)
574+
vf2_dev = self._get_device_by_address(vf2['address'])
575+
self.assertEqual('unavailable', vf2_dev.status)
576+
577+
self.assertIn(
578+
'Some child device of parent 0000:00:01.1 is in an inconsistent '
579+
'state. If you can reproduce this warning then please report a '
580+
'bug at https://bugs.launchpad.net/nova/+filebug with '
581+
'reproduction steps. Inconsistent children with state: '
582+
'0000:00:02.1 - unavailable',
583+
self.stdlog.logger.output
584+
)
585+
586+
# Ensure that the claim actually fixes the inconsistency so when the
587+
# parent if freed the children become available too.
588+
self.tracker.free_instance(
589+
mock.sentinel.context, {'uuid': uuidsentinel.instance1})
590+
591+
pf_dev = self._get_device_by_address(pf['address'])
592+
self.assertEqual('available', pf_dev.status)
593+
vf1_dev = self._get_device_by_address(vf1['address'])
594+
self.assertEqual('available', vf1_dev.status)
595+
vf2_dev = self._get_device_by_address(vf2['address'])
596+
self.assertEqual('available', vf2_dev.status)
597+
598+
def test_claim_available_pf_while_a_child_is_used(self):
599+
pf = copy.deepcopy(fake_db_dev_3)
600+
vf1 = copy.deepcopy(fake_db_dev_4)
601+
vf1['status'] = fields.PciDeviceStatus.UNAVAILABLE
602+
vf2 = copy.deepcopy(fake_db_dev_5)
603+
vf2['status'] = fields.PciDeviceStatus.CLAIMED
604+
self._create_tracker([pf, vf1, vf2])
605+
606+
pf_dev = self._get_device_by_address(pf['address'])
607+
self.assertEqual('available', pf_dev.status)
608+
vf1_dev = self._get_device_by_address(vf1['address'])
609+
self.assertEqual('unavailable', vf1_dev.status)
610+
vf2_dev = self._get_device_by_address(vf2['address'])
611+
self.assertEqual('claimed', vf2_dev.status)
526612

527-
# This should work when the bug is fixed
528-
# self.tracker.claim_instance(
529-
# mock.sentinel.context, pci_requests_obj, None)
530-
# self.tracker.allocate_instance({'uuid': uuidsentinel.instance1})
613+
pci_requests_obj = self._create_pci_requests_object(
614+
[
615+
{
616+
'count': 1,
617+
'spec': [{'dev_type': fields.PciDeviceType.SRIOV_PF}]
618+
}
619+
],
620+
instance_uuid=uuidsentinel.instance1,
621+
)
622+
# now try to claim and allocate the PF. The claim should fail as on of
623+
# the child is used.
624+
self.assertRaises(
625+
exception.PciDeviceVFInvalidStatus,
626+
self.tracker.claim_instance,
627+
mock.sentinel.context,
628+
pci_requests_obj,
629+
None,
630+
)
531631

532-
# pf_dev = self._get_device_by_address(pf['address'])
533-
# self.assertEqual('allocated', pf_dev.status)
534-
# vf_dev = self._get_device_by_address(vf['address'])
535-
# self.assertEqual('unavailable', vf_dev.status)
632+
pf_dev = self._get_device_by_address(pf['address'])
633+
self.assertEqual('available', pf_dev.status)
634+
vf1_dev = self._get_device_by_address(vf1['address'])
635+
self.assertEqual('unavailable', vf1_dev.status)
636+
vf2_dev = self._get_device_by_address(vf2['address'])
637+
self.assertEqual('claimed', vf2_dev.status)
536638

537639
def test_update_pci_for_instance_active(self):
538640
pci_requests_obj = self._create_pci_requests_object(fake_pci_requests)

0 commit comments

Comments
 (0)