Skip to content

Commit 6b9c55a

Browse files
Zuulopenstack-gerrit
authored andcommitted
Merge "Allow claiming PCI PF if child VF is unavailable" into stable/yoga
2 parents 5307d75 + 4ca4b2e commit 6b9c55a

File tree

3 files changed

+160
-24
lines changed

3 files changed

+160
-24
lines changed

nova/objects/pci_device.py

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -346,10 +346,40 @@ def claim(self, instance_uuid):
346346
# Update PF status to CLAIMED if all of it dependants are free
347347
# and set their status to UNCLAIMABLE
348348
vfs_list = self.child_devices
349-
if not all([vf.is_available() for vf in vfs_list]):
350-
raise exception.PciDeviceVFInvalidStatus(
351-
compute_node_id=self.compute_node_id,
352-
address=self.address)
349+
non_free_dependants = [
350+
vf for vf in vfs_list if not vf.is_available()]
351+
if non_free_dependants:
352+
# NOTE(gibi): There should not be any dependent devices that
353+
# are UNCLAIMABLE or UNAVAILABLE as the parent is AVAILABLE,
354+
# but we got reports in bug 1969496 that this inconsistency
355+
# can happen. So check if the only non-free devices are in
356+
# state UNCLAIMABLE or UNAVAILABLE then we log a warning but
357+
# allow to claim the parent.
358+
actual_statuses = {
359+
child.status for child in non_free_dependants}
360+
allowed_non_free_statues = {
361+
fields.PciDeviceStatus.UNCLAIMABLE,
362+
fields.PciDeviceStatus.UNAVAILABLE,
363+
}
364+
if actual_statuses - allowed_non_free_statues == set():
365+
LOG.warning(
366+
"Some child device of parent %s is in an inconsistent "
367+
"state. If you can reproduce this warning then please "
368+
"report a bug at "
369+
"https://bugs.launchpad.net/nova/+filebug with "
370+
"reproduction steps. Inconsistent children with "
371+
"state: %s",
372+
self.address,
373+
",".join(
374+
"%s - %s" % (child.address, child.status)
375+
for child in non_free_dependants
376+
),
377+
)
378+
379+
else:
380+
raise exception.PciDeviceVFInvalidStatus(
381+
compute_node_id=self.compute_node_id,
382+
address=self.address)
353383
self._bulk_update_status(vfs_list,
354384
fields.PciDeviceStatus.UNCLAIMABLE)
355385

nova/pci/stats.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -279,8 +279,12 @@ def _handle_device_dependents(self, pci_dev: 'objects.PciDevice') -> None:
279279
if pci_dev.dev_type == fields.PciDeviceType.SRIOV_PF:
280280
vfs_list = pci_dev.child_devices
281281
if vfs_list:
282+
free_devs = self.get_free_devs()
282283
for vf in vfs_list:
283-
self.remove_device(vf)
284+
# NOTE(gibi): do not try to remove a device that are
285+
# already removed
286+
if vf in free_devs:
287+
self.remove_device(vf)
284288
elif pci_dev.dev_type in (
285289
fields.PciDeviceType.SRIOV_VF,
286290
fields.PciDeviceType.VDPA,

nova/tests/unit/pci/test_manager.py

Lines changed: 121 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -507,32 +507,134 @@ def test_claim_available_pf_while_child_vf_is_unavailable(self):
507507
)
508508
# now try to claim and allocate the PF. It should work as it is
509509
# available
510-
# This is bug 1969496 as the claim fails with exception
511-
ex = self.assertRaises(
512-
exception.PciDevicePoolEmpty,
513-
self.tracker.claim_instance,
514-
mock.sentinel.context,
515-
pci_requests_obj,
516-
None
517-
)
510+
self.tracker.claim_instance(
511+
mock.sentinel.context, pci_requests_obj, None)
512+
self.tracker.allocate_instance({'uuid': uuidsentinel.instance1})
513+
514+
pf_dev = self._get_device_by_address(pf['address'])
515+
self.assertEqual('allocated', pf_dev.status)
516+
vf_dev = self._get_device_by_address(vf['address'])
517+
self.assertEqual('unavailable', vf_dev.status)
518+
518519
self.assertIn(
519-
'Attempt to consume PCI device 1:0000:00:02.1 from empty pool',
520-
str(ex)
520+
'Some child device of parent 0000:00:01.1 is in an inconsistent '
521+
'state. If you can reproduce this warning then please report a '
522+
'bug at https://bugs.launchpad.net/nova/+filebug with '
523+
'reproduction steps. Inconsistent children with state: '
524+
'0000:00:02.1 - unavailable',
525+
self.stdlog.logger.output
521526
)
527+
528+
# Ensure that the claim actually fixes the inconsistency so when the
529+
# parent if freed the children become available too.
530+
self.tracker.free_instance(
531+
mock.sentinel.context, {'uuid': uuidsentinel.instance1})
532+
522533
pf_dev = self._get_device_by_address(pf['address'])
523534
self.assertEqual('available', pf_dev.status)
524535
vf_dev = self._get_device_by_address(vf['address'])
525-
self.assertEqual('unavailable', vf_dev.status)
536+
self.assertEqual('available', vf_dev.status)
537+
538+
def test_claim_available_pf_while_children_vfs_are_in_mixed_state(self):
539+
# We start with a PF parent and two VF children. The PF is available
540+
# and one of the VF is unavailable while the other is available.
541+
pf = copy.deepcopy(fake_db_dev_3)
542+
vf1 = copy.deepcopy(fake_db_dev_4)
543+
vf1['status'] = fields.PciDeviceStatus.UNAVAILABLE
544+
vf2 = copy.deepcopy(fake_db_dev_5)
545+
vf2['status'] = fields.PciDeviceStatus.AVAILABLE
546+
self._create_tracker([pf, vf1, vf2])
547+
548+
pf_dev = self._get_device_by_address(pf['address'])
549+
self.assertEqual('available', pf_dev.status)
550+
vf1_dev = self._get_device_by_address(vf1['address'])
551+
self.assertEqual('unavailable', vf1_dev.status)
552+
vf2_dev = self._get_device_by_address(vf2['address'])
553+
self.assertEqual('available', vf2_dev.status)
554+
555+
pci_requests_obj = self._create_pci_requests_object(
556+
[
557+
{
558+
'count': 1,
559+
'spec': [{'dev_type': fields.PciDeviceType.SRIOV_PF}]
560+
}
561+
],
562+
instance_uuid=uuidsentinel.instance1,
563+
)
564+
# now try to claim and allocate the PF. It should work as it is
565+
# available
566+
self.tracker.claim_instance(
567+
mock.sentinel.context, pci_requests_obj, None)
568+
self.tracker.allocate_instance({'uuid': uuidsentinel.instance1})
569+
570+
pf_dev = self._get_device_by_address(pf['address'])
571+
self.assertEqual('allocated', pf_dev.status)
572+
vf1_dev = self._get_device_by_address(vf1['address'])
573+
self.assertEqual('unavailable', vf1_dev.status)
574+
vf2_dev = self._get_device_by_address(vf2['address'])
575+
self.assertEqual('unavailable', vf2_dev.status)
576+
577+
self.assertIn(
578+
'Some child device of parent 0000:00:01.1 is in an inconsistent '
579+
'state. If you can reproduce this warning then please report a '
580+
'bug at https://bugs.launchpad.net/nova/+filebug with '
581+
'reproduction steps. Inconsistent children with state: '
582+
'0000:00:02.1 - unavailable',
583+
self.stdlog.logger.output
584+
)
585+
586+
# Ensure that the claim actually fixes the inconsistency so when the
587+
# parent if freed the children become available too.
588+
self.tracker.free_instance(
589+
mock.sentinel.context, {'uuid': uuidsentinel.instance1})
590+
591+
pf_dev = self._get_device_by_address(pf['address'])
592+
self.assertEqual('available', pf_dev.status)
593+
vf1_dev = self._get_device_by_address(vf1['address'])
594+
self.assertEqual('available', vf1_dev.status)
595+
vf2_dev = self._get_device_by_address(vf2['address'])
596+
self.assertEqual('available', vf2_dev.status)
597+
598+
def test_claim_available_pf_while_a_child_is_used(self):
599+
pf = copy.deepcopy(fake_db_dev_3)
600+
vf1 = copy.deepcopy(fake_db_dev_4)
601+
vf1['status'] = fields.PciDeviceStatus.UNAVAILABLE
602+
vf2 = copy.deepcopy(fake_db_dev_5)
603+
vf2['status'] = fields.PciDeviceStatus.CLAIMED
604+
self._create_tracker([pf, vf1, vf2])
605+
606+
pf_dev = self._get_device_by_address(pf['address'])
607+
self.assertEqual('available', pf_dev.status)
608+
vf1_dev = self._get_device_by_address(vf1['address'])
609+
self.assertEqual('unavailable', vf1_dev.status)
610+
vf2_dev = self._get_device_by_address(vf2['address'])
611+
self.assertEqual('claimed', vf2_dev.status)
526612

527-
# This should work when the bug is fixed
528-
# self.tracker.claim_instance(
529-
# mock.sentinel.context, pci_requests_obj, None)
530-
# self.tracker.allocate_instance({'uuid': uuidsentinel.instance1})
613+
pci_requests_obj = self._create_pci_requests_object(
614+
[
615+
{
616+
'count': 1,
617+
'spec': [{'dev_type': fields.PciDeviceType.SRIOV_PF}]
618+
}
619+
],
620+
instance_uuid=uuidsentinel.instance1,
621+
)
622+
# now try to claim and allocate the PF. The claim should fail as on of
623+
# the child is used.
624+
self.assertRaises(
625+
exception.PciDeviceVFInvalidStatus,
626+
self.tracker.claim_instance,
627+
mock.sentinel.context,
628+
pci_requests_obj,
629+
None,
630+
)
531631

532-
# pf_dev = self._get_device_by_address(pf['address'])
533-
# self.assertEqual('allocated', pf_dev.status)
534-
# vf_dev = self._get_device_by_address(vf['address'])
535-
# self.assertEqual('unavailable', vf_dev.status)
632+
pf_dev = self._get_device_by_address(pf['address'])
633+
self.assertEqual('available', pf_dev.status)
634+
vf1_dev = self._get_device_by_address(vf1['address'])
635+
self.assertEqual('unavailable', vf1_dev.status)
636+
vf2_dev = self._get_device_by_address(vf2['address'])
637+
self.assertEqual('claimed', vf2_dev.status)
536638

537639
def test_update_pci_for_instance_active(self):
538640
pci_requests_obj = self._create_pci_requests_object(fake_pci_requests)

0 commit comments

Comments
 (0)