Skip to content

Commit 48229b4

Browse files
committed
Retry /reshape at provider generation conflict
During a normal update_available_resources run if the local provider tree caches is invalid (i.e. due to the scheduler made an allocation bumping the generation of the RPs) and the virt driver try to update the inventory of an RP based on the cache Placement will report conflict, the report client will invalidate the caches and the retry decorator on ResourceTracker._update_to_placement will re-drive the top of the fresh RP data. However the same thing can happen during reshape as well but the retry mechanism is missing in that code path so the stale caches can cause reshape failures. This patch adds specific error handling in the reshape code path to implement the same retry mechanism as exists for inventory update. blueprint: pci-device-tracking-in-placement Change-Id: Ieb954a04e6aba827611765f7f401124a1fe298f3
1 parent bdec962 commit 48229b4

File tree

5 files changed

+97
-11
lines changed

5 files changed

+97
-11
lines changed

nova/compute/resource_tracker.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1195,9 +1195,16 @@ def _get_traits(self, context, nodename, provider_tree):
11951195

11961196
return list(traits)
11971197

1198-
@retrying.retry(stop_max_attempt_number=4,
1199-
retry_on_exception=lambda e: isinstance(
1200-
e, exception.ResourceProviderUpdateConflict))
1198+
@retrying.retry(
1199+
stop_max_attempt_number=4,
1200+
retry_on_exception=lambda e: isinstance(
1201+
e,
1202+
(
1203+
exception.ResourceProviderUpdateConflict,
1204+
exception.PlacementReshapeConflict,
1205+
),
1206+
),
1207+
)
12011208
def _update_to_placement(self, context, compute_node, startup):
12021209
"""Send resource and inventory changes to placement."""
12031210
# NOTE(jianghuaw): Some resources(e.g. VGPU) are not saved in the

nova/exception.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2077,6 +2077,16 @@ class ResourceProviderUpdateConflict(PlacementAPIConflict):
20772077
"provider %(uuid)s (generation %(generation)d): %(error)s")
20782078

20792079

2080+
class PlacementReshapeConflict(PlacementAPIConflict):
2081+
"""A 409 caused by generation mismatch from attempting to reshape a
2082+
provider tree.
2083+
"""
2084+
msg_fmt = _(
2085+
"A conflict was encountered attempting to reshape a provider tree: "
2086+
"$(error)s"
2087+
)
2088+
2089+
20802090
class InvalidResourceClass(Invalid):
20812091
msg_fmt = _("Resource class '%(resource_class)s' invalid.")
20822092

nova/scheduler/client/report.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1277,6 +1277,11 @@ def _reshape(self, context, inventories, allocations):
12771277
resp = self.post('/reshaper', payload, version=RESHAPER_VERSION,
12781278
global_request_id=context.global_id)
12791279
if not resp:
1280+
if resp.status_code == 409:
1281+
err = resp.json()['errors'][0]
1282+
if err['code'] == 'placement.concurrent_update':
1283+
raise exception.PlacementReshapeConflict(error=resp.text)
1284+
12801285
raise exception.ReshapeFailed(error=resp.text)
12811286

12821287
return resp
@@ -1310,7 +1315,7 @@ def _set_up_and_do_reshape(self, context, old_tree, new_tree, allocations):
13101315
# failure here to be fatal to the caller.
13111316
try:
13121317
self._reshape(context, inventories, allocations)
1313-
except exception.ReshapeFailed:
1318+
except (exception.ReshapeFailed, exception.PlacementReshapeConflict):
13141319
raise
13151320
except Exception as e:
13161321
# Make sure the original stack trace gets logged.
@@ -1429,8 +1434,16 @@ def catch_all(rp_uuid):
14291434
if allocations is not None:
14301435
# NOTE(efried): We do not catch_all here, because ReshapeFailed
14311436
# needs to bubble up right away and be handled specially.
1432-
self._set_up_and_do_reshape(context, old_tree, new_tree,
1433-
allocations)
1437+
try:
1438+
self._set_up_and_do_reshape(
1439+
context, old_tree, new_tree, allocations)
1440+
except exception.PlacementReshapeConflict:
1441+
# The conflict means we need to invalidate the local caches and
1442+
# let the retry mechanism in _update_to_placement to re-drive
1443+
# the reshape top of the fresh data
1444+
with excutils.save_and_reraise_exception():
1445+
self.clear_provider_cache()
1446+
14341447
# The reshape updated provider generations, so the ones we have in
14351448
# the cache are now stale. The inventory update below will short
14361449
# out, but we would still bounce with a provider generation

nova/tests/functional/test_report_client.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1363,6 +1363,17 @@ def test_reshape(self):
13631363
resp = self.client._reshape(self.context, inventories, allocs)
13641364
self.assertEqual(204, resp.status_code)
13651365

1366+
# Trigger generation conflict
1367+
# We can do this is by simply sending back the same reshape as that
1368+
# will not work because the previous reshape updated generations
1369+
self.assertRaises(
1370+
exception.PlacementReshapeConflict,
1371+
self.client._reshape,
1372+
self.context,
1373+
inventories,
1374+
allocs,
1375+
)
1376+
13661377
def test_update_from_provider_tree_reshape(self):
13671378
"""Run update_from_provider_tree with reshaping."""
13681379
exp_ptree = self._set_up_provider_tree()
@@ -1519,3 +1530,44 @@ def test_update_from_provider_tree_reshape(self):
15191530
self.context, self.compute_name)
15201531
self.assertProviderTree(orig_exp_ptree, ptree)
15211532
self.assertAllocations(orig_exp_allocs, allocs)
1533+
1534+
def test_update_from_provider_tree_reshape_conflict_retry(self):
1535+
exp_ptree = self._set_up_provider_tree()
1536+
1537+
ptree = self.client.get_provider_tree_and_ensure_root(
1538+
self.context, self.compute_uuid)
1539+
allocs = self.client.get_allocations_for_provider_tree(
1540+
self.context, self.compute_name)
1541+
self.assertProviderTree(exp_ptree, ptree)
1542+
self.assertAllocations({}, allocs)
1543+
1544+
exp_allocs = self._set_up_provider_tree_allocs()
1545+
1546+
# we prepare inventory and allocation changes to trigger a reshape
1547+
for rp_uuid in ptree.get_provider_uuids():
1548+
# Add a new resource class to the inventories
1549+
ptree.update_inventory(
1550+
rp_uuid, dict(ptree.data(rp_uuid).inventory,
1551+
CUSTOM_FOO={'total': 10}))
1552+
exp_ptree[rp_uuid]['inventory']['CUSTOM_FOO'] = {'total': 10}
1553+
for c_uuid, alloc in allocs.items():
1554+
for rp_uuid, res in alloc['allocations'].items():
1555+
res['resources']['CUSTOM_FOO'] = 1
1556+
exp_allocs[c_uuid]['allocations'][rp_uuid][
1557+
'resources']['CUSTOM_FOO'] = 1
1558+
1559+
# As the inventory update happens is the same request as the allocation
1560+
# update the allocation update will have a generation conflict.
1561+
# So we expect that it is signalled with an exception so that the
1562+
# upper layer can re-drive the reshape process with a fresh tree that
1563+
# now has the inventories
1564+
self.assertRaises(
1565+
exception.PlacementReshapeConflict,
1566+
self.client.update_from_provider_tree,
1567+
self.context,
1568+
ptree,
1569+
allocations=allocs,
1570+
)
1571+
# also we except that the internal caches is cleared so that the
1572+
# re-drive will have a chance to load fresh data from placement
1573+
self.assertEqual(0, len(self.client._provider_tree.roots))

nova/tests/unit/compute/test_resource_tracker.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1710,12 +1710,18 @@ def fake_upt(ptree, nodename, allocations=None):
17101710
self.assertEqual(exp_inv, ptree.data(new_compute.uuid).inventory)
17111711
mock_sync_disabled.assert_called_once()
17121712

1713+
@ddt.data(
1714+
exc.ResourceProviderUpdateConflict(
1715+
uuid='uuid', generation=42, error='error'),
1716+
exc.PlacementReshapeConflict(error='error'),
1717+
)
17131718
@mock.patch('nova.compute.resource_tracker.ResourceTracker.'
17141719
'_sync_compute_service_disabled_trait')
17151720
@mock.patch('nova.compute.resource_tracker.ResourceTracker.'
17161721
'_resource_change', return_value=False)
1717-
def test_update_retry_success(self, mock_resource_change,
1718-
mock_sync_disabled):
1722+
def test_update_retry_success(
1723+
self, exc, mock_resource_change, mock_sync_disabled
1724+
):
17191725
self._setup_rt()
17201726
orig_compute = _COMPUTE_NODE_FIXTURES[0].obj_clone()
17211727
self.rt.compute_nodes[_NODENAME] = orig_compute
@@ -1729,9 +1735,7 @@ def test_update_retry_success(self, mock_resource_change,
17291735
self.driver_mock.update_provider_tree.side_effect = lambda *a: None
17301736

17311737
ufpt_mock = self.rt.reportclient.update_from_provider_tree
1732-
ufpt_mock.side_effect = (
1733-
exc.ResourceProviderUpdateConflict(
1734-
uuid='uuid', generation=42, error='error'), None)
1738+
ufpt_mock.side_effect = (exc, None)
17351739

17361740
self.rt._update(mock.sentinel.ctx, new_compute)
17371741

0 commit comments

Comments
 (0)