Skip to content

Commit 0bab2e5

Browse files
Zuulopenstack-gerrit
authored andcommitted
Merge "Retry /reshape at provider generation conflict"
2 parents e4328ed + 48229b4 commit 0bab2e5

File tree

5 files changed

+97
-11
lines changed

5 files changed

+97
-11
lines changed

nova/compute/resource_tracker.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1195,9 +1195,16 @@ def _get_traits(self, context, nodename, provider_tree):
11951195

11961196
return list(traits)
11971197

1198-
@retrying.retry(stop_max_attempt_number=4,
1199-
retry_on_exception=lambda e: isinstance(
1200-
e, exception.ResourceProviderUpdateConflict))
1198+
@retrying.retry(
1199+
stop_max_attempt_number=4,
1200+
retry_on_exception=lambda e: isinstance(
1201+
e,
1202+
(
1203+
exception.ResourceProviderUpdateConflict,
1204+
exception.PlacementReshapeConflict,
1205+
),
1206+
),
1207+
)
12011208
def _update_to_placement(self, context, compute_node, startup):
12021209
"""Send resource and inventory changes to placement."""
12031210
# NOTE(jianghuaw): Some resources(e.g. VGPU) are not saved in the

nova/exception.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2088,6 +2088,16 @@ class ResourceProviderUpdateConflict(PlacementAPIConflict):
20882088
"provider %(uuid)s (generation %(generation)d): %(error)s")
20892089

20902090

2091+
class PlacementReshapeConflict(PlacementAPIConflict):
2092+
"""A 409 caused by generation mismatch from attempting to reshape a
2093+
provider tree.
2094+
"""
2095+
msg_fmt = _(
2096+
"A conflict was encountered attempting to reshape a provider tree: "
2097+
"$(error)s"
2098+
)
2099+
2100+
20912101
class InvalidResourceClass(Invalid):
20922102
msg_fmt = _("Resource class '%(resource_class)s' invalid.")
20932103

nova/scheduler/client/report.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1277,6 +1277,11 @@ def _reshape(self, context, inventories, allocations):
12771277
resp = self.post('/reshaper', payload, version=RESHAPER_VERSION,
12781278
global_request_id=context.global_id)
12791279
if not resp:
1280+
if resp.status_code == 409:
1281+
err = resp.json()['errors'][0]
1282+
if err['code'] == 'placement.concurrent_update':
1283+
raise exception.PlacementReshapeConflict(error=resp.text)
1284+
12801285
raise exception.ReshapeFailed(error=resp.text)
12811286

12821287
return resp
@@ -1310,7 +1315,7 @@ def _set_up_and_do_reshape(self, context, old_tree, new_tree, allocations):
13101315
# failure here to be fatal to the caller.
13111316
try:
13121317
self._reshape(context, inventories, allocations)
1313-
except exception.ReshapeFailed:
1318+
except (exception.ReshapeFailed, exception.PlacementReshapeConflict):
13141319
raise
13151320
except Exception as e:
13161321
# Make sure the original stack trace gets logged.
@@ -1429,8 +1434,16 @@ def catch_all(rp_uuid):
14291434
if allocations is not None:
14301435
# NOTE(efried): We do not catch_all here, because ReshapeFailed
14311436
# needs to bubble up right away and be handled specially.
1432-
self._set_up_and_do_reshape(context, old_tree, new_tree,
1433-
allocations)
1437+
try:
1438+
self._set_up_and_do_reshape(
1439+
context, old_tree, new_tree, allocations)
1440+
except exception.PlacementReshapeConflict:
1441+
# The conflict means we need to invalidate the local caches and
1442+
# let the retry mechanism in _update_to_placement to re-drive
1443+
# the reshape top of the fresh data
1444+
with excutils.save_and_reraise_exception():
1445+
self.clear_provider_cache()
1446+
14341447
# The reshape updated provider generations, so the ones we have in
14351448
# the cache are now stale. The inventory update below will short
14361449
# out, but we would still bounce with a provider generation

nova/tests/functional/test_report_client.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1363,6 +1363,17 @@ def test_reshape(self):
13631363
resp = self.client._reshape(self.context, inventories, allocs)
13641364
self.assertEqual(204, resp.status_code)
13651365

1366+
# Trigger generation conflict
1367+
# We can do this is by simply sending back the same reshape as that
1368+
# will not work because the previous reshape updated generations
1369+
self.assertRaises(
1370+
exception.PlacementReshapeConflict,
1371+
self.client._reshape,
1372+
self.context,
1373+
inventories,
1374+
allocs,
1375+
)
1376+
13661377
def test_update_from_provider_tree_reshape(self):
13671378
"""Run update_from_provider_tree with reshaping."""
13681379
exp_ptree = self._set_up_provider_tree()
@@ -1519,3 +1530,44 @@ def test_update_from_provider_tree_reshape(self):
15191530
self.context, self.compute_name)
15201531
self.assertProviderTree(orig_exp_ptree, ptree)
15211532
self.assertAllocations(orig_exp_allocs, allocs)
1533+
1534+
def test_update_from_provider_tree_reshape_conflict_retry(self):
1535+
exp_ptree = self._set_up_provider_tree()
1536+
1537+
ptree = self.client.get_provider_tree_and_ensure_root(
1538+
self.context, self.compute_uuid)
1539+
allocs = self.client.get_allocations_for_provider_tree(
1540+
self.context, self.compute_name)
1541+
self.assertProviderTree(exp_ptree, ptree)
1542+
self.assertAllocations({}, allocs)
1543+
1544+
exp_allocs = self._set_up_provider_tree_allocs()
1545+
1546+
# we prepare inventory and allocation changes to trigger a reshape
1547+
for rp_uuid in ptree.get_provider_uuids():
1548+
# Add a new resource class to the inventories
1549+
ptree.update_inventory(
1550+
rp_uuid, dict(ptree.data(rp_uuid).inventory,
1551+
CUSTOM_FOO={'total': 10}))
1552+
exp_ptree[rp_uuid]['inventory']['CUSTOM_FOO'] = {'total': 10}
1553+
for c_uuid, alloc in allocs.items():
1554+
for rp_uuid, res in alloc['allocations'].items():
1555+
res['resources']['CUSTOM_FOO'] = 1
1556+
exp_allocs[c_uuid]['allocations'][rp_uuid][
1557+
'resources']['CUSTOM_FOO'] = 1
1558+
1559+
# As the inventory update happens is the same request as the allocation
1560+
# update the allocation update will have a generation conflict.
1561+
# So we expect that it is signalled with an exception so that the
1562+
# upper layer can re-drive the reshape process with a fresh tree that
1563+
# now has the inventories
1564+
self.assertRaises(
1565+
exception.PlacementReshapeConflict,
1566+
self.client.update_from_provider_tree,
1567+
self.context,
1568+
ptree,
1569+
allocations=allocs,
1570+
)
1571+
# also we except that the internal caches is cleared so that the
1572+
# re-drive will have a chance to load fresh data from placement
1573+
self.assertEqual(0, len(self.client._provider_tree.roots))

nova/tests/unit/compute/test_resource_tracker.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1710,12 +1710,18 @@ def fake_upt(ptree, nodename, allocations=None):
17101710
self.assertEqual(exp_inv, ptree.data(new_compute.uuid).inventory)
17111711
mock_sync_disabled.assert_called_once()
17121712

1713+
@ddt.data(
1714+
exc.ResourceProviderUpdateConflict(
1715+
uuid='uuid', generation=42, error='error'),
1716+
exc.PlacementReshapeConflict(error='error'),
1717+
)
17131718
@mock.patch('nova.compute.resource_tracker.ResourceTracker.'
17141719
'_sync_compute_service_disabled_trait')
17151720
@mock.patch('nova.compute.resource_tracker.ResourceTracker.'
17161721
'_resource_change', return_value=False)
1717-
def test_update_retry_success(self, mock_resource_change,
1718-
mock_sync_disabled):
1722+
def test_update_retry_success(
1723+
self, exc, mock_resource_change, mock_sync_disabled
1724+
):
17191725
self._setup_rt()
17201726
orig_compute = _COMPUTE_NODE_FIXTURES[0].obj_clone()
17211727
self.rt.compute_nodes[_NODENAME] = orig_compute
@@ -1729,9 +1735,7 @@ def test_update_retry_success(self, mock_resource_change,
17291735
self.driver_mock.update_provider_tree.side_effect = lambda *a: None
17301736

17311737
ufpt_mock = self.rt.reportclient.update_from_provider_tree
1732-
ufpt_mock.side_effect = (
1733-
exc.ResourceProviderUpdateConflict(
1734-
uuid='uuid', generation=42, error='error'), None)
1738+
ufpt_mock.side_effect = (exc, None)
17351739

17361740
self.rt._update(mock.sentinel.ctx, new_compute)
17371741

0 commit comments

Comments
 (0)