Skip to content

Commit 4f1e4b3

Browse files
markgoddard authored and jovial committed
Invalidate provider tree when compute node disappears
There is a race condition in nova-compute with the ironic virt driver as nodes get rebalanced. It can lead to compute nodes being removed in the DB and not repopulated. Ultimately this prevents these nodes from being scheduled to.

The issue being addressed here is that if a compute node is deleted by a host which thinks it is an orphan, then the resource provider for that node might also be deleted. The compute host that owns the node might not recreate the resource provider if it exists in the provider tree cache.

This change fixes the issue by clearing resource providers from the provider tree cache for which a compute node entry does not exist. Then, when the available resource for the node is updated, the resource providers are not found in the cache and get recreated in placement.

Change-Id: Ia53ff43e6964963cdf295604ba0fb7171389606e
Related-Bug: #1853009
Related-Bug: #1841481
(cherry picked from commit 7dcc6bfa63c1ab06d86b07bcdd05838a8ad35dec)
1 parent 730071f commit 4f1e4b3

File tree

4 files changed

+25
-20
lines changed

4 files changed

+25
-20
lines changed

nova/compute/resource_tracker.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2005,3 +2005,4 @@ def clean_compute_node_cache(self, compute_nodes_in_db):
20052005
# where another compute service took ownership of the node. Clean
20062006
# up the cache.
20072007
self.remove_node(stale_cn)
2008+
self.reportclient.invalidate_resource_provider(stale_cn)

nova/scheduler/client/report.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -678,11 +678,7 @@ def _delete_provider(self, rp_uuid, global_request_id=None):
678678
if resp:
679679
LOG.info("Deleted resource provider %s", rp_uuid)
680680
# clean the caches
681-
try:
682-
self._provider_tree.remove(rp_uuid)
683-
except ValueError:
684-
pass
685-
self._association_refresh_time.pop(rp_uuid, None)
681+
self.invalidate_resource_provider(rp_uuid)
686682
return
687683

688684
msg = ("[%(placement_req_id)s] Failed to delete resource provider "
@@ -2181,6 +2177,17 @@ def delete_resource_provider(self, context, compute_node, cascade=False):
21812177
# left a no-op for backward compatibility.
21822178
pass
21832179

2180+
def invalidate_resource_provider(self, name_or_uuid):
2181+
"""Invalidate the cache for a resource provider.
2182+
2183+
:param name_or_uuid: Name or UUID of the resource provider to look up.
2184+
"""
2185+
try:
2186+
self._provider_tree.remove(name_or_uuid)
2187+
except ValueError:
2188+
pass
2189+
self._association_refresh_time.pop(name_or_uuid, None)
2190+
21842191
def get_provider_by_name(self, context, name):
21852192
"""Queries the placement API for resource provider information matching
21862193
a supplied name.

nova/tests/functional/regressions/test_bug_1853009.py

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -153,9 +153,8 @@ def test_node_rebalance_deleted_compute_node_race(self):
153153
self.assertEqual(0, len(rps), rps)
154154

155155
# host_b[3]: Should recreate compute node and resource provider.
156-
# FIXME(mgoddard): Resource provider not recreated here, because it
157-
# exists in the provider tree. See
158-
# https://bugs.launchpad.net/nova/+bug/1841481.
156+
# FIXME(mgoddard): Resource provider not recreated here, due to
157+
# https://bugs.launchpad.net/nova/+bug/1853159.
159158
host_b.manager.update_available_resource(self.ctxt)
160159

161160
# Verify that the node was recreated.
@@ -170,14 +169,11 @@ def test_node_rebalance_deleted_compute_node_race(self):
170169
self.assertEqual(0, len(rps), rps)
171170

172171
# And the RP does not exist in the provider tree either.
173-
self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
172+
self.assertFalse(host_b.manager.rt.reportclient._provider_tree.exists(
174173
self.nodename))
175174

176175
# host_b[1]: Should add compute node to RT cache and recreate resource
177176
# provider.
178-
# FIXME(mgoddard): Resource provider not recreated here, because it
179-
# exists in the provider tree. See
180-
# https://bugs.launchpad.net/nova/+bug/1841481.
181177
host_b.manager.update_available_resource(self.ctxt)
182178

183179
# Verify that the node still exists.
@@ -186,13 +182,10 @@ def test_node_rebalance_deleted_compute_node_race(self):
186182
# And it is now in the RT cache.
187183
self.assertIn(self.nodename, host_b.manager.rt.compute_nodes)
188184

189-
# There is still no RP.
185+
# The resource provider has now been created.
190186
rps = self._get_all_providers()
191-
self.assertEqual(0, len(rps), rps)
192-
193-
# But the RP it exists in the provider tree.
194-
self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
195-
self.nodename))
187+
self.assertEqual(1, len(rps), rps)
188+
self.assertEqual(self.nodename, rps[0]['name'])
196189

197190
# This fails due to the lack of a resource provider.
198191
self.assertIn(

nova/tests/unit/compute/test_resource_tracker.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4242,5 +4242,9 @@ def test_clean_compute_node_cache(self, mock_remove):
42424242
invalid_nodename = "invalid-node"
42434243
self.rt.compute_nodes[_NODENAME] = self.compute
42444244
self.rt.compute_nodes[invalid_nodename] = mock.sentinel.compute
4245-
self.rt.clean_compute_node_cache([self.compute])
4246-
mock_remove.assert_called_once_with(invalid_nodename)
4245+
with mock.patch.object(
4246+
self.rt.reportclient, "invalidate_resource_provider",
4247+
) as mock_invalidate:
4248+
self.rt.clean_compute_node_cache([self.compute])
4249+
mock_remove.assert_called_once_with(invalid_nodename)
4250+
mock_invalidate.assert_called_once_with(invalid_nodename)

0 commit comments

Comments
 (0)