Skip to content

Commit 730071f

Browse files
stephenfinjovial
authored andcommitted
Clear rebalanced compute nodes from resource tracker
There is a race condition in nova-compute with the ironic virt driver as nodes get rebalanced. It can lead to compute nodes being removed in the DB and not repopulated. Ultimately this prevents these nodes from being scheduled to. The issue being addressed here is that if a compute node is deleted by a host which thinks it is an orphan, then the compute host that actually owns the node might not recreate it if the node is already in its resource tracker cache. This change fixes the issue by clearing nodes from the resource tracker cache for which a compute node entry does not exist. Then, when the available resource for the node is updated, the compute node object is not found in the cache and gets recreated. Change-Id: I39241223b447fcc671161c370dbf16e1773b684a Partial-Bug: #1853009 (cherry picked from commit 8f5a078dd7bbe5b6b38cf8e04d916281dc418409)
1 parent cbf44b7 commit 730071f

File tree

5 files changed

+66
-15
lines changed

5 files changed

+66
-15
lines changed

nova/compute/manager.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9898,6 +9898,8 @@ def update_available_resource(self, context, startup=False):
98989898
use_slave=True,
98999899
startup=startup)
99009900

9901+
self.rt.clean_compute_node_cache(compute_nodes_in_db)
9902+
99019903
# Delete orphan compute node not reported by driver but still in db
99029904
for cn in compute_nodes_in_db:
99039905
if cn.hypervisor_hostname not in nodenames:

nova/compute/resource_tracker.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1988,3 +1988,20 @@ def finish_evacuation(self, instance, node, migration):
19881988
if migration:
19891989
migration.status = 'done'
19901990
migration.save()
1991+
1992+
@utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True)
1993+
def clean_compute_node_cache(self, compute_nodes_in_db):
1994+
"""Clean the compute node cache of any nodes that no longer exist.
1995+
1996+
:param compute_nodes_in_db: list of ComputeNode objects from the DB.
1997+
"""
1998+
compute_nodes_in_db_nodenames = {cn.hypervisor_hostname
1999+
for cn in compute_nodes_in_db}
2000+
stale_cns = set(self.compute_nodes) - compute_nodes_in_db_nodenames
2001+
2002+
for stale_cn in stale_cns:
2003+
# NOTE(mgoddard): we have found a node in the cache that has no
2004+
# compute node in the DB. This could be due to a node rebalance
2005+
# where another compute service took ownership of the node. Clean
2006+
# up the cache.
2007+
self.remove_node(stale_cn)

nova/tests/functional/regressions/test_bug_1853009.py

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -90,10 +90,6 @@ def test_node_rebalance_deleted_compute_node_race(self):
9090
# update for this node. See
9191
# https://bugs.launchpad.net/nova/+bug/1853159.
9292
host_b.manager.update_available_resource(self.ctxt)
93-
self.assertIn(
94-
'Deleting orphan compute node %s hypervisor host '
95-
'is host_b, nodes are' % cn.id,
96-
self.stdlog.logger.output)
9793
self._assert_hypervisor_api(self.nodename, expected_host='host_b')
9894
# There should only be one resource provider (fake-node).
9995
original_rps = self._get_all_providers()
@@ -157,21 +153,17 @@ def test_node_rebalance_deleted_compute_node_race(self):
157153
self.assertEqual(0, len(rps), rps)
158154

159155
# host_b[3]: Should recreate compute node and resource provider.
160-
# FIXME(mgoddard): Compute node not recreated here, because it is
161-
# already in RT.compute_nodes. See
162-
# https://bugs.launchpad.net/nova/+bug/1853009.
163156
# FIXME(mgoddard): Resource provider not recreated here, because it
164157
# exists in the provider tree. See
165158
# https://bugs.launchpad.net/nova/+bug/1841481.
166159
host_b.manager.update_available_resource(self.ctxt)
167160

168-
# Verify that the node was not recreated.
169-
hypervisors = self.api.api_get(
170-
'/os-hypervisors/detail').body['hypervisors']
171-
self.assertEqual(0, len(hypervisors), hypervisors)
161+
# Verify that the node was recreated.
162+
self._assert_hypervisor_api(self.nodename, 'host_b')
172163

173-
# But the compute node exists in the RT.
174-
self.assertIn(self.nodename, host_b.manager.rt.compute_nodes)
164+
# But due to https://bugs.launchpad.net/nova/+bug/1853159 the compute
165+
# node is not cached in the RT.
166+
self.assertNotIn(self.nodename, host_b.manager.rt.compute_nodes)
175167

176168
# There is no RP.
177169
rps = self._get_all_providers()
@@ -181,6 +173,27 @@ def test_node_rebalance_deleted_compute_node_race(self):
181173
self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
182174
self.nodename))
183175

176+
# host_b[1]: Should add compute node to RT cache and recreate resource
177+
# provider.
178+
# FIXME(mgoddard): Resource provider not recreated here, because it
179+
# exists in the provider tree. See
180+
# https://bugs.launchpad.net/nova/+bug/1841481.
181+
host_b.manager.update_available_resource(self.ctxt)
182+
183+
# Verify that the node still exists.
184+
self._assert_hypervisor_api(self.nodename, 'host_b')
185+
186+
# And it is now in the RT cache.
187+
self.assertIn(self.nodename, host_b.manager.rt.compute_nodes)
188+
189+
# There is still no RP.
190+
rps = self._get_all_providers()
191+
self.assertEqual(0, len(rps), rps)
192+
193+
# But the RP it exists in the provider tree.
194+
self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
195+
self.nodename))
196+
184197
# This fails due to the lack of a resource provider.
185198
self.assertIn(
186199
'Skipping removal of allocations for deleted instances',

nova/tests/unit/compute/test_compute_mgr.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -376,18 +376,20 @@ def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
376376
)
377377

378378
# First node in set should have been removed from DB
379+
# Last node in set should have been added to DB.
379380
for db_node in db_nodes:
380381
if db_node.hypervisor_hostname == 'node1':
381382
db_node.destroy.assert_called_once_with()
382383
rc_mock.delete_resource_provider.assert_called_once_with(
383384
self.context, db_node, cascade=True)
384-
mock_rt.remove_node.assert_called_once_with(
385-
'node1')
385+
mock_rt.remove_node.assert_called_once_with('node1')
386386
mock_log.error.assert_called_once_with(
387387
"Failed to delete compute node resource provider for "
388388
"compute node %s: %s", db_node.uuid, mock.ANY)
389389
else:
390390
self.assertFalse(db_node.destroy.called)
391+
self.assertEqual(1, mock_rt.remove_node.call_count)
392+
mock_rt.clean_compute_node_cache.assert_called_once_with(db_nodes)
391393

392394
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
393395
'delete_resource_provider')

nova/tests/unit/compute/test_resource_tracker.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4227,3 +4227,20 @@ def test__get_providers_to_update_not_in_tree(self, mock_log):
42274227
mock_log.warning.assert_called_once_with(*expected_log_call)
42284228
self.assertIn(uuids.unknown, self.rt.absent_providers)
42294229
self.assertEqual(result, [])
4230+
4231+
4232+
class TestCleanComputeNodeCache(BaseTestCase):
4233+
4234+
def setUp(self):
4235+
super(TestCleanComputeNodeCache, self).setUp()
4236+
self._setup_rt()
4237+
self.context = context.RequestContext(
4238+
mock.sentinel.user_id, mock.sentinel.project_id)
4239+
4240+
@mock.patch.object(resource_tracker.ResourceTracker, "remove_node")
4241+
def test_clean_compute_node_cache(self, mock_remove):
4242+
invalid_nodename = "invalid-node"
4243+
self.rt.compute_nodes[_NODENAME] = self.compute
4244+
self.rt.compute_nodes[invalid_nodename] = mock.sentinel.compute
4245+
self.rt.clean_compute_node_cache([self.compute])
4246+
mock_remove.assert_called_once_with(invalid_nodename)

0 commit comments

Comments
 (0)