Skip to content

Commit a7d0866

Browse files
Zuulopenstack-gerrit
authored andcommitted
Merge "Clear rebalanced compute nodes from resource tracker" into stable/wallaby
2 parents 5f2f283 + f950ced commit a7d0866

File tree

5 files changed

+66
-18
lines changed

5 files changed

+66
-18
lines changed

nova/compute/manager.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9998,6 +9998,8 @@ def update_available_resource(self, context, startup=False):
99989998
use_slave=True,
99999999
startup=startup)
1000010000

10001+
self.rt.clean_compute_node_cache(compute_nodes_in_db)
10002+
1000110003
# Delete orphan compute node not reported by driver but still in db
1000210004
for cn in compute_nodes_in_db:
1000310005
if cn.hypervisor_hostname not in nodenames:

nova/compute/resource_tracker.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1973,3 +1973,20 @@ def finish_evacuation(self, instance, node, migration):
19731973
if migration:
19741974
migration.status = 'done'
19751975
migration.save()
1976+
1977+
@utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True)
1978+
def clean_compute_node_cache(self, compute_nodes_in_db):
1979+
"""Clean the compute node cache of any nodes that no longer exist.
1980+
1981+
:param compute_nodes_in_db: list of ComputeNode objects from the DB.
1982+
"""
1983+
compute_nodes_in_db_nodenames = {cn.hypervisor_hostname
1984+
for cn in compute_nodes_in_db}
1985+
stale_cns = set(self.compute_nodes) - compute_nodes_in_db_nodenames
1986+
1987+
for stale_cn in stale_cns:
1988+
# NOTE(mgoddard): we have found a node in the cache that has no
1989+
# compute node in the DB. This could be due to a node rebalance
1990+
# where another compute service took ownership of the node. Clean
1991+
# up the cache.
1992+
self.remove_node(stale_cn)

nova/tests/functional/regressions/test_bug_1853009.py

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -82,9 +82,6 @@ def test_node_rebalance_deleted_compute_node_race(self):
8282
# Now run the update_available_resource periodic to register fake-node
8383
# and have it managed by host_b. This will also detect the "host_b"
8484
# node as orphaned and delete it along with its resource provider.
85-
cn_host_b_node = objects.ComputeNode.get_by_host_and_nodename(
86-
self.ctxt, 'host_b', 'host_b',
87-
)
8885

8986
# host_b[1]: Finds no compute record in RT. Tries to create one
9087
# (_init_compute_node).
@@ -93,10 +90,6 @@ def test_node_rebalance_deleted_compute_node_race(self):
9390
# update for this node. See
9491
# https://bugs.launchpad.net/nova/+bug/1853159.
9592
host_b.manager.update_available_resource(self.ctxt)
96-
self.assertIn(
97-
'Deleting orphan compute node %s hypervisor host '
98-
'is host_b, nodes are' % cn_host_b_node.id,
99-
self.stdlog.logger.output)
10093
self._assert_hypervisor_api(self.nodename, expected_host='host_b')
10194
# There should only be one resource provider (fake-node).
10295
original_rps = self._get_all_providers()
@@ -160,21 +153,17 @@ def test_node_rebalance_deleted_compute_node_race(self):
160153
self.assertEqual(0, len(rps), rps)
161154

162155
# host_b[3]: Should recreate compute node and resource provider.
163-
# FIXME(mgoddard): Compute node not recreated here, because it is
164-
# already in RT.compute_nodes. See
165-
# https://bugs.launchpad.net/nova/+bug/1853009.
166156
# FIXME(mgoddard): Resource provider not recreated here, because it
167157
# exists in the provider tree. See
168158
# https://bugs.launchpad.net/nova/+bug/1841481.
169159
host_b.manager.update_available_resource(self.ctxt)
170160

171-
# Verify that the node was not recreated.
172-
hypervisors = self.api.api_get(
173-
'/os-hypervisors/detail').body['hypervisors']
174-
self.assertEqual(0, len(hypervisors), hypervisors)
161+
# Verify that the node was recreated.
162+
self._assert_hypervisor_api(self.nodename, 'host_b')
175163

176-
# But the compute node exists in the RT.
177-
self.assertIn(self.nodename, host_b.manager.rt.compute_nodes)
164+
# But due to https://bugs.launchpad.net/nova/+bug/1853159 the compute
165+
# node is not cached in the RT.
166+
self.assertNotIn(self.nodename, host_b.manager.rt.compute_nodes)
178167

179168
# There is no RP.
180169
rps = self._get_all_providers()
@@ -184,6 +173,27 @@ def test_node_rebalance_deleted_compute_node_race(self):
184173
self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
185174
self.nodename))
186175

176+
# host_b[1]: Should add compute node to RT cache and recreate resource
177+
# provider.
178+
# FIXME(mgoddard): Resource provider not recreated here, because it
179+
# exists in the provider tree. See
180+
# https://bugs.launchpad.net/nova/+bug/1841481.
181+
host_b.manager.update_available_resource(self.ctxt)
182+
183+
# Verify that the node still exists.
184+
self._assert_hypervisor_api(self.nodename, 'host_b')
185+
186+
# And it is now in the RT cache.
187+
self.assertIn(self.nodename, host_b.manager.rt.compute_nodes)
188+
189+
# There is still no RP.
190+
rps = self._get_all_providers()
191+
self.assertEqual(0, len(rps), rps)
192+
193+
# But the RP it exists in the provider tree.
194+
self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
195+
self.nodename))
196+
187197
# This fails due to the lack of a resource provider.
188198
self.assertIn(
189199
'Skipping removal of allocations for deleted instances',

nova/tests/unit/compute/test_compute_mgr.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -374,18 +374,20 @@ def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
374374
)
375375

376376
# First node in set should have been removed from DB
377+
# Last node in set should have been added to DB.
377378
for db_node in db_nodes:
378379
if db_node.hypervisor_hostname == 'node1':
379380
db_node.destroy.assert_called_once_with()
380381
rc_mock.delete_resource_provider.assert_called_once_with(
381382
self.context, db_node, cascade=True)
382-
mock_rt.remove_node.assert_called_once_with(
383-
'node1')
383+
mock_rt.remove_node.assert_called_once_with('node1')
384384
mock_log.error.assert_called_once_with(
385385
"Failed to delete compute node resource provider for "
386386
"compute node %s: %s", db_node.uuid, mock.ANY)
387387
else:
388388
self.assertFalse(db_node.destroy.called)
389+
self.assertEqual(1, mock_rt.remove_node.call_count)
390+
mock_rt.clean_compute_node_cache.assert_called_once_with(db_nodes)
389391

390392
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
391393
'delete_resource_provider')

nova/tests/unit/compute/test_resource_tracker.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4178,3 +4178,20 @@ def test__get_providers_to_update_not_in_tree(self, mock_log):
41784178
mock_log.warning.assert_called_once_with(*expected_log_call)
41794179
self.assertIn(uuids.unknown, self.rt.absent_providers)
41804180
self.assertEqual(result, [])
4181+
4182+
4183+
class TestCleanComputeNodeCache(BaseTestCase):
4184+
4185+
def setUp(self):
4186+
super(TestCleanComputeNodeCache, self).setUp()
4187+
self._setup_rt()
4188+
self.context = context.RequestContext(
4189+
mock.sentinel.user_id, mock.sentinel.project_id)
4190+
4191+
@mock.patch.object(resource_tracker.ResourceTracker, "remove_node")
4192+
def test_clean_compute_node_cache(self, mock_remove):
4193+
invalid_nodename = "invalid-node"
4194+
self.rt.compute_nodes[_NODENAME] = self.compute
4195+
self.rt.compute_nodes[invalid_nodename] = mock.sentinel.compute
4196+
self.rt.clean_compute_node_cache([self.compute])
4197+
mock_remove.assert_called_once_with(invalid_nodename)

0 commit comments

Comments
 (0)