Clear rebalanced compute nodes from resource tracker

stephenfin · jovial · commit 730071f6bfaf · 2021-05-14T17:47:18.000+01:00
There is a race condition in nova-compute with the ironic virt driver as
nodes get rebalanced. It can lead to compute nodes being removed from the
DB and not repopulated. Ultimately this prevents instances from being
scheduled to these nodes.

The issue being addressed here is that if a compute node is deleted by a host
which thinks it is an orphan, then the compute host that actually owns the node
might not recreate it if the node is already in its resource tracker cache.

This change fixes the issue by clearing from the resource tracker cache any
nodes for which no compute node entry exists in the DB. Then, when the available
resource for the node is updated, the compute node object is not found in the
cache and gets recreated.

Change-Id: I39241223b447fcc671161c370dbf16e1773b684a
Partial-Bug: #1853009
(cherry picked from commit 8f5a078dd7bbe5b6b38cf8e04d916281dc418409)
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
@@ -9898,6 +9898,8 @@ def update_available_resource(self, context, startup=False):
                                                             use_slave=True,
                                                             startup=startup)
 
+        self.rt.clean_compute_node_cache(compute_nodes_in_db)
+
         # Delete orphan compute node not reported by driver but still in db
         for cn in compute_nodes_in_db:
             if cn.hypervisor_hostname not in nodenames:
diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
@@ -1988,3 +1988,20 @@ def finish_evacuation(self, instance, node, migration):
         if migration:
             migration.status = 'done'
             migration.save()
+
+    @utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True)
+    def clean_compute_node_cache(self, compute_nodes_in_db):
+        """Clean the compute node cache of any nodes that no longer exist.
+
+        :param compute_nodes_in_db: list of ComputeNode objects from the DB.
+        """
+        compute_nodes_in_db_nodenames = {cn.hypervisor_hostname
+                                         for cn in compute_nodes_in_db}
+        stale_cns = set(self.compute_nodes) - compute_nodes_in_db_nodenames
+
+        for stale_cn in stale_cns:
+            # NOTE(mgoddard): we have found a node in the cache that has no
+            # compute node in the DB. This could be due to a node rebalance
+            # where another compute service took ownership of the node. Clean
+            # up the cache.
+            self.remove_node(stale_cn)
diff --git a/nova/tests/functional/regressions/test_bug_1853009.py b/nova/tests/functional/regressions/test_bug_1853009.py
@@ -90,10 +90,6 @@ def test_node_rebalance_deleted_compute_node_race(self):
         # update for this node. See
         # https://bugs.launchpad.net/nova/+bug/1853159.
         host_b.manager.update_available_resource(self.ctxt)
-        self.assertIn(
-            'Deleting orphan compute node %s hypervisor host '
-            'is host_b, nodes are' % cn.id,
-            self.stdlog.logger.output)
         self._assert_hypervisor_api(self.nodename, expected_host='host_b')
         # There should only be one resource provider (fake-node).
         original_rps = self._get_all_providers()
@@ -157,21 +153,17 @@ def test_node_rebalance_deleted_compute_node_race(self):
         self.assertEqual(0, len(rps), rps)
 
         # host_b[3]: Should recreate compute node and resource provider.
-        # FIXME(mgoddard): Compute node not recreated here, because it is
-        # already in RT.compute_nodes. See
-        # https://bugs.launchpad.net/nova/+bug/1853009.
         # FIXME(mgoddard): Resource provider not recreated here, because it
         # exists in the provider tree. See
         # https://bugs.launchpad.net/nova/+bug/1841481.
         host_b.manager.update_available_resource(self.ctxt)
 
-        # Verify that the node was not recreated.
-        hypervisors = self.api.api_get(
-            '/os-hypervisors/detail').body['hypervisors']
-        self.assertEqual(0, len(hypervisors), hypervisors)
+        # Verify that the node was recreated.
+        self._assert_hypervisor_api(self.nodename, 'host_b')
 
-        # But the compute node exists in the RT.
-        self.assertIn(self.nodename, host_b.manager.rt.compute_nodes)
+        # But due to https://bugs.launchpad.net/nova/+bug/1853159 the compute
+        # node is not cached in the RT.
+        self.assertNotIn(self.nodename, host_b.manager.rt.compute_nodes)
 
         # There is no RP.
         rps = self._get_all_providers()
@@ -181,6 +173,27 @@ def test_node_rebalance_deleted_compute_node_race(self):
         self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
             self.nodename))
 
+        # host_b[1]: Should add compute node to RT cache and recreate resource
+        # provider.
+        # FIXME(mgoddard): Resource provider not recreated here, because it
+        # exists in the provider tree. See
+        # https://bugs.launchpad.net/nova/+bug/1841481.
+        host_b.manager.update_available_resource(self.ctxt)
+
+        # Verify that the node still exists.
+        self._assert_hypervisor_api(self.nodename, 'host_b')
+
+        # And it is now in the RT cache.
+        self.assertIn(self.nodename, host_b.manager.rt.compute_nodes)
+
+        # There is still no RP.
+        rps = self._get_all_providers()
+        self.assertEqual(0, len(rps), rps)
+
+        # But the RP exists in the provider tree.
+        self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
+            self.nodename))
+
         # This fails due to the lack of a resource provider.
         self.assertIn(
             'Skipping removal of allocations for deleted instances',
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
@@ -376,18 +376,20 @@ def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
         )
 
         # First node in set should have been removed from DB
+        # Last node in set should have been added to DB.
         for db_node in db_nodes:
             if db_node.hypervisor_hostname == 'node1':
                 db_node.destroy.assert_called_once_with()
                 rc_mock.delete_resource_provider.assert_called_once_with(
                     self.context, db_node, cascade=True)
-                mock_rt.remove_node.assert_called_once_with(
-                    'node1')
+                mock_rt.remove_node.assert_called_once_with('node1')
                 mock_log.error.assert_called_once_with(
                     "Failed to delete compute node resource provider for "
                     "compute node %s: %s", db_node.uuid, mock.ANY)
             else:
                 self.assertFalse(db_node.destroy.called)
+        self.assertEqual(1, mock_rt.remove_node.call_count)
+        mock_rt.clean_compute_node_cache.assert_called_once_with(db_nodes)
 
     @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
                 'delete_resource_provider')
diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py
@@ -4227,3 +4227,20 @@ def test__get_providers_to_update_not_in_tree(self, mock_log):
         mock_log.warning.assert_called_once_with(*expected_log_call)
         self.assertIn(uuids.unknown, self.rt.absent_providers)
         self.assertEqual(result, [])
+
+
+class TestCleanComputeNodeCache(BaseTestCase):
+
+    def setUp(self):
+        super(TestCleanComputeNodeCache, self).setUp()
+        self._setup_rt()
+        self.context = context.RequestContext(
+            mock.sentinel.user_id, mock.sentinel.project_id)
+
+    @mock.patch.object(resource_tracker.ResourceTracker, "remove_node")
+    def test_clean_compute_node_cache(self, mock_remove):
+        invalid_nodename = "invalid-node"
+        self.rt.compute_nodes[_NODENAME] = self.compute
+        self.rt.compute_nodes[invalid_nodename] = mock.sentinel.compute
+        self.rt.clean_compute_node_cache([self.compute])
+        mock_remove.assert_called_once_with(invalid_nodename)