@@ -711,6 +711,127 @@ def fake_confirm_migration(*args, **kwargs):
        server = self._wait_for_state_change(server, 'ACTIVE')

+    def _assert_pinned_cpus(self, hostname, expected_number_of_pinned):
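+        # Look up the compute node's NUMA topology as recorded in the DB and
+        # count how many pCPUs the resource tracker currently has pinned in
+        # cell 0.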
+        numa_topology = objects.NUMATopology.obj_from_db_obj(
+            objects.ComputeNode.get_by_nodename(
+                self.ctxt, hostname,
+            ).numa_topology,
+        )
+        self.assertEqual(
+            expected_number_of_pinned, len(numa_topology.cells[0].pinned_cpus))
+
+    def _create_server_and_resize_bug_1944759(self):
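+        # pCPUs 0-3 are available for pinned instances, 4-7 back shared
+        # vCPUs, and the legacy vcpu_pin_set option is cleared in favor of
+        # the per-compute cpu_dedicated_set / cpu_shared_set options.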
+        self.flags(
+            cpu_dedicated_set='0-3', cpu_shared_set='4-7', group='compute')
+        self.flags(vcpu_pin_set=None)
+
+        # start services
+        self.start_compute(hostname='test_compute0')
+        self.start_compute(hostname='test_compute1')
+
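+        # Boot a server with a 2 vCPU dedicated (pinned) flavor; this pins
+        # 2 pCPUs on whichever host the scheduler picks.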
+        flavor_a_id = self._create_flavor(
+            vcpu=2, extra_spec={'hw:cpu_policy': 'dedicated'})
+        server = self._create_server(flavor_id=flavor_a_id)
+
+        src_host = server['OS-EXT-SRV-ATTR:host']
+        self._assert_pinned_cpus(src_host, 2)
+
+        # we don't really care what the new flavor is, so long as the old
+        # flavor is using pinning. We use a similar flavor for simplicity.
+        flavor_b_id = self._create_flavor(
+            vcpu=2, extra_spec={'hw:cpu_policy': 'dedicated'})
+
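+        # Keep a reference to the real RPC method so the stub below can
+        # still delegate to it after running the periodic.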
+        orig_rpc_finish_resize = nova.compute.rpcapi.ComputeAPI.finish_resize
+
+        # Simulate that the finish_resize call overlaps with an
+        # update_available_resource periodic job
+        def inject_periodic_to_finish_resize(*args, **kwargs):
+            self._run_periodics()
+            return orig_rpc_finish_resize(*args, **kwargs)
+
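+        # Install the wrapper in place of the real RPC call so the periodic
+        # fires right as the source compute hands the resize over to the
+        # destination.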
+        self.stub_out(
+            'nova.compute.rpcapi.ComputeAPI.finish_resize',
+            inject_periodic_to_finish_resize,
+        )
+
+        # TODO(stephenfin): The mock of 'migrate_disk_and_power_off' should
+        # probably be less...dumb
+        with mock.patch(
+            'nova.virt.libvirt.driver.LibvirtDriver'
+            '.migrate_disk_and_power_off', return_value='{}',
+        ):
+            post = {'resize': {'flavorRef': flavor_b_id}}
+            self.api.post_server_action(server['id'], post)
+            server = self._wait_for_state_change(server, 'VERIFY_RESIZE')
+
+        dst_host = server['OS-EXT-SRV-ATTR:host']
+
+        # This is a resource accounting bug: we should have 2 CPUs pinned on
+        # both computes, the source because of the outbound migration and the
+        # destination because of the instance running there.
+        self._assert_pinned_cpus(src_host, 0)
+        self._assert_pinned_cpus(dst_host, 2)
+
+        return server, src_host, dst_host
+
+    def test_resize_confirm_bug_1944759(self):
+        server, src_host, dst_host = (
+            self._create_server_and_resize_bug_1944759())
+
+        # Now confirm the resize
+        post = {'confirmResize': None}
+
+        # FIXME(gibi): This is bug 1944759: during resize, the
+        # resize_instance() call on the source node, at the point where it
+        # calls finish_resize, overlaps with an update_available_resource()
+        # periodic job. As a result the periodic job tracks neither the
+        # migration nor the instance, and therefore frees the resource
+        # allocation. When the resize is later confirmed, confirm_resize on
+        # the source compute also tries to free up the resources, the pinned
+        # CPUs, and it fails as they are already freed.
+        exc = self.assertRaises(
+            client.OpenStackApiException,
+            self.api.post_server_action, server['id'], post
+        )
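+        # The CPU unpinning failure propagates back to the API call, which
+        # returns it as an HTTP 500.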
+        self.assertEqual(500, exc.response.status_code)
+        self.assertIn('CPUUnpinningInvalid', str(exc))
+
+        # confirm failed above but the resource allocation reflects that the
+        # VM is running on the dest node
+        self._assert_pinned_cpus(src_host, 0)
+        self._assert_pinned_cpus(dst_host, 2)
+
+        self._run_periodics()
+
+        # This allocation situation is stable, so as a recovery the VM can be
+        # reset-state to ACTIVE without problems.
+        self._assert_pinned_cpus(src_host, 0)
+        self._assert_pinned_cpus(dst_host, 2)
+
+    def test_resize_revert_bug_1944759(self):
+        server, src_host, dst_host = (
+            self._create_server_and_resize_bug_1944759())
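+        # Same broken starting state as in the confirm test: 0 CPUs pinned
+        # on the source and 2 on the destination.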
+
+        # Now revert the resize
+        post = {'revertResize': None}
+
+        # The revert actually succeeds (unlike confirm) but the resource
+        # accounting is still broken
+        self.api.post_server_action(server['id'], post)
+        self._wait_for_state_change(server, 'ACTIVE')
+
+
+        # This is a resource accounting bug. After the revert the source host
+        # should have 2 CPUs pinned due to the instance.
+        self._assert_pinned_cpus(src_host, 0)
+        self._assert_pinned_cpus(dst_host, 0)
+
+        # running the periodic job will fix the resource accounting
+        self._run_periodics()
+
+        # this is now correct
+        self._assert_pinned_cpus(src_host, 2)
+        self._assert_pinned_cpus(dst_host, 0)
+

class NUMAServerTestWithCountingQuotaFromPlacement(NUMAServersTest):