
Commit 400d25f

Zuul authored and openstack-gerrit committed
Merge "Store old_flavor already on source host during resize" into stable/wallaby
2 parents cf39996 + c8b04d1

2 files changed: +25 -27 lines changed

nova/compute/manager.py

Lines changed: 12 additions & 0 deletions
@@ -5608,6 +5608,14 @@ def _resize_instance(self, context, instance, image,

         instance.host = migration.dest_compute
         instance.node = migration.dest_node
+        # NOTE(gibi): as the instance is now tracked on the destination we
+        # have to make sure that the source compute resource tracker can
+        # track this instance as a migration. For that the resource tracker
+        # needs to see the old_flavor set on the instance. The old_flavor
+        # setting used to be done on the destination host in finish_resize
+        # but that is racy with a source host update_available_resource
+        # periodic run.
+        instance.old_flavor = instance.flavor
         instance.task_state = task_states.RESIZE_MIGRATED
         instance.save(expected_task_state=task_states.RESIZE_MIGRATING)

@@ -5721,6 +5729,10 @@ def _finish_resize(self, context, instance, migration, disk_info,
         # to ACTIVE for backwards compatibility
         old_vm_state = instance.system_metadata.get('old_vm_state',
                                                     vm_states.ACTIVE)
+        # NOTE(gibi): this is already set by resize_instance on the source
+        # node before calling finish_resize on the destination, but during an
+        # upgrade the source node may not yet have the fix for bug 1944759.
+        # This assignment can be removed in the Z release.
         instance.old_flavor = old_flavor

         if old_instance_type_id != new_instance_type_id:
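The NOTE in _resize_instance above is the heart of the fix. As a rough, illustrative sketch (hypothetical helper and names, not the actual ResourceTracker code): once instance.host points at the destination, the old flavor is the only thing the source host's update_available_resource periodic can still account the outbound migration against.

def migration_vcpu_usage_on_host(instance, migration, hostname):
    # Hypothetical helper: how many vCPUs a host must keep reserved for an
    # in-progress resize it is involved in.
    if migration.source_compute == hostname:
        # Outbound resize: the source keeps the old resources reserved
        # until the resize is confirmed or reverted.
        flavor = instance.old_flavor
    else:
        # Inbound resize: the destination tracks the new flavor.
        flavor = instance.new_flavor
    if flavor is None:
        # Before this change old_flavor was only set in finish_resize on the
        # destination, so a source-side periodic running in that window saw
        # nothing to track and freed the pinned CPUs prematurely.
        return 0
    return flavor.vcpus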

nova/tests/functional/libvirt/test_numa_servers.py

Lines changed: 13 additions & 27 deletions
@@ -878,10 +878,10 @@ def inject_periodic_to_finish_resize(*args, **kwargs):

         dst_host = server['OS-EXT-SRV-ATTR:host']

-        # This is a resource accounting bug, we should have 2 cpus pinned on
-        # both computes. The source should have it due to the outbound
-        # migration and the destination due to the instance running there
-        self._assert_pinned_cpus(src_host, 0)
+        # we have 2 cpus pinned on both computes. The source should have it
+        # due to the outbound migration and the destination due to the
+        # instance running there
+        self._assert_pinned_cpus(src_host, 2)
         self._assert_pinned_cpus(dst_host, 2)

         return server, src_host, dst_host
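The assertions above use the _assert_pinned_cpus helper defined elsewhere in this test module. As an assumed sketch of its shape (the real helper may differ in detail), it boils down to counting the pinned pCPUs in the compute node's tracked NUMA topology:

from nova import objects


def count_pinned_cpus(context, hostname):
    # Load the compute node record and its serialized NUMA topology.
    compute_node = objects.ComputeNode.get_by_nodename(context, hostname)
    numa_topology = objects.NUMATopology.obj_from_db_obj(
        compute_node.numa_topology)
    # Sum the pinned pCPUs across all NUMA cells of the host.
    return sum(len(cell.pinned_cpus) for cell in numa_topology.cells)

So self._assert_pinned_cpus(src_host, 2) effectively asserts that this count is 2 for the source host.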
@@ -893,30 +893,17 @@ def test_resize_confirm_bug_1944759(self):
         # Now confirm the resize
         post = {'confirmResize': None}

-        # FIXME(gibi): This is bug 1944759 where during resize, on the source
-        # node the resize_instance() call at the point of calling finish_resize
-        # overlaps with a update_available_resources() periodic job. This
-        # causes that the periodic job will not track the migration nor the
-        # instance and therefore freeing the resource allocation. Then when
-        # later the resize is confirmed the confirm_resize on the source
-        # compute also wants to free up the resources, the pinned CPUs, and it
-        # fails as they are already freed.
-        exc = self.assertRaises(
-            client.OpenStackApiException,
-            self.api.post_server_action, server['id'], post
-        )
-        self.assertEqual(500, exc.response.status_code)
-        self.assertIn('CPUUnpinningInvalid', str(exc))
+        self.api.post_server_action(server['id'], post)
+        self._wait_for_state_change(server, 'ACTIVE')

-        # confirm failed above but the resource allocation reflects that the
-        # VM is running on the dest node
+        # the resource allocation reflects that the VM is running on the dest
+        # node
         self._assert_pinned_cpus(src_host, 0)
         self._assert_pinned_cpus(dst_host, 2)

+        # and running periodics does not break it either
         self._run_periodics()

-        # and such allocation situation is stable so as a recovery the VM
-        # can be reset-state to ACTIVE without problem.
         self._assert_pinned_cpus(src_host, 0)
         self._assert_pinned_cpus(dst_host, 2)
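Before this fix the confirm call above failed with a 500 carrying CPUUnpinningInvalid, as the removed FIXME describes. A toy, self-contained illustration of that failure mode (not Nova code): confirm_resize tries to unpin the old flavor's CPUs on the source after the racy periodic has already released them.

class PinnedCpuPool:
    # Stand-in for the per-host pinned CPU bookkeeping.
    def __init__(self, pinned):
        self.pinned = set(pinned)

    def unpin(self, cpus):
        if not set(cpus) <= self.pinned:
            # analogous to nova.exception.CPUUnpinningInvalid
            raise RuntimeError('trying to unpin CPUs that are not pinned')
        self.pinned -= set(cpus)


source = PinnedCpuPool(pinned=set())  # periodic already freed CPUs 0 and 1
try:
    source.unpin({0, 1})              # what confirm_resize did before the fix
except RuntimeError as exc:
    print(exc)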

@@ -932,15 +919,14 @@ def test_resize_revert_bug_1944759(self):
         self.api.post_server_action(server['id'], post)
         self._wait_for_state_change(server, 'ACTIVE')

-        # This is a resource accounting bug. After the revert the source host
-        # should have 2 cpus pinned due to the instance.
-        self._assert_pinned_cpus(src_host, 0)
+        # After the revert the source host should have 2 cpus pinned due to
+        # the instance.
+        self._assert_pinned_cpus(src_host, 2)
         self._assert_pinned_cpus(dst_host, 0)

-        # running the periodic job will fix the resource accounting
+        # running the periodic job will not break it either
         self._run_periodics()

-        # this is now correct
         self._assert_pinned_cpus(src_host, 2)
         self._assert_pinned_cpus(dst_host, 0)
