compute: Don't detach volumes when RescheduledException raised without retry

lyarwood · lyarwood · commit eda458828bf4 · 2020-12-07T09:42:53.000Z
I8b1c05317734e14ea73dc868941351bb31210bf0 introduced a crude call to _cleanup_volumes within _do_build_and_run_instance when handling a RescheduledException exception raised from _build_and_run_instance without any retry information provided from the scheduler. This situation can arise when using the 'availability_zone' parameter to skip the scheduler by providing both a target availability_zone and host in the format of `$availability_zone:$host`. If the instance is unable to build on the compute the failure will eventually lead to _cleanup_volumes calling DriverVolumeBlockDevice.detach that will either detach (cinderv2) or delete the associated volume attachments (cinderv3) moving the volume to an `available` state, assuming it isn't multi-attached etc. The issue with this is that this behaviour is in stark contrast to that of volumes associated with instances that have failed to schedule. In that case the volumes remain marked as reserved and associated with the ERROR'd out instance until the instance itself is deleted. This change aims to align both cases by removing the call to _cleanup_volumes and in doing so keeping any volumes in a `reserved` state until the underlying instance is deleted. Note that leaving these volumes associated with ERROR'd out instances is now safe after I4dc6c8bd3bb6c135f8a698af41f5d0e026c39117 landed and now ensures that ports and volumes associated with such an instance are correctly cleaned up. Closes-Bug: #1899649 Change-Id: I5dda9e8bca5fbaae77ece12b67176945ca4d9a4c (cherry picked from commit 26c46a4)
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
@@ -2230,8 +2230,6 @@ def _do_build_and_run_instance(self, context, instance, image,
                     instance=instance)
                 self._cleanup_allocated_networks(context, instance,
                     requested_networks)
-                self._cleanup_volumes(context, instance,
-                    block_device_mapping, raise_exc=False)
                 compute_utils.add_instance_fault_from_exc(context,
                         instance, e, sys.exc_info(),
                         fault_message=e.kwargs['reason'])
diff --git a/nova/tests/functional/regressions/test_bug_1899649.py b/nova/tests/functional/regressions/test_bug_1899649.py
@@ -92,19 +92,9 @@ def test_failure_to_schedule_with_host(self):
         self._assert_failure_and_volume_attachments(server)
 
     def test_failure_to_build_with_az_and_host(self):
-        # Assert that a volume attachments does not remain after a failure to
+        # Assert that a volume attachments remain after a failure to
         # build and reschedule by providing an availability_zone *and* host,
         # skipping the scheduler. This is bug #1899649.
         self.server['availability_zone'] = 'nova:compute1'
         server = self.admin_api.post_server({'server': self.server})
-
-        # Assert the server ends up in an ERROR state
-        self._wait_for_state_change(server, 'ERROR')
-
-        # FIXME(lyarwood): A single volume attachment should be present for the
-        # instance at this stage as the volume *can* otherwise be marked as
-        # available within Cinder if it isn't multi-attached.
-        attachments = self.cinder.volume_to_attachment.get(self.volume_id)
-        self.assertEqual(0, len(attachments))
-        self.assertNotIn(
-            self.volume_id, self.cinder.volume_ids_for_instance(server['id']))
+        self._assert_failure_and_volume_attachments(server)
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
@@ -6875,14 +6875,13 @@ def test_rescheduled_exception_with_sriov_network_allocated(self,
     @mock.patch.object(objects.Instance, 'save')
     @mock.patch.object(manager.ComputeManager,
                        '_nil_out_instance_obj_host_and_node')
-    @mock.patch.object(manager.ComputeManager, '_cleanup_volumes')
     @mock.patch.object(manager.ComputeManager, '_cleanup_allocated_networks')
     @mock.patch.object(manager.ComputeManager, '_set_instance_obj_error_state')
     @mock.patch.object(compute_utils, 'add_instance_fault_from_exc')
     @mock.patch.object(manager.ComputeManager, '_build_and_run_instance')
     def test_rescheduled_exception_without_retry(self,
-            mock_build_run, mock_add, mock_set, mock_clean_net, mock_clean_vol,
-            mock_nil, mock_save, mock_start, mock_finish):
+            mock_build_run, mock_add, mock_set, mock_clean_net, mock_nil,
+            mock_save, mock_start, mock_finish):
         self._do_build_instance_update(mock_save)
         mock_build_run.side_effect = exception.RescheduledException(reason='',
                 instance_uuid=self.instance.uuid)
@@ -6918,9 +6917,6 @@ def _wrapped_do_build_and_run_instance(*args, **kwargs):
                 self.accel_uuids)
         mock_clean_net.assert_called_once_with(self.context, self.instance,
                 self.requested_networks)
-        mock_clean_vol.assert_called_once_with(self.context,
-                self.instance, self.block_device_mapping,
-                raise_exc=False)
         mock_add.assert_called_once_with(self.context, self.instance,
                 mock.ANY, mock.ANY, fault_message=mock.ANY)
         mock_nil.assert_called_once_with(self.instance)