Skip to content
8 changes: 7 additions & 1 deletion nexus/external-api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1166,7 +1166,13 @@ pub trait NexusExternalApi {
path_params: Path<params::InstancePath>,
) -> Result<HttpResponseAccepted<Instance>, HttpError>;

/// Stop instance, violently.
/// Terminate instance
///
/// Immediately halts a running instance by rudely terminating its
/// virtual machine process. This moves the instance directly to the
/// "stopped" state, without transitioning through the "stopping" state.
/// This operation can be used to recover an instance that is not
/// responding to requests to stop issued through the instance stop API.
Comment on lines +1173 to +1175
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

putting my mildly-uninformed-user hat on, is there something important i could be missing out on by not transitioning through "stopping"? resources that could be leaked (seems unlikely), or internal instance state that could get weird (seems more likely)? from what's here it doesn't seem unreasonable for a user to always /force-terminate on the assumption that it's more like yanking power, and i dunno how much anyone would be disturbed by that. i recognize this is also kind of the ambiguity @gjcolombo was trying to address, sorry :)

putting my Oxide engineer hat on, it feels like any reason to use /force-terminate is a result of a user trying to unwedge themselves from an Oxide bug. so maybe that's the kind of warding this documentation deserves? though i'm still not sure how load-bearing stopping is..

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The instance will still transition through the Stopping state if you are querying it in the meantime, which will be visible in e.g. the console. It's just that the force-terminate API endpoint does not return until the instance has advanced to Stopped.

#[endpoint {
method = POST,
path = "/v1/instances/{instance}/force-terminate",
Expand Down
164 changes: 123 additions & 41 deletions nexus/src/app/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -784,31 +784,82 @@ impl super::Nexus {
let (.., authz_instance) =
instance_lookup.lookup_for(authz::Action::Modify).await?;

let state = self
let db::datastore::InstanceGestalt {
instance,
active_vmm,
target_vmm,
migration: _,
} = self
.db_datastore
.instance_fetch_with_vmm(opctx, &authz_instance)
.instance_fetch_all(opctx, &authz_instance)
.await?;

// Is the instance currently incarnated in a VMM process?
let Some(vmm) = state.vmm() else {
// If the instance is already un-incarnated, we're all good.
debug!(
opctx.log,
"asked to force terminate an instance that has no VMM, \
doing nothing";
"instance_id" => %authz_instance.id(),
"instance_state" => ?state.instance.runtime(),
);
debug_assert_eq!(
state.instance.runtime().nexus_state,
db::model::InstanceState::NoVmm,
"if an instance has no active VMM, it must be in the NoVmm \
state (this is enforced by a DB check constraint)",
);
return Ok(state);
// If the instance is currently incarnated by VMM process(es), hunt down
// and destroy them.
let terminated_active = match active_vmm {
Some(vmm) => {
self.instance_force_terminate_vmm(
opctx,
&authz_instance,
vmm,
"active",
)
.await
}
None => {
debug!(
opctx.log,
"asked to force terminate an instance that has no active VMM";
"instance_id" => %authz_instance.id(),
"instance_state" => ?instance.runtime(),
);
Ok(())
}
};
let terminated_target = match target_vmm {
Some(vmm) => {
self.instance_force_terminate_vmm(
opctx,
&authz_instance,
vmm,
"target",
)
.await
}
None => Ok(()),
};

// If our attempt to terminate either VMM failed, bail --- but only
// after both futures completed.
terminated_active?;
terminated_target?;

// Ladies and gentlemen, we got him!
self.db_datastore
.instance_fetch_with_vmm(opctx, &authz_instance)
.await
.map_err(Into::into)
}

/// Forcefully terminate a VMM associated with an instance (by calling
/// [`Self::instance_ensure_unregistered`]), and then update the instance's
/// state to reflect that the VMM has been unregistered.
///
/// # Arguments
///
/// - `opctx`: the [`OpContext`] for this action
/// - `authz_instance`: the instance associated with the VMM, so that the
/// instance can be updated to reflect the new VMM state.
/// - `vmm`: the VMM to forcefully terminate
/// - `vmm_role`: a string ("active" or "target") for logging which VMM is
/// being terminated.
async fn instance_force_terminate_vmm(
&self,
opctx: &OpContext,
authz_instance: &authz::Instance,
vmm: db::model::Vmm,
vmm_role: &str,
) -> Result<(), InstanceStateChangeError> {
let propolis_id = PropolisUuid::from_untyped_uuid(vmm.id);
let sled_id = SledUuid::from_untyped_uuid(vmm.sled_id);
let unregister_result =
Expand All @@ -818,35 +869,70 @@ impl super::Nexus {
Ok(Some(state)) => {
info!(
opctx.log,
"instance terminated with extreme prejudice";
"instance's {vmm_role} VMM terminated with extreme \
prejudice";
"instance_id" => %authz_instance.id(),
"vmm_id" => %propolis_id,
"sled_id" => %sled_id,
);

self.notify_vmm_updated(opctx, propolis_id, &state).await?;
// We would like the caller to see the instance they are
// attempting to terminate go to "Stopped", so run the
// instance-update saga synchronously if possible. This is
// particularly important in the case where a migrating instance
// is force-terminated, as an instance with a migration ID will
// remain "Migrating" (not "Stopping") until its migration ID is
// unset, and it seems a bit sad to return a "Migrating"
// instance to a caller who tries to force-kill it.
//
// TODO(eliza): in the case where we are terminating both an
// active VMM and migration target, we will unregister them in
// sequence, running separate update sagas for both VMMs being
// terminated. It would be a bit more efficient to terminate
// both VMMs, update CRDB, and then run a *single* update saga
// to process both VMMs being destroyed. However, this requires
// a bit of annoying refactoring to existing APIs, and I'm not
// sure if improving the `instance-force-terminate` endpoint's
// latency is particularly important...
if let Some((_, saga)) = process_vmm_update(
&self.db_datastore,
opctx,
propolis_id,
&state,
)
.await?
{
self.sagas
.saga_prepare(saga)
.await?
.start()
.await?
.wait_until_stopped()
.await
.into_omicron_result()?;
}
}
// If the returned state from sled-agent is `None`, then the instance
// was already unregistered. This may have been from a prior
// instance-ensure-unregistered call, but, since we observed an
// active VMM above, the current observed VMM generation doesn't
// know that the VMM is gone, so it is possible that the sled-agent
// has misplaced this instance. Therefore, we will attempt to mark
// the VMM as `Failed` at the generation after which we observed the
// VMM. This is safe to do here, because if the instance has been
// unregistered due to a race with another
// instance-ensure-unregistered request (rather than a sled-agent
// failure), that other call will have advanced the state
// generation, and our attempt to write the failed state will not
// succeed, which is fine.
// If the returned state from sled-agent is `None`, then the
// instance was already unregistered. This may have been from a
// prior attempt to stop the instance (either normally or
// forcefully). But, since we observed an active VMM above, the
// current observed VMM generation doesn't know that the VMM is
// gone, so it is possible that the sled-agent has misplaced this
// instance. Therefore, we will attempt to mark the VMM as `Failed`
// at the generation after which we observed the VMM. This is safe
// to do here, because if the instance has been unregistered due to
// a race with another instance-ensure-unregistered request (rather
// than a sled-agent failure), that other call will have advanced
// the state generation, and our attempt to write the failed state
// will not succeed, which is fine.
//
// Either way, the caller should not observe a returned instance
// state that believes itself to be running.
Ok(None) => {
info!(
opctx.log,
"asked to force terminate an instance that was already \
unregistered";
"asked to force terminate an instance's {vmm_role} VMM \
that was already unregistered";
"instance_id" => %authz_instance.id(),
"vmm_id" => %propolis_id,
"sled_id" => %sled_id,
Expand All @@ -869,11 +955,7 @@ impl super::Nexus {
}
Err(e) => return Err(e),
}

self.db_datastore
.instance_fetch_with_vmm(opctx, &authz_instance)
.await
.map_err(Into::into)
Ok(())
}

/// Idempotently ensures that the sled specified in `db_instance` does not
Expand Down
Loading