Skip to content
1 change: 1 addition & 0 deletions nexus/external-api/output/nexus_tags.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ instance_disk_list GET /v1/instances/{instance}/disks
instance_ephemeral_ip_attach POST /v1/instances/{instance}/external-ips/ephemeral
instance_ephemeral_ip_detach DELETE /v1/instances/{instance}/external-ips/ephemeral
instance_external_ip_list GET /v1/instances/{instance}/external-ips
instance_force_terminate POST /v1/instances/{instance}/force-terminate
instance_list GET /v1/instances
instance_network_interface_create POST /v1/network-interfaces
instance_network_interface_delete DELETE /v1/network-interfaces/{interface}
Expand Down
18 changes: 18 additions & 0 deletions nexus/external-api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1166,6 +1166,24 @@ pub trait NexusExternalApi {
path_params: Path<params::InstancePath>,
) -> Result<HttpResponseAccepted<Instance>, HttpError>;

/// Terminate instance
///
/// Immediately halts a running instance by rudely terminating its
/// virtual machine process. The instance may still be observed in the
/// "stopping" state while it is being torn down, but this endpoint does
/// not return until the instance has reached the "stopped" state.
/// This operation can be used to recover an instance that is not
/// responding to requests to stop issued through the instance stop API.
Comment on lines +1173 to +1175
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Putting my mildly-uninformed-user hat on: is there something important I could be missing out on by not transitioning through "stopping"? Resources that could be leaked (seems unlikely), or internal instance state that could get weird (seems more likely)? From what's here it doesn't seem unreasonable for a user to always use /force-terminate on the assumption that it's more like yanking power, and I don't know how much anyone would be disturbed by that. I recognize this is also kind of the ambiguity @gjcolombo was trying to address, sorry :)

Putting my Oxide engineer hat on: it feels like any reason to use /force-terminate is the result of a user trying to unwedge themselves from an Oxide bug. So maybe that's the kind of warning this documentation deserves? Though I'm still not sure how load-bearing "stopping" is.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The instance will still transition through the Stopping state if you are querying it in the meantime, which will be visible in e.g. the console. It's just that the force-terminate API endpoint does not return until the instance has advanced to Stopped.

/// Forcibly terminate an instance.
///
/// NOTE(review): per the PR discussion, the instance is still observable
/// in the "stopping" state while teardown is in progress; this endpoint
/// simply does not return until the instance has reached "stopped".
#[endpoint {
method = POST,
path = "/v1/instances/{instance}/force-terminate",
tags = ["instances"],
}]
async fn instance_force_terminate(
rqctx: RequestContext<Self::Context>,
query_params: Query<params::OptionalProjectSelector>,
path_params: Path<params::InstancePath>,
) -> Result<HttpResponseAccepted<Instance>, HttpError>;

/// Fetch instance serial console
#[endpoint {
method = GET,
Expand Down
304 changes: 253 additions & 51 deletions nexus/src/app/instance.rs

Large diffs are not rendered by default.

67 changes: 28 additions & 39 deletions nexus/src/app/sagas/instance_start.rs
Original file line number Diff line number Diff line change
Expand Up @@ -671,56 +671,45 @@ async fn sis_ensure_registered_undo(
"error" => ?e);

// If the failure came from talking to sled agent, and the error code
// indicates the instance or sled might be unhealthy, manual
// intervention is likely to be needed, so try to mark the instance as
// Failed and then bail on unwinding.
// indicates the VMM has been forgotten by sled-agent, try to mark the
// VMM as Failed and continue unwinding. If we cannot mark it as failed,
// it has presumably just already been unregistered, which is fine.
//
// If sled agent is in good shape but just doesn't know about the
// instance, this saga still owns the instance's state, so allow
// unwinding to continue.
// If we were unable to communicate with the sled agent, then we cannot
// properly clean up this VMM, and manual intervention may be required.
//
// If some other Nexus error occurred, this saga is in bad shape, so
// return an error indicating that intervention is needed without trying
// to modify the instance further.
//
// TODO(#3238): `instance_unhealthy` does not take an especially nuanced
// view of the meanings of the error codes sled agent could return, so
// assuming that an error that isn't `instance_unhealthy` means
// that everything is hunky-dory and it's OK to continue unwinding may
// be a bit of a stretch. See the definition of `instance_unhealthy` for
// more details.
match e {
InstanceStateChangeError::SledAgent(inner) if inner.vmm_gone() => {
error!(osagactx.log(),
"start saga: failing instance after unregister failure";
"instance_id" => %instance_id,
"start_reason" => ?params.reason,
"error" => ?inner);

if let Err(set_failed_error) = osagactx
info!(
osagactx.log(),
"start saga: VMM has either already been unregistered \
or has been forgotten by sled-agent";
"instance_id" => %instance_id,
"start_reason" => ?params.reason,
"error" => %inner
);
osagactx
.nexus()
.mark_vmm_failed(&opctx, authz_instance, &db_vmm, &inner)
.await
{
error!(osagactx.log(),
"start saga: failed to mark instance as failed";
"instance_id" => %instance_id,
"start_reason" => ?params.reason,
"error" => ?set_failed_error);

Err(set_failed_error.into())
} else {
Err(inner.0.into())
}
}
InstanceStateChangeError::SledAgent(_) => {
info!(osagactx.log(),
"start saga: instance already unregistered from sled";
"instance_id" => %instance_id,
"start_reason" => ?params.reason);

.await;
Ok(())
}
InstanceStateChangeError::SledAgent(inner) => {
error!(
osagactx.log(),
"start saga: failed to unregister VMM with sled-agent";
"instance_id" => %instance_id,
"error" => %inner,
"start_reason" => ?params.reason,
);

// TODO(eliza): we should probably retry communication errors a
// few times before giving up on unwinding entirely!
Err(inner.into())
}
InstanceStateChangeError::Other(inner) => {
error!(osagactx.log(),
"start saga: internal error unregistering instance";
Expand Down
30 changes: 30 additions & 0 deletions nexus/src/external_api/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2271,6 +2271,36 @@ impl NexusExternalApi for NexusExternalApiImpl {
.await
}

/// HTTP handler for `POST /v1/instances/{instance}/force-terminate`.
///
/// Resolves the target instance from the path parameter (scoped by the
/// optional project selector in the query string), then asks Nexus to
/// forcibly terminate it. The inner future is instrumented via
/// `external_latencies` so this endpoint's request latency is recorded.
async fn instance_force_terminate(
    rqctx: RequestContext<Self::Context>,
    query_params: Query<params::OptionalProjectSelector>,
    path_params: Path<params::InstancePath>,
) -> Result<HttpResponseAccepted<Instance>, HttpError> {
    let apictx = rqctx.context();
    let nexus = &apictx.context.nexus;
    // Fold the path parameter and the optional project scope into a
    // single selector used for the instance lookup below.
    let selector = params::InstanceSelector {
        project: query_params.into_inner().project,
        instance: path_params.into_inner().instance,
    };
    let handler = async {
        let opctx =
            crate::context::op_context_for_external_api(&rqctx).await?;
        let lookup = nexus.instance_lookup(&opctx, selector)?;
        let instance =
            nexus.instance_force_terminate(&opctx, &lookup).await?;
        // 202 Accepted: the terminate request has been taken; the body
        // carries the instance's externally-visible view.
        Ok(HttpResponseAccepted(instance.into()))
    };
    apictx
        .context
        .external_latencies
        .instrument_dropshot_handler(&rqctx, handler)
        .await
}

async fn instance_serial_console(
rqctx: RequestContext<ApiContext>,
path_params: Path<params::InstancePath>,
Expand Down
Loading