Skip to content
1 change: 1 addition & 0 deletions nexus/external-api/output/nexus_tags.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ instance_disk_list GET /v1/instances/{instance}/disks
instance_ephemeral_ip_attach POST /v1/instances/{instance}/external-ips/ephemeral
instance_ephemeral_ip_detach DELETE /v1/instances/{instance}/external-ips/ephemeral
instance_external_ip_list GET /v1/instances/{instance}/external-ips
instance_force_terminate POST /v1/instances/{instance}/force-terminate
instance_list GET /v1/instances
instance_network_interface_create POST /v1/network-interfaces
instance_network_interface_delete DELETE /v1/network-interfaces/{interface}
Expand Down
18 changes: 18 additions & 0 deletions nexus/external-api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1166,6 +1166,24 @@ pub trait NexusExternalApi {
path_params: Path<params::InstancePath>,
) -> Result<HttpResponseAccepted<Instance>, HttpError>;

/// Terminate instance
///
/// Immediately halts a running instance by rudely terminating its
/// virtual machine process. The instance may still be observed in the
/// "stopping" state while it is being torn down, but this endpoint does
/// not return until the instance has reached the "stopped" state.
/// This operation can be used to recover an instance that is not
/// responding to requests to stop issued through the instance stop API.
Comment on lines +1173 to +1175
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Putting my mildly-uninformed-user hat on: is there something important I could be missing out on by not transitioning through "stopping"? Resources that could be leaked (seems unlikely), or internal instance state that could get weird (seems more likely)? From what's here it doesn't seem unreasonable for a user to always use /force-terminate on the assumption that it's more like yanking power, and I don't know how much anyone would be disturbed by that. I recognize this is also kind of the ambiguity @gjcolombo was trying to address, sorry :)

Putting my Oxide engineer hat on: it feels like any reason to use /force-terminate is the result of a user trying to unwedge themselves from an Oxide bug. So maybe that's the kind of warning this documentation deserves? Though I'm still not sure how load-bearing "stopping" is.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The instance will still transition through the Stopping state if you are querying it in the meantime, which will be visible in e.g. the console. It's just that the force-terminate API endpoint does not return until the instance has advanced to Stopped.

/// Forcibly terminate an instance.
///
/// NOTE(review): per the PR discussion, the instance is still observable
/// in the "stopping" state while teardown is in progress; this endpoint
/// simply does not return until the instance has reached "stopped".
#[endpoint {
method = POST,
path = "/v1/instances/{instance}/force-terminate",
tags = ["instances"],
}]
async fn instance_force_terminate(
rqctx: RequestContext<Self::Context>,
query_params: Query<params::OptionalProjectSelector>,
path_params: Path<params::InstancePath>,
) -> Result<HttpResponseAccepted<Instance>, HttpError>;

/// Fetch instance serial console
#[endpoint {
method = GET,
Expand Down
304 changes: 253 additions & 51 deletions nexus/src/app/instance.rs

Large diffs are not rendered by default.

67 changes: 28 additions & 39 deletions nexus/src/app/sagas/instance_start.rs
Original file line number Diff line number Diff line change
Expand Up @@ -671,56 +671,45 @@ async fn sis_ensure_registered_undo(
"error" => ?e);

// If the failure came from talking to sled agent, and the error code
// indicates the instance or sled might be unhealthy, manual
// intervention is likely to be needed, so try to mark the instance as
// Failed and then bail on unwinding.
// indicates the VMM has been forgotten by sled-agent, try to mark the
// VMM as Failed and continue unwinding. If we cannot mark it as failed,
// it has presumably just already been unregistered, which is fine.
//
// If sled agent is in good shape but just doesn't know about the
// instance, this saga still owns the instance's state, so allow
// unwinding to continue.
// If we were unable to communicate with the sled agent, then we cannot
// properly clean up this VMM, and manual intervention may be required.
//
// If some other Nexus error occurred, this saga is in bad shape, so
// return an error indicating that intervention is needed without trying
// to modify the instance further.
//
// TODO(#3238): `instance_unhealthy` does not take an especially nuanced
// view of the meanings of the error codes sled agent could return, so
// assuming that an error that isn't `instance_unhealthy` means
// that everything is hunky-dory and it's OK to continue unwinding may
// be a bit of a stretch. See the definition of `instance_unhealthy` for
// more details.
match e {
InstanceStateChangeError::SledAgent(inner) if inner.vmm_gone() => {
error!(osagactx.log(),
"start saga: failing instance after unregister failure";
"instance_id" => %instance_id,
"start_reason" => ?params.reason,
"error" => ?inner);

if let Err(set_failed_error) = osagactx
info!(
osagactx.log(),
"start saga: VMM has either already been unregistered \
or has been forgotten by sled-agent";
"instance_id" => %instance_id,
"start_reason" => ?params.reason,
"error" => %inner
);
osagactx
.nexus()
.mark_vmm_failed(&opctx, authz_instance, &db_vmm, &inner)
.await
{
error!(osagactx.log(),
"start saga: failed to mark instance as failed";
"instance_id" => %instance_id,
"start_reason" => ?params.reason,
"error" => ?set_failed_error);

Err(set_failed_error.into())
} else {
Err(inner.0.into())
}
}
InstanceStateChangeError::SledAgent(_) => {
info!(osagactx.log(),
"start saga: instance already unregistered from sled";
"instance_id" => %instance_id,
"start_reason" => ?params.reason);

.await;
Ok(())
}
InstanceStateChangeError::SledAgent(inner) => {
error!(
osagactx.log(),
"start saga: failed to unregister VMM with sled-agent";
"instance_id" => %instance_id,
"error" => %inner,
"start_reason" => ?params.reason,
);

// TODO(eliza): we should probably retry communication errors a
// few times before giving up on unwinding entirely!
Err(inner.into())
}
InstanceStateChangeError::Other(inner) => {
error!(osagactx.log(),
"start saga: internal error unregistering instance";
Expand Down
30 changes: 30 additions & 0 deletions nexus/src/external_api/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2271,6 +2271,36 @@ impl NexusExternalApi for NexusExternalApiImpl {
.await
}

/// HTTP handler for `POST /v1/instances/{instance}/force-terminate`.
///
/// Resolves the target instance from the path parameter (scoped by the
/// optional project selector in the query string), then asks Nexus to
/// forcibly terminate it. The inner future is instrumented via
/// `external_latencies` so this endpoint's request latency is recorded.
async fn instance_force_terminate(
    rqctx: RequestContext<Self::Context>,
    query_params: Query<params::OptionalProjectSelector>,
    path_params: Path<params::InstancePath>,
) -> Result<HttpResponseAccepted<Instance>, HttpError> {
    let apictx = rqctx.context();
    let nexus = &apictx.context.nexus;
    // Fold the path parameter and the optional project scope into a
    // single selector used for the instance lookup below.
    let selector = params::InstanceSelector {
        project: query_params.into_inner().project,
        instance: path_params.into_inner().instance,
    };
    let handler = async {
        let opctx =
            crate::context::op_context_for_external_api(&rqctx).await?;
        let lookup = nexus.instance_lookup(&opctx, selector)?;
        let instance =
            nexus.instance_force_terminate(&opctx, &lookup).await?;
        // 202 Accepted: the terminate request has been taken; the body
        // carries the instance's externally-visible view.
        Ok(HttpResponseAccepted(instance.into()))
    };
    apictx
        .context
        .external_latencies
        .instrument_dropshot_handler(&rqctx, handler)
        .await
}

async fn instance_serial_console(
rqctx: RequestContext<ApiContext>,
path_params: Path<params::InstancePath>,
Expand Down
Loading