Skip to content
8 changes: 7 additions & 1 deletion nexus/external-api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1166,7 +1166,13 @@ pub trait NexusExternalApi {
path_params: Path<params::InstancePath>,
) -> Result<HttpResponseAccepted<Instance>, HttpError>;

/// Stop instance, violently.
/// Terminate instance
///
/// Immediately halts a running instance by rudely terminating its
/// virtual machine process. This moves the instance directly to the
/// "stopped" state, without transitioning through the "stopping" state.
/// This operation can be used to recover an instance that is not
/// responding to requests to stop issued through the instance stop API.
Comment on lines +1173 to +1175
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

putting my mildly-uninformed-user hat on, is there something important i could be missing out on by not transitioning through "stopping"? resources that could be leaked (seems unlikely), or internal instance state that could get weird (seems more likely)? from what's here it doesn't seem unreasonable for a user to always /force-terminate on the assumption that it's more like yanking power, and i dunno how much anyone would be disturbed by that. i recognize this is also kind of the ambiguity @gjcolombo was trying to address, sorry :)

putting my Oxide engineer hat on, it feels like any reason to use /force-terminate is a result of a user trying to unwedge themselves from an Oxide bug. so maybe that's the kind of warding this documentation deserves? though i'm still not sure how load-bearing stopping is..

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The instance will still transition through the Stopping state if you are querying it in the meantime, which will be visible in e.g. the console. It's just that the force-terminate API endpoint does not return until the instance has advanced to Stopped.

#[endpoint {
method = POST,
path = "/v1/instances/{instance}/force-terminate",
Expand Down
164 changes: 123 additions & 41 deletions nexus/src/app/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -784,31 +784,82 @@ impl super::Nexus {
let (.., authz_instance) =
instance_lookup.lookup_for(authz::Action::Modify).await?;

let state = self
let db::datastore::InstanceGestalt {
instance,
active_vmm,
target_vmm,
migration: _,
} = self
.db_datastore
.instance_fetch_with_vmm(opctx, &authz_instance)
.instance_fetch_all(opctx, &authz_instance)
.await?;

// Is the instance currently incarnated in a VMM process?
let Some(vmm) = state.vmm() else {
// If the instance is already un-incarnated, we're all good.
debug!(
opctx.log,
"asked to force terminate an instance that has no VMM, \
doing nothing";
"instance_id" => %authz_instance.id(),
"instance_state" => ?state.instance.runtime(),
);
debug_assert_eq!(
state.instance.runtime().nexus_state,
db::model::InstanceState::NoVmm,
"if an instance has no active VMM, it must be in the NoVmm \
state (this is enforced by a DB check constraint)",
);
return Ok(state);
// If the instance is currently incarnated by VMM process(es), hunt down
// and destroy them.
let terminated_active = match active_vmm {
Some(vmm) => {
self.instance_force_terminate_vmm(
opctx,
&authz_instance,
vmm,
"active",
)
.await
}
None => {
debug!(
opctx.log,
"asked to force terminate an instance that has no active VMM";
"instance_id" => %authz_instance.id(),
"instance_state" => ?instance.runtime(),
);
Ok(())
}
};
let terminated_target = match target_vmm {
Some(vmm) => {
self.instance_force_terminate_vmm(
opctx,
&authz_instance,
vmm,
"target",
)
.await
}
None => Ok(()),
};

// If our attempt to terminate either VMM failed, bail --- but only
// after both futures completed.
terminated_active?;
terminated_target?;

// Ladies and gentlemen, we got him!
self.db_datastore
.instance_fetch_with_vmm(opctx, &authz_instance)
.await
.map_err(Into::into)
}

/// Forcefully terminate a VMM associated with an instance (by calling
/// [`Self::instance_ensure_unregistered`]), and then update the instance's
/// state to reflect that the VMM has been unregistered.
///
/// # Arguments
///
/// - `opctx`: the [`OpContext`] for this action
/// - `authz_instance`: the instance associated with the VMM, so that the
/// instance can be updated to reflect the new VMM state.
/// - `vmm`: the VMM to forcefully terminate
/// - `vmm_role`: a string ("active" or "target") for logging which VMM is
/// being terminated.
async fn instance_force_terminate_vmm(
&self,
opctx: &OpContext,
authz_instance: &authz::Instance,
vmm: db::model::Vmm,
vmm_role: &str,
) -> Result<(), InstanceStateChangeError> {
let propolis_id = PropolisUuid::from_untyped_uuid(vmm.id);
let sled_id = SledUuid::from_untyped_uuid(vmm.sled_id);
let unregister_result =
Expand All @@ -818,35 +869,70 @@ impl super::Nexus {
Ok(Some(state)) => {
info!(
opctx.log,
"instance terminated with extreme prejudice";
"instance's {vmm_role} VMM terminated with extreme \
prejudice";
"instance_id" => %authz_instance.id(),
"vmm_id" => %propolis_id,
"sled_id" => %sled_id,
);

self.notify_vmm_updated(opctx, propolis_id, &state).await?;
// We would like the caller to see the instance they are
// attempting to terminate go to "Stopped", so run the
// instance-update saga synchronously if possible. This is
// particularly important in the case where a migrating instance
// is force-terminated, as an instance with a migration ID will
// remain "Migrating" (not "Stopping") until its migration ID is
// unset, and it seems a bit sad to return a "Migrating"
// instance to a caller who tries to force-kill it.
//
// TODO(eliza): in the case where we are terminating both an
// active VMM and migration target, we will unregister them in
// sequence, running separate update sagas for both VMMs being
// terminated. It would be a bit more efficient to terminate
// both VMMs, update CRDB, and then run a *single* update saga
// to process both VMMs being destroyed. However, this requires
// a bit of annoying refactoring to existing APIs, and I'm not
// sure if improving the `instance-force-terminate` endpoint's
// latency is particularly important...
if let Some((_, saga)) = process_vmm_update(
&self.db_datastore,
opctx,
propolis_id,
&state,
)
.await?
{
self.sagas
.saga_prepare(saga)
.await?
.start()
.await?
.wait_until_stopped()
.await
.into_omicron_result()?;
}
}
// If the returned state from sled-agent is `None`, then the instance
// was already unregistered. This may have been from a prior
// instance-ensure-unregistered call, but, since we observed an
// active VMM above, the current observed VMM generation doesn't
// know that the VMM is gone, so it is possible that the sled-agent
// has misplaced this instance. Therefore, we will attempt to mark
// the VMM as `Failed` at the generation after which we observed the
// VMM. This is safe to do here, because if the instance has been
// unregistered due to a race with another
// instance-ensure-unregistered request (rather than a sled-agent
// failure), that other call will have advanced the state
// generation, and our attempt to write the failed state will not
// succeed, which is fine.
// If the returned state from sled-agent is `None`, then the
// instance was already unregistered. This may have been from a
// prior attempt to stop the instance (either normally or
// forcefully). But, since we observed an active VMM above, the
// current observed VMM generation doesn't know that the VMM is
// gone, so it is possible that the sled-agent has misplaced this
// instance. Therefore, we will attempt to mark the VMM as `Failed`
// at the generation after which we observed the VMM. This is safe
// to do here, because if the instance has been unregistered due to
// a race with another instance-ensure-unregistered request (rather
// than a sled-agent failure), that other call will have advanced
// the state generation, and our attempt to write the failed state
// will not succeed, which is fine.
//
// Either way, the caller should not observe a returned instance
// state that believes itself to be running.
Ok(None) => {
info!(
opctx.log,
"asked to force terminate an instance that was already \
unregistered";
"asked to force terminate an instance's {vmm_role} VMM \
that was already unregistered";
"instance_id" => %authz_instance.id(),
"vmm_id" => %propolis_id,
"sled_id" => %sled_id,
Expand All @@ -869,11 +955,7 @@ impl super::Nexus {
}
Err(e) => return Err(e),
}

self.db_datastore
.instance_fetch_with_vmm(opctx, &authz_instance)
.await
.map_err(Into::into)
Ok(())
}

/// Idempotently ensures that the sled specified in `db_instance` does not
Expand Down
Loading