Skip to content
1 change: 1 addition & 0 deletions nexus/external-api/output/nexus_tags.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ instance_disk_list GET /v1/instances/{instance}/disks
instance_ephemeral_ip_attach POST /v1/instances/{instance}/external-ips/ephemeral
instance_ephemeral_ip_detach DELETE /v1/instances/{instance}/external-ips/ephemeral
instance_external_ip_list GET /v1/instances/{instance}/external-ips
instance_force_terminate POST /v1/instances/{instance}/force-terminate
instance_list GET /v1/instances
instance_network_interface_create POST /v1/network-interfaces
instance_network_interface_delete DELETE /v1/network-interfaces/{interface}
Expand Down
18 changes: 18 additions & 0 deletions nexus/external-api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1166,6 +1166,24 @@ pub trait NexusExternalApi {
path_params: Path<params::InstancePath>,
) -> Result<HttpResponseAccepted<Instance>, HttpError>;

/// Terminate instance
///
/// Immediately halts a running instance by rudely terminating its
/// virtual machine process. This immediately moves the instance to the
/// "stopped" state without transitioning through the "stopping" state.
/// This operation can be used to recover an instance that is not
/// responding to requests to stop issued through the instance stop API.
Comment on lines +1173 to +1175
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

putting my mildly-uninformed-user hat on, is there something important i could be missing out on by not transitioning through "stopping"? resources that could be leaked (seems unlikely), or internal instance state that could get weird (seems more likely)? from what's here it doesn't seem unreasonable for a user to always /force-terminate on the assumption that it's more like yanking power, and i dunno how much anyone would be disturbed by that. i recognize this is also kind of the ambiguity @gjcolombo was trying to address, sorry :)

putting my Oxide engineer hat on, it feels like any reason to use /force-terminate is a result of a user trying to unwedge themselves from an Oxide bug. so maybe that's the kind of warding this documentation deserves? though i'm still not sure how load-bearing stopping is..

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The instance will still transition through the Stopping state if you are querying it in the meantime, which will be visible in e.g. the console. It's just that the force-terminate API endpoint does not return until the instance has advanced to Stopped.

#[endpoint {
method = POST,
path = "/v1/instances/{instance}/force-terminate",
tags = ["instances"],
}]
async fn instance_force_terminate(
rqctx: RequestContext<Self::Context>,
query_params: Query<params::OptionalProjectSelector>,
path_params: Path<params::InstancePath>,
) -> Result<HttpResponseAccepted<Instance>, HttpError>;

/// Fetch instance serial console
#[endpoint {
method = GET,
Expand Down
197 changes: 192 additions & 5 deletions nexus/src/app/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -774,6 +774,190 @@ impl super::Nexus {
.map_err(Into::into)
}

/// Forcefully stop a running instance, causing its sled-agent to rudely
/// terminate its VMM process and unregister the instance.
///
/// Both the active VMM and, if a migration is in flight, the target VMM
/// are torn down. A failure tearing down one VMM does not skip the
/// other; errors are deferred until both teardowns have been attempted.
/// On success, the instance's freshly re-fetched state is returned.
pub(crate) async fn instance_force_terminate(
    &self,
    opctx: &OpContext,
    instance_lookup: &lookup::Instance<'_>,
) -> Result<InstanceAndActiveVmm, InstanceStateChangeError> {
    let (.., authz_instance) =
        instance_lookup.lookup_for(authz::Action::Modify).await?;

    // Snapshot the instance record and any VMMs currently incarnating it.
    let db::datastore::InstanceGestalt {
        instance,
        active_vmm,
        target_vmm,
        migration: _,
    } = self
        .db_datastore
        .instance_fetch_all(opctx, &authz_instance)
        .await?;

    // Tear down the active VMM first, if one exists. Note that the two
    // teardowns below run one after the other, not concurrently.
    let active_result = if let Some(vmm) = active_vmm {
        self.instance_force_terminate_vmm(
            opctx,
            &authz_instance,
            vmm,
            "active",
        )
        .await
    } else {
        debug!(
            opctx.log,
            "asked to force terminate an instance that has no active VMM";
            "instance_id" => %authz_instance.id(),
            "instance_state" => ?instance.runtime(),
        );
        Ok(())
    };

    // Then hunt down a migration-target VMM, should one exist.
    let target_result = if let Some(vmm) = target_vmm {
        self.instance_force_terminate_vmm(
            opctx,
            &authz_instance,
            vmm,
            "target",
        )
        .await
    } else {
        Ok(())
    };

    // Only now surface a teardown failure, so that an error terminating
    // the active VMM did not prevent the target-VMM attempt above.
    active_result?;
    target_result?;

    // Ladies and gentlemen, we got him! Return the instance's current
    // state, re-read from the datastore.
    self.db_datastore
        .instance_fetch_with_vmm(opctx, &authz_instance)
        .await
        .map_err(Into::into)
}

/// Forcefully terminate a VMM associated with an instance (by calling
/// [`Self::instance_ensure_unregistered`]), and then update the instance's
/// state to reflect that the VMM has been unregistered.
///
/// # Arguments
///
/// - `opctx`: the [`OpContext`] for this action
/// - `authz_instance`: the instance associated with the VMM, so that the
/// instance can be updated to reflect the new VMM state.
/// - `vmm`: the VMM to forcefully terminate
/// - `vmm_role`: a string ("active" or "target") for logging which VMM is
/// being terminated.
async fn instance_force_terminate_vmm(
&self,
opctx: &OpContext,
authz_instance: &authz::Instance,
vmm: db::model::Vmm,
vmm_role: &str,
) -> Result<(), InstanceStateChangeError> {
let propolis_id = PropolisUuid::from_untyped_uuid(vmm.id);
let sled_id = SledUuid::from_untyped_uuid(vmm.sled_id);
let unregister_result =
self.instance_ensure_unregistered(&propolis_id, &sled_id).await;
Comment on lines +877 to +878
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the start saga clean up properly if this happens between its ensure_registered and ensure_running steps? I think it works out: sis_ensure_running will fail; sis_ensure_registered_undo will also fail and try to move the VMM to Failed, but this doesn't block the saga from continuing to unwind; then I think we'll end up in SagaUnwound and end up with a VM that can be started again. Does that sound about right? If so, it might be worthwhile to add a comment mentioning this case.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think so, but I would like to figure out whether it's possible to test this...

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, upon further inspection, I believe your assessment is correct and this is fine.

I think that sis_ensure_registered_undo failing to move the VMM to Failed will get the saga stuck:

match e {
InstanceStateChangeError::SledAgent(inner) if inner.vmm_gone() => {
error!(osagactx.log(),
"start saga: failing instance after unregister failure";
"instance_id" => %instance_id,
"start_reason" => ?params.reason,
"error" => ?inner);
if let Err(set_failed_error) = osagactx
.nexus()
.mark_vmm_failed(&opctx, authz_instance, &db_vmm, &inner)
.await
{
error!(osagactx.log(),
"start saga: failed to mark instance as failed";
"instance_id" => %instance_id,
"start_reason" => ?params.reason,
"error" => ?set_failed_error);
Err(set_failed_error.into())

However, mark_vmm_failed won't actually fail here if the VMM's state generation is stale, it will just scream about it:

// XXX: It's not clear what to do with this error; should it be
// bubbled back up to the caller?
Err(e) => error!(self.log,
"failed to write Failed instance state to DB";
"instance_id" => %instance_id,
"vmm_id" => %vmm_id,
"error" => ?e),

So, I think we're in the clear here. But, I feel a bit uncomfortable about this, because it seems like a change to the mark_vmm_failed error path to actually return an error could introduce a bug here, and I'm not immediately sure if there's an obvious way to write a regression test that would fail on such a change. Any ideas?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we do something with error types instead? That is: have mark_vmm_failed return its own error enum with "DB query failed" and "update too old" variants, and have sis_ensure_registered_undo match on specific error codes from this callee, such that a new failure mode will break the match. WDYT? It'd be nice to have an integration test, too, but I'm similarly having trouble figuring out how to inject a failure into this specific call, since we don't have a CRDB test double that I know of.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, actually, the write to CRDB in mark_vmm_failed is done by a call to vmm_update_runtime, which returns Ok(false) if the VMM exists but wasn't updated (e.g. if the generation has advanced). So, we don't even hit the error path (which gets ignored anyway) in mark_vmm_failed in that case...

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Which, now that I look at it, indicates a tiny possible inefficiency in mark_vmm_failed --- currently, we try to run an update saga even if the VMM did not move to failed (because it was already failed/destroyed), which means we will probably start spurious update sagas there. I'll clean that up.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I'm probably fine with having mark_vmm_failed stick with the convention of returning Ok(false) if the generation has changed, instead of an error variant, since it seems like we generally follow that convention for similar code. On one hand, I do have a sort of ideological preference for an Ok(_) to always mean "yes, we actually wrote the desired update to CRDB", but on the other hand, a lot of saga undo actions probably just ? these functions, and would probably prefer to get an Ok in any case that doesn't mean they should unwind. I dunno.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mark_vmm_failed actually has the type signature of async fn(...) -> Result<(), Error> but there are actually no conditions in which it will ever currently return an error. That's cool.

I might just change the type signature to not return Result

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Upon further perusal of sis_ensure_registered_undo, I noticed that it was actually handling sled-agent unregister errors in a pretty outdated way: it was treating vmm_gone errors as a probable failure, and just ignoring any other errors communicating with the sled-agent, as described in this comment:

// If the failure came from talking to sled agent, and the error code
// indicates the instance or sled might be unhealthy, manual
// intervention is likely to be needed, so try to mark the instance as
// Failed and then bail on unwinding.
//
// If sled agent is in good shape but just doesn't know about the
// instance, this saga still owns the instance's state, so allow
// unwinding to continue.
//
// If some other Nexus error occurred, this saga is in bad shape, so
// return an error indicating that intervention is needed without trying
// to modify the instance further.
//
// TODO(#3238): `instance_unhealthy` does not take an especially nuanced
// view of the meanings of the error codes sled agent could return, so
// assuming that an error that isn't `instance_unhealthy` means
// that everything is hunky-dory and it's OK to continue unwinding may
// be a bit of a stretch. See the definition of `instance_unhealthy` for
// more details.

This predates the changes to vmm_gone/instance_unhealthy semantics from RFD 486/#6455, and isn't quite correct. I've changed it to make the "other error from sled agent" be the case that fails the saga, although we should probably retry communication errors eventually.

The changes to mark_vmm_failed and sis_ensure_registered_undo are in a923f2a. Arguably, it's not really in scope for this PR at that point; I'd be happy to cherry-pick it out to a separate branch if you think that's worth doing?

match unregister_result {
// VMM unregistered, now process the state transition.
Ok(Some(state)) => {
info!(
opctx.log,
"instance's {vmm_role} VMM terminated with extreme \
prejudice";
"instance_id" => %authz_instance.id(),
"vmm_id" => %propolis_id,
"sled_id" => %sled_id,
);

// We would like the caller to see the instance they are
// attempting to terminate go to "Stopped", so run the
// instance-update saga synchronously if possible. This is
// particularly important in the case where a migrating instance
// is force-terminated, as an instance with a migration ID will
// remain "Migrating" (not "Stopping") until its migration ID is
// unset, and it seems a bit sad to return a "Migrating"
// instance to a caller who tries to force-kill it.
//
// TODO(eliza): in the case where we are terminating both an
// active VMM and migration target, we will unregister them in
// sequence, running separate update sagas for both VMMs being
// terminated. It would be a bit more efficient to terminate
// both VMMs, update CRDB, and then run a *single* update saga
// to process both VMMs being destroyed. However, this requires
// a bit of annoying refactoring to existing APIs, and I'm not
// sure if improving the `instance-force-terminate` endpoint's
// latency is particularly important...
if let Some((_, saga)) = process_vmm_update(
&self.db_datastore,
opctx,
propolis_id,
&state,
)
.await?
{
self.sagas
.saga_prepare(saga)
.await?
.start()
.await?
.wait_until_stopped()
.await
.into_omicron_result()?;
}
}
// If the returned state from sled-agent is `None`, then the
// instance was already unregistered. This may have been from a
// prior attempt to stop the instance (either normally or
// forcefully). But, since we observed an active VMM above, the
// current observed VMM generation doesn't know that the VMM is
// gone, so it is possible that the sled-agent has misplaced this
// instance. Therefore, we will attempt to mark the VMM as `Failed`
// at the generation after which we observed the VMM. This is safe
// to do here, because if the instance has been unregistered due to
// a race with another instance-ensure-unregistered request (rather
// than a sled-agent failure), that other call will have advanced
// the state generation, and our attempt to write the failed state
// will not succeed, which is fine.
//
// Either way, the caller should not observe a returned instance
// state that believes itself to be running.
Ok(None) => {
info!(
opctx.log,
"asked to force terminate an instance's {vmm_role} VMM that \
 was already unregistered";
"instance_id" => %authz_instance.id(),
"vmm_id" => %propolis_id,
"sled_id" => %sled_id,
);
let _ = self
.mark_vmm_failed(
&opctx,
authz_instance.clone(),
&vmm,
&"instance already unregistered",
)
.await;
}
// If the error indicates that the VMM wasn't there to terminate,
// mark it as Failed instead.
Err(InstanceStateChangeError::SledAgent(e)) if e.vmm_gone() => {
let _ = self
.mark_vmm_failed(&opctx, authz_instance.clone(), &vmm, &e)
.await;
}
Err(e) => return Err(e),
}
Ok(())
}

/// Idempotently ensures that the sled specified in `db_instance` does not
/// have a record of the instance. If the instance is currently running on
/// this sled, this operation rudely terminates it.
Expand Down Expand Up @@ -1407,19 +1591,22 @@ impl super::Nexus {
/// execute, as this may just mean that another saga is already updating the
/// instance. The update will be performed eventually even if this method
/// could not update the instance.
pub(crate) async fn mark_vmm_failed(
pub(crate) async fn mark_vmm_failed<R>(
&self,
opctx: &OpContext,
authz_instance: authz::Instance,
vmm: &db::model::Vmm,
reason: &SledAgentInstanceError,
) -> Result<(), Error> {
reason: &R,
) -> Result<(), Error>
where
R: std::fmt::Display,
{
let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id());
let vmm_id = PropolisUuid::from_untyped_uuid(vmm.id);
error!(self.log, "marking VMM failed due to sled agent API error";
"instance_id" => %instance_id,
"vmm_id" => %vmm_id,
"error" => ?reason);
"error" => %reason);

let new_runtime = VmmRuntimeState {
state: db::model::VmmState::Failed,
Expand All @@ -1433,7 +1620,7 @@ impl super::Nexus {
info!(self.log, "marked VMM as Failed, preparing update saga";
"instance_id" => %instance_id,
"vmm_id" => %vmm_id,
"reason" => ?reason,
"reason" => %reason,
);
let saga = instance_update::SagaInstanceUpdate::prepare(
&instance_update::Params {
Expand Down
30 changes: 30 additions & 0 deletions nexus/src/external_api/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2271,6 +2271,36 @@ impl NexusExternalApi for NexusExternalApiImpl {
.await
}

// External API entrypoint for POST /v1/instances/{instance}/force-terminate:
// forcefully terminates an instance's VMM process(es) and returns the
// resulting instance state.
async fn instance_force_terminate(
    rqctx: RequestContext<Self::Context>,
    query_params: Query<params::OptionalProjectSelector>,
    path_params: Path<params::InstancePath>,
) -> Result<HttpResponseAccepted<Instance>, HttpError> {
    let apictx = rqctx.context();
    let nexus = &apictx.context.nexus;
    // Fold the path and query parameters into a single selector
    // identifying the instance to terminate.
    let instance_selector = params::InstanceSelector {
        project: query_params.into_inner().project,
        instance: path_params.into_inner().instance,
    };
    let handler = async {
        let opctx =
            crate::context::op_context_for_external_api(&rqctx).await?;
        let instance_lookup =
            nexus.instance_lookup(&opctx, instance_selector)?;
        let terminated = nexus
            .instance_force_terminate(&opctx, &instance_lookup)
            .await?;
        Ok(HttpResponseAccepted(terminated.into()))
    };
    // Run the handler under the latency-instrumentation wrapper so this
    // endpoint is recorded in the external API metrics.
    apictx
        .context
        .external_latencies
        .instrument_dropshot_handler(&rqctx, handler)
        .await
}

async fn instance_serial_console(
rqctx: RequestContext<ApiContext>,
path_params: Path<params::InstancePath>,
Expand Down
Loading