From d70e41f9b05ed0d3b99bccca9dbfd089b5d1240a Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 12 Aug 2025 15:13:03 -0700 Subject: [PATCH 01/22] Create db_metadata_nexus_state table --- nexus/db-model/src/db_metadata.rs | 50 + nexus/db-model/src/schema_versions.rs | 6 +- .../src/db/datastore/db_metadata.rs | 1232 ++++++++++++++++- nexus/db-queries/src/db/datastore/mod.rs | 69 +- nexus/db-queries/src/db/datastore/rack.rs | 10 + nexus/db-queries/src/db/pub_test_utils/mod.rs | 7 +- nexus/db-schema/src/enums.rs | 1 + nexus/db-schema/src/schema.rs | 8 + nexus/src/app/mod.rs | 2 + nexus/src/bin/schema-updater.rs | 26 +- nexus/src/populate.rs | 4 +- .../tests/integration_tests/initialization.rs | 2 +- nexus/tests/integration_tests/schema.rs | 147 +- schema/crdb/dbinit.sql | 81 +- .../crdb/populate-db-metadata-nexus/up01.sql | 6 + .../crdb/populate-db-metadata-nexus/up02.sql | 11 + .../crdb/populate-db-metadata-nexus/up03.sql | 3 + .../crdb/populate-db-metadata-nexus/up04.sql | 15 + 18 files changed, 1602 insertions(+), 78 deletions(-) create mode 100644 schema/crdb/populate-db-metadata-nexus/up01.sql create mode 100644 schema/crdb/populate-db-metadata-nexus/up02.sql create mode 100644 schema/crdb/populate-db-metadata-nexus/up03.sql create mode 100644 schema/crdb/populate-db-metadata-nexus/up04.sql diff --git a/nexus/db-model/src/db_metadata.rs b/nexus/db-model/src/db_metadata.rs index de7e2862eb7..8f15ce02ae7 100644 --- a/nexus/db-model/src/db_metadata.rs +++ b/nexus/db-model/src/db_metadata.rs @@ -3,8 +3,14 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use crate::SemverVersion; +use crate::impl_enum_type; +use crate::typed_uuid::DbTypedUuid; use chrono::{DateTime, Utc}; use nexus_db_schema::schema::db_metadata; +use nexus_db_schema::schema::db_metadata_nexus; +use omicron_uuid_kinds::{ + BlueprintKind, BlueprintUuid, OmicronZoneKind, OmicronZoneUuid, +}; use serde::{Deserialize, Serialize}; /// Internal database metadata @@ -33,3 +39,47 @@ impl DbMetadata { &self.version } } + +impl_enum_type!( + DbMetadataNexusStateEnum: + + #[derive(Clone, Copy, Debug, AsExpression, FromSqlRow, PartialEq, Serialize, Deserialize)] + pub enum DbMetadataNexusState; + + // Enum values + Active => b"active" + NotYet => b"not_yet" + Inactive => b"inactive" +); + +#[derive( + Queryable, Insertable, Debug, Clone, Selectable, Serialize, Deserialize, +)] +#[diesel(table_name = db_metadata_nexus)] +pub struct DbMetadataNexus { + nexus_id: DbTypedUuid, + last_drained_blueprint_id: Option>, + state: DbMetadataNexusState, +} + +impl DbMetadataNexus { + pub fn new(nexus_id: OmicronZoneUuid, state: DbMetadataNexusState) -> Self { + Self { + nexus_id: nexus_id.into(), + last_drained_blueprint_id: None, + state, + } + } + + pub fn state(&self) -> DbMetadataNexusState { + self.state + } + + pub fn nexus_id(&self) -> OmicronZoneUuid { + self.nexus_id.into() + } + + pub fn last_drained_blueprint_id(&self) -> Option { + self.last_drained_blueprint_id.map(|id| id.into()) + } +} diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 42c1755e13a..a5525b174ce 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(181, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(182, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(182, "populate-db-metadata-nexus"), KnownVersion::new(181, "rename-nat-table"), KnownVersion::new(180, "sled-cpu-family"), KnownVersion::new(179, "add-pending-mgs-updates-host-phase-1"), @@ -225,6 +226,9 @@ static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { /// The earliest supported schema version. pub const EARLIEST_SUPPORTED_VERSION: Version = Version::new(1, 0, 0); +/// The version where "db_metadata_nexus" was added. +pub const DB_METADATA_NEXUS_SCHEMA_VERSION: Version = Version::new(182, 0, 0); + /// Describes one version of the database schema #[derive(Debug, Clone)] struct KnownVersion { diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index dbc1de58571..d94c1ac161e 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -4,22 +4,63 @@ //! [`DataStore`] methods on Database Metadata. -use super::DataStore; +use super::{DataStore, DbConnection}; use anyhow::{Context, bail, ensure}; use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; use chrono::Utc; use diesel::prelude::*; +use diesel::upsert::excluded; use nexus_db_errors::ErrorHandler; +use nexus_db_errors::OptionalError; use nexus_db_errors::public_error_from_diesel; use nexus_db_model::AllSchemaVersions; +use nexus_db_model::DB_METADATA_NEXUS_SCHEMA_VERSION; +use nexus_db_model::DbMetadataNexus; +use nexus_db_model::DbMetadataNexusState; use nexus_db_model::EARLIEST_SUPPORTED_VERSION; use nexus_db_model::SchemaUpgradeStep; use nexus_db_model::SchemaVersion; use omicron_common::api::external::Error; +use omicron_uuid_kinds::{GenericUuid, OmicronZoneUuid}; use semver::Version; use slog::{Logger, error, info, o}; use std::ops::Bound; use std::str::FromStr; +use uuid::Uuid; + +/// Errors that can occur during handoff operations +#[derive(Debug, thiserror::Error)] +pub enum HandoffError { + #[error( + "Cannot perform handoff: \ + {active_count} Nexus instance(s) are still active. \ + All instances must be inactive or not_yet before handoff can proceed." + )] + ActiveNexusInstancesExist { active_count: u32 }, + + #[error( + "Cannot perform handoff: \ + Nexus {nexus_id} does not have a record in db_metadata_nexus table. \ + This Nexus must be registered before it can become active." + )] + NexusNotRegistered { nexus_id: OmicronZoneUuid }, + + #[error( + "Cannot perform handoff: \ + Nexus {nexus_id} is in state {current_state:?}. \ + Must be in 'not_yet' state to become active." + )] + NexusInWrongState { + nexus_id: OmicronZoneUuid, + current_state: DbMetadataNexusState, + }, +} + +impl From for Error { + fn from(err: HandoffError) -> Self { + Error::internal_error(&err.to_string()) + } +} // A SchemaVersion which uses a pre-release value to indicate // "incremental progress". @@ -99,33 +140,314 @@ fn skippable_version( return false; } +/// Describes the state of the schema with respect this Nexus +#[derive(Debug, Copy, Clone, PartialEq)] +pub enum NexusAccess { + /// Nexus does not yet have access to the database. + DoesNotHaveAccessYet, + + /// Nexus has been explicitly locked out of the database. + LockedOut, + + /// Nexus should have normal access to the database + /// + /// We have a record of this Nexus, and it should have access. + HasExplicitAccess, + + /// Nexus should have normal access to the database + /// + /// We may or may not have a record of this Nexus, but it should have + /// access. + HasImplicitAccess, +} + +/// Describes the state of the schema with respect this Nexus +#[derive(Debug, Copy, Clone, PartialEq)] +enum SchemaStatus { + /// The database schema matches what we want + UpToDate, + + /// The database schema is newer than what we want + NewerThanDesired, + + /// The database schema is older than what we want + OlderThanDesired, + + /// The database schema is older than what we want, and it's + /// so old, it does not know about the "db_metadata_nexus" table. + /// + /// We should avoid accessing the "db_metadata_nexus" tables to check + /// access, because the schema for these tables may not exist. + /// + /// TODO: This may be removed, once we're confident deployed systems + /// have upgraded past DB_METADATA_NEXUS_SCHEMA_VERSION. + OlderThanDesiredSkipAccessCheck, +} + +/// Describes how a consumer may want to react to schema and access +#[derive(Debug, Copy, Clone, PartialEq)] +pub enum ConsumerPolicy { + /// Fail immediately on any schema mismatch + FailOnMismatch, + + /// Update the schema if it is safe to do so + Update, +} + +/// Describes what should be done with a schema +#[derive(Debug, Copy, Clone, PartialEq)] +pub enum SchemaAction { + /// Normal operation: The database is ready for usage + Ready, + + /// Not ready for usage yet + /// + /// The database may be ready for usage once handoff has completed. + WaitForHandoff, + + /// Start a schema update + Update, + + /// Refuse to use the database + Refuse, +} + +/// Committment that the database is willing to perform a [SchemaAction] +/// to a desired schema [Version]. +/// +/// Can be created through [DataStore::check_schema_and_access] +#[derive(Clone)] +pub struct ValidatedSchemaAction { + action: SchemaAction, + desired: Version, +} + +impl ValidatedSchemaAction { + pub fn action(&self) -> &SchemaAction { + &self.action + } + + pub fn desired_version(&self) -> &Version { + &self.desired + } +} + +impl SchemaAction { + // Interprets the combination of access, status, and policy to decide + // what action should be taken. + fn new( + access: NexusAccess, + status: SchemaStatus, + policy: ConsumerPolicy, + ) -> Self { + use ConsumerPolicy::*; + use NexusAccess::*; + use SchemaStatus::*; + + match (access, status, policy) { + // Nexus has been explicitly locked-out of using the database + (LockedOut, _, _) => Self::Refuse, + + // The schema updated beyond what we want, do not use it. + (_, NewerThanDesired, _) => Self::Refuse, + + // If we don't have access yet, but could do something once handoff + // occurs, then wait (or refuse if ConsumerPolicy says to fail fast) + ( + DoesNotHaveAccessYet, + UpToDate | OlderThanDesired | OlderThanDesiredSkipAccessCheck, + FailOnMismatch, + ) => Self::Refuse, + ( + DoesNotHaveAccessYet, + UpToDate | OlderThanDesired | OlderThanDesiredSkipAccessCheck, + Update, + ) => Self::WaitForHandoff, + + // This is the most "normal" case: Nexus should have access to the + // database, and the schema matches what it wants. + (HasExplicitAccess | HasImplicitAccess, UpToDate, _) => Self::Ready, + + // If this Nexus is allowed to access the schema, but it looks + // older than what we expect, we'll need to update the schema to + // use it. Do this if and only if ConsumerPolicy allows it. + ( + HasExplicitAccess | HasImplicitAccess, + OlderThanDesired | OlderThanDesiredSkipAccessCheck, + FailOnMismatch, + ) => Self::Refuse, + ( + HasExplicitAccess | HasImplicitAccess, + OlderThanDesired | OlderThanDesiredSkipAccessCheck, + Update, + ) => Self::Update, + } + } +} + impl DataStore { - // Ensures that the database schema matches "desired_version". - // - // - Updating the schema makes the database incompatible with older - // versions of Nexus, which are not running "desired_version". - // - This is a one-way operation that cannot be undone. - // - The caller is responsible for ensuring that the new version is valid, - // and that all running Nexus instances can understand the new schema - // version. - // - // TODO: This function assumes that all concurrently executing Nexus - // instances on the rack are operating on the same version of software. - // If that assumption is broken, nothing would stop a "new deployment" - // from making a change that invalidates the queries used by an "old - // deployment". - pub async fn ensure_schema( + // Checks if the specified Nexus has access to the database. + async fn check_nexus_access( &self, - log: &Logger, + nexus_id: OmicronZoneUuid, + ) -> Result { + // Check if any "db_metadata_nexus" rows exist. + // If they don't exist, treat the database as having access. + // + // This handles the case for: + // - Fresh deployments where RSS hasn't populated the table yet (we need + // access to finish "rack_initialization"). + // - Systems that haven't been migrated to include nexus access control + // (we need access to the database to backfill these records). + // + // After initialization/migration, this conditional should never trigger + // again. + let any_records_exist = self.database_nexus_access_any_exist().await?; + if !any_records_exist { + warn!( + &self.log, + "No db_metadata_nexus records exist - skipping access check"; + "nexus_id" => ?nexus_id, + "explanation" => "This is expected during initial deployment or before migration" + ); + return Ok(NexusAccess::HasImplicitAccess); + } + + // Records exist, so enforce the access control check + let Some(state) = + self.database_nexus_access(nexus_id).await?.map(|s| s.state()) + else { + let msg = "Nexus does not have access to the database (no db_metadata_nexus record)"; + warn!(&self.log, "{msg}"; "nexus_id" => ?nexus_id); + return Ok(NexusAccess::DoesNotHaveAccessYet); + }; + + let status = match state { + DbMetadataNexusState::Active => { + info!(&self.log, "Nexus has access to the database"; "nexus_id" => ?nexus_id); + NexusAccess::HasExplicitAccess + } + DbMetadataNexusState::NotYet => { + info!(&self.log, "Nexus does not yet have access to the database"; "nexus_id" => ?nexus_id); + NexusAccess::DoesNotHaveAccessYet + } + DbMetadataNexusState::Inactive => { + let msg = "Nexus locked out of database access (inactive)"; + error!(&self.log, "{msg}"; "nexus_id" => ?nexus_id); + NexusAccess::LockedOut + } + }; + Ok(status) + } + + // Checks the schema against a desired version. + async fn check_schema( + &self, + desired_version: Version, + ) -> Result { + let (found_version, _found_target_version) = self + .database_schema_version() + .await + .context("Cannot read database schema version")?; + + let log = self.log.new(o!( + "found_version" => found_version.to_string(), + "desired_version" => desired_version.to_string(), + )); + + use std::cmp::Ordering; + match found_version.cmp(&desired_version) { + Ordering::Less => { + warn!(log, "Found schema version is older than desired"); + if found_version < DB_METADATA_NEXUS_SCHEMA_VERSION { + Ok(SchemaStatus::OlderThanDesiredSkipAccessCheck) + } else { + Ok(SchemaStatus::OlderThanDesired) + } + } + Ordering::Equal => { + info!(log, "Database schema version is up to date"); + Ok(SchemaStatus::UpToDate) + } + Ordering::Greater => { + error!(log, "Found schema version is newer than desired"); + Ok(SchemaStatus::NewerThanDesired) + } + } + } + + /// Compares the state of the schema with the expectations of the + /// currently running Nexus. + /// + /// - `nexus_id`: An optional UUID of the currently-running Nexus. + /// If used, the db_metadata_nexus table will be checked to confirm + /// this Nexus has access to the database. + /// - `desired_version`: The version of the database schema this + /// Nexus wants. + /// - `policy`: Whether or not we should fail or attempt to update + /// on a schema mismatch + pub async fn check_schema_and_access( + &self, + nexus_id: Option, desired_version: Version, + policy: ConsumerPolicy, + ) -> Result { + let schema_status = self.check_schema(desired_version.clone()).await?; + + let nexus_access = if let Some(nexus_id) = nexus_id { + match schema_status { + // If we don't think the "db_metadata_nexus" tables exist in the + // schema yet, treat them as implicitly having access. + // + // TODO: This may be removed, once we're confident deployed systems + // have upgraded past DB_METADATA_NEXUS_SCHEMA_VERSION. + SchemaStatus::OlderThanDesiredSkipAccessCheck => { + NexusAccess::HasImplicitAccess + } + _ => self.check_nexus_access(nexus_id).await?, + } + } else { + // If a "nexus_id" was not supplied, skip the check, and treat it + // as having access. + // + // This is necessary for tools which access the schema without a + // running Nexus, such as the schema-updater binary. + NexusAccess::HasImplicitAccess + }; + + Ok(ValidatedSchemaAction { + action: SchemaAction::new(nexus_access, schema_status, policy), + desired: desired_version, + }) + } + + /// Ensures that the database schema matches `desired_version`. + /// + /// - `validated_action`: A [ValidatedSchemaAction], indicating that + /// [Self::check_schema_and_access] has already been called. + /// - `all_versions`: A description of all schema versions between + /// "whatever is in the DB" and `desired_version`, instructing + /// how to perform an update. + pub async fn update_schema( + &self, + validated_action: ValidatedSchemaAction, all_versions: Option<&AllSchemaVersions>, ) -> Result<(), anyhow::Error> { + let action = validated_action.action(); + + match action { + SchemaAction::Ready => bail!("No schema update is necessary"), + SchemaAction::Update => (), + _ => bail!("Not ready for schema update"), + } + + let desired_version = validated_action.desired_version().clone(); let (found_version, found_target_version) = self .database_schema_version() .await .context("Cannot read database schema version")?; - let log = log.new(o!( + let log = self.log.new(o!( "found_version" => found_version.to_string(), "desired_version" => desired_version.to_string(), )); @@ -340,6 +662,252 @@ impl DataStore { Ok(()) } + /// Returns the access this Nexus has to the database + pub async fn database_nexus_access( + &self, + nexus_id: OmicronZoneUuid, + ) -> Result, Error> { + use nexus_db_schema::schema::db_metadata_nexus::dsl; + + let nexus_access: Option = dsl::db_metadata_nexus + .filter( + dsl::nexus_id.eq(nexus_db_model::to_db_typed_uuid(nexus_id)), + ) + .first_async(&*self.pool_connection_unauthorized().await?) + .await + .optional() + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(nexus_access) + } + + /// Checks if any db_metadata_nexus records exist in the database + pub async fn database_nexus_access_any_exist(&self) -> Result { + let conn = self.pool_connection_unauthorized().await?; + Self::database_nexus_access_any_exist_on_connection(&conn).await + } + + /// Checks if any db_metadata_nexus records exist in the database using an existing connection + pub async fn database_nexus_access_any_exist_on_connection( + conn: &async_bb8_diesel::Connection, + ) -> Result { + use nexus_db_schema::schema::db_metadata_nexus::dsl; + + let exists: bool = diesel::select(diesel::dsl::exists( + dsl::db_metadata_nexus.select(dsl::nexus_id), + )) + .get_result_async(conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(exists) + } + + /// Registers a Nexus instance as having active access to the database + pub async fn database_nexus_access_insert( + &self, + nexus_id: OmicronZoneUuid, + state: DbMetadataNexusState, + ) -> Result<(), Error> { + use nexus_db_schema::schema::db_metadata_nexus::dsl; + + let new_nexus = DbMetadataNexus::new(nexus_id, state); + + diesel::insert_into(dsl::db_metadata_nexus) + .values(new_nexus) + .on_conflict(dsl::nexus_id) + .do_update() + .set(dsl::state.eq(excluded(dsl::state))) + .execute_async(&*self.pool_connection_unauthorized().await?) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(()) + } + + /// Registers multiple Nexus instances as having active access to the database + pub async fn database_nexus_access_insert_multiple( + &self, + nexus_ids: &[OmicronZoneUuid], + state: DbMetadataNexusState, + ) -> Result<(), Error> { + use nexus_db_schema::schema::db_metadata_nexus::dsl; + + let new_nexuses: Vec = nexus_ids + .iter() + .map(|&nexus_id| DbMetadataNexus::new(nexus_id, state)) + .collect(); + + diesel::insert_into(dsl::db_metadata_nexus) + .values(new_nexuses) + .on_conflict(dsl::nexus_id) + .do_update() + .set(dsl::state.eq(excluded(dsl::state))) + .execute_async(&*self.pool_connection_unauthorized().await?) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(()) + } + + /// Initializes Nexus database access records from a blueprint using an + /// existing connection + /// + /// This function finds all Nexus zones in the given blueprint and creates + /// active database access records for them. Used during RSS rack setup. + /// + /// Returns an error if: + /// - Any db_metadata_nexus records already exist (should only be called + /// during initial setup) + pub async fn initialize_nexus_access_from_blueprint_on_connection( + &self, + conn: &async_bb8_diesel::Connection, + blueprint_id: Uuid, + ) -> Result<(), Error> { + use nexus_db_model::ZoneType; + use nexus_db_schema::schema::bp_omicron_zone::dsl as zone_dsl; + use nexus_db_schema::schema::db_metadata_nexus::dsl; + + // Ensure no db_metadata_nexus records already exist + let any_records_exist = + Self::database_nexus_access_any_exist_on_connection(conn).await?; + if any_records_exist { + return Err(Error::internal_error( + "Cannot initialize Nexus access from blueprint: db_metadata_nexus records already exist. \ + This function should only be called during initial rack setup.", + )); + } + + // Query all Nexus zones from the blueprint + let nexus_zone_ids: Vec = zone_dsl::bp_omicron_zone + .filter(zone_dsl::blueprint_id.eq(blueprint_id)) + .filter(zone_dsl::zone_type.eq(ZoneType::Nexus)) + .select(zone_dsl::id) + .load_async(conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + // Create db_metadata_nexus records for all Nexus zones + let new_nexuses: Vec = nexus_zone_ids + .iter() + .map(|&nexus_id| { + DbMetadataNexus::new( + OmicronZoneUuid::from_untyped_uuid(nexus_id), + DbMetadataNexusState::Active, + ) + }) + .collect(); + + diesel::insert_into(dsl::db_metadata_nexus) + .values(new_nexuses) + .execute_async(conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(()) + } + + // Implementation function for attempt_handoff that runs within a transaction + // + // This function must be executed from a tranaction context to be safe. + async fn attempt_handoff_impl( + conn: async_bb8_diesel::Connection, + nexus_id: OmicronZoneUuid, + err: OptionalError, + ) -> Result<(), diesel::result::Error> { + use nexus_db_schema::schema::db_metadata_nexus::dsl; + + // Before proceeding, all records must be in the "inactive" or "not_yet" states. + // + // We explicitly look for any records violating this, rather than explicitly looking for + // "active" records, as to protect ourselves from future states being added over time. + let active_count: nexus_db_model::SqlU32 = dsl::db_metadata_nexus + .filter( + dsl::state + .ne(DbMetadataNexusState::Inactive) + .and(dsl::state.ne(DbMetadataNexusState::NotYet)), + ) + .count() + .get_result_async(&conn) + .await?; + let active_count: u32 = active_count.0; + if active_count > 0 { + return Err(err.bail(HandoffError::ActiveNexusInstancesExist { + active_count, + })); + } + + // Check that our nexus has a "not_yet" record + // + // Only read the "state" field to avoid reading the rest of the struct, + // in case additional columns are added over time. + let our_nexus_state: Option = + dsl::db_metadata_nexus + .filter( + dsl::nexus_id + .eq(nexus_db_model::to_db_typed_uuid(nexus_id)), + ) + .select(dsl::state) + .get_result_async(&conn) + .await + .optional()?; + let Some(our_state) = our_nexus_state else { + return Err(err.bail(HandoffError::NexusNotRegistered { nexus_id })); + }; + if our_state != DbMetadataNexusState::NotYet { + return Err(err.bail(HandoffError::NexusInWrongState { + nexus_id, + current_state: our_state, + })); + } + + // Update all "not_yet" records to "active" + diesel::update(dsl::db_metadata_nexus) + .filter(dsl::state.eq(DbMetadataNexusState::NotYet)) + .set(dsl::state.eq(DbMetadataNexusState::Active)) + .execute_async(&conn) + .await?; + + Ok(()) + } + + /// Attempts to perform a handoff to activate this Nexus for database access. + /// + /// This function checks that: + /// 1. ALL records in db_metadata_nexus are in "not_yet" or "inactive" states + /// 2. The specified nexus_id has a record which is "not_yet" + /// + /// If both conditions are met, it updates ALL "not_yet" records to "active". + /// These operations are performed transactionally. + /// + /// Returns an error if: + /// - Any record is in "active" state + /// - The specified nexus_id doesn't have a "not_yet" record + /// - Database transaction fails + pub async fn attempt_handoff( + &self, + nexus_id: OmicronZoneUuid, + ) -> Result<(), Error> { + let err = OptionalError::new(); + let conn = self.pool_connection_unauthorized().await?; + + self.transaction_retry_wrapper("attempt_handoff") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + Self::attempt_handoff_impl(conn, nexus_id, err).await + } + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + err.into() + } else { + public_error_from_diesel(e, ErrorHandler::Server) + } + }) + } + pub async fn database_schema_version( &self, ) -> Result<(Version, Option), Error> { @@ -503,15 +1071,35 @@ mod test { // Confirms that calling the internal "ensure_schema" function can succeed // when the database is already at that version. #[tokio::test] - async fn ensure_schema_is_current_version() { - let logctx = dev::test_setup_log("ensure_schema_is_current_version"); + async fn check_schema_is_current_version() { + let logctx = dev::test_setup_log("check_schema_is_current_version"); let db = TestDatabase::new_with_raw_datastore(&logctx.log).await; let datastore = db.datastore(); - datastore - .ensure_schema(&logctx.log, SCHEMA_VERSION, None) + let checked_action = datastore + .check_schema_and_access( + None, + SCHEMA_VERSION, + ConsumerPolicy::FailOnMismatch, + ) .await - .expect("Failed to ensure schema"); + .expect("Failed to check schema and access"); + + assert!( + matches!(checked_action.action(), SchemaAction::Ready), + "Unexpected action: {:?}", + checked_action.action(), + ); + assert_eq!( + checked_action.desired_version(), + &SCHEMA_VERSION, + "Unexpected desired version: {}", + checked_action.desired_version() + ); + + datastore.update_schema(checked_action, None).await.expect_err( + "Should not be able to update schema that's already up-to-date", + ); db.terminate().await; logctx.cleanup_successful(); @@ -615,7 +1203,8 @@ mod test { let pool = pool.clone(); tokio::task::spawn(async move { let datastore = - DataStore::new(&log, pool, Some(&all_versions)).await?; + DataStore::new(&log, pool, Some(&all_versions), None) + .await?; // This is the crux of this test: confirm that, as each // migration completes, it's not possible to see any artifacts @@ -742,9 +1331,24 @@ mod test { // Manually construct the datastore to avoid the backoff timeout. // We want to trigger errors, but have no need to wait. + let datastore = DataStore::new_unchecked(log.clone(), pool.clone()); + let checked_action = datastore + .check_schema_and_access( + None, + SCHEMA_VERSION, + ConsumerPolicy::Update, + ) + .await + .expect("Failed to check schema and access"); + + // This needs to be in a loop because we constructed a schema change + // that will intentionally fail sometimes when doing this work. + // + // This isn't a normal behavior! But we're trying to test the + // intermediate steps of a schema change here. while let Err(e) = datastore - .ensure_schema(&log, SCHEMA_VERSION, Some(&all_versions)) + .update_schema(checked_action.clone(), Some(&all_versions)) .await { warn!(log, "Failed to ensure schema"; "err" => %e); @@ -771,4 +1375,582 @@ mod test { db.terminate().await; logctx.cleanup_successful(); } + + #[tokio::test] + async fn test_attempt_handoff_with_active_records() { + let logctx = + dev::test_setup_log("test_attempt_handoff_with_active_records"); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let datastore = + DataStore::new_unchecked(logctx.log.clone(), db.pool().clone()); + + // Set up test data: create some nexus records, including one active + let nexus1_id = OmicronZoneUuid::new_v4(); + let nexus2_id = OmicronZoneUuid::new_v4(); + let nexus3_id = OmicronZoneUuid::new_v4(); + + // Insert records: one active, one not_yet, one inactive + datastore + .database_nexus_access_insert( + nexus1_id, + DbMetadataNexusState::Active, + ) + .await + .expect("Failed to insert active nexus"); + datastore + .database_nexus_access_insert( + nexus2_id, + DbMetadataNexusState::NotYet, + ) + .await + .expect("Failed to insert not_yet nexus"); + datastore + .database_nexus_access_insert( + nexus3_id, + DbMetadataNexusState::Inactive, + ) + .await + .expect("Failed to insert inactive nexus"); + + // Attempt handoff with nexus2 - should fail because nexus1 is active + let result = datastore.attempt_handoff(nexus2_id).await; + assert!(result.is_err()); + let error_msg = format!("{}", result.unwrap_err()); + assert!( + error_msg.contains("1 Nexus instance(s) are still active"), + "Expected error about active instances, got: {}", + error_msg + ); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_attempt_handoff_nexus_not_registered() { + let logctx = + dev::test_setup_log("test_attempt_handoff_nexus_not_registered"); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let datastore = + DataStore::new_unchecked(logctx.log.clone(), db.pool().clone()); + + // Set up test data: create some other nexus records but not the one we're trying to handoff + let nexus1_id = OmicronZoneUuid::new_v4(); + let nexus2_id = OmicronZoneUuid::new_v4(); + let unregistered_nexus_id = OmicronZoneUuid::new_v4(); + + datastore + .database_nexus_access_insert( + nexus1_id, + DbMetadataNexusState::NotYet, + ) + .await + .expect("Failed to insert nexus1"); + datastore + .database_nexus_access_insert( + nexus2_id, + DbMetadataNexusState::Inactive, + ) + .await + .expect("Failed to insert nexus2"); + + // Attempt handoff with unregistered nexus - should fail + let result = datastore.attempt_handoff(unregistered_nexus_id).await; + assert!(result.is_err()); + let error_msg = format!("{}", result.unwrap_err()); + assert!( + error_msg + .contains("does not have a record in db_metadata_nexus table"), + "Expected error about unregistered nexus, got: {}", + error_msg + ); + assert!( + error_msg.contains(&unregistered_nexus_id.to_string()), + "Expected error to contain nexus ID {}, got: {}", + unregistered_nexus_id, + error_msg + ); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_attempt_handoff_nexus_wrong_state() { + let logctx = + dev::test_setup_log("test_attempt_handoff_nexus_wrong_state"); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let datastore = + DataStore::new_unchecked(logctx.log.clone(), db.pool().clone()); + + // Set up test data: create nexus records where our target is in wrong state + let nexus1_id = OmicronZoneUuid::new_v4(); + let nexus2_id = OmicronZoneUuid::new_v4(); + let inactive_nexus_id = OmicronZoneUuid::new_v4(); + + datastore + .database_nexus_access_insert( + nexus1_id, + DbMetadataNexusState::NotYet, + ) + .await + .expect("Failed to insert nexus1"); + datastore + .database_nexus_access_insert( + nexus2_id, + DbMetadataNexusState::NotYet, + ) + .await + .expect("Failed to insert nexus2"); + datastore + .database_nexus_access_insert( + inactive_nexus_id, + DbMetadataNexusState::Inactive, + ) + .await + .expect("Failed to insert inactive nexus"); + + // Attempt handoff with inactive nexus - should fail + let result = datastore.attempt_handoff(inactive_nexus_id).await; + assert!(result.is_err()); + let error_msg = format!("{}", result.unwrap_err()); + assert!( + error_msg.contains("is in state Inactive"), + "Expected error about wrong state, got: {}", + error_msg + ); + assert!( + error_msg.contains("Must be in 'not_yet' state to become active"), + "Expected error to mention required state, got: {}", + error_msg + ); + assert!( + error_msg.contains(&inactive_nexus_id.to_string()), + "Expected error to contain nexus ID {}, got: {}", + inactive_nexus_id, + error_msg + ); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_attempt_handoff_success() { + let logctx = dev::test_setup_log("test_attempt_handoff_success"); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let datastore = + DataStore::new_unchecked(logctx.log.clone(), db.pool().clone()); + + // Set up test data: create multiple nexus records in not_yet and inactive states + let nexus1_id = OmicronZoneUuid::new_v4(); + let nexus2_id = OmicronZoneUuid::new_v4(); + let nexus3_id = OmicronZoneUuid::new_v4(); + + datastore + .database_nexus_access_insert( + nexus1_id, + DbMetadataNexusState::NotYet, + ) + .await + .expect("Failed to insert nexus1"); + datastore + .database_nexus_access_insert( + nexus2_id, + DbMetadataNexusState::NotYet, + ) + .await + .expect("Failed to insert nexus2"); + datastore + .database_nexus_access_insert( + nexus3_id, + DbMetadataNexusState::Inactive, + ) + .await + .expect("Failed to insert nexus3"); + + // Verify initial state: all not_yet or inactive + let nexus1_before = datastore + .database_nexus_access(nexus1_id) + .await + .expect("Failed to get nexus1") + .expect("nexus1 should exist"); + let nexus2_before = datastore + .database_nexus_access(nexus2_id) + .await + .expect("Failed to get nexus2") + .expect("nexus2 should exist"); + let nexus3_before = datastore + .database_nexus_access(nexus3_id) + .await + .expect("Failed to get nexus3") + .expect("nexus3 should exist"); + + assert_eq!(nexus1_before.state(), DbMetadataNexusState::NotYet); + assert_eq!(nexus2_before.state(), DbMetadataNexusState::NotYet); + assert_eq!(nexus3_before.state(), DbMetadataNexusState::Inactive); + + // Attempt handoff with nexus2 - should succeed + let result = datastore.attempt_handoff(nexus2_id).await; + if let Err(ref e) = result { + panic!("Handoff should succeed but got error: {}", e); + } + assert!(result.is_ok()); + + // Verify final state: all not_yet records should now be active, inactive should remain inactive + let nexus1_after = datastore + .database_nexus_access(nexus1_id) + .await + .expect("Failed to get nexus1") + .expect("nexus1 should exist"); + let nexus2_after = datastore + .database_nexus_access(nexus2_id) + .await + .expect("Failed to get nexus2") + .expect("nexus2 should exist"); + let nexus3_after = datastore + .database_nexus_access(nexus3_id) + .await + .expect("Failed to get nexus3") + .expect("nexus3 should exist"); + + assert_eq!(nexus1_after.state(), DbMetadataNexusState::Active); + assert_eq!(nexus2_after.state(), DbMetadataNexusState::Active); + assert_eq!(nexus3_after.state(), DbMetadataNexusState::Inactive); // Should remain unchanged + + db.terminate().await; + logctx.cleanup_successful(); + } + + // This test covers two cases: + // + // 1. New systems: We use RSS to initialize Nexus, but no db_metadata_nexus entries + // exist. + // 2. Deployed systems: We have a deployed system which updates to have this + // "db_metadata_nexus"-handling code, but has no rows in that table. + // + // Both of these cases must be granted database access to self-populate later. + #[tokio::test] + async fn test_check_schema_and_access_empty_table_permits_access() { + let logctx = dev::test_setup_log( + "test_check_schema_and_access_empty_table_permits_access", + ); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let datastore = + DataStore::new_unchecked(logctx.log.clone(), db.pool().clone()); + + let nexus_id = OmicronZoneUuid::new_v4(); + + // With an empty table, even explicit nexus ID should get access + let action = datastore + .check_schema_and_access( + Some(nexus_id), + SCHEMA_VERSION, + ConsumerPolicy::FailOnMismatch, + ) + .await + .expect("Failed to check schema and access"); + assert_eq!(action.action(), &SchemaAction::Ready); + + // Add a record to the table, now explicit nexus ID should NOT get access + datastore + .database_nexus_access_insert( + OmicronZoneUuid::new_v4(), // Different nexus + DbMetadataNexusState::Active, + ) + .await + .expect("Failed to insert nexus record"); + + let action = datastore + .check_schema_and_access( + Some(nexus_id), + SCHEMA_VERSION, + ConsumerPolicy::FailOnMismatch, + ) + .await + .expect("Failed to check schema and access"); + assert_eq!(action.action(), &SchemaAction::Refuse); + + db.terminate().await; + logctx.cleanup_successful(); + } + + // Validates the case where a Nexus ID is explicitly requested or omitted. + // + // The omission case is important for the "schema-updater" binary to keep working. + #[tokio::test] + async fn test_check_schema_and_access_nexus_id() { + let logctx = + dev::test_setup_log("test_check_schema_and_access_nexus_id"); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let datastore = + DataStore::new_unchecked(logctx.log.clone(), db.pool().clone()); + + // Add an active record, for some Nexus ID. + datastore + .database_nexus_access_insert( + OmicronZoneUuid::new_v4(), + DbMetadataNexusState::Active, + ) + .await + .expect("Failed to insert nexus record"); + + // Using 'None' as a nexus ID should get access (schema updater case) + let action = datastore + .check_schema_and_access( + None, + SCHEMA_VERSION, + ConsumerPolicy::FailOnMismatch, + ) + .await + .expect("Failed to check schema and access"); + assert_eq!(action.action(), &SchemaAction::Ready); + + // Explicit nexus ID that doesn't exist should not get access + let nexus_id = OmicronZoneUuid::new_v4(); + let action = datastore + .check_schema_and_access( + Some(nexus_id), + SCHEMA_VERSION, + ConsumerPolicy::FailOnMismatch, + ) + .await + .expect("Failed to check schema and access"); + assert_eq!(action.action(), &SchemaAction::Refuse); + + db.terminate().await; + logctx.cleanup_successful(); + } + + // Validates that an explicit db_metadata_nexus record can lock-out Nexuses which should not be + // able to access the database. + #[tokio::test] + async fn test_check_schema_and_access_lockout_refuses_access() { + let logctx = dev::test_setup_log( + "test_check_schema_and_access_lockout_refuses_access", + ); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let datastore = + DataStore::new_unchecked(logctx.log.clone(), db.pool().clone()); + + let nexus_id = OmicronZoneUuid::new_v4(); + + // Insert our nexus as inactive (locked out) + datastore + .database_nexus_access_insert( + nexus_id, + DbMetadataNexusState::Inactive, + ) + .await + .expect("Failed to insert nexus record"); + + // Should refuse access regardless of policy + for policy in [ConsumerPolicy::FailOnMismatch, ConsumerPolicy::Update] { + let action = datastore + .check_schema_and_access(Some(nexus_id), SCHEMA_VERSION, policy) + .await + .expect("Failed to check schema and access"); + assert_eq!(action.action(), &SchemaAction::Refuse); + } + + db.terminate().await; + logctx.cleanup_successful(); + } + + // Validates that if a Nexus with an "old desired schema" boots, it cannot access the + // database under any conditions. + // + // This is the case where the database has upgraded beyond what Nexus can understand. + // + // In practice, the db_metadata_nexus records should prevent this situation from occurring, + // but it's still a useful property to reject old schemas while the "schema-updater" binary + // exists. + #[tokio::test] + async fn test_check_schema_and_access_schema_too_new() { + let logctx = + dev::test_setup_log("test_check_schema_and_access_schema_too_new"); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let datastore = + DataStore::new_unchecked(logctx.log.clone(), db.pool().clone()); + + let nexus_id = OmicronZoneUuid::new_v4(); + + // Insert our nexus as active + datastore + .database_nexus_access_insert( + nexus_id, + DbMetadataNexusState::Active, + ) + .await + .expect("Failed to insert nexus record"); + + // Try to access with an older version than what's in the database + let older_version = Version::new(SCHEMA_VERSION.major - 1, 0, 0); + + for policy in [ConsumerPolicy::FailOnMismatch, ConsumerPolicy::Update] { + // Explicit Nexus ID: Rejected + let action = datastore + .check_schema_and_access( + Some(nexus_id), + older_version.clone(), + policy, + ) + .await + .expect("Failed to check schema and access"); + assert_eq!(action.action(), &SchemaAction::Refuse); + + // Implicit Access: Rejected + let action = datastore + .check_schema_and_access(None, older_version.clone(), policy) + .await + .expect("Failed to check schema and access"); + assert_eq!(action.action(), &SchemaAction::Refuse); + } + + db.terminate().await; + logctx.cleanup_successful(); + } + + // Validates that the schema + access combinations identify we should wait for handoff + // when we have a "NotYet" record that could become compatible with the database. + #[tokio::test] + async fn test_check_schema_and_access_wait_for_handoff() { + let logctx = dev::test_setup_log( + "test_check_schema_and_access_wait_for_handoff", + ); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let datastore = + DataStore::new_unchecked(logctx.log.clone(), db.pool().clone()); + + let nexus_id = OmicronZoneUuid::new_v4(); + + // Insert our nexus as not_yet (doesn't have access yet) + datastore + .database_nexus_access_insert( + nexus_id, + DbMetadataNexusState::NotYet, + ) + .await + .expect("Failed to insert nexus record"); + + // We should wait for handoff if the versions match, or if our desired + // version is newer than what exists in the database. + let current_version = SCHEMA_VERSION; + let newer_version = Version::new(SCHEMA_VERSION.major + 1, 0, 0); + let versions = [current_version, newer_version]; + + for version in &versions { + // With Update policy, should wait for handoff when schema is up-to-date + let action = datastore + .check_schema_and_access( + Some(nexus_id), + version.clone(), + ConsumerPolicy::Update, + ) + .await + .expect("Failed to check schema and access"); + assert_eq!(action.action(), &SchemaAction::WaitForHandoff); + + // With FailOnMismatch policy, should refuse + let action = datastore + .check_schema_and_access( + Some(nexus_id), + version.clone(), + ConsumerPolicy::FailOnMismatch, + ) + .await + .expect("Failed to check schema and access"); + assert_eq!(action.action(), &SchemaAction::Refuse); + } + + db.terminate().await; + logctx.cleanup_successful(); + } + + // Validates the "normal case", where a Nexus has access and the schema already matches. + #[tokio::test] + async fn test_check_schema_and_access_normal_use() { + let logctx = + dev::test_setup_log("test_check_schema_and_access_normal_use"); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let datastore = + DataStore::new_unchecked(logctx.log.clone(), db.pool().clone()); + + let nexus_id = OmicronZoneUuid::new_v4(); + + // Insert our nexus as active + datastore + .database_nexus_access_insert( + nexus_id, + DbMetadataNexusState::Active, + ) + .await + .expect("Failed to insert nexus record"); + + // With current schema version, should be ready for normal use + for policy in [ConsumerPolicy::FailOnMismatch, ConsumerPolicy::Update] { + let action = datastore + .check_schema_and_access(Some(nexus_id), SCHEMA_VERSION, policy) + .await + .expect("Failed to check schema and access"); + + assert_eq!(action.action(), &SchemaAction::Ready); + assert_eq!(action.desired_version(), &SCHEMA_VERSION); + } + + db.terminate().await; + logctx.cleanup_successful(); + } + + // Validates that when a Nexus is active with a newer-than-database desired version, + // it will request an update if the policy allows. + #[tokio::test] + async fn test_check_schema_and_access_update_now() { + let logctx = + dev::test_setup_log("test_check_schema_and_access_update_now"); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let datastore = + DataStore::new_unchecked(logctx.log.clone(), db.pool().clone()); + + let nexus_id = OmicronZoneUuid::new_v4(); + + // Insert our nexus as active + datastore + .database_nexus_access_insert( + nexus_id, + DbMetadataNexusState::Active, + ) + .await + .expect("Failed to insert nexus record"); + + let newer_version = Version::new(SCHEMA_VERSION.major + 1, 0, 0); + + // With newer desired version and Update policy, should request update + let action = datastore + .check_schema_and_access( + Some(nexus_id), + newer_version.clone(), + ConsumerPolicy::Update, + ) + .await + .expect("Failed to check schema and access"); + + assert_eq!(action.action(), &SchemaAction::Update); + assert_eq!(action.desired_version(), &newer_version); + + // With FailOnMismatch policy, should refuse + let action = datastore + .check_schema_and_access( + Some(nexus_id), + newer_version, + ConsumerPolicy::FailOnMismatch, + ) + .await + .expect("Failed to check schema and access"); + + assert_eq!(action.action(), &SchemaAction::Refuse); + + db.terminate().await; + logctx.cleanup_successful(); + } } diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 1d148c74200..99d8694f2f4 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -39,7 +39,7 @@ use omicron_common::api::external::ResourceType; use omicron_common::backoff::{ BackoffError, retry_notify, retry_policy_internal_service, }; -use omicron_uuid_kinds::{GenericUuid, SledUuid}; +use omicron_uuid_kinds::{GenericUuid, OmicronZoneUuid, SledUuid}; use slog::Logger; use std::net::Ipv6Addr; use std::num::NonZeroU32; @@ -121,6 +121,9 @@ pub mod webhook_delivery; mod zpool; pub use address_lot::AddressLotCreateResult; +pub use db_metadata::ConsumerPolicy; +pub use db_metadata::SchemaAction; +pub use db_metadata::ValidatedSchemaAction; pub use dns::DataStoreDnsTest; pub use dns::DnsVersionUpdateBuilder; pub use ereport::EreportFilters; @@ -221,8 +224,9 @@ impl DataStore { log: &Logger, pool: Arc, config: Option<&AllSchemaVersions>, + nexus_id: Option, ) -> Result { - Self::new_with_timeout(log, pool, config, None).await + Self::new_with_timeout(log, pool, config, None, nexus_id).await } pub async fn new_with_timeout( @@ -230,7 +234,10 @@ impl DataStore { pool: Arc, config: Option<&AllSchemaVersions>, try_for: Option, + nexus_id: Option, ) -> Result { + use db_metadata::ConsumerPolicy; + use db_metadata::SchemaAction; use nexus_db_model::SCHEMA_VERSION as EXPECTED_VERSION; let datastore = @@ -244,25 +251,61 @@ impl DataStore { || async { if let Some(try_for) = try_for { if std::time::Instant::now() > start + try_for { - return Err(BackoffError::permanent(())); + return Err(BackoffError::permanent("Timeout waiting for database")); } } - match datastore - .ensure_schema(&log, EXPECTED_VERSION, config) - .await - { - Ok(()) => return Ok(()), - Err(e) => { - warn!(log, "Failed to ensure schema version"; "error" => #%e); + let checked_action = datastore.check_schema_and_access( + nexus_id, + EXPECTED_VERSION, + ConsumerPolicy::Update, + ).await.map_err(|err| { + warn!(log, "Cannot check schema version / Nexus access"; "error" => #%err); + BackoffError::transient("") + })?; + + match checked_action.action() { + SchemaAction::Ready => { + info!(log, "Datastore is ready for usage"); + return Ok(()); } - }; - return Err(BackoffError::transient(())); + SchemaAction::WaitForHandoff => { + info!(log, "Datastore is awaiting handoff"); + + let Some(nexus_id) = nexus_id else { + return Err(BackoffError::permanent("Nexus ID needed for handoff")); + }; + + datastore.attempt_handoff( + nexus_id + ).await.map_err(|err| { + warn!(log, "Could not perform handoff to new nexus"; err); + BackoffError::transient("") + })?; + + todo!(); + } + SchemaAction::Update => { + info!(log, "Datastore should be updated before usage"); + datastore.update_schema( + checked_action, + config + ).await.map_err(|err| { + warn!(log, "Failed to update schema version"; "error" => #%err); + BackoffError::transient("") + })?; + return Ok(()); + }, + SchemaAction::Refuse => { + error!(log, "Datastore should not be used"); + return Err(BackoffError::permanent("Datastore should not be used")); + }, + } }, |_, _| {}, ) .await - .map_err(|_| "Failed to read valid DB schema".to_string())?; + .map_err(|err| err.to_string())?; Ok(datastore) } diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index ce0e4f72244..268e3d90363 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -723,6 +723,7 @@ impl DataStore { // - Zpools // - Datasets // - A blueprint + // - Nexus database access records // // Which RSS has already allocated during bootstrapping. @@ -787,6 +788,15 @@ impl DataStore { DieselError::RollbackTransaction })?; + // Insert Nexus database access records + self.initialize_nexus_access_from_blueprint_on_connection( + &conn, + *blueprint.id.as_untyped_uuid() + ).await.map_err(|e| { + err.set(RackInitError::BlueprintTargetSet(e)).unwrap(); + DieselError::RollbackTransaction + })?; + // Allocate networking records for all services. for (_, zone_config) in blueprint.all_omicron_zones(BlueprintZoneDisposition::is_in_service) { self.rack_populate_service_networking_records( diff --git a/nexus/db-queries/src/db/pub_test_utils/mod.rs b/nexus/db-queries/src/db/pub_test_utils/mod.rs index db2acabd06a..cbfa5588b33 100644 --- a/nexus/db-queries/src/db/pub_test_utils/mod.rs +++ b/nexus/db-queries/src/db/pub_test_utils/mod.rs @@ -114,7 +114,9 @@ impl TestDatabaseBuilder { Interface::Datastore => { let pool = new_pool(log, &db); let datastore = Arc::new( - DataStore::new(&log, pool, None).await.unwrap(), + DataStore::new(&log, pool, None, None) + .await + .unwrap(), ); TestDatabase { db, @@ -300,7 +302,8 @@ async fn datastore_test( let cfg = db::Config { url: db.pg_config().clone() }; let pool = Arc::new(db::Pool::new_single_host(&log, &cfg)); - let datastore = Arc::new(DataStore::new(&log, pool, None).await.unwrap()); + let datastore = + Arc::new(DataStore::new(&log, pool, None, None).await.unwrap()); // Create an OpContext with the credentials of "db-init" just for the // purpose of loading the built-in users, roles, and assignments. diff --git a/nexus/db-schema/src/enums.rs b/nexus/db-schema/src/enums.rs index 39210a32d6f..2a651f67d4e 100644 --- a/nexus/db-schema/src/enums.rs +++ b/nexus/db-schema/src/enums.rs @@ -37,6 +37,7 @@ define_enums! { CabooseWhichEnum => "caboose_which", ClickhouseModeEnum => "clickhouse_mode", DatasetKindEnum => "dataset_kind", + DbMetadataNexusStateEnum => "db_metadata_nexus_state", DnsGroupEnum => "dns_group", DownstairsClientStopRequestReasonEnum => "downstairs_client_stop_request_reason_type", DownstairsClientStoppedReasonEnum => "downstairs_client_stopped_reason_type", diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index c0f2fe5f833..dca3030154e 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -2369,6 +2369,14 @@ table! { } } +table! { + db_metadata_nexus (nexus_id) { + nexus_id -> Uuid, + last_drained_blueprint_id -> Nullable, + state -> crate::enums::DbMetadataNexusStateEnum, + } +} + table! { migration (id) { id -> Uuid, diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index cb3ca045cf9..f8c81bc5df9 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -310,12 +310,14 @@ impl Nexus { .map(|s| AllSchemaVersions::load(&s.schema_dir)) .transpose() .map_err(|error| format!("{error:#}"))?; + let nexus_id = Some(config.deployment.id); let db_datastore = Arc::new( db::DataStore::new_with_timeout( &log, Arc::clone(&pool), all_versions.as_ref(), config.pkg.tunables.load_timeout, + nexus_id, ) .await?, ); diff --git a/nexus/src/bin/schema-updater.rs b/nexus/src/bin/schema-updater.rs index ead8554f734..bc82dccfff1 100644 --- a/nexus/src/bin/schema-updater.rs +++ b/nexus/src/bin/schema-updater.rs @@ -108,11 +108,27 @@ async fn main_impl() -> anyhow::Result<()> { } Cmd::Upgrade { version } => { println!("Upgrading to {version}"); - datastore - .ensure_schema(&log, version.clone(), Some(&all_versions)) - .await - .map_err(|e| anyhow!(e))?; - println!("Upgrade to {version} complete"); + let checked_action = datastore + .check_schema_and_access( + None, + version.clone(), + db::datastore::ConsumerPolicy::Update, + ) + .await?; + + match checked_action.action() { + db::datastore::SchemaAction::Ready => { + println!("Already at version {version}") + } + db::datastore::SchemaAction::Update => { + datastore + .update_schema(checked_action, Some(&all_versions)) + .await + .map_err(|e| anyhow!(e))?; + println!("Update to {version} complete"); + } + _ => println!("Cannot update to version {version}"), + } } } datastore.terminate().await; diff --git a/nexus/src/populate.rs b/nexus/src/populate.rs index b77245d5df4..d19439fb9cc 100644 --- a/nexus/src/populate.rs +++ b/nexus/src/populate.rs @@ -364,7 +364,7 @@ mod test { let cfg = db::Config { url: db.crdb().pg_config().clone() }; let pool = Arc::new(db::Pool::new_single_host(&logctx.log, &cfg)); let datastore = Arc::new( - db::DataStore::new(&logctx.log, pool, None).await.unwrap(), + db::DataStore::new(&logctx.log, pool, None, None).await.unwrap(), ); let opctx = OpContext::for_background( logctx.log.clone(), @@ -415,7 +415,7 @@ mod test { // We need to create the datastore before tearing down the database, as // it verifies the schema version of the DB while booting. let datastore = Arc::new( - db::DataStore::new(&logctx.log, pool, None).await.unwrap(), + db::DataStore::new(&logctx.log, pool, None, None).await.unwrap(), ); let opctx = OpContext::for_background( logctx.log.clone(), diff --git a/nexus/tests/integration_tests/initialization.rs b/nexus/tests/integration_tests/initialization.rs index 989029625a5..9a2fde988b5 100644 --- a/nexus/tests/integration_tests/initialization.rs +++ b/nexus/tests/integration_tests/initialization.rs @@ -248,7 +248,7 @@ async fn test_nexus_does_not_boot_without_valid_schema() { .expect_err("Nexus should have failed to start"); assert!( - err.contains("Failed to read valid DB schema"), + err.contains("Datastore should not be used"), "Saw error: {err}" ); diff --git a/nexus/tests/integration_tests/schema.rs b/nexus/tests/integration_tests/schema.rs index 9111e6a0949..9f5c602e025 100644 --- a/nexus/tests/integration_tests/schema.rs +++ b/nexus/tests/integration_tests/schema.rs @@ -2889,6 +2889,148 @@ fn after_171_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { }) } +const NEXUS_ID_181_0: &str = "387433f9-1473-4ca2-b156-9670452985e0"; +const OLD_NEXUS_ID_181_0: &str = "287433f9-1473-4ca2-b156-9670452985e0"; + +const BP_ID_181_0: &str = "5a5ff941-3b5a-403b-9fda-db2049f4c736"; +const OLD_BP_ID_181_0: &str = "4a5ff941-3b5a-403b-9fda-db2049f4c736"; + +fn before_181_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { + Box::pin(async move { + // Create a blueprint which contains a Nexus - we'll use this for the migration. + ctx.client + .execute( + &format!( + "INSERT INTO omicron.public.bp_target + (version, blueprint_id, enabled, time_made_target) + VALUES + (1, '{BP_ID_181_0}', true, now());", + ), + &[], + ) + .await + .expect("inserted bp_target rows for 181"); + ctx.client + .execute( + &format!( + "INSERT INTO omicron.public.bp_omicron_zone ( + blueprint_id, sled_id, id, zone_type, + primary_service_ip, primary_service_port, + second_service_ip, second_service_port, + dataset_zpool_name, bp_nic_id, + dns_gz_address, dns_gz_address_index, + ntp_ntp_servers, ntp_dns_servers, ntp_domain, + nexus_external_tls, nexus_external_dns_servers, + snat_ip, snat_first_port, snat_last_port, + external_ip_id, filesystem_pool, disposition, + disposition_expunged_as_of_generation, + disposition_expunged_ready_for_cleanup, + image_source, image_artifact_sha256 + ) + VALUES ( + '{BP_ID_181_0}', gen_random_uuid(), '{NEXUS_ID_181_0}', + 'nexus', '192.168.1.10', 8080, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, + false, ARRAY[]::INET[], NULL, NULL, NULL, + NULL, gen_random_uuid(), 'in_service', + NULL, false, 'install_dataset', NULL + );" + ), + &[], + ) + .await + .expect("inserted bp_omicron_zone rows for 181"); + + // ALSO create an old blueprint, which isn't the latest target. + // + // We should ignore this one! No rows should be inserted for old data. + ctx.client + .execute( + &format!( + "INSERT INTO omicron.public.bp_target + (version, blueprint_id, enabled, time_made_target) + VALUES + (0, '{OLD_BP_ID_181_0}', true, now());", + ), + &[], + ) + .await + .expect("inserted bp_target rows for 181"); + ctx.client + .execute( + &format!( + "INSERT INTO omicron.public.bp_omicron_zone ( + blueprint_id, sled_id, id, zone_type, + primary_service_ip, primary_service_port, + second_service_ip, second_service_port, + dataset_zpool_name, bp_nic_id, + dns_gz_address, dns_gz_address_index, + ntp_ntp_servers, ntp_dns_servers, ntp_domain, + nexus_external_tls, nexus_external_dns_servers, + snat_ip, snat_first_port, snat_last_port, + external_ip_id, filesystem_pool, disposition, + disposition_expunged_as_of_generation, + disposition_expunged_ready_for_cleanup, + image_source, image_artifact_sha256 + ) + VALUES ( + '{OLD_BP_ID_181_0}', gen_random_uuid(), + '{OLD_NEXUS_ID_181_0}', 'nexus', '192.168.1.10', 8080, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, + false, ARRAY[]::INET[], NULL, NULL, NULL, + NULL, gen_random_uuid(), 'in_service', + NULL, false, 'install_dataset', NULL + );" + ), + &[], + ) + .await + .expect("inserted bp_omicron_zone rows for 181"); + }) +} + +fn after_181_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { + Box::pin(async move { + // After the migration, the new row should be created - only for Nexuses + // in the latest blueprint. + // + // Note that "OLD_NEXUS_ID_181_0" doesn't get a row - it's in an old + // blueprint. + let rows = ctx + .client + .query( + "SELECT + nexus_id, + last_drained_blueprint_id, + state + FROM omicron.public.db_metadata_nexus;", + &[], + ) + .await + .expect("queried post-migration inv_sled_config_reconciler"); + + let rows = process_rows(&rows); + assert_eq!(rows.len(), 1); + let row = &rows[0]; + + // Create a new row for the Nexuses in the target blueprint + assert_eq!( + row.values[0].expect("nexus_id").unwrap(), + &AnySqlType::Uuid(NEXUS_ID_181_0.parse().unwrap()) + ); + assert_eq!(row.values[1].expect("last_drained_blueprint_id"), None); + assert_eq!( + row.values[2].expect("state").unwrap(), + &AnySqlType::Enum(SqlEnum::from(( + "db_metadata_nexus_state", + "active" + ))) + ); + }) +} + // Lazily initializes all migration checks. The combination of Rust function // pointers and async makes defining a static table fairly painful, so we're // using lazy initialization instead. @@ -2987,7 +3129,10 @@ fn get_migration_checks() -> BTreeMap { Version::new(171, 0, 0), DataMigrationFns::new().before(before_171_0_0).after(after_171_0_0), ); - + map.insert( + Version::new(181, 0, 0), + DataMigrationFns::new().before(before_181_0_0).after(after_181_0_0), + ); map } diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index f87d11e0903..799ade10b39 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -5616,29 +5616,6 @@ CREATE INDEX IF NOT EXISTS lookup_region_snapshot_replacement_step_by_state CREATE INDEX IF NOT EXISTS lookup_region_snapshot_replacement_step_by_old_volume_id on omicron.public.region_snapshot_replacement_step (old_snapshot_volume_id); -/* - * Metadata for the schema itself. This version number isn't great, as there's - * nothing to ensure it gets bumped when it should be, but it's a start. - */ -CREATE TABLE IF NOT EXISTS omicron.public.db_metadata ( - -- There should only be one row of this table for the whole DB. - -- It's a little goofy, but filter on "singleton = true" before querying - -- or applying updates, and you'll access the singleton row. - -- - -- We also add a constraint on this table to ensure it's not possible to - -- access the version of this table with "singleton = false". - singleton BOOL NOT NULL PRIMARY KEY, - time_created TIMESTAMPTZ NOT NULL, - time_modified TIMESTAMPTZ NOT NULL, - -- Semver representation of the DB version - version STRING(64) NOT NULL, - - -- (Optional) Semver representation of the DB version to which we're upgrading - target_version STRING(64), - - CHECK (singleton = true) -); - -- An allowlist of IP addresses that can make requests to user-facing services. CREATE TABLE IF NOT EXISTS omicron.public.allow_list ( id UUID PRIMARY KEY, @@ -6539,10 +6516,58 @@ ON omicron.public.host_ereport ( ) WHERE time_deleted IS NULL; -/* - * Keep this at the end of file so that the database does not contain a version - * until it is fully populated. - */ +-- Metadata for the schema itself. +-- +-- This table may be read by Nexuses with different notions of "what the schema should be". +-- Unlike other tables in the database, caution should be taken when upgrading this schema. +CREATE TABLE IF NOT EXISTS omicron.public.db_metadata ( + -- There should only be one row of this table for the whole DB. + -- It's a little goofy, but filter on "singleton = true" before querying + -- or applying updates, and you'll access the singleton row. + -- + -- We also add a constraint on this table to ensure it's not possible to + -- access the version of this table with "singleton = false". + singleton BOOL NOT NULL PRIMARY KEY, + time_created TIMESTAMPTZ NOT NULL, + time_modified TIMESTAMPTZ NOT NULL, + -- Semver representation of the DB version + version STRING(64) NOT NULL, + + -- (Optional) Semver representation of the DB version to which we're upgrading + target_version STRING(64), + + CHECK (singleton = true) +); + +CREATE TYPE IF NOT EXISTS omicron.public.db_metadata_nexus_state AS ENUM ( + -- This Nexus is allowed to access this database + 'active', + + -- This Nexus is not yet allowed to access the database + 'not_yet', + + -- This Nexus has committed to no longer accessing this database + 'inactive' +); + +-- Nexuses which may be attempting to access the database, and a state +-- which identifies if they should be allowed to do so. +-- +-- This table is used during upgrade implement handoff between old and new +-- Nexus zones. It is read by all Nexuses during initialization to identify +-- if they should have access to the database. +CREATE TABLE IF NOT EXISTS omicron.public.db_metadata_nexus ( + nexus_id UUID NOT NULL PRIMARY KEY, + last_drained_blueprint_id UUID, + state omicron.public.db_metadata_nexus_state NOT NULL +); + +CREATE INDEX IF NOT EXISTS lookup_db_metadata_nexus_by_state on omicron.public.db_metadata_nexus ( + state +); + +-- Keep this at the end of file so that the database does not contain a version +-- until it is fully populated. INSERT INTO omicron.public.db_metadata ( singleton, time_created, @@ -6550,7 +6575,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '181.0.0', NULL) + (TRUE, NOW(), NOW(), '182.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/populate-db-metadata-nexus/up01.sql b/schema/crdb/populate-db-metadata-nexus/up01.sql new file mode 100644 index 00000000000..cf3400b2226 --- /dev/null +++ b/schema/crdb/populate-db-metadata-nexus/up01.sql @@ -0,0 +1,6 @@ +CREATE TYPE IF NOT EXISTS omicron.public.db_metadata_nexus_state AS ENUM ( + 'active', + 'not_yet', + 'inactive' +); + diff --git a/schema/crdb/populate-db-metadata-nexus/up02.sql b/schema/crdb/populate-db-metadata-nexus/up02.sql new file mode 100644 index 00000000000..9fac217eec4 --- /dev/null +++ b/schema/crdb/populate-db-metadata-nexus/up02.sql @@ -0,0 +1,11 @@ +-- Nexuses which may be attempting to access the database, and a state +-- which identifies if they should be allowed to do so. +-- +-- This table is used during upgrade implement handoff between old and new +-- Nexus zones. +CREATE TABLE IF NOT EXISTS omicron.public.db_metadata_nexus ( + nexus_id UUID NOT NULL PRIMARY KEY, + last_drained_blueprint_id UUID, + state omicron.public.db_metadata_nexus_state NOT NULL +); + diff --git a/schema/crdb/populate-db-metadata-nexus/up03.sql b/schema/crdb/populate-db-metadata-nexus/up03.sql new file mode 100644 index 00000000000..e8326838b92 --- /dev/null +++ b/schema/crdb/populate-db-metadata-nexus/up03.sql @@ -0,0 +1,3 @@ +CREATE INDEX IF NOT EXISTS lookup_db_metadata_nexus_by_state on omicron.public.db_metadata_nexus ( + state +); diff --git a/schema/crdb/populate-db-metadata-nexus/up04.sql b/schema/crdb/populate-db-metadata-nexus/up04.sql new file mode 100644 index 00000000000..c94c67dfeec --- /dev/null +++ b/schema/crdb/populate-db-metadata-nexus/up04.sql @@ -0,0 +1,15 @@ +-- Populate db_metadata_nexus records for all Nexus zones in the current target blueprint +-- +-- This migration handles backfill for existing deployments that are upgrading +-- to include db_metadata_nexus. It finds all Nexus zones in the current +-- target blueprint and marks them as 'active' in the db_metadata_nexus table. + +SET LOCAL disallow_full_table_scans = off; + +INSERT INTO omicron.public.db_metadata_nexus (nexus_id, last_drained_blueprint_id, state) +SELECT bz.id, NULL, 'active' +FROM omicron.public.bp_target bt +JOIN omicron.public.bp_omicron_zone bz ON bt.blueprint_id = bz.blueprint_id +WHERE bz.zone_type = 'nexus' + AND bt.version = (SELECT MAX(version) FROM omicron.public.bp_target) +ON CONFLICT (nexus_id) DO UPDATE SET state = EXCLUDED.state; From 1ef5e5e413b1b6a4a5102c64775a0f506c675d49 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 21 Aug 2025 14:32:59 -0700 Subject: [PATCH 02/22] Update data migration tests from 181 -> 182 --- nexus/tests/integration_tests/schema.rs | 38 ++++++++++++------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/nexus/tests/integration_tests/schema.rs b/nexus/tests/integration_tests/schema.rs index 9f5c602e025..1733574be70 100644 --- a/nexus/tests/integration_tests/schema.rs +++ b/nexus/tests/integration_tests/schema.rs @@ -2889,13 +2889,13 @@ fn after_171_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { }) } -const NEXUS_ID_181_0: &str = "387433f9-1473-4ca2-b156-9670452985e0"; -const OLD_NEXUS_ID_181_0: &str = "287433f9-1473-4ca2-b156-9670452985e0"; +const NEXUS_ID_182_0: &str = "387433f9-1473-4ca2-b156-9670452985e0"; +const OLD_NEXUS_ID_182_0: &str = "287433f9-1473-4ca2-b156-9670452985e0"; -const BP_ID_181_0: &str = "5a5ff941-3b5a-403b-9fda-db2049f4c736"; -const OLD_BP_ID_181_0: &str = "4a5ff941-3b5a-403b-9fda-db2049f4c736"; +const BP_ID_182_0: &str = "5a5ff941-3b5a-403b-9fda-db2049f4c736"; +const OLD_BP_ID_182_0: &str = "4a5ff941-3b5a-403b-9fda-db2049f4c736"; -fn before_181_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { +fn before_182_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { Box::pin(async move { // Create a blueprint which contains a Nexus - we'll use this for the migration. ctx.client @@ -2904,12 +2904,12 @@ fn before_181_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { "INSERT INTO omicron.public.bp_target (version, blueprint_id, enabled, time_made_target) VALUES - (1, '{BP_ID_181_0}', true, now());", + (1, '{BP_ID_182_0}', true, now());", ), &[], ) .await - .expect("inserted bp_target rows for 181"); + .expect("inserted bp_target rows for 182"); ctx.client .execute( &format!( @@ -2928,7 +2928,7 @@ fn before_181_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { image_source, image_artifact_sha256 ) VALUES ( - '{BP_ID_181_0}', gen_random_uuid(), '{NEXUS_ID_181_0}', + '{BP_ID_182_0}', gen_random_uuid(), '{NEXUS_ID_182_0}', 'nexus', '192.168.1.10', 8080, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -2940,7 +2940,7 @@ fn before_181_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { &[], ) .await - .expect("inserted bp_omicron_zone rows for 181"); + .expect("inserted bp_omicron_zone rows for 182"); // ALSO create an old blueprint, which isn't the latest target. // @@ -2951,12 +2951,12 @@ fn before_181_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { "INSERT INTO omicron.public.bp_target (version, blueprint_id, enabled, time_made_target) VALUES - (0, '{OLD_BP_ID_181_0}', true, now());", + (0, '{OLD_BP_ID_182_0}', true, now());", ), &[], ) .await - .expect("inserted bp_target rows for 181"); + .expect("inserted bp_target rows for 182"); ctx.client .execute( &format!( @@ -2975,8 +2975,8 @@ fn before_181_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { image_source, image_artifact_sha256 ) VALUES ( - '{OLD_BP_ID_181_0}', gen_random_uuid(), - '{OLD_NEXUS_ID_181_0}', 'nexus', '192.168.1.10', 8080, + '{OLD_BP_ID_182_0}', gen_random_uuid(), + '{OLD_NEXUS_ID_182_0}', 'nexus', '192.168.1.10', 8080, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, false, ARRAY[]::INET[], NULL, NULL, NULL, @@ -2987,16 +2987,16 @@ fn before_181_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { &[], ) .await - .expect("inserted bp_omicron_zone rows for 181"); + .expect("inserted bp_omicron_zone rows for 182"); }) } -fn after_181_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { +fn after_182_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { Box::pin(async move { // After the migration, the new row should be created - only for Nexuses // in the latest blueprint. // - // Note that "OLD_NEXUS_ID_181_0" doesn't get a row - it's in an old + // Note that "OLD_NEXUS_ID_182_0" doesn't get a row - it's in an old // blueprint. let rows = ctx .client @@ -3018,7 +3018,7 @@ fn after_181_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { // Create a new row for the Nexuses in the target blueprint assert_eq!( row.values[0].expect("nexus_id").unwrap(), - &AnySqlType::Uuid(NEXUS_ID_181_0.parse().unwrap()) + &AnySqlType::Uuid(NEXUS_ID_182_0.parse().unwrap()) ); assert_eq!(row.values[1].expect("last_drained_blueprint_id"), None); assert_eq!( @@ -3130,8 +3130,8 @@ fn get_migration_checks() -> BTreeMap { DataMigrationFns::new().before(before_171_0_0).after(after_171_0_0), ); map.insert( - Version::new(181, 0, 0), - DataMigrationFns::new().before(before_181_0_0).after(after_181_0_0), + Version::new(182, 0, 0), + DataMigrationFns::new().before(before_182_0_0).after(after_182_0_0), ); map } From f6e90a3d7c47fb166e83918137e40d89b2a9d58c Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 25 Aug 2025 15:59:12 -0700 Subject: [PATCH 03/22] patch schema version, update up04.sql --- nexus/tests/integration_tests/schema.rs | 30 +++++++++---------- .../crdb/populate-db-metadata-nexus/up04.sql | 2 +- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/nexus/tests/integration_tests/schema.rs b/nexus/tests/integration_tests/schema.rs index 1733574be70..592908db01b 100644 --- a/nexus/tests/integration_tests/schema.rs +++ b/nexus/tests/integration_tests/schema.rs @@ -2889,13 +2889,13 @@ fn after_171_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { }) } -const NEXUS_ID_182_0: &str = "387433f9-1473-4ca2-b156-9670452985e0"; -const OLD_NEXUS_ID_182_0: &str = "287433f9-1473-4ca2-b156-9670452985e0"; +const NEXUS_ID_183_0: &str = "387433f9-1473-4ca2-b156-9670452985e0"; +const OLD_NEXUS_ID_183_0: &str = "287433f9-1473-4ca2-b156-9670452985e0"; -const BP_ID_182_0: &str = "5a5ff941-3b5a-403b-9fda-db2049f4c736"; -const OLD_BP_ID_182_0: &str = "4a5ff941-3b5a-403b-9fda-db2049f4c736"; +const BP_ID_183_0: &str = "5a5ff941-3b5a-403b-9fda-db2049f4c736"; +const OLD_BP_ID_183_0: &str = "4a5ff941-3b5a-403b-9fda-db2049f4c736"; -fn before_182_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { +fn before_183_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { Box::pin(async move { // Create a blueprint which contains a Nexus - we'll use this for the migration. ctx.client @@ -2904,7 +2904,7 @@ fn before_182_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { "INSERT INTO omicron.public.bp_target (version, blueprint_id, enabled, time_made_target) VALUES - (1, '{BP_ID_182_0}', true, now());", + (1, '{BP_ID_183_0}', true, now());", ), &[], ) @@ -2928,7 +2928,7 @@ fn before_182_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { image_source, image_artifact_sha256 ) VALUES ( - '{BP_ID_182_0}', gen_random_uuid(), '{NEXUS_ID_182_0}', + '{BP_ID_183_0}', gen_random_uuid(), '{NEXUS_ID_183_0}', 'nexus', '192.168.1.10', 8080, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -2951,7 +2951,7 @@ fn before_182_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { "INSERT INTO omicron.public.bp_target (version, blueprint_id, enabled, time_made_target) VALUES - (0, '{OLD_BP_ID_182_0}', true, now());", + (0, '{OLD_BP_ID_183_0}', true, now());", ), &[], ) @@ -2975,8 +2975,8 @@ fn before_182_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { image_source, image_artifact_sha256 ) VALUES ( - '{OLD_BP_ID_182_0}', gen_random_uuid(), - '{OLD_NEXUS_ID_182_0}', 'nexus', '192.168.1.10', 8080, + '{OLD_BP_ID_183_0}', gen_random_uuid(), + '{OLD_NEXUS_ID_183_0}', 'nexus', '192.168.1.10', 8080, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, false, ARRAY[]::INET[], NULL, NULL, NULL, @@ -2991,12 +2991,12 @@ fn before_182_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { }) } -fn after_182_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { +fn after_183_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { Box::pin(async move { // After the migration, the new row should be created - only for Nexuses // in the latest blueprint. // - // Note that "OLD_NEXUS_ID_182_0" doesn't get a row - it's in an old + // Note that "OLD_NEXUS_ID_183_0" doesn't get a row - it's in an old // blueprint. let rows = ctx .client @@ -3018,7 +3018,7 @@ fn after_182_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { // Create a new row for the Nexuses in the target blueprint assert_eq!( row.values[0].expect("nexus_id").unwrap(), - &AnySqlType::Uuid(NEXUS_ID_182_0.parse().unwrap()) + &AnySqlType::Uuid(NEXUS_ID_183_0.parse().unwrap()) ); assert_eq!(row.values[1].expect("last_drained_blueprint_id"), None); assert_eq!( @@ -3130,8 +3130,8 @@ fn get_migration_checks() -> BTreeMap { DataMigrationFns::new().before(before_171_0_0).after(after_171_0_0), ); map.insert( - Version::new(182, 0, 0), - DataMigrationFns::new().before(before_182_0_0).after(after_182_0_0), + Version::new(183, 0, 0), + DataMigrationFns::new().before(before_183_0_0).after(after_183_0_0), ); map } diff --git a/schema/crdb/populate-db-metadata-nexus/up04.sql b/schema/crdb/populate-db-metadata-nexus/up04.sql index c94c67dfeec..9a4b7b1175c 100644 --- a/schema/crdb/populate-db-metadata-nexus/up04.sql +++ b/schema/crdb/populate-db-metadata-nexus/up04.sql @@ -12,4 +12,4 @@ FROM omicron.public.bp_target bt JOIN omicron.public.bp_omicron_zone bz ON bt.blueprint_id = bz.blueprint_id WHERE bz.zone_type = 'nexus' AND bt.version = (SELECT MAX(version) FROM omicron.public.bp_target) -ON CONFLICT (nexus_id) DO UPDATE SET state = EXCLUDED.state; +ON CONFLICT (nexus_id) DO NOTHING; From 914248b771ce2c1cabf5c63aa9c1ba7cef62ac5c Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 25 Aug 2025 15:59:51 -0700 Subject: [PATCH 04/22] s/inactive/quiesced --- nexus/db-model/src/db_metadata.rs | 2 +- .../src/db/datastore/db_metadata.rs | 54 +++++++++---------- schema/crdb/dbinit.sql | 2 +- .../crdb/populate-db-metadata-nexus/up01.sql | 2 +- 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/nexus/db-model/src/db_metadata.rs b/nexus/db-model/src/db_metadata.rs index 8f15ce02ae7..080da4d423c 100644 --- a/nexus/db-model/src/db_metadata.rs +++ b/nexus/db-model/src/db_metadata.rs @@ -49,7 +49,7 @@ impl_enum_type!( // Enum values Active => b"active" NotYet => b"not_yet" - Inactive => b"inactive" + Quiesced => b"quiesced" ); #[derive( diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index d94c1ac161e..b6d077eebf1 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -34,7 +34,7 @@ pub enum HandoffError { #[error( "Cannot perform handoff: \ {active_count} Nexus instance(s) are still active. \ - All instances must be inactive or not_yet before handoff can proceed." + All instances must be quiesced or not_yet before handoff can proceed." )] ActiveNexusInstancesExist { active_count: u32 }, @@ -331,8 +331,8 @@ impl DataStore { info!(&self.log, "Nexus does not yet have access to the database"; "nexus_id" => ?nexus_id); NexusAccess::DoesNotHaveAccessYet } - DbMetadataNexusState::Inactive => { - let msg = "Nexus locked out of database access (inactive)"; + DbMetadataNexusState::Quiesced => { + let msg = "Nexus locked out of database access (quiesced)"; error!(&self.log, "{msg}"; "nexus_id" => ?nexus_id); NexusAccess::LockedOut } @@ -817,14 +817,14 @@ impl DataStore { ) -> Result<(), diesel::result::Error> { use nexus_db_schema::schema::db_metadata_nexus::dsl; - // Before proceeding, all records must be in the "inactive" or "not_yet" states. + // Before proceeding, all records must be in the "quiesced" or "not_yet" states. // // We explicitly look for any records violating this, rather than explicitly looking for // "active" records, as to protect ourselves from future states being added over time. let active_count: nexus_db_model::SqlU32 = dsl::db_metadata_nexus .filter( dsl::state - .ne(DbMetadataNexusState::Inactive) + .ne(DbMetadataNexusState::Quiesced) .and(dsl::state.ne(DbMetadataNexusState::NotYet)), ) .count() @@ -874,7 +874,7 @@ impl DataStore { /// Attempts to perform a handoff to activate this Nexus for database access. /// /// This function checks that: - /// 1. ALL records in db_metadata_nexus are in "not_yet" or "inactive" states + /// 1. ALL records in db_metadata_nexus are in "not_yet" or "quiesced" states /// 2. The specified nexus_id has a record which is "not_yet" /// /// If both conditions are met, it updates ALL "not_yet" records to "active". @@ -1389,7 +1389,7 @@ mod test { let nexus2_id = OmicronZoneUuid::new_v4(); let nexus3_id = OmicronZoneUuid::new_v4(); - // Insert records: one active, one not_yet, one inactive + // Insert records: one active, one not_yet, one quiesced datastore .database_nexus_access_insert( nexus1_id, @@ -1407,10 +1407,10 @@ mod test { datastore .database_nexus_access_insert( nexus3_id, - DbMetadataNexusState::Inactive, + DbMetadataNexusState::Quiesced, ) .await - .expect("Failed to insert inactive nexus"); + .expect("Failed to insert quiesced nexus"); // Attempt handoff with nexus2 - should fail because nexus1 is active let result = datastore.attempt_handoff(nexus2_id).await; @@ -1449,7 +1449,7 @@ mod test { datastore .database_nexus_access_insert( nexus2_id, - DbMetadataNexusState::Inactive, + DbMetadataNexusState::Quiesced, ) .await .expect("Failed to insert nexus2"); @@ -1486,7 +1486,7 @@ mod test { // Set up test data: create nexus records where our target is in wrong state let nexus1_id = OmicronZoneUuid::new_v4(); let nexus2_id = OmicronZoneUuid::new_v4(); - let inactive_nexus_id = OmicronZoneUuid::new_v4(); + let quiesced_nexus_id = OmicronZoneUuid::new_v4(); datastore .database_nexus_access_insert( @@ -1504,18 +1504,18 @@ mod test { .expect("Failed to insert nexus2"); datastore .database_nexus_access_insert( - inactive_nexus_id, - DbMetadataNexusState::Inactive, + quiesced_nexus_id, + DbMetadataNexusState::Quiesced, ) .await - .expect("Failed to insert inactive nexus"); + .expect("Failed to insert quiesced nexus"); - // Attempt handoff with inactive nexus - should fail - let result = datastore.attempt_handoff(inactive_nexus_id).await; + // Attempt handoff with quiesced nexus - should fail + let result = datastore.attempt_handoff(quiesced_nexus_id).await; assert!(result.is_err()); let error_msg = format!("{}", result.unwrap_err()); assert!( - error_msg.contains("is in state Inactive"), + error_msg.contains("is in state Quiesced"), "Expected error about wrong state, got: {}", error_msg ); @@ -1525,9 +1525,9 @@ mod test { error_msg ); assert!( - error_msg.contains(&inactive_nexus_id.to_string()), + error_msg.contains(&quiesced_nexus_id.to_string()), "Expected error to contain nexus ID {}, got: {}", - inactive_nexus_id, + quiesced_nexus_id, error_msg ); @@ -1542,7 +1542,7 @@ mod test { let datastore = DataStore::new_unchecked(logctx.log.clone(), db.pool().clone()); - // Set up test data: create multiple nexus records in not_yet and inactive states + // Set up test data: create multiple nexus records in not_yet and quiesced states let nexus1_id = OmicronZoneUuid::new_v4(); let nexus2_id = OmicronZoneUuid::new_v4(); let nexus3_id = OmicronZoneUuid::new_v4(); @@ -1564,12 +1564,12 @@ mod test { datastore .database_nexus_access_insert( nexus3_id, - DbMetadataNexusState::Inactive, + DbMetadataNexusState::Quiesced, ) .await .expect("Failed to insert nexus3"); - // Verify initial state: all not_yet or inactive + // Verify initial state: all not_yet or quiesced let nexus1_before = datastore .database_nexus_access(nexus1_id) .await @@ -1588,7 +1588,7 @@ mod test { assert_eq!(nexus1_before.state(), DbMetadataNexusState::NotYet); assert_eq!(nexus2_before.state(), DbMetadataNexusState::NotYet); - assert_eq!(nexus3_before.state(), DbMetadataNexusState::Inactive); + assert_eq!(nexus3_before.state(), DbMetadataNexusState::Quiesced); // Attempt handoff with nexus2 - should succeed let result = datastore.attempt_handoff(nexus2_id).await; @@ -1597,7 +1597,7 @@ mod test { } assert!(result.is_ok()); - // Verify final state: all not_yet records should now be active, inactive should remain inactive + // Verify final state: all not_yet records should now be active, quiesced should remain quiesced let nexus1_after = datastore .database_nexus_access(nexus1_id) .await @@ -1616,7 +1616,7 @@ mod test { assert_eq!(nexus1_after.state(), DbMetadataNexusState::Active); assert_eq!(nexus2_after.state(), DbMetadataNexusState::Active); - assert_eq!(nexus3_after.state(), DbMetadataNexusState::Inactive); // Should remain unchanged + assert_eq!(nexus3_after.state(), DbMetadataNexusState::Quiesced); // Should remain unchanged db.terminate().await; logctx.cleanup_successful(); @@ -1735,11 +1735,11 @@ mod test { let nexus_id = OmicronZoneUuid::new_v4(); - // Insert our nexus as inactive (locked out) + // Insert our nexus as quiesced (locked out) datastore .database_nexus_access_insert( nexus_id, - DbMetadataNexusState::Inactive, + DbMetadataNexusState::Quiesced, ) .await .expect("Failed to insert nexus record"); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 49d6848dac6..664d927dffb 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -6550,7 +6550,7 @@ CREATE TYPE IF NOT EXISTS omicron.public.db_metadata_nexus_state AS ENUM ( 'not_yet', -- This Nexus has committed to no longer accessing this database - 'inactive' + 'quiesced' ); -- Nexuses which may be attempting to access the database, and a state diff --git a/schema/crdb/populate-db-metadata-nexus/up01.sql b/schema/crdb/populate-db-metadata-nexus/up01.sql index cf3400b2226..25c42761e04 100644 --- a/schema/crdb/populate-db-metadata-nexus/up01.sql +++ b/schema/crdb/populate-db-metadata-nexus/up01.sql @@ -1,6 +1,6 @@ CREATE TYPE IF NOT EXISTS omicron.public.db_metadata_nexus_state AS ENUM ( 'active', 'not_yet', - 'inactive' + 'quiesced' ); From f46e22a59f354dad32ce99edbe3ebba4feaebfb6 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 25 Aug 2025 16:14:00 -0700 Subject: [PATCH 05/22] unique index --- schema/crdb/dbinit.sql | 5 +++-- schema/crdb/populate-db-metadata-nexus/up03.sql | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 664d927dffb..754ca5cda1a 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -6565,8 +6565,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.db_metadata_nexus ( state omicron.public.db_metadata_nexus_state NOT NULL ); -CREATE INDEX IF NOT EXISTS lookup_db_metadata_nexus_by_state on omicron.public.db_metadata_nexus ( - state +CREATE UNIQUE INDEX IF NOT EXISTS lookup_db_metadata_nexus_by_state on omicron.public.db_metadata_nexus ( + state, + nexus_id ); -- Keep this at the end of file so that the database does not contain a version diff --git a/schema/crdb/populate-db-metadata-nexus/up03.sql b/schema/crdb/populate-db-metadata-nexus/up03.sql index e8326838b92..42fbf004137 100644 --- a/schema/crdb/populate-db-metadata-nexus/up03.sql +++ b/schema/crdb/populate-db-metadata-nexus/up03.sql @@ -1,3 +1,4 @@ -CREATE INDEX IF NOT EXISTS lookup_db_metadata_nexus_by_state on omicron.public.db_metadata_nexus ( - state +CREATE UNIQUE INDEX IF NOT EXISTS lookup_db_metadata_nexus_by_state on omicron.public.db_metadata_nexus ( + state, + nexus_id ); From db416d0bf16683d42ef4ef9262042ac0f8776335 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 25 Aug 2025 16:40:16 -0700 Subject: [PATCH 06/22] Only update non-expunged Nexuses, update data migration test too --- nexus/tests/integration_tests/schema.rs | 23 +++++++++++++------ .../crdb/populate-db-metadata-nexus/up04.sql | 1 + 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/nexus/tests/integration_tests/schema.rs b/nexus/tests/integration_tests/schema.rs index 592908db01b..12ae8de6999 100644 --- a/nexus/tests/integration_tests/schema.rs +++ b/nexus/tests/integration_tests/schema.rs @@ -2890,7 +2890,8 @@ fn after_171_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { } const NEXUS_ID_183_0: &str = "387433f9-1473-4ca2-b156-9670452985e0"; -const OLD_NEXUS_ID_183_0: &str = "287433f9-1473-4ca2-b156-9670452985e0"; +const EXPUNGED_NEXUS_ID_183_0: &str = "287433f9-1473-4ca2-b156-9670452985e0"; +const OLD_NEXUS_ID_183_0: &str = "187433f9-1473-4ca2-b156-9670452985e0"; const BP_ID_183_0: &str = "5a5ff941-3b5a-403b-9fda-db2049f4c736"; const OLD_BP_ID_183_0: &str = "4a5ff941-3b5a-403b-9fda-db2049f4c736"; @@ -2898,6 +2899,8 @@ const OLD_BP_ID_183_0: &str = "4a5ff941-3b5a-403b-9fda-db2049f4c736"; fn before_183_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { Box::pin(async move { // Create a blueprint which contains a Nexus - we'll use this for the migration. + // + // It also contains an exupnged Nexus, which should be ignored. ctx.client .execute( &format!( @@ -2929,12 +2932,18 @@ fn before_183_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { ) VALUES ( '{BP_ID_183_0}', gen_random_uuid(), '{NEXUS_ID_183_0}', - 'nexus', '192.168.1.10', 8080, - NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, - false, ARRAY[]::INET[], NULL, NULL, NULL, - NULL, gen_random_uuid(), 'in_service', - NULL, false, 'install_dataset', NULL + 'nexus', '192.168.1.10', 8080, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, false, ARRAY[]::INET[], + NULL, NULL, NULL, NULL, gen_random_uuid(), + 'in_service', NULL, false, 'install_dataset', NULL + ), + ( + '{BP_ID_183_0}', gen_random_uuid(), + '{EXPUNGED_NEXUS_ID_183_0}', 'nexus', '192.168.1.11', + 8080, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, false, ARRAY[]::INET[], NULL, NULL, NULL, NULL, + gen_random_uuid(), 'expunged', 1, false, + 'install_dataset', NULL );" ), &[], diff --git a/schema/crdb/populate-db-metadata-nexus/up04.sql b/schema/crdb/populate-db-metadata-nexus/up04.sql index 9a4b7b1175c..36b876b9cdd 100644 --- a/schema/crdb/populate-db-metadata-nexus/up04.sql +++ b/schema/crdb/populate-db-metadata-nexus/up04.sql @@ -11,5 +11,6 @@ SELECT bz.id, NULL, 'active' FROM omicron.public.bp_target bt JOIN omicron.public.bp_omicron_zone bz ON bt.blueprint_id = bz.blueprint_id WHERE bz.zone_type = 'nexus' + AND bz.disposition != 'expunged' AND bt.version = (SELECT MAX(version) FROM omicron.public.bp_target) ON CONFLICT (nexus_id) DO NOTHING; From b25ce91b7fa173ceb68acf3ab224290ea3f40c38 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 25 Aug 2025 16:49:26 -0700 Subject: [PATCH 07/22] new_with_timeout tweaks --- nexus/db-queries/src/db/datastore/mod.rs | 85 ++++++++++++++++-------- 1 file changed, 58 insertions(+), 27 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 99d8694f2f4..fcf829b4d09 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -41,6 +41,7 @@ use omicron_common::backoff::{ }; use omicron_uuid_kinds::{GenericUuid, OmicronZoneUuid, SledUuid}; use slog::Logger; +use slog_error_chain::InlineErrorChain; use std::net::Ipv6Addr; use std::num::NonZeroU32; use std::sync::Arc; @@ -251,18 +252,29 @@ impl DataStore { || async { if let Some(try_for) = try_for { if std::time::Instant::now() > start + try_for { - return Err(BackoffError::permanent("Timeout waiting for database")); + return Err(BackoffError::permanent( + "Timeout waiting for DataStore::new_with_timeout", + )); } } - let checked_action = datastore.check_schema_and_access( - nexus_id, - EXPECTED_VERSION, - ConsumerPolicy::Update, - ).await.map_err(|err| { - warn!(log, "Cannot check schema version / Nexus access"; "error" => #%err); - BackoffError::transient("") - })?; + let checked_action = datastore + .check_schema_and_access( + nexus_id, + EXPECTED_VERSION, + ConsumerPolicy::Update, + ) + .await + .map_err(|err| { + warn!( + log, + "Cannot check schema version / Nexus access"; + "error" => InlineErrorChain::new(err.as_ref()), + ); + BackoffError::transient( + "Cannot check schema version / Nexus access", + ) + })?; match checked_action.action() { SchemaAction::Ready => { @@ -273,33 +285,52 @@ impl DataStore { info!(log, "Datastore is awaiting handoff"); let Some(nexus_id) = nexus_id else { - return Err(BackoffError::permanent("Nexus ID needed for handoff")); + return Err(BackoffError::permanent( + "Nexus ID needed for handoff", + )); }; - datastore.attempt_handoff( - nexus_id - ).await.map_err(|err| { - warn!(log, "Could not perform handoff to new nexus"; err); - BackoffError::transient("") - })?; - - todo!(); + datastore.attempt_handoff(nexus_id).await.map_err( + |err| { + warn!( + log, + "Could not perform handoff to new nexus"; + err + ); + BackoffError::transient( + "Could not perform handoff to new nexus", + ) + }, + )?; + + todo!( + "Post-handoff, Nexus should self-update. \ + This is not yet implemented" + ); } SchemaAction::Update => { info!(log, "Datastore should be updated before usage"); - datastore.update_schema( - checked_action, - config - ).await.map_err(|err| { - warn!(log, "Failed to update schema version"; "error" => #%err); - BackoffError::transient("") + datastore + .update_schema(checked_action, config) + .await + .map_err(|err| { + warn!( + log, + "Failed to update schema version"; + "error" => #%err + ); + BackoffError::transient( + "Failed to update schema version", + ) })?; return Ok(()); - }, + } SchemaAction::Refuse => { error!(log, "Datastore should not be used"); - return Err(BackoffError::permanent("Datastore should not be used")); - }, + return Err(BackoffError::permanent( + "Datastore should not be used", + )); + } } }, |_, _| {}, From edeee09ee425f6f3a80d8bfbbc96fb27693c8e16 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 25 Aug 2025 17:26:31 -0700 Subject: [PATCH 08/22] IdentityCheckPolicy --- .../src/db/datastore/db_metadata.rs | 92 +++++++++++-------- nexus/db-queries/src/db/datastore/mod.rs | 27 +++++- nexus/db-queries/src/db/pub_test_utils/mod.rs | 19 +++- nexus/src/app/mod.rs | 5 +- nexus/src/bin/schema-updater.rs | 15 ++- nexus/src/populate.rs | 19 +++- 6 files changed, 122 insertions(+), 55 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index b6d077eebf1..3b7fa3f41d6 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -4,7 +4,7 @@ //! [`DataStore`] methods on Database Metadata. -use super::{DataStore, DbConnection}; +use super::{DataStore, DbConnection, IdentityCheckPolicy}; use anyhow::{Context, bail, ensure}; use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; use chrono::Utc; @@ -388,31 +388,34 @@ impl DataStore { /// on a schema mismatch pub async fn check_schema_and_access( &self, - nexus_id: Option, + identity_check: IdentityCheckPolicy, desired_version: Version, policy: ConsumerPolicy, ) -> Result { let schema_status = self.check_schema(desired_version.clone()).await?; - let nexus_access = if let Some(nexus_id) = nexus_id { - match schema_status { - // If we don't think the "db_metadata_nexus" tables exist in the - // schema yet, treat them as implicitly having access. - // - // TODO: This may be removed, once we're confident deployed systems - // have upgraded past DB_METADATA_NEXUS_SCHEMA_VERSION. - SchemaStatus::OlderThanDesiredSkipAccessCheck => { - NexusAccess::HasImplicitAccess + let nexus_access = match identity_check { + IdentityCheckPolicy::CheckAndTakeover { nexus_id } => { + match schema_status { + // If we don't think the "db_metadata_nexus" tables exist in the + // schema yet, treat them as implicitly having access. + // + // TODO: This may be removed, once we're confident deployed systems + // have upgraded past DB_METADATA_NEXUS_SCHEMA_VERSION. + SchemaStatus::OlderThanDesiredSkipAccessCheck => { + NexusAccess::HasImplicitAccess + } + _ => self.check_nexus_access(nexus_id).await?, } - _ => self.check_nexus_access(nexus_id).await?, } - } else { - // If a "nexus_id" was not supplied, skip the check, and treat it - // as having access. - // - // This is necessary for tools which access the schema without a - // running Nexus, such as the schema-updater binary. - NexusAccess::HasImplicitAccess + IdentityCheckPolicy::DontCare => { + // If a "nexus_id" was not supplied, skip the check, and treat it + // as having access. + // + // This is necessary for tools which access the schema without a + // running Nexus, such as the schema-updater binary. + NexusAccess::HasImplicitAccess + } }; Ok(ValidatedSchemaAction { @@ -1062,6 +1065,7 @@ impl DataStore { #[cfg(test)] mod test { use super::*; + use crate::db::datastore::IdentityCheckPolicy; use crate::db::pub_test_utils::TestDatabase; use camino::Utf8Path; use camino_tempfile::Utf8TempDir; @@ -1078,7 +1082,7 @@ mod test { let checked_action = datastore .check_schema_and_access( - None, + IdentityCheckPolicy::DontCare, SCHEMA_VERSION, ConsumerPolicy::FailOnMismatch, ) @@ -1202,9 +1206,13 @@ mod test { let log = log.clone(); let pool = pool.clone(); tokio::task::spawn(async move { - let datastore = - DataStore::new(&log, pool, Some(&all_versions), None) - .await?; + let datastore = DataStore::new( + &log, + pool, + Some(&all_versions), + IdentityCheckPolicy::DontCare, + ) + .await?; // This is the crux of this test: confirm that, as each // migration completes, it's not possible to see any artifacts @@ -1335,7 +1343,7 @@ mod test { let datastore = DataStore::new_unchecked(log.clone(), pool.clone()); let checked_action = datastore .check_schema_and_access( - None, + IdentityCheckPolicy::DontCare, SCHEMA_VERSION, ConsumerPolicy::Update, ) @@ -1644,7 +1652,7 @@ mod test { // With an empty table, even explicit nexus ID should get access let action = datastore .check_schema_and_access( - Some(nexus_id), + IdentityCheckPolicy::CheckAndTakeover { nexus_id }, SCHEMA_VERSION, ConsumerPolicy::FailOnMismatch, ) @@ -1663,7 +1671,7 @@ mod test { let action = datastore .check_schema_and_access( - Some(nexus_id), + IdentityCheckPolicy::CheckAndTakeover { nexus_id }, SCHEMA_VERSION, ConsumerPolicy::FailOnMismatch, ) @@ -1698,7 +1706,7 @@ mod test { // Using 'None' as a nexus ID should get access (schema updater case) let action = datastore .check_schema_and_access( - None, + IdentityCheckPolicy::DontCare, SCHEMA_VERSION, ConsumerPolicy::FailOnMismatch, ) @@ -1710,7 +1718,7 @@ mod test { let nexus_id = OmicronZoneUuid::new_v4(); let action = datastore .check_schema_and_access( - Some(nexus_id), + IdentityCheckPolicy::CheckAndTakeover { nexus_id }, SCHEMA_VERSION, ConsumerPolicy::FailOnMismatch, ) @@ -1747,7 +1755,11 @@ mod test { // Should refuse access regardless of policy for policy in [ConsumerPolicy::FailOnMismatch, ConsumerPolicy::Update] { let action = datastore - .check_schema_and_access(Some(nexus_id), SCHEMA_VERSION, policy) + .check_schema_and_access( + IdentityCheckPolicy::CheckAndTakeover { nexus_id }, + SCHEMA_VERSION, + policy, + ) .await .expect("Failed to check schema and access"); assert_eq!(action.action(), &SchemaAction::Refuse); @@ -1791,7 +1803,7 @@ mod test { // Explicit Nexus ID: Rejected let action = datastore .check_schema_and_access( - Some(nexus_id), + IdentityCheckPolicy::CheckAndTakeover { nexus_id }, older_version.clone(), policy, ) @@ -1801,7 +1813,11 @@ mod test { // Implicit Access: Rejected let action = datastore - .check_schema_and_access(None, older_version.clone(), policy) + .check_schema_and_access( + IdentityCheckPolicy::DontCare, + older_version.clone(), + policy, + ) .await .expect("Failed to check schema and access"); assert_eq!(action.action(), &SchemaAction::Refuse); @@ -1843,7 +1859,7 @@ mod test { // With Update policy, should wait for handoff when schema is up-to-date let action = datastore .check_schema_and_access( - Some(nexus_id), + IdentityCheckPolicy::CheckAndTakeover { nexus_id }, version.clone(), ConsumerPolicy::Update, ) @@ -1854,7 +1870,7 @@ mod test { // With FailOnMismatch policy, should refuse let action = datastore .check_schema_and_access( - Some(nexus_id), + IdentityCheckPolicy::CheckAndTakeover { nexus_id }, version.clone(), ConsumerPolicy::FailOnMismatch, ) @@ -1890,7 +1906,11 @@ mod test { // With current schema version, should be ready for normal use for policy in [ConsumerPolicy::FailOnMismatch, ConsumerPolicy::Update] { let action = datastore - .check_schema_and_access(Some(nexus_id), SCHEMA_VERSION, policy) + .check_schema_and_access( + IdentityCheckPolicy::CheckAndTakeover { nexus_id }, + SCHEMA_VERSION, + policy, + ) .await .expect("Failed to check schema and access"); @@ -1928,7 +1948,7 @@ mod test { // With newer desired version and Update policy, should request update let action = datastore .check_schema_and_access( - Some(nexus_id), + IdentityCheckPolicy::CheckAndTakeover { nexus_id }, newer_version.clone(), ConsumerPolicy::Update, ) @@ -1941,7 +1961,7 @@ mod test { // With FailOnMismatch policy, should refuse let action = datastore .check_schema_and_access( - Some(nexus_id), + IdentityCheckPolicy::CheckAndTakeover { nexus_id }, newer_version, ConsumerPolicy::FailOnMismatch, ) diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index fcf829b4d09..3218489ed79 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -190,6 +190,21 @@ impl RunnableQuery for T where { } +/// Specifies whether the consumer wants to check whether they're allowed to +/// access the database based on the `db_metadata_nexus` table. +#[derive(Debug, Clone, Copy)] +pub enum IdentityCheckPolicy { + /// The consumer wants full access to the database regardless of the current + /// upgrade / handoff state. This would be used by almost all tools and + /// tests. + DontCare, + + /// The consumer only wants to access the database if it's in the current + /// set of Nexus instances that's supposed to be able to access it. If + /// possible and legal, take over access from the existing set. + CheckAndTakeover { nexus_id: OmicronZoneUuid }, +} + pub struct DataStore { log: Logger, pool: Arc, @@ -225,9 +240,9 @@ impl DataStore { log: &Logger, pool: Arc, config: Option<&AllSchemaVersions>, - nexus_id: Option, + identity_check: IdentityCheckPolicy, ) -> Result { - Self::new_with_timeout(log, pool, config, None, nexus_id).await + Self::new_with_timeout(log, pool, config, None, identity_check).await } pub async fn new_with_timeout( @@ -235,7 +250,7 @@ impl DataStore { pool: Arc, config: Option<&AllSchemaVersions>, try_for: Option, - nexus_id: Option, + identity_check: IdentityCheckPolicy, ) -> Result { use db_metadata::ConsumerPolicy; use db_metadata::SchemaAction; @@ -260,7 +275,7 @@ impl DataStore { let checked_action = datastore .check_schema_and_access( - nexus_id, + identity_check, EXPECTED_VERSION, ConsumerPolicy::Update, ) @@ -284,7 +299,9 @@ impl DataStore { SchemaAction::WaitForHandoff => { info!(log, "Datastore is awaiting handoff"); - let Some(nexus_id) = nexus_id else { + let IdentityCheckPolicy::CheckAndTakeover { nexus_id } = + identity_check + else { return Err(BackoffError::permanent( "Nexus ID needed for handoff", )); diff --git a/nexus/db-queries/src/db/pub_test_utils/mod.rs b/nexus/db-queries/src/db/pub_test_utils/mod.rs index cbfa5588b33..420f96ee6ab 100644 --- a/nexus/db-queries/src/db/pub_test_utils/mod.rs +++ b/nexus/db-queries/src/db/pub_test_utils/mod.rs @@ -12,6 +12,7 @@ use crate::authz; use crate::context::OpContext; use crate::db; use crate::db::DataStore; +use crate::db::datastore::IdentityCheckPolicy; use omicron_test_utils::dev::db::CockroachInstance; use slog::Logger; use std::sync::Arc; @@ -114,9 +115,14 @@ impl TestDatabaseBuilder { Interface::Datastore => { let pool = new_pool(log, &db); let datastore = Arc::new( - DataStore::new(&log, pool, None, None) - .await - .unwrap(), + DataStore::new( + &log, + pool, + None, + IdentityCheckPolicy::DontCare, + ) + .await + .unwrap(), ); TestDatabase { db, @@ -302,8 +308,11 @@ async fn datastore_test( let cfg = db::Config { url: db.pg_config().clone() }; let pool = Arc::new(db::Pool::new_single_host(&log, &cfg)); - let datastore = - Arc::new(DataStore::new(&log, pool, None, None).await.unwrap()); + let datastore = Arc::new( + DataStore::new(&log, pool, None, IdentityCheckPolicy::DontCare) + .await + .unwrap(), + ); // Create an OpContext with the credentials of "db-init" just for the // purpose of loading the built-in users, roles, and assignments. diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index f8c81bc5df9..4a037cb0968 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -24,6 +24,7 @@ use nexus_db_queries::authn; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db; +use nexus_db_queries::db::datastore::IdentityCheckPolicy; use nexus_mgs_updates::ArtifactCache; use nexus_mgs_updates::MgsUpdateDriver; use nexus_types::deployment::PendingMgsUpdates; @@ -310,14 +311,14 @@ impl Nexus { .map(|s| AllSchemaVersions::load(&s.schema_dir)) .transpose() .map_err(|error| format!("{error:#}"))?; - let nexus_id = Some(config.deployment.id); + let nexus_id = config.deployment.id; let db_datastore = Arc::new( db::DataStore::new_with_timeout( &log, Arc::clone(&pool), all_versions.as_ref(), config.pkg.tunables.load_timeout, - nexus_id, + IdentityCheckPolicy::CheckAndTakeover { nexus_id }, ) .await?, ); diff --git a/nexus/src/bin/schema-updater.rs b/nexus/src/bin/schema-updater.rs index bc82dccfff1..9f4d177d54c 100644 --- a/nexus/src/bin/schema-updater.rs +++ b/nexus/src/bin/schema-updater.rs @@ -14,6 +14,9 @@ use nexus_db_model::AllSchemaVersions; use nexus_db_model::SCHEMA_VERSION; use nexus_db_queries::db; use nexus_db_queries::db::DataStore; +use nexus_db_queries::db::datastore::ConsumerPolicy; +use nexus_db_queries::db::datastore::IdentityCheckPolicy; +use nexus_db_queries::db::datastore::SchemaAction; use semver::Version; use slog::Drain; use slog::Level; @@ -110,24 +113,26 @@ async fn main_impl() -> anyhow::Result<()> { println!("Upgrading to {version}"); let checked_action = datastore .check_schema_and_access( - None, + IdentityCheckPolicy::DontCare, version.clone(), - db::datastore::ConsumerPolicy::Update, + ConsumerPolicy::Update, ) .await?; match checked_action.action() { - db::datastore::SchemaAction::Ready => { + SchemaAction::Ready => { println!("Already at version {version}") } - db::datastore::SchemaAction::Update => { + SchemaAction::Update => { datastore .update_schema(checked_action, Some(&all_versions)) .await .map_err(|e| anyhow!(e))?; println!("Update to {version} complete"); } - _ => println!("Cannot update to version {version}"), + SchemaAction::WaitForHandoff | SchemaAction::Refuse => { + println!("Cannot update to version {version}") + } } } } diff --git a/nexus/src/populate.rs b/nexus/src/populate.rs index d19439fb9cc..93632d77c10 100644 --- a/nexus/src/populate.rs +++ b/nexus/src/populate.rs @@ -339,6 +339,7 @@ mod test { use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db; + use nexus_db_queries::db::datastore::IdentityCheckPolicy; use nexus_db_queries::db::pub_test_utils::TestDatabase; use omicron_common::api::external::Error; use omicron_test_utils::dev; @@ -364,7 +365,14 @@ mod test { let cfg = db::Config { url: db.crdb().pg_config().clone() }; let pool = Arc::new(db::Pool::new_single_host(&logctx.log, &cfg)); let datastore = Arc::new( - db::DataStore::new(&logctx.log, pool, None, None).await.unwrap(), + db::DataStore::new( + &logctx.log, + pool, + None, + IdentityCheckPolicy::DontCare, + ) + .await + .unwrap(), ); let opctx = OpContext::for_background( logctx.log.clone(), @@ -415,7 +423,14 @@ mod test { // We need to create the datastore before tearing down the database, as // it verifies the schema version of the DB while booting. let datastore = Arc::new( - db::DataStore::new(&logctx.log, pool, None, None).await.unwrap(), + db::DataStore::new( + &logctx.log, + pool, + None, + IdentityCheckPolicy::DontCare, + ) + .await + .unwrap(), ); let opctx = OpContext::for_background( logctx.log.clone(), From 25733841c4aef063e630b2c00453a9fe74e9802e Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 25 Aug 2025 20:02:30 -0700 Subject: [PATCH 09/22] 503 --- nexus/db-queries/src/db/datastore/db_metadata.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index 3b7fa3f41d6..924b9a8776d 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -58,7 +58,18 @@ pub enum HandoffError { impl From for Error { fn from(err: HandoffError) -> Self { - Error::internal_error(&err.to_string()) + use HandoffError::*; + match err { + // These conditions are all errors that may occur transiently, with + // handoff from old -> new Nexus, or with multiple Nexuses + // concurrently attempting to perform the handoff operation. + // + // As a result, each returns a "503" error indicating that a retry + // should be attempted. + ActiveNexusInstancesExist { .. } + | NexusNotRegistered { .. } + | NexusInWrongState { .. } => Error::unavail(&err.to_string()), + } } } From 2f0c43fa26f3f8e2b17b5809abe306553a30eee4 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 25 Aug 2025 20:09:02 -0700 Subject: [PATCH 10/22] Better handling of SchemaAction::Handoff --- .../src/db/datastore/db_metadata.rs | 10 +- nexus/db-queries/src/db/datastore/mod.rs | 140 +++++++++--------- nexus/src/bin/schema-updater.rs | 2 +- 3 files changed, 79 insertions(+), 73 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index 924b9a8776d..02fc3f926b3 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -151,7 +151,7 @@ fn skippable_version( return false; } -/// Describes the state of the schema with respect this Nexus +/// Describes the state of the database access with respect this Nexus #[derive(Debug, Copy, Clone, PartialEq)] pub enum NexusAccess { /// Nexus does not yet have access to the database. @@ -214,7 +214,7 @@ pub enum SchemaAction { /// Not ready for usage yet /// /// The database may be ready for usage once handoff has completed. - WaitForHandoff, + NeedsHandoff, /// Start a schema update Update, @@ -273,7 +273,7 @@ impl SchemaAction { DoesNotHaveAccessYet, UpToDate | OlderThanDesired | OlderThanDesiredSkipAccessCheck, Update, - ) => Self::WaitForHandoff, + ) => Self::NeedsHandoff, // This is the most "normal" case: Nexus should have access to the // database, and the schema matches what it wants. @@ -823,7 +823,7 @@ impl DataStore { // Implementation function for attempt_handoff that runs within a transaction // - // This function must be executed from a tranaction context to be safe. + // This function must be executed from a transaction context to be safe. async fn attempt_handoff_impl( conn: async_bb8_diesel::Connection, nexus_id: OmicronZoneUuid, @@ -1876,7 +1876,7 @@ mod test { ) .await .expect("Failed to check schema and access"); - assert_eq!(action.action(), &SchemaAction::WaitForHandoff); + assert_eq!(action.action(), &SchemaAction::NeedsHandoff); // With FailOnMismatch policy, should refuse let action = datastore diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 3218489ed79..538c91e2cb0 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -273,80 +273,86 @@ impl DataStore { } } - let checked_action = datastore - .check_schema_and_access( - identity_check, - EXPECTED_VERSION, - ConsumerPolicy::Update, - ) - .await - .map_err(|err| { - warn!( - log, - "Cannot check schema version / Nexus access"; - "error" => InlineErrorChain::new(err.as_ref()), - ); - BackoffError::transient( - "Cannot check schema version / Nexus access", + loop { + let checked_action = datastore + .check_schema_and_access( + identity_check, + EXPECTED_VERSION, + ConsumerPolicy::Update, ) - })?; - - match checked_action.action() { - SchemaAction::Ready => { - info!(log, "Datastore is ready for usage"); - return Ok(()); - } - SchemaAction::WaitForHandoff => { - info!(log, "Datastore is awaiting handoff"); - - let IdentityCheckPolicy::CheckAndTakeover { nexus_id } = - identity_check - else { - return Err(BackoffError::permanent( - "Nexus ID needed for handoff", - )); - }; - - datastore.attempt_handoff(nexus_id).await.map_err( - |err| { - warn!( - log, - "Could not perform handoff to new nexus"; - err - ); - BackoffError::transient( - "Could not perform handoff to new nexus", - ) - }, - )?; - - todo!( - "Post-handoff, Nexus should self-update. \ - This is not yet implemented" - ); - } - SchemaAction::Update => { - info!(log, "Datastore should be updated before usage"); - datastore - .update_schema(checked_action, config) - .await - .map_err(|err| { + .await + .map_err(|err| { warn!( log, - "Failed to update schema version"; - "error" => #%err + "Cannot check schema version / Nexus access"; + "error" => InlineErrorChain::new(err.as_ref()), ); BackoffError::transient( - "Failed to update schema version", + "Cannot check schema version / Nexus access", ) })?; - return Ok(()); - } - SchemaAction::Refuse => { - error!(log, "Datastore should not be used"); - return Err(BackoffError::permanent( - "Datastore should not be used", - )); + + match checked_action.action() { + SchemaAction::Ready => { + info!(log, "Datastore is ready for usage"); + return Ok(()); + } + SchemaAction::NeedsHandoff => { + info!(log, "Datastore is awaiting handoff"); + + let IdentityCheckPolicy::CheckAndTakeover { + nexus_id, + } = identity_check + else { + return Err(BackoffError::permanent( + "Nexus ID needed for handoff", + )); + }; + + datastore.attempt_handoff(nexus_id).await.map_err( + |err| { + warn!( + log, + "Could not handoff to new nexus"; + err + ); + BackoffError::transient( + "Could not handoff to new nexus", + ) + }, + )?; + + // If the handoff was successful, immediately + // re-evaluate the schema and access policies to see + // if we should update or not. + continue; + } + SchemaAction::Update => { + info!( + log, + "Datastore should be updated before usage" + ); + datastore + .update_schema(checked_action, config) + .await + .map_err(|err| { + warn!( + log, + "Failed to update schema version"; + "error" => #%err + ); + BackoffError::transient( + "Failed to update schema version", + ) + })?; + return Ok(()); + } + SchemaAction::Refuse => { + error!(log, "Datastore should not be used"); + return Err(BackoffError::permanent( + "Datastore should not be used", + )); + } } } }, diff --git a/nexus/src/bin/schema-updater.rs b/nexus/src/bin/schema-updater.rs index 9f4d177d54c..25343ad7f69 100644 --- a/nexus/src/bin/schema-updater.rs +++ b/nexus/src/bin/schema-updater.rs @@ -130,7 +130,7 @@ async fn main_impl() -> anyhow::Result<()> { .map_err(|e| anyhow!(e))?; println!("Update to {version} complete"); } - SchemaAction::WaitForHandoff | SchemaAction::Refuse => { + SchemaAction::NeedsHandoff | SchemaAction::Refuse => { println!("Cannot update to version {version}") } } From 165fd0e11494948cfeca66e31a6bc0e3a593c1bf Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 25 Aug 2025 20:38:43 -0700 Subject: [PATCH 11/22] remove ConsumerPolicy --- .../src/db/datastore/db_metadata.rs | 180 ++++++------------ nexus/db-queries/src/db/datastore/mod.rs | 5 +- nexus/src/bin/schema-updater.rs | 2 - 3 files changed, 56 insertions(+), 131 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index 02fc3f926b3..90e3f25e95b 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -195,16 +195,6 @@ enum SchemaStatus { OlderThanDesiredSkipAccessCheck, } -/// Describes how a consumer may want to react to schema and access -#[derive(Debug, Copy, Clone, PartialEq)] -pub enum ConsumerPolicy { - /// Fail immediately on any schema mismatch - FailOnMismatch, - - /// Update the schema if it is safe to do so - Update, -} - /// Describes what should be done with a schema #[derive(Debug, Copy, Clone, PartialEq)] pub enum SchemaAction { @@ -244,53 +234,36 @@ impl ValidatedSchemaAction { } impl SchemaAction { - // Interprets the combination of access, status, and policy to decide - // what action should be taken. - fn new( - access: NexusAccess, - status: SchemaStatus, - policy: ConsumerPolicy, - ) -> Self { - use ConsumerPolicy::*; + // Interprets the combination of access and status to decide what action + // should be taken. + fn new(access: NexusAccess, status: SchemaStatus) -> Self { use NexusAccess::*; use SchemaStatus::*; - match (access, status, policy) { + match (access, status) { // Nexus has been explicitly locked-out of using the database - (LockedOut, _, _) => Self::Refuse, + (LockedOut, _) => Self::Refuse, // The schema updated beyond what we want, do not use it. - (_, NewerThanDesired, _) => Self::Refuse, + (_, NewerThanDesired) => Self::Refuse, // If we don't have access yet, but could do something once handoff - // occurs, then wait (or refuse if ConsumerPolicy says to fail fast) + // occurs, then handoff is needed ( DoesNotHaveAccessYet, UpToDate | OlderThanDesired | OlderThanDesiredSkipAccessCheck, - FailOnMismatch, - ) => Self::Refuse, - ( - DoesNotHaveAccessYet, - UpToDate | OlderThanDesired | OlderThanDesiredSkipAccessCheck, - Update, ) => Self::NeedsHandoff, // This is the most "normal" case: Nexus should have access to the // database, and the schema matches what it wants. - (HasExplicitAccess | HasImplicitAccess, UpToDate, _) => Self::Ready, + (HasExplicitAccess | HasImplicitAccess, UpToDate) => Self::Ready, // If this Nexus is allowed to access the schema, but it looks // older than what we expect, we'll need to update the schema to - // use it. Do this if and only if ConsumerPolicy allows it. - ( - HasExplicitAccess | HasImplicitAccess, - OlderThanDesired | OlderThanDesiredSkipAccessCheck, - FailOnMismatch, - ) => Self::Refuse, + // use it. ( HasExplicitAccess | HasImplicitAccess, OlderThanDesired | OlderThanDesiredSkipAccessCheck, - Update, ) => Self::Update, } } @@ -395,13 +368,10 @@ impl DataStore { /// this Nexus has access to the database. /// - `desired_version`: The version of the database schema this /// Nexus wants. - /// - `policy`: Whether or not we should fail or attempt to update - /// on a schema mismatch pub async fn check_schema_and_access( &self, identity_check: IdentityCheckPolicy, desired_version: Version, - policy: ConsumerPolicy, ) -> Result { let schema_status = self.check_schema(desired_version.clone()).await?; @@ -430,7 +400,7 @@ impl DataStore { }; Ok(ValidatedSchemaAction { - action: SchemaAction::new(nexus_access, schema_status, policy), + action: SchemaAction::new(nexus_access, schema_status), desired: desired_version, }) } @@ -1095,7 +1065,6 @@ mod test { .check_schema_and_access( IdentityCheckPolicy::DontCare, SCHEMA_VERSION, - ConsumerPolicy::FailOnMismatch, ) .await .expect("Failed to check schema and access"); @@ -1356,7 +1325,6 @@ mod test { .check_schema_and_access( IdentityCheckPolicy::DontCare, SCHEMA_VERSION, - ConsumerPolicy::Update, ) .await .expect("Failed to check schema and access"); @@ -1665,7 +1633,6 @@ mod test { .check_schema_and_access( IdentityCheckPolicy::CheckAndTakeover { nexus_id }, SCHEMA_VERSION, - ConsumerPolicy::FailOnMismatch, ) .await .expect("Failed to check schema and access"); @@ -1684,11 +1651,10 @@ mod test { .check_schema_and_access( IdentityCheckPolicy::CheckAndTakeover { nexus_id }, SCHEMA_VERSION, - ConsumerPolicy::FailOnMismatch, ) .await .expect("Failed to check schema and access"); - assert_eq!(action.action(), &SchemaAction::Refuse); + assert_eq!(action.action(), &SchemaAction::NeedsHandoff); db.terminate().await; logctx.cleanup_successful(); @@ -1714,28 +1680,27 @@ mod test { .await .expect("Failed to insert nexus record"); - // Using 'None' as a nexus ID should get access (schema updater case) + // Using 'DontCare' as a nexus ID should get access (schema updater case) let action = datastore .check_schema_and_access( IdentityCheckPolicy::DontCare, SCHEMA_VERSION, - ConsumerPolicy::FailOnMismatch, ) .await .expect("Failed to check schema and access"); assert_eq!(action.action(), &SchemaAction::Ready); - // Explicit nexus ID that doesn't exist should not get access + // Explicit CheckAndTakeover with a Nexus ID that doesn't exist should + // not get access let nexus_id = OmicronZoneUuid::new_v4(); let action = datastore .check_schema_and_access( IdentityCheckPolicy::CheckAndTakeover { nexus_id }, SCHEMA_VERSION, - ConsumerPolicy::FailOnMismatch, ) .await .expect("Failed to check schema and access"); - assert_eq!(action.action(), &SchemaAction::Refuse); + assert_eq!(action.action(), &SchemaAction::NeedsHandoff); db.terminate().await; logctx.cleanup_successful(); @@ -1763,18 +1728,15 @@ mod test { .await .expect("Failed to insert nexus record"); - // Should refuse access regardless of policy - for policy in [ConsumerPolicy::FailOnMismatch, ConsumerPolicy::Update] { - let action = datastore - .check_schema_and_access( - IdentityCheckPolicy::CheckAndTakeover { nexus_id }, - SCHEMA_VERSION, - policy, - ) - .await - .expect("Failed to check schema and access"); - assert_eq!(action.action(), &SchemaAction::Refuse); - } + // Should refuse access + let action = datastore + .check_schema_and_access( + IdentityCheckPolicy::CheckAndTakeover { nexus_id }, + SCHEMA_VERSION, + ) + .await + .expect("Failed to check schema and access"); + assert_eq!(action.action(), &SchemaAction::Refuse); db.terminate().await; logctx.cleanup_successful(); @@ -1810,29 +1772,25 @@ mod test { // Try to access with an older version than what's in the database let older_version = Version::new(SCHEMA_VERSION.major - 1, 0, 0); - for policy in [ConsumerPolicy::FailOnMismatch, ConsumerPolicy::Update] { - // Explicit Nexus ID: Rejected - let action = datastore - .check_schema_and_access( - IdentityCheckPolicy::CheckAndTakeover { nexus_id }, - older_version.clone(), - policy, - ) - .await - .expect("Failed to check schema and access"); - assert_eq!(action.action(), &SchemaAction::Refuse); + // Explicit Nexus ID: Rejected + let action = datastore + .check_schema_and_access( + IdentityCheckPolicy::CheckAndTakeover { nexus_id }, + older_version.clone(), + ) + .await + .expect("Failed to check schema and access"); + assert_eq!(action.action(), &SchemaAction::Refuse); - // Implicit Access: Rejected - let action = datastore - .check_schema_and_access( - IdentityCheckPolicy::DontCare, - older_version.clone(), - policy, - ) - .await - .expect("Failed to check schema and access"); - assert_eq!(action.action(), &SchemaAction::Refuse); - } + // Implicit Access: Rejected + let action = datastore + .check_schema_and_access( + IdentityCheckPolicy::DontCare, + older_version.clone(), + ) + .await + .expect("Failed to check schema and access"); + assert_eq!(action.action(), &SchemaAction::Refuse); db.terminate().await; logctx.cleanup_successful(); @@ -1867,27 +1825,15 @@ mod test { let versions = [current_version, newer_version]; for version in &versions { - // With Update policy, should wait for handoff when schema is up-to-date + // Should wait for handoff when schema is up-to-date let action = datastore .check_schema_and_access( IdentityCheckPolicy::CheckAndTakeover { nexus_id }, version.clone(), - ConsumerPolicy::Update, ) .await .expect("Failed to check schema and access"); assert_eq!(action.action(), &SchemaAction::NeedsHandoff); - - // With FailOnMismatch policy, should refuse - let action = datastore - .check_schema_and_access( - IdentityCheckPolicy::CheckAndTakeover { nexus_id }, - version.clone(), - ConsumerPolicy::FailOnMismatch, - ) - .await - .expect("Failed to check schema and access"); - assert_eq!(action.action(), &SchemaAction::Refuse); } db.terminate().await; @@ -1915,26 +1861,23 @@ mod test { .expect("Failed to insert nexus record"); // With current schema version, should be ready for normal use - for policy in [ConsumerPolicy::FailOnMismatch, ConsumerPolicy::Update] { - let action = datastore - .check_schema_and_access( - IdentityCheckPolicy::CheckAndTakeover { nexus_id }, - SCHEMA_VERSION, - policy, - ) - .await - .expect("Failed to check schema and access"); + let action = datastore + .check_schema_and_access( + IdentityCheckPolicy::CheckAndTakeover { nexus_id }, + SCHEMA_VERSION, + ) + .await + .expect("Failed to check schema and access"); - assert_eq!(action.action(), &SchemaAction::Ready); - assert_eq!(action.desired_version(), &SCHEMA_VERSION); - } + assert_eq!(action.action(), &SchemaAction::Ready); + assert_eq!(action.desired_version(), &SCHEMA_VERSION); db.terminate().await; logctx.cleanup_successful(); } - // Validates that when a Nexus is active with a newer-than-database desired version, - // it will request an update if the policy allows. + // Validates that when a Nexus is active with a newer-than-database desired + // version, it will request an update #[tokio::test] async fn test_check_schema_and_access_update_now() { let logctx = @@ -1956,12 +1899,11 @@ mod test { let newer_version = Version::new(SCHEMA_VERSION.major + 1, 0, 0); - // With newer desired version and Update policy, should request update + // With a newer desired version, should request update let action = datastore .check_schema_and_access( IdentityCheckPolicy::CheckAndTakeover { nexus_id }, newer_version.clone(), - ConsumerPolicy::Update, ) .await .expect("Failed to check schema and access"); @@ -1969,18 +1911,6 @@ mod test { assert_eq!(action.action(), &SchemaAction::Update); assert_eq!(action.desired_version(), &newer_version); - // With FailOnMismatch policy, should refuse - let action = datastore - .check_schema_and_access( - IdentityCheckPolicy::CheckAndTakeover { nexus_id }, - newer_version, - ConsumerPolicy::FailOnMismatch, - ) - .await - .expect("Failed to check schema and access"); - - assert_eq!(action.action(), &SchemaAction::Refuse); - db.terminate().await; logctx.cleanup_successful(); } diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 538c91e2cb0..b5ab8b21fd6 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -122,7 +122,6 @@ pub mod webhook_delivery; mod zpool; pub use address_lot::AddressLotCreateResult; -pub use db_metadata::ConsumerPolicy; pub use db_metadata::SchemaAction; pub use db_metadata::ValidatedSchemaAction; pub use dns::DataStoreDnsTest; @@ -252,7 +251,6 @@ impl DataStore { try_for: Option, identity_check: IdentityCheckPolicy, ) -> Result { - use db_metadata::ConsumerPolicy; use db_metadata::SchemaAction; use nexus_db_model::SCHEMA_VERSION as EXPECTED_VERSION; @@ -278,7 +276,6 @@ impl DataStore { .check_schema_and_access( identity_check, EXPECTED_VERSION, - ConsumerPolicy::Update, ) .await .map_err(|err| { @@ -339,7 +336,7 @@ impl DataStore { warn!( log, "Failed to update schema version"; - "error" => #%err + "error" => InlineErrorChain::new(err.as_ref()) ); BackoffError::transient( "Failed to update schema version", diff --git a/nexus/src/bin/schema-updater.rs b/nexus/src/bin/schema-updater.rs index 25343ad7f69..fd128102d84 100644 --- a/nexus/src/bin/schema-updater.rs +++ b/nexus/src/bin/schema-updater.rs @@ -14,7 +14,6 @@ use nexus_db_model::AllSchemaVersions; use nexus_db_model::SCHEMA_VERSION; use nexus_db_queries::db; use nexus_db_queries::db::DataStore; -use nexus_db_queries::db::datastore::ConsumerPolicy; use nexus_db_queries::db::datastore::IdentityCheckPolicy; use nexus_db_queries::db::datastore::SchemaAction; use semver::Version; @@ -115,7 +114,6 @@ async fn main_impl() -> anyhow::Result<()> { .check_schema_and_access( IdentityCheckPolicy::DontCare, version.clone(), - ConsumerPolicy::Update, ) .await?; From 3edce16da80b3541c67d9aae67cbad5b2490205b Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 25 Aug 2025 21:06:52 -0700 Subject: [PATCH 12/22] Error types, unused code, comments --- .../src/db/datastore/db_metadata.rs | 37 +++---------------- 1 file changed, 6 insertions(+), 31 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index 90e3f25e95b..446c5d0b85d 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -363,9 +363,8 @@ impl DataStore { /// Compares the state of the schema with the expectations of the /// currently running Nexus. /// - /// - `nexus_id`: An optional UUID of the currently-running Nexus. - /// If used, the db_metadata_nexus table will be checked to confirm - /// this Nexus has access to the database. + /// - `identity_check`: Describes whether or not the identity of the + /// calling Nexus should be validated before returning database access /// - `desired_version`: The version of the database schema this /// Nexus wants. pub async fn check_schema_and_access( @@ -709,31 +708,6 @@ impl DataStore { Ok(()) } - /// Registers multiple Nexus instances as having active access to the database - pub async fn database_nexus_access_insert_multiple( - &self, - nexus_ids: &[OmicronZoneUuid], - state: DbMetadataNexusState, - ) -> Result<(), Error> { - use nexus_db_schema::schema::db_metadata_nexus::dsl; - - let new_nexuses: Vec = nexus_ids - .iter() - .map(|&nexus_id| DbMetadataNexus::new(nexus_id, state)) - .collect(); - - diesel::insert_into(dsl::db_metadata_nexus) - .values(new_nexuses) - .on_conflict(dsl::nexus_id) - .do_update() - .set(dsl::state.eq(excluded(dsl::state))) - .execute_async(&*self.pool_connection_unauthorized().await?) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - - Ok(()) - } - /// Initializes Nexus database access records from a blueprint using an /// existing connection /// @@ -756,9 +730,10 @@ impl DataStore { let any_records_exist = Self::database_nexus_access_any_exist_on_connection(conn).await?; if any_records_exist { - return Err(Error::internal_error( - "Cannot initialize Nexus access from blueprint: db_metadata_nexus records already exist. \ - This function should only be called during initial rack setup.", + return Err(Error::conflict( + "Cannot initialize Nexus access from blueprint: \ + db_metadata_nexus records already exist. This function should \ + only be called during initial rack setup.", )); } From 46f920a877baade73e2f45c3cf672f498a443fa1 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 25 Aug 2025 21:14:19 -0700 Subject: [PATCH 13/22] line lengths --- .../src/db/datastore/db_metadata.rs | 48 ++++++++++++------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index 446c5d0b85d..be5fd9199ee 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -292,7 +292,8 @@ impl DataStore { &self.log, "No db_metadata_nexus records exist - skipping access check"; "nexus_id" => ?nexus_id, - "explanation" => "This is expected during initial deployment or before migration" + "explanation" => "This is expected during initial deployment \ + or before migration" ); return Ok(NexusAccess::HasImplicitAccess); } @@ -301,18 +302,27 @@ impl DataStore { let Some(state) = self.database_nexus_access(nexus_id).await?.map(|s| s.state()) else { - let msg = "Nexus does not have access to the database (no db_metadata_nexus record)"; + let msg = "Nexus does not have access to the database (no \ + db_metadata_nexus record)"; warn!(&self.log, "{msg}"; "nexus_id" => ?nexus_id); return Ok(NexusAccess::DoesNotHaveAccessYet); }; let status = match state { DbMetadataNexusState::Active => { - info!(&self.log, "Nexus has access to the database"; "nexus_id" => ?nexus_id); + info!( + &self.log, + "Nexus has access to the database"; + "nexus_id" => ?nexus_id + ); NexusAccess::HasExplicitAccess } DbMetadataNexusState::NotYet => { - info!(&self.log, "Nexus does not yet have access to the database"; "nexus_id" => ?nexus_id); + info!( + &self.log, + "Nexus does not yet have access to the database"; + "nexus_id" => ?nexus_id + ); NexusAccess::DoesNotHaveAccessYet } DbMetadataNexusState::Quiesced => { @@ -377,11 +387,12 @@ impl DataStore { let nexus_access = match identity_check { IdentityCheckPolicy::CheckAndTakeover { nexus_id } => { match schema_status { - // If we don't think the "db_metadata_nexus" tables exist in the - // schema yet, treat them as implicitly having access. + // If we don't think the "db_metadata_nexus" tables exist in + // the schema yet, treat them as implicitly having access. // - // TODO: This may be removed, once we're confident deployed systems - // have upgraded past DB_METADATA_NEXUS_SCHEMA_VERSION. + // TODO: This may be removed, once we're confident deployed + // systems have upgraded past + // DB_METADATA_NEXUS_SCHEMA_VERSION. SchemaStatus::OlderThanDesiredSkipAccessCheck => { NexusAccess::HasImplicitAccess } @@ -766,7 +777,8 @@ impl DataStore { Ok(()) } - // Implementation function for attempt_handoff that runs within a transaction + // Implementation function for attempt_handoff that runs within a + // transaction // // This function must be executed from a transaction context to be safe. async fn attempt_handoff_impl( @@ -776,10 +788,12 @@ impl DataStore { ) -> Result<(), diesel::result::Error> { use nexus_db_schema::schema::db_metadata_nexus::dsl; - // Before proceeding, all records must be in the "quiesced" or "not_yet" states. + // Before proceeding, all records must be in the "quiesced" or "not_yet" + // states. // - // We explicitly look for any records violating this, rather than explicitly looking for - // "active" records, as to protect ourselves from future states being added over time. + // We explicitly look for any records violating this, rather than + // explicitly looking for "active" records, as to protect ourselves from + // future states being added over time. let active_count: nexus_db_model::SqlU32 = dsl::db_metadata_nexus .filter( dsl::state @@ -830,14 +844,16 @@ impl DataStore { Ok(()) } - /// Attempts to perform a handoff to activate this Nexus for database access. + /// Attempts to perform a handoff to activate this Nexus for database + /// access. /// /// This function checks that: - /// 1. ALL records in db_metadata_nexus are in "not_yet" or "quiesced" states + /// 1. ALL records in db_metadata_nexus are in "not_yet" or "quiesced" + /// states /// 2. The specified nexus_id has a record which is "not_yet" /// - /// If both conditions are met, it updates ALL "not_yet" records to "active". - /// These operations are performed transactionally. + /// If both conditions are met, it updates ALL "not_yet" records to + /// "active". These operations are performed transactionally. /// /// Returns an error if: /// - Any record is in "active" state From f5a479269a98a60d41e20200821add2ca75bddc6 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 25 Aug 2025 21:27:05 -0700 Subject: [PATCH 14/22] comment --- nexus/db-queries/src/db/datastore/db_metadata.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index be5fd9199ee..3e6bb37e7f7 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -794,6 +794,9 @@ impl DataStore { // We explicitly look for any records violating this, rather than // explicitly looking for "active" records, as to protect ourselves from // future states being added over time. + // + // There is no concern of time-of-check-to-time-of-use bugs because + // this function must be executed within a transaction. let active_count: nexus_db_model::SqlU32 = dsl::db_metadata_nexus .filter( dsl::state From 4b2b58a146e6f58b2fe06c3da9ebffc15c84226c Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 25 Aug 2025 21:29:58 -0700 Subject: [PATCH 15/22] use blueprint in-memory, rather than doing db queries --- .../src/db/datastore/db_metadata.rs | 21 +++---------------- nexus/db-queries/src/db/datastore/rack.rs | 9 +++++++- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index 3e6bb37e7f7..c626e70bbae 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -21,12 +21,11 @@ use nexus_db_model::EARLIEST_SUPPORTED_VERSION; use nexus_db_model::SchemaUpgradeStep; use nexus_db_model::SchemaVersion; use omicron_common::api::external::Error; -use omicron_uuid_kinds::{GenericUuid, OmicronZoneUuid}; +use omicron_uuid_kinds::OmicronZoneUuid; use semver::Version; use slog::{Logger, error, info, o}; use std::ops::Bound; use std::str::FromStr; -use uuid::Uuid; /// Errors that can occur during handoff operations #[derive(Debug, thiserror::Error)] @@ -731,10 +730,8 @@ impl DataStore { pub async fn initialize_nexus_access_from_blueprint_on_connection( &self, conn: &async_bb8_diesel::Connection, - blueprint_id: Uuid, + nexus_zone_ids: Vec, ) -> Result<(), Error> { - use nexus_db_model::ZoneType; - use nexus_db_schema::schema::bp_omicron_zone::dsl as zone_dsl; use nexus_db_schema::schema::db_metadata_nexus::dsl; // Ensure no db_metadata_nexus records already exist @@ -748,23 +745,11 @@ impl DataStore { )); } - // Query all Nexus zones from the blueprint - let nexus_zone_ids: Vec = zone_dsl::bp_omicron_zone - .filter(zone_dsl::blueprint_id.eq(blueprint_id)) - .filter(zone_dsl::zone_type.eq(ZoneType::Nexus)) - .select(zone_dsl::id) - .load_async(conn) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - // Create db_metadata_nexus records for all Nexus zones let new_nexuses: Vec = nexus_zone_ids .iter() .map(|&nexus_id| { - DbMetadataNexus::new( - OmicronZoneUuid::from_untyped_uuid(nexus_id), - DbMetadataNexusState::Active, - ) + DbMetadataNexus::new(nexus_id, DbMetadataNexusState::Active) }) .collect(); diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index 268e3d90363..474f9316c61 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -791,7 +791,14 @@ impl DataStore { // Insert Nexus database access records self.initialize_nexus_access_from_blueprint_on_connection( &conn, - *blueprint.id.as_untyped_uuid() + blueprint.all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter_map(|(_sled, zone_cfg)| { + if zone_cfg.zone_type.is_nexus() { + Some(zone_cfg.id) + } else { + None + } + }).collect(), ).await.map_err(|e| { err.set(RackInitError::BlueprintTargetSet(e)).unwrap(); DieselError::RollbackTransaction From 641ed0a827dec9a1cd32c8bede7277a6c466f911 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 26 Aug 2025 10:47:50 -0700 Subject: [PATCH 16/22] s/SchemaAction/DatastoreSetupAction --- .../src/db/datastore/db_metadata.rs | 54 ++++++++++--------- nexus/db-queries/src/db/datastore/mod.rs | 14 ++--- nexus/src/bin/schema-updater.rs | 9 ++-- 3 files changed, 40 insertions(+), 37 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index c626e70bbae..bf9b7f3306d 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -194,9 +194,9 @@ enum SchemaStatus { OlderThanDesiredSkipAccessCheck, } -/// Describes what should be done with a schema +/// Describes what setup is necessary for DataStore creation #[derive(Debug, Copy, Clone, PartialEq)] -pub enum SchemaAction { +pub enum DatastoreSetupAction { /// Normal operation: The database is ready for usage Ready, @@ -212,18 +212,18 @@ pub enum SchemaAction { Refuse, } -/// Committment that the database is willing to perform a [SchemaAction] +/// Committment that the database is willing to perform a [DatastoreSetupAction] /// to a desired schema [Version]. /// /// Can be created through [DataStore::check_schema_and_access] #[derive(Clone)] -pub struct ValidatedSchemaAction { - action: SchemaAction, +pub struct ValidatedDatastoreSetupAction { + action: DatastoreSetupAction, desired: Version, } -impl ValidatedSchemaAction { - pub fn action(&self) -> &SchemaAction { +impl ValidatedDatastoreSetupAction { + pub fn action(&self) -> &DatastoreSetupAction { &self.action } @@ -232,7 +232,7 @@ impl ValidatedSchemaAction { } } -impl SchemaAction { +impl DatastoreSetupAction { // Interprets the combination of access and status to decide what action // should be taken. fn new(access: NexusAccess, status: SchemaStatus) -> Self { @@ -380,7 +380,7 @@ impl DataStore { &self, identity_check: IdentityCheckPolicy, desired_version: Version, - ) -> Result { + ) -> Result { let schema_status = self.check_schema(desired_version.clone()).await?; let nexus_access = match identity_check { @@ -408,29 +408,31 @@ impl DataStore { } }; - Ok(ValidatedSchemaAction { - action: SchemaAction::new(nexus_access, schema_status), + Ok(ValidatedDatastoreSetupAction { + action: DatastoreSetupAction::new(nexus_access, schema_status), desired: desired_version, }) } /// Ensures that the database schema matches `desired_version`. /// - /// - `validated_action`: A [ValidatedSchemaAction], indicating that + /// - `validated_action`: A [ValidatedDatastoreSetupAction], indicating that /// [Self::check_schema_and_access] has already been called. /// - `all_versions`: A description of all schema versions between /// "whatever is in the DB" and `desired_version`, instructing /// how to perform an update. pub async fn update_schema( &self, - validated_action: ValidatedSchemaAction, + validated_action: ValidatedDatastoreSetupAction, all_versions: Option<&AllSchemaVersions>, ) -> Result<(), anyhow::Error> { let action = validated_action.action(); match action { - SchemaAction::Ready => bail!("No schema update is necessary"), - SchemaAction::Update => (), + DatastoreSetupAction::Ready => { + bail!("No schema update is necessary") + } + DatastoreSetupAction::Update => (), _ => bail!("Not ready for schema update"), } @@ -1049,7 +1051,7 @@ mod test { .expect("Failed to check schema and access"); assert!( - matches!(checked_action.action(), SchemaAction::Ready), + matches!(checked_action.action(), DatastoreSetupAction::Ready), "Unexpected action: {:?}", checked_action.action(), ); @@ -1615,7 +1617,7 @@ mod test { ) .await .expect("Failed to check schema and access"); - assert_eq!(action.action(), &SchemaAction::Ready); + assert_eq!(action.action(), &DatastoreSetupAction::Ready); // Add a record to the table, now explicit nexus ID should NOT get access datastore @@ -1633,7 +1635,7 @@ mod test { ) .await .expect("Failed to check schema and access"); - assert_eq!(action.action(), &SchemaAction::NeedsHandoff); + assert_eq!(action.action(), &DatastoreSetupAction::NeedsHandoff); db.terminate().await; logctx.cleanup_successful(); @@ -1667,7 +1669,7 @@ mod test { ) .await .expect("Failed to check schema and access"); - assert_eq!(action.action(), &SchemaAction::Ready); + assert_eq!(action.action(), &DatastoreSetupAction::Ready); // Explicit CheckAndTakeover with a Nexus ID that doesn't exist should // not get access @@ -1679,7 +1681,7 @@ mod test { ) .await .expect("Failed to check schema and access"); - assert_eq!(action.action(), &SchemaAction::NeedsHandoff); + assert_eq!(action.action(), &DatastoreSetupAction::NeedsHandoff); db.terminate().await; logctx.cleanup_successful(); @@ -1715,7 +1717,7 @@ mod test { ) .await .expect("Failed to check schema and access"); - assert_eq!(action.action(), &SchemaAction::Refuse); + assert_eq!(action.action(), &DatastoreSetupAction::Refuse); db.terminate().await; logctx.cleanup_successful(); @@ -1759,7 +1761,7 @@ mod test { ) .await .expect("Failed to check schema and access"); - assert_eq!(action.action(), &SchemaAction::Refuse); + assert_eq!(action.action(), &DatastoreSetupAction::Refuse); // Implicit Access: Rejected let action = datastore @@ -1769,7 +1771,7 @@ mod test { ) .await .expect("Failed to check schema and access"); - assert_eq!(action.action(), &SchemaAction::Refuse); + assert_eq!(action.action(), &DatastoreSetupAction::Refuse); db.terminate().await; logctx.cleanup_successful(); @@ -1812,7 +1814,7 @@ mod test { ) .await .expect("Failed to check schema and access"); - assert_eq!(action.action(), &SchemaAction::NeedsHandoff); + assert_eq!(action.action(), &DatastoreSetupAction::NeedsHandoff); } db.terminate().await; @@ -1848,7 +1850,7 @@ mod test { .await .expect("Failed to check schema and access"); - assert_eq!(action.action(), &SchemaAction::Ready); + assert_eq!(action.action(), &DatastoreSetupAction::Ready); assert_eq!(action.desired_version(), &SCHEMA_VERSION); db.terminate().await; @@ -1887,7 +1889,7 @@ mod test { .await .expect("Failed to check schema and access"); - assert_eq!(action.action(), &SchemaAction::Update); + assert_eq!(action.action(), &DatastoreSetupAction::Update); assert_eq!(action.desired_version(), &newer_version); db.terminate().await; diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index b5ab8b21fd6..f451ded3d87 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -122,8 +122,8 @@ pub mod webhook_delivery; mod zpool; pub use address_lot::AddressLotCreateResult; -pub use db_metadata::SchemaAction; -pub use db_metadata::ValidatedSchemaAction; +pub use db_metadata::DatastoreSetupAction; +pub use db_metadata::ValidatedDatastoreSetupAction; pub use dns::DataStoreDnsTest; pub use dns::DnsVersionUpdateBuilder; pub use ereport::EreportFilters; @@ -251,7 +251,7 @@ impl DataStore { try_for: Option, identity_check: IdentityCheckPolicy, ) -> Result { - use db_metadata::SchemaAction; + use db_metadata::DatastoreSetupAction; use nexus_db_model::SCHEMA_VERSION as EXPECTED_VERSION; let datastore = @@ -290,11 +290,11 @@ impl DataStore { })?; match checked_action.action() { - SchemaAction::Ready => { + DatastoreSetupAction::Ready => { info!(log, "Datastore is ready for usage"); return Ok(()); } - SchemaAction::NeedsHandoff => { + DatastoreSetupAction::NeedsHandoff => { info!(log, "Datastore is awaiting handoff"); let IdentityCheckPolicy::CheckAndTakeover { @@ -324,7 +324,7 @@ impl DataStore { // if we should update or not. continue; } - SchemaAction::Update => { + DatastoreSetupAction::Update => { info!( log, "Datastore should be updated before usage" @@ -344,7 +344,7 @@ impl DataStore { })?; return Ok(()); } - SchemaAction::Refuse => { + DatastoreSetupAction::Refuse => { error!(log, "Datastore should not be used"); return Err(BackoffError::permanent( "Datastore should not be used", diff --git a/nexus/src/bin/schema-updater.rs b/nexus/src/bin/schema-updater.rs index fd128102d84..1b5cc1d831a 100644 --- a/nexus/src/bin/schema-updater.rs +++ b/nexus/src/bin/schema-updater.rs @@ -14,8 +14,8 @@ use nexus_db_model::AllSchemaVersions; use nexus_db_model::SCHEMA_VERSION; use nexus_db_queries::db; use nexus_db_queries::db::DataStore; +use nexus_db_queries::db::datastore::DatastoreSetupAction; use nexus_db_queries::db::datastore::IdentityCheckPolicy; -use nexus_db_queries::db::datastore::SchemaAction; use semver::Version; use slog::Drain; use slog::Level; @@ -118,17 +118,18 @@ async fn main_impl() -> anyhow::Result<()> { .await?; match checked_action.action() { - SchemaAction::Ready => { + DatastoreSetupAction::Ready => { println!("Already at version {version}") } - SchemaAction::Update => { + DatastoreSetupAction::Update => { datastore .update_schema(checked_action, Some(&all_versions)) .await .map_err(|e| anyhow!(e))?; println!("Update to {version} complete"); } - SchemaAction::NeedsHandoff | SchemaAction::Refuse => { + DatastoreSetupAction::NeedsHandoff + | DatastoreSetupAction::Refuse => { println!("Cannot update to version {version}") } } From ef27f2105105b43b567567851b418d24e17e34f3 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 26 Aug 2025 11:08:43 -0700 Subject: [PATCH 17/22] feed-forward nexus_id to avoid impossible code paths --- .../src/db/datastore/db_metadata.rs | 53 ++++++++++++------- nexus/db-queries/src/db/datastore/mod.rs | 13 +---- nexus/src/bin/schema-updater.rs | 14 +++-- 3 files changed, 48 insertions(+), 32 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index bf9b7f3306d..c074248c1fa 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -154,7 +154,7 @@ fn skippable_version( #[derive(Debug, Copy, Clone, PartialEq)] pub enum NexusAccess { /// Nexus does not yet have access to the database. - DoesNotHaveAccessYet, + DoesNotHaveAccessYet { nexus_id: OmicronZoneUuid }, /// Nexus has been explicitly locked out of the database. LockedOut, @@ -203,7 +203,10 @@ pub enum DatastoreSetupAction { /// Not ready for usage yet /// /// The database may be ready for usage once handoff has completed. - NeedsHandoff, + /// The `nexus_id` here may attempt to takeover the database if it has + /// a `db_metadata_nexus` record of "not_yet", and all other records + /// are either "not_yet" or "quiesced". + NeedsHandoff { nexus_id: OmicronZoneUuid }, /// Start a schema update Update, @@ -249,9 +252,9 @@ impl DatastoreSetupAction { // If we don't have access yet, but could do something once handoff // occurs, then handoff is needed ( - DoesNotHaveAccessYet, + DoesNotHaveAccessYet { nexus_id }, UpToDate | OlderThanDesired | OlderThanDesiredSkipAccessCheck, - ) => Self::NeedsHandoff, + ) => Self::NeedsHandoff { nexus_id }, // This is the most "normal" case: Nexus should have access to the // database, and the schema matches what it wants. @@ -304,7 +307,7 @@ impl DataStore { let msg = "Nexus does not have access to the database (no \ db_metadata_nexus record)"; warn!(&self.log, "{msg}"; "nexus_id" => ?nexus_id); - return Ok(NexusAccess::DoesNotHaveAccessYet); + return Ok(NexusAccess::DoesNotHaveAccessYet { nexus_id }); }; let status = match state { @@ -322,7 +325,7 @@ impl DataStore { "Nexus does not yet have access to the database"; "nexus_id" => ?nexus_id ); - NexusAccess::DoesNotHaveAccessYet + NexusAccess::DoesNotHaveAccessYet { nexus_id } } DbMetadataNexusState::Quiesced => { let msg = "Nexus locked out of database access (quiesced)"; @@ -399,8 +402,8 @@ impl DataStore { } } IdentityCheckPolicy::DontCare => { - // If a "nexus_id" was not supplied, skip the check, and treat it - // as having access. + // If a "nexus_id" was not supplied, skip the check, and treat + // it as having access. // // This is necessary for tools which access the schema without a // running Nexus, such as the schema-updater binary. @@ -1510,7 +1513,8 @@ mod test { let datastore = DataStore::new_unchecked(logctx.log.clone(), db.pool().clone()); - // Set up test data: create multiple nexus records in not_yet and quiesced states + // Set up test data: create multiple nexus records in not_yet and + // quiesced states let nexus1_id = OmicronZoneUuid::new_v4(); let nexus2_id = OmicronZoneUuid::new_v4(); let nexus3_id = OmicronZoneUuid::new_v4(); @@ -1565,7 +1569,8 @@ mod test { } assert!(result.is_ok()); - // Verify final state: all not_yet records should now be active, quiesced should remain quiesced + // Verify final state: all not_yet records should now be active, + // quiesced should remain quiesced let nexus1_after = datastore .database_nexus_access(nexus1_id) .await @@ -1584,7 +1589,8 @@ mod test { assert_eq!(nexus1_after.state(), DbMetadataNexusState::Active); assert_eq!(nexus2_after.state(), DbMetadataNexusState::Active); - assert_eq!(nexus3_after.state(), DbMetadataNexusState::Quiesced); // Should remain unchanged + // Should remain unchanged + assert_eq!(nexus3_after.state(), DbMetadataNexusState::Quiesced); db.terminate().await; logctx.cleanup_successful(); @@ -1592,12 +1598,13 @@ mod test { // This test covers two cases: // - // 1. New systems: We use RSS to initialize Nexus, but no db_metadata_nexus entries - // exist. + // 1. New systems: We use RSS to initialize Nexus, but no db_metadata_nexus + // entries exist. // 2. Deployed systems: We have a deployed system which updates to have this // "db_metadata_nexus"-handling code, but has no rows in that table. // - // Both of these cases must be granted database access to self-populate later. + // Both of these cases must be granted database access to self-populate + // later. #[tokio::test] async fn test_check_schema_and_access_empty_table_permits_access() { let logctx = dev::test_setup_log( @@ -1619,7 +1626,8 @@ mod test { .expect("Failed to check schema and access"); assert_eq!(action.action(), &DatastoreSetupAction::Ready); - // Add a record to the table, now explicit nexus ID should NOT get access + // Add a record to the table, now explicit nexus ID should NOT get + // access datastore .database_nexus_access_insert( OmicronZoneUuid::new_v4(), // Different nexus @@ -1635,7 +1643,10 @@ mod test { ) .await .expect("Failed to check schema and access"); - assert_eq!(action.action(), &DatastoreSetupAction::NeedsHandoff); + assert_eq!( + action.action(), + &DatastoreSetupAction::NeedsHandoff { nexus_id } + ); db.terminate().await; logctx.cleanup_successful(); @@ -1681,7 +1692,10 @@ mod test { ) .await .expect("Failed to check schema and access"); - assert_eq!(action.action(), &DatastoreSetupAction::NeedsHandoff); + assert_eq!( + action.action(), + &DatastoreSetupAction::NeedsHandoff { nexus_id }, + ); db.terminate().await; logctx.cleanup_successful(); @@ -1814,7 +1828,10 @@ mod test { ) .await .expect("Failed to check schema and access"); - assert_eq!(action.action(), &DatastoreSetupAction::NeedsHandoff); + assert_eq!( + action.action(), + &DatastoreSetupAction::NeedsHandoff { nexus_id }, + ); } db.terminate().await; diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index f451ded3d87..d3dfd3dd1f6 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -294,19 +294,10 @@ impl DataStore { info!(log, "Datastore is ready for usage"); return Ok(()); } - DatastoreSetupAction::NeedsHandoff => { + DatastoreSetupAction::NeedsHandoff { nexus_id } => { info!(log, "Datastore is awaiting handoff"); - let IdentityCheckPolicy::CheckAndTakeover { - nexus_id, - } = identity_check - else { - return Err(BackoffError::permanent( - "Nexus ID needed for handoff", - )); - }; - - datastore.attempt_handoff(nexus_id).await.map_err( + datastore.attempt_handoff(*nexus_id).await.map_err( |err| { warn!( log, diff --git a/nexus/src/bin/schema-updater.rs b/nexus/src/bin/schema-updater.rs index 1b5cc1d831a..f6e68683eb1 100644 --- a/nexus/src/bin/schema-updater.rs +++ b/nexus/src/bin/schema-updater.rs @@ -128,9 +128,17 @@ async fn main_impl() -> anyhow::Result<()> { .map_err(|e| anyhow!(e))?; println!("Update to {version} complete"); } - DatastoreSetupAction::NeedsHandoff - | DatastoreSetupAction::Refuse => { - println!("Cannot update to version {version}") + DatastoreSetupAction::Refuse => { + println!("Refusing to update to version {version}") + } + DatastoreSetupAction::NeedsHandoff { .. } => { + // This case should not happen - we supplied + // IdentityCheckPolicy::DontCare, so we should not be told + // to attempt a takeover by a specific Nexus. + println!( + "Refusing to update to version {version} \ + (Handoff needed?)" + ) } } } From 276093023e780847dc7be85e4249da12c93f9aec Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 26 Aug 2025 11:30:29 -0700 Subject: [PATCH 18/22] schema-updater comments --- nexus/src/bin/schema-updater.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nexus/src/bin/schema-updater.rs b/nexus/src/bin/schema-updater.rs index f6e68683eb1..9d5869180d6 100644 --- a/nexus/src/bin/schema-updater.rs +++ b/nexus/src/bin/schema-updater.rs @@ -136,8 +136,10 @@ async fn main_impl() -> anyhow::Result<()> { // IdentityCheckPolicy::DontCare, so we should not be told // to attempt a takeover by a specific Nexus. println!( - "Refusing to update to version {version} \ - (Handoff needed?)" + "Refusing to update to version {version}. \ + The schema updater tried to ignore the identity check, \ + but got a response indicating handoff is needed. \ + This is unexpected, and probably a bug" ) } } From 42be76bad6dbb21caff13c68c1c6b0c784bc263b Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 26 Aug 2025 11:31:19 -0700 Subject: [PATCH 19/22] less pub --- .../src/db/datastore/db_metadata.rs | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index c074248c1fa..e1bbe8ed65f 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -152,7 +152,7 @@ fn skippable_version( /// Describes the state of the database access with respect this Nexus #[derive(Debug, Copy, Clone, PartialEq)] -pub enum NexusAccess { +enum NexusAccess { /// Nexus does not yet have access to the database. DoesNotHaveAccessYet { nexus_id: OmicronZoneUuid }, @@ -660,8 +660,8 @@ impl DataStore { Ok(()) } - /// Returns the access this Nexus has to the database - pub async fn database_nexus_access( + // Returns the access this Nexus has to the database + async fn database_nexus_access( &self, nexus_id: OmicronZoneUuid, ) -> Result, Error> { @@ -679,14 +679,15 @@ impl DataStore { Ok(nexus_access) } - /// Checks if any db_metadata_nexus records exist in the database - pub async fn database_nexus_access_any_exist(&self) -> Result { + // Checks if any db_metadata_nexus records exist in the database + async fn database_nexus_access_any_exist(&self) -> Result { let conn = self.pool_connection_unauthorized().await?; Self::database_nexus_access_any_exist_on_connection(&conn).await } - /// Checks if any db_metadata_nexus records exist in the database using an existing connection - pub async fn database_nexus_access_any_exist_on_connection( + // Checks if any db_metadata_nexus records exist in the database using an + // existing connection + async fn database_nexus_access_any_exist_on_connection( conn: &async_bb8_diesel::Connection, ) -> Result { use nexus_db_schema::schema::db_metadata_nexus::dsl; @@ -701,8 +702,9 @@ impl DataStore { Ok(exists) } - /// Registers a Nexus instance as having active access to the database - pub async fn database_nexus_access_insert( + // Registers a Nexus instance as having active access to the database + #[cfg(test)] + async fn database_nexus_access_insert( &self, nexus_id: OmicronZoneUuid, state: DbMetadataNexusState, From 875d8e6583bac12a131da6f2f2a2e9d5e5d6c2b4 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 26 Aug 2025 11:50:31 -0700 Subject: [PATCH 20/22] warnings --- nexus/db-queries/src/db/datastore/db_metadata.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index e1bbe8ed65f..8b5541ff846 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -9,7 +9,6 @@ use anyhow::{Context, bail, ensure}; use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; use chrono::Utc; use diesel::prelude::*; -use diesel::upsert::excluded; use nexus_db_errors::ErrorHandler; use nexus_db_errors::OptionalError; use nexus_db_errors::public_error_from_diesel; @@ -717,7 +716,7 @@ impl DataStore { .values(new_nexus) .on_conflict(dsl::nexus_id) .do_update() - .set(dsl::state.eq(excluded(dsl::state))) + .set(dsl::state.eq(diesel::upsert::excluded(dsl::state))) .execute_async(&*self.pool_connection_unauthorized().await?) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; From c5e450997b944403c4283d4fe585dfbf2d48919b Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 26 Aug 2025 15:19:27 -0700 Subject: [PATCH 21/22] reconfigurator execution to populate/destroy records --- .../src/db/datastore/db_metadata.rs | 80 +++++++++++++++++++ .../reconfigurator/execution/src/database.rs | 24 ++++++ nexus/reconfigurator/execution/src/lib.rs | 31 +++++++ .../execution/src/omicron_zones.rs | 9 ++- 4 files changed, 141 insertions(+), 3 deletions(-) create mode 100644 nexus/reconfigurator/execution/src/database.rs diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index 8b5541ff846..68e1052809b 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -5,10 +5,14 @@ //! [`DataStore`] methods on Database Metadata. use super::{DataStore, DbConnection, IdentityCheckPolicy}; +use crate::authz; +use crate::context::OpContext; + use anyhow::{Context, bail, ensure}; use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; use chrono::Utc; use diesel::prelude::*; +use futures::FutureExt; use nexus_db_errors::ErrorHandler; use nexus_db_errors::OptionalError; use nexus_db_errors::public_error_from_diesel; @@ -19,7 +23,9 @@ use nexus_db_model::DbMetadataNexusState; use nexus_db_model::EARLIEST_SUPPORTED_VERSION; use nexus_db_model::SchemaUpgradeStep; use nexus_db_model::SchemaVersion; +use nexus_types::deployment::BlueprintZoneDisposition; use omicron_common::api::external::Error; +use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::OmicronZoneUuid; use semver::Version; use slog::{Logger, error, info, o}; @@ -701,6 +707,80 @@ impl DataStore { Ok(exists) } + /// Deletes the "db_metadata_nexus" record for a Nexus ID, if it exists. + pub async fn database_nexus_access_delete( + &self, + opctx: &OpContext, + nexus_id: OmicronZoneUuid, + ) -> Result<(), Error> { + use nexus_db_schema::schema::db_metadata_nexus::dsl; + + opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; + let conn = &*self.pool_connection_authorized(&opctx).await?; + + diesel::delete( + dsl::db_metadata_nexus + .filter(dsl::nexus_id.eq(nexus_id.into_untyped_uuid())), + ) + .execute_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + Ok(()) + } + + /// Propagate the nexus records to the database if and only if + /// the blueprint is the current target. + /// + /// If any of these records already exist, they are unmodified. + pub async fn database_nexus_access_create( + &self, + opctx: &OpContext, + blueprint: &nexus_types::deployment::Blueprint, + ) -> Result<(), Error> { + opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; + + // TODO: Without https://github.com/oxidecomputer/omicron/pull/8863, we + // treat all Nexuses as active. Some will become "not_yet", depending on + // the Nexus Generation, once it exists. + let active_nexus_zones = blueprint + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter_map(|(_sled, zone_cfg)| { + if zone_cfg.zone_type.is_nexus() { + Some(zone_cfg) + } else { + None + } + }); + let new_nexuses = active_nexus_zones + .map(|z| DbMetadataNexus::new(z.id, DbMetadataNexusState::Active)) + .collect::>(); + + let conn = &*self.pool_connection_authorized(&opctx).await?; + self.transaction_if_current_blueprint_is( + &conn, + "database_nexus_access_create", + opctx, + blueprint.id, + |conn| { + let new_nexuses = new_nexuses.clone(); + async move { + use nexus_db_schema::schema::db_metadata_nexus::dsl; + + diesel::insert_into(dsl::db_metadata_nexus) + .values(new_nexuses) + .on_conflict(dsl::nexus_id) + .do_nothing() + .execute_async(&*conn) + .await?; + Ok(()) + } + .boxed() + }, + ) + .await + } + // Registers a Nexus instance as having active access to the database #[cfg(test)] async fn database_nexus_access_insert( diff --git a/nexus/reconfigurator/execution/src/database.rs b/nexus/reconfigurator/execution/src/database.rs new file mode 100644 index 00000000000..9652e53ec1a --- /dev/null +++ b/nexus/reconfigurator/execution/src/database.rs @@ -0,0 +1,24 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Manages deployment of records into the database. + +use anyhow::anyhow; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_types::deployment::Blueprint; + +/// Idempotently ensure that the Nexus records for the zones are populated +/// in the database. +pub(crate) async fn deploy_db_metadata_nexus_records( + opctx: &OpContext, + datastore: &DataStore, + blueprint: &Blueprint, +) -> Result<(), anyhow::Error> { + datastore + .database_nexus_access_create(opctx, blueprint) + .await + .map_err(|err| anyhow!(err))?; + Ok(()) +} diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index 43c09485557..5660fcfc06f 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -32,8 +32,10 @@ use tokio::sync::watch; use update_engine::StepSuccess; use update_engine::StepWarning; use update_engine::merge_anyhow_list; + mod clickhouse; mod cockroachdb; +mod database; mod dns; mod omicron_physical_disks; mod omicron_sled_config; @@ -196,6 +198,13 @@ pub async fn realize_blueprint( ) .into_shared(); + register_deploy_db_metadata_nexus_records_step( + &engine.for_component(ExecutionComponent::SledList), + &opctx, + datastore, + blueprint, + ); + register_deploy_sled_configs_step( &engine.for_component(ExecutionComponent::SledAgent), &opctx, @@ -390,6 +399,28 @@ fn register_sled_list_step<'a>( .register() } +fn register_deploy_db_metadata_nexus_records_step<'a>( + registrar: &ComponentRegistrar<'_, 'a>, + opctx: &'a OpContext, + datastore: &'a DataStore, + blueprint: &'a Blueprint, +) { + registrar + .new_step( + ExecutionStepId::Ensure, + "Ensure db_metadata_nexus_state records exist", + async move |_cx| { + database::deploy_db_metadata_nexus_records( + opctx, &datastore, &blueprint, + ) + .await + .context("ensuring db_metadata_nexus_state")?; + StepSuccess::new(()).into() + }, + ) + .register(); +} + fn register_deploy_sled_configs_step<'a>( registrar: &ComponentRegistrar<'_, 'a>, opctx: &'a OpContext, diff --git a/nexus/reconfigurator/execution/src/omicron_zones.rs b/nexus/reconfigurator/execution/src/omicron_zones.rs index 28c981fd90e..74e4625358d 100644 --- a/nexus/reconfigurator/execution/src/omicron_zones.rs +++ b/nexus/reconfigurator/execution/src/omicron_zones.rs @@ -72,10 +72,13 @@ async fn clean_up_expunged_zones_impl( )); let result = match &config.zone_type { - // Zones which need no cleanup work after expungement. - BlueprintZoneType::Nexus(_) => None, - // Zones which need cleanup after expungement. + BlueprintZoneType::Nexus(_) => Some( + datastore + .database_nexus_access_delete(&opctx, config.id) + .await + .map_err(|err| anyhow::anyhow!(err)), + ), BlueprintZoneType::CockroachDb(_) => { if decommission_cockroach { Some( From 403b1d7458a75e79b7d65bec99b5ed4005874ea5 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 26 Aug 2025 16:24:09 -0700 Subject: [PATCH 22/22] tests --- .../src/db/datastore/db_metadata.rs | 452 +++++++++++++++++- 1 file changed, 450 insertions(+), 2 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index 68e1052809b..55094660d0e 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -722,7 +722,7 @@ impl DataStore { dsl::db_metadata_nexus .filter(dsl::nexus_id.eq(nexus_id.into_untyped_uuid())), ) - .execute_async(&*conn) + .execute_async(conn) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; @@ -771,7 +771,7 @@ impl DataStore { .values(new_nexuses) .on_conflict(dsl::nexus_id) .do_nothing() - .execute_async(&*conn) + .execute_async(conn) .await?; Ok(()) } @@ -1115,8 +1115,35 @@ mod test { use crate::db::pub_test_utils::TestDatabase; use camino::Utf8Path; use camino_tempfile::Utf8TempDir; + use id_map::IdMap; use nexus_db_model::SCHEMA_VERSION; + use nexus_inventory::now_db_precision; + use nexus_types::deployment::Blueprint; + use nexus_types::deployment::BlueprintHostPhase2DesiredSlots; + use nexus_types::deployment::BlueprintSledConfig; + use nexus_types::deployment::BlueprintTarget; + use nexus_types::deployment::BlueprintZoneConfig; + use nexus_types::deployment::BlueprintZoneDisposition; + use nexus_types::deployment::BlueprintZoneImageSource; + use nexus_types::deployment::BlueprintZoneType; + use nexus_types::deployment::CockroachDbPreserveDowngrade; + use nexus_types::deployment::OximeterReadMode; + use nexus_types::deployment::PendingMgsUpdates; + use nexus_types::deployment::PlanningReport; + use nexus_types::deployment::blueprint_zone_type; + use nexus_types::external_api::views::SledState; + use nexus_types::inventory::NetworkInterface; + use nexus_types::inventory::NetworkInterfaceKind; + use omicron_common::api::external::Generation; + use omicron_common::api::external::MacAddr; + use omicron_common::api::external::Vni; + use omicron_common::zpool_name::ZpoolName; use omicron_test_utils::dev; + use omicron_uuid_kinds::BlueprintUuid; + use omicron_uuid_kinds::ExternalIpUuid; + use omicron_uuid_kinds::SledUuid; + use omicron_uuid_kinds::ZpoolUuid; + use std::collections::BTreeMap; // Confirms that calling the internal "ensure_schema" function can succeed // when the database is already at that version. @@ -1993,4 +2020,425 @@ mod test { db.terminate().await; logctx.cleanup_successful(); } + + fn create_test_blueprint( + nexus_zones: Vec<(OmicronZoneUuid, BlueprintZoneDisposition)>, + ) -> Blueprint { + let blueprint_id = BlueprintUuid::new_v4(); + let sled_id = SledUuid::new_v4(); + + let zones: IdMap = nexus_zones + .into_iter() + .map(|(zone_id, disposition)| BlueprintZoneConfig { + disposition, + id: zone_id, + filesystem_pool: ZpoolName::new_external(ZpoolUuid::new_v4()), + zone_type: BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { + internal_address: "[::1]:0".parse().unwrap(), + external_dns_servers: Vec::new(), + external_ip: nexus_types::deployment::OmicronZoneExternalFloatingIp { + id: ExternalIpUuid::new_v4(), + ip: std::net::IpAddr::V6(std::net::Ipv6Addr::LOCALHOST), + }, + external_tls: true, + nic: NetworkInterface { + id: uuid::Uuid::new_v4(), + kind: NetworkInterfaceKind::Service { + id: zone_id.into_untyped_uuid(), + }, + name: "test-nic".parse().unwrap(), + ip: "192.168.1.1".parse().unwrap(), + mac: MacAddr::random_system(), + subnet: ipnetwork::IpNetwork::V4( + "192.168.1.0/24".parse().unwrap() + ).into(), + vni: Vni::try_from(100).unwrap(), + primary: true, + slot: 0, + transit_ips: Vec::new(), + }, + }), + image_source: BlueprintZoneImageSource::InstallDataset, + }) + .collect(); + + let mut sleds = BTreeMap::new(); + sleds.insert( + sled_id, + BlueprintSledConfig { + state: SledState::Active, + sled_agent_generation: Generation::new(), + zones, + disks: IdMap::new(), + datasets: IdMap::new(), + remove_mupdate_override: None, + host_phase_2: BlueprintHostPhase2DesiredSlots::current_contents( + ), + }, + ); + + Blueprint { + id: blueprint_id, + sleds, + pending_mgs_updates: PendingMgsUpdates::new(), + parent_blueprint_id: None, + internal_dns_version: Generation::new(), + external_dns_version: Generation::new(), + target_release_minimum_generation: Generation::new(), + cockroachdb_fingerprint: String::new(), + cockroachdb_setting_preserve_downgrade: + CockroachDbPreserveDowngrade::DoNotModify, + clickhouse_cluster_config: None, + oximeter_read_mode: OximeterReadMode::SingleNode, + oximeter_read_version: Generation::new(), + time_created: now_db_precision(), + creator: "test suite".to_string(), + comment: "test blueprint".to_string(), + report: PlanningReport::new(blueprint_id), + } + } + + #[tokio::test] + async fn test_database_nexus_access_create() { + let logctx = dev::test_setup_log("test_database_nexus_access_create"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let datastore = db.datastore(); + let opctx = db.opctx(); + + // Create a blueprint with two in-service Nexus zones, + // and one expunged Nexus. + let nexus1_id = OmicronZoneUuid::new_v4(); + let nexus2_id = OmicronZoneUuid::new_v4(); + let expunged_nexus = OmicronZoneUuid::new_v4(); + let blueprint = create_test_blueprint(vec![ + (nexus1_id, BlueprintZoneDisposition::InService), + (nexus2_id, BlueprintZoneDisposition::InService), + ( + expunged_nexus, + BlueprintZoneDisposition::Expunged { + as_of_generation: Generation::new(), + ready_for_cleanup: true, + }, + ), + ]); + + // Insert the blueprint and make it the target + datastore + .blueprint_insert(&opctx, &blueprint) + .await + .expect("Failed to insert blueprint"); + datastore + .blueprint_target_set_current( + &opctx, + BlueprintTarget { + target_id: blueprint.id, + enabled: false, + time_made_target: chrono::Utc::now(), + }, + ) + .await + .expect("Failed to set blueprint target"); + + // Create nexus access records + datastore + .database_nexus_access_create(&opctx, &blueprint) + .await + .expect("Failed to create nexus access"); + + // Verify records were created with Active state + let nexus1_access = datastore + .database_nexus_access(nexus1_id) + .await + .expect("Failed to get nexus1 access"); + let nexus2_access = datastore + .database_nexus_access(nexus2_id) + .await + .expect("Failed to get nexus2 access"); + let expunged_access = datastore + .database_nexus_access(expunged_nexus) + .await + .expect("Failed to get expunged access"); + + assert!(nexus1_access.is_some(), "nexus1 should have access record"); + assert!(nexus2_access.is_some(), "nexus2 should have access record"); + assert!( + expunged_access.is_none(), + "expunged nexus should not have access record" + ); + + let nexus1_record = nexus1_access.unwrap(); + let nexus2_record = nexus2_access.unwrap(); + assert_eq!(nexus1_record.state(), DbMetadataNexusState::Active); + assert_eq!(nexus2_record.state(), DbMetadataNexusState::Active); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_database_nexus_access_create_idempotent() { + let logctx = + dev::test_setup_log("test_database_nexus_access_create_idempotent"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let datastore = db.datastore(); + let opctx = db.opctx(); + + // Create a blueprint with one Nexus zone + let nexus_id = OmicronZoneUuid::new_v4(); + let blueprint = create_test_blueprint(vec![( + nexus_id, + BlueprintZoneDisposition::InService, + )]); + + // Insert the blueprint and make it the target + datastore + .blueprint_insert(&opctx, &blueprint) + .await + .expect("Failed to insert blueprint"); + datastore + .blueprint_target_set_current( + &opctx, + BlueprintTarget { + target_id: blueprint.id, + enabled: false, + time_made_target: chrono::Utc::now(), + }, + ) + .await + .expect("Failed to set blueprint target"); + + // Create nexus access records (first time) + datastore + .database_nexus_access_create(&opctx, &blueprint) + .await + .expect("Failed to create nexus access (first time)"); + + // Verify record was created + async fn confirm_state( + datastore: &DataStore, + nexus_id: OmicronZoneUuid, + expected_state: DbMetadataNexusState, + ) { + let state = datastore + .database_nexus_access(nexus_id) + .await + .expect("Failed to get nexus access after first create") + .expect("Entry for Nexus should have been inserted"); + assert_eq!(state.state(), expected_state); + } + + confirm_state(datastore, nexus_id, DbMetadataNexusState::Active).await; + + // Creating the record again: not an error. + datastore + .database_nexus_access_create(&opctx, &blueprint) + .await + .expect("Failed to create nexus access (first time)"); + confirm_state(datastore, nexus_id, DbMetadataNexusState::Active).await; + + // Manually make the record "Quiesced". + use nexus_db_schema::schema::db_metadata_nexus::dsl; + diesel::update(dsl::db_metadata_nexus) + .filter(dsl::nexus_id.eq(nexus_id.into_untyped_uuid())) + .set(dsl::state.eq(DbMetadataNexusState::Quiesced)) + .execute_async( + &*datastore.pool_connection_unauthorized().await.unwrap(), + ) + .await + .expect("Failed to update record"); + confirm_state(datastore, nexus_id, DbMetadataNexusState::Quiesced) + .await; + + // Create nexus access records another time - should be idempotent, + // but should be "on-conflict, ignore". + datastore + .database_nexus_access_create(&opctx, &blueprint) + .await + .expect("Failed to create nexus access (second time)"); + confirm_state(datastore, nexus_id, DbMetadataNexusState::Quiesced) + .await; + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_database_nexus_access_create_fails_wrong_target_blueprint() { + let logctx = dev::test_setup_log( + "test_database_nexus_access_create_fails_wrong_target_blueprint", + ); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let datastore = db.datastore(); + let opctx = db.opctx(); + + // Create two different blueprints + let nexus_id = OmicronZoneUuid::new_v4(); + let target_blueprint = create_test_blueprint(vec![( + nexus_id, + BlueprintZoneDisposition::InService, + )]); + let non_target_blueprint = create_test_blueprint(vec![( + nexus_id, + BlueprintZoneDisposition::InService, + )]); + + // Insert both blueprints + datastore + .blueprint_insert(&opctx, &target_blueprint) + .await + .expect("Failed to insert target blueprint"); + datastore + .blueprint_insert(&opctx, &non_target_blueprint) + .await + .expect("Failed to insert non-target blueprint"); + + // Set the first blueprint as the target + datastore + .blueprint_target_set_current( + &opctx, + BlueprintTarget { + target_id: target_blueprint.id, + enabled: false, + time_made_target: chrono::Utc::now(), + }, + ) + .await + .expect("Failed to set target blueprint"); + + // Try to create nexus access records using the non-target blueprint. + // This should fail because the transaction should check if the + // blueprint is the current target + let result = datastore + .database_nexus_access_create(&opctx, &non_target_blueprint) + .await; + assert!( + result.is_err(), + "Creating nexus access with wrong target blueprint should fail" + ); + + // Verify no records were created for the nexus + let access = datastore + .database_nexus_access(nexus_id) + .await + .expect("Failed to get nexus access"); + assert!( + access.is_none(), + "No access record should exist when wrong blueprint is used" + ); + + // Verify that using the correct target blueprint works + datastore + .database_nexus_access_create(&opctx, &target_blueprint) + .await + .expect( + "Creating nexus access with correct blueprint should succeed", + ); + + let access_after_correct = datastore + .database_nexus_access(nexus_id) + .await + .expect("Failed to get nexus access after correct blueprint"); + assert!( + access_after_correct.is_some(), + "Access record should exist after using correct target blueprint" + ); + + db.terminate().await; + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_database_nexus_access_delete() { + let logctx = dev::test_setup_log("test_database_nexus_access_delete"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let datastore = db.datastore(); + let opctx = db.opctx(); + + // Create test nexus IDs + let nexus1_id = OmicronZoneUuid::new_v4(); + let nexus2_id = OmicronZoneUuid::new_v4(); + + // Insert records directly using the test method + datastore + .database_nexus_access_insert( + nexus1_id, + DbMetadataNexusState::Active, + ) + .await + .expect("Failed to insert nexus1 access"); + datastore + .database_nexus_access_insert( + nexus2_id, + DbMetadataNexusState::NotYet, + ) + .await + .expect("Failed to insert nexus2 access"); + + // Verify records were created + let nexus1_before = datastore + .database_nexus_access(nexus1_id) + .await + .expect("Failed to get nexus1 access"); + let nexus2_before = datastore + .database_nexus_access(nexus2_id) + .await + .expect("Failed to get nexus2 access"); + assert!(nexus1_before.is_some(), "nexus1 should have access record"); + assert!(nexus2_before.is_some(), "nexus2 should have access record"); + + // Delete nexus1 record + datastore + .database_nexus_access_delete(&opctx, nexus1_id) + .await + .expect("Failed to delete nexus1 access"); + + // Verify nexus1 record was deleted, nexus2 record remains + let nexus1_after = datastore + .database_nexus_access(nexus1_id) + .await + .expect("Failed to get nexus1 access after delete"); + let nexus2_after = datastore + .database_nexus_access(nexus2_id) + .await + .expect("Failed to get nexus2 access after delete"); + assert!( + nexus1_after.is_none(), + "nexus1 should not have access record after delete" + ); + assert!( + nexus2_after.is_some(), + "nexus2 should still have access record" + ); + + // Delete nexus2 record + datastore + .database_nexus_access_delete(&opctx, nexus2_id) + .await + .expect("Failed to delete nexus2 access"); + + // Verify nexus2 record was also deleted + let nexus2_final = datastore + .database_nexus_access(nexus2_id) + .await + .expect("Failed to get nexus2 access after final delete"); + assert!( + nexus2_final.is_none(), + "nexus2 should not have access record after delete" + ); + + // Confirm deletion is idempotent + datastore + .database_nexus_access_delete(&opctx, nexus1_id) + .await + .expect("Failed to delete nexus1 access idempotently"); + + // This also means deleting non-existent records should be fine + datastore + .database_nexus_access_delete(&opctx, OmicronZoneUuid::new_v4()) + .await + .expect("Failed to delete non-existent record"); + + db.terminate().await; + logctx.cleanup_successful(); + } }