diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index bea3638fd5..558e60ec1b 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -4,7 +4,7 @@ //! [`DataStore`] methods on Database Metadata. -use super::{DataStore, DbConnection}; +use super::{DataStore, DbConnection, IdentityCheckPolicy}; use crate::authz; use crate::context::OpContext; @@ -16,6 +16,7 @@ use futures::FutureExt; use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; use nexus_db_model::AllSchemaVersions; +use nexus_db_model::DB_METADATA_NEXUS_SCHEMA_VERSION; use nexus_db_model::DbMetadataNexus; use nexus_db_model::DbMetadataNexusState; use nexus_db_model::EARLIEST_SUPPORTED_VERSION; @@ -108,7 +109,294 @@ fn skippable_version( return false; } +/// Describes the state of the database access with respect to this Nexus +#[derive(Debug, Copy, Clone, PartialEq)] +enum NexusAccess { + /// Nexus does not yet have access to the database, but can take over when + /// the current-generation Nexus instances quiesce. + DoesNotHaveAccessYet { nexus_id: OmicronZoneUuid }, + + /// Nexus has been permanently, explicitly locked out of the database. + LockedOut, + + /// Nexus should have normal access to the database + /// + /// We have a record of this Nexus, and it should have access. + HasExplicitAccess, + + /// Nexus should have normal access to the database + /// + /// We may or may not have a record of this Nexus, but it should have + /// access. + HasImplicitAccess, + + /// Nexus does not yet have access to the database, but it might get + /// access later. Unlike [`Self::DoesNotHaveAccessYet`], this variant + /// is triggered because we don't have an explicit records. + /// + /// Although some Nexuses have records, this one doesn't. This can + /// mean that a Nexus zone has just been deployed, and booted before + /// its record has been populated. + NoRecordNoAccess, +} + +/// Describes the state of the schema with respect this Nexus +#[derive(Debug, Copy, Clone, PartialEq)] +enum SchemaStatus { + /// The database schema matches what we want + UpToDate, + + /// The database schema is newer than what we want + NewerThanDesired, + + /// The database schema is older than what we want + OlderThanDesired, + + /// The database schema is older than what we want, and it's + /// so old, it does not know about the "db_metadata_nexus" table. + /// + /// We should avoid accessing the "db_metadata_nexus" tables to check + /// access, because the schema for these tables may not exist. + /// + /// TODO: This may be removed, once we're confident deployed systems + /// have upgraded past DB_METADATA_NEXUS_SCHEMA_VERSION. + OlderThanDesiredSkipAccessCheck, +} + +/// Describes what setup is necessary for DataStore creation +#[derive(Debug, Copy, Clone, PartialEq)] +pub enum DatastoreSetupAction { + /// Normal operation: The database is ready for usage + Ready, + + /// Not ready for usage yet + /// + /// The database may be ready for usage once handoff has completed. + /// The `nexus_id` here may attempt to takeover the database if it has + /// a `db_metadata_nexus` record of "not_yet", and all other records + /// are either "not_yet" or "quiesced". + NeedsHandoff { nexus_id: OmicronZoneUuid }, + + /// Wait, then try to set up the datastore later. + /// + /// This can be triggered by observing incomplete data, such as missing + /// records in the "db_metadata_nexus" table, which may be populated by + /// waiting for an existing system to finish execution. + TryLater, + + /// Start a schema update + Update, + + /// Permanently refuse to use the database + Refuse, +} + +/// Committment that the database is willing to perform a +/// [`DatastoreSetupAction`] to a desired schema [`Version`]. +/// +/// Can be created through [`DataStore::check_schema_and_access`] +#[derive(Clone)] +pub struct ValidatedDatastoreSetupAction { + action: DatastoreSetupAction, + desired: Version, +} + +impl ValidatedDatastoreSetupAction { + pub fn action(&self) -> &DatastoreSetupAction { + &self.action + } + + pub fn desired_version(&self) -> &Version { + &self.desired + } +} + +impl DatastoreSetupAction { + // Interprets the combination of access and status to decide what action + // should be taken. + fn new(access: NexusAccess, status: SchemaStatus) -> Self { + use NexusAccess::*; + use SchemaStatus::*; + + match (access, status) { + // Nexus has been explicitly locked-out of using the database + (LockedOut, _) => Self::Refuse, + + // The schema updated beyond what we want, do not use it. + (_, NewerThanDesired) => Self::Refuse, + + // If we aren't sure if we have access yet, try again later. + ( + NoRecordNoAccess, + UpToDate | OlderThanDesired | OlderThanDesiredSkipAccessCheck, + ) => Self::TryLater, + + // If we don't have access yet, but could do something once handoff + // occurs, then handoff is needed + ( + DoesNotHaveAccessYet { nexus_id }, + UpToDate | OlderThanDesired | OlderThanDesiredSkipAccessCheck, + ) => Self::NeedsHandoff { nexus_id }, + + // This is the most "normal" case: Nexus should have access to the + // database, and the schema matches what it wants. + (HasExplicitAccess | HasImplicitAccess, UpToDate) => Self::Ready, + + // If this Nexus is allowed to access the schema, but it looks + // older than what we expect, we'll need to update the schema to + // use it. + ( + HasExplicitAccess | HasImplicitAccess, + OlderThanDesired | OlderThanDesiredSkipAccessCheck, + ) => Self::Update, + } + } +} + impl DataStore { + // Checks if the specified Nexus has access to the database. + async fn check_nexus_access( + &self, + nexus_id: OmicronZoneUuid, + ) -> Result { + // Check if any "db_metadata_nexus" rows exist. + // If they don't exist, treat the database as having access. + // + // This handles the case for fresh deployments where RSS hasn't + // populated the table yet (we need access to finish + // "rack_initialization"). + // + // After initialization, this conditional should never trigger + // again. + let any_records_exist = self.database_nexus_access_any_exist().await?; + if !any_records_exist { + warn!( + &self.log, + "No db_metadata_nexus records exist - skipping access check"; + "nexus_id" => ?nexus_id, + "explanation" => "This is expected during initial deployment \ + or before migration" + ); + return Ok(NexusAccess::HasImplicitAccess); + } + + // Records exist, so enforce the identity check + let Some(state) = + self.database_nexus_access(nexus_id).await?.map(|s| s.state()) + else { + let msg = "Nexus does not have access to the database (no \ + db_metadata_nexus record)"; + warn!(&self.log, "{msg}"; "nexus_id" => ?nexus_id); + return Ok(NexusAccess::NoRecordNoAccess); + }; + + let status = match state { + DbMetadataNexusState::Active => { + info!( + &self.log, + "Nexus has access to the database"; + "nexus_id" => ?nexus_id + ); + NexusAccess::HasExplicitAccess + } + DbMetadataNexusState::NotYet => { + info!( + &self.log, + "Nexus does not yet have access to the database"; + "nexus_id" => ?nexus_id + ); + NexusAccess::DoesNotHaveAccessYet { nexus_id } + } + DbMetadataNexusState::Quiesced => { + let msg = "Nexus locked out of database access (quiesced)"; + error!(&self.log, "{msg}"; "nexus_id" => ?nexus_id); + NexusAccess::LockedOut + } + }; + Ok(status) + } + + // Checks the schema against a desired version. + async fn check_schema( + &self, + desired_version: Version, + ) -> Result { + let (found_version, _found_target_version) = self + .database_schema_version() + .await + .context("Cannot read database schema version")?; + + let log = self.log.new(o!( + "found_version" => found_version.to_string(), + "desired_version" => desired_version.to_string(), + )); + + use std::cmp::Ordering; + match found_version.cmp(&desired_version) { + Ordering::Less => { + warn!(log, "Found schema version is older than desired"); + if found_version < DB_METADATA_NEXUS_SCHEMA_VERSION { + Ok(SchemaStatus::OlderThanDesiredSkipAccessCheck) + } else { + Ok(SchemaStatus::OlderThanDesired) + } + } + Ordering::Equal => { + info!(log, "Database schema version is up to date"); + Ok(SchemaStatus::UpToDate) + } + Ordering::Greater => { + error!(log, "Found schema version is newer than desired"); + Ok(SchemaStatus::NewerThanDesired) + } + } + } + + /// Compares the state of the schema with the expectations of the + /// currently running Nexus. + /// + /// - `identity_check`: Describes whether or not the identity of the + /// calling Nexus should be validated before returning database access + /// - `desired_version`: The version of the database schema this + /// Nexus wants. + pub async fn check_schema_and_access( + &self, + identity_check: IdentityCheckPolicy, + desired_version: Version, + ) -> Result { + let schema_status = self.check_schema(desired_version.clone()).await?; + + let nexus_access = match identity_check { + IdentityCheckPolicy::CheckAndTakeover { nexus_id } => { + match schema_status { + // If we don't think the "db_metadata_nexus" tables exist in + // the schema yet, treat them as implicitly having access. + // + // TODO: This may be removed, once we're confident deployed + // systems have upgraded past + // DB_METADATA_NEXUS_SCHEMA_VERSION. + SchemaStatus::OlderThanDesiredSkipAccessCheck => { + NexusAccess::HasImplicitAccess + } + _ => self.check_nexus_access(nexus_id).await?, + } + } + IdentityCheckPolicy::DontCare => { + // If a "nexus_id" was not supplied, skip the check, and treat + // it as having access. + // + // This is necessary for tools which access the schema without a + // running Nexus, such as the schema-updater binary. + NexusAccess::HasImplicitAccess + } + }; + + Ok(ValidatedDatastoreSetupAction { + action: DatastoreSetupAction::new(nexus_access, schema_status), + desired: desired_version, + }) + } + // Ensures that the database schema matches "desired_version". // // - Updating the schema makes the database incompatible with older @@ -350,7 +638,6 @@ impl DataStore { } // Returns the access this Nexus has to the database - #[cfg(test)] async fn database_nexus_access( &self, nexus_id: OmicronZoneUuid, @@ -369,6 +656,12 @@ impl DataStore { Ok(nexus_access) } + // Checks if any db_metadata_nexus records exist in the database + async fn database_nexus_access_any_exist(&self) -> Result { + let conn = self.pool_connection_unauthorized().await?; + Self::database_nexus_access_any_exist_on_connection(&conn).await + } + // Checks if any db_metadata_nexus records exist in the database using an // existing connection async fn database_nexus_access_any_exist_on_connection( @@ -985,6 +1278,317 @@ mod test { logctx.cleanup_successful(); } + // This test covers two cases: + // + // 1. New systems: We use RSS to initialize Nexus, but no db_metadata_nexus + // entries exist. + // 2. Deployed systems: We have a deployed system which updates to have this + // "db_metadata_nexus"-handling code, but has no rows in that table. + // + // Both of these cases must be granted database access to self-populate + // later. + #[tokio::test] + async fn test_check_schema_and_access_empty_table_permits_access() { + let logctx = dev::test_setup_log( + "test_check_schema_and_access_empty_table_permits_access", + ); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let datastore = + DataStore::new_unchecked(logctx.log.clone(), db.pool().clone()); + + let nexus_id = OmicronZoneUuid::new_v4(); + + // With an empty table, even explicit nexus ID should get access + let action = datastore + .check_schema_and_access( + IdentityCheckPolicy::CheckAndTakeover { nexus_id }, + SCHEMA_VERSION, + ) + .await + .expect("Failed to check schema and access"); + assert_eq!(action.action(), &DatastoreSetupAction::Ready); + + // Add a record to the table, now explicit nexus ID should NOT get + // access + datastore + .database_nexus_access_insert( + OmicronZoneUuid::new_v4(), // Different nexus + DbMetadataNexusState::Active, + ) + .await + .expect("Failed to insert nexus record"); + + let action = datastore + .check_schema_and_access( + IdentityCheckPolicy::CheckAndTakeover { nexus_id }, + SCHEMA_VERSION, + ) + .await + .expect("Failed to check schema and access"); + assert_eq!(action.action(), &DatastoreSetupAction::TryLater); + + db.terminate().await; + logctx.cleanup_successful(); + } + + // Validates the case where a Nexus ID is explicitly requested or omitted. + // + // The omission case is important for the "schema-updater" binary to keep working. + #[tokio::test] + async fn test_check_schema_and_access_nexus_id() { + let logctx = + dev::test_setup_log("test_check_schema_and_access_nexus_id"); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let datastore = + DataStore::new_unchecked(logctx.log.clone(), db.pool().clone()); + + // Add an active record, for some Nexus ID. + datastore + .database_nexus_access_insert( + OmicronZoneUuid::new_v4(), + DbMetadataNexusState::Active, + ) + .await + .expect("Failed to insert nexus record"); + + // Using 'DontCare' as a nexus ID should get access (schema updater case) + let action = datastore + .check_schema_and_access( + IdentityCheckPolicy::DontCare, + SCHEMA_VERSION, + ) + .await + .expect("Failed to check schema and access"); + assert_eq!(action.action(), &DatastoreSetupAction::Ready); + + // Explicit CheckAndTakeover with a Nexus ID that doesn't exist should + // not get access, and should be told to retry later. + let nexus_id = OmicronZoneUuid::new_v4(); + let action = datastore + .check_schema_and_access( + IdentityCheckPolicy::CheckAndTakeover { nexus_id }, + SCHEMA_VERSION, + ) + .await + .expect("Failed to check schema and access"); + assert_eq!(action.action(), &DatastoreSetupAction::TryLater); + + db.terminate().await; + logctx.cleanup_successful(); + } + + // Validates that an explicit db_metadata_nexus record can lock-out Nexuses which should not be + // able to access the database. + #[tokio::test] + async fn test_check_schema_and_access_lockout_refuses_access() { + let logctx = dev::test_setup_log( + "test_check_schema_and_access_lockout_refuses_access", + ); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let datastore = + DataStore::new_unchecked(logctx.log.clone(), db.pool().clone()); + + let nexus_id = OmicronZoneUuid::new_v4(); + + // Insert our nexus as quiesced (locked out) + datastore + .database_nexus_access_insert( + nexus_id, + DbMetadataNexusState::Quiesced, + ) + .await + .expect("Failed to insert nexus record"); + + // Should refuse access + let action = datastore + .check_schema_and_access( + IdentityCheckPolicy::CheckAndTakeover { nexus_id }, + SCHEMA_VERSION, + ) + .await + .expect("Failed to check schema and access"); + assert_eq!(action.action(), &DatastoreSetupAction::Refuse); + + db.terminate().await; + logctx.cleanup_successful(); + } + + // Validates that if a Nexus with an "old desired schema" boots, it cannot access the + // database under any conditions. + // + // This is the case where the database has upgraded beyond what Nexus can understand. + // + // In practice, the db_metadata_nexus records should prevent this situation from occurring, + // but it's still a useful property to reject old schemas while the "schema-updater" binary + // exists. + #[tokio::test] + async fn test_check_schema_and_access_schema_too_new() { + let logctx = + dev::test_setup_log("test_check_schema_and_access_schema_too_new"); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let datastore = + DataStore::new_unchecked(logctx.log.clone(), db.pool().clone()); + + let nexus_id = OmicronZoneUuid::new_v4(); + + // Insert our nexus as active + datastore + .database_nexus_access_insert( + nexus_id, + DbMetadataNexusState::Active, + ) + .await + .expect("Failed to insert nexus record"); + + // Try to access with an older version than what's in the database + let older_version = Version::new(SCHEMA_VERSION.major - 1, 0, 0); + + // Explicit Nexus ID: Rejected + let action = datastore + .check_schema_and_access( + IdentityCheckPolicy::CheckAndTakeover { nexus_id }, + older_version.clone(), + ) + .await + .expect("Failed to check schema and access"); + assert_eq!(action.action(), &DatastoreSetupAction::Refuse); + + // Implicit Access: Rejected + let action = datastore + .check_schema_and_access( + IdentityCheckPolicy::DontCare, + older_version.clone(), + ) + .await + .expect("Failed to check schema and access"); + assert_eq!(action.action(), &DatastoreSetupAction::Refuse); + + db.terminate().await; + logctx.cleanup_successful(); + } + + // Validates that the schema + access combinations identify we should wait for handoff + // when we have a "NotYet" record that could become compatible with the database. + #[tokio::test] + async fn test_check_schema_and_access_wait_for_handoff() { + let logctx = dev::test_setup_log( + "test_check_schema_and_access_wait_for_handoff", + ); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let datastore = + DataStore::new_unchecked(logctx.log.clone(), db.pool().clone()); + + let nexus_id = OmicronZoneUuid::new_v4(); + + // Insert our nexus as not_yet (doesn't have access yet) + datastore + .database_nexus_access_insert( + nexus_id, + DbMetadataNexusState::NotYet, + ) + .await + .expect("Failed to insert nexus record"); + + // We should wait for handoff if the versions match, or if our desired + // version is newer than what exists in the database. + let current_version = SCHEMA_VERSION; + let newer_version = Version::new(SCHEMA_VERSION.major + 1, 0, 0); + let versions = [current_version, newer_version]; + + for version in &versions { + // Should wait for handoff when schema is up-to-date + let action = datastore + .check_schema_and_access( + IdentityCheckPolicy::CheckAndTakeover { nexus_id }, + version.clone(), + ) + .await + .expect("Failed to check schema and access"); + assert_eq!( + action.action(), + &DatastoreSetupAction::NeedsHandoff { nexus_id }, + ); + } + + db.terminate().await; + logctx.cleanup_successful(); + } + + // Validates the "normal case", where a Nexus has access and the schema already matches. + #[tokio::test] + async fn test_check_schema_and_access_normal_use() { + let logctx = + dev::test_setup_log("test_check_schema_and_access_normal_use"); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let datastore = + DataStore::new_unchecked(logctx.log.clone(), db.pool().clone()); + + let nexus_id = OmicronZoneUuid::new_v4(); + + // Insert our nexus as active + datastore + .database_nexus_access_insert( + nexus_id, + DbMetadataNexusState::Active, + ) + .await + .expect("Failed to insert nexus record"); + + // With current schema version, should be ready for normal use + let action = datastore + .check_schema_and_access( + IdentityCheckPolicy::CheckAndTakeover { nexus_id }, + SCHEMA_VERSION, + ) + .await + .expect("Failed to check schema and access"); + + assert_eq!(action.action(), &DatastoreSetupAction::Ready); + assert_eq!(action.desired_version(), &SCHEMA_VERSION); + + db.terminate().await; + logctx.cleanup_successful(); + } + + // Validates that when a Nexus is active with a newer-than-database desired + // version, it will request an update + #[tokio::test] + async fn test_check_schema_and_access_update_now() { + let logctx = + dev::test_setup_log("test_check_schema_and_access_update_now"); + let db = TestDatabase::new_with_pool(&logctx.log).await; + let datastore = + DataStore::new_unchecked(logctx.log.clone(), db.pool().clone()); + + let nexus_id = OmicronZoneUuid::new_v4(); + + // Insert our nexus as active + datastore + .database_nexus_access_insert( + nexus_id, + DbMetadataNexusState::Active, + ) + .await + .expect("Failed to insert nexus record"); + + let newer_version = Version::new(SCHEMA_VERSION.major + 1, 0, 0); + + // With a newer desired version, should request update + let action = datastore + .check_schema_and_access( + IdentityCheckPolicy::CheckAndTakeover { nexus_id }, + newer_version.clone(), + ) + .await + .expect("Failed to check schema and access"); + + assert_eq!(action.action(), &DatastoreSetupAction::Update); + assert_eq!(action.desired_version(), &newer_version); + + db.terminate().await; + logctx.cleanup_successful(); + } + fn create_test_blueprint( nexus_zones: Vec<(OmicronZoneUuid, BlueprintZoneDisposition)>, ) -> Blueprint { diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 76862d9276..23642d9334 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -39,7 +39,7 @@ use omicron_common::api::external::ResourceType; use omicron_common::backoff::{ BackoffError, retry_notify, retry_policy_internal_service, }; -use omicron_uuid_kinds::{GenericUuid, SledUuid}; +use omicron_uuid_kinds::{GenericUuid, OmicronZoneUuid, SledUuid}; use slog::Logger; use std::net::Ipv6Addr; use std::num::NonZeroU32; @@ -189,6 +189,21 @@ impl RunnableQuery for T where { } +/// Specifies whether the consumer wants to check whether they're allowed to +/// access the database based on the `db_metadata_nexus` table. +#[derive(Debug, Clone, Copy)] +pub enum IdentityCheckPolicy { + /// The consumer wants full access to the database regardless of the current + /// upgrade / handoff state. This would be used by almost all tools and + /// tests. + DontCare, + + /// The consumer only wants to access the database if it's in the current + /// set of Nexus instances that's supposed to be able to access it. If + /// possible and legal, take over access from the existing set. + CheckAndTakeover { nexus_id: OmicronZoneUuid }, +} + pub struct DataStore { log: Logger, pool: Arc,