Skip to content

Commit 555af71

Browse files
authored
[nexus] disable blueprint autoplanner if there are too many blueprints (#9130)
We've had bugs in planning that have resulted in the database filling up with blueprints. If there's a bug, it's better to disable autoplanning than to tip the database over. This only affects the autoplanner -- manually-generated blueprints continue to work. Closes #9099.
1 parent b8efb9a commit 555af71

File tree

5 files changed

+403
-3
lines changed

5 files changed

+403
-3
lines changed

dev-tools/omdb/src/bin/omdb/nexus.rs

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1300,29 +1300,63 @@ fn print_task_blueprint_planner(details: &serde_json::Value) {
13001300
BlueprintPlannerStatus::Disabled => {
13011301
println!(" blueprint planning explicitly disabled by config!");
13021302
}
1303+
BlueprintPlannerStatus::LimitReached { limit, report } => {
1304+
println!(
1305+
" blueprint auto-planning disabled because \
1306+
current blueprint count >= limit ({limit}); planning report \
1307+
contains what would have been stored had the limit not been \
1308+
reached",
1309+
);
1310+
println!("{report}");
1311+
}
13031312
BlueprintPlannerStatus::Error(error) => {
13041313
println!(" task did not complete successfully: {error}");
13051314
}
1306-
BlueprintPlannerStatus::Unchanged { parent_blueprint_id, report } => {
1315+
BlueprintPlannerStatus::Unchanged {
1316+
parent_blueprint_id,
1317+
report,
1318+
blueprint_count,
1319+
limit,
1320+
} => {
13071321
println!(" plan unchanged from parent {parent_blueprint_id}");
1322+
println!(
1323+
" note: {}/{} blueprints in database",
1324+
blueprint_count, limit
1325+
);
13081326
println!("{report}");
13091327
}
13101328
BlueprintPlannerStatus::Planned {
13111329
parent_blueprint_id,
13121330
error,
13131331
report,
1332+
blueprint_count,
1333+
limit,
13141334
} => {
13151335
println!(
13161336
" planned new blueprint from parent {parent_blueprint_id}, \
13171337
but could not make it the target: {error}"
13181338
);
1339+
println!(
1340+
" note: {}/{} blueprints in database",
1341+
blueprint_count, limit
1342+
);
13191343
println!("{report}");
13201344
}
1321-
BlueprintPlannerStatus::Targeted { blueprint_id, report, .. } => {
1345+
BlueprintPlannerStatus::Targeted {
1346+
parent_blueprint_id: _,
1347+
blueprint_id,
1348+
report,
1349+
blueprint_count,
1350+
limit,
1351+
} => {
13221352
println!(
13231353
" planned new blueprint {blueprint_id}, \
13241354
and made it the current target"
13251355
);
1356+
println!(
1357+
" note: {}/{} blueprints in database",
1358+
blueprint_count, limit,
1359+
);
13261360
println!("{report}");
13271361
}
13281362
}

nexus/db-queries/src/db/datastore/deployment.rs

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,11 @@ use crate::context::OpContext;
99
use crate::db::datastore::SQL_BATCH_SIZE;
1010
use crate::db::pagination::Paginator;
1111
use crate::db::pagination::paginated;
12+
use crate::db::queries::ALLOW_FULL_TABLE_SCAN_SQL;
13+
use crate::db::raw_query_builder::QueryBuilder;
1214
use anyhow::Context;
1315
use async_bb8_diesel::AsyncRunQueryDsl;
16+
use async_bb8_diesel::AsyncSimpleConnection;
1417
use chrono::DateTime;
1518
use chrono::Utc;
1619
use clickhouse_admin_types::{KeeperId, ServerId};
@@ -564,6 +567,100 @@ impl DataStore {
564567
Ok(())
565568
}
566569

570+
/// Check whether the given blueprint limit has been reached.
571+
///
572+
/// This (necessarily) does a full table scan on the blueprint table up to
573+
/// the limit, so `limit` must be relatively small to avoid performance
574+
/// issues. Experimentally, on a Gimlet, a limit of a few thousand takes
575+
/// under 20ms.
576+
pub async fn check_blueprint_limit_reached(
577+
&self,
578+
opctx: &OpContext,
579+
limit: u64,
580+
) -> Result<BlueprintLimitReachedOutput, Error> {
581+
// The "full" table scan below is treated as a complex operation. (This
582+
// should only be called from the blueprint planner background task,
583+
// for which complex operations are allowed.)
584+
opctx.check_complex_operations_allowed()?;
585+
opctx.authorize(authz::Action::Read, &authz::BLUEPRINT_CONFIG).await?;
586+
let conn = self.pool_connection_authorized(opctx).await?;
587+
588+
let limit_i64 = i64::try_from(limit).map_err(|e| {
589+
Error::invalid_value(
590+
"limit",
591+
format!("limit cannot be converted to i64: {e}"),
592+
)
593+
})?;
594+
595+
let err = OptionalError::new();
596+
597+
let count = self
598+
.transaction_retry_wrapper("blueprint_count")
599+
.transaction(&conn, |conn| {
600+
let err = err.clone();
601+
602+
async move {
603+
// We need this to call the COUNT(*) query below. But note
604+
// that this isn't really a "full" table scan; the number of
605+
// rows scanned is limited by the LIMIT clause.
606+
conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL)
607+
.await
608+
.map_err(|e| err.bail(TransactionError::Database(e)))?;
609+
610+
// Rather than doing a full table scan, we use a LIMIT
611+
// clause to limit the number of rows returned.
612+
let mut count_star_sql = QueryBuilder::new();
613+
count_star_sql
614+
.sql(
615+
"SELECT COUNT(*) FROM \
616+
(SELECT 1 FROM omicron.public.blueprint \
617+
LIMIT $1)",
618+
)
619+
.bind::<diesel::sql_types::BigInt, _>(limit_i64);
620+
621+
let query =
622+
count_star_sql.query::<diesel::sql_types::BigInt>();
623+
624+
// query.first_async fails with `the trait bound
625+
// `TypedSqlQuery<BigInt>: diesel::Table` is not satisfied`.
626+
// So we use load_async, knowing that only one row will be
627+
// returned.
628+
let value = query
629+
.load_async::<i64>(&conn)
630+
.await
631+
.map_err(|e| err.bail(TransactionError::Database(e)))?;
632+
633+
Ok(value)
634+
}
635+
})
636+
.await
637+
.map_err(|e| match err.take() {
638+
Some(err) => err.into(),
639+
None => public_error_from_diesel(e, ErrorHandler::Server),
640+
})?;
641+
642+
// There must be exactly one row in the returned result.
643+
let count = *count.get(0).ok_or_else(|| {
644+
Error::internal_error("error getting blueprint count from database")
645+
})?;
646+
647+
let count = u64::try_from(count).map_err(|_| {
648+
Error::internal_error(&format!(
649+
"error converting blueprint count {} into \
650+
u64 (how is it negative?)",
651+
count
652+
))
653+
})?;
654+
655+
// Note count >= limit (and not count > limit): for a limit of 5000 we
656+
// want to fail if it's reached 5000.
657+
if count >= limit {
658+
Ok(BlueprintLimitReachedOutput::Yes)
659+
} else {
660+
Ok(BlueprintLimitReachedOutput::No { count })
661+
}
662+
}
663+
567664
/// Read a complete blueprint from the database
568665
pub async fn blueprint_read(
569666
&self,
@@ -2602,6 +2699,12 @@ async fn insert_pending_mgs_update(
26022699
Ok(())
26032700
}
26042701

2702+
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
2703+
pub enum BlueprintLimitReachedOutput {
2704+
No { count: u64 },
2705+
Yes,
2706+
}
2707+
26052708
// Helper to process BpPendingMgsUpdateComponent rows
26062709
fn process_update_row<T>(
26072710
row: T,

nexus/db-queries/src/db/datastore/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ mod zpool;
126126
pub use address_lot::AddressLotCreateResult;
127127
pub use db_metadata::DatastoreSetupAction;
128128
pub use db_metadata::ValidatedDatastoreSetupAction;
129+
pub use deployment::BlueprintLimitReachedOutput;
129130
pub use dns::DataStoreDnsTest;
130131
pub use dns::DnsVersionUpdateBuilder;
131132
pub use ereport::EreportFilters;

0 commit comments

Comments
 (0)