Skip to content

Commit c5cb399

Browse files
authored
[Reconfigurator] Don't reevaluate impossible MGS updates for a few minutes (#8760)
This PR adds `ignore_impossible_mgs_updates_since` to `PlanningInput` and changes the planner so that, when it deems an MGS update impossible, it keeps the update rather than reevaluating it if the update is newer than `ignore_impossible_mgs_updates_since`. This avoids churn when the act of updating itself renders the planned update (temporarily) impossible. See #8483 for context. Fixes #8483. Also fixes #8484 (not directly related to the main changes, but the fix is just a couple of new lines in `do_plan_mgs_updates()`). I'll test this on a racklette and post the results in a comment before merging.
1 parent 14197b3 commit c5cb399

File tree

13 files changed

+382
-66
lines changed

13 files changed

+382
-66
lines changed

dev-tools/reconfigurator-cli/src/lib.rs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
77
use anyhow::{Context, anyhow, bail};
88
use camino::{Utf8Path, Utf8PathBuf};
9+
use chrono::{DateTime, Utc};
910
use clap::{ArgAction, ValueEnum};
1011
use clap::{Args, Parser, Subcommand};
1112
use daft::Diffable;
@@ -1077,6 +1078,27 @@ enum SetArgs {
10771078
},
10781079
/// planner chicken switches
10791080
ChickenSwitches(SetChickenSwitchesArgs),
1081+
/// timestamp for ignoring impossible MGS updates
1082+
IgnoreImpossibleMgsUpdatesSince {
1083+
since: SetIgnoreImpossibleMgsUpdatesSinceArgs,
1084+
},
1085+
}
1086+
1087+
#[derive(Debug, Clone)]
1088+
struct SetIgnoreImpossibleMgsUpdatesSinceArgs(DateTime<Utc>);
1089+
1090+
impl FromStr for SetIgnoreImpossibleMgsUpdatesSinceArgs {
1091+
type Err = anyhow::Error;
1092+
1093+
fn from_str(s: &str) -> Result<Self, Self::Err> {
1094+
if s.eq_ignore_ascii_case("now") {
1095+
return Ok(Self(Utc::now()));
1096+
}
1097+
if let Ok(datetime) = humantime::parse_rfc3339(s) {
1098+
return Ok(Self(datetime.into()));
1099+
}
1100+
bail!("invalid timestamp: expected `now` or an RFC3339 timestamp")
1101+
}
10801102
}
10811103

10821104
#[derive(Debug, Args)]
@@ -2394,6 +2416,16 @@ fn cmd_set(
23942416
)
23952417
}
23962418
}
2419+
SetArgs::IgnoreImpossibleMgsUpdatesSince { since } => {
2420+
state
2421+
.system_mut()
2422+
.description_mut()
2423+
.set_ignore_impossible_mgs_updates_since(since.0);
2424+
format!(
2425+
"ignoring impossible MGS updates since {}",
2426+
humantime::format_rfc3339_millis(since.0.into())
2427+
)
2428+
}
23972429
};
23982430

23992431
sim.commit_and_bump(format!("reconfigurator-cli set: {}", rv), state);

dev-tools/reconfigurator-cli/tests/input/cmds-target-release.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,9 @@ blueprint-diff 58d5e830-0884-47d8-a7cd-b2b3751adeb4 af934083-59b5-4bf6-8966-6fb5
5050

5151
# This time, make it more interesting. Change the inactive slot contents of
5252
# the simulated SP. This should make the configured update impossible and cause
53-
# the planner to fix it.
53+
# the planner to fix it. To test this, we also need to tell the planner not to
54+
# ignore this update even though it's quite new.
55+
set ignore-impossible-mgs-updates-since now
5456
sled-update-sp 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c --inactive 0.5.0
5557
inventory-generate
5658
blueprint-plan af934083-59b5-4bf6-8966-6fb5292c29e1 61f451b3-2121-4ed6-91c7-a550054f6c21

dev-tools/reconfigurator-cli/tests/output/cmds-add-sled-no-disks-stdout

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ INFO sufficient InternalDns zones exist in plan, desired_count: 3, current_count
4848
INFO sufficient ExternalDns zones exist in plan, desired_count: 3, current_count: 3
4949
INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3
5050
INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0
51-
WARN cannot issue more SP updates (no current artifacts)
51+
WARN cannot issue more MGS-driven updates (no current artifacts)
5252
INFO all zones up-to-date
5353
INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify
5454
generated blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 based on parent blueprint dbcbd3d6-41ff-48ae-ac0b-1becc9b2fd21

dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -543,7 +543,7 @@ WARN failed to place all new desired InternalDns zones, placed: 0, wanted_to_pla
543543
INFO sufficient ExternalDns zones exist in plan, desired_count: 0, current_count: 0
544544
WARN failed to place all new desired Nexus zones, placed: 0, wanted_to_place: 3
545545
INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0
546-
WARN cannot issue more SP updates (no current artifacts)
546+
WARN cannot issue more MGS-driven updates (no current artifacts)
547547
INFO some zones not yet up-to-date, sled_id: 89d02b1b-478c-401a-8e28-7a26f74fa41b, zones_currently_updating: [ZoneCurrentlyUpdating { zone_id: b3c9c041-d2f0-4767-bdaf-0e52e9d7a013 (service), zone_kind: InternalNtp, reason: MissingInInventory { bp_image_source: InstallDataset } }]
548548
INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify
549549
generated blueprint 86db3308-f817-4626-8838-4085949a6a41 based on parent blueprint ade5749d-bdf3-4fab-a8ae-00bea01b3a5a
@@ -1564,7 +1564,7 @@ INFO sufficient InternalDns zones exist in plan, desired_count: 3, current_count
15641564
INFO sufficient ExternalDns zones exist in plan, desired_count: 3, current_count: 3
15651565
INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3
15661566
INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0
1567-
WARN cannot issue more SP updates (no current artifacts)
1567+
WARN cannot issue more MGS-driven updates (no current artifacts)
15681568
INFO all zones up-to-date
15691569
INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify
15701570
generated blueprint 86db3308-f817-4626-8838-4085949a6a41 based on parent blueprint ade5749d-bdf3-4fab-a8ae-00bea01b3a5a

dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1036,7 +1036,7 @@ INFO sufficient InternalDns zones exist in plan, desired_count: 3, current_count
10361036
INFO added zone to sled, sled_id: 711ac7f8-d19e-4572-bdb9-e9b50f6e362a, kind: ExternalDns
10371037
INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3
10381038
INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0
1039-
WARN cannot issue more SP updates (no current artifacts)
1039+
WARN cannot issue more MGS-driven updates (no current artifacts)
10401040
INFO some zones not yet up-to-date, sled_id: 711ac7f8-d19e-4572-bdb9-e9b50f6e362a, zones_currently_updating: [ZoneCurrentlyUpdating { zone_id: fe2d5287-24e3-4071-b214-2640b097a759 (service), zone_kind: ExternalDns, reason: MissingInInventory { bp_image_source: InstallDataset } }]
10411041
INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify
10421042
generated blueprint 9c998c1d-1a7b-440a-ae0c-40f781dea6e2 based on parent blueprint 366b0b68-d80e-4bc1-abd3-dc69837847e0

dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1053,7 +1053,7 @@ INFO added zone to sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, kind: In
10531053
INFO sufficient ExternalDns zones exist in plan, desired_count: 3, current_count: 3
10541054
INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3
10551055
INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0
1056-
WARN cannot issue more SP updates (no current artifacts)
1056+
WARN cannot issue more MGS-driven updates (no current artifacts)
10571057
INFO some zones not yet up-to-date, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, zones_currently_updating: [ZoneCurrentlyUpdating { zone_id: e375dd21-320b-43b7-bc92-a2c3dac9d9e1 (service), zone_kind: InternalDns, reason: MissingInInventory { bp_image_source: InstallDataset } }]
10581058
INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify
10591059
generated blueprint af934083-59b5-4bf6-8966-6fb5292c29e1 based on parent blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4

dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2137,12 +2137,12 @@ INFO sufficient ExternalDns zones exist in plan, desired_count: 3, current_count
21372137
INFO sufficient Nexus zones exist in plan, desired_count: 3, current_count: 3
21382138
INFO sufficient Oximeter zones exist in plan, desired_count: 0, current_count: 0
21392139
WARN cannot configure SP update for board (no matching artifact), serial_number: serial0, part_number: model0
2140-
INFO skipping board for SP update, serial_number: serial0, part_number: model0
2140+
INFO skipping board for MGS-driven update, serial_number: serial0, part_number: model0
21412141
WARN cannot configure SP update for board (no matching artifact), serial_number: serial1, part_number: model1
2142-
INFO skipping board for SP update, serial_number: serial1, part_number: model1
2142+
INFO skipping board for MGS-driven update, serial_number: serial1, part_number: model1
21432143
WARN cannot configure SP update for board (no matching artifact), serial_number: serial2, part_number: model2
2144-
INFO skipping board for SP update, serial_number: serial2, part_number: model2
2145-
INFO ran out of boards for SP update
2144+
INFO skipping board for MGS-driven update, serial_number: serial2, part_number: model2
2145+
INFO ran out of boards for MGS-driven update
21462146
INFO some zones not yet up-to-date, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, zones_currently_updating: [ZoneCurrentlyUpdating { zone_id: 0c71b3b2-6ceb-4e8f-b020-b08675e83038 (service), zone_kind: Nexus, reason: ImageSourceMismatch { bp_image_source: Artifact { version: Available { version: ArtifactVersion("1.0.0") }, hash: ArtifactHash("0e32b4a3e5d3668bb1d6a16fb06b74dc60b973fa479dcee0aae3adbb52bf1388") }, inv_image_source: InstallDataset } }, ZoneCurrentlyUpdating { zone_id: 427ec88f-f467-42fa-9bbb-66a91a36103c (service), zone_kind: InternalDns, reason: ImageSourceMismatch { bp_image_source: Artifact { version: Available { version: ArtifactVersion("1.0.0") }, hash: ArtifactHash("ffbf1373f7ee08dddd74c53ed2a94e7c4c572a982d3a9bc94000c6956b700c6a") }, inv_image_source: InstallDataset } }, ZoneCurrentlyUpdating { zone_id: 5199c033-4cf9-4ab6-8ae7-566bd7606363 (service), zone_kind: Crucible, reason: ImageSourceMismatch { bp_image_source: Artifact { version: Available { version: ArtifactVersion("1.0.0") }, hash: ArtifactHash("6f17cf65fb5a5bec5542dd07c03cd0acc01e59130f02c532c8d848ecae810047") }, inv_image_source: InstallDataset } }, ZoneCurrentlyUpdating { zone_id: 6444f8a5-6465-4f0b-a549-1993c113569c (service), zone_kind: InternalNtp, reason: ImageSourceMismatch { bp_image_source: Artifact { version: Available { version: ArtifactVersion("1.0.0") }, hash: ArtifactHash("67593d686ed04a1709f93972b71f4ebc148a9362120f65d239943e814a9a7439") }, inv_image_source: InstallDataset } }, ZoneCurrentlyUpdating { zone_id: 803bfb63-c246-41db-b0da-d3b87ddfc63d (service), zone_kind: ExternalDns, reason: ImageSourceMismatch { bp_image_source: Artifact { version: Available { version: ArtifactVersion("1.0.0") }, hash: ArtifactHash("ccca13ed19b8731f9adaf0d6203b02ea3b9ede4fa426b9fac0a07ce95440046d") }, inv_image_source: InstallDataset } }, ZoneCurrentlyUpdating { zone_id: ba4994a8-23f9-4b1a-a84f-a08d74591389 (service), zone_kind: CruciblePantry, reason: ImageSourceMismatch { 
bp_image_source: Artifact { version: Available { version: ArtifactVersion("1.0.0") }, hash: ArtifactHash("21f0ada306859c23917361f2e0b9235806c32607ec689c7e8cf16bb898bc5a02") }, inv_image_source: InstallDataset } }]
21472147
INFO will ensure cockroachdb setting, setting: cluster.preserve_downgrade_option, value: DoNotModify
21482148
generated blueprint 8f2d1f39-7c88-4701-aa43-56bf281b28c1 based on parent blueprint ce365dff-2cdb-4f35-a186-b15e20e1e700

0 commit comments

Comments
 (0)