From ecd6a001c2b72f95cf380baca29dccd51d071397 Mon Sep 17 00:00:00 2001
From: Sean Klein
Date: Mon, 18 Aug 2025 15:19:04 -0700
Subject: [PATCH 01/22] Add `nexus_generation` to blueprint

---
 dev-tools/omdb/src/bin/omdb/reconfigurator.rs | 2 +-
 dev-tools/omdb/tests/successes.out | 3 +
 .../output/cmds-add-sled-no-disks-stdout | 1 +
 .../tests/output/cmds-example-stdout | 9 +-
 ...ds-expunge-newly-added-external-dns-stdout | 10 +-
 ...ds-expunge-newly-added-internal-dns-stdout | 6 +-
 .../output/cmds-host-phase-2-source-stdout | 6 +
 .../output/cmds-mupdate-update-flow-stdout | 36 +-
 .../output/cmds-noop-image-source-stdout | 4 +
 .../tests/output/cmds-set-mgs-updates-stdout | 11 +
 .../cmds-set-remove-mupdate-override-stdout | 4 +
 .../tests/output/cmds-set-zone-images-stdout | 5 +
 .../tests/output/cmds-target-release-stdout | 859 +++++++++----
 live-tests/tests/test_nexus_add_remove.rs | 24 +-
 nexus/db-model/src/deployment.rs | 11 +
 nexus/db-model/src/schema_versions.rs | 3 +-
 .../db-queries/src/db/datastore/deployment.rs | 4 +
 .../deployment/external_networking.rs | 2 +
 nexus/db-queries/src/db/datastore/rack.rs | 10 +
 nexus/db-queries/src/db/datastore/vpc.rs | 12 +-
 nexus/db-schema/src/schema.rs | 3 +
 nexus/reconfigurator/blippy/src/blippy.rs | 19 +
 nexus/reconfigurator/blippy/src/checks.rs | 152 +++
 .../tests/integration/blueprint_edit.rs | 2 +-
 nexus/reconfigurator/execution/src/dns.rs | 4 +-
 .../planning/src/blueprint_builder/builder.rs | 695 +++++++++-
 nexus/reconfigurator/planning/src/example.rs | 24 +-
 nexus/reconfigurator/planning/src/planner.rs | 1145 ++++++++++++++---
 nexus/reconfigurator/planning/src/system.rs | 1 +
 .../example_builder_zone_counts_blueprint.txt | 1 +
 .../output/planner_basic_add_sled_2_3.txt | 1 +
 .../output/planner_basic_add_sled_3_5.txt | 1 +
 ...dataset_settings_modified_in_place_1_2.txt | 1 +
 .../planner_decommissions_sleds_1_2.txt | 1 +
 .../planner_decommissions_sleds_bp2.txt | 7 +-
 .../planner_deploy_all_keeper_nodes_1_2.txt | 1 +
 .../planner_deploy_all_keeper_nodes_3_4.txt | 1 +
 .../planner_deploy_all_keeper_nodes_4_5.txt | 1 +
 .../planner_deploy_all_keeper_nodes_5_6.txt | 1 +
 ...lanner_expunge_clickhouse_clusters_3_4.txt | 1 +
 ...lanner_expunge_clickhouse_clusters_5_6.txt | 1 +
 ...ouse_zones_after_policy_is_changed_3_4.txt | 1 +
 .../output/planner_nonprovisionable_1_2.txt | 1 +
 .../output/planner_nonprovisionable_2_2a.txt | 4 +
 .../output/planner_nonprovisionable_bp2.txt | 9 +-
 .../output/zone_image_source_change_1.txt | 1 +
 nexus/reconfigurator/preparation/src/lib.rs | 20 +-
 nexus/src/app/background/init.rs | 1 +
 .../background/tasks/blueprint_execution.rs | 1 +
 .../app/background/tasks/blueprint_load.rs | 1 +
 .../app/background/tasks/blueprint_planner.rs | 6 +
 nexus/src/app/deployment.rs | 10 +-
 nexus/test-utils/src/lib.rs | 2 +
 nexus/types/src/deployment.rs | 15 +
 nexus/types/src/deployment/blueprint_diff.rs | 7 +
 .../types/src/deployment/blueprint_display.rs | 1 +
 nexus/types/src/deployment/planning_input.rs | 28 +
 nexus/types/src/deployment/planning_report.rs | 154 ++-
 nexus/types/src/deployment/zone_type.rs | 5 +
 openapi/nexus-internal.json | 70 +-
 schema/crdb/dbinit.sql | 10 +-
 schema/crdb/nexus-generation/up01.sql | 1 +
 schema/crdb/nexus-generation/up02.sql | 5 +
 schema/crdb/nexus-generation/up03.sql | 1 +
 schema/crdb/nexus-generation/up04.sql | 1 +
 sled-agent/src/rack_setup/plan/service.rs | 3 +-
 sled-agent/src/rack_setup/service.rs | 1 +
 sled-agent/src/sim/server.rs | 1 +
 68 files changed, 2962 insertions(+), 483 deletions(-)
 create mode 100644
schema/crdb/nexus-generation/up01.sql create mode 100644 schema/crdb/nexus-generation/up02.sql create mode 100644 schema/crdb/nexus-generation/up03.sql create mode 100644 schema/crdb/nexus-generation/up04.sql diff --git a/dev-tools/omdb/src/bin/omdb/reconfigurator.rs b/dev-tools/omdb/src/bin/omdb/reconfigurator.rs index c56679f493c..e074fc25b8f 100644 --- a/dev-tools/omdb/src/bin/omdb/reconfigurator.rs +++ b/dev-tools/omdb/src/bin/omdb/reconfigurator.rs @@ -148,7 +148,7 @@ async fn cmd_reconfigurator_export( // See Nexus::blueprint_planning_context(). eprint!("assembling reconfigurator state ... "); let state = nexus_reconfigurator_preparation::reconfigurator_state_load( - opctx, datastore, + opctx, datastore, None, ) .await?; eprintln!("done"); diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 701561dfebe..82577f26f53 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -1569,6 +1569,7 @@ parent: internal DNS version::: 1 external DNS version::: 2 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 0 @@ -1692,6 +1693,7 @@ parent: internal DNS version::: 1 external DNS version::: 2 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 0 @@ -1717,6 +1719,7 @@ to: blueprint ............. internal DNS version::: 1 (unchanged) external DNS version::: 2 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-add-sled-no-disks-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-add-sled-no-disks-stdout index 9591408f905..9ed40ac1396 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-add-sled-no-disks-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-add-sled-no-disks-stdout @@ -274,6 +274,7 @@ parent: dbcbd3d6-41ff-48ae-ac0b-1becc9b2fd21 internal DNS version::: 1 external DNS version::: 1 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 0 diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout index cd7b094d85a..e9754eca45e 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout @@ -399,6 +399,7 @@ parent: 02697f74-b14a-4418-90f0-c28b2a3a6aa9 internal DNS version::: 1 external DNS version::: 1 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 0 @@ -518,6 +519,7 @@ parent: 02697f74-b14a-4418-90f0-c28b2a3a6aa9 internal DNS version::: 1 external DNS version::: 1 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 0 @@ -551,7 +553,7 @@ T ENA ID PARENT > blueprint-plan ade5749d-bdf3-4fab-a8ae-00bea01b3a5a INFO skipping noop image source check for all sleds, reason: no target release is currently set WARN cannot issue more MGS-driven updates (no current artifacts) -INFO some zones not yet up-to-date, sled_id: 89d02b1b-478c-401a-8e28-7a26f74fa41b, zones_currently_updating: [ZoneCurrentlyUpdating { zone_id: b3c9c041-d2f0-4767-bdaf-0e52e9d7a013 (service), zone_kind: InternalNtp, reason: MissingInInventory { bp_image_source: InstallDataset } }] +INFO some zones not yet up-to-date, zones_currently_updating: [ZoneCurrentlyUpdating { zone_id: b3c9c041-d2f0-4767-bdaf-0e52e9d7a013 (service), zone_kind: InternalNtp, reason: MissingInInventory { 
bp_image_source: InstallDataset } }] generated blueprint 86db3308-f817-4626-8838-4085949a6a41 based on parent blueprint ade5749d-bdf3-4fab-a8ae-00bea01b3a5a planning report for blueprint 86db3308-f817-4626-8838-4085949a6a41: chicken switches: @@ -633,6 +635,7 @@ to: blueprint 86db3308-f817-4626-8838-4085949a6a41 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -713,6 +716,7 @@ to: blueprint 86db3308-f817-4626-8838-4085949a6a41 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -793,6 +797,7 @@ to: blueprint 02697f74-b14a-4418-90f0-c28b2a3a6aa9 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -1032,6 +1037,7 @@ parent: 02697f74-b14a-4418-90f0-c28b2a3a6aa9 internal DNS version::: 1 external DNS version::: 1 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 0 @@ -1670,6 +1676,7 @@ to: blueprint 86db3308-f817-4626-8838-4085949a6a41 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout index fea4459e751..3d8db0389d3 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-external-dns-stdout @@ -331,6 +331,7 @@ parent: 06c88262-f435-410e-ba98-101bed41ec27 internal DNS version::: 1 external DNS version::: 1 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 0 @@ -463,6 +464,7 @@ to: blueprint 366b0b68-d80e-4bc1-abd3-dc69837847e0 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -1022,6 +1024,7 @@ parent: 3f00b694-1b16-4aaa-8f78-e6b3a527b434 internal DNS version::: 1 external DNS version::: 1 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 0 @@ -1039,7 +1042,7 @@ chicken switches: add zones with mupdate override: false * discretionary zones placed: - * 1 zone on sled 711ac7f8-d19e-4572-bdb9-e9b50f6e362a: external_dns + * external_dns zone on sled 711ac7f8-d19e-4572-bdb9-e9b50f6e362a from source install dataset * zone updates waiting on discretionary zones @@ -1163,6 +1166,7 @@ to: blueprint 9c998c1d-1a7b-440a-ae0c-40f781dea6e2 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -1725,6 +1729,7 @@ parent: 366b0b68-d80e-4bc1-abd3-dc69837847e0 internal DNS version::: 1 external DNS version::: 1 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 0 @@ -1733,7 +1738,7 @@ chicken switches: add zones with mupdate override: false * discretionary zones placed: - * 1 zone on sled 
711ac7f8-d19e-4572-bdb9-e9b50f6e362a: external_dns + * external_dns zone on sled 711ac7f8-d19e-4572-bdb9-e9b50f6e362a from source install dataset * zone updates waiting on discretionary zones @@ -1859,6 +1864,7 @@ to: blueprint 2ac8c740-444d-42ff-8d66-9812a7e51288 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout index 8f759dc2f04..26df8f664c8 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-expunge-newly-added-internal-dns-stdout @@ -329,6 +329,7 @@ parent: 184f10b3-61cb-41ef-9b93-3489b2bac559 internal DNS version::: 1 external DNS version::: 1 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 0 @@ -461,6 +462,7 @@ to: blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -820,6 +822,7 @@ to: blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -1053,7 +1056,7 @@ chicken switches: add zones with mupdate override: false * discretionary zones placed: - * 1 zone on sled 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c: internal_dns + * internal_dns zone on sled 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c from source install dataset * zone updates waiting on discretionary zones @@ -1177,6 +1180,7 @@ to: blueprint af934083-59b5-4bf6-8966-6fb5292c29e1 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-host-phase-2-source-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-host-phase-2-source-stdout index 174b44d1c93..4caba4fd7e6 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-host-phase-2-source-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-host-phase-2-source-stdout @@ -150,6 +150,7 @@ to: blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -430,6 +431,7 @@ to: blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -709,6 +711,7 @@ parent: 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 internal DNS version::: 1 external DNS version::: 1 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 0 @@ -860,6 +863,7 @@ to: blueprint af934083-59b5-4bf6-8966-6fb5292c29e1 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER 
SETTINGS: generation: 1 (unchanged) @@ -1140,6 +1144,7 @@ to: blueprint df06bb57-ad42-4431-9206-abff322896c7 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -1419,6 +1424,7 @@ parent: af934083-59b5-4bf6-8966-6fb5292c29e1 internal DNS version::: 1 external DNS version::: 1 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 0 diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout index 9634b16e631..e62ef6b1a7e 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout @@ -508,6 +508,7 @@ chicken switches: * waiting on MUPdate overrides * MUPdate overrides exist * zone updates waiting on MUPdate overrides +* waiting to update top-level nexus_generation: pending non-nexus zone updates @@ -684,6 +685,7 @@ to: blueprint a5a8f242-ffa5-473c-8efd-2acf2dc0b736 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) * target release min gen: 1 -> 3 + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -883,6 +885,7 @@ chicken switches: * waiting on MUPdate overrides * MUPdate overrides exist * zone updates waiting on MUPdate overrides +* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -897,6 +900,7 @@ to: blueprint 626487fa-7139-45ec-8416-902271fc730b internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 3 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -1105,6 +1109,7 @@ chicken switches: * waiting on MUPdate overrides * MUPdate overrides exist * zone updates waiting on MUPdate overrides +* waiting to update top-level nexus_generation: pending non-nexus zone updates @@ -1229,6 +1234,7 @@ to: blueprint c1a0d242-9160-40f4-96ae-61f8f40a0b1b internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) * target release min gen: 3 -> 4 + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -1390,6 +1396,7 @@ chicken switches: * waiting on MUPdate overrides * MUPdate overrides exist * zone updates waiting on MUPdate overrides +* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-show latest @@ -1554,6 +1561,7 @@ parent: c1a0d242-9160-40f4-96ae-61f8f40a0b1b internal DNS version::: 1 external DNS version::: 1 target release min gen: 4 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 0 @@ -1565,6 +1573,7 @@ chicken switches: * waiting on MUPdate overrides * MUPdate overrides exist * zone updates waiting on MUPdate overrides +* waiting to update top-level nexus_generation: pending non-nexus zone updates @@ -1629,6 +1638,7 @@ to: blueprint afb09faf-a586-4483-9289-04d4f1d8ba23 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 4 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -1804,6 +1814,7 @@ chicken switches: * waiting on MUPdate overrides * MUPdate overrides exist * zone updates waiting on MUPdate overrides +* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-show latest @@ 
-1968,6 +1979,7 @@ parent: afb09faf-a586-4483-9289-04d4f1d8ba23 internal DNS version::: 1 external DNS version::: 1 target release min gen: 4 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 0 @@ -1980,6 +1992,7 @@ chicken switches: * waiting on MUPdate overrides * MUPdate overrides exist * zone updates waiting on MUPdate overrides +* waiting to update top-level nexus_generation: pending non-nexus zone updates @@ -2049,6 +2062,7 @@ to: blueprint ce365dff-2cdb-4f35-a186-b15e20e1e700 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 4 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -2199,7 +2213,9 @@ chicken switches: * skipping noop image source check on sled d81c6a84-79b8-4958-ae41-ea46c9b19763: all 6 zones are already from artifacts * 1 pending MGS update: * model0:serial0: HostPhase1(PendingMgsUpdateHostPhase1Details { expected_active_phase_1_slot: A, expected_boot_disk: A, expected_active_phase_1_hash: ArtifactHash("0101010101010101010101010101010101010101010101010101010101010101"), expected_active_phase_2_hash: ArtifactHash("0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a"), expected_inactive_phase_1_hash: ArtifactHash("0202020202020202020202020202020202020202020202020202020202020202"), expected_inactive_phase_2_hash: ArtifactHash("0c0362b640cece5b9a5e86d8fa683bd2eb84c3e7f90731f597197d604ffa76e3"), sled_agent_address: [fd00:1122:3344:101::1]:12345 }) +* only placed 0/2 desired nexus zones * zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) +* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-show latest @@ -2364,6 +2380,7 @@ parent: ce365dff-2cdb-4f35-a186-b15e20e1e700 internal DNS version::: 1 external DNS version::: 1 target release min gen: 4 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 1 Pending MGS-managed updates (all baseboards): @@ -2381,7 +2398,9 @@ chicken switches: * skipping noop image source check on sled d81c6a84-79b8-4958-ae41-ea46c9b19763: all 6 zones are already from artifacts * 1 pending MGS update: * model0:serial0: HostPhase1(PendingMgsUpdateHostPhase1Details { expected_active_phase_1_slot: A, expected_boot_disk: A, expected_active_phase_1_hash: ArtifactHash("0101010101010101010101010101010101010101010101010101010101010101"), expected_active_phase_2_hash: ArtifactHash("0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a"), expected_inactive_phase_1_hash: ArtifactHash("0202020202020202020202020202020202020202020202020202020202020202"), expected_inactive_phase_2_hash: ArtifactHash("0c0362b640cece5b9a5e86d8fa683bd2eb84c3e7f90731f597197d604ffa76e3"), sled_agent_address: [fd00:1122:3344:101::1]:12345 }) +* only placed 0/2 desired nexus zones * zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) 
+* waiting to update top-level nexus_generation: pending non-nexus zone updates @@ -2446,6 +2465,7 @@ to: blueprint 8f2d1f39-7c88-4701-aa43-56bf281b28c1 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 4 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -2608,7 +2628,9 @@ chicken switches: * skipping noop image source check on sled d81c6a84-79b8-4958-ae41-ea46c9b19763: all 6 zones are already from artifacts * 1 pending MGS update: * model1:serial1: HostPhase1(PendingMgsUpdateHostPhase1Details { expected_active_phase_1_slot: A, expected_boot_disk: A, expected_active_phase_1_hash: ArtifactHash("0101010101010101010101010101010101010101010101010101010101010101"), expected_active_phase_2_hash: ArtifactHash("0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a"), expected_inactive_phase_1_hash: ArtifactHash("0202020202020202020202020202020202020202020202020202020202020202"), expected_inactive_phase_2_hash: ArtifactHash("0c0362b640cece5b9a5e86d8fa683bd2eb84c3e7f90731f597197d604ffa76e3"), sled_agent_address: [fd00:1122:3344:102::1]:12345 }) +* only placed 0/2 desired nexus zones * zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) +* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -2675,6 +2697,7 @@ to: blueprint 12d602a6-5ab4-487a-b94e-eb30cdf30300 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 4 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -2842,7 +2865,9 @@ chicken switches: * skipping noop image source check on sled d81c6a84-79b8-4958-ae41-ea46c9b19763: all 6 zones are already from artifacts * 1 pending MGS update: * model2:serial2: HostPhase1(PendingMgsUpdateHostPhase1Details { expected_active_phase_1_slot: A, expected_boot_disk: A, expected_active_phase_1_hash: ArtifactHash("0101010101010101010101010101010101010101010101010101010101010101"), expected_active_phase_2_hash: ArtifactHash("0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a"), expected_inactive_phase_1_hash: ArtifactHash("0202020202020202020202020202020202020202020202020202020202020202"), expected_inactive_phase_2_hash: ArtifactHash("0c0362b640cece5b9a5e86d8fa683bd2eb84c3e7f90731f597197d604ffa76e3"), sled_agent_address: [fd00:1122:3344:103::1]:12345 }) +* only placed 0/2 desired nexus zones * zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) 
+* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -2906,6 +2931,7 @@ to: blueprint 61a93ea3-c872-48e0-aace-e86b0c52b839 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 4 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -3064,7 +3090,7 @@ WARN cannot configure RoT update for board (no matching artifact), serial_number WARN cannot configure SP update for board (no matching artifact), serial_number: serial1, part_number: model1 INFO skipping board for MGS-driven update, serial_number: serial1, part_number: model1 INFO ran out of boards for MGS-driven update -INFO some zones not yet up-to-date, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, zones_currently_updating: [ZoneCurrentlyUpdating { zone_id: 0c71b3b2-6ceb-4e8f-b020-b08675e83038 (service), zone_kind: Nexus, reason: ImageSourceMismatch { bp_image_source: Artifact { version: Available { version: ArtifactVersion("1.0.0") }, hash: ArtifactHash("0e32b4a3e5d3668bb1d6a16fb06b74dc60b973fa479dcee0aae3adbb52bf1388") }, inv_image_source: InstallDataset } }, ZoneCurrentlyUpdating { zone_id: 427ec88f-f467-42fa-9bbb-66a91a36103c (service), zone_kind: InternalDns, reason: ImageSourceMismatch { bp_image_source: Artifact { version: Available { version: ArtifactVersion("1.0.0") }, hash: ArtifactHash("ffbf1373f7ee08dddd74c53ed2a94e7c4c572a982d3a9bc94000c6956b700c6a") }, inv_image_source: InstallDataset } }, ZoneCurrentlyUpdating { zone_id: 5199c033-4cf9-4ab6-8ae7-566bd7606363 (service), zone_kind: Crucible, reason: ImageSourceMismatch { bp_image_source: Artifact { version: Available { version: ArtifactVersion("1.0.0") }, hash: ArtifactHash("6f17cf65fb5a5bec5542dd07c03cd0acc01e59130f02c532c8d848ecae810047") }, inv_image_source: InstallDataset } }, ZoneCurrentlyUpdating { zone_id: 6444f8a5-6465-4f0b-a549-1993c113569c (service), zone_kind: InternalNtp, reason: ImageSourceMismatch { bp_image_source: Artifact { version: Available { version: ArtifactVersion("1.0.0") }, hash: ArtifactHash("67593d686ed04a1709f93972b71f4ebc148a9362120f65d239943e814a9a7439") }, inv_image_source: InstallDataset } }, ZoneCurrentlyUpdating { zone_id: 803bfb63-c246-41db-b0da-d3b87ddfc63d (service), zone_kind: ExternalDns, reason: ImageSourceMismatch { bp_image_source: Artifact { version: Available { version: ArtifactVersion("1.0.0") }, hash: ArtifactHash("ccca13ed19b8731f9adaf0d6203b02ea3b9ede4fa426b9fac0a07ce95440046d") }, inv_image_source: InstallDataset } }, ZoneCurrentlyUpdating { zone_id: ba4994a8-23f9-4b1a-a84f-a08d74591389 (service), zone_kind: CruciblePantry, reason: ImageSourceMismatch { bp_image_source: Artifact { version: Available { version: ArtifactVersion("1.0.0") }, hash: ArtifactHash("21f0ada306859c23917361f2e0b9235806c32607ec689c7e8cf16bb898bc5a02") }, inv_image_source: InstallDataset } }] +INFO some zones not yet up-to-date, zones_currently_updating: [ZoneCurrentlyUpdating { zone_id: 0c71b3b2-6ceb-4e8f-b020-b08675e83038 (service), zone_kind: Nexus, reason: ImageSourceMismatch { bp_image_source: Artifact { version: Available { version: ArtifactVersion("1.0.0") }, hash: ArtifactHash("0e32b4a3e5d3668bb1d6a16fb06b74dc60b973fa479dcee0aae3adbb52bf1388") }, inv_image_source: InstallDataset } }, ZoneCurrentlyUpdating { zone_id: 427ec88f-f467-42fa-9bbb-66a91a36103c (service), zone_kind: InternalDns, reason: ImageSourceMismatch { bp_image_source: Artifact { version: Available { version: ArtifactVersion("1.0.0") }, hash: 
ArtifactHash("ffbf1373f7ee08dddd74c53ed2a94e7c4c572a982d3a9bc94000c6956b700c6a") }, inv_image_source: InstallDataset } }, ZoneCurrentlyUpdating { zone_id: 5199c033-4cf9-4ab6-8ae7-566bd7606363 (service), zone_kind: Crucible, reason: ImageSourceMismatch { bp_image_source: Artifact { version: Available { version: ArtifactVersion("1.0.0") }, hash: ArtifactHash("6f17cf65fb5a5bec5542dd07c03cd0acc01e59130f02c532c8d848ecae810047") }, inv_image_source: InstallDataset } }, ZoneCurrentlyUpdating { zone_id: 6444f8a5-6465-4f0b-a549-1993c113569c (service), zone_kind: InternalNtp, reason: ImageSourceMismatch { bp_image_source: Artifact { version: Available { version: ArtifactVersion("1.0.0") }, hash: ArtifactHash("67593d686ed04a1709f93972b71f4ebc148a9362120f65d239943e814a9a7439") }, inv_image_source: InstallDataset } }, ZoneCurrentlyUpdating { zone_id: 803bfb63-c246-41db-b0da-d3b87ddfc63d (service), zone_kind: ExternalDns, reason: ImageSourceMismatch { bp_image_source: Artifact { version: Available { version: ArtifactVersion("1.0.0") }, hash: ArtifactHash("ccca13ed19b8731f9adaf0d6203b02ea3b9ede4fa426b9fac0a07ce95440046d") }, inv_image_source: InstallDataset } }, ZoneCurrentlyUpdating { zone_id: ba4994a8-23f9-4b1a-a84f-a08d74591389 (service), zone_kind: CruciblePantry, reason: ImageSourceMismatch { bp_image_source: Artifact { version: Available { version: ArtifactVersion("1.0.0") }, hash: ArtifactHash("21f0ada306859c23917361f2e0b9235806c32607ec689c7e8cf16bb898bc5a02") }, inv_image_source: InstallDataset } }, ZoneCurrentlyUpdating { zone_id: 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6 (service), zone_kind: Nexus, reason: ImageSourceMismatch { bp_image_source: Artifact { version: Available { version: ArtifactVersion("2.0.0") }, hash: ArtifactHash("e9b7035f41848a987a798c15ac424cc91dd662b1af0920d58d8aa1ebad7467b6") }, inv_image_source: InstallDataset } }, ZoneCurrentlyUpdating { zone_id: 75b220ba-a0f4-4872-8202-dc7c87f062d0 (service), zone_kind: CruciblePantry, reason: ImageSourceMismatch { bp_image_source: Artifact { version: Available { version: ArtifactVersion("2.0.0") }, hash: ArtifactHash("3ff26dad96faa8f67251f5de40458b4f809d536bfe8572134da0e42c2fa12674") }, inv_image_source: InstallDataset } }, ZoneCurrentlyUpdating { zone_id: ea5b4030-b52f-44b2-8d70-45f15f987d01 (service), zone_kind: InternalDns, reason: ImageSourceMismatch { bp_image_source: Artifact { version: Available { version: ArtifactVersion("2.0.0") }, hash: ArtifactHash("de30657a72b066b8ef1f56351a0a5d4d7000da0a62c4be9b2e949a107ca8a389") }, inv_image_source: InstallDataset } }, ZoneCurrentlyUpdating { zone_id: f10a4fb9-759f-4a65-b25e-5794ad2d07d8 (service), zone_kind: InternalNtp, reason: ImageSourceMismatch { bp_image_source: Artifact { version: Available { version: ArtifactVersion("2.0.0") }, hash: ArtifactHash("d76e26198daed69cdae04490d7477f8c842e0dbe37d463eac0d0a8d3fb803095") }, inv_image_source: InstallDataset } }, ZoneCurrentlyUpdating { zone_id: f55647d4-5500-4ad3-893a-df45bd50d622 (service), zone_kind: Crucible, reason: ImageSourceMismatch { bp_image_source: Artifact { version: Available { version: ArtifactVersion("2.0.0") }, hash: ArtifactHash("866f6a7c2e51c056fb722b5113e80181cc9cd8b712a0d3dbf1edc4ce29e5229e") }, inv_image_source: InstallDataset } }, ZoneCurrentlyUpdating { zone_id: f6ec9c67-946a-4da3-98d5-581f72ce8bf0 (service), zone_kind: ExternalDns, reason: ImageSourceMismatch { bp_image_source: Artifact { version: Available { version: ArtifactVersion("2.0.0") }, hash: 
ArtifactHash("f282c45771429f7bebf71f0cc668521066db57c6bb07fcfccdfb44825d3d930f") }, inv_image_source: InstallDataset } }] generated blueprint 27e755bc-dc10-4647-853c-f89bb3a15a2c based on parent blueprint 61a93ea3-c872-48e0-aace-e86b0c52b839 planning report for blueprint 27e755bc-dc10-4647-853c-f89bb3a15a2c: chicken switches: @@ -3072,6 +3098,8 @@ chicken switches: * skipping noop image source check on sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6: all 6 zones are already from artifacts * skipping noop image source check on sled d81c6a84-79b8-4958-ae41-ea46c9b19763: all 6 zones are already from artifacts +* only placed 0/2 desired nexus zones +* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -3086,6 +3114,7 @@ to: blueprint 27e755bc-dc10-4647-853c-f89bb3a15a2c internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 4 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -3263,6 +3292,7 @@ chicken switches: * waiting on MUPdate overrides * MUPdate overrides exist * zone updates waiting on MUPdate overrides +* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -3345,6 +3375,7 @@ to: blueprint 9f89efdf-a23e-4137-b7cc-79f4a91cbe1f internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) * target release min gen: 4 -> 5 + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -3489,7 +3520,9 @@ planning report for blueprint 9a9e6c32-5a84-4020-a159-33dceff18d35: * adding zones despite MUPdate override, as specified by the `add_zones_with_mupdate_override` chicken switch * discretionary zone placement waiting for NTP zones on sleds: c3bc4c6d-fdde-4fc4-8493-89d2a1e5ee6b * missing NTP zone on sled c3bc4c6d-fdde-4fc4-8493-89d2a1e5ee6b +* only placed 0/2 desired nexus zones * zone updates waiting on MUPdate overrides +* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -3566,6 +3599,7 @@ to: blueprint 9a9e6c32-5a84-4020-a159-33dceff18d35 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 5 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-noop-image-source-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-noop-image-source-stdout index 08e6ad39d07..675515349dc 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-noop-image-source-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-noop-image-source-stdout @@ -180,6 +180,7 @@ chicken switches: * waiting on MUPdate overrides * MUPdate overrides exist * zone updates waiting on MUPdate overrides +* waiting to update top-level nexus_generation: pending non-nexus zone updates @@ -345,6 +346,7 @@ to: blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -538,6 +540,7 @@ chicken switches: * waiting on MUPdate overrides * MUPdate overrides exist * zone updates waiting on MUPdate overrides +* waiting to update top-level nexus_generation: pending non-nexus zone updates @@ -594,6 +597,7 @@ to: blueprint af934083-59b5-4bf6-8966-6fb5292c29e1 internal DNS version::: 1 (unchanged) external DNS version::: 1 
(unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-set-mgs-updates-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-set-mgs-updates-stdout index 451abc426ae..7dbb2ddddc3 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-set-mgs-updates-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-set-mgs-updates-stdout @@ -205,6 +205,7 @@ parent: 6ccc786b-17f1-4562-958f-5a7d9a5a15fd internal DNS version::: 1 external DNS version::: 1 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 0 @@ -416,6 +417,7 @@ parent: ad97e762-7bf1-45a6-a98f-60afb7e491c0 internal DNS version::: 1 external DNS version::: 1 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 1 Pending MGS-managed updates (all baseboards): @@ -441,6 +443,7 @@ to: blueprint cca24b71-09b5-4042-9185-b33e9f2ebba0 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -606,6 +609,7 @@ to: blueprint ad97e762-7bf1-45a6-a98f-60afb7e491c0 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -963,6 +967,7 @@ parent: cca24b71-09b5-4042-9185-b33e9f2ebba0 internal DNS version::: 1 external DNS version::: 1 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 1 Pending MGS-managed updates (all baseboards): @@ -988,6 +993,7 @@ to: blueprint 5bf974f3-81f9-455b-b24e-3099f765664c internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -1154,6 +1160,7 @@ to: blueprint cca24b71-09b5-4042-9185-b33e9f2ebba0 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -1513,6 +1520,7 @@ parent: 5bf974f3-81f9-455b-b24e-3099f765664c internal DNS version::: 1 external DNS version::: 1 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 2 Pending MGS-managed updates (all baseboards): @@ -1539,6 +1547,7 @@ to: blueprint 1b837a27-3be1-4fcb-8499-a921c839e1d0 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -1895,6 +1904,7 @@ parent: 1b837a27-3be1-4fcb-8499-a921c839e1d0 internal DNS version::: 1 external DNS version::: 1 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 1 Pending MGS-managed updates (all baseboards): @@ -1920,6 +1930,7 @@ to: blueprint 3682a71b-c6ca-4b7e-8f84-16df80c85960 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-set-remove-mupdate-override-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-set-remove-mupdate-override-stdout index 01fb9438c11..5ee8dfee717 100644 --- 
a/dev-tools/reconfigurator-cli/tests/output/cmds-set-remove-mupdate-override-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-set-remove-mupdate-override-stdout @@ -274,6 +274,7 @@ parent: df06bb57-ad42-4431-9206-abff322896c7 internal DNS version::: 1 external DNS version::: 1 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 0 @@ -397,6 +398,7 @@ to: blueprint afb09faf-a586-4483-9289-04d4f1d8ba23 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -665,6 +667,7 @@ parent: afb09faf-a586-4483-9289-04d4f1d8ba23 internal DNS version::: 1 external DNS version::: 1 target release min gen: 2 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 0 @@ -684,6 +687,7 @@ to: blueprint ce365dff-2cdb-4f35-a186-b15e20e1e700 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) * target release min gen: 1 -> 2 + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-set-zone-images-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-set-zone-images-stdout index f704a5c0af3..37b9c7d3c32 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-set-zone-images-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-set-zone-images-stdout @@ -107,6 +107,7 @@ parent: 1b013011-2062-4b48-b544-a32b23bce83a internal DNS version::: 1 external DNS version::: 1 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 0 @@ -225,6 +226,7 @@ parent: 9766ca20-38d4-4380-b005-e7c43c797e7c internal DNS version::: 1 external DNS version::: 1 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 0 @@ -329,6 +331,7 @@ to: blueprint f714e6ea-e85a-4d7d-93c2-a018744fe176 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -547,6 +550,7 @@ parent: bb128f06-a2e1-44c1-8874-4f789d0ff896 internal DNS version::: 1 external DNS version::: 1 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 0 @@ -651,6 +655,7 @@ to: blueprint d9c572a1-a68c-4945-b1ec-5389bd588fe9 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout index 0c2730f4989..774d8a5dd95 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout @@ -215,13 +215,203 @@ chicken switches: * 1 pending MGS update: * model0:serial0: RotBootloader(PendingMgsUpdateRotBootloaderDetails { expected_stage0_version: ArtifactVersion("0.0.1"), expected_stage0_next_version: NoValidVersion }) -* zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) 
+* discretionary zones placed: + * nexus zone on sled 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c from source artifact: version 1.0.0 + * nexus zone on sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 from source artifact: version 1.0.0 + * nexus zone on sled d81c6a84-79b8-4958-ae41-ea46c9b19763 from source artifact: version 1.0.0 +* zone updates waiting on discretionary zones +* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest from: blueprint dbcbd3d6-41ff-48ae-ac0b-1becc9b2fd21 to: blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 + MODIFIED SLEDS: + + sled 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c (active, config generation 2 -> 3): + + host phase 2 contents: + ------------------------ + slot boot image source + ------------------------ + A current contents + B current contents + + + physical disks: + ------------------------------------------------------------------------------------ + vendor model serial disposition + ------------------------------------------------------------------------------------ + fake-vendor fake-model serial-727522a7-934f-494d-b5b3-160968e74463 in service + fake-vendor fake-model serial-72c59873-31ff-4e36-8d76-ff834009349a in service + fake-vendor fake-model serial-b5fd5bc1-099e-4e77-8028-a9793c11f43b in service + + + datasets: + ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + dataset name dataset id disposition quota reservation compression + ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + oxp_727522a7-934f-494d-b5b3-160968e74463/crucible 2f204c50-a327-479c-8852-f53ec7a19c1f in service none none off + oxp_72c59873-31ff-4e36-8d76-ff834009349a/crucible 78f34ce7-42f1-41da-995f-318f32054ad2 in service none none off + oxp_b5fd5bc1-099e-4e77-8028-a9793c11f43b/crucible 1640adb6-70bf-44cf-b05c-bff6dd300cf3 in service none none off + oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/clickhouse 841d5648-05f0-47b0-b446-92f6b60fe9a6 in service none none off + oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/external_dns 8e0bd2bd-23b7-4bc6-9e73-c4d4ebc0bc8c in service none none off + oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/internal_dns 2ad1875a-92ac-472f-8c26-593309f0e4da in service none none off + oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/zone 4829f422-aa31-41a8-ab73-95684ff1ef48 in service none none off + oxp_72c59873-31ff-4e36-8d76-ff834009349a/crypt/zone 775f9207-c42d-4af2-9186-27ffef67735e in service none none off + oxp_b5fd5bc1-099e-4e77-8028-a9793c11f43b/crypt/zone 3b66453b-7148-4c1b-84a9-499e43290ab4 in service none none off + oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/zone/oxz_clickhouse_353b3b65-20f7-48c3-88f7-495bd5d31545 b46de15d-33e7-4cd0-aa7c-e7be2a61e71b in service none none off + oxp_b5fd5bc1-099e-4e77-8028-a9793c11f43b/crypt/zone/oxz_crucible_86a22a56-0168-453d-9df1-cb2a7c64b5d3 3e0d6188-c503-49cf-a441-fa7df40ceb43 in service none none off + oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/zone/oxz_crucible_bd354eef-d8a6-4165-9124-283fb5e46d77 5ae11c7e-08fa-4d78-a4ea-14b4a9a10241 in service none none off + oxp_72c59873-31ff-4e36-8d76-ff834009349a/crypt/zone/oxz_crucible_e2fdefe7-95b2-4fd2-ae37-56929a06d58c b8f2a09f-8bd2-4418-872b-a4457a3f958c in service none none off + 
oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/zone/oxz_crucible_pantry_ad6a3a03-8d0f-4504-99a4-cbf73d69b973 49f8fbb6-5bac-4609-907f-6e3dfc206059 in service none none off + oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/zone/oxz_external_dns_6c3ae381-04f7-41ea-b0ac-74db387dbc3a 8c4fa711-1d5d-4e93-85f0-d17bff47b063 in service none none off + oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/zone/oxz_internal_dns_99e2f30b-3174-40bf-a78a-90da8abba8ca c31623de-c19b-4615-9f1d-5e1daa5d3bda in service none none off + oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/zone/oxz_nexus_466a9f29-62bf-4e63-924a-b9efdb86afec 3560dd69-3b23-4c69-807d-d673104cfc68 in service none none off + oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/zone/oxz_ntp_62620961-fc4a-481e-968b-f5acbac0dc63 09b9cc9b-3426-470b-a7bc-538f82dede03 in service none none off + oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/debug 93957ca0-9ed1-4e7b-8c34-2ce07a69541c in service 100 GiB none gzip-9 + oxp_72c59873-31ff-4e36-8d76-ff834009349a/crypt/debug 2db6b7c1-0f46-4ced-a3ad-48872793360e in service 100 GiB none gzip-9 + oxp_b5fd5bc1-099e-4e77-8028-a9793c11f43b/crypt/debug 318fae85-abcb-4259-b1b6-ac96d193f7b7 in service 100 GiB none gzip-9 ++ oxp_72c59873-31ff-4e36-8d76-ff834009349a/crypt/zone/oxz_nexus_a67ad53f-d551-40e7-abae-57664779b27b 9edcc144-9dd9-4bf9-a26d-26f265400b0b in service none none off + + + omicron zones: + ----------------------------------------------------------------------------------------------------------------------- + zone type zone id image source disposition underlay IP + ----------------------------------------------------------------------------------------------------------------------- + clickhouse 353b3b65-20f7-48c3-88f7-495bd5d31545 install dataset in service fd00:1122:3344:102::23 + crucible 86a22a56-0168-453d-9df1-cb2a7c64b5d3 install dataset in service fd00:1122:3344:102::28 + crucible bd354eef-d8a6-4165-9124-283fb5e46d77 install dataset in service fd00:1122:3344:102::26 + crucible e2fdefe7-95b2-4fd2-ae37-56929a06d58c install dataset in service fd00:1122:3344:102::27 + crucible_pantry ad6a3a03-8d0f-4504-99a4-cbf73d69b973 install dataset in service fd00:1122:3344:102::25 + external_dns 6c3ae381-04f7-41ea-b0ac-74db387dbc3a install dataset in service fd00:1122:3344:102::24 + internal_dns 99e2f30b-3174-40bf-a78a-90da8abba8ca install dataset in service fd00:1122:3344:1::1 + internal_ntp 62620961-fc4a-481e-968b-f5acbac0dc63 install dataset in service fd00:1122:3344:102::21 + nexus 466a9f29-62bf-4e63-924a-b9efdb86afec install dataset in service fd00:1122:3344:102::22 ++ nexus a67ad53f-d551-40e7-abae-57664779b27b artifact: version 1.0.0 in service fd00:1122:3344:102::29 + + + sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 (active, config generation 2 -> 3): + + host phase 2 contents: + ------------------------ + slot boot image source + ------------------------ + A current contents + B current contents + + + physical disks: + ------------------------------------------------------------------------------------ + vendor model serial disposition + ------------------------------------------------------------------------------------ + fake-vendor fake-model serial-073979dd-3248-44a5-9fa1-cc72a140d682 in service + fake-vendor fake-model serial-c6d33b64-fb96-4129-bab1-7878a06a5f9b in service + fake-vendor fake-model serial-e4d937e1-6ddc-4eca-bb08-c1f73791e608 in service + + + datasets: + 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + dataset name dataset id disposition quota reservation compression + ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + oxp_073979dd-3248-44a5-9fa1-cc72a140d682/crucible 7b4ce6bf-95bb-42fe-a4a0-dff31211ab88 in service none none off + oxp_c6d33b64-fb96-4129-bab1-7878a06a5f9b/crucible ea8a11bf-a884-4c4f-8df0-3ef9b7aacf43 in service none none off + oxp_e4d937e1-6ddc-4eca-bb08-c1f73791e608/crucible 50b029e3-96aa-41e5-bf39-023193a4355e in service none none off + oxp_073979dd-3248-44a5-9fa1-cc72a140d682/crypt/external_dns 4847a96e-a267-4ae7-aa3d-805c1e77f81e in service none none off + oxp_073979dd-3248-44a5-9fa1-cc72a140d682/crypt/internal_dns ad41be71-6c15-4428-b510-20ceacde4fa6 in service none none off + oxp_073979dd-3248-44a5-9fa1-cc72a140d682/crypt/zone 4617d206-4330-4dfa-b9f3-f63a3db834f9 in service none none off + oxp_c6d33b64-fb96-4129-bab1-7878a06a5f9b/crypt/zone 793ac181-1b01-403c-850d-7f5c54bda6c9 in service none none off + oxp_e4d937e1-6ddc-4eca-bb08-c1f73791e608/crypt/zone 4f60b534-eaa3-40a1-b60f-bfdf147af478 in service none none off + oxp_e4d937e1-6ddc-4eca-bb08-c1f73791e608/crypt/zone/oxz_crucible_058fd5f9-60a8-4e11-9302-15172782e17d 02c56a30-7d97-406d-bd34-1eb437fd517d in service none none off + oxp_073979dd-3248-44a5-9fa1-cc72a140d682/crypt/zone/oxz_crucible_5199c033-4cf9-4ab6-8ae7-566bd7606363 832fd140-d467-4bad-b5e9-63171634087c in service none none off + oxp_c6d33b64-fb96-4129-bab1-7878a06a5f9b/crypt/zone/oxz_crucible_dfac80b4-a887-430a-ae87-a4e065dba787 4d7e3e8e-06bd-414c-a468-779e056a9b75 in service none none off + oxp_073979dd-3248-44a5-9fa1-cc72a140d682/crypt/zone/oxz_crucible_pantry_ba4994a8-23f9-4b1a-a84f-a08d74591389 42430c80-7836-4191-a4f6-bcee749010fe in service none none off + oxp_073979dd-3248-44a5-9fa1-cc72a140d682/crypt/zone/oxz_external_dns_803bfb63-c246-41db-b0da-d3b87ddfc63d 43931274-7fe8-4077-825d-dff2bc8efa58 in service none none off + oxp_073979dd-3248-44a5-9fa1-cc72a140d682/crypt/zone/oxz_internal_dns_427ec88f-f467-42fa-9bbb-66a91a36103c 1bca7f71-5e42-4749-91ec-fa40793a3a9a in service none none off + oxp_073979dd-3248-44a5-9fa1-cc72a140d682/crypt/zone/oxz_nexus_0c71b3b2-6ceb-4e8f-b020-b08675e83038 a4c3032e-21fa-4d4a-b040-a7e3c572cf3c in service none none off + oxp_073979dd-3248-44a5-9fa1-cc72a140d682/crypt/zone/oxz_ntp_6444f8a5-6465-4f0b-a549-1993c113569c 3ac089c9-9dec-465b-863a-188e80d71fb4 in service none none off + oxp_073979dd-3248-44a5-9fa1-cc72a140d682/crypt/debug 248c6c10-1ac6-45de-bb55-ede36ca56bbd in service 100 GiB none gzip-9 + oxp_c6d33b64-fb96-4129-bab1-7878a06a5f9b/crypt/debug cdf3684f-a6cf-4449-b9ec-e696b2c663e2 in service 100 GiB none gzip-9 + oxp_e4d937e1-6ddc-4eca-bb08-c1f73791e608/crypt/debug 686c19cf-a0d7-45f6-866f-c564612b2664 in service 100 GiB none gzip-9 ++ oxp_c6d33b64-fb96-4129-bab1-7878a06a5f9b/crypt/zone/oxz_nexus_43cbc3a6-e640-43f5-a9a2-f83eff427870 7f7a2971-de10-4d5a-a814-901adc52bb00 in service none none off + + + omicron zones: + ----------------------------------------------------------------------------------------------------------------------- + zone type zone id image source disposition underlay IP + 
----------------------------------------------------------------------------------------------------------------------- + crucible 058fd5f9-60a8-4e11-9302-15172782e17d install dataset in service fd00:1122:3344:101::27 + crucible 5199c033-4cf9-4ab6-8ae7-566bd7606363 install dataset in service fd00:1122:3344:101::25 + crucible dfac80b4-a887-430a-ae87-a4e065dba787 install dataset in service fd00:1122:3344:101::26 + crucible_pantry ba4994a8-23f9-4b1a-a84f-a08d74591389 install dataset in service fd00:1122:3344:101::24 + external_dns 803bfb63-c246-41db-b0da-d3b87ddfc63d install dataset in service fd00:1122:3344:101::23 + internal_dns 427ec88f-f467-42fa-9bbb-66a91a36103c install dataset in service fd00:1122:3344:2::1 + internal_ntp 6444f8a5-6465-4f0b-a549-1993c113569c install dataset in service fd00:1122:3344:101::21 + nexus 0c71b3b2-6ceb-4e8f-b020-b08675e83038 install dataset in service fd00:1122:3344:101::22 ++ nexus 43cbc3a6-e640-43f5-a9a2-f83eff427870 artifact: version 1.0.0 in service fd00:1122:3344:101::28 + + + sled d81c6a84-79b8-4958-ae41-ea46c9b19763 (active, config generation 2 -> 3): + + host phase 2 contents: + ------------------------ + slot boot image source + ------------------------ + A current contents + B current contents + + + physical disks: + ------------------------------------------------------------------------------------ + vendor model serial disposition + ------------------------------------------------------------------------------------ + fake-vendor fake-model serial-18b20749-0748-4105-bb10-7b13cfc776e2 in service + fake-vendor fake-model serial-30c16fe4-4229-49d0-ab01-3138f2c7dff2 in service + fake-vendor fake-model serial-4930954e-9ac7-4453-b63f-5ab97c389a99 in service + + + datasets: + ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + dataset name dataset id disposition quota reservation compression + ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + oxp_18b20749-0748-4105-bb10-7b13cfc776e2/crucible 7ea73f80-c4e0-450a-92dc-8397ce2af14f in service none none off + oxp_30c16fe4-4229-49d0-ab01-3138f2c7dff2/crucible 6f04dd20-5e2c-4fa8-8430-a886470ed140 in service none none off + oxp_4930954e-9ac7-4453-b63f-5ab97c389a99/crucible a50cd13a-5749-4e79-bb8b-19229500a8b3 in service none none off + oxp_18b20749-0748-4105-bb10-7b13cfc776e2/crypt/external_dns 96ae8389-3027-4260-9374-e0f6ce851de2 in service none none off + oxp_18b20749-0748-4105-bb10-7b13cfc776e2/crypt/internal_dns 1cb0a47a-59ac-4892-8e92-cf87b4290f96 in service none none off + oxp_18b20749-0748-4105-bb10-7b13cfc776e2/crypt/zone 45cd9687-20be-4247-b62a-dfdacf324929 in service none none off + oxp_30c16fe4-4229-49d0-ab01-3138f2c7dff2/crypt/zone e009d8b8-4695-4322-b53f-f03f2744aef7 in service none none off + oxp_4930954e-9ac7-4453-b63f-5ab97c389a99/crypt/zone 252ac39f-b9e2-4697-8c07-3a833115d704 in service none none off + oxp_30c16fe4-4229-49d0-ab01-3138f2c7dff2/crypt/zone/oxz_crucible_694bd14f-cb24-4be4-bb19-876e79cda2c8 3443a368-199e-4d26-b59f-3f2bbd507761 in service none none off + oxp_4930954e-9ac7-4453-b63f-5ab97c389a99/crypt/zone/oxz_crucible_7c252b64-c5af-4ec1-989e-9a03f3b0f111 429da94b-19f7-48bd-98e9-47842863ba7b in service none none off + 
oxp_18b20749-0748-4105-bb10-7b13cfc776e2/crypt/zone/oxz_crucible_f55647d4-5500-4ad3-893a-df45bd50d622 50ea8c15-c4c0-4403-a490-d14b3405dfc2 in service none none off + oxp_18b20749-0748-4105-bb10-7b13cfc776e2/crypt/zone/oxz_crucible_pantry_75b220ba-a0f4-4872-8202-dc7c87f062d0 54bbadaf-ec04-41a2-a62f-f5ac5bf321be in service none none off + oxp_18b20749-0748-4105-bb10-7b13cfc776e2/crypt/zone/oxz_external_dns_f6ec9c67-946a-4da3-98d5-581f72ce8bf0 090bd88d-0a43-4040-a832-b13ae721f74f in service none none off + oxp_18b20749-0748-4105-bb10-7b13cfc776e2/crypt/zone/oxz_internal_dns_ea5b4030-b52f-44b2-8d70-45f15f987d01 b1deff4b-51df-4a37-9043-afbd7c70a1cb in service none none off + oxp_18b20749-0748-4105-bb10-7b13cfc776e2/crypt/zone/oxz_nexus_3eeb8d49-eb1a-43f8-bb64-c2338421c2c6 4da74a5b-6911-4cca-b624-b90c65530117 in service none none off + oxp_18b20749-0748-4105-bb10-7b13cfc776e2/crypt/zone/oxz_ntp_f10a4fb9-759f-4a65-b25e-5794ad2d07d8 c65a9c1c-36dc-4ddb-8aac-ec3be8dbb209 in service none none off + oxp_18b20749-0748-4105-bb10-7b13cfc776e2/crypt/debug 7a6a2058-ea78-49de-9730-cce5e28b4cfb in service 100 GiB none gzip-9 + oxp_30c16fe4-4229-49d0-ab01-3138f2c7dff2/crypt/debug 41071985-1dfd-4ce5-8bc2-897161a8bce4 in service 100 GiB none gzip-9 + oxp_4930954e-9ac7-4453-b63f-5ab97c389a99/crypt/debug 21fd4f3a-ec31-469b-87b1-087c343a2422 in service 100 GiB none gzip-9 ++ oxp_30c16fe4-4229-49d0-ab01-3138f2c7dff2/crypt/zone/oxz_nexus_26fbf986-e560-4449-a351-547d1721b90e e0b86bc5-6a64-432b-bcbe-482e228a4e7d in service none none off + + + omicron zones: + ----------------------------------------------------------------------------------------------------------------------- + zone type zone id image source disposition underlay IP + ----------------------------------------------------------------------------------------------------------------------- + crucible 694bd14f-cb24-4be4-bb19-876e79cda2c8 install dataset in service fd00:1122:3344:103::26 + crucible 7c252b64-c5af-4ec1-989e-9a03f3b0f111 install dataset in service fd00:1122:3344:103::27 + crucible f55647d4-5500-4ad3-893a-df45bd50d622 install dataset in service fd00:1122:3344:103::25 + crucible_pantry 75b220ba-a0f4-4872-8202-dc7c87f062d0 install dataset in service fd00:1122:3344:103::24 + external_dns f6ec9c67-946a-4da3-98d5-581f72ce8bf0 install dataset in service fd00:1122:3344:103::23 + internal_dns ea5b4030-b52f-44b2-8d70-45f15f987d01 install dataset in service fd00:1122:3344:3::1 + internal_ntp f10a4fb9-759f-4a65-b25e-5794ad2d07d8 install dataset in service fd00:1122:3344:103::21 + nexus 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6 install dataset in service fd00:1122:3344:103::22 ++ nexus 26fbf986-e560-4449-a351-547d1721b90e artifact: version 1.0.0 in service fd00:1122:3344:103::28 + + COCKROACHDB SETTINGS: state fingerprint::::::::::::::::: (none) (unchanged) cluster.preserve_downgrade_option: (do not modify) (unchanged) @@ -230,6 +420,7 @@ to: blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -245,11 +436,13 @@ to: blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 internal DNS: - DNS zone: "control-plane.oxide.internal" (unchanged) +* DNS zone: "control-plane.oxide.internal": name: 058fd5f9-60a8-4e11-9302-15172782e17d.host (records: 1) AAAA fd00:1122:3344:101::27 name: 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host (records: 1) AAAA fd00:1122:3344:101::22 ++ name: 
26fbf986-e560-4449-a351-547d1721b90e.host (records: 1) ++ AAAA fd00:1122:3344:103::28 name: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled (records: 1) AAAA fd00:1122:3344:102::1 name: 353b3b65-20f7-48c3-88f7-495bd5d31545.host (records: 1) @@ -258,6 +451,8 @@ internal DNS: AAAA fd00:1122:3344:103::22 name: 427ec88f-f467-42fa-9bbb-66a91a36103c.host (records: 1) AAAA fd00:1122:3344:2::1 ++ name: 43cbc3a6-e640-43f5-a9a2-f83eff427870.host (records: 1) ++ AAAA fd00:1122:3344:101::28 name: 466a9f29-62bf-4e63-924a-b9efdb86afec.host (records: 1) AAAA fd00:1122:3344:102::22 name: 5199c033-4cf9-4ab6-8ae7-566bd7606363.host (records: 1) @@ -326,16 +521,24 @@ internal DNS: SRV port 5353 427ec88f-f467-42fa-9bbb-66a91a36103c.host.control-plane.oxide.internal SRV port 5353 99e2f30b-3174-40bf-a78a-90da8abba8ca.host.control-plane.oxide.internal SRV port 5353 ea5b4030-b52f-44b2-8d70-45f15f987d01.host.control-plane.oxide.internal - name: _nexus._tcp (records: 3) - SRV port 12221 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host.control-plane.oxide.internal - SRV port 12221 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6.host.control-plane.oxide.internal - SRV port 12221 466a9f29-62bf-4e63-924a-b9efdb86afec.host.control-plane.oxide.internal +* name: _nexus._tcp (records: 3 -> 6) +- SRV port 12221 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host.control-plane.oxide.internal +- SRV port 12221 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6.host.control-plane.oxide.internal +- SRV port 12221 466a9f29-62bf-4e63-924a-b9efdb86afec.host.control-plane.oxide.internal ++ SRV port 12221 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host.control-plane.oxide.internal ++ SRV port 12221 26fbf986-e560-4449-a351-547d1721b90e.host.control-plane.oxide.internal ++ SRV port 12221 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6.host.control-plane.oxide.internal ++ SRV port 12221 43cbc3a6-e640-43f5-a9a2-f83eff427870.host.control-plane.oxide.internal ++ SRV port 12221 466a9f29-62bf-4e63-924a-b9efdb86afec.host.control-plane.oxide.internal ++ SRV port 12221 a67ad53f-d551-40e7-abae-57664779b27b.host.control-plane.oxide.internal name: _oximeter-reader._tcp (records: 1) SRV port 9000 353b3b65-20f7-48c3-88f7-495bd5d31545.host.control-plane.oxide.internal name: _repo-depot._tcp (records: 3) SRV port 12348 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled.control-plane.oxide.internal SRV port 12348 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6.sled.control-plane.oxide.internal SRV port 12348 d81c6a84-79b8-4958-ae41-ea46c9b19763.sled.control-plane.oxide.internal ++ name: a67ad53f-d551-40e7-abae-57664779b27b.host (records: 1) ++ AAAA fd00:1122:3344:102::29 name: ad6a3a03-8d0f-4504-99a4-cbf73d69b973.host (records: 1) AAAA fd00:1122:3344:102::25 name: ba4994a8-23f9-4b1a-a84f-a08d74591389.host (records: 1) @@ -364,15 +567,21 @@ internal DNS: AAAA fd00:1122:3344:3::1 external DNS: - DNS zone: "oxide.example" (unchanged) +* DNS zone: "oxide.example": name: @ (records: 3) NS ns1.oxide.example NS ns2.oxide.example NS ns3.oxide.example - name: example-silo.sys (records: 3) - A 192.0.2.2 - A 192.0.2.3 - A 192.0.2.4 +* name: example-silo.sys (records: 3 -> 6) +- A 192.0.2.2 +- A 192.0.2.3 +- A 192.0.2.4 ++ A 192.0.2.2 ++ A 192.0.2.7 ++ A 192.0.2.3 ++ A 192.0.2.6 ++ A 192.0.2.5 ++ A 192.0.2.4 name: ns1 (records: 1) A 198.51.100.1 name: ns2 (records: 1) @@ -385,9 +594,9 @@ external DNS: > # If we generate another plan, there should be no change. 
> blueprint-plan latest latest -INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 -INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 -INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 10, num_already_artifact: 1, num_eligible: 0, num_ineligible: 9 +INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 INFO MGS-driven update not yet completed (will keep it), artifact_version: 1.0.0, artifact_hash: 005ea358f1cd316df42465b1e3a0334ea22cc0c0442cf9ddf9b42fbf49780236, expected_stage0_next_version: NoValidVersion, expected_stage0_version: 0.0.1, component: rot_bootloader, sp_slot: 0, sp_type: Sled, serial_number: serial0, part_number: model0 INFO reached maximum number of pending MGS-driven updates, max: 1 generated blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4 based on parent blueprint 8da82a8e-bf97-4fbd-8ddd-9f6462732cf1 @@ -398,6 +607,7 @@ chicken switches: * 1 pending MGS update: * model0:serial0: RotBootloader(PendingMgsUpdateRotBootloaderDetails { expected_stage0_version: ArtifactVersion("0.0.1"), expected_stage0_next_version: NoValidVersion }) * zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) 
+* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -412,6 +622,7 @@ to: blueprint 58d5e830-0884-47d8-a7cd-b2b3751adeb4 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -424,6 +635,8 @@ internal DNS: AAAA fd00:1122:3344:101::27 name: 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host (records: 1) AAAA fd00:1122:3344:101::22 + name: 26fbf986-e560-4449-a351-547d1721b90e.host (records: 1) + AAAA fd00:1122:3344:103::28 name: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled (records: 1) AAAA fd00:1122:3344:102::1 name: 353b3b65-20f7-48c3-88f7-495bd5d31545.host (records: 1) @@ -432,6 +645,8 @@ internal DNS: AAAA fd00:1122:3344:103::22 name: 427ec88f-f467-42fa-9bbb-66a91a36103c.host (records: 1) AAAA fd00:1122:3344:2::1 + name: 43cbc3a6-e640-43f5-a9a2-f83eff427870.host (records: 1) + AAAA fd00:1122:3344:101::28 name: 466a9f29-62bf-4e63-924a-b9efdb86afec.host (records: 1) AAAA fd00:1122:3344:102::22 name: 5199c033-4cf9-4ab6-8ae7-566bd7606363.host (records: 1) @@ -500,16 +715,21 @@ internal DNS: SRV port 5353 427ec88f-f467-42fa-9bbb-66a91a36103c.host.control-plane.oxide.internal SRV port 5353 99e2f30b-3174-40bf-a78a-90da8abba8ca.host.control-plane.oxide.internal SRV port 5353 ea5b4030-b52f-44b2-8d70-45f15f987d01.host.control-plane.oxide.internal - name: _nexus._tcp (records: 3) + name: _nexus._tcp (records: 6) SRV port 12221 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host.control-plane.oxide.internal + SRV port 12221 26fbf986-e560-4449-a351-547d1721b90e.host.control-plane.oxide.internal SRV port 12221 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6.host.control-plane.oxide.internal + SRV port 12221 43cbc3a6-e640-43f5-a9a2-f83eff427870.host.control-plane.oxide.internal SRV port 12221 466a9f29-62bf-4e63-924a-b9efdb86afec.host.control-plane.oxide.internal + SRV port 12221 a67ad53f-d551-40e7-abae-57664779b27b.host.control-plane.oxide.internal name: _oximeter-reader._tcp (records: 1) SRV port 9000 353b3b65-20f7-48c3-88f7-495bd5d31545.host.control-plane.oxide.internal name: _repo-depot._tcp (records: 3) SRV port 12348 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled.control-plane.oxide.internal SRV port 12348 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6.sled.control-plane.oxide.internal SRV port 12348 d81c6a84-79b8-4958-ae41-ea46c9b19763.sled.control-plane.oxide.internal + name: a67ad53f-d551-40e7-abae-57664779b27b.host (records: 1) + AAAA fd00:1122:3344:102::29 name: ad6a3a03-8d0f-4504-99a4-cbf73d69b973.host (records: 1) AAAA fd00:1122:3344:102::25 name: ba4994a8-23f9-4b1a-a84f-a08d74591389.host (records: 1) @@ -543,9 +763,12 @@ external DNS: NS ns1.oxide.example NS ns2.oxide.example NS ns3.oxide.example - name: example-silo.sys (records: 3) + name: example-silo.sys (records: 6) A 192.0.2.2 + A 192.0.2.7 A 192.0.2.3 + A 192.0.2.6 + A 192.0.2.5 A 192.0.2.4 name: ns1 (records: 1) A 198.51.100.1 @@ -568,9 +791,9 @@ set sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 RoT bootloader versions: stage0 -> generated inventory collection eb0796d5-ab8a-4f7b-a884-b4aeacb8ab51 from configured sleds > blueprint-plan latest latest -INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 -INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 
-INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 10, num_already_artifact: 1, num_eligible: 0, num_ineligible: 9 +INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 INFO MGS-driven update completed (will remove it and re-evaluate board), artifact_version: 1.0.0, artifact_hash: 005ea358f1cd316df42465b1e3a0334ea22cc0c0442cf9ddf9b42fbf49780236, expected_stage0_next_version: NoValidVersion, expected_stage0_version: 0.0.1, component: rot_bootloader, sp_slot: 0, sp_type: Sled, serial_number: serial0, part_number: model0 INFO configuring MGS-driven update, artifact_version: 1.0.0, artifact_hash: 04e4a7fdb84acca92c8fd3235e26d64ea61bef8a5f98202589fd346989c5720a, expected_transient_boot_preference: None, expected_pending_persistent_boot_preference: None, expected_persistent_boot_preference: A, expected_active_slot: ExpectedActiveRotSlot { slot: A, version: ArtifactVersion("0.0.2") }, expected_inactive_version: NoValidVersion, component: rot, sp_slot: 0, sp_type: Sled, serial_number: serial0, part_number: model0 INFO reached maximum number of pending MGS-driven updates, max: 1 @@ -582,6 +805,7 @@ chicken switches: * 1 pending MGS update: * model0:serial0: Rot(PendingMgsUpdateRotDetails { expected_active_slot: ExpectedActiveRotSlot { slot: A, version: ArtifactVersion("0.0.2") }, expected_inactive_version: NoValidVersion, expected_persistent_boot_preference: A, expected_pending_persistent_boot_preference: None, expected_transient_boot_preference: None }) * zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) 
+* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -596,6 +820,7 @@ to: blueprint af934083-59b5-4bf6-8966-6fb5292c29e1 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -617,6 +842,8 @@ internal DNS: AAAA fd00:1122:3344:101::27 name: 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host (records: 1) AAAA fd00:1122:3344:101::22 + name: 26fbf986-e560-4449-a351-547d1721b90e.host (records: 1) + AAAA fd00:1122:3344:103::28 name: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled (records: 1) AAAA fd00:1122:3344:102::1 name: 353b3b65-20f7-48c3-88f7-495bd5d31545.host (records: 1) @@ -625,6 +852,8 @@ internal DNS: AAAA fd00:1122:3344:103::22 name: 427ec88f-f467-42fa-9bbb-66a91a36103c.host (records: 1) AAAA fd00:1122:3344:2::1 + name: 43cbc3a6-e640-43f5-a9a2-f83eff427870.host (records: 1) + AAAA fd00:1122:3344:101::28 name: 466a9f29-62bf-4e63-924a-b9efdb86afec.host (records: 1) AAAA fd00:1122:3344:102::22 name: 5199c033-4cf9-4ab6-8ae7-566bd7606363.host (records: 1) @@ -693,16 +922,21 @@ internal DNS: SRV port 5353 427ec88f-f467-42fa-9bbb-66a91a36103c.host.control-plane.oxide.internal SRV port 5353 99e2f30b-3174-40bf-a78a-90da8abba8ca.host.control-plane.oxide.internal SRV port 5353 ea5b4030-b52f-44b2-8d70-45f15f987d01.host.control-plane.oxide.internal - name: _nexus._tcp (records: 3) + name: _nexus._tcp (records: 6) SRV port 12221 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host.control-plane.oxide.internal + SRV port 12221 26fbf986-e560-4449-a351-547d1721b90e.host.control-plane.oxide.internal SRV port 12221 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6.host.control-plane.oxide.internal + SRV port 12221 43cbc3a6-e640-43f5-a9a2-f83eff427870.host.control-plane.oxide.internal SRV port 12221 466a9f29-62bf-4e63-924a-b9efdb86afec.host.control-plane.oxide.internal + SRV port 12221 a67ad53f-d551-40e7-abae-57664779b27b.host.control-plane.oxide.internal name: _oximeter-reader._tcp (records: 1) SRV port 9000 353b3b65-20f7-48c3-88f7-495bd5d31545.host.control-plane.oxide.internal name: _repo-depot._tcp (records: 3) SRV port 12348 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled.control-plane.oxide.internal SRV port 12348 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6.sled.control-plane.oxide.internal SRV port 12348 d81c6a84-79b8-4958-ae41-ea46c9b19763.sled.control-plane.oxide.internal + name: a67ad53f-d551-40e7-abae-57664779b27b.host (records: 1) + AAAA fd00:1122:3344:102::29 name: ad6a3a03-8d0f-4504-99a4-cbf73d69b973.host (records: 1) AAAA fd00:1122:3344:102::25 name: ba4994a8-23f9-4b1a-a84f-a08d74591389.host (records: 1) @@ -736,9 +970,12 @@ external DNS: NS ns1.oxide.example NS ns2.oxide.example NS ns3.oxide.example - name: example-silo.sys (records: 3) + name: example-silo.sys (records: 6) A 192.0.2.2 + A 192.0.2.7 A 192.0.2.3 + A 192.0.2.6 + A 192.0.2.5 A 192.0.2.4 name: ns1 (records: 1) A 198.51.100.1 @@ -761,9 +998,9 @@ set sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 RoT settings: slot a -> 1.0.0 generated inventory collection 61f451b3-2121-4ed6-91c7-a550054f6c21 from configured sleds > blueprint-plan latest latest -INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 -INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 -INFO 
performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 10, num_already_artifact: 1, num_eligible: 0, num_ineligible: 9 +INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 INFO MGS-driven update completed (will remove it and re-evaluate board), artifact_version: 1.0.0, artifact_hash: 04e4a7fdb84acca92c8fd3235e26d64ea61bef8a5f98202589fd346989c5720a, expected_transient_boot_preference: None, expected_pending_persistent_boot_preference: None, expected_persistent_boot_preference: A, expected_active_slot: ExpectedActiveRotSlot { slot: A, version: ArtifactVersion("0.0.2") }, expected_inactive_version: NoValidVersion, component: rot, sp_slot: 0, sp_type: Sled, serial_number: serial0, part_number: model0 INFO configuring MGS-driven update, artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: NoValidVersion, expected_active_version: 0.0.1, component: sp, sp_slot: 0, sp_type: Sled, serial_number: serial0, part_number: model0 INFO reached maximum number of pending MGS-driven updates, max: 1 @@ -775,6 +1012,7 @@ chicken switches: * 1 pending MGS update: * model0:serial0: Sp(PendingMgsUpdateSpDetails { expected_active_version: ArtifactVersion("0.0.1"), expected_inactive_version: NoValidVersion }) * zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) 
+* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -789,6 +1027,7 @@ to: blueprint df06bb57-ad42-4431-9206-abff322896c7 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -810,6 +1049,8 @@ internal DNS: AAAA fd00:1122:3344:101::27 name: 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host (records: 1) AAAA fd00:1122:3344:101::22 + name: 26fbf986-e560-4449-a351-547d1721b90e.host (records: 1) + AAAA fd00:1122:3344:103::28 name: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled (records: 1) AAAA fd00:1122:3344:102::1 name: 353b3b65-20f7-48c3-88f7-495bd5d31545.host (records: 1) @@ -818,6 +1059,8 @@ internal DNS: AAAA fd00:1122:3344:103::22 name: 427ec88f-f467-42fa-9bbb-66a91a36103c.host (records: 1) AAAA fd00:1122:3344:2::1 + name: 43cbc3a6-e640-43f5-a9a2-f83eff427870.host (records: 1) + AAAA fd00:1122:3344:101::28 name: 466a9f29-62bf-4e63-924a-b9efdb86afec.host (records: 1) AAAA fd00:1122:3344:102::22 name: 5199c033-4cf9-4ab6-8ae7-566bd7606363.host (records: 1) @@ -886,16 +1129,21 @@ internal DNS: SRV port 5353 427ec88f-f467-42fa-9bbb-66a91a36103c.host.control-plane.oxide.internal SRV port 5353 99e2f30b-3174-40bf-a78a-90da8abba8ca.host.control-plane.oxide.internal SRV port 5353 ea5b4030-b52f-44b2-8d70-45f15f987d01.host.control-plane.oxide.internal - name: _nexus._tcp (records: 3) + name: _nexus._tcp (records: 6) SRV port 12221 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host.control-plane.oxide.internal + SRV port 12221 26fbf986-e560-4449-a351-547d1721b90e.host.control-plane.oxide.internal SRV port 12221 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6.host.control-plane.oxide.internal + SRV port 12221 43cbc3a6-e640-43f5-a9a2-f83eff427870.host.control-plane.oxide.internal SRV port 12221 466a9f29-62bf-4e63-924a-b9efdb86afec.host.control-plane.oxide.internal + SRV port 12221 a67ad53f-d551-40e7-abae-57664779b27b.host.control-plane.oxide.internal name: _oximeter-reader._tcp (records: 1) SRV port 9000 353b3b65-20f7-48c3-88f7-495bd5d31545.host.control-plane.oxide.internal name: _repo-depot._tcp (records: 3) SRV port 12348 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled.control-plane.oxide.internal SRV port 12348 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6.sled.control-plane.oxide.internal SRV port 12348 d81c6a84-79b8-4958-ae41-ea46c9b19763.sled.control-plane.oxide.internal + name: a67ad53f-d551-40e7-abae-57664779b27b.host (records: 1) + AAAA fd00:1122:3344:102::29 name: ad6a3a03-8d0f-4504-99a4-cbf73d69b973.host (records: 1) AAAA fd00:1122:3344:102::25 name: ba4994a8-23f9-4b1a-a84f-a08d74591389.host (records: 1) @@ -929,9 +1177,12 @@ external DNS: NS ns1.oxide.example NS ns2.oxide.example NS ns3.oxide.example - name: example-silo.sys (records: 3) + name: example-silo.sys (records: 6) A 192.0.2.2 + A 192.0.2.7 A 192.0.2.3 + A 192.0.2.6 + A 192.0.2.5 A 192.0.2.4 name: ns1 (records: 1) A 198.51.100.1 @@ -955,9 +1206,9 @@ set sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 SP versions: active -> 1.0.0 generated inventory collection b1bda47d-2c19-4fba-96e3-d9df28db7436 from configured sleds > blueprint-plan latest latest -INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 -INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 
-INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 10, num_already_artifact: 1, num_eligible: 0, num_ineligible: 9 +INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 INFO MGS-driven update completed (will remove it and re-evaluate board), artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: NoValidVersion, expected_active_version: 0.0.1, component: sp, sp_slot: 0, sp_type: Sled, serial_number: serial0, part_number: model0 INFO configuring MGS-driven update, artifact_version: 1.0.0, artifact_hash: 2053f8594971bbf0a7326c833e2ffc12b065b9d823b9c0b967d275fa595e4e89, sled_agent_address: [fd00:1122:3344:101::1]:12345, expected_inactive_phase_2_hash: f3dd0c7a1bd4500ea0d8bcf67581f576d47752b2f1998a4cb0f0c3155c483008, expected_inactive_phase_1_hash: 0202020202020202020202020202020202020202020202020202020202020202, expected_active_phase_2_hash: 0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a, expected_active_phase_1_hash: 0101010101010101010101010101010101010101010101010101010101010101, expected_boot_disk: A, expected_active_phase_1_slot: A, component: host_phase_1, sp_slot: 0, sp_type: Sled, serial_number: serial0, part_number: model0 INFO reached maximum number of pending MGS-driven updates, max: 1 @@ -969,6 +1220,7 @@ chicken switches: * 1 pending MGS update: * model0:serial0: HostPhase1(PendingMgsUpdateHostPhase1Details { expected_active_phase_1_slot: A, expected_boot_disk: A, expected_active_phase_1_hash: ArtifactHash("0101010101010101010101010101010101010101010101010101010101010101"), expected_active_phase_2_hash: ArtifactHash("0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a"), expected_inactive_phase_1_hash: ArtifactHash("0202020202020202020202020202020202020202020202020202020202020202"), expected_inactive_phase_2_hash: ArtifactHash("f3dd0c7a1bd4500ea0d8bcf67581f576d47752b2f1998a4cb0f0c3155c483008"), sled_agent_address: [fd00:1122:3344:101::1]:12345 }) * zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) 
+* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -977,7 +1229,7 @@ to: blueprint 7f976e0d-d2a5-4eeb-9e82-c82bc2824aba MODIFIED SLEDS: - sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 (active, config generation 2 -> 3): + sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 (active, config generation 3 -> 4): host phase 2 contents: -------------------------------- @@ -1016,6 +1268,7 @@ to: blueprint 7f976e0d-d2a5-4eeb-9e82-c82bc2824aba oxp_073979dd-3248-44a5-9fa1-cc72a140d682/crypt/zone/oxz_external_dns_803bfb63-c246-41db-b0da-d3b87ddfc63d 43931274-7fe8-4077-825d-dff2bc8efa58 in service none none off oxp_073979dd-3248-44a5-9fa1-cc72a140d682/crypt/zone/oxz_internal_dns_427ec88f-f467-42fa-9bbb-66a91a36103c 1bca7f71-5e42-4749-91ec-fa40793a3a9a in service none none off oxp_073979dd-3248-44a5-9fa1-cc72a140d682/crypt/zone/oxz_nexus_0c71b3b2-6ceb-4e8f-b020-b08675e83038 a4c3032e-21fa-4d4a-b040-a7e3c572cf3c in service none none off + oxp_c6d33b64-fb96-4129-bab1-7878a06a5f9b/crypt/zone/oxz_nexus_43cbc3a6-e640-43f5-a9a2-f83eff427870 7f7a2971-de10-4d5a-a814-901adc52bb00 in service none none off oxp_073979dd-3248-44a5-9fa1-cc72a140d682/crypt/zone/oxz_ntp_6444f8a5-6465-4f0b-a549-1993c113569c 3ac089c9-9dec-465b-863a-188e80d71fb4 in service none none off oxp_073979dd-3248-44a5-9fa1-cc72a140d682/crypt/debug 248c6c10-1ac6-45de-bb55-ede36ca56bbd in service 100 GiB none gzip-9 oxp_c6d33b64-fb96-4129-bab1-7878a06a5f9b/crypt/debug cdf3684f-a6cf-4449-b9ec-e696b2c663e2 in service 100 GiB none gzip-9 @@ -1023,17 +1276,18 @@ to: blueprint 7f976e0d-d2a5-4eeb-9e82-c82bc2824aba omicron zones: - --------------------------------------------------------------------------------------------------------------- - zone type zone id image source disposition underlay IP - --------------------------------------------------------------------------------------------------------------- - crucible 058fd5f9-60a8-4e11-9302-15172782e17d install dataset in service fd00:1122:3344:101::27 - crucible 5199c033-4cf9-4ab6-8ae7-566bd7606363 install dataset in service fd00:1122:3344:101::25 - crucible dfac80b4-a887-430a-ae87-a4e065dba787 install dataset in service fd00:1122:3344:101::26 - crucible_pantry ba4994a8-23f9-4b1a-a84f-a08d74591389 install dataset in service fd00:1122:3344:101::24 - external_dns 803bfb63-c246-41db-b0da-d3b87ddfc63d install dataset in service fd00:1122:3344:101::23 - internal_dns 427ec88f-f467-42fa-9bbb-66a91a36103c install dataset in service fd00:1122:3344:2::1 - internal_ntp 6444f8a5-6465-4f0b-a549-1993c113569c install dataset in service fd00:1122:3344:101::21 - nexus 0c71b3b2-6ceb-4e8f-b020-b08675e83038 install dataset in service fd00:1122:3344:101::22 + ----------------------------------------------------------------------------------------------------------------------- + zone type zone id image source disposition underlay IP + ----------------------------------------------------------------------------------------------------------------------- + crucible 058fd5f9-60a8-4e11-9302-15172782e17d install dataset in service fd00:1122:3344:101::27 + crucible 5199c033-4cf9-4ab6-8ae7-566bd7606363 install dataset in service fd00:1122:3344:101::25 + crucible dfac80b4-a887-430a-ae87-a4e065dba787 install dataset in service fd00:1122:3344:101::26 + crucible_pantry ba4994a8-23f9-4b1a-a84f-a08d74591389 install dataset in service fd00:1122:3344:101::24 + external_dns 803bfb63-c246-41db-b0da-d3b87ddfc63d install dataset in service fd00:1122:3344:101::23 + internal_dns 
427ec88f-f467-42fa-9bbb-66a91a36103c install dataset in service fd00:1122:3344:2::1 + internal_ntp 6444f8a5-6465-4f0b-a549-1993c113569c install dataset in service fd00:1122:3344:101::21 + nexus 0c71b3b2-6ceb-4e8f-b020-b08675e83038 install dataset in service fd00:1122:3344:101::22 + nexus 43cbc3a6-e640-43f5-a9a2-f83eff427870 artifact: version 1.0.0 in service fd00:1122:3344:101::28 COCKROACHDB SETTINGS: @@ -1044,6 +1298,7 @@ to: blueprint 7f976e0d-d2a5-4eeb-9e82-c82bc2824aba internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -1065,6 +1320,8 @@ internal DNS: AAAA fd00:1122:3344:101::27 name: 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host (records: 1) AAAA fd00:1122:3344:101::22 + name: 26fbf986-e560-4449-a351-547d1721b90e.host (records: 1) + AAAA fd00:1122:3344:103::28 name: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled (records: 1) AAAA fd00:1122:3344:102::1 name: 353b3b65-20f7-48c3-88f7-495bd5d31545.host (records: 1) @@ -1073,6 +1330,8 @@ internal DNS: AAAA fd00:1122:3344:103::22 name: 427ec88f-f467-42fa-9bbb-66a91a36103c.host (records: 1) AAAA fd00:1122:3344:2::1 + name: 43cbc3a6-e640-43f5-a9a2-f83eff427870.host (records: 1) + AAAA fd00:1122:3344:101::28 name: 466a9f29-62bf-4e63-924a-b9efdb86afec.host (records: 1) AAAA fd00:1122:3344:102::22 name: 5199c033-4cf9-4ab6-8ae7-566bd7606363.host (records: 1) @@ -1141,16 +1400,21 @@ internal DNS: SRV port 5353 427ec88f-f467-42fa-9bbb-66a91a36103c.host.control-plane.oxide.internal SRV port 5353 99e2f30b-3174-40bf-a78a-90da8abba8ca.host.control-plane.oxide.internal SRV port 5353 ea5b4030-b52f-44b2-8d70-45f15f987d01.host.control-plane.oxide.internal - name: _nexus._tcp (records: 3) + name: _nexus._tcp (records: 6) SRV port 12221 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host.control-plane.oxide.internal + SRV port 12221 26fbf986-e560-4449-a351-547d1721b90e.host.control-plane.oxide.internal SRV port 12221 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6.host.control-plane.oxide.internal + SRV port 12221 43cbc3a6-e640-43f5-a9a2-f83eff427870.host.control-plane.oxide.internal SRV port 12221 466a9f29-62bf-4e63-924a-b9efdb86afec.host.control-plane.oxide.internal + SRV port 12221 a67ad53f-d551-40e7-abae-57664779b27b.host.control-plane.oxide.internal name: _oximeter-reader._tcp (records: 1) SRV port 9000 353b3b65-20f7-48c3-88f7-495bd5d31545.host.control-plane.oxide.internal name: _repo-depot._tcp (records: 3) SRV port 12348 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled.control-plane.oxide.internal SRV port 12348 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6.sled.control-plane.oxide.internal SRV port 12348 d81c6a84-79b8-4958-ae41-ea46c9b19763.sled.control-plane.oxide.internal + name: a67ad53f-d551-40e7-abae-57664779b27b.host (records: 1) + AAAA fd00:1122:3344:102::29 name: ad6a3a03-8d0f-4504-99a4-cbf73d69b973.host (records: 1) AAAA fd00:1122:3344:102::25 name: ba4994a8-23f9-4b1a-a84f-a08d74591389.host (records: 1) @@ -1184,9 +1448,12 @@ external DNS: NS ns1.oxide.example NS ns2.oxide.example NS ns3.oxide.example - name: example-silo.sys (records: 3) + name: example-silo.sys (records: 6) A 192.0.2.2 + A 192.0.2.7 A 192.0.2.3 + A 192.0.2.6 + A 192.0.2.5 A 192.0.2.4 name: ns1 (records: 1) A 198.51.100.1 @@ -1200,9 +1467,9 @@ external DNS: > # If we generate another plan, there should be no change. 
> blueprint-plan latest latest -INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 -INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 -INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 10, num_already_artifact: 1, num_eligible: 0, num_ineligible: 9 +INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 INFO MGS-driven update not yet completed (will keep it), artifact_version: 1.0.0, artifact_hash: 2053f8594971bbf0a7326c833e2ffc12b065b9d823b9c0b967d275fa595e4e89, sled_agent_address: [fd00:1122:3344:101::1]:12345, expected_inactive_phase_2_hash: f3dd0c7a1bd4500ea0d8bcf67581f576d47752b2f1998a4cb0f0c3155c483008, expected_inactive_phase_1_hash: 0202020202020202020202020202020202020202020202020202020202020202, expected_active_phase_2_hash: 0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a, expected_active_phase_1_hash: 0101010101010101010101010101010101010101010101010101010101010101, expected_boot_disk: A, expected_active_phase_1_slot: A, component: host_phase_1, sp_slot: 0, sp_type: Sled, serial_number: serial0, part_number: model0 INFO reached maximum number of pending MGS-driven updates, max: 1 generated blueprint 9034c710-3e57-45f3-99e5-4316145e87ac based on parent blueprint 7f976e0d-d2a5-4eeb-9e82-c82bc2824aba @@ -1213,6 +1480,7 @@ chicken switches: * 1 pending MGS update: * model0:serial0: HostPhase1(PendingMgsUpdateHostPhase1Details { expected_active_phase_1_slot: A, expected_boot_disk: A, expected_active_phase_1_hash: ArtifactHash("0101010101010101010101010101010101010101010101010101010101010101"), expected_active_phase_2_hash: ArtifactHash("0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a"), expected_inactive_phase_1_hash: ArtifactHash("0202020202020202020202020202020202020202020202020202020202020202"), expected_inactive_phase_2_hash: ArtifactHash("f3dd0c7a1bd4500ea0d8bcf67581f576d47752b2f1998a4cb0f0c3155c483008"), sled_agent_address: [fd00:1122:3344:101::1]:12345 }) * zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) 
+* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -1227,6 +1495,7 @@ to: blueprint 9034c710-3e57-45f3-99e5-4316145e87ac internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -1239,6 +1508,8 @@ internal DNS: AAAA fd00:1122:3344:101::27 name: 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host (records: 1) AAAA fd00:1122:3344:101::22 + name: 26fbf986-e560-4449-a351-547d1721b90e.host (records: 1) + AAAA fd00:1122:3344:103::28 name: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled (records: 1) AAAA fd00:1122:3344:102::1 name: 353b3b65-20f7-48c3-88f7-495bd5d31545.host (records: 1) @@ -1247,6 +1518,8 @@ internal DNS: AAAA fd00:1122:3344:103::22 name: 427ec88f-f467-42fa-9bbb-66a91a36103c.host (records: 1) AAAA fd00:1122:3344:2::1 + name: 43cbc3a6-e640-43f5-a9a2-f83eff427870.host (records: 1) + AAAA fd00:1122:3344:101::28 name: 466a9f29-62bf-4e63-924a-b9efdb86afec.host (records: 1) AAAA fd00:1122:3344:102::22 name: 5199c033-4cf9-4ab6-8ae7-566bd7606363.host (records: 1) @@ -1315,16 +1588,21 @@ internal DNS: SRV port 5353 427ec88f-f467-42fa-9bbb-66a91a36103c.host.control-plane.oxide.internal SRV port 5353 99e2f30b-3174-40bf-a78a-90da8abba8ca.host.control-plane.oxide.internal SRV port 5353 ea5b4030-b52f-44b2-8d70-45f15f987d01.host.control-plane.oxide.internal - name: _nexus._tcp (records: 3) + name: _nexus._tcp (records: 6) SRV port 12221 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host.control-plane.oxide.internal + SRV port 12221 26fbf986-e560-4449-a351-547d1721b90e.host.control-plane.oxide.internal SRV port 12221 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6.host.control-plane.oxide.internal + SRV port 12221 43cbc3a6-e640-43f5-a9a2-f83eff427870.host.control-plane.oxide.internal SRV port 12221 466a9f29-62bf-4e63-924a-b9efdb86afec.host.control-plane.oxide.internal + SRV port 12221 a67ad53f-d551-40e7-abae-57664779b27b.host.control-plane.oxide.internal name: _oximeter-reader._tcp (records: 1) SRV port 9000 353b3b65-20f7-48c3-88f7-495bd5d31545.host.control-plane.oxide.internal name: _repo-depot._tcp (records: 3) SRV port 12348 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled.control-plane.oxide.internal SRV port 12348 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6.sled.control-plane.oxide.internal SRV port 12348 d81c6a84-79b8-4958-ae41-ea46c9b19763.sled.control-plane.oxide.internal + name: a67ad53f-d551-40e7-abae-57664779b27b.host (records: 1) + AAAA fd00:1122:3344:102::29 name: ad6a3a03-8d0f-4504-99a4-cbf73d69b973.host (records: 1) AAAA fd00:1122:3344:102::25 name: ba4994a8-23f9-4b1a-a84f-a08d74591389.host (records: 1) @@ -1358,9 +1636,12 @@ external DNS: NS ns1.oxide.example NS ns2.oxide.example NS ns3.oxide.example - name: example-silo.sys (records: 3) + name: example-silo.sys (records: 6) A 192.0.2.2 + A 192.0.2.7 A 192.0.2.3 + A 192.0.2.6 + A 192.0.2.5 A 192.0.2.4 name: ns1 (records: 1) A 198.51.100.1 @@ -1384,9 +1665,9 @@ generated inventory collection a71f7a73-35a6-45e8-acbe-f1c5925eed69 from configu > # Planning after only phase 2 has changed should make no changes. We're still > # waiting on phase 1 to change. 
> blueprint-plan latest latest -INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 -INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 -INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 10, num_already_artifact: 1, num_eligible: 0, num_ineligible: 9 +INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 INFO MGS-driven update not yet completed (will keep it), artifact_version: 1.0.0, artifact_hash: 2053f8594971bbf0a7326c833e2ffc12b065b9d823b9c0b967d275fa595e4e89, sled_agent_address: [fd00:1122:3344:101::1]:12345, expected_inactive_phase_2_hash: f3dd0c7a1bd4500ea0d8bcf67581f576d47752b2f1998a4cb0f0c3155c483008, expected_inactive_phase_1_hash: 0202020202020202020202020202020202020202020202020202020202020202, expected_active_phase_2_hash: 0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a, expected_active_phase_1_hash: 0101010101010101010101010101010101010101010101010101010101010101, expected_boot_disk: A, expected_active_phase_1_slot: A, component: host_phase_1, sp_slot: 0, sp_type: Sled, serial_number: serial0, part_number: model0 INFO reached maximum number of pending MGS-driven updates, max: 1 generated blueprint d60afc57-f15d-476c-bd0f-b1071e2bb976 based on parent blueprint 9034c710-3e57-45f3-99e5-4316145e87ac @@ -1397,6 +1678,7 @@ chicken switches: * 1 pending MGS update: * model0:serial0: HostPhase1(PendingMgsUpdateHostPhase1Details { expected_active_phase_1_slot: A, expected_boot_disk: A, expected_active_phase_1_hash: ArtifactHash("0101010101010101010101010101010101010101010101010101010101010101"), expected_active_phase_2_hash: ArtifactHash("0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a"), expected_inactive_phase_1_hash: ArtifactHash("0202020202020202020202020202020202020202020202020202020202020202"), expected_inactive_phase_2_hash: ArtifactHash("f3dd0c7a1bd4500ea0d8bcf67581f576d47752b2f1998a4cb0f0c3155c483008"), sled_agent_address: [fd00:1122:3344:101::1]:12345 }) * zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) 
+* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -1411,6 +1693,7 @@ to: blueprint d60afc57-f15d-476c-bd0f-b1071e2bb976 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -1423,6 +1706,8 @@ internal DNS: AAAA fd00:1122:3344:101::27 name: 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host (records: 1) AAAA fd00:1122:3344:101::22 + name: 26fbf986-e560-4449-a351-547d1721b90e.host (records: 1) + AAAA fd00:1122:3344:103::28 name: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled (records: 1) AAAA fd00:1122:3344:102::1 name: 353b3b65-20f7-48c3-88f7-495bd5d31545.host (records: 1) @@ -1431,6 +1716,8 @@ internal DNS: AAAA fd00:1122:3344:103::22 name: 427ec88f-f467-42fa-9bbb-66a91a36103c.host (records: 1) AAAA fd00:1122:3344:2::1 + name: 43cbc3a6-e640-43f5-a9a2-f83eff427870.host (records: 1) + AAAA fd00:1122:3344:101::28 name: 466a9f29-62bf-4e63-924a-b9efdb86afec.host (records: 1) AAAA fd00:1122:3344:102::22 name: 5199c033-4cf9-4ab6-8ae7-566bd7606363.host (records: 1) @@ -1499,16 +1786,21 @@ internal DNS: SRV port 5353 427ec88f-f467-42fa-9bbb-66a91a36103c.host.control-plane.oxide.internal SRV port 5353 99e2f30b-3174-40bf-a78a-90da8abba8ca.host.control-plane.oxide.internal SRV port 5353 ea5b4030-b52f-44b2-8d70-45f15f987d01.host.control-plane.oxide.internal - name: _nexus._tcp (records: 3) + name: _nexus._tcp (records: 6) SRV port 12221 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host.control-plane.oxide.internal + SRV port 12221 26fbf986-e560-4449-a351-547d1721b90e.host.control-plane.oxide.internal SRV port 12221 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6.host.control-plane.oxide.internal + SRV port 12221 43cbc3a6-e640-43f5-a9a2-f83eff427870.host.control-plane.oxide.internal SRV port 12221 466a9f29-62bf-4e63-924a-b9efdb86afec.host.control-plane.oxide.internal + SRV port 12221 a67ad53f-d551-40e7-abae-57664779b27b.host.control-plane.oxide.internal name: _oximeter-reader._tcp (records: 1) SRV port 9000 353b3b65-20f7-48c3-88f7-495bd5d31545.host.control-plane.oxide.internal name: _repo-depot._tcp (records: 3) SRV port 12348 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled.control-plane.oxide.internal SRV port 12348 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6.sled.control-plane.oxide.internal SRV port 12348 d81c6a84-79b8-4958-ae41-ea46c9b19763.sled.control-plane.oxide.internal + name: a67ad53f-d551-40e7-abae-57664779b27b.host (records: 1) + AAAA fd00:1122:3344:102::29 name: ad6a3a03-8d0f-4504-99a4-cbf73d69b973.host (records: 1) AAAA fd00:1122:3344:102::25 name: ba4994a8-23f9-4b1a-a84f-a08d74591389.host (records: 1) @@ -1542,9 +1834,12 @@ external DNS: NS ns1.oxide.example NS ns2.oxide.example NS ns3.oxide.example - name: example-silo.sys (records: 3) + name: example-silo.sys (records: 6) A 192.0.2.2 + A 192.0.2.7 A 192.0.2.3 + A 192.0.2.6 + A 192.0.2.5 A 192.0.2.4 name: ns1 (records: 1) A 198.51.100.1 @@ -1567,9 +1862,9 @@ generated inventory collection 0b5efbb3-0b1b-4bbf-b7d8-a2d6fca074c6 from configu > # Planning _still_ shouldn't make any new changes; the OS update as a whole > # isn't done until sled-agent reports it has booted from the new image. 
> blueprint-plan latest latest -INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 -INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 -INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 10, num_already_artifact: 1, num_eligible: 0, num_ineligible: 9 +INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 INFO keeping apparently-impossible MGS-driven update (waiting for recent update to be applied), artifact_version: 1.0.0, artifact_hash: 2053f8594971bbf0a7326c833e2ffc12b065b9d823b9c0b967d275fa595e4e89, sled_agent_address: [fd00:1122:3344:101::1]:12345, expected_inactive_phase_2_hash: f3dd0c7a1bd4500ea0d8bcf67581f576d47752b2f1998a4cb0f0c3155c483008, expected_inactive_phase_1_hash: 0202020202020202020202020202020202020202020202020202020202020202, expected_active_phase_2_hash: 0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a, expected_active_phase_1_hash: 0101010101010101010101010101010101010101010101010101010101010101, expected_boot_disk: A, expected_active_phase_1_slot: A, component: host_phase_1, sp_slot: 0, sp_type: Sled, serial_number: serial0, part_number: model0 INFO reached maximum number of pending MGS-driven updates, max: 1 generated blueprint a5a8f242-ffa5-473c-8efd-2acf2dc0b736 based on parent blueprint d60afc57-f15d-476c-bd0f-b1071e2bb976 @@ -1580,6 +1875,7 @@ chicken switches: * 1 pending MGS update: * model0:serial0: HostPhase1(PendingMgsUpdateHostPhase1Details { expected_active_phase_1_slot: A, expected_boot_disk: A, expected_active_phase_1_hash: ArtifactHash("0101010101010101010101010101010101010101010101010101010101010101"), expected_active_phase_2_hash: ArtifactHash("0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a"), expected_inactive_phase_1_hash: ArtifactHash("0202020202020202020202020202020202020202020202020202020202020202"), expected_inactive_phase_2_hash: ArtifactHash("f3dd0c7a1bd4500ea0d8bcf67581f576d47752b2f1998a4cb0f0c3155c483008"), sled_agent_address: [fd00:1122:3344:101::1]:12345 }) * zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) 
+* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -1594,6 +1890,7 @@ to: blueprint a5a8f242-ffa5-473c-8efd-2acf2dc0b736 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -1606,6 +1903,8 @@ internal DNS: AAAA fd00:1122:3344:101::27 name: 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host (records: 1) AAAA fd00:1122:3344:101::22 + name: 26fbf986-e560-4449-a351-547d1721b90e.host (records: 1) + AAAA fd00:1122:3344:103::28 name: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled (records: 1) AAAA fd00:1122:3344:102::1 name: 353b3b65-20f7-48c3-88f7-495bd5d31545.host (records: 1) @@ -1614,6 +1913,8 @@ internal DNS: AAAA fd00:1122:3344:103::22 name: 427ec88f-f467-42fa-9bbb-66a91a36103c.host (records: 1) AAAA fd00:1122:3344:2::1 + name: 43cbc3a6-e640-43f5-a9a2-f83eff427870.host (records: 1) + AAAA fd00:1122:3344:101::28 name: 466a9f29-62bf-4e63-924a-b9efdb86afec.host (records: 1) AAAA fd00:1122:3344:102::22 name: 5199c033-4cf9-4ab6-8ae7-566bd7606363.host (records: 1) @@ -1682,16 +1983,21 @@ internal DNS: SRV port 5353 427ec88f-f467-42fa-9bbb-66a91a36103c.host.control-plane.oxide.internal SRV port 5353 99e2f30b-3174-40bf-a78a-90da8abba8ca.host.control-plane.oxide.internal SRV port 5353 ea5b4030-b52f-44b2-8d70-45f15f987d01.host.control-plane.oxide.internal - name: _nexus._tcp (records: 3) + name: _nexus._tcp (records: 6) SRV port 12221 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host.control-plane.oxide.internal + SRV port 12221 26fbf986-e560-4449-a351-547d1721b90e.host.control-plane.oxide.internal SRV port 12221 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6.host.control-plane.oxide.internal + SRV port 12221 43cbc3a6-e640-43f5-a9a2-f83eff427870.host.control-plane.oxide.internal SRV port 12221 466a9f29-62bf-4e63-924a-b9efdb86afec.host.control-plane.oxide.internal + SRV port 12221 a67ad53f-d551-40e7-abae-57664779b27b.host.control-plane.oxide.internal name: _oximeter-reader._tcp (records: 1) SRV port 9000 353b3b65-20f7-48c3-88f7-495bd5d31545.host.control-plane.oxide.internal name: _repo-depot._tcp (records: 3) SRV port 12348 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled.control-plane.oxide.internal SRV port 12348 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6.sled.control-plane.oxide.internal SRV port 12348 d81c6a84-79b8-4958-ae41-ea46c9b19763.sled.control-plane.oxide.internal + name: a67ad53f-d551-40e7-abae-57664779b27b.host (records: 1) + AAAA fd00:1122:3344:102::29 name: ad6a3a03-8d0f-4504-99a4-cbf73d69b973.host (records: 1) AAAA fd00:1122:3344:102::25 name: ba4994a8-23f9-4b1a-a84f-a08d74591389.host (records: 1) @@ -1725,9 +2031,12 @@ external DNS: NS ns1.oxide.example NS ns2.oxide.example NS ns3.oxide.example - name: example-silo.sys (records: 3) + name: example-silo.sys (records: 6) A 192.0.2.2 + A 192.0.2.7 A 192.0.2.3 + A 192.0.2.6 + A 192.0.2.5 A 192.0.2.4 name: ns1 (records: 1) A 198.51.100.1 @@ -1750,9 +2059,9 @@ generated inventory collection 78f72e8d-46a9-40a9-8618-602f54454d80 from configu > # Planning should now remove the host OS update and plan the next RoT bootloader > # update. 
> blueprint-plan latest latest -INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 -INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 -INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 10, num_already_artifact: 1, num_eligible: 0, num_ineligible: 9 +INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 INFO MGS-driven update completed (will remove it and re-evaluate board), artifact_version: 1.0.0, artifact_hash: 2053f8594971bbf0a7326c833e2ffc12b065b9d823b9c0b967d275fa595e4e89, sled_agent_address: [fd00:1122:3344:101::1]:12345, expected_inactive_phase_2_hash: f3dd0c7a1bd4500ea0d8bcf67581f576d47752b2f1998a4cb0f0c3155c483008, expected_inactive_phase_1_hash: 0202020202020202020202020202020202020202020202020202020202020202, expected_active_phase_2_hash: 0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a, expected_active_phase_1_hash: 0101010101010101010101010101010101010101010101010101010101010101, expected_boot_disk: A, expected_active_phase_1_slot: A, component: host_phase_1, sp_slot: 0, sp_type: Sled, serial_number: serial0, part_number: model0 INFO skipping board for MGS-driven update, serial_number: serial0, part_number: model0 INFO configuring MGS-driven update, artifact_version: 1.0.0, artifact_hash: 005ea358f1cd316df42465b1e3a0334ea22cc0c0442cf9ddf9b42fbf49780236, expected_stage0_next_version: NoValidVersion, expected_stage0_version: 0.0.1, component: rot_bootloader, sp_slot: 1, sp_type: Sled, serial_number: serial1, part_number: model1 @@ -1765,6 +2074,7 @@ chicken switches: * 1 pending MGS update: * model1:serial1: RotBootloader(PendingMgsUpdateRotBootloaderDetails { expected_stage0_version: ArtifactVersion("0.0.1"), expected_stage0_next_version: NoValidVersion }) * zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) 
+* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -1779,6 +2089,7 @@ to: blueprint 626487fa-7139-45ec-8416-902271fc730b internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -1800,6 +2111,8 @@ internal DNS: AAAA fd00:1122:3344:101::27 name: 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host (records: 1) AAAA fd00:1122:3344:101::22 + name: 26fbf986-e560-4449-a351-547d1721b90e.host (records: 1) + AAAA fd00:1122:3344:103::28 name: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled (records: 1) AAAA fd00:1122:3344:102::1 name: 353b3b65-20f7-48c3-88f7-495bd5d31545.host (records: 1) @@ -1808,6 +2121,8 @@ internal DNS: AAAA fd00:1122:3344:103::22 name: 427ec88f-f467-42fa-9bbb-66a91a36103c.host (records: 1) AAAA fd00:1122:3344:2::1 + name: 43cbc3a6-e640-43f5-a9a2-f83eff427870.host (records: 1) + AAAA fd00:1122:3344:101::28 name: 466a9f29-62bf-4e63-924a-b9efdb86afec.host (records: 1) AAAA fd00:1122:3344:102::22 name: 5199c033-4cf9-4ab6-8ae7-566bd7606363.host (records: 1) @@ -1876,16 +2191,21 @@ internal DNS: SRV port 5353 427ec88f-f467-42fa-9bbb-66a91a36103c.host.control-plane.oxide.internal SRV port 5353 99e2f30b-3174-40bf-a78a-90da8abba8ca.host.control-plane.oxide.internal SRV port 5353 ea5b4030-b52f-44b2-8d70-45f15f987d01.host.control-plane.oxide.internal - name: _nexus._tcp (records: 3) + name: _nexus._tcp (records: 6) SRV port 12221 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host.control-plane.oxide.internal + SRV port 12221 26fbf986-e560-4449-a351-547d1721b90e.host.control-plane.oxide.internal SRV port 12221 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6.host.control-plane.oxide.internal + SRV port 12221 43cbc3a6-e640-43f5-a9a2-f83eff427870.host.control-plane.oxide.internal SRV port 12221 466a9f29-62bf-4e63-924a-b9efdb86afec.host.control-plane.oxide.internal + SRV port 12221 a67ad53f-d551-40e7-abae-57664779b27b.host.control-plane.oxide.internal name: _oximeter-reader._tcp (records: 1) SRV port 9000 353b3b65-20f7-48c3-88f7-495bd5d31545.host.control-plane.oxide.internal name: _repo-depot._tcp (records: 3) SRV port 12348 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled.control-plane.oxide.internal SRV port 12348 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6.sled.control-plane.oxide.internal SRV port 12348 d81c6a84-79b8-4958-ae41-ea46c9b19763.sled.control-plane.oxide.internal + name: a67ad53f-d551-40e7-abae-57664779b27b.host (records: 1) + AAAA fd00:1122:3344:102::29 name: ad6a3a03-8d0f-4504-99a4-cbf73d69b973.host (records: 1) AAAA fd00:1122:3344:102::25 name: ba4994a8-23f9-4b1a-a84f-a08d74591389.host (records: 1) @@ -1919,9 +2239,12 @@ external DNS: NS ns1.oxide.example NS ns2.oxide.example NS ns3.oxide.example - name: example-silo.sys (records: 3) + name: example-silo.sys (records: 6) A 192.0.2.2 + A 192.0.2.7 A 192.0.2.3 + A 192.0.2.6 + A 192.0.2.5 A 192.0.2.4 name: ns1 (records: 1) A 198.51.100.1 @@ -1947,9 +2270,9 @@ set sled 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c RoT bootloader versions: stage0_ne generated inventory collection 39363465-89ae-4ac2-9be1-099068da9d45 from configured sleds > blueprint-plan latest latest -INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 -INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, 
num_ineligible: 8 -INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 10, num_already_artifact: 1, num_eligible: 0, num_ineligible: 9 +INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 INFO MGS-driven update impossible (will remove it and re-evaluate board), artifact_version: 1.0.0, artifact_hash: 005ea358f1cd316df42465b1e3a0334ea22cc0c0442cf9ddf9b42fbf49780236, expected_stage0_next_version: NoValidVersion, expected_stage0_version: 0.0.1, component: rot_bootloader, sp_slot: 1, sp_type: Sled, serial_number: serial1, part_number: model1 INFO configuring MGS-driven update, artifact_version: 1.0.0, artifact_hash: 005ea358f1cd316df42465b1e3a0334ea22cc0c0442cf9ddf9b42fbf49780236, expected_stage0_next_version: Version(ArtifactVersion("0.5.0")), expected_stage0_version: 0.0.1, component: rot_bootloader, sp_slot: 1, sp_type: Sled, serial_number: serial1, part_number: model1 INFO reached maximum number of pending MGS-driven updates, max: 1 @@ -1961,6 +2284,7 @@ chicken switches: * 1 pending MGS update: * model1:serial1: RotBootloader(PendingMgsUpdateRotBootloaderDetails { expected_stage0_version: ArtifactVersion("0.0.1"), expected_stage0_next_version: Version(ArtifactVersion("0.5.0")) }) * zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) 
+* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -1975,6 +2299,7 @@ to: blueprint c1a0d242-9160-40f4-96ae-61f8f40a0b1b internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -1996,6 +2321,8 @@ internal DNS: AAAA fd00:1122:3344:101::27 name: 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host (records: 1) AAAA fd00:1122:3344:101::22 + name: 26fbf986-e560-4449-a351-547d1721b90e.host (records: 1) + AAAA fd00:1122:3344:103::28 name: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled (records: 1) AAAA fd00:1122:3344:102::1 name: 353b3b65-20f7-48c3-88f7-495bd5d31545.host (records: 1) @@ -2004,6 +2331,8 @@ internal DNS: AAAA fd00:1122:3344:103::22 name: 427ec88f-f467-42fa-9bbb-66a91a36103c.host (records: 1) AAAA fd00:1122:3344:2::1 + name: 43cbc3a6-e640-43f5-a9a2-f83eff427870.host (records: 1) + AAAA fd00:1122:3344:101::28 name: 466a9f29-62bf-4e63-924a-b9efdb86afec.host (records: 1) AAAA fd00:1122:3344:102::22 name: 5199c033-4cf9-4ab6-8ae7-566bd7606363.host (records: 1) @@ -2072,16 +2401,21 @@ internal DNS: SRV port 5353 427ec88f-f467-42fa-9bbb-66a91a36103c.host.control-plane.oxide.internal SRV port 5353 99e2f30b-3174-40bf-a78a-90da8abba8ca.host.control-plane.oxide.internal SRV port 5353 ea5b4030-b52f-44b2-8d70-45f15f987d01.host.control-plane.oxide.internal - name: _nexus._tcp (records: 3) + name: _nexus._tcp (records: 6) SRV port 12221 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host.control-plane.oxide.internal + SRV port 12221 26fbf986-e560-4449-a351-547d1721b90e.host.control-plane.oxide.internal SRV port 12221 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6.host.control-plane.oxide.internal + SRV port 12221 43cbc3a6-e640-43f5-a9a2-f83eff427870.host.control-plane.oxide.internal SRV port 12221 466a9f29-62bf-4e63-924a-b9efdb86afec.host.control-plane.oxide.internal + SRV port 12221 a67ad53f-d551-40e7-abae-57664779b27b.host.control-plane.oxide.internal name: _oximeter-reader._tcp (records: 1) SRV port 9000 353b3b65-20f7-48c3-88f7-495bd5d31545.host.control-plane.oxide.internal name: _repo-depot._tcp (records: 3) SRV port 12348 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled.control-plane.oxide.internal SRV port 12348 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6.sled.control-plane.oxide.internal SRV port 12348 d81c6a84-79b8-4958-ae41-ea46c9b19763.sled.control-plane.oxide.internal + name: a67ad53f-d551-40e7-abae-57664779b27b.host (records: 1) + AAAA fd00:1122:3344:102::29 name: ad6a3a03-8d0f-4504-99a4-cbf73d69b973.host (records: 1) AAAA fd00:1122:3344:102::25 name: ba4994a8-23f9-4b1a-a84f-a08d74591389.host (records: 1) @@ -2115,9 +2449,12 @@ external DNS: NS ns1.oxide.example NS ns2.oxide.example NS ns3.oxide.example - name: example-silo.sys (records: 3) + name: example-silo.sys (records: 6) A 192.0.2.2 + A 192.0.2.7 A 192.0.2.3 + A 192.0.2.6 + A 192.0.2.5 A 192.0.2.4 name: ns1 (records: 1) A 198.51.100.1 @@ -2138,9 +2475,9 @@ set sled 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c RoT bootloader versions: stage0 -> generated inventory collection 04bc9001-0836-4fec-b9cb-9d4760caf8b4 from configured sleds > blueprint-plan latest latest -INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 -INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, 
num_ineligible: 8 -INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 10, num_already_artifact: 1, num_eligible: 0, num_ineligible: 9 +INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 INFO MGS-driven update completed (will remove it and re-evaluate board), artifact_version: 1.0.0, artifact_hash: 005ea358f1cd316df42465b1e3a0334ea22cc0c0442cf9ddf9b42fbf49780236, expected_stage0_next_version: Version(ArtifactVersion("0.5.0")), expected_stage0_version: 0.0.1, component: rot_bootloader, sp_slot: 1, sp_type: Sled, serial_number: serial1, part_number: model1 INFO configuring MGS-driven update, artifact_version: 1.0.0, artifact_hash: 04e4a7fdb84acca92c8fd3235e26d64ea61bef8a5f98202589fd346989c5720a, expected_transient_boot_preference: None, expected_pending_persistent_boot_preference: None, expected_persistent_boot_preference: A, expected_active_slot: ExpectedActiveRotSlot { slot: A, version: ArtifactVersion("0.0.2") }, expected_inactive_version: NoValidVersion, component: rot, sp_slot: 1, sp_type: Sled, serial_number: serial1, part_number: model1 INFO reached maximum number of pending MGS-driven updates, max: 1 @@ -2152,6 +2489,7 @@ chicken switches: * 1 pending MGS update: * model1:serial1: Rot(PendingMgsUpdateRotDetails { expected_active_slot: ExpectedActiveRotSlot { slot: A, version: ArtifactVersion("0.0.2") }, expected_inactive_version: NoValidVersion, expected_persistent_boot_preference: A, expected_pending_persistent_boot_preference: None, expected_transient_boot_preference: None }) * zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) 
+* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -2166,6 +2504,7 @@ to: blueprint afb09faf-a586-4483-9289-04d4f1d8ba23 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -2187,6 +2526,8 @@ internal DNS: AAAA fd00:1122:3344:101::27 name: 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host (records: 1) AAAA fd00:1122:3344:101::22 + name: 26fbf986-e560-4449-a351-547d1721b90e.host (records: 1) + AAAA fd00:1122:3344:103::28 name: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled (records: 1) AAAA fd00:1122:3344:102::1 name: 353b3b65-20f7-48c3-88f7-495bd5d31545.host (records: 1) @@ -2195,6 +2536,8 @@ internal DNS: AAAA fd00:1122:3344:103::22 name: 427ec88f-f467-42fa-9bbb-66a91a36103c.host (records: 1) AAAA fd00:1122:3344:2::1 + name: 43cbc3a6-e640-43f5-a9a2-f83eff427870.host (records: 1) + AAAA fd00:1122:3344:101::28 name: 466a9f29-62bf-4e63-924a-b9efdb86afec.host (records: 1) AAAA fd00:1122:3344:102::22 name: 5199c033-4cf9-4ab6-8ae7-566bd7606363.host (records: 1) @@ -2263,16 +2606,21 @@ internal DNS: SRV port 5353 427ec88f-f467-42fa-9bbb-66a91a36103c.host.control-plane.oxide.internal SRV port 5353 99e2f30b-3174-40bf-a78a-90da8abba8ca.host.control-plane.oxide.internal SRV port 5353 ea5b4030-b52f-44b2-8d70-45f15f987d01.host.control-plane.oxide.internal - name: _nexus._tcp (records: 3) + name: _nexus._tcp (records: 6) SRV port 12221 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host.control-plane.oxide.internal + SRV port 12221 26fbf986-e560-4449-a351-547d1721b90e.host.control-plane.oxide.internal SRV port 12221 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6.host.control-plane.oxide.internal + SRV port 12221 43cbc3a6-e640-43f5-a9a2-f83eff427870.host.control-plane.oxide.internal SRV port 12221 466a9f29-62bf-4e63-924a-b9efdb86afec.host.control-plane.oxide.internal + SRV port 12221 a67ad53f-d551-40e7-abae-57664779b27b.host.control-plane.oxide.internal name: _oximeter-reader._tcp (records: 1) SRV port 9000 353b3b65-20f7-48c3-88f7-495bd5d31545.host.control-plane.oxide.internal name: _repo-depot._tcp (records: 3) SRV port 12348 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled.control-plane.oxide.internal SRV port 12348 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6.sled.control-plane.oxide.internal SRV port 12348 d81c6a84-79b8-4958-ae41-ea46c9b19763.sled.control-plane.oxide.internal + name: a67ad53f-d551-40e7-abae-57664779b27b.host (records: 1) + AAAA fd00:1122:3344:102::29 name: ad6a3a03-8d0f-4504-99a4-cbf73d69b973.host (records: 1) AAAA fd00:1122:3344:102::25 name: ba4994a8-23f9-4b1a-a84f-a08d74591389.host (records: 1) @@ -2306,9 +2654,12 @@ external DNS: NS ns1.oxide.example NS ns2.oxide.example NS ns3.oxide.example - name: example-silo.sys (records: 3) + name: example-silo.sys (records: 6) A 192.0.2.2 + A 192.0.2.7 A 192.0.2.3 + A 192.0.2.6 + A 192.0.2.5 A 192.0.2.4 name: ns1 (records: 1) A 198.51.100.1 @@ -2333,9 +2684,9 @@ set sled 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c RoT settings: slot b -> 0.5.0 generated inventory collection 08abe624-4b5f-491c-90cb-d74a84e4ba3e from configured sleds > blueprint-plan latest latest -INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 -INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, 
num_ineligible: 8 -INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 10, num_already_artifact: 1, num_eligible: 0, num_ineligible: 9 +INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 INFO MGS-driven update impossible (will remove it and re-evaluate board), artifact_version: 1.0.0, artifact_hash: 04e4a7fdb84acca92c8fd3235e26d64ea61bef8a5f98202589fd346989c5720a, expected_transient_boot_preference: None, expected_pending_persistent_boot_preference: None, expected_persistent_boot_preference: A, expected_active_slot: ExpectedActiveRotSlot { slot: A, version: ArtifactVersion("0.0.2") }, expected_inactive_version: NoValidVersion, component: rot, sp_slot: 1, sp_type: Sled, serial_number: serial1, part_number: model1 INFO configuring MGS-driven update, artifact_version: 1.0.0, artifact_hash: 04e4a7fdb84acca92c8fd3235e26d64ea61bef8a5f98202589fd346989c5720a, expected_transient_boot_preference: None, expected_pending_persistent_boot_preference: None, expected_persistent_boot_preference: A, expected_active_slot: ExpectedActiveRotSlot { slot: A, version: ArtifactVersion("0.0.2") }, expected_inactive_version: Version(ArtifactVersion("0.5.0")), component: rot, sp_slot: 1, sp_type: Sled, serial_number: serial1, part_number: model1 INFO reached maximum number of pending MGS-driven updates, max: 1 @@ -2347,6 +2698,7 @@ chicken switches: * 1 pending MGS update: * model1:serial1: Rot(PendingMgsUpdateRotDetails { expected_active_slot: ExpectedActiveRotSlot { slot: A, version: ArtifactVersion("0.0.2") }, expected_inactive_version: Version(ArtifactVersion("0.5.0")), expected_persistent_boot_preference: A, expected_pending_persistent_boot_preference: None, expected_transient_boot_preference: None }) * zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) 
+* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -2361,6 +2713,7 @@ to: blueprint ce365dff-2cdb-4f35-a186-b15e20e1e700 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -2382,6 +2735,8 @@ internal DNS: AAAA fd00:1122:3344:101::27 name: 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host (records: 1) AAAA fd00:1122:3344:101::22 + name: 26fbf986-e560-4449-a351-547d1721b90e.host (records: 1) + AAAA fd00:1122:3344:103::28 name: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled (records: 1) AAAA fd00:1122:3344:102::1 name: 353b3b65-20f7-48c3-88f7-495bd5d31545.host (records: 1) @@ -2390,6 +2745,8 @@ internal DNS: AAAA fd00:1122:3344:103::22 name: 427ec88f-f467-42fa-9bbb-66a91a36103c.host (records: 1) AAAA fd00:1122:3344:2::1 + name: 43cbc3a6-e640-43f5-a9a2-f83eff427870.host (records: 1) + AAAA fd00:1122:3344:101::28 name: 466a9f29-62bf-4e63-924a-b9efdb86afec.host (records: 1) AAAA fd00:1122:3344:102::22 name: 5199c033-4cf9-4ab6-8ae7-566bd7606363.host (records: 1) @@ -2458,16 +2815,21 @@ internal DNS: SRV port 5353 427ec88f-f467-42fa-9bbb-66a91a36103c.host.control-plane.oxide.internal SRV port 5353 99e2f30b-3174-40bf-a78a-90da8abba8ca.host.control-plane.oxide.internal SRV port 5353 ea5b4030-b52f-44b2-8d70-45f15f987d01.host.control-plane.oxide.internal - name: _nexus._tcp (records: 3) + name: _nexus._tcp (records: 6) SRV port 12221 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host.control-plane.oxide.internal + SRV port 12221 26fbf986-e560-4449-a351-547d1721b90e.host.control-plane.oxide.internal SRV port 12221 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6.host.control-plane.oxide.internal + SRV port 12221 43cbc3a6-e640-43f5-a9a2-f83eff427870.host.control-plane.oxide.internal SRV port 12221 466a9f29-62bf-4e63-924a-b9efdb86afec.host.control-plane.oxide.internal + SRV port 12221 a67ad53f-d551-40e7-abae-57664779b27b.host.control-plane.oxide.internal name: _oximeter-reader._tcp (records: 1) SRV port 9000 353b3b65-20f7-48c3-88f7-495bd5d31545.host.control-plane.oxide.internal name: _repo-depot._tcp (records: 3) SRV port 12348 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled.control-plane.oxide.internal SRV port 12348 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6.sled.control-plane.oxide.internal SRV port 12348 d81c6a84-79b8-4958-ae41-ea46c9b19763.sled.control-plane.oxide.internal + name: a67ad53f-d551-40e7-abae-57664779b27b.host (records: 1) + AAAA fd00:1122:3344:102::29 name: ad6a3a03-8d0f-4504-99a4-cbf73d69b973.host (records: 1) AAAA fd00:1122:3344:102::25 name: ba4994a8-23f9-4b1a-a84f-a08d74591389.host (records: 1) @@ -2501,9 +2863,12 @@ external DNS: NS ns1.oxide.example NS ns2.oxide.example NS ns3.oxide.example - name: example-silo.sys (records: 3) + name: example-silo.sys (records: 6) A 192.0.2.2 + A 192.0.2.7 A 192.0.2.3 + A 192.0.2.6 + A 192.0.2.5 A 192.0.2.4 name: ns1 (records: 1) A 198.51.100.1 @@ -2524,9 +2889,9 @@ set sled 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c RoT settings: slot a -> 1.0.0 generated inventory collection 005f6a30-7f65-4593-9f78-ee68f766f42b from configured sleds > blueprint-plan latest latest -INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 -INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, 
num_ineligible: 8 -INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 10, num_already_artifact: 1, num_eligible: 0, num_ineligible: 9 +INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 INFO MGS-driven update completed (will remove it and re-evaluate board), artifact_version: 1.0.0, artifact_hash: 04e4a7fdb84acca92c8fd3235e26d64ea61bef8a5f98202589fd346989c5720a, expected_transient_boot_preference: None, expected_pending_persistent_boot_preference: None, expected_persistent_boot_preference: A, expected_active_slot: ExpectedActiveRotSlot { slot: A, version: ArtifactVersion("0.0.2") }, expected_inactive_version: Version(ArtifactVersion("0.5.0")), component: rot, sp_slot: 1, sp_type: Sled, serial_number: serial1, part_number: model1 INFO configuring MGS-driven update, artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: NoValidVersion, expected_active_version: 0.0.1, component: sp, sp_slot: 1, sp_type: Sled, serial_number: serial1, part_number: model1 INFO reached maximum number of pending MGS-driven updates, max: 1 @@ -2538,6 +2903,7 @@ chicken switches: * 1 pending MGS update: * model1:serial1: Sp(PendingMgsUpdateSpDetails { expected_active_version: ArtifactVersion("0.0.1"), expected_inactive_version: NoValidVersion }) * zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) 
+* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -2552,6 +2918,7 @@ to: blueprint 8f2d1f39-7c88-4701-aa43-56bf281b28c1 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -2573,6 +2940,8 @@ internal DNS: AAAA fd00:1122:3344:101::27 name: 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host (records: 1) AAAA fd00:1122:3344:101::22 + name: 26fbf986-e560-4449-a351-547d1721b90e.host (records: 1) + AAAA fd00:1122:3344:103::28 name: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled (records: 1) AAAA fd00:1122:3344:102::1 name: 353b3b65-20f7-48c3-88f7-495bd5d31545.host (records: 1) @@ -2581,6 +2950,8 @@ internal DNS: AAAA fd00:1122:3344:103::22 name: 427ec88f-f467-42fa-9bbb-66a91a36103c.host (records: 1) AAAA fd00:1122:3344:2::1 + name: 43cbc3a6-e640-43f5-a9a2-f83eff427870.host (records: 1) + AAAA fd00:1122:3344:101::28 name: 466a9f29-62bf-4e63-924a-b9efdb86afec.host (records: 1) AAAA fd00:1122:3344:102::22 name: 5199c033-4cf9-4ab6-8ae7-566bd7606363.host (records: 1) @@ -2649,16 +3020,21 @@ internal DNS: SRV port 5353 427ec88f-f467-42fa-9bbb-66a91a36103c.host.control-plane.oxide.internal SRV port 5353 99e2f30b-3174-40bf-a78a-90da8abba8ca.host.control-plane.oxide.internal SRV port 5353 ea5b4030-b52f-44b2-8d70-45f15f987d01.host.control-plane.oxide.internal - name: _nexus._tcp (records: 3) + name: _nexus._tcp (records: 6) SRV port 12221 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host.control-plane.oxide.internal + SRV port 12221 26fbf986-e560-4449-a351-547d1721b90e.host.control-plane.oxide.internal SRV port 12221 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6.host.control-plane.oxide.internal + SRV port 12221 43cbc3a6-e640-43f5-a9a2-f83eff427870.host.control-plane.oxide.internal SRV port 12221 466a9f29-62bf-4e63-924a-b9efdb86afec.host.control-plane.oxide.internal + SRV port 12221 a67ad53f-d551-40e7-abae-57664779b27b.host.control-plane.oxide.internal name: _oximeter-reader._tcp (records: 1) SRV port 9000 353b3b65-20f7-48c3-88f7-495bd5d31545.host.control-plane.oxide.internal name: _repo-depot._tcp (records: 3) SRV port 12348 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled.control-plane.oxide.internal SRV port 12348 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6.sled.control-plane.oxide.internal SRV port 12348 d81c6a84-79b8-4958-ae41-ea46c9b19763.sled.control-plane.oxide.internal + name: a67ad53f-d551-40e7-abae-57664779b27b.host (records: 1) + AAAA fd00:1122:3344:102::29 name: ad6a3a03-8d0f-4504-99a4-cbf73d69b973.host (records: 1) AAAA fd00:1122:3344:102::25 name: ba4994a8-23f9-4b1a-a84f-a08d74591389.host (records: 1) @@ -2692,9 +3068,12 @@ external DNS: NS ns1.oxide.example NS ns2.oxide.example NS ns3.oxide.example - name: example-silo.sys (records: 3) + name: example-silo.sys (records: 6) A 192.0.2.2 + A 192.0.2.7 A 192.0.2.3 + A 192.0.2.6 + A 192.0.2.5 A 192.0.2.4 name: ns1 (records: 1) A 198.51.100.1 @@ -2718,9 +3097,9 @@ set sled 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c SP versions: inactive -> 0.5.0 generated inventory collection b5263998-e486-4cea-8842-b32bd326fa3a from configured sleds > blueprint-plan latest latest -INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 -INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, 
num_ineligible: 8 -INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 10, num_already_artifact: 1, num_eligible: 0, num_ineligible: 9 +INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 INFO MGS-driven update impossible (will remove it and re-evaluate board), artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: NoValidVersion, expected_active_version: 0.0.1, component: sp, sp_slot: 1, sp_type: Sled, serial_number: serial1, part_number: model1 INFO configuring MGS-driven update, artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: Version(ArtifactVersion("0.5.0")), expected_active_version: 0.0.1, component: sp, sp_slot: 1, sp_type: Sled, serial_number: serial1, part_number: model1 INFO reached maximum number of pending MGS-driven updates, max: 1 @@ -2732,6 +3111,7 @@ chicken switches: * 1 pending MGS update: * model1:serial1: Sp(PendingMgsUpdateSpDetails { expected_active_version: ArtifactVersion("0.0.1"), expected_inactive_version: Version(ArtifactVersion("0.5.0")) }) * zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) +* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -2746,6 +3126,7 @@ to: blueprint 12d602a6-5ab4-487a-b94e-eb30cdf30300 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -2767,6 +3148,8 @@ internal DNS: AAAA fd00:1122:3344:101::27 name: 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host (records: 1) AAAA fd00:1122:3344:101::22 + name: 26fbf986-e560-4449-a351-547d1721b90e.host (records: 1) + AAAA fd00:1122:3344:103::28 name: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled (records: 1) AAAA fd00:1122:3344:102::1 name: 353b3b65-20f7-48c3-88f7-495bd5d31545.host (records: 1) @@ -2775,6 +3158,8 @@ internal DNS: AAAA fd00:1122:3344:103::22 name: 427ec88f-f467-42fa-9bbb-66a91a36103c.host (records: 1) AAAA fd00:1122:3344:2::1 + name: 43cbc3a6-e640-43f5-a9a2-f83eff427870.host (records: 1) + AAAA fd00:1122:3344:101::28 name: 466a9f29-62bf-4e63-924a-b9efdb86afec.host (records: 1) AAAA fd00:1122:3344:102::22 name: 5199c033-4cf9-4ab6-8ae7-566bd7606363.host (records: 1) @@ -2843,16 +3228,21 @@ internal DNS: SRV port 5353 427ec88f-f467-42fa-9bbb-66a91a36103c.host.control-plane.oxide.internal SRV port 5353 99e2f30b-3174-40bf-a78a-90da8abba8ca.host.control-plane.oxide.internal SRV port 5353 ea5b4030-b52f-44b2-8d70-45f15f987d01.host.control-plane.oxide.internal - name: _nexus._tcp (records: 3) + name: _nexus._tcp (records: 6) SRV port 12221 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host.control-plane.oxide.internal + SRV port 12221 26fbf986-e560-4449-a351-547d1721b90e.host.control-plane.oxide.internal SRV port 12221 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6.host.control-plane.oxide.internal + SRV port 12221 
43cbc3a6-e640-43f5-a9a2-f83eff427870.host.control-plane.oxide.internal SRV port 12221 466a9f29-62bf-4e63-924a-b9efdb86afec.host.control-plane.oxide.internal + SRV port 12221 a67ad53f-d551-40e7-abae-57664779b27b.host.control-plane.oxide.internal name: _oximeter-reader._tcp (records: 1) SRV port 9000 353b3b65-20f7-48c3-88f7-495bd5d31545.host.control-plane.oxide.internal name: _repo-depot._tcp (records: 3) SRV port 12348 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled.control-plane.oxide.internal SRV port 12348 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6.sled.control-plane.oxide.internal SRV port 12348 d81c6a84-79b8-4958-ae41-ea46c9b19763.sled.control-plane.oxide.internal + name: a67ad53f-d551-40e7-abae-57664779b27b.host (records: 1) + AAAA fd00:1122:3344:102::29 name: ad6a3a03-8d0f-4504-99a4-cbf73d69b973.host (records: 1) AAAA fd00:1122:3344:102::25 name: ba4994a8-23f9-4b1a-a84f-a08d74591389.host (records: 1) @@ -2886,9 +3276,12 @@ external DNS: NS ns1.oxide.example NS ns2.oxide.example NS ns3.oxide.example - name: example-silo.sys (records: 3) + name: example-silo.sys (records: 6) A 192.0.2.2 + A 192.0.2.7 A 192.0.2.3 + A 192.0.2.6 + A 192.0.2.5 A 192.0.2.4 name: ns1 (records: 1) A 198.51.100.1 @@ -2910,9 +3303,9 @@ generated inventory collection 68767302-7fed-4eb1-9611-3dfd807ff0cd from configu > # Planning should remove this update and add an OS update for this sled. > blueprint-plan latest latest -INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 -INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 -INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 10, num_already_artifact: 1, num_eligible: 0, num_ineligible: 9 +INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 INFO MGS-driven update completed (will remove it and re-evaluate board), artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: Version(ArtifactVersion("0.5.0")), expected_active_version: 0.0.1, component: sp, sp_slot: 1, sp_type: Sled, serial_number: serial1, part_number: model1 INFO configuring MGS-driven update, artifact_version: 1.0.0, artifact_hash: 2053f8594971bbf0a7326c833e2ffc12b065b9d823b9c0b967d275fa595e4e89, sled_agent_address: [fd00:1122:3344:102::1]:12345, expected_inactive_phase_2_hash: f3dd0c7a1bd4500ea0d8bcf67581f576d47752b2f1998a4cb0f0c3155c483008, expected_inactive_phase_1_hash: 0202020202020202020202020202020202020202020202020202020202020202, expected_active_phase_2_hash: 0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a, expected_active_phase_1_hash: 0101010101010101010101010101010101010101010101010101010101010101, expected_boot_disk: A, expected_active_phase_1_slot: A, component: host_phase_1, sp_slot: 1, sp_type: Sled, serial_number: serial1, part_number: model1 INFO reached maximum 
number of pending MGS-driven updates, max: 1 @@ -2924,6 +3317,7 @@ chicken switches: * 1 pending MGS update: * model1:serial1: HostPhase1(PendingMgsUpdateHostPhase1Details { expected_active_phase_1_slot: A, expected_boot_disk: A, expected_active_phase_1_hash: ArtifactHash("0101010101010101010101010101010101010101010101010101010101010101"), expected_active_phase_2_hash: ArtifactHash("0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a"), expected_inactive_phase_1_hash: ArtifactHash("0202020202020202020202020202020202020202020202020202020202020202"), expected_inactive_phase_2_hash: ArtifactHash("f3dd0c7a1bd4500ea0d8bcf67581f576d47752b2f1998a4cb0f0c3155c483008"), sled_agent_address: [fd00:1122:3344:102::1]:12345 }) * zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) +* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -2932,7 +3326,7 @@ to: blueprint 61a93ea3-c872-48e0-aace-e86b0c52b839 MODIFIED SLEDS: - sled 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c (active, config generation 2 -> 3): + sled 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c (active, config generation 3 -> 4): host phase 2 contents: -------------------------------- @@ -2973,6 +3367,7 @@ to: blueprint 61a93ea3-c872-48e0-aace-e86b0c52b839 oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/zone/oxz_external_dns_6c3ae381-04f7-41ea-b0ac-74db387dbc3a 8c4fa711-1d5d-4e93-85f0-d17bff47b063 in service none none off oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/zone/oxz_internal_dns_99e2f30b-3174-40bf-a78a-90da8abba8ca c31623de-c19b-4615-9f1d-5e1daa5d3bda in service none none off oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/zone/oxz_nexus_466a9f29-62bf-4e63-924a-b9efdb86afec 3560dd69-3b23-4c69-807d-d673104cfc68 in service none none off + oxp_72c59873-31ff-4e36-8d76-ff834009349a/crypt/zone/oxz_nexus_a67ad53f-d551-40e7-abae-57664779b27b 9edcc144-9dd9-4bf9-a26d-26f265400b0b in service none none off oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/zone/oxz_ntp_62620961-fc4a-481e-968b-f5acbac0dc63 09b9cc9b-3426-470b-a7bc-538f82dede03 in service none none off oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/debug 93957ca0-9ed1-4e7b-8c34-2ce07a69541c in service 100 GiB none gzip-9 oxp_72c59873-31ff-4e36-8d76-ff834009349a/crypt/debug 2db6b7c1-0f46-4ced-a3ad-48872793360e in service 100 GiB none gzip-9 @@ -2980,18 +3375,19 @@ to: blueprint 61a93ea3-c872-48e0-aace-e86b0c52b839 omicron zones: - --------------------------------------------------------------------------------------------------------------- - zone type zone id image source disposition underlay IP - --------------------------------------------------------------------------------------------------------------- - clickhouse 353b3b65-20f7-48c3-88f7-495bd5d31545 install dataset in service fd00:1122:3344:102::23 - crucible 86a22a56-0168-453d-9df1-cb2a7c64b5d3 install dataset in service fd00:1122:3344:102::28 - crucible bd354eef-d8a6-4165-9124-283fb5e46d77 install dataset in service fd00:1122:3344:102::26 - crucible e2fdefe7-95b2-4fd2-ae37-56929a06d58c install dataset in service fd00:1122:3344:102::27 - crucible_pantry ad6a3a03-8d0f-4504-99a4-cbf73d69b973 install dataset in service fd00:1122:3344:102::25 - external_dns 6c3ae381-04f7-41ea-b0ac-74db387dbc3a install dataset in service fd00:1122:3344:102::24 - internal_dns 99e2f30b-3174-40bf-a78a-90da8abba8ca install dataset in service fd00:1122:3344:1::1 - internal_ntp 62620961-fc4a-481e-968b-f5acbac0dc63 install dataset in service fd00:1122:3344:102::21 - nexus 
466a9f29-62bf-4e63-924a-b9efdb86afec install dataset in service fd00:1122:3344:102::22 + ----------------------------------------------------------------------------------------------------------------------- + zone type zone id image source disposition underlay IP + ----------------------------------------------------------------------------------------------------------------------- + clickhouse 353b3b65-20f7-48c3-88f7-495bd5d31545 install dataset in service fd00:1122:3344:102::23 + crucible 86a22a56-0168-453d-9df1-cb2a7c64b5d3 install dataset in service fd00:1122:3344:102::28 + crucible bd354eef-d8a6-4165-9124-283fb5e46d77 install dataset in service fd00:1122:3344:102::26 + crucible e2fdefe7-95b2-4fd2-ae37-56929a06d58c install dataset in service fd00:1122:3344:102::27 + crucible_pantry ad6a3a03-8d0f-4504-99a4-cbf73d69b973 install dataset in service fd00:1122:3344:102::25 + external_dns 6c3ae381-04f7-41ea-b0ac-74db387dbc3a install dataset in service fd00:1122:3344:102::24 + internal_dns 99e2f30b-3174-40bf-a78a-90da8abba8ca install dataset in service fd00:1122:3344:1::1 + internal_ntp 62620961-fc4a-481e-968b-f5acbac0dc63 install dataset in service fd00:1122:3344:102::21 + nexus 466a9f29-62bf-4e63-924a-b9efdb86afec install dataset in service fd00:1122:3344:102::22 + nexus a67ad53f-d551-40e7-abae-57664779b27b artifact: version 1.0.0 in service fd00:1122:3344:102::29 COCKROACHDB SETTINGS: @@ -3002,6 +3398,7 @@ to: blueprint 61a93ea3-c872-48e0-aace-e86b0c52b839 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -3023,6 +3420,8 @@ internal DNS: AAAA fd00:1122:3344:101::27 name: 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host (records: 1) AAAA fd00:1122:3344:101::22 + name: 26fbf986-e560-4449-a351-547d1721b90e.host (records: 1) + AAAA fd00:1122:3344:103::28 name: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled (records: 1) AAAA fd00:1122:3344:102::1 name: 353b3b65-20f7-48c3-88f7-495bd5d31545.host (records: 1) @@ -3031,6 +3430,8 @@ internal DNS: AAAA fd00:1122:3344:103::22 name: 427ec88f-f467-42fa-9bbb-66a91a36103c.host (records: 1) AAAA fd00:1122:3344:2::1 + name: 43cbc3a6-e640-43f5-a9a2-f83eff427870.host (records: 1) + AAAA fd00:1122:3344:101::28 name: 466a9f29-62bf-4e63-924a-b9efdb86afec.host (records: 1) AAAA fd00:1122:3344:102::22 name: 5199c033-4cf9-4ab6-8ae7-566bd7606363.host (records: 1) @@ -3099,16 +3500,21 @@ internal DNS: SRV port 5353 427ec88f-f467-42fa-9bbb-66a91a36103c.host.control-plane.oxide.internal SRV port 5353 99e2f30b-3174-40bf-a78a-90da8abba8ca.host.control-plane.oxide.internal SRV port 5353 ea5b4030-b52f-44b2-8d70-45f15f987d01.host.control-plane.oxide.internal - name: _nexus._tcp (records: 3) + name: _nexus._tcp (records: 6) SRV port 12221 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host.control-plane.oxide.internal + SRV port 12221 26fbf986-e560-4449-a351-547d1721b90e.host.control-plane.oxide.internal SRV port 12221 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6.host.control-plane.oxide.internal + SRV port 12221 43cbc3a6-e640-43f5-a9a2-f83eff427870.host.control-plane.oxide.internal SRV port 12221 466a9f29-62bf-4e63-924a-b9efdb86afec.host.control-plane.oxide.internal + SRV port 12221 a67ad53f-d551-40e7-abae-57664779b27b.host.control-plane.oxide.internal name: _oximeter-reader._tcp (records: 1) SRV port 9000 353b3b65-20f7-48c3-88f7-495bd5d31545.host.control-plane.oxide.internal name: _repo-depot._tcp (records: 3) SRV port 12348 
2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled.control-plane.oxide.internal SRV port 12348 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6.sled.control-plane.oxide.internal SRV port 12348 d81c6a84-79b8-4958-ae41-ea46c9b19763.sled.control-plane.oxide.internal + name: a67ad53f-d551-40e7-abae-57664779b27b.host (records: 1) + AAAA fd00:1122:3344:102::29 name: ad6a3a03-8d0f-4504-99a4-cbf73d69b973.host (records: 1) AAAA fd00:1122:3344:102::25 name: ba4994a8-23f9-4b1a-a84f-a08d74591389.host (records: 1) @@ -3142,9 +3548,12 @@ external DNS: NS ns1.oxide.example NS ns2.oxide.example NS ns3.oxide.example - name: example-silo.sys (records: 3) + name: example-silo.sys (records: 6) A 192.0.2.2 + A 192.0.2.7 A 192.0.2.3 + A 192.0.2.6 + A 192.0.2.5 A 192.0.2.4 name: ns1 (records: 1) A 198.51.100.1 @@ -3170,9 +3579,9 @@ set sled 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c host phase 1 details: B -> fffffff generated inventory collection 62898097-2ff1-48d0-8bc1-91b475daa33d from configured sleds > blueprint-plan latest latest -INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 -INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 -INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 10, num_already_artifact: 1, num_eligible: 0, num_ineligible: 9 +INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 INFO MGS-driven update impossible (will remove it and re-evaluate board), artifact_version: 1.0.0, artifact_hash: 2053f8594971bbf0a7326c833e2ffc12b065b9d823b9c0b967d275fa595e4e89, sled_agent_address: [fd00:1122:3344:102::1]:12345, expected_inactive_phase_2_hash: f3dd0c7a1bd4500ea0d8bcf67581f576d47752b2f1998a4cb0f0c3155c483008, expected_inactive_phase_1_hash: 0202020202020202020202020202020202020202020202020202020202020202, expected_active_phase_2_hash: 0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a, expected_active_phase_1_hash: 0101010101010101010101010101010101010101010101010101010101010101, expected_boot_disk: A, expected_active_phase_1_slot: A, component: host_phase_1, sp_slot: 1, sp_type: Sled, serial_number: serial1, part_number: model1 INFO configuring MGS-driven update, artifact_version: 1.0.0, artifact_hash: 2053f8594971bbf0a7326c833e2ffc12b065b9d823b9c0b967d275fa595e4e89, sled_agent_address: [fd00:1122:3344:102::1]:12345, expected_inactive_phase_2_hash: f3dd0c7a1bd4500ea0d8bcf67581f576d47752b2f1998a4cb0f0c3155c483008, expected_inactive_phase_1_hash: ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff, expected_active_phase_2_hash: 0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a, expected_active_phase_1_hash: 0101010101010101010101010101010101010101010101010101010101010101, expected_boot_disk: A, expected_active_phase_1_slot: A, component: host_phase_1, sp_slot: 1, sp_type: Sled, serial_number: serial1, part_number: model1 INFO reached maximum 
number of pending MGS-driven updates, max: 1 @@ -3184,6 +3593,7 @@ chicken switches: * 1 pending MGS update: * model1:serial1: HostPhase1(PendingMgsUpdateHostPhase1Details { expected_active_phase_1_slot: A, expected_boot_disk: A, expected_active_phase_1_hash: ArtifactHash("0101010101010101010101010101010101010101010101010101010101010101"), expected_active_phase_2_hash: ArtifactHash("0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a"), expected_inactive_phase_1_hash: ArtifactHash("ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"), expected_inactive_phase_2_hash: ArtifactHash("f3dd0c7a1bd4500ea0d8bcf67581f576d47752b2f1998a4cb0f0c3155c483008"), sled_agent_address: [fd00:1122:3344:102::1]:12345 }) * zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) +* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -3198,6 +3608,7 @@ to: blueprint 27e755bc-dc10-4647-853c-f89bb3a15a2c internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -3219,6 +3630,8 @@ internal DNS: AAAA fd00:1122:3344:101::27 name: 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host (records: 1) AAAA fd00:1122:3344:101::22 + name: 26fbf986-e560-4449-a351-547d1721b90e.host (records: 1) + AAAA fd00:1122:3344:103::28 name: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled (records: 1) AAAA fd00:1122:3344:102::1 name: 353b3b65-20f7-48c3-88f7-495bd5d31545.host (records: 1) @@ -3227,6 +3640,8 @@ internal DNS: AAAA fd00:1122:3344:103::22 name: 427ec88f-f467-42fa-9bbb-66a91a36103c.host (records: 1) AAAA fd00:1122:3344:2::1 + name: 43cbc3a6-e640-43f5-a9a2-f83eff427870.host (records: 1) + AAAA fd00:1122:3344:101::28 name: 466a9f29-62bf-4e63-924a-b9efdb86afec.host (records: 1) AAAA fd00:1122:3344:102::22 name: 5199c033-4cf9-4ab6-8ae7-566bd7606363.host (records: 1) @@ -3295,16 +3710,21 @@ internal DNS: SRV port 5353 427ec88f-f467-42fa-9bbb-66a91a36103c.host.control-plane.oxide.internal SRV port 5353 99e2f30b-3174-40bf-a78a-90da8abba8ca.host.control-plane.oxide.internal SRV port 5353 ea5b4030-b52f-44b2-8d70-45f15f987d01.host.control-plane.oxide.internal - name: _nexus._tcp (records: 3) + name: _nexus._tcp (records: 6) SRV port 12221 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host.control-plane.oxide.internal + SRV port 12221 26fbf986-e560-4449-a351-547d1721b90e.host.control-plane.oxide.internal SRV port 12221 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6.host.control-plane.oxide.internal + SRV port 12221 43cbc3a6-e640-43f5-a9a2-f83eff427870.host.control-plane.oxide.internal SRV port 12221 466a9f29-62bf-4e63-924a-b9efdb86afec.host.control-plane.oxide.internal + SRV port 12221 a67ad53f-d551-40e7-abae-57664779b27b.host.control-plane.oxide.internal name: _oximeter-reader._tcp (records: 1) SRV port 9000 353b3b65-20f7-48c3-88f7-495bd5d31545.host.control-plane.oxide.internal name: _repo-depot._tcp (records: 3) SRV port 12348 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled.control-plane.oxide.internal SRV port 12348 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6.sled.control-plane.oxide.internal SRV port 12348 d81c6a84-79b8-4958-ae41-ea46c9b19763.sled.control-plane.oxide.internal + name: a67ad53f-d551-40e7-abae-57664779b27b.host (records: 1) + AAAA fd00:1122:3344:102::29 name: ad6a3a03-8d0f-4504-99a4-cbf73d69b973.host (records: 1) AAAA fd00:1122:3344:102::25 name: ba4994a8-23f9-4b1a-a84f-a08d74591389.host (records: 1) @@ -3338,9 +3758,12 @@ external DNS: 
NS ns1.oxide.example NS ns2.oxide.example NS ns3.oxide.example - name: example-silo.sys (records: 3) + name: example-silo.sys (records: 6) A 192.0.2.2 + A 192.0.2.7 A 192.0.2.3 + A 192.0.2.6 + A 192.0.2.5 A 192.0.2.4 name: ns1 (records: 1) A 198.51.100.1 @@ -3366,9 +3789,9 @@ generated inventory collection 3086f142-62d3-4f77-bda3-674afbb42d0d from configu > # Another planning step should try to update the last sled, starting with the > # RoT bootloader. > blueprint-plan latest latest -INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 -INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 -INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 10, num_already_artifact: 1, num_eligible: 0, num_ineligible: 9 +INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 INFO MGS-driven update completed (will remove it and re-evaluate board), artifact_version: 1.0.0, artifact_hash: 2053f8594971bbf0a7326c833e2ffc12b065b9d823b9c0b967d275fa595e4e89, sled_agent_address: [fd00:1122:3344:102::1]:12345, expected_inactive_phase_2_hash: f3dd0c7a1bd4500ea0d8bcf67581f576d47752b2f1998a4cb0f0c3155c483008, expected_inactive_phase_1_hash: ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff, expected_active_phase_2_hash: 0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a, expected_active_phase_1_hash: 0101010101010101010101010101010101010101010101010101010101010101, expected_boot_disk: A, expected_active_phase_1_slot: A, component: host_phase_1, sp_slot: 1, sp_type: Sled, serial_number: serial1, part_number: model1 INFO skipping board for MGS-driven update, serial_number: serial1, part_number: model1 INFO skipping board for MGS-driven update, serial_number: serial0, part_number: model0 @@ -3382,6 +3805,7 @@ chicken switches: * 1 pending MGS update: * model2:serial2: RotBootloader(PendingMgsUpdateRotBootloaderDetails { expected_stage0_version: ArtifactVersion("0.0.1"), expected_stage0_next_version: NoValidVersion }) * zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) 
+* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -3396,6 +3820,7 @@ to: blueprint 9f89efdf-a23e-4137-b7cc-79f4a91cbe1f internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -3417,6 +3842,8 @@ internal DNS: AAAA fd00:1122:3344:101::27 name: 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host (records: 1) AAAA fd00:1122:3344:101::22 + name: 26fbf986-e560-4449-a351-547d1721b90e.host (records: 1) + AAAA fd00:1122:3344:103::28 name: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled (records: 1) AAAA fd00:1122:3344:102::1 name: 353b3b65-20f7-48c3-88f7-495bd5d31545.host (records: 1) @@ -3425,6 +3852,8 @@ internal DNS: AAAA fd00:1122:3344:103::22 name: 427ec88f-f467-42fa-9bbb-66a91a36103c.host (records: 1) AAAA fd00:1122:3344:2::1 + name: 43cbc3a6-e640-43f5-a9a2-f83eff427870.host (records: 1) + AAAA fd00:1122:3344:101::28 name: 466a9f29-62bf-4e63-924a-b9efdb86afec.host (records: 1) AAAA fd00:1122:3344:102::22 name: 5199c033-4cf9-4ab6-8ae7-566bd7606363.host (records: 1) @@ -3493,16 +3922,21 @@ internal DNS: SRV port 5353 427ec88f-f467-42fa-9bbb-66a91a36103c.host.control-plane.oxide.internal SRV port 5353 99e2f30b-3174-40bf-a78a-90da8abba8ca.host.control-plane.oxide.internal SRV port 5353 ea5b4030-b52f-44b2-8d70-45f15f987d01.host.control-plane.oxide.internal - name: _nexus._tcp (records: 3) + name: _nexus._tcp (records: 6) SRV port 12221 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host.control-plane.oxide.internal + SRV port 12221 26fbf986-e560-4449-a351-547d1721b90e.host.control-plane.oxide.internal SRV port 12221 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6.host.control-plane.oxide.internal + SRV port 12221 43cbc3a6-e640-43f5-a9a2-f83eff427870.host.control-plane.oxide.internal SRV port 12221 466a9f29-62bf-4e63-924a-b9efdb86afec.host.control-plane.oxide.internal + SRV port 12221 a67ad53f-d551-40e7-abae-57664779b27b.host.control-plane.oxide.internal name: _oximeter-reader._tcp (records: 1) SRV port 9000 353b3b65-20f7-48c3-88f7-495bd5d31545.host.control-plane.oxide.internal name: _repo-depot._tcp (records: 3) SRV port 12348 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled.control-plane.oxide.internal SRV port 12348 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6.sled.control-plane.oxide.internal SRV port 12348 d81c6a84-79b8-4958-ae41-ea46c9b19763.sled.control-plane.oxide.internal + name: a67ad53f-d551-40e7-abae-57664779b27b.host (records: 1) + AAAA fd00:1122:3344:102::29 name: ad6a3a03-8d0f-4504-99a4-cbf73d69b973.host (records: 1) AAAA fd00:1122:3344:102::25 name: ba4994a8-23f9-4b1a-a84f-a08d74591389.host (records: 1) @@ -3536,9 +3970,12 @@ external DNS: NS ns1.oxide.example NS ns2.oxide.example NS ns3.oxide.example - name: example-silo.sys (records: 3) + name: example-silo.sys (records: 6) A 192.0.2.2 + A 192.0.2.7 A 192.0.2.3 + A 192.0.2.6 + A 192.0.2.5 A 192.0.2.4 name: ns1 (records: 1) A 198.51.100.1 @@ -3559,9 +3996,9 @@ set sled d81c6a84-79b8-4958-ae41-ea46c9b19763 RoT bootloader versions: stage0 -> generated inventory collection ae5b3bb4-ce21-465f-b18e-857614732d66 from configured sleds > blueprint-plan latest latest -INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 -INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, 
num_ineligible: 8 -INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 10, num_already_artifact: 1, num_eligible: 0, num_ineligible: 9 +INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 INFO MGS-driven update completed (will remove it and re-evaluate board), artifact_version: 1.0.0, artifact_hash: 005ea358f1cd316df42465b1e3a0334ea22cc0c0442cf9ddf9b42fbf49780236, expected_stage0_next_version: NoValidVersion, expected_stage0_version: 0.0.1, component: rot_bootloader, sp_slot: 2, sp_type: Sled, serial_number: serial2, part_number: model2 INFO configuring MGS-driven update, artifact_version: 1.0.0, artifact_hash: 04e4a7fdb84acca92c8fd3235e26d64ea61bef8a5f98202589fd346989c5720a, expected_transient_boot_preference: None, expected_pending_persistent_boot_preference: None, expected_persistent_boot_preference: A, expected_active_slot: ExpectedActiveRotSlot { slot: A, version: ArtifactVersion("0.0.2") }, expected_inactive_version: NoValidVersion, component: rot, sp_slot: 2, sp_type: Sled, serial_number: serial2, part_number: model2 INFO reached maximum number of pending MGS-driven updates, max: 1 @@ -3573,6 +4010,7 @@ chicken switches: * 1 pending MGS update: * model2:serial2: Rot(PendingMgsUpdateRotDetails { expected_active_slot: ExpectedActiveRotSlot { slot: A, version: ArtifactVersion("0.0.2") }, expected_inactive_version: NoValidVersion, expected_persistent_boot_preference: A, expected_pending_persistent_boot_preference: None, expected_transient_boot_preference: None }) * zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) 
+* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -3587,6 +4025,7 @@ to: blueprint 9a9e6c32-5a84-4020-a159-33dceff18d35 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -3608,6 +4047,8 @@ internal DNS: AAAA fd00:1122:3344:101::27 name: 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host (records: 1) AAAA fd00:1122:3344:101::22 + name: 26fbf986-e560-4449-a351-547d1721b90e.host (records: 1) + AAAA fd00:1122:3344:103::28 name: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled (records: 1) AAAA fd00:1122:3344:102::1 name: 353b3b65-20f7-48c3-88f7-495bd5d31545.host (records: 1) @@ -3616,6 +4057,8 @@ internal DNS: AAAA fd00:1122:3344:103::22 name: 427ec88f-f467-42fa-9bbb-66a91a36103c.host (records: 1) AAAA fd00:1122:3344:2::1 + name: 43cbc3a6-e640-43f5-a9a2-f83eff427870.host (records: 1) + AAAA fd00:1122:3344:101::28 name: 466a9f29-62bf-4e63-924a-b9efdb86afec.host (records: 1) AAAA fd00:1122:3344:102::22 name: 5199c033-4cf9-4ab6-8ae7-566bd7606363.host (records: 1) @@ -3684,16 +4127,21 @@ internal DNS: SRV port 5353 427ec88f-f467-42fa-9bbb-66a91a36103c.host.control-plane.oxide.internal SRV port 5353 99e2f30b-3174-40bf-a78a-90da8abba8ca.host.control-plane.oxide.internal SRV port 5353 ea5b4030-b52f-44b2-8d70-45f15f987d01.host.control-plane.oxide.internal - name: _nexus._tcp (records: 3) + name: _nexus._tcp (records: 6) SRV port 12221 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host.control-plane.oxide.internal + SRV port 12221 26fbf986-e560-4449-a351-547d1721b90e.host.control-plane.oxide.internal SRV port 12221 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6.host.control-plane.oxide.internal + SRV port 12221 43cbc3a6-e640-43f5-a9a2-f83eff427870.host.control-plane.oxide.internal SRV port 12221 466a9f29-62bf-4e63-924a-b9efdb86afec.host.control-plane.oxide.internal + SRV port 12221 a67ad53f-d551-40e7-abae-57664779b27b.host.control-plane.oxide.internal name: _oximeter-reader._tcp (records: 1) SRV port 9000 353b3b65-20f7-48c3-88f7-495bd5d31545.host.control-plane.oxide.internal name: _repo-depot._tcp (records: 3) SRV port 12348 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled.control-plane.oxide.internal SRV port 12348 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6.sled.control-plane.oxide.internal SRV port 12348 d81c6a84-79b8-4958-ae41-ea46c9b19763.sled.control-plane.oxide.internal + name: a67ad53f-d551-40e7-abae-57664779b27b.host (records: 1) + AAAA fd00:1122:3344:102::29 name: ad6a3a03-8d0f-4504-99a4-cbf73d69b973.host (records: 1) AAAA fd00:1122:3344:102::25 name: ba4994a8-23f9-4b1a-a84f-a08d74591389.host (records: 1) @@ -3727,9 +4175,12 @@ external DNS: NS ns1.oxide.example NS ns2.oxide.example NS ns3.oxide.example - name: example-silo.sys (records: 3) + name: example-silo.sys (records: 6) A 192.0.2.2 + A 192.0.2.7 A 192.0.2.3 + A 192.0.2.6 + A 192.0.2.5 A 192.0.2.4 name: ns1 (records: 1) A 198.51.100.1 @@ -3750,9 +4201,9 @@ set sled d81c6a84-79b8-4958-ae41-ea46c9b19763 RoT settings: slot a -> 1.0.0 generated inventory collection 34c3258c-b2ab-4da9-9720-41a3a703c3d7 from configured sleds > blueprint-plan latest latest -INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 -INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, 
num_ineligible: 8 -INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 10, num_already_artifact: 1, num_eligible: 0, num_ineligible: 9 +INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 INFO MGS-driven update completed (will remove it and re-evaluate board), artifact_version: 1.0.0, artifact_hash: 04e4a7fdb84acca92c8fd3235e26d64ea61bef8a5f98202589fd346989c5720a, expected_transient_boot_preference: None, expected_pending_persistent_boot_preference: None, expected_persistent_boot_preference: A, expected_active_slot: ExpectedActiveRotSlot { slot: A, version: ArtifactVersion("0.0.2") }, expected_inactive_version: NoValidVersion, component: rot, sp_slot: 2, sp_type: Sled, serial_number: serial2, part_number: model2 INFO configuring MGS-driven update, artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: NoValidVersion, expected_active_version: 0.0.1, component: sp, sp_slot: 2, sp_type: Sled, serial_number: serial2, part_number: model2 INFO reached maximum number of pending MGS-driven updates, max: 1 @@ -3764,6 +4215,7 @@ chicken switches: * 1 pending MGS update: * model2:serial2: Sp(PendingMgsUpdateSpDetails { expected_active_version: ArtifactVersion("0.0.1"), expected_inactive_version: NoValidVersion }) * zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) 
+* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -3778,6 +4230,7 @@ to: blueprint 13cfdd24-52ba-4e94-8c83-02e3a48fc746 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -3799,6 +4252,8 @@ internal DNS: AAAA fd00:1122:3344:101::27 name: 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host (records: 1) AAAA fd00:1122:3344:101::22 + name: 26fbf986-e560-4449-a351-547d1721b90e.host (records: 1) + AAAA fd00:1122:3344:103::28 name: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled (records: 1) AAAA fd00:1122:3344:102::1 name: 353b3b65-20f7-48c3-88f7-495bd5d31545.host (records: 1) @@ -3807,6 +4262,8 @@ internal DNS: AAAA fd00:1122:3344:103::22 name: 427ec88f-f467-42fa-9bbb-66a91a36103c.host (records: 1) AAAA fd00:1122:3344:2::1 + name: 43cbc3a6-e640-43f5-a9a2-f83eff427870.host (records: 1) + AAAA fd00:1122:3344:101::28 name: 466a9f29-62bf-4e63-924a-b9efdb86afec.host (records: 1) AAAA fd00:1122:3344:102::22 name: 5199c033-4cf9-4ab6-8ae7-566bd7606363.host (records: 1) @@ -3875,16 +4332,21 @@ internal DNS: SRV port 5353 427ec88f-f467-42fa-9bbb-66a91a36103c.host.control-plane.oxide.internal SRV port 5353 99e2f30b-3174-40bf-a78a-90da8abba8ca.host.control-plane.oxide.internal SRV port 5353 ea5b4030-b52f-44b2-8d70-45f15f987d01.host.control-plane.oxide.internal - name: _nexus._tcp (records: 3) + name: _nexus._tcp (records: 6) SRV port 12221 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host.control-plane.oxide.internal + SRV port 12221 26fbf986-e560-4449-a351-547d1721b90e.host.control-plane.oxide.internal SRV port 12221 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6.host.control-plane.oxide.internal + SRV port 12221 43cbc3a6-e640-43f5-a9a2-f83eff427870.host.control-plane.oxide.internal SRV port 12221 466a9f29-62bf-4e63-924a-b9efdb86afec.host.control-plane.oxide.internal + SRV port 12221 a67ad53f-d551-40e7-abae-57664779b27b.host.control-plane.oxide.internal name: _oximeter-reader._tcp (records: 1) SRV port 9000 353b3b65-20f7-48c3-88f7-495bd5d31545.host.control-plane.oxide.internal name: _repo-depot._tcp (records: 3) SRV port 12348 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled.control-plane.oxide.internal SRV port 12348 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6.sled.control-plane.oxide.internal SRV port 12348 d81c6a84-79b8-4958-ae41-ea46c9b19763.sled.control-plane.oxide.internal + name: a67ad53f-d551-40e7-abae-57664779b27b.host (records: 1) + AAAA fd00:1122:3344:102::29 name: ad6a3a03-8d0f-4504-99a4-cbf73d69b973.host (records: 1) AAAA fd00:1122:3344:102::25 name: ba4994a8-23f9-4b1a-a84f-a08d74591389.host (records: 1) @@ -3918,9 +4380,12 @@ external DNS: NS ns1.oxide.example NS ns2.oxide.example NS ns3.oxide.example - name: example-silo.sys (records: 3) + name: example-silo.sys (records: 6) A 192.0.2.2 + A 192.0.2.7 A 192.0.2.3 + A 192.0.2.6 + A 192.0.2.5 A 192.0.2.4 name: ns1 (records: 1) A 198.51.100.1 @@ -3941,9 +4406,9 @@ set sled d81c6a84-79b8-4958-ae41-ea46c9b19763 SP versions: active -> 1.0.0 generated inventory collection 5e106b73-6a14-4955-b8a8-a4f8afed6405 from configured sleds > blueprint-plan latest latest -INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 -INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, 
num_ineligible: 8 -INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 10, num_already_artifact: 1, num_eligible: 0, num_ineligible: 9 +INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 INFO MGS-driven update completed (will remove it and re-evaluate board), artifact_version: 1.0.0, artifact_hash: 7e6667e646ad001b54c8365a3d309c03f89c59102723d38d01697ee8079fe670, expected_inactive_version: NoValidVersion, expected_active_version: 0.0.1, component: sp, sp_slot: 2, sp_type: Sled, serial_number: serial2, part_number: model2 INFO configuring MGS-driven update, artifact_version: 1.0.0, artifact_hash: 2053f8594971bbf0a7326c833e2ffc12b065b9d823b9c0b967d275fa595e4e89, sled_agent_address: [fd00:1122:3344:103::1]:12345, expected_inactive_phase_2_hash: f3dd0c7a1bd4500ea0d8bcf67581f576d47752b2f1998a4cb0f0c3155c483008, expected_inactive_phase_1_hash: 0202020202020202020202020202020202020202020202020202020202020202, expected_active_phase_2_hash: 0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a, expected_active_phase_1_hash: 0101010101010101010101010101010101010101010101010101010101010101, expected_boot_disk: A, expected_active_phase_1_slot: A, component: host_phase_1, sp_slot: 2, sp_type: Sled, serial_number: serial2, part_number: model2 INFO reached maximum number of pending MGS-driven updates, max: 1 @@ -3955,6 +4420,7 @@ chicken switches: * 1 pending MGS update: * model2:serial2: HostPhase1(PendingMgsUpdateHostPhase1Details { expected_active_phase_1_slot: A, expected_boot_disk: A, expected_active_phase_1_hash: ArtifactHash("0101010101010101010101010101010101010101010101010101010101010101"), expected_active_phase_2_hash: ArtifactHash("0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a"), expected_inactive_phase_1_hash: ArtifactHash("0202020202020202020202020202020202020202020202020202020202020202"), expected_inactive_phase_2_hash: ArtifactHash("f3dd0c7a1bd4500ea0d8bcf67581f576d47752b2f1998a4cb0f0c3155c483008"), sled_agent_address: [fd00:1122:3344:103::1]:12345 }) * zone updates waiting on pending MGS updates (RoT / SP / Host OS / etc.) 
+* waiting to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest @@ -3963,7 +4429,7 @@ to: blueprint b82656b0-a9be-433d-83d0-e2bdf371777a MODIFIED SLEDS: - sled d81c6a84-79b8-4958-ae41-ea46c9b19763 (active, config generation 2 -> 3): + sled d81c6a84-79b8-4958-ae41-ea46c9b19763 (active, config generation 3 -> 4): host phase 2 contents: -------------------------------- @@ -4001,6 +4467,7 @@ to: blueprint b82656b0-a9be-433d-83d0-e2bdf371777a oxp_18b20749-0748-4105-bb10-7b13cfc776e2/crypt/zone/oxz_crucible_pantry_75b220ba-a0f4-4872-8202-dc7c87f062d0 54bbadaf-ec04-41a2-a62f-f5ac5bf321be in service none none off oxp_18b20749-0748-4105-bb10-7b13cfc776e2/crypt/zone/oxz_external_dns_f6ec9c67-946a-4da3-98d5-581f72ce8bf0 090bd88d-0a43-4040-a832-b13ae721f74f in service none none off oxp_18b20749-0748-4105-bb10-7b13cfc776e2/crypt/zone/oxz_internal_dns_ea5b4030-b52f-44b2-8d70-45f15f987d01 b1deff4b-51df-4a37-9043-afbd7c70a1cb in service none none off + oxp_30c16fe4-4229-49d0-ab01-3138f2c7dff2/crypt/zone/oxz_nexus_26fbf986-e560-4449-a351-547d1721b90e e0b86bc5-6a64-432b-bcbe-482e228a4e7d in service none none off oxp_18b20749-0748-4105-bb10-7b13cfc776e2/crypt/zone/oxz_nexus_3eeb8d49-eb1a-43f8-bb64-c2338421c2c6 4da74a5b-6911-4cca-b624-b90c65530117 in service none none off oxp_18b20749-0748-4105-bb10-7b13cfc776e2/crypt/zone/oxz_ntp_f10a4fb9-759f-4a65-b25e-5794ad2d07d8 c65a9c1c-36dc-4ddb-8aac-ec3be8dbb209 in service none none off oxp_18b20749-0748-4105-bb10-7b13cfc776e2/crypt/debug 7a6a2058-ea78-49de-9730-cce5e28b4cfb in service 100 GiB none gzip-9 @@ -4009,17 +4476,18 @@ to: blueprint b82656b0-a9be-433d-83d0-e2bdf371777a omicron zones: - --------------------------------------------------------------------------------------------------------------- - zone type zone id image source disposition underlay IP - --------------------------------------------------------------------------------------------------------------- - crucible 694bd14f-cb24-4be4-bb19-876e79cda2c8 install dataset in service fd00:1122:3344:103::26 - crucible 7c252b64-c5af-4ec1-989e-9a03f3b0f111 install dataset in service fd00:1122:3344:103::27 - crucible f55647d4-5500-4ad3-893a-df45bd50d622 install dataset in service fd00:1122:3344:103::25 - crucible_pantry 75b220ba-a0f4-4872-8202-dc7c87f062d0 install dataset in service fd00:1122:3344:103::24 - external_dns f6ec9c67-946a-4da3-98d5-581f72ce8bf0 install dataset in service fd00:1122:3344:103::23 - internal_dns ea5b4030-b52f-44b2-8d70-45f15f987d01 install dataset in service fd00:1122:3344:3::1 - internal_ntp f10a4fb9-759f-4a65-b25e-5794ad2d07d8 install dataset in service fd00:1122:3344:103::21 - nexus 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6 install dataset in service fd00:1122:3344:103::22 + ----------------------------------------------------------------------------------------------------------------------- + zone type zone id image source disposition underlay IP + ----------------------------------------------------------------------------------------------------------------------- + crucible 694bd14f-cb24-4be4-bb19-876e79cda2c8 install dataset in service fd00:1122:3344:103::26 + crucible 7c252b64-c5af-4ec1-989e-9a03f3b0f111 install dataset in service fd00:1122:3344:103::27 + crucible f55647d4-5500-4ad3-893a-df45bd50d622 install dataset in service fd00:1122:3344:103::25 + crucible_pantry 75b220ba-a0f4-4872-8202-dc7c87f062d0 install dataset in service fd00:1122:3344:103::24 + external_dns f6ec9c67-946a-4da3-98d5-581f72ce8bf0 install dataset in service 
fd00:1122:3344:103::23 + internal_dns ea5b4030-b52f-44b2-8d70-45f15f987d01 install dataset in service fd00:1122:3344:3::1 + internal_ntp f10a4fb9-759f-4a65-b25e-5794ad2d07d8 install dataset in service fd00:1122:3344:103::21 + nexus 26fbf986-e560-4449-a351-547d1721b90e artifact: version 1.0.0 in service fd00:1122:3344:103::28 + nexus 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6 install dataset in service fd00:1122:3344:103::22 COCKROACHDB SETTINGS: @@ -4030,6 +4498,7 @@ to: blueprint b82656b0-a9be-433d-83d0-e2bdf371777a internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -4051,6 +4520,8 @@ internal DNS: AAAA fd00:1122:3344:101::27 name: 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host (records: 1) AAAA fd00:1122:3344:101::22 + name: 26fbf986-e560-4449-a351-547d1721b90e.host (records: 1) + AAAA fd00:1122:3344:103::28 name: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled (records: 1) AAAA fd00:1122:3344:102::1 name: 353b3b65-20f7-48c3-88f7-495bd5d31545.host (records: 1) @@ -4059,6 +4530,8 @@ internal DNS: AAAA fd00:1122:3344:103::22 name: 427ec88f-f467-42fa-9bbb-66a91a36103c.host (records: 1) AAAA fd00:1122:3344:2::1 + name: 43cbc3a6-e640-43f5-a9a2-f83eff427870.host (records: 1) + AAAA fd00:1122:3344:101::28 name: 466a9f29-62bf-4e63-924a-b9efdb86afec.host (records: 1) AAAA fd00:1122:3344:102::22 name: 5199c033-4cf9-4ab6-8ae7-566bd7606363.host (records: 1) @@ -4127,16 +4600,21 @@ internal DNS: SRV port 5353 427ec88f-f467-42fa-9bbb-66a91a36103c.host.control-plane.oxide.internal SRV port 5353 99e2f30b-3174-40bf-a78a-90da8abba8ca.host.control-plane.oxide.internal SRV port 5353 ea5b4030-b52f-44b2-8d70-45f15f987d01.host.control-plane.oxide.internal - name: _nexus._tcp (records: 3) + name: _nexus._tcp (records: 6) SRV port 12221 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host.control-plane.oxide.internal + SRV port 12221 26fbf986-e560-4449-a351-547d1721b90e.host.control-plane.oxide.internal SRV port 12221 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6.host.control-plane.oxide.internal + SRV port 12221 43cbc3a6-e640-43f5-a9a2-f83eff427870.host.control-plane.oxide.internal SRV port 12221 466a9f29-62bf-4e63-924a-b9efdb86afec.host.control-plane.oxide.internal + SRV port 12221 a67ad53f-d551-40e7-abae-57664779b27b.host.control-plane.oxide.internal name: _oximeter-reader._tcp (records: 1) SRV port 9000 353b3b65-20f7-48c3-88f7-495bd5d31545.host.control-plane.oxide.internal name: _repo-depot._tcp (records: 3) SRV port 12348 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled.control-plane.oxide.internal SRV port 12348 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6.sled.control-plane.oxide.internal SRV port 12348 d81c6a84-79b8-4958-ae41-ea46c9b19763.sled.control-plane.oxide.internal + name: a67ad53f-d551-40e7-abae-57664779b27b.host (records: 1) + AAAA fd00:1122:3344:102::29 name: ad6a3a03-8d0f-4504-99a4-cbf73d69b973.host (records: 1) AAAA fd00:1122:3344:102::25 name: ba4994a8-23f9-4b1a-a84f-a08d74591389.host (records: 1) @@ -4170,9 +4648,12 @@ external DNS: NS ns1.oxide.example NS ns2.oxide.example NS ns3.oxide.example - name: example-silo.sys (records: 3) + name: example-silo.sys (records: 6) A 192.0.2.2 + A 192.0.2.7 A 192.0.2.3 + A 192.0.2.6 + A 192.0.2.5 A 192.0.2.4 name: ns1 (records: 1) A 198.51.100.1 @@ -4197,92 +4678,27 @@ generated inventory collection 36ef425f-a672-4bf4-8d29-14815a84ccad from configu > # Do one more planning run. This should update one control plane zone. 
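The report line "* waiting to update top-level nexus_generation: pending non-nexus zone updates" in the planning output above reflects the new gate this patch adds to the planner: the blueprint-level nexus_generation is only advanced once every non-Nexus zone is already running the target release. A minimal sketch of that gate follows, using only the BlueprintBuilder methods added by this patch (nexus_generation() and set_nexus_generation()); the helper name and its boolean parameter are illustrative assumptions, not code from the patch.

    fn maybe_bump_top_level_nexus_generation(
        builder: &mut BlueprintBuilder<'_>,
        non_nexus_zones_up_to_date: bool,
    ) -> Result<(), Error> {
        if !non_nexus_zones_up_to_date {
            // Hold the top-level generation steady while non-Nexus zones are
            // still being updated (the "waiting to update" report line above).
            return Ok(());
        }
        // Advance the generation; set_nexus_generation() fails with
        // Error::NexusGenerationMismatch if `current` is stale.
        let current = builder.nexus_generation();
        builder.set_nexus_generation(current, current.next())
    }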
> blueprint-plan latest latest -INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 9, num_already_artifact: 0, num_eligible: 0, num_ineligible: 9 -INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 -INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 8, num_already_artifact: 0, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, num_total: 10, num_already_artifact: 1, num_eligible: 0, num_ineligible: 9 +INFO performed noop image source checks on sled, sled_id: 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 +INFO performed noop image source checks on sled, sled_id: d81c6a84-79b8-4958-ae41-ea46c9b19763, num_total: 9, num_already_artifact: 1, num_eligible: 0, num_ineligible: 8 INFO MGS-driven update completed (will remove it and re-evaluate board), artifact_version: 1.0.0, artifact_hash: 2053f8594971bbf0a7326c833e2ffc12b065b9d823b9c0b967d275fa595e4e89, sled_agent_address: [fd00:1122:3344:103::1]:12345, expected_inactive_phase_2_hash: f3dd0c7a1bd4500ea0d8bcf67581f576d47752b2f1998a4cb0f0c3155c483008, expected_inactive_phase_1_hash: 0202020202020202020202020202020202020202020202020202020202020202, expected_active_phase_2_hash: 0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a, expected_active_phase_1_hash: 0101010101010101010101010101010101010101010101010101010101010101, expected_boot_disk: A, expected_active_phase_1_slot: A, component: host_phase_1, sp_slot: 2, sp_type: Sled, serial_number: serial2, part_number: model2 INFO skipping board for MGS-driven update, serial_number: serial2, part_number: model2 INFO skipping board for MGS-driven update, serial_number: serial0, part_number: model0 INFO skipping board for MGS-driven update, serial_number: serial1, part_number: model1 INFO ran out of boards for MGS-driven update +INFO some zones not yet up-to-date, zones_currently_updating: [ZoneCurrentlyUpdating { zone_id: a67ad53f-d551-40e7-abae-57664779b27b (service), zone_kind: Nexus, reason: MissingInInventory { bp_image_source: Artifact { version: Available { version: ArtifactVersion("1.0.0") }, hash: ArtifactHash("0e32b4a3e5d3668bb1d6a16fb06b74dc60b973fa479dcee0aae3adbb52bf1388") } } }, ZoneCurrentlyUpdating { zone_id: 43cbc3a6-e640-43f5-a9a2-f83eff427870 (service), zone_kind: Nexus, reason: MissingInInventory { bp_image_source: Artifact { version: Available { version: ArtifactVersion("1.0.0") }, hash: ArtifactHash("0e32b4a3e5d3668bb1d6a16fb06b74dc60b973fa479dcee0aae3adbb52bf1388") } } }, ZoneCurrentlyUpdating { zone_id: 26fbf986-e560-4449-a351-547d1721b90e (service), zone_kind: Nexus, reason: MissingInInventory { bp_image_source: Artifact { version: Available { version: ArtifactVersion("1.0.0") }, hash: ArtifactHash("0e32b4a3e5d3668bb1d6a16fb06b74dc60b973fa479dcee0aae3adbb52bf1388") } } }] generated blueprint 31c84831-be52-4630-bc3f-128d72cd8f22 based on parent blueprint b82656b0-a9be-433d-83d0-e2bdf371777a planning report for blueprint 31c84831-be52-4630-bc3f-128d72cd8f22: chicken switches: add zones with mupdate override: false -* 1 out-of-date zone updated in-place: - * sled 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c, zone 353b3b65-20f7-48c3-88f7-495bd5d31545 (clickhouse) -* 25 remaining out-of-date zones +* waiting 
to update top-level nexus_generation: pending non-nexus zone updates > blueprint-diff latest from: blueprint b82656b0-a9be-433d-83d0-e2bdf371777a to: blueprint 31c84831-be52-4630-bc3f-128d72cd8f22 - MODIFIED SLEDS: - - sled 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c (active, config generation 3 -> 4): - - host phase 2 contents: - ------------------------------ - slot boot image source - ------------------------------ - A current contents - B artifact: version 1.0.0 - - - physical disks: - ------------------------------------------------------------------------------------ - vendor model serial disposition - ------------------------------------------------------------------------------------ - fake-vendor fake-model serial-727522a7-934f-494d-b5b3-160968e74463 in service - fake-vendor fake-model serial-72c59873-31ff-4e36-8d76-ff834009349a in service - fake-vendor fake-model serial-b5fd5bc1-099e-4e77-8028-a9793c11f43b in service - - - datasets: - ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - dataset name dataset id disposition quota reservation compression - ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - oxp_727522a7-934f-494d-b5b3-160968e74463/crucible 2f204c50-a327-479c-8852-f53ec7a19c1f in service none none off - oxp_72c59873-31ff-4e36-8d76-ff834009349a/crucible 78f34ce7-42f1-41da-995f-318f32054ad2 in service none none off - oxp_b5fd5bc1-099e-4e77-8028-a9793c11f43b/crucible 1640adb6-70bf-44cf-b05c-bff6dd300cf3 in service none none off - oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/clickhouse 841d5648-05f0-47b0-b446-92f6b60fe9a6 in service none none off - oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/external_dns 8e0bd2bd-23b7-4bc6-9e73-c4d4ebc0bc8c in service none none off - oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/internal_dns 2ad1875a-92ac-472f-8c26-593309f0e4da in service none none off - oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/zone 4829f422-aa31-41a8-ab73-95684ff1ef48 in service none none off - oxp_72c59873-31ff-4e36-8d76-ff834009349a/crypt/zone 775f9207-c42d-4af2-9186-27ffef67735e in service none none off - oxp_b5fd5bc1-099e-4e77-8028-a9793c11f43b/crypt/zone 3b66453b-7148-4c1b-84a9-499e43290ab4 in service none none off - oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/zone/oxz_clickhouse_353b3b65-20f7-48c3-88f7-495bd5d31545 b46de15d-33e7-4cd0-aa7c-e7be2a61e71b in service none none off - oxp_b5fd5bc1-099e-4e77-8028-a9793c11f43b/crypt/zone/oxz_crucible_86a22a56-0168-453d-9df1-cb2a7c64b5d3 3e0d6188-c503-49cf-a441-fa7df40ceb43 in service none none off - oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/zone/oxz_crucible_bd354eef-d8a6-4165-9124-283fb5e46d77 5ae11c7e-08fa-4d78-a4ea-14b4a9a10241 in service none none off - oxp_72c59873-31ff-4e36-8d76-ff834009349a/crypt/zone/oxz_crucible_e2fdefe7-95b2-4fd2-ae37-56929a06d58c b8f2a09f-8bd2-4418-872b-a4457a3f958c in service none none off - oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/zone/oxz_crucible_pantry_ad6a3a03-8d0f-4504-99a4-cbf73d69b973 49f8fbb6-5bac-4609-907f-6e3dfc206059 in service none none off - oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/zone/oxz_external_dns_6c3ae381-04f7-41ea-b0ac-74db387dbc3a 8c4fa711-1d5d-4e93-85f0-d17bff47b063 in service none none off - 
oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/zone/oxz_internal_dns_99e2f30b-3174-40bf-a78a-90da8abba8ca c31623de-c19b-4615-9f1d-5e1daa5d3bda in service none none off - oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/zone/oxz_nexus_466a9f29-62bf-4e63-924a-b9efdb86afec 3560dd69-3b23-4c69-807d-d673104cfc68 in service none none off - oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/zone/oxz_ntp_62620961-fc4a-481e-968b-f5acbac0dc63 09b9cc9b-3426-470b-a7bc-538f82dede03 in service none none off - oxp_727522a7-934f-494d-b5b3-160968e74463/crypt/debug 93957ca0-9ed1-4e7b-8c34-2ce07a69541c in service 100 GiB none gzip-9 - oxp_72c59873-31ff-4e36-8d76-ff834009349a/crypt/debug 2db6b7c1-0f46-4ced-a3ad-48872793360e in service 100 GiB none gzip-9 - oxp_b5fd5bc1-099e-4e77-8028-a9793c11f43b/crypt/debug 318fae85-abcb-4259-b1b6-ac96d193f7b7 in service 100 GiB none gzip-9 - - - omicron zones: - ------------------------------------------------------------------------------------------------------------------------- - zone type zone id image source disposition underlay IP - ------------------------------------------------------------------------------------------------------------------------- - crucible 86a22a56-0168-453d-9df1-cb2a7c64b5d3 install dataset in service fd00:1122:3344:102::28 - crucible bd354eef-d8a6-4165-9124-283fb5e46d77 install dataset in service fd00:1122:3344:102::26 - crucible e2fdefe7-95b2-4fd2-ae37-56929a06d58c install dataset in service fd00:1122:3344:102::27 - crucible_pantry ad6a3a03-8d0f-4504-99a4-cbf73d69b973 install dataset in service fd00:1122:3344:102::25 - external_dns 6c3ae381-04f7-41ea-b0ac-74db387dbc3a install dataset in service fd00:1122:3344:102::24 - internal_dns 99e2f30b-3174-40bf-a78a-90da8abba8ca install dataset in service fd00:1122:3344:1::1 - internal_ntp 62620961-fc4a-481e-968b-f5acbac0dc63 install dataset in service fd00:1122:3344:102::21 - nexus 466a9f29-62bf-4e63-924a-b9efdb86afec install dataset in service fd00:1122:3344:102::22 -* clickhouse 353b3b65-20f7-48c3-88f7-495bd5d31545 - install dataset in service fd00:1122:3344:102::23 - └─ + artifact: version 1.0.0 - - COCKROACHDB SETTINGS: state fingerprint::::::::::::::::: (none) (unchanged) cluster.preserve_downgrade_option: (do not modify) (unchanged) @@ -4291,6 +4707,7 @@ to: blueprint 31c84831-be52-4630-bc3f-128d72cd8f22 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) @@ -4311,6 +4728,8 @@ internal DNS: AAAA fd00:1122:3344:101::27 name: 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host (records: 1) AAAA fd00:1122:3344:101::22 + name: 26fbf986-e560-4449-a351-547d1721b90e.host (records: 1) + AAAA fd00:1122:3344:103::28 name: 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled (records: 1) AAAA fd00:1122:3344:102::1 name: 353b3b65-20f7-48c3-88f7-495bd5d31545.host (records: 1) @@ -4319,6 +4738,8 @@ internal DNS: AAAA fd00:1122:3344:103::22 name: 427ec88f-f467-42fa-9bbb-66a91a36103c.host (records: 1) AAAA fd00:1122:3344:2::1 + name: 43cbc3a6-e640-43f5-a9a2-f83eff427870.host (records: 1) + AAAA fd00:1122:3344:101::28 name: 466a9f29-62bf-4e63-924a-b9efdb86afec.host (records: 1) AAAA fd00:1122:3344:102::22 name: 5199c033-4cf9-4ab6-8ae7-566bd7606363.host (records: 1) @@ -4387,16 +4808,21 @@ internal DNS: SRV port 5353 427ec88f-f467-42fa-9bbb-66a91a36103c.host.control-plane.oxide.internal SRV port 5353 99e2f30b-3174-40bf-a78a-90da8abba8ca.host.control-plane.oxide.internal SRV port 5353 
ea5b4030-b52f-44b2-8d70-45f15f987d01.host.control-plane.oxide.internal - name: _nexus._tcp (records: 3) + name: _nexus._tcp (records: 6) SRV port 12221 0c71b3b2-6ceb-4e8f-b020-b08675e83038.host.control-plane.oxide.internal + SRV port 12221 26fbf986-e560-4449-a351-547d1721b90e.host.control-plane.oxide.internal SRV port 12221 3eeb8d49-eb1a-43f8-bb64-c2338421c2c6.host.control-plane.oxide.internal + SRV port 12221 43cbc3a6-e640-43f5-a9a2-f83eff427870.host.control-plane.oxide.internal SRV port 12221 466a9f29-62bf-4e63-924a-b9efdb86afec.host.control-plane.oxide.internal + SRV port 12221 a67ad53f-d551-40e7-abae-57664779b27b.host.control-plane.oxide.internal name: _oximeter-reader._tcp (records: 1) SRV port 9000 353b3b65-20f7-48c3-88f7-495bd5d31545.host.control-plane.oxide.internal name: _repo-depot._tcp (records: 3) SRV port 12348 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c.sled.control-plane.oxide.internal SRV port 12348 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6.sled.control-plane.oxide.internal SRV port 12348 d81c6a84-79b8-4958-ae41-ea46c9b19763.sled.control-plane.oxide.internal + name: a67ad53f-d551-40e7-abae-57664779b27b.host (records: 1) + AAAA fd00:1122:3344:102::29 name: ad6a3a03-8d0f-4504-99a4-cbf73d69b973.host (records: 1) AAAA fd00:1122:3344:102::25 name: ba4994a8-23f9-4b1a-a84f-a08d74591389.host (records: 1) @@ -4430,9 +4856,12 @@ external DNS: NS ns1.oxide.example NS ns2.oxide.example NS ns3.oxide.example - name: example-silo.sys (records: 3) + name: example-silo.sys (records: 6) A 192.0.2.2 + A 192.0.2.7 A 192.0.2.3 + A 192.0.2.6 + A 192.0.2.5 A 192.0.2.4 name: ns1 (records: 1) A 198.51.100.1 diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs index 8116af7321d..4d61b0d57b1 100644 --- a/live-tests/tests/test_nexus_add_remove.rs +++ b/live-tests/tests/test_nexus_add_remove.rs @@ -53,10 +53,14 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { .map_or_else(PlannerChickenSwitches::default, |cs| { cs.switches.planner_switches }); - let planning_input = - PlanningInputFromDb::assemble(&opctx, &datastore, chicken_switches) - .await - .expect("planning input"); + let planning_input = PlanningInputFromDb::assemble( + &opctx, + &datastore, + chicken_switches, + None, + ) + .await + .expect("planning input"); let collection = datastore .inventory_get_latest_collection(opctx) .await @@ -271,10 +275,14 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { // Now run through the planner. 
info!(log, "running through planner"); - let planning_input = - PlanningInputFromDb::assemble(&opctx, &datastore, chicken_switches) - .await - .expect("planning input"); + let planning_input = PlanningInputFromDb::assemble( + &opctx, + &datastore, + chicken_switches, + None, + ) + .await + .expect("planning input"); let (_, parent_blueprint) = datastore .blueprint_target_get_current_full(opctx) .await diff --git a/nexus/db-model/src/deployment.rs b/nexus/db-model/src/deployment.rs index e3e7cd50ddf..5cd9ca9e500 100644 --- a/nexus/db-model/src/deployment.rs +++ b/nexus/db-model/src/deployment.rs @@ -81,6 +81,7 @@ pub struct Blueprint { pub creator: String, pub comment: String, pub target_release_minimum_generation: Generation, + pub nexus_generation: Generation, } impl From<&'_ nexus_types::deployment::Blueprint> for Blueprint { @@ -100,6 +101,7 @@ impl From<&'_ nexus_types::deployment::Blueprint> for Blueprint { target_release_minimum_generation: Generation( bp.target_release_minimum_generation, ), + nexus_generation: Generation(bp.nexus_generation), } } } @@ -113,6 +115,7 @@ impl From for nexus_types::deployment::BlueprintMetadata { external_dns_version: *value.external_dns_version, target_release_minimum_generation: *value .target_release_minimum_generation, + nexus_generation: *value.nexus_generation, cockroachdb_fingerprint: value.cockroachdb_fingerprint, cockroachdb_setting_preserve_downgrade: CockroachDbPreserveDowngrade::from_optional_string( @@ -524,6 +527,7 @@ pub struct BpOmicronZone { pub image_source: DbBpZoneImageSource, pub image_artifact_sha256: Option, + pub nexus_generation: Option, } impl BpOmicronZone { @@ -585,6 +589,7 @@ impl BpOmicronZone { snat_ip: None, snat_first_port: None, snat_last_port: None, + nexus_generation: None, }; match &blueprint_zone.zone_type { @@ -716,6 +721,7 @@ impl BpOmicronZone { nic, external_tls, external_dns_servers, + nexus_generation, }) => { // Set the common fields bp_omicron_zone @@ -733,6 +739,8 @@ impl BpOmicronZone { .map(IpNetwork::from) .collect(), ); + bp_omicron_zone.nexus_generation = + Some(Generation::from(*nexus_generation)); } BlueprintZoneType::Oximeter(blueprint_zone_type::Oximeter { address, @@ -938,6 +946,9 @@ impl BpOmicronZone { .into_iter() .map(|i| i.ip()) .collect(), + nexus_generation: *self.nexus_generation.ok_or_else( + || anyhow!("expected 'nexus_generation'"), + )?, }) } ZoneType::Oximeter => { diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 42c1755e13a..69c2233a094 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(181, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(182, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. 
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(182, "nexus-generation"), KnownVersion::new(181, "rename-nat-table"), KnownVersion::new(180, "sled-cpu-family"), KnownVersion::new(179, "add-pending-mgs-updates-host-phase-1"), diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs index 5361bc418d7..1f2b7e6e596 100644 --- a/nexus/db-queries/src/db/datastore/deployment.rs +++ b/nexus/db-queries/src/db/datastore/deployment.rs @@ -548,6 +548,7 @@ impl DataStore { internal_dns_version, external_dns_version, target_release_minimum_generation, + nexus_generation, cockroachdb_fingerprint, cockroachdb_setting_preserve_downgrade, time_created, @@ -574,6 +575,7 @@ impl DataStore { *blueprint.internal_dns_version, *blueprint.external_dns_version, *blueprint.target_release_minimum_generation, + *blueprint.nexus_generation, blueprint.cockroachdb_fingerprint, blueprint.cockroachdb_setting_preserve_downgrade, blueprint.time_created, @@ -1325,6 +1327,7 @@ impl DataStore { internal_dns_version, external_dns_version, target_release_minimum_generation, + nexus_generation, cockroachdb_fingerprint, cockroachdb_setting_preserve_downgrade, clickhouse_cluster_config, @@ -4263,6 +4266,7 @@ mod tests { }, external_tls: false, external_dns_servers: vec![], + nexus_generation: Generation::new(), }, ), image_source: BlueprintZoneImageSource::InstallDataset, diff --git a/nexus/db-queries/src/db/datastore/deployment/external_networking.rs b/nexus/db-queries/src/db/datastore/deployment/external_networking.rs index 3364a15380d..cd0473cf833 100644 --- a/nexus/db-queries/src/db/datastore/deployment/external_networking.rs +++ b/nexus/db-queries/src/db/datastore/deployment/external_networking.rs @@ -433,6 +433,7 @@ mod tests { use omicron_common::address::NEXUS_OPTE_IPV4_SUBNET; use omicron_common::address::NTP_OPTE_IPV4_SUBNET; use omicron_common::address::NUM_SOURCE_NAT_PORTS; + use omicron_common::api::external::Generation; use omicron_common::api::external::MacAddr; use omicron_common::api::external::Vni; use omicron_common::zpool_name::ZpoolName; @@ -617,6 +618,7 @@ mod tests { nic: self.nexus_nic.clone(), external_tls: false, external_dns_servers: Vec::new(), + nexus_generation: Generation::new(), }, ), image_source: BlueprintZoneImageSource::InstallDataset, diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index ce0e4f72244..cf546bbb968 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -1064,6 +1064,7 @@ mod test { internal_dns_version: *Generation::new(), external_dns_version: *Generation::new(), target_release_minimum_generation: *Generation::new(), + nexus_generation: *Generation::new(), cockroachdb_fingerprint: String::new(), clickhouse_cluster_config: None, oximeter_read_version: *Generation::new(), @@ -1488,6 +1489,7 @@ mod test { slot: 0, transit_ips: vec![], }, + nexus_generation: *Generation::new(), }, ), image_source: BlueprintZoneImageSource::InstallDataset, @@ -1558,6 +1560,7 @@ mod test { internal_dns_version: *Generation::new(), external_dns_version: *Generation::new(), target_release_minimum_generation: *Generation::new(), + nexus_generation: *Generation::new(), cockroachdb_fingerprint: String::new(), clickhouse_cluster_config: None, oximeter_read_version: *Generation::new(), @@ -1744,6 +1747,7 @@ mod test { slot: 0, transit_ips: vec![], }, + nexus_generation: *Generation::new(), }, ), image_source: 
BlueprintZoneImageSource::InstallDataset, @@ -1777,6 +1781,7 @@ mod test { slot: 0, transit_ips: vec![], }, + nexus_generation: *Generation::new(), }, ), image_source: BlueprintZoneImageSource::InstallDataset, @@ -1821,6 +1826,7 @@ mod test { internal_dns_version: *Generation::new(), external_dns_version: *Generation::new(), target_release_minimum_generation: *Generation::new(), + nexus_generation: *Generation::new(), cockroachdb_fingerprint: String::new(), clickhouse_cluster_config: None, oximeter_read_version: *Generation::new(), @@ -2015,6 +2021,7 @@ mod test { slot: 0, transit_ips: vec![], }, + nexus_generation: *Generation::new(), }, ), image_source: BlueprintZoneImageSource::InstallDataset, @@ -2033,6 +2040,7 @@ mod test { internal_dns_version: *Generation::new(), external_dns_version: *Generation::new(), target_release_minimum_generation: *Generation::new(), + nexus_generation: *Generation::new(), cockroachdb_fingerprint: String::new(), clickhouse_cluster_config: None, oximeter_read_version: *Generation::new(), @@ -2155,6 +2163,7 @@ mod test { slot: 0, transit_ips: vec![], }, + nexus_generation: *Generation::new(), }, ), image_source: BlueprintZoneImageSource::InstallDataset, @@ -2175,6 +2184,7 @@ mod test { internal_dns_version: *Generation::new(), external_dns_version: *Generation::new(), target_release_minimum_generation: *Generation::new(), + nexus_generation: *Generation::new(), cockroachdb_fingerprint: String::new(), clickhouse_cluster_config: None, oximeter_read_version: *Generation::new(), diff --git a/nexus/db-queries/src/db/datastore/vpc.rs b/nexus/db-queries/src/db/datastore/vpc.rs index 31f2111a0c8..0bfe2e3bdfe 100644 --- a/nexus/db-queries/src/db/datastore/vpc.rs +++ b/nexus/db-queries/src/db/datastore/vpc.rs @@ -3330,12 +3330,12 @@ mod tests { ) .expect("ensured disks"); } + let must_have_nexus_zones = false; builder - .sled_add_zone_nexus_with_config( + .sled_add_zone_nexus_internal( sled_ids[2], - false, - Vec::new(), BlueprintZoneImageSource::InstallDataset, + must_have_nexus_zones, ) .expect("added nexus to third sled"); builder.build() @@ -3405,12 +3405,12 @@ mod tests { ) .expect("created blueprint builder"); for &sled_id in &sled_ids { + let must_have_nexus_zones = false; builder - .sled_add_zone_nexus_with_config( + .sled_add_zone_nexus_internal( sled_id, - false, - Vec::new(), BlueprintZoneImageSource::InstallDataset, + must_have_nexus_zones, ) .expect("added nexus to third sled"); } diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index c0f2fe5f833..4efdc617ec7 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1966,6 +1966,8 @@ table! { cockroachdb_setting_preserve_downgrade -> Nullable, target_release_minimum_generation -> Int8, + + nexus_generation -> Int8, } } @@ -2066,6 +2068,7 @@ table! 
{ filesystem_pool -> Uuid, image_source -> crate::enums::BpZoneImageSourceEnum, image_artifact_sha256 -> Nullable, + nexus_generation -> Nullable, } } diff --git a/nexus/reconfigurator/blippy/src/blippy.rs b/nexus/reconfigurator/blippy/src/blippy.rs index 07192c7f276..dbcd4858570 100644 --- a/nexus/reconfigurator/blippy/src/blippy.rs +++ b/nexus/reconfigurator/blippy/src/blippy.rs @@ -14,6 +14,7 @@ use nexus_types::inventory::ZpoolName; use omicron_common::address::DnsSubnet; use omicron_common::address::Ipv6Subnet; use omicron_common::address::SLED_PREFIX; +use omicron_common::api::external::Generation; use omicron_common::api::external::MacAddr; use omicron_common::disk::DatasetKind; use omicron_common::disk::M2Slot; @@ -193,6 +194,12 @@ pub enum SledKind { version: BlueprintArtifactVersion, hash: ArtifactHash, }, + /// Nexus zones with the same generation have different image sources. + NexusZoneGenerationImageSourceMismatch { + zone1: BlueprintZoneConfig, + zone2: BlueprintZoneConfig, + generation: Generation, + }, } impl fmt::Display for SledKind { @@ -415,6 +422,18 @@ impl fmt::Display for SledKind { (version {version}, hash {hash})", ) } + SledKind::NexusZoneGenerationImageSourceMismatch { + zone1, + zone2, + generation, + } => { + write!( + f, + "Nexus zones {} and {} both have generation {generation} but \ + different image sources ({:?} vs {:?})", + zone1.id, zone2.id, zone1.image_source, zone2.image_source, + ) + } } } } diff --git a/nexus/reconfigurator/blippy/src/checks.rs b/nexus/reconfigurator/blippy/src/checks.rs index 3894ae335b3..264e8ee5d02 100644 --- a/nexus/reconfigurator/blippy/src/checks.rs +++ b/nexus/reconfigurator/blippy/src/checks.rs @@ -21,6 +21,7 @@ use nexus_types::deployment::blueprint_zone_type; use omicron_common::address::DnsSubnet; use omicron_common::address::Ipv6Subnet; use omicron_common::address::SLED_PREFIX; +use omicron_common::api::external::Generation; use omicron_common::disk::DatasetKind; use omicron_common::disk::M2Slot; use omicron_uuid_kinds::MupdateOverrideUuid; @@ -37,6 +38,7 @@ pub(crate) fn perform_all_blueprint_only_checks(blippy: &mut Blippy<'_>) { check_dataset_zpool_uniqueness(blippy); check_datasets(blippy); check_mupdate_override(blippy); + check_nexus_generation_consistency(blippy); } fn check_underlay_ips(blippy: &mut Blippy<'_>) { @@ -632,6 +634,55 @@ fn check_mupdate_override_host_phase_2_contents( } } +fn check_nexus_generation_consistency(blippy: &mut Blippy<'_>) { + use std::collections::HashMap; + + // Map from generation -> (sled_id, image_source, zone) + let mut generation_info: HashMap< + Generation, + Vec<(SledUuid, BlueprintZoneImageSource, &BlueprintZoneConfig)>, + > = HashMap::new(); + + // Collect all Nexus zones and their generations + for (sled_id, zone) in blippy + .blueprint() + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + { + if let BlueprintZoneType::Nexus(nexus) = &zone.zone_type { + generation_info.entry(nexus.nexus_generation).or_default().push(( + sled_id, + zone.image_source.clone(), + zone, + )); + } + } + + // Check each generation for image source consistency + for (generation, zones_with_gen) in generation_info { + if zones_with_gen.len() < 2 { + continue; // Only one zone with this generation, no consistency issue + } + + // Take the first zone as the reference + let (ref_sled_id, ref_image_source, ref_zone) = &zones_with_gen[0]; + + // Compare all other zones to the reference + for (_sled_id, image_source, zone) in &zones_with_gen[1..] 
{ + if image_source != ref_image_source { + blippy.push_sled_note( + *ref_sled_id, + Severity::Fatal, + SledKind::NexusZoneGenerationImageSourceMismatch { + zone1: (*ref_zone).clone(), + zone2: (*zone).clone(), + generation, + }, + ); + } + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -1812,4 +1863,105 @@ mod tests { logctx.cleanup_successful(); } + + #[test] + fn test_nexus_generation_consistency() { + static TEST_NAME: &str = "test_nexus_generation_consistency"; + let logctx = test_setup_log(TEST_NAME); + let (_, mut blueprint) = + ExampleSystemBuilder::new(&logctx.log, TEST_NAME) + .nsleds(3) + .nexus_count(3) + .build(); + + // Find the Nexus zones + let ((sled1, zone1_id), (sled2, zone2_id)) = { + let nexus_zones: Vec<_> = blueprint + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter_map(|(sled_id, zone)| { + if matches!(zone.zone_type, BlueprintZoneType::Nexus(_)) { + Some((sled_id, zone)) + } else { + None + } + }) + .collect(); + + // Should have exactly 3 Nexus zones + assert_eq!(nexus_zones.len(), 3); + + // Modify two zones to have the same generation but different image sources + let (sled1, zone1) = nexus_zones[0]; + let (sled2, zone2) = nexus_zones[1]; + + ((sled1, zone1.id), (sled2, zone2.id)) + }; + + let generation = Generation::new(); + + let zone1 = { + // Find the zones in the blueprint and modify them + let mut zone1_config = blueprint + .sleds + .get_mut(&sled1) + .unwrap() + .zones + .get_mut(&zone1_id) + .unwrap(); + + match &mut zone1_config.zone_type { + BlueprintZoneType::Nexus(nexus) => { + nexus.nexus_generation = generation; + } + _ => unreachable!("this is a Nexus zone"), + } + zone1_config.image_source = + BlueprintZoneImageSource::InstallDataset; + zone1_config.clone() + }; + + let zone2 = { + let mut zone2_config = blueprint + .sleds + .get_mut(&sled2) + .unwrap() + .zones + .get_mut(&zone2_id) + .unwrap(); + + match &mut zone2_config.zone_type { + BlueprintZoneType::Nexus(nexus) => { + nexus.nexus_generation = generation; + } + _ => unreachable!("this is a Nexus zone"), + } + zone2_config.image_source = BlueprintZoneImageSource::Artifact { + version: BlueprintArtifactVersion::Available { + version: "1.0.0".parse().unwrap(), + }, + hash: ArtifactHash([0; 32]), + }; + zone2_config.clone() + }; + + // Run blippy checks + let expected_notes = [Note { + severity: Severity::Fatal, + kind: Kind::Sled { + sled_id: sled1, + kind: SledKind::NexusZoneGenerationImageSourceMismatch { + zone1, + zone2, + generation, + }, + }, + }]; + + let report = + Blippy::new(&blueprint).into_report(BlippyReportSortKey::Kind); + eprintln!("{}", report.display()); + assert_eq!(report.notes(), &expected_notes); + + logctx.cleanup_successful(); + } } diff --git a/nexus/reconfigurator/cli-integration-tests/tests/integration/blueprint_edit.rs b/nexus/reconfigurator/cli-integration-tests/tests/integration/blueprint_edit.rs index fbfed8169eb..33301d9408b 100644 --- a/nexus/reconfigurator/cli-integration-tests/tests/integration/blueprint_edit.rs +++ b/nexus/reconfigurator/cli-integration-tests/tests/integration/blueprint_edit.rs @@ -116,7 +116,7 @@ async fn test_blueprint_edit(cptestctx: &ControlPlaneTestContext) { // Assemble state that we can load into reconfigurator-cli. 
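    // The trailing `None` in the reconfigurator_state_load() call below, like
    // the new fourth argument to PlanningInputFromDb::assemble() elsewhere in
    // this patch, appears (judging by set_current_nexus_zone_id() in the
    // planning-input changes) to identify the Nexus zone the caller is running
    // in; tests and omdb have no such zone and pass None. This reading is an
    // inference from the surrounding changes, not something the patch states.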
let state1 = nexus_reconfigurator_preparation::reconfigurator_state_load( - &opctx, datastore, + &opctx, datastore, None, ) .await .expect("failed to assemble reconfigurator state"); diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 85bc01a30ac..5a46b2bb054 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -597,6 +597,7 @@ mod test { nic, external_tls, external_dns_servers, + nexus_generation: Generation::new(), }) } OmicronZoneType::Oximeter { address } => { @@ -720,6 +721,7 @@ mod test { internal_dns_version: initial_dns_generation, external_dns_version: Generation::new(), target_release_minimum_generation: Generation::new(), + nexus_generation: Generation::new(), cockroachdb_fingerprint: String::new(), clickhouse_cluster_config: None, oximeter_read_version: Generation::new(), @@ -1509,7 +1511,7 @@ mod test { chicken_switches: PlannerChickenSwitches::default(), log, } - .build() + .build(None) .unwrap() .into_builder(); diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index 29c9c67bad4..6d987482152 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -139,6 +139,22 @@ pub enum Error { AllocateInternalDnsSubnet(#[from] NoAvailableDnsSubnets), #[error("error allocating external networking resources")] AllocateExternalNetworking(#[from] ExternalNetworkingError), + #[error( + "mismatch while setting top-level nexus_generation for blueprint, \ + expected current value is {expected} but actual value is {actual}" + )] + NexusGenerationMismatch { expected: Generation, actual: Generation }, + #[error( + "mismatch while setting nexus_generation for a zone with an old image, \ + expected current value is {expected} but actual value is {actual}" + )] + OldImageNexusGenerationMismatch { expected: Generation, actual: Generation }, + #[error( + "mismatch while setting nexus_generation for a zone with a new image, \ + expected current value is {expected} (or that +1) but actual value is \ + {actual}" + )] + NewImageNexusGenerationMismatch { expected: Generation, actual: Generation }, #[error("can only have {INTERNAL_DNS_REDUNDANCY} internal DNS servers")] PolicySpecifiesTooManyInternalDnsServers, #[error("zone is already up-to-date and should not be updated")] @@ -357,6 +373,10 @@ pub(crate) enum Operation { current_generation: Generation, new_generation: Generation, }, + SetNexusGeneration { + current_generation: Generation, + new_generation: Generation, + }, SledNoopZoneImageSourcesUpdated { sled_id: SledUuid, count: usize, @@ -442,6 +462,13 @@ impl fmt::Display for Operation { {current_generation} to {new_generation}" ) } + Self::SetNexusGeneration { current_generation, new_generation } => { + write!( + f, + "updated nexus generation from \ + {current_generation} to {new_generation}" + ) + } } } } @@ -492,6 +519,7 @@ pub struct BlueprintBuilder<'a> { sled_editors: BTreeMap, cockroachdb_setting_preserve_downgrade: CockroachDbPreserveDowngrade, target_release_minimum_generation: Generation, + nexus_generation: Generation, report: Option, creator: String, @@ -559,6 +587,7 @@ impl<'a> BlueprintBuilder<'a> { internal_dns_version: Generation::new(), external_dns_version: Generation::new(), target_release_minimum_generation: Generation::new(), + nexus_generation: Generation::new(), cockroachdb_fingerprint: 
String::new(), cockroachdb_setting_preserve_downgrade: CockroachDbPreserveDowngrade::DoNotModify, @@ -640,6 +669,7 @@ impl<'a> BlueprintBuilder<'a> { pending_mgs_updates: parent_blueprint.pending_mgs_updates.clone(), target_release_minimum_generation: parent_blueprint .target_release_minimum_generation, + nexus_generation: parent_blueprint.nexus_generation, report: None, creator: creator.to_owned(), operations: Vec::new(), @@ -822,6 +852,7 @@ impl<'a> BlueprintBuilder<'a> { external_dns_version: self.input.external_dns_version(), target_release_minimum_generation: self .target_release_minimum_generation, + nexus_generation: self.nexus_generation, cockroachdb_fingerprint: self .input .cockroachdb_settings() @@ -1499,6 +1530,137 @@ impl<'a> BlueprintBuilder<'a> { &mut self, sled_id: SledUuid, image_source: BlueprintZoneImageSource, + ) -> Result<(), Error> { + let must_have_nexus_zones = true; + self.sled_add_zone_nexus_internal( + sled_id, + image_source, + must_have_nexus_zones, + ) + } + + // Determines TLS and DNS server configuration from existing Nexus zones. + // + // Returns `Some((external_tls, external_dns_servers))` if existing Nexus + // zones are found, or `None` if no existing Nexus zones exist. + fn determine_nexus_tls_dns_config(&self) -> Option<(bool, Vec)> { + self.parent_blueprint + .all_omicron_zones(BlueprintZoneDisposition::any) + .find_map(|(_, z)| match &z.zone_type { + BlueprintZoneType::Nexus(nexus) => Some(( + nexus.external_tls, + nexus.external_dns_servers.clone(), + )), + _ => None, + }) + } + + // Determines the appropriate generation number for a new Nexus zone. + // + // Returns `Some(generation)` if a generation can be determined from existing + // Nexus zones, or `None` if no existing Nexus zones exist. + // + // The logic is: + // - If any existing Nexus zone has the same image source, reuse its generation + // - Otherwise, use the highest existing generation + 1 + // - If no existing zones exist, return None + // + // This function also validates that the determined generation matches the + // top-level current blueprint generation. + fn determine_nexus_generation( + &self, + image_source: &BlueprintZoneImageSource, + ) -> Result, Error> { + // If any other Nexus in the blueprint has the same image source, + // use it. Otherwise, use the highest generation number + 1. + // + // TODO: This will check the parent blueprint, but perhaps should + // also be checking all "pending" updates in "sled_editors". + // If we are adding "multiple new nexus zones" in a blueprint, + // they'll all happen to get a generation number equal to "the previous + // highest generation, plus 1". But if, for some weird reason, + // we added multiple Nexuses with different new image sources in a single + // blueprint, they'd also get assigned the same generation (which should + // be a bug). + // + // In the meantime: There is a blippy check to verify that all Nexus + // zones with the same generation have the same image source. + let mut highest_seen_generation = None; + let mut same_image_nexus_generation = None; + + for (zone, nexus) in self + .parent_blueprint + .all_omicron_zones(BlueprintZoneDisposition::any) + .filter_map(|(_, z)| match &z.zone_type { + BlueprintZoneType::Nexus(nexus) => Some((z, nexus)), + _ => None, + }) + { + if zone.image_source == *image_source { + // If the image matches exactly, use it. 
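            // Worked example (illustrative, not from the patch): if the parent
            // blueprint has three Nexus zones at generation 1, all running the
            // install dataset, adding another install-dataset Nexus reuses
            // generation 1 via this branch, while adding a Nexus whose image is
            // a new TUF artifact falls through to "highest generation seen,
            // plus one" and is assigned generation 2.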
+ same_image_nexus_generation = Some(nexus.nexus_generation); + break; + } else if let Some(gen) = highest_seen_generation { + // Otherwise, use the generation number if it's the highest + // we've seen + if nexus.nexus_generation > gen { + highest_seen_generation = Some(nexus.nexus_generation); + } + } else { + // Use it regardless if it's the first generation number we've + // seen + highest_seen_generation = Some(nexus.nexus_generation); + } + } + + let determined_generation = match same_image_nexus_generation { + Some(gen) => Some(gen), + None => highest_seen_generation.map(|gen| gen.next()), + }; + + // Validate that the determined generation matches the top-level current blueprint generation + if let Some(gen) = determined_generation { + let current_blueprint_gen = self.parent_blueprint.nexus_generation; + if same_image_nexus_generation.is_some() { + // Existing image - should either match the currently-used Nexus + // generation, or be part of a "generation + 1". + let matches_current_nexus = current_blueprint_gen == gen; + let matches_next_nexus = current_blueprint_gen.next() == gen; + + if !matches_current_nexus && !matches_next_nexus { + return Err(Error::OldImageNexusGenerationMismatch { + expected: current_blueprint_gen, + actual: gen, + }); + } + } else { + // New image source - should be current blueprint generation + 1 + let expected_gen = current_blueprint_gen.next(); + if gen != expected_gen { + return Err(Error::NewImageNexusGenerationMismatch { + expected: expected_gen, + actual: gen, + }); + } + } + } + + Ok(determined_generation) + } + + /// Adds a nexus zone on this sled. + /// + /// If `must_have_nexus_zones` is true, then other Nexus zones + /// are used to determine configuration settings (e.g., TLS, + /// DNS servers, generation number). + /// + /// If `must_have_nexus_zones` is false, then these settings + /// are permitted to use default values. + pub fn sled_add_zone_nexus_internal( + &mut self, + sled_id: SledUuid, + image_source: BlueprintZoneImageSource, + must_have_nexus_zones: bool, ) -> Result<(), Error> { // Whether Nexus should use TLS and what the external DNS servers it // should use are currently provided at rack-setup time, and should be @@ -1511,31 +1673,48 @@ impl<'a> BlueprintBuilder<'a> { // check that we're if this builder is being used to make such a change, // that change is also reflected here in a new zone. Perhaps these // settings should be part of `Policy` instead? - let (external_tls, external_dns_servers) = self - .parent_blueprint - .all_omicron_zones(BlueprintZoneDisposition::any) - .find_map(|(_, z)| match &z.zone_type { - BlueprintZoneType::Nexus(nexus) => Some(( - nexus.external_tls, - nexus.external_dns_servers.clone(), - )), - _ => None, - }) - .ok_or(Error::NoNexusZonesInParentBlueprint)?; + let (external_tls, external_dns_servers) = + match self.determine_nexus_tls_dns_config() { + Some(config) => config, + None => { + if must_have_nexus_zones { + return Err(Error::NoNexusZonesInParentBlueprint); + } else { + (false, Vec::new()) + } + } + }; + + let nexus_generation = + match self.determine_nexus_generation(&image_source)? { + Some(generation) => generation, + None => { + if must_have_nexus_zones { + return Err(Error::NoNexusZonesInParentBlueprint); + } else { + // If there are no existing Nexus zones, start with whatever the top-level + // blueprint value happens to be. 
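                    // (Bootstrap fallback: callers that build a system from
                    // scratch, such as the example-system builder and the VPC
                    // datastore tests in this patch, pass
                    // must_have_nexus_zones = false, so the first Nexus zones
                    // inherit the parent blueprint's nexus_generation along
                    // with the default TLS/DNS settings above.)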
+ self.parent_blueprint.nexus_generation + } + } + }; + self.sled_add_zone_nexus_with_config( sled_id, external_tls, external_dns_servers, image_source, + nexus_generation, ) } - pub fn sled_add_zone_nexus_with_config( + fn sled_add_zone_nexus_with_config( &mut self, sled_id: SledUuid, external_tls: bool, external_dns_servers: Vec, image_source: BlueprintZoneImageSource, + nexus_generation: Generation, ) -> Result<(), Error> { let nexus_id = self.rng.sled_rng(sled_id).next_zone(); let ExternalNetworkingChoice { @@ -1573,6 +1752,7 @@ impl<'a> BlueprintBuilder<'a> { nic, external_tls, external_dns_servers: external_dns_servers.clone(), + nexus_generation, }); let filesystem_pool = self.sled_select_zpool(sled_id, zone_type.kind())?; @@ -2147,6 +2327,32 @@ impl<'a> BlueprintBuilder<'a> { Ok(()) } + /// Get the value of `nexus_generation`. + pub fn nexus_generation(&self) -> Generation { + self.nexus_generation + } + + /// Given the current value of `nexus_generation`, set the new value for + /// this blueprint. + pub fn set_nexus_generation( + &mut self, + current_generation: Generation, + new_generation: Generation, + ) -> Result<(), Error> { + if self.nexus_generation != current_generation { + return Err(Error::NexusGenerationMismatch { + expected: current_generation, + actual: self.nexus_generation, + }); + } + self.nexus_generation = new_generation; + self.record_operation(Operation::SetNexusGeneration { + current_generation, + new_generation, + }); + Ok(()) + } + /// Allow a test to manually add an external DNS address, which could /// ordinarily only come from RSS. /// @@ -3647,4 +3853,469 @@ pub mod test { logctx.cleanup_successful(); } + + /// Test nexus generation assignment logic for new zones + #[test] + fn test_nexus_generation_assignment_new_generation() { + static TEST_NAME: &str = + "test_nexus_generation_assignment_new_generation"; + let logctx = test_setup_log(TEST_NAME); + let mut rng = SimRngState::from_seed(TEST_NAME); + + // Start with a system that has no Nexus zones + let (example_system, blueprint) = + ExampleSystemBuilder::new(&logctx.log, TEST_NAME) + .nexus_count(0) + .build(); + verify_blueprint(&blueprint); + + let mut builder = BlueprintBuilder::new_based_on( + &logctx.log, + &blueprint, + &example_system.input, + &example_system.collection, + "test", + rng.next_planner_rng(), + ) + .expect("failed to create builder"); + + // Get first sled + let sled_id = example_system + .input + .all_sled_ids(SledFilter::Commissioned) + .next() + .unwrap(); + let image_source = BlueprintZoneImageSource::InstallDataset; + + // Add first Nexus zone - should get generation 1 + builder + .sled_add_zone_nexus_internal(sled_id, image_source.clone(), false) + .expect("failed to add nexus zone"); + + let blueprint1 = builder.build(); + verify_blueprint(&blueprint1); + + // Find the nexus zone and verify it has generation 1 + let nexus_zones: Vec<_> = blueprint1 + .all_omicron_zones(BlueprintZoneDisposition::any) + .filter_map(|(_, zone)| match &zone.zone_type { + BlueprintZoneType::Nexus(nexus) => Some(nexus), + _ => None, + }) + .collect(); + + assert_eq!(nexus_zones.len(), 1); + assert_eq!(nexus_zones[0].nexus_generation, Generation::new()); + + logctx.cleanup_successful(); + } + + /// Test that adding a Nexus zone with the same image source as an existing + /// Nexus zone re-uses the same generation number + #[test] + fn test_nexus_generation_assignment_same_image_reuse() { + static TEST_NAME: &str = + "test_nexus_generation_assignment_same_image_reuse"; + let logctx = 
test_setup_log(TEST_NAME); + let mut rng = SimRngState::from_seed(TEST_NAME); + + // Start with a system that has one Nexus zone + let (example_system, blueprint) = + ExampleSystemBuilder::new(&logctx.log, TEST_NAME) + .nexus_count(1) + .build(); + verify_blueprint(&blueprint); + + // Get the generation of the existing nexus zone + let existing_nexus_gen = blueprint + .all_omicron_zones(BlueprintZoneDisposition::any) + .find_map(|(_, zone)| match &zone.zone_type { + BlueprintZoneType::Nexus(nexus) => { + // We're gonna add a new Nexus with this source in a moment + // - we want to be sure this image_source matches. + assert_eq!( + zone.image_source, + BlueprintZoneImageSource::InstallDataset + ); + Some(nexus.nexus_generation) + } + _ => None, + }) + .expect("should have found existing nexus"); + + let mut builder = BlueprintBuilder::new_based_on( + &logctx.log, + &blueprint, + &example_system.input, + &example_system.collection, + "test", + rng.next_planner_rng(), + ) + .expect("failed to create builder"); + + // Get a different sled + let sled_ids: Vec<_> = example_system + .input + .all_sled_ids(SledFilter::Commissioned) + .collect(); + let second_sled_id = sled_ids[1]; + let image_source = BlueprintZoneImageSource::InstallDataset; + + // Add another Nexus zone with same image source - should reuse generation + builder + .sled_add_zone_nexus_internal( + second_sled_id, + image_source.clone(), + false, + ) + .expect("failed to add nexus zone"); + + let blueprint2 = builder.build(); + verify_blueprint(&blueprint2); + + // Find all nexus zones and verify they have the same generation + let nexus_zones: Vec<_> = blueprint2 + .all_omicron_zones(BlueprintZoneDisposition::any) + .filter_map(|(_, zone)| match &zone.zone_type { + BlueprintZoneType::Nexus(nexus) => Some(nexus), + _ => None, + }) + .collect(); + + assert_eq!(nexus_zones.len(), 2); + assert_eq!(nexus_zones[0].nexus_generation, existing_nexus_gen); + assert_eq!(nexus_zones[1].nexus_generation, existing_nexus_gen); + + logctx.cleanup_successful(); + } + + /// Test nexus generation assignment logic for different image sources + #[test] + fn test_nexus_generation_assignment_different_image_increment() { + static TEST_NAME: &str = + "test_nexus_generation_assignment_different_image_increment"; + let logctx = test_setup_log(TEST_NAME); + let mut rng = SimRngState::from_seed(TEST_NAME); + + // Start with a system that has one Nexus zone + let (example_system, blueprint) = + ExampleSystemBuilder::new(&logctx.log, TEST_NAME) + .nexus_count(1) + .build(); + verify_blueprint(&blueprint); + + // Get the generation of the existing nexus zone + let existing_nexus_gen = blueprint + .all_omicron_zones(BlueprintZoneDisposition::any) + .find_map(|(_, zone)| match &zone.zone_type { + BlueprintZoneType::Nexus(nexus) => { + assert_eq!( + zone.image_source, + BlueprintZoneImageSource::InstallDataset + ); + Some(nexus.nexus_generation) + } + _ => None, + }) + .expect("should have found existing nexus"); + + let mut builder = BlueprintBuilder::new_based_on( + &logctx.log, + &blueprint, + &example_system.input, + &example_system.collection, + "test", + rng.next_planner_rng(), + ) + .expect("failed to create builder"); + + // Get a different sled + let sled_ids: Vec<_> = example_system + .input + .all_sled_ids(SledFilter::Commissioned) + .collect(); + let second_sled_id = sled_ids[1]; + + // Use a different image source (artifact vs install dataset) + let different_image_source = BlueprintZoneImageSource::Artifact { + version: 
BlueprintArtifactVersion::Available { + version: ArtifactVersion::new_const("1.2.3.4"), + }, + hash: ArtifactHash([0x42; 32]), + }; + + // Add another Nexus zone with different image source - should increment generation + builder + .sled_add_zone_nexus_internal( + second_sled_id, + different_image_source.clone(), + false, + ) + .expect("failed to add nexus zone"); + + let blueprint2 = builder.build(); + verify_blueprint(&blueprint2); + + // Find all nexus zones and verify generations + let mut nexus_zones: Vec<_> = blueprint2 + .all_omicron_zones(BlueprintZoneDisposition::any) + .filter_map(|(_, zone)| match &zone.zone_type { + BlueprintZoneType::Nexus(nexus) => Some((zone, nexus)), + _ => None, + }) + .collect(); + + // Sort by generation to ensure predictable ordering + nexus_zones.sort_by_key(|(_, nexus)| nexus.nexus_generation); + + assert_eq!(nexus_zones.len(), 2); + assert_eq!(nexus_zones[0].1.nexus_generation, existing_nexus_gen); + assert_eq!( + nexus_zones[1].1.nexus_generation, + existing_nexus_gen.next() + ); + + // Verify image sources are different + assert_eq!( + nexus_zones[0].0.image_source, + BlueprintZoneImageSource::InstallDataset + ); + assert_eq!(nexus_zones[1].0.image_source, different_image_source); + + logctx.cleanup_successful(); + } + + /// Test nexus generation assignment logic with mixed old/new image sources + /// + /// Tests a scenario where we restore redundancy with existing image source + /// while also adding zones with new image source for upgrade. + #[test] + fn test_nexus_generation_assignment_multiple_generations() { + static TEST_NAME: &str = + "test_nexus_generation_assignment_multiple_generations"; + let logctx = test_setup_log(TEST_NAME); + let mut rng = SimRngState::from_seed(TEST_NAME); + + // Start with a system with one Nexus zone using the install dataset as an image source + let (example_system, blueprint) = + ExampleSystemBuilder::new(&logctx.log, TEST_NAME) + .nsleds(3) + .nexus_count(1) + .build(); + verify_blueprint(&blueprint); + + // Get the existing nexus zone's generation (should be generation 1) + let existing_nexus_gen = blueprint + .all_omicron_zones(BlueprintZoneDisposition::any) + .find_map(|(_, zone)| match &zone.zone_type { + BlueprintZoneType::Nexus(nexus) => Some(nexus.nexus_generation), + _ => None, + }) + .expect("should have found existing nexus"); + + let mut builder = BlueprintBuilder::new_based_on( + &logctx.log, + &blueprint, + &example_system.input, + &example_system.collection, + "test", + rng.next_planner_rng(), + ) + .expect("failed to create builder"); + + let sled_ids: Vec<_> = example_system + .input + .all_sled_ids(SledFilter::Commissioned) + .collect(); + + // Define image sources: A (same as existing Nexus) and B (new) + let image_source_a = BlueprintZoneImageSource::InstallDataset; + let image_source_b = BlueprintZoneImageSource::Artifact { + version: BlueprintArtifactVersion::Available { + version: ArtifactVersion::new_const("2.0.0"), + }, + hash: ArtifactHash([0x11; 32]), + }; + + // In a single BlueprintBuilder step, add: + // 1. One zone with image source A (should reuse existing generation) + // 2. 
One zone with image source B (should get existing generation + 1) + builder + .sled_add_zone_nexus_internal( + sled_ids[1], + image_source_a.clone(), + false, + ) + .expect("failed to add nexus zone with image source A"); + builder + .sled_add_zone_nexus_internal( + sled_ids[2], + image_source_b.clone(), + false, + ) + .expect("failed to add nexus zone with image source B"); + + let blueprint2 = builder.build(); + verify_blueprint(&blueprint2); + + // Collect all nexus zones and organize by image source + let mut nexus_by_image: std::collections::HashMap< + BlueprintZoneImageSource, + Vec, + > = std::collections::HashMap::new(); + + for (_, zone) in + blueprint2.all_omicron_zones(BlueprintZoneDisposition::any) + { + if let BlueprintZoneType::Nexus(nexus) = &zone.zone_type { + nexus_by_image + .entry(zone.image_source.clone()) + .or_insert_with(Vec::new) + .push(nexus.nexus_generation); + } + } + + // Should have 2 image sources now + assert_eq!(nexus_by_image.len(), 2); + + // Image source A should have 2 zones (original + new) with same generation + let image_a_gens = nexus_by_image.get(&image_source_a).unwrap(); + assert_eq!(image_a_gens.len(), 2); + assert_eq!(image_a_gens[0], existing_nexus_gen); + assert_eq!(image_a_gens[1], existing_nexus_gen); + + // Image source B should have 1 zone with next generation + let image_b_gens = nexus_by_image.get(&image_source_b).unwrap(); + assert_eq!(image_b_gens.len(), 1); + assert_eq!(image_b_gens[0], existing_nexus_gen.next()); + + logctx.cleanup_successful(); + } + + /// Test nexus generation validation against blueprint generation + #[test] + fn test_nexus_generation_blueprint_validation() { + static TEST_NAME: &str = "test_nexus_generation_blueprint_validation"; + let logctx = test_setup_log(TEST_NAME); + let mut rng = SimRngState::from_seed(TEST_NAME); + + // Start with a system that has one Nexus zone + let (example_system, mut blueprint) = + ExampleSystemBuilder::new(&logctx.log, TEST_NAME) + .nexus_count(1) + .build(); + verify_blueprint(&blueprint); + + // Manually modify the blueprint to create a mismatch: + // Set the top-level nexus_generation to 2, but keep the zone generation at 1 + blueprint.nexus_generation = Generation::new().next(); + + let mut builder = BlueprintBuilder::new_based_on( + &logctx.log, + &blueprint, + &example_system.input, + &example_system.collection, + "test", + rng.next_planner_rng(), + ) + .expect("failed to create builder"); + + let sled_ids: Vec<_> = example_system + .input + .all_sled_ids(SledFilter::Commissioned) + .collect(); + let image_source = BlueprintZoneImageSource::InstallDataset; // Same as existing + + // Try to add another Nexus zone with same image source + // This should fail because existing zone has generation 1 but blueprint has generation 2 + let result = builder.sled_add_zone_nexus_internal( + sled_ids[1], + image_source, + false, + ); + + match result { + Err(Error::OldImageNexusGenerationMismatch { + expected, + actual, + }) => { + assert_eq!(expected, Generation::new().next()); // Blueprint generation + assert_eq!(actual, Generation::new()); // Zone generation + } + other => panic!( + "Expected OldImageNexusGenerationMismatch error, got: {:?}", + other + ), + } + + logctx.cleanup_successful(); + } + + /// Test nexus generation validation for new image source + #[test] + fn test_nexus_generation_blueprint_validation_new_image() { + static TEST_NAME: &str = + "test_nexus_generation_blueprint_validation_new_image"; + let logctx = test_setup_log(TEST_NAME); + let mut rng = 
SimRngState::from_seed(TEST_NAME); + + // Start with a system that has one Nexus zone + let (example_system, mut blueprint) = + ExampleSystemBuilder::new(&logctx.log, TEST_NAME) + .nexus_count(1) + .build(); + verify_blueprint(&blueprint); + + // The zone has generation 1 and blueprint has generation 1 + // Now modify the blueprint generation to be different from what + // the new image source logic would expect + blueprint.nexus_generation = Generation::new().next().next(); // Set to generation 3 + + let mut builder = BlueprintBuilder::new_based_on( + &logctx.log, + &blueprint, + &example_system.input, + &example_system.collection, + "test", + rng.next_planner_rng(), + ) + .expect("failed to create builder"); + + let sled_ids: Vec<_> = example_system + .input + .all_sled_ids(SledFilter::Commissioned) + .collect(); + + // Use a different image source (this should get existing generation + 1 = 2) + let different_image_source = BlueprintZoneImageSource::Artifact { + version: BlueprintArtifactVersion::Available { + version: ArtifactVersion::new_const("2.0.0"), + }, + hash: ArtifactHash([0x42; 32]), + }; + + // Try to add a Nexus zone with different image source + // This should fail because the calculated generation (2) doesn't match blueprint generation + 1 (4) + let result = builder.sled_add_zone_nexus_internal( + sled_ids[1], + different_image_source, + false, + ); + + match result { + Err(Error::NewImageNexusGenerationMismatch { + expected, + actual, + }) => { + assert_eq!(expected, Generation::new().next().next().next()); // Blueprint generation + 1 = 4 + assert_eq!(actual, Generation::new().next()); // Calculated generation = 2 + } + other => panic!( + "Expected NewImageNexusGenerationMismatch error, got: {:?}", + other + ), + } + + logctx.cleanup_successful(); + } } diff --git a/nexus/reconfigurator/planning/src/example.rs b/nexus/reconfigurator/planning/src/example.rs index ce793980b2c..67ac9f79a2a 100644 --- a/nexus/reconfigurator/planning/src/example.rs +++ b/nexus/reconfigurator/planning/src/example.rs @@ -480,12 +480,12 @@ impl ExampleSystemBuilder { for _ in 0..nexus_count .on(discretionary_ix, discretionary_sled_count) { + let must_have_nexus_zones = false; builder - .sled_add_zone_nexus_with_config( + .sled_add_zone_nexus_internal( sled_id, - false, - vec![], image_source.clone(), + must_have_nexus_zones, ) .unwrap(); } @@ -547,6 +547,24 @@ impl ExampleSystemBuilder { } let blueprint = builder.build(); + + // Find the first Nexus zone to use as the current Nexus zone ID + let current_nexus_zone_id = blueprint + .sleds + .values() + .flat_map(|sled_cfg| sled_cfg.zones.iter()) + .find_map(|zone| match &zone.zone_type { + nexus_types::deployment::BlueprintZoneType::Nexus(_) => { + Some(zone.id) + } + _ => None, + }); + + // Set the current Nexus zone ID if we found one + if let Some(nexus_zone_id) = current_nexus_zone_id { + input_builder.set_current_nexus_zone_id(Some(nexus_zone_id)); + } + for sled_cfg in blueprint.sleds.values() { for zone in sled_cfg.zones.iter() { let service_id = zone.id; diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index d9633258d02..e6d4e943428 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -30,6 +30,7 @@ use nexus_types::deployment::BlueprintPhysicalDiskDisposition; use nexus_types::deployment::BlueprintZoneConfig; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneImageSource; +use 
nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::CockroachDbClusterVersion; use nexus_types::deployment::CockroachDbPreserveDowngrade; use nexus_types::deployment::CockroachDbSettings; @@ -41,9 +42,10 @@ use nexus_types::deployment::SledFilter; use nexus_types::deployment::TufRepoContentsError; use nexus_types::deployment::ZpoolFilter; use nexus_types::deployment::{ - CockroachdbUnsafeToShutdown, PlanningAddStepReport, - PlanningCockroachdbSettingsStepReport, PlanningDecommissionStepReport, - PlanningExpungeStepReport, PlanningMgsUpdatesStepReport, + CockroachdbUnsafeToShutdown, NexusGenerationBumpWaitingOn, + PlanningAddStepReport, PlanningCockroachdbSettingsStepReport, + PlanningDecommissionStepReport, PlanningExpungeStepReport, + PlanningMgsUpdatesStepReport, PlanningNexusGenerationBumpReport, PlanningNoopImageSourceStepReport, PlanningReport, PlanningZoneUpdatesStepReport, ZoneAddWaitingOn, ZoneUnsafeToShutdown, ZoneUpdatesWaitingOn, @@ -52,6 +54,7 @@ use nexus_types::external_api::views::PhysicalDiskPolicy; use nexus_types::external_api::views::SledPolicy; use nexus_types::external_api::views::SledState; use nexus_types::inventory::Collection; +use omicron_common::api::external::Generation; use omicron_common::policy::BOUNDARY_NTP_REDUNDANCY; use omicron_common::policy::COCKROACHDB_REDUNDANCY; use omicron_common::policy::INTERNAL_DNS_REDUNDANCY; @@ -109,6 +112,31 @@ const NUM_CONCURRENT_MGS_UPDATES: usize = 1; /// A receipt that `check_input_validity` has been run prior to planning. struct InputChecked; +#[derive(Debug)] +#[expect(dead_code)] +struct ZoneCurrentlyUpdating<'a> { + zone_id: OmicronZoneUuid, + zone_kind: ZoneKind, + reason: UpdatingReason<'a>, +} + +#[derive(Debug)] +#[expect(dead_code)] +enum UpdatingReason<'a> { + ImageSourceMismatch { + bp_image_source: &'a BlueprintZoneImageSource, + inv_image_source: &'a OmicronZoneImageSource, + }, + MissingInInventory { + bp_image_source: &'a BlueprintZoneImageSource, + }, + ReconciliationError { + bp_image_source: &'a BlueprintZoneImageSource, + inv_image_source: &'a OmicronZoneImageSource, + message: &'a str, + }, +} + pub struct Planner<'a> { log: Logger, input: &'a PlanningInput, @@ -230,6 +258,10 @@ impl<'a> Planner<'a> { self.do_plan_zone_updates(&mgs_updates)? }; + // We may need to bump the top-level Nexus generation number + // to update Nexus zones. + let nexus_generation_bump = self.do_plan_nexus_generation_update()?; + // CockroachDB settings aren't dependent on zones, so they can be // planned independently of the rest of the system. let cockroachdb_settings = self.do_plan_cockroachdb_settings(); @@ -243,6 +275,7 @@ impl<'a> Planner<'a> { add, mgs_updates, zone_updates, + nexus_generation_bump, cockroachdb_settings, }) } @@ -904,55 +937,103 @@ impl<'a> Planner<'a> { DiscretionaryOmicronZone::Nexus, DiscretionaryOmicronZone::Oximeter, ] { - let num_zones_to_add = - self.num_additional_zones_needed(zone_kind, report); - if num_zones_to_add == 0 { - continue; - } - // We need to add at least one zone; construct our `zone_placement` - // (or reuse the existing one if a previous loop iteration already - // created it). - let zone_placement = zone_placement.get_or_insert_with(|| { - // This constructs a picture of the sleds as we currently - // understand them, as far as which sleds have discretionary - // zones. This will remain valid as we loop through the - // `zone_kind`s in this function, as any zone additions will - // update the `zone_placement` heap in-place. 
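// A standalone sketch of the Nexus image-source selection added above,
// assuming a generic `Image` type; the planner itself works with
// `BlueprintZoneImageSource` values taken from the old and new TUF repos.
fn nexus_images_to_deploy<Image: PartialEq>(
    old_image: Image,
    new_image: Image,
    our_image: Option<&Image>,
) -> Vec<Image> {
    let mut images = Vec::new();
    if old_image != new_image {
        // Keep redundancy on the old image while the planning Nexus is
        // still running the old image, and always stage the new image.
        if our_image != Some(&new_image) {
            images.push(old_image);
        }
        images.push(new_image);
    } else {
        // No update in progress: only the old (current) image applies.
        images.push(old_image);
    }
    images
}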
- let current_discretionary_zones = self - .input - .all_sled_resources(SledFilter::Discretionary) - .filter(|(sled_id, _)| { - !report.sleds_waiting_for_ntp_zone.contains(&sled_id) - }) - .map(|(sled_id, sled_resources)| { - OmicronZonePlacementSledState { - sled_id, - num_zpools: sled_resources - .all_zpools(ZpoolFilter::InService) - .count(), - discretionary_zones: self - .blueprint - .current_sled_zones( - sled_id, - BlueprintZoneDisposition::is_in_service, - ) - .filter_map(|zone| { - DiscretionaryOmicronZone::from_zone_type( - &zone.zone_type, - ) - }) - .collect(), + let image_sources = match zone_kind { + DiscretionaryOmicronZone::Nexus => { + let old_image = self + .input + .old_repo() + .description() + .zone_image_source(zone_kind.into())?; + let new_image = self + .input + .tuf_repo() + .description() + .zone_image_source(zone_kind.into())?; + let our_image = self.lookup_current_nexus_image(); + + let mut images = vec![]; + if old_image != new_image { + // We may still want to deploy the old image alongside + // the new image: if we're running the "old version of a + // Nexus" currently, we need to ensure we have + // redundancy before the handoff completes. + if our_image.as_ref() != Some(&new_image) { + images.push(old_image); } - }); - OmicronZonePlacement::new(current_discretionary_zones) - }); - self.add_discretionary_zones( - zone_placement, - zone_kind, - num_zones_to_add, - mgs_updates, - report, - )?; + // If there is a new image for us to use, deploy it + // immediately. The new Nexus will hang around mostly + // idle until handoff is ready. + images.push(new_image.clone()); + } else { + // If there is no new image to use, use the old image. + images.push(old_image); + } + + assert!(!images.is_empty()); + images + } + _ => { + vec![self.image_source_for_new_zone( + zone_kind.into(), + mgs_updates, + )?] + } + }; + + for image_source in image_sources { + let num_zones_to_add = self.num_additional_zones_needed( + zone_kind, + &image_source, + report, + ); + if num_zones_to_add == 0 { + continue; + } + // We need to add at least one zone; construct our `zone_placement` + // (or reuse the existing one if a previous loop iteration already + // created it). + let zone_placement = zone_placement.get_or_insert_with(|| { + // This constructs a picture of the sleds as we currently + // understand them, as far as which sleds have discretionary + // zones. This will remain valid as we loop through the + // `zone_kind`s in this function, as any zone additions will + // update the `zone_placement` heap in-place. + let current_discretionary_zones = self + .input + .all_sled_resources(SledFilter::Discretionary) + .filter(|(sled_id, _)| { + !report.sleds_waiting_for_ntp_zone.contains(&sled_id) + }) + .map(|(sled_id, sled_resources)| { + OmicronZonePlacementSledState { + sled_id, + num_zpools: sled_resources + .all_zpools(ZpoolFilter::InService) + .count(), + discretionary_zones: self + .blueprint + .current_sled_zones( + sled_id, + BlueprintZoneDisposition::is_in_service, + ) + .filter_map(|zone| { + DiscretionaryOmicronZone::from_zone_type( + &zone.zone_type, + ) + }) + .collect(), + } + }); + OmicronZonePlacement::new(current_discretionary_zones) + }); + self.add_discretionary_zones( + zone_placement, + zone_kind, + num_zones_to_add, + image_source, + report, + )?; + } } Ok(()) @@ -962,7 +1043,8 @@ impl<'a> Planner<'a> { /// additional zones needed of the given `zone_kind` to satisfy the policy. 
fn num_additional_zones_needed( &mut self, - zone_kind: DiscretionaryOmicronZone, + discretionary_zone_kind: DiscretionaryOmicronZone, + image_source: &BlueprintZoneImageSource, report: &mut PlanningAddStepReport, ) -> usize { // Count the number of `kind` zones on all in-service sleds. This @@ -971,7 +1053,7 @@ impl<'a> Planner<'a> { // decommissioned. let mut num_existing_kind_zones = 0; for sled_id in self.input.all_sled_ids(SledFilter::InService) { - let zone_kind = ZoneKind::from(zone_kind); + let zone_kind = ZoneKind::from(discretionary_zone_kind); // Internal DNS is special: if we have an expunged internal DNS zone // that might still be running, we want to count it here: we can't @@ -986,11 +1068,20 @@ impl<'a> Planner<'a> { num_existing_kind_zones += self .blueprint .current_sled_zones(sled_id, disposition_filter) - .filter(|z| z.zone_type.kind() == zone_kind) + .filter(|z| { + let matches_kind = z.zone_type.kind() == zone_kind; + let matches_image = z.image_source == *image_source; + match discretionary_zone_kind { + DiscretionaryOmicronZone::Nexus => { + matches_kind && matches_image + } + _ => matches_kind, + } + }) .count(); } - let target_count = match zone_kind { + let target_count = match discretionary_zone_kind { DiscretionaryOmicronZone::BoundaryNtp => { self.input.target_boundary_ntp_zone_count() } @@ -1032,7 +1123,7 @@ impl<'a> Planner<'a> { target_count.saturating_sub(num_existing_kind_zones); if num_zones_to_add == 0 { report.sufficient_zones_exist( - ZoneKind::from(zone_kind).report_str(), + ZoneKind::from(discretionary_zone_kind).report_str(), target_count, num_existing_kind_zones, ); @@ -1050,7 +1141,7 @@ impl<'a> Planner<'a> { zone_placement: &mut OmicronZonePlacement, kind: DiscretionaryOmicronZone, num_zones_to_add: usize, - mgs_updates: &PlanningMgsUpdatesStepReport, + image_source: BlueprintZoneImageSource, report: &mut PlanningAddStepReport, ) -> Result<(), Error> { for i in 0..num_zones_to_add { @@ -1070,46 +1161,45 @@ impl<'a> Planner<'a> { } }; - let image_source = - self.image_source_for_new_zone(kind.into(), mgs_updates)?; + let image = image_source.clone(); match kind { DiscretionaryOmicronZone::BoundaryNtp => { self.blueprint.sled_promote_internal_ntp_to_boundary_ntp( - sled_id, - image_source, + sled_id, image, )? } - DiscretionaryOmicronZone::Clickhouse => self - .blueprint - .sled_add_zone_clickhouse(sled_id, image_source)?, + DiscretionaryOmicronZone::Clickhouse => { + self.blueprint.sled_add_zone_clickhouse(sled_id, image)? + } DiscretionaryOmicronZone::ClickhouseKeeper => self .blueprint - .sled_add_zone_clickhouse_keeper(sled_id, image_source)?, + .sled_add_zone_clickhouse_keeper(sled_id, image)?, DiscretionaryOmicronZone::ClickhouseServer => self .blueprint - .sled_add_zone_clickhouse_server(sled_id, image_source)?, - DiscretionaryOmicronZone::CockroachDb => self - .blueprint - .sled_add_zone_cockroachdb(sled_id, image_source)?, + .sled_add_zone_clickhouse_server(sled_id, image)?, + DiscretionaryOmicronZone::CockroachDb => { + self.blueprint.sled_add_zone_cockroachdb(sled_id, image)? 
+ } DiscretionaryOmicronZone::CruciblePantry => self .blueprint - .sled_add_zone_crucible_pantry(sled_id, image_source)?, - DiscretionaryOmicronZone::InternalDns => self - .blueprint - .sled_add_zone_internal_dns(sled_id, image_source)?, - DiscretionaryOmicronZone::ExternalDns => self - .blueprint - .sled_add_zone_external_dns(sled_id, image_source)?, + .sled_add_zone_crucible_pantry(sled_id, image)?, + DiscretionaryOmicronZone::InternalDns => { + self.blueprint.sled_add_zone_internal_dns(sled_id, image)? + } + DiscretionaryOmicronZone::ExternalDns => { + self.blueprint.sled_add_zone_external_dns(sled_id, image)? + } DiscretionaryOmicronZone::Nexus => { - self.blueprint.sled_add_zone_nexus(sled_id, image_source)? + self.blueprint.sled_add_zone_nexus(sled_id, image)? + } + DiscretionaryOmicronZone::Oximeter => { + self.blueprint.sled_add_zone_oximeter(sled_id, image)? } - DiscretionaryOmicronZone::Oximeter => self - .blueprint - .sled_add_zone_oximeter(sled_id, image_source)?, }; report.discretionary_zone_placed( sled_id, ZoneKind::from(kind).report_str(), + &image_source, ); } @@ -1191,13 +1281,9 @@ impl<'a> Planner<'a> { Ok(PlanningMgsUpdatesStepReport::new(pending_updates)) } - /// Update at most one existing zone to use a new image source. - fn do_plan_zone_updates( - &mut self, - mgs_updates: &PlanningMgsUpdatesStepReport, - ) -> Result { - let mut report = PlanningZoneUpdatesStepReport::new(); - + fn get_zones_not_yet_propagated_to_inventory( + &self, + ) -> Vec> { // We are only interested in non-decommissioned sleds. let sleds = self .input @@ -1212,31 +1298,7 @@ impl<'a> Planner<'a> { .map(|(z, sa_result)| (z.id, (&z.image_source, sa_result))) .collect::>(); - #[derive(Debug)] - #[expect(dead_code)] - struct ZoneCurrentlyUpdating<'a> { - zone_id: OmicronZoneUuid, - zone_kind: ZoneKind, - reason: UpdatingReason<'a>, - } - - #[derive(Debug)] - #[expect(dead_code)] - enum UpdatingReason<'a> { - ImageSourceMismatch { - bp_image_source: &'a BlueprintZoneImageSource, - inv_image_source: &'a OmicronZoneImageSource, - }, - MissingInInventory { - bp_image_source: &'a BlueprintZoneImageSource, - }, - ReconciliationError { - bp_image_source: &'a BlueprintZoneImageSource, - inv_image_source: &'a OmicronZoneImageSource, - message: &'a str, - }, - } - + let mut updating = vec![]; for &sled_id in &sleds { // Build a list of zones currently in the blueprint but where // inventory has a mismatch or does not know about the zone. @@ -1244,7 +1306,7 @@ impl<'a> Planner<'a> { // What about the case where a zone is in inventory but not in the // blueprint? See // https://github.com/oxidecomputer/omicron/issues/8589. - let zones_currently_updating = self + let mut zones_currently_updating = self .blueprint .current_sled_zones( sled_id, @@ -1306,17 +1368,35 @@ impl<'a> Planner<'a> { } }) .collect::>(); + updating.append(&mut zones_currently_updating); + } + updating + } - if !zones_currently_updating.is_empty() { - info!( - self.log, "some zones not yet up-to-date"; - "sled_id" => %sled_id, - "zones_currently_updating" => ?zones_currently_updating, - ); - return Ok(report); - } + /// Update at most one existing zone to use a new image source. 
+ fn do_plan_zone_updates( + &mut self, + mgs_updates: &PlanningMgsUpdatesStepReport, + ) -> Result { + let mut report = PlanningZoneUpdatesStepReport::new(); + + let zones_currently_updating = + self.get_zones_not_yet_propagated_to_inventory(); + if !zones_currently_updating.is_empty() { + info!( + self.log, "some zones not yet up-to-date"; + "zones_currently_updating" => ?zones_currently_updating, + ); + return Ok(report); } + // We are only interested in non-decommissioned sleds. + let sleds = self + .input + .all_sleds(SledFilter::Commissioned) + .map(|(id, _details)| id) + .collect::>(); + // Find out of date zones, as defined by zones whose image source does // not match what it should be based on our current target release. let target_release = self.input.tuf_repo().description(); @@ -1368,10 +1448,7 @@ impl<'a> Planner<'a> { if !self.can_zone_be_shut_down_safely(zone, &mut report) { return false; } - match self.is_zone_ready_for_update( - zone.zone_type.kind(), - mgs_updates, - ) { + match self.is_zone_ready_for_update(mgs_updates) { Ok(true) => true, Ok(false) => false, Err(err) => { @@ -1681,6 +1758,125 @@ impl<'a> Planner<'a> { Ok(reasons) } + // Determines whether or not the top-level "nexus_generation" + // value should be increased. + // + // Doing so will be a signal for all running Nexus instances at + // lower versions to start quiescing, and to perform handoff. + fn do_plan_nexus_generation_update( + &mut self, + ) -> Result { + let mut report = PlanningNexusGenerationBumpReport::new(); + + // Nexus can only be updated if all non-Nexus zones have been + // updated, i.e., their image source is an artifact from the new + // repo. + let new_repo = self.input.tuf_repo().description(); + + // If we don't actually have a TUF repo here, we can't do + // updates anyway; any return value is fine. + if new_repo.tuf_repo().is_none() { + return Ok(report); + } + + // Check that all in-service zones (other than Nexus) on all + // sleds have an image source consistent with `new_repo`. + for sled_id in self.blueprint.sled_ids_with_zones() { + for z in self.blueprint.current_sled_zones( + sled_id, + BlueprintZoneDisposition::is_in_service, + ) { + let kind = z.zone_type.kind(); + if kind != ZoneKind::Nexus + && z.image_source != new_repo.zone_image_source(kind)? + { + report.set_waiting_on( + NexusGenerationBumpWaitingOn::NonNexusZoneUpdate, + ); + return Ok(report); + } + } + } + + // Confirm that we have new nexuses at the desired generation number + let current_generation = self.blueprint.nexus_generation(); + let proposed_generation = self.blueprint.nexus_generation().next(); + let mut out_of_date_nexuses_at_current_gen = 0; + let mut nexuses_at_next_gen = 0; + for sled_id in self.blueprint.sled_ids_with_zones() { + for z in self.blueprint.current_sled_zones( + sled_id, + BlueprintZoneDisposition::is_in_service, + ) { + if let BlueprintZoneType::Nexus(nexus_zone) = &z.zone_type { + if nexus_zone.nexus_generation == proposed_generation { + nexuses_at_next_gen += 1; + } + + if nexus_zone.nexus_generation == current_generation + && z.image_source + != new_repo.zone_image_source(z.zone_type.kind())? + { + out_of_date_nexuses_at_current_gen += 1; + } + } + } + } + + if out_of_date_nexuses_at_current_gen == 0 { + // If all the current-generation Nexuses are "up-to-date", then we may have + // just completed handoff successfully. In this case, there's nothing to report. 
+ return Ok(report); + } else { + // If there aren't enough Nexuses at the next generation, quiescing could + // be a dangerous operation. Blueprint execution should be able to continue + // even if the new Nexuses haven't started, but to be conservative, we'll wait + // for the target count. + if nexuses_at_next_gen < self.input.target_nexus_zone_count() { + report.set_waiting_on( + NexusGenerationBumpWaitingOn::NewNexusBringup, + ); + return Ok(report); + } + } + + // Confirm that all blueprint zones have propagated to inventory + let zones_currently_updating = + self.get_zones_not_yet_propagated_to_inventory(); + if !zones_currently_updating.is_empty() { + info!( + self.log, "some zones not yet up-to-date"; + "zones_currently_updating" => ?zones_currently_updating, + ); + report + .set_waiting_on(NexusGenerationBumpWaitingOn::ZonePropagation); + return Ok(report); + } + + // If we're here: + // - There's a new repo + // - The current generation of Nexuses are considered "out-of-date" + // - There are Nexuses running with "current generation + 1" + // - All non-Nexus zones have updated + // - All other blueprint zones have propagated to inventory + // + // If all of these are true, the "zone update" portion of the planner + // has completed, aside from Nexus, and we're ready for old Nexuses + // to start quiescing. + // + // Blueprint planning and execution will be able to continue past this + // point, for the purposes of restoring redundancy, expunging sleds, + // etc. However, making this committment will also halt the creation of + // new sagas temporarily, as handoff from old to new Nexuses occurs. + self.blueprint.set_nexus_generation( + self.blueprint.nexus_generation(), + proposed_generation, + )?; + report.set_next_generation(proposed_generation); + + Ok(report) + } + fn do_plan_cockroachdb_settings( &mut self, ) -> PlanningCockroachdbSettingsStepReport { @@ -1781,63 +1977,67 @@ impl<'a> Planner<'a> { zone_kind: ZoneKind, mgs_updates: &PlanningMgsUpdatesStepReport, ) -> Result { - let source_repo = - if self.is_zone_ready_for_update(zone_kind, mgs_updates)? { - self.input.tuf_repo().description() - } else { - self.input.old_repo().description() - }; + let source_repo = if self.is_zone_ready_for_update(mgs_updates)? { + self.input.tuf_repo().description() + } else { + self.input.old_repo().description() + }; source_repo.zone_image_source(zone_kind) } - /// Return `true` iff a zone of the given kind is ready to be updated; - /// i.e., its dependencies have been updated. + /// Return `true` iff a zone is ready to be updated; i.e., its dependencies + /// have been updated. fn is_zone_ready_for_update( &self, - zone_kind: ZoneKind, mgs_updates: &PlanningMgsUpdatesStepReport, ) -> Result { - // We return false regardless of `zone_kind` if there are still + // We return false for all zone kinds if there are still // pending updates for components earlier in the update ordering // than zones: RoT bootloader / RoT / SP / Host OS. if !mgs_updates.is_empty() { return Ok(false); } - match zone_kind { - ZoneKind::Nexus => { - // Nexus can only be updated if all non-Nexus zones have been - // updated, i.e., their image source is an artifact from the new - // repo. - let new_repo = self.input.tuf_repo().description(); - - // If we don't actually have a TUF repo here, we can't do - // updates anyway; any return value is fine. 
- if new_repo.tuf_repo().is_none() { - return Ok(false); + Ok(true) + } + + fn lookup_current_nexus_image(&self) -> Option { + // Get the current Nexus zone ID from the planning input + let current_nexus_zone_id = self.input.current_nexus_zone_id()?; + + // Look up our current Nexus zone in the blueprint to get its image + self.blueprint + .parent_blueprint() + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .find_map(|(_, blueprint_zone)| { + if blueprint_zone.id == current_nexus_zone_id { + Some(blueprint_zone.image_source.clone()) + } else { + None } + }) + } - // Check that all in-service zones (other than Nexus) on all - // sleds have an image source consistent with `new_repo`. - for sled_id in self.blueprint.sled_ids_with_zones() { - for z in self.blueprint.current_sled_zones( - sled_id, - BlueprintZoneDisposition::is_in_service, - ) { - let kind = z.zone_type.kind(); - if kind != ZoneKind::Nexus - && z.image_source - != new_repo.zone_image_source(kind)? - { - return Ok(false); + fn lookup_current_nexus_generation(&self) -> Option { + // Get the current Nexus zone ID from the planning input + let current_nexus_zone_id = self.input.current_nexus_zone_id()?; + + // Look up our current Nexus zone in the blueprint to get its generation + self.blueprint + .parent_blueprint() + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .find_map(|(_, blueprint_zone)| { + if blueprint_zone.id == current_nexus_zone_id { + match &blueprint_zone.zone_type { + BlueprintZoneType::Nexus(nexus_zone) => { + Some(nexus_zone.nexus_generation) } + _ => None, } + } else { + None } - - Ok(true) - } - _ => Ok(true), // other zone kinds have no special dependencies - } + }) } /// Return `true` iff we believe a zone can safely be shut down; e.g., any @@ -2018,6 +2218,47 @@ impl<'a> Planner<'a> { false } } + ZoneKind::Nexus => { + // Get the nexus_generation of the zone being considered for shutdown + let zone_nexus_generation = match &zone.zone_type { + BlueprintZoneType::Nexus(nexus_zone) => { + nexus_zone.nexus_generation + } + _ => unreachable!("zone kind is Nexus but type is not"), + }; + + let Some(current_gen) = self.lookup_current_nexus_generation() + else { + // If we don't know the current Nexus zone ID, or its + // generation, we can't perform the handoff safety check. + report.unsafe_zone( + zone, + Nexus { + zone_generation: zone_nexus_generation, + current_nexus_generation: None, + }, + ); + return false; + }; + + // It's only safe to shut down if handoff has occurred. + // + // That only happens when the current generation of Nexus (the + // one running right now) is greater than the zone we're + // considering expunging. + if current_gen <= zone_nexus_generation { + report.unsafe_zone( + zone, + Nexus { + zone_generation: zone_nexus_generation, + current_nexus_generation: Some(current_gen), + }, + ); + return false; + } + + true + } _ => true, // other zone kinds have no special safety checks } } @@ -5547,8 +5788,8 @@ pub(crate) mod test { /// Ensure that dependent zones (here just Crucible Pantry) are updated /// before Nexus. #[test] - fn test_update_crucible_pantry() { - static TEST_NAME: &str = "update_crucible_pantry"; + fn test_update_crucible_pantry_before_nexus() { + static TEST_NAME: &str = "update_crucible_pantry_before_nexus"; let logctx = test_setup_log(TEST_NAME); let log = logctx.log.clone(); @@ -5655,18 +5896,18 @@ pub(crate) mod test { }; } - // Request another Nexus zone. 
- input_builder.policy_mut().target_nexus_zone_count = - input_builder.policy_mut().target_nexus_zone_count + 1; - let input = input_builder.build(); + // Nexus should deploy new zones, but keep the old ones running. + let expected_new_nexus_zones = + input_builder.policy_mut().target_nexus_zone_count; + example.input = input_builder.build(); - // Check that there is a new nexus zone that does *not* use the new - // artifact (since not all of its dependencies are updated yet). + // Check that there are new nexus zones deployed, though handoff is + // incomplete (since not all of its dependencies are updated yet). update_collection_from_blueprint(&mut example, &blueprint1); let blueprint2 = Planner::new_based_on( log.clone(), &blueprint1, - &input, + &example.input, "test_blueprint3", &example.collection, PlannerRng::from_seed((TEST_NAME, "bp3")), @@ -5676,6 +5917,7 @@ pub(crate) mod test { .expect("can't re-plan for new Nexus zone"); { let summary = blueprint2.diff_since_blueprint(&blueprint1); + let mut modified_sleds = 0; for sled in summary.diff.sleds.modified_values_diff() { assert!(sled.zones.removed.is_empty()); assert_eq!(sled.zones.added.len(), 1); @@ -5684,11 +5926,10 @@ pub(crate) mod test { &added.zone_type, BlueprintZoneType::Nexus(_) )); - assert!(matches!( - &added.image_source, - BlueprintZoneImageSource::InstallDataset - )); + assert_eq!(&added.image_source, &image_source); + modified_sleds += 1; } + assert_eq!(modified_sleds, expected_new_nexus_zones); } // We should now have three sets of expunge/add iterations for the @@ -5700,7 +5941,7 @@ pub(crate) mod test { let blueprint = Planner::new_based_on( log.clone(), &parent, - &input, + &example.input, &blueprint_name, &example.collection, PlannerRng::from_seed((TEST_NAME, &blueprint_name)), @@ -5772,17 +6013,32 @@ pub(crate) mod test { .all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter(|(_, z)| is_old_nexus(z)) .count(), - NEXUS_REDUNDANCY + 1, + NEXUS_REDUNDANCY, + ); + assert_eq!( + blueprint8 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter(|(_, z)| is_up_to_date_nexus(z)) + .count(), + NEXUS_REDUNDANCY, + ); + + // We have to pretend that we're running the "Newer Nexus" to shut down + // the old Nexuses. If we don't do this: it's as if handoff has not + // happened, and the old Nexuses cannot shut down. 
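// A minimal sketch of the handoff check that gates Nexus expungement (the
// reason for the "jump" below), assuming plain ordered generation values:
// an old Nexus zone may only be shut down once the Nexus doing the planning
// runs at a strictly newer generation, i.e., handoff has already happened.
fn nexus_zone_safe_to_shut_down<G: PartialOrd>(
    zone_generation: G,
    current_nexus_generation: Option<G>,
) -> bool {
    match current_nexus_generation {
        // If we cannot determine our own generation, we cannot prove that
        // handoff happened, so be conservative and refuse.
        None => false,
        Some(current) => current > zone_generation,
    }
}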
+ set_current_nexus_to_highest_generation( + &mut example.input, + &blueprint8, ); let mut parent = blueprint8; - for i in 9..=16 { - update_collection_from_blueprint(&mut example, &parent); + for i in 9..=12 { + update_collection_from_blueprint(&mut example, &parent); let blueprint_name = format!("blueprint{i}"); let blueprint = Planner::new_based_on( log.clone(), &parent, - &input, + &example.input, &blueprint_name, &example.collection, PlannerRng::from_seed((TEST_NAME, &blueprint_name)), @@ -5793,41 +6049,72 @@ pub(crate) mod test { { let summary = blueprint.diff_since_blueprint(&parent); + assert!(summary.has_changes(), "No changes at iteration {i}"); for sled in summary.diff.sleds.modified_values_diff() { - if i % 2 == 1 { - assert!(sled.zones.added.is_empty()); - assert!(sled.zones.removed.is_empty()); - } else { - assert!(sled.zones.removed.is_empty()); - assert_eq!(sled.zones.added.len(), 1); - let added = sled.zones.added.values().next().unwrap(); + assert!(sled.zones.added.is_empty()); + assert!(sled.zones.removed.is_empty()); + for modified_zone in sled.zones.modified_values_diff() { + // We're only modifying Nexus zones on the old image assert!(matches!( - &added.zone_type, + *modified_zone.zone_type.before, BlueprintZoneType::Nexus(_) )); - assert_eq!(added.image_source, image_source); + assert_eq!( + *modified_zone.image_source.before, + BlueprintZoneImageSource::InstallDataset + ); + + // If the zone was previously in-service, it gets + // expunged. + if modified_zone.disposition.before.is_in_service() { + assert!( + modified_zone.disposition.after.is_expunged(), + ); + } + + // If the zone was previously expunged and not ready for + // cleanup, it should be marked ready-for-cleanup + if modified_zone.disposition.before.is_expunged() + && !modified_zone + .disposition + .before + .is_ready_for_cleanup() + { + assert!( + modified_zone + .disposition + .after + .is_ready_for_cleanup(), + ); + } } } } - parent = blueprint; } // Everything's up-to-date in Kansas City! - let blueprint16 = parent; + let blueprint12 = parent; assert_eq!( - blueprint16 + blueprint12 .all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter(|(_, z)| is_up_to_date_nexus(z)) .count(), - NEXUS_REDUNDANCY + 1, + NEXUS_REDUNDANCY, + ); + assert_eq!( + blueprint12 + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .filter(|(_, z)| is_old_nexus(z)) + .count(), + 0, ); - update_collection_from_blueprint(&mut example, &blueprint16); + update_collection_from_blueprint(&mut example, &blueprint12); assert_planning_makes_no_changes( &logctx.log, - &blueprint16, - &input, + &blueprint12, + &example.input, &example.collection, TEST_NAME, ); @@ -6992,6 +7279,47 @@ pub(crate) mod test { logctx.cleanup_successful(); } + // Updates the PlanningInput to pretend like we're running + // from whichever Nexus has the highest "nexus_generation" value. 
+ fn set_current_nexus_to_highest_generation( + input: &mut PlanningInput, + blueprint: &Blueprint, + ) { + let mut current_gen = + if let Some(current_nexus_id) = input.current_nexus_zone_id() { + blueprint + .sleds + .values() + .find_map(|sled| { + for zone in &sled.zones { + if zone.id == current_nexus_id { + if let BlueprintZoneType::Nexus(nexus_config) = + &zone.zone_type + { + return Some(nexus_config.nexus_generation); + } + } + } + None + }) + .expect("Cannot find current Nexus zone in blueprint") + } else { + Generation::new() + }; + + for sled_config in blueprint.sleds.values() { + for zone in &sled_config.zones { + if let BlueprintZoneType::Nexus(nexus_config) = &zone.zone_type + { + if nexus_config.nexus_generation > current_gen { + input.set_current_nexus_zone_id(zone.id); + current_gen = nexus_config.nexus_generation; + } + } + } + } + } + /// Ensure that planning to update all zones terminates. #[test] fn test_update_all_zones() { @@ -7049,13 +7377,13 @@ pub(crate) mod test { ), }; input_builder.policy_mut().tuf_repo = tuf_repo; - let input = input_builder.build(); + let mut input = input_builder.build(); /// Expected number of planner iterations required to converge. /// If incidental planner work changes this value occasionally, /// that's fine; but if we find we're changing it all the time, /// we should probably drop it and keep just the maximum below. - const EXP_PLANNING_ITERATIONS: usize = 57; + const EXP_PLANNING_ITERATIONS: usize = 55; /// Planning must not take more than this number of iterations. const MAX_PLANNING_ITERATIONS: usize = 100; @@ -7076,7 +7404,9 @@ pub(crate) mod test { ) .expect("can't create planner") .plan() - .unwrap_or_else(|_| panic!("can't re-plan after {i} iterations")); + .unwrap_or_else(|err| { + panic!("can't re-plan after {i} iterations: {err}") + }); assert_eq!(blueprint.report.blueprint_id, blueprint.id); eprintln!("{}\n", blueprint.report); @@ -7108,9 +7438,422 @@ pub(crate) mod test { } } + // If there is a newer Nexus, we must jump to it to expunge + // the older Nexus zones. 
+ set_current_nexus_to_highest_generation(&mut input, &blueprint); + parent = blueprint; } panic!("did not converge after {MAX_PLANNING_ITERATIONS} iterations"); } + + struct BlueprintGenerator { + log: Logger, + example: ExampleSystem, + blueprint: Blueprint, + rng: SimRngState, + target_release_generation: Generation, + } + + impl BlueprintGenerator { + fn new( + log: Logger, + example: ExampleSystem, + blueprint: Blueprint, + rng: SimRngState, + ) -> Self { + Self { + log, + example, + blueprint, + rng, + target_release_generation: Generation::new(), + } + } + + fn create_image_at_version( + version: &ArtifactVersion, + ) -> BlueprintZoneImageSource { + let fake_hash = ArtifactHash([0; 32]); + BlueprintZoneImageSource::Artifact { + version: BlueprintArtifactVersion::Available { + version: version.clone(), + }, + hash: fake_hash, + } + } + + // - Bumps the target_release_generation + // - Sets a new "tuf_repo" as part of the "example.input" + // - The system version is hard-coded as "2.0.0" + // - Sets artifacts in the repo to `artifacts` + fn set_new_tuf_repo_with_artifacts( + &mut self, + artifacts: Vec, + ) { + let mut input_builder = self.example.input.clone().into_builder(); + let fake_hash = ArtifactHash([0; 32]); + self.target_release_generation = + self.target_release_generation.next(); + + let tuf_repo = TufRepoPolicy { + target_release_generation: self.target_release_generation, + description: TargetReleaseDescription::TufRepo( + TufRepoDescription { + repo: TufRepoMeta { + hash: fake_hash, + targets_role_version: 0, + valid_until: Utc::now(), + system_version: Version::new(2, 0, 0), + file_name: String::from(""), + }, + artifacts, + }, + ), + }; + + input_builder.policy_mut().tuf_repo = tuf_repo; + self.example.input = input_builder.build(); + } + + fn set_old_tuf_repo_to_target(&mut self) { + let mut input_builder = self.example.input.clone().into_builder(); + input_builder.policy_mut().old_repo = + self.example.input.tuf_repo().clone(); + self.example.input = input_builder.build(); + } + + // Plans a new blueprint, validates it, and returns it + // + // Does not set the current blueprint to this new value + #[track_caller] + fn plan_new_blueprint(&mut self, name: &str) -> Blueprint { + let planner = Planner::new_based_on( + self.log.clone(), + &self.blueprint, + &self.example.input, + name, + &self.example.collection, + self.rng.next_planner_rng(), + ) + .expect("can't create planner"); + let bp = planner.plan().expect("planning succeeded"); + verify_blueprint(&bp); + bp + } + + // Asserts that a new blueprint, if generated, will make no changes + #[track_caller] + fn assert_child_bp_makes_no_changes( + &self, + child_blueprint: &Blueprint, + ) { + verify_blueprint(&child_blueprint); + let summary = child_blueprint.diff_since_blueprint(&self.blueprint); + assert_eq!( + summary.diff.sleds.added.len(), + 0, + "{}", + summary.display() + ); + assert_eq!( + summary.diff.sleds.removed.len(), + 0, + "{}", + summary.display() + ); + assert_eq!( + summary.diff.sleds.modified().count(), + 0, + "{}", + summary.display() + ); + } + + // Asserts that a new blueprint, if generated, will have no report. + // + // This function explicitly ignores the "noop_image_source" report. + // + // NOTE: More reports can be added, but we aren't using + // "PlanningReport::is_empty()", because some checks (e.g. + // noop_image_source) are almost always non-empty. 
+ #[track_caller] + fn assert_child_bp_has_no_report(&self, child_blueprint: &Blueprint) { + verify_blueprint(&child_blueprint); + let summary = child_blueprint.diff_since_blueprint(&self.blueprint); + + assert!( + child_blueprint.report.expunge.is_empty() + && child_blueprint.report.decommission.is_empty() + && child_blueprint.report.mgs_updates.is_empty() + && child_blueprint.report.add.is_empty() + && child_blueprint.report.zone_updates.is_empty() + && child_blueprint.report.nexus_generation_bump.is_empty() + && child_blueprint.report.cockroachdb_settings.is_empty(), + "Blueprint Summary: {}\n + Planning report is not empty: {}", + summary.display(), + child_blueprint.report, + ); + } + + // Updates the input inventory to reflect changes from the blueprint + fn update_inventory_from_blueprint(&mut self) { + update_collection_from_blueprint( + &mut self.example, + &self.blueprint, + ); + } + + // Use the "highest generation Nexus". + // + // This effectively changes "which Nexus is trying to perform planning". + fn set_current_nexus_to_highest_generation(&mut self) { + set_current_nexus_to_highest_generation( + &mut self.example.input, + &self.blueprint, + ); + } + } + + #[test] + fn test_nexus_generation_update() { + static TEST_NAME: &str = "test_nexus_generation_update"; + let logctx = test_setup_log(TEST_NAME); + + // Use our example system with multiple Nexus zones + let mut rng = SimRngState::from_seed(TEST_NAME); + let (example, blueprint) = ExampleSystemBuilder::new_with_rng( + &logctx.log, + rng.next_system_rng(), + ) + .nexus_count(3) // Ensure we have multiple Nexus zones + .build(); + verify_blueprint(&blueprint); + + let mut bp_generator = BlueprintGenerator::new( + logctx.log.clone(), + example, + blueprint, + rng, + ); + + // We shouldn't try to bump the generation number without a new TUF + // repo. 
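// A condensed sketch of the generation-bump gate exercised by the test
// below, assuming its preconditions have already been reduced to plain
// values; the real planner derives them from the blueprint, the planning
// input, and inventory.
enum BumpDecision {
    WaitingOnNonNexusZoneUpdate,
    WaitingOnNewNexusBringup,
    WaitingOnZonePropagation,
    NothingToDo,
    Bump,
}

fn nexus_generation_bump_decision(
    have_new_tuf_repo: bool,
    all_non_nexus_zones_updated: bool,
    out_of_date_nexuses_at_current_gen: usize,
    nexuses_at_next_gen: usize,
    target_nexus_count: usize,
    all_blueprint_zones_in_inventory: bool,
) -> BumpDecision {
    if !have_new_tuf_repo {
        // Without a new repo there is nothing to hand off to.
        return BumpDecision::NothingToDo;
    }
    if !all_non_nexus_zones_updated {
        return BumpDecision::WaitingOnNonNexusZoneUpdate;
    }
    if out_of_date_nexuses_at_current_gen == 0 {
        // Handoff may already have completed; nothing left to do.
        return BumpDecision::NothingToDo;
    }
    if nexuses_at_next_gen < target_nexus_count {
        return BumpDecision::WaitingOnNewNexusBringup;
    }
    if !all_blueprint_zones_in_inventory {
        return BumpDecision::WaitingOnZonePropagation;
    }
    BumpDecision::Bump
}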
+ let new_bp = bp_generator.plan_new_blueprint("no-op"); + bp_generator.assert_child_bp_makes_no_changes(&new_bp); + bp_generator.assert_child_bp_has_no_report(&new_bp); + + // Initially, all zones should be sourced from the install dataset + assert!( + bp_generator + .blueprint + .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .all(|(_, z)| matches!( + z.image_source, + BlueprintZoneImageSource::InstallDataset + )) + ); + + // Set up a TUF repo with new artifacts + let artifact_version = + ArtifactVersion::new_static("2.0.0-nexus-gen-test") + .expect("can't parse artifact version"); + bp_generator.set_new_tuf_repo_with_artifacts( + create_artifacts_at_version(&artifact_version), + ); + let image_source = + BlueprintGenerator::create_image_at_version(&artifact_version); + + // Check: Initially, nexus generation update should be blocked because + // non-Nexus zones haven't been updated yet + { + let new_bp = + bp_generator.plan_new_blueprint("test_blocked_by_non_nexus"); + // The blueprint should have a report showing what's blocked + assert!(new_bp.report.nexus_generation_bump.waiting_on.is_some()); + assert!( + matches!( + new_bp.report.nexus_generation_bump.waiting_on, + Some(NexusGenerationBumpWaitingOn::NonNexusZoneUpdate) + ), + "Unexpected Nexus Generation report: {:?}", + new_bp.report.nexus_generation_bump + ); + } + + // Manually update all non-Nexus zones to the new image source + for sled_config in bp_generator.blueprint.sleds.values_mut() { + for mut zone in &mut sled_config.zones { + if zone.zone_type.kind() != ZoneKind::Nexus { + zone.image_source = image_source.clone(); + } + } + } + bp_generator.update_inventory_from_blueprint(); + + // Check: Now nexus generation update should be blocked by lack of new Nexus zones + let old_generation = bp_generator.blueprint.nexus_generation; + let new_bp = + bp_generator.plan_new_blueprint("test_blocked_by_new_nexus"); + { + assert_eq!(new_bp.nexus_generation, old_generation); + + let summary = new_bp.diff_since_blueprint(&bp_generator.blueprint); + assert_eq!( + summary.total_zones_added(), + bp_generator.example.input.target_nexus_zone_count() + ); + assert_eq!(summary.total_zones_removed(), 0); + assert_eq!(summary.total_zones_modified(), 0); + + // Should be blocked by new Nexus bringup + assert!( + matches!( + new_bp.report.nexus_generation_bump.waiting_on, + Some(NexusGenerationBumpWaitingOn::ZonePropagation) + ), + "Unexpected Nexus Generation report: {:?}", + new_bp.report.nexus_generation_bump + ); + } + + // Check: If we try generating a new blueprint, we're still stuck behind + // propagation to inventory. + // + // We'll refuse to bump the top-level generation number (which would + // begin quiescing old Nexuses) until we've seen that the new nexus + // zones are up. 
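// A small sketch of the propagation check the planner is waiting on here,
// assuming zone ids and image names have been flattened into maps: a zone
// counts as "still updating" if inventory is missing it or reports a
// different image than the blueprint expects.
use std::collections::BTreeMap;

fn zones_not_yet_propagated(
    blueprint_zones: &BTreeMap<u64, String>,
    inventory_zones: &BTreeMap<u64, String>,
) -> Vec<u64> {
    let mut updating = Vec::new();
    for (zone_id, bp_image) in blueprint_zones {
        if inventory_zones.get(zone_id) != Some(bp_image) {
            updating.push(*zone_id);
        }
    }
    updating
}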
+ bp_generator.blueprint = new_bp; + { + let new_bp = + bp_generator.plan_new_blueprint("wait_for_propagation"); + assert_eq!(new_bp.nexus_generation, old_generation); + + let summary = new_bp.diff_since_blueprint(&bp_generator.blueprint); + assert_eq!(summary.total_zones_added(), 0); + assert_eq!(summary.total_zones_removed(), 0); + assert_eq!(summary.total_zones_modified(), 0); + assert!( + matches!( + new_bp.report.nexus_generation_bump.waiting_on, + Some(NexusGenerationBumpWaitingOn::ZonePropagation) + ), + "Unexpected Nexus Generation report: {:?}", + new_bp.report.nexus_generation_bump + ); + } + + // Make the new Nexus zones appear in inventory + bp_generator.update_inventory_from_blueprint(); + + // Check: Now nexus generation update should succeed + let new_bp = bp_generator.plan_new_blueprint("update_generation"); + // Finally, the top-level Nexus generation should get bumped. + assert_eq!(new_bp.nexus_generation, old_generation.next()); + bp_generator.blueprint = new_bp; + + // Check: After the generation bump, further planning should make no changes + bp_generator.update_inventory_from_blueprint(); + let new_bp = bp_generator.plan_new_blueprint("no-op"); + bp_generator.assert_child_bp_makes_no_changes(&new_bp); + + // However, there will be a report of "three Nexus zones that aren't + // ready to shut down". The blueprint generator still thinks it's + // running from one of these "old Nexuses". + let unsafe_to_shutdown_zones = &new_bp.report.zone_updates.unsafe_zones; + assert_eq!( + unsafe_to_shutdown_zones.len(), + bp_generator.example.input.target_nexus_zone_count() + ); + for why in unsafe_to_shutdown_zones.values() { + use nexus_types::deployment::ZoneUnsafeToShutdown; + match why { + ZoneUnsafeToShutdown::Nexus { + zone_generation, + current_nexus_generation, + } => { + assert_eq!(zone_generation, &Generation::new()); + assert_eq!( + current_nexus_generation, + &Some(Generation::new()) + ); + } + _ => panic!("Unexpected unsafe-to-shutdown zone: {why}"), + } + } + assert_eq!( + unsafe_to_shutdown_zones.len(), + bp_generator.example.input.target_nexus_zone_count() + ); + + // Move ourselves to a "new Nexus". Now observe: we expunge the old + // Nexus zones. + bp_generator.set_current_nexus_to_highest_generation(); + + // Old Nexuses which are in-service + let mut old_nexuses = + bp_generator.example.input.target_nexus_zone_count(); + // Old Nexuses which were expunged, but which still need propagation + let mut expunging_nexuses = 0; + + while old_nexuses > 0 || expunging_nexuses > 0 { + let new_bp = bp_generator.plan_new_blueprint("removal"); + + // We expect to expunge one old nexus at a time, if any exist, and + // also to finalize the expungement of old nexuses that were removed + // in prior iterations. 
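// Worked example of the count computed below, assuming three in-service
// old Nexus zones to start: the expected modified-zone counts per planning
// pass are 1 (first expunge), 2 (cleanup + expunge), 2 (cleanup + expunge),
// and finally 1 (cleanup only), after which the loop exits.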
+ let expected_modified_nexuses = + expunging_nexuses + if old_nexuses > 0 { 1 } else { 0 }; + + { + let summary = + new_bp.diff_since_blueprint(&bp_generator.blueprint); + assert_eq!( + summary.total_zones_added(), + 0, + "{}", + summary.display() + ); + assert_eq!( + summary.total_zones_removed(), + 0, + "{}", + summary.display() + ); + assert_eq!( + summary.total_zones_modified(), + expected_modified_nexuses, + "{}", + summary.display() + ); + } + if old_nexuses > 0 { + old_nexuses -= 1; + expunging_nexuses = 1; + } else { + expunging_nexuses = 0; + } + + bp_generator.blueprint = new_bp; + bp_generator.update_inventory_from_blueprint(); + } + + let new_bp = bp_generator.plan_new_blueprint("no-op"); + bp_generator.assert_child_bp_makes_no_changes(&new_bp); + bp_generator.assert_child_bp_has_no_report(&new_bp); + + // Check: If the "old TUF repo = new TUF repo", we'll still make no changes + bp_generator.set_old_tuf_repo_to_target(); + let new_bp = bp_generator.plan_new_blueprint("repo-update"); + bp_generator.assert_child_bp_makes_no_changes(&new_bp); + bp_generator.assert_child_bp_has_no_report(&new_bp); + + // After all this, the Nexus generation number has still been updated + // exactly once. + assert_eq!(new_bp.nexus_generation, old_generation.next()); + + logctx.cleanup_successful(); + } } diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index c26801e5a35..7f751f628f3 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -1019,6 +1019,7 @@ impl SystemDescription { self.internal_dns_version, self.external_dns_version, CockroachDbSettings::empty(), + None, ); builder.set_ignore_impossible_mgs_updates_since( self.ignore_impossible_mgs_updates_since, diff --git a/nexus/reconfigurator/planning/tests/output/example_builder_zone_counts_blueprint.txt b/nexus/reconfigurator/planning/tests/output/example_builder_zone_counts_blueprint.txt index 15d2c2d6a77..6125703611f 100644 --- a/nexus/reconfigurator/planning/tests/output/example_builder_zone_counts_blueprint.txt +++ b/nexus/reconfigurator/planning/tests/output/example_builder_zone_counts_blueprint.txt @@ -531,6 +531,7 @@ parent: e35b2fdd-354d-48d9-acb5-703b2c269a54 internal DNS version::: 1 external DNS version::: 1 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 0 diff --git a/nexus/reconfigurator/planning/tests/output/planner_basic_add_sled_2_3.txt b/nexus/reconfigurator/planning/tests/output/planner_basic_add_sled_2_3.txt index 99e950f246e..98982124114 100644 --- a/nexus/reconfigurator/planning/tests/output/planner_basic_add_sled_2_3.txt +++ b/nexus/reconfigurator/planning/tests/output/planner_basic_add_sled_2_3.txt @@ -71,6 +71,7 @@ to: blueprint 4171ad05-89dd-474b-846b-b007e4346366 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) diff --git a/nexus/reconfigurator/planning/tests/output/planner_basic_add_sled_3_5.txt b/nexus/reconfigurator/planning/tests/output/planner_basic_add_sled_3_5.txt index 41df1375187..b724ec830ab 100644 --- a/nexus/reconfigurator/planning/tests/output/planner_basic_add_sled_3_5.txt +++ b/nexus/reconfigurator/planning/tests/output/planner_basic_add_sled_3_5.txt @@ -101,6 +101,7 @@ to: blueprint f432fcd5-1284-4058-8b4a-9286a3de6163 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release 
min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) diff --git a/nexus/reconfigurator/planning/tests/output/planner_dataset_settings_modified_in_place_1_2.txt b/nexus/reconfigurator/planning/tests/output/planner_dataset_settings_modified_in_place_1_2.txt index 0e936766516..b619e3f7ddf 100644 --- a/nexus/reconfigurator/planning/tests/output/planner_dataset_settings_modified_in_place_1_2.txt +++ b/nexus/reconfigurator/planning/tests/output/planner_dataset_settings_modified_in_place_1_2.txt @@ -122,6 +122,7 @@ to: blueprint fe13be30-94c2-4fa6-aad5-ae3c5028f6bb internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) diff --git a/nexus/reconfigurator/planning/tests/output/planner_decommissions_sleds_1_2.txt b/nexus/reconfigurator/planning/tests/output/planner_decommissions_sleds_1_2.txt index 488e3f69d00..e79bf2daf74 100644 --- a/nexus/reconfigurator/planning/tests/output/planner_decommissions_sleds_1_2.txt +++ b/nexus/reconfigurator/planning/tests/output/planner_decommissions_sleds_1_2.txt @@ -387,6 +387,7 @@ to: blueprint 1ac2d88f-27dd-4506-8585-6b2be832528e internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) diff --git a/nexus/reconfigurator/planning/tests/output/planner_decommissions_sleds_bp2.txt b/nexus/reconfigurator/planning/tests/output/planner_decommissions_sleds_bp2.txt index 93d346b3170..b96700bc0ba 100644 --- a/nexus/reconfigurator/planning/tests/output/planner_decommissions_sleds_bp2.txt +++ b/nexus/reconfigurator/planning/tests/output/planner_decommissions_sleds_bp2.txt @@ -322,6 +322,7 @@ parent: 516e80a3-b362-4fac-bd3c-4559717120dd internal DNS version::: 1 external DNS version::: 1 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 0 @@ -330,7 +331,9 @@ chicken switches: add zones with mupdate override: false * discretionary zones placed: - * 2 zones on sled d67ce8f0-a691-4010-b414-420d82e80527: crucible_pantry, nexus - * 2 zones on sled fefcf4cf-f7e7-46b3-b629-058526ce440e: clickhouse, internal_dns + * crucible_pantry zone on sled d67ce8f0-a691-4010-b414-420d82e80527 from source install dataset + * nexus zone on sled d67ce8f0-a691-4010-b414-420d82e80527 from source install dataset + * clickhouse zone on sled fefcf4cf-f7e7-46b3-b629-058526ce440e from source install dataset + * internal_dns zone on sled fefcf4cf-f7e7-46b3-b629-058526ce440e from source install dataset * zone updates waiting on discretionary zones diff --git a/nexus/reconfigurator/planning/tests/output/planner_deploy_all_keeper_nodes_1_2.txt b/nexus/reconfigurator/planning/tests/output/planner_deploy_all_keeper_nodes_1_2.txt index 63380e2c1eb..1be47958ab0 100644 --- a/nexus/reconfigurator/planning/tests/output/planner_deploy_all_keeper_nodes_1_2.txt +++ b/nexus/reconfigurator/planning/tests/output/planner_deploy_all_keeper_nodes_1_2.txt @@ -320,6 +320,7 @@ to: blueprint 31ef2071-2ec9-49d9-8827-fd83b17a0e3d internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) diff --git a/nexus/reconfigurator/planning/tests/output/planner_deploy_all_keeper_nodes_3_4.txt 
b/nexus/reconfigurator/planning/tests/output/planner_deploy_all_keeper_nodes_3_4.txt index b2d2dee5588..af0649e7506 100644 --- a/nexus/reconfigurator/planning/tests/output/planner_deploy_all_keeper_nodes_3_4.txt +++ b/nexus/reconfigurator/planning/tests/output/planner_deploy_all_keeper_nodes_3_4.txt @@ -9,6 +9,7 @@ to: blueprint 92fa943c-7dd4-48c3-9447-c9d0665744b6 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) diff --git a/nexus/reconfigurator/planning/tests/output/planner_deploy_all_keeper_nodes_4_5.txt b/nexus/reconfigurator/planning/tests/output/planner_deploy_all_keeper_nodes_4_5.txt index ea64f823b0a..0dc39530072 100644 --- a/nexus/reconfigurator/planning/tests/output/planner_deploy_all_keeper_nodes_4_5.txt +++ b/nexus/reconfigurator/planning/tests/output/planner_deploy_all_keeper_nodes_4_5.txt @@ -223,6 +223,7 @@ to: blueprint 2886dab5-61a2-46b4-87af-bc7aeb44cccb internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) diff --git a/nexus/reconfigurator/planning/tests/output/planner_deploy_all_keeper_nodes_5_6.txt b/nexus/reconfigurator/planning/tests/output/planner_deploy_all_keeper_nodes_5_6.txt index 6b4c5f48e30..e33a4d4a9f0 100644 --- a/nexus/reconfigurator/planning/tests/output/planner_deploy_all_keeper_nodes_5_6.txt +++ b/nexus/reconfigurator/planning/tests/output/planner_deploy_all_keeper_nodes_5_6.txt @@ -9,6 +9,7 @@ to: blueprint cb39be9d-5476-44fa-9edf-9938376219ef internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) diff --git a/nexus/reconfigurator/planning/tests/output/planner_expunge_clickhouse_clusters_3_4.txt b/nexus/reconfigurator/planning/tests/output/planner_expunge_clickhouse_clusters_3_4.txt index 9394b253cc6..64f86ecfe65 100644 --- a/nexus/reconfigurator/planning/tests/output/planner_expunge_clickhouse_clusters_3_4.txt +++ b/nexus/reconfigurator/planning/tests/output/planner_expunge_clickhouse_clusters_3_4.txt @@ -408,6 +408,7 @@ to: blueprint 74f2e7fd-687e-4c9e-b5d8-e474a5bb8e7c internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) diff --git a/nexus/reconfigurator/planning/tests/output/planner_expunge_clickhouse_clusters_5_6.txt b/nexus/reconfigurator/planning/tests/output/planner_expunge_clickhouse_clusters_5_6.txt index 744379716fc..f5a11fe0dbc 100644 --- a/nexus/reconfigurator/planning/tests/output/planner_expunge_clickhouse_clusters_5_6.txt +++ b/nexus/reconfigurator/planning/tests/output/planner_expunge_clickhouse_clusters_5_6.txt @@ -9,6 +9,7 @@ to: blueprint df68d4d4-5af4-4b56-95bb-1654a6957d4f internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) diff --git a/nexus/reconfigurator/planning/tests/output/planner_expunge_clickhouse_zones_after_policy_is_changed_3_4.txt b/nexus/reconfigurator/planning/tests/output/planner_expunge_clickhouse_zones_after_policy_is_changed_3_4.txt index 5e439554691..fd72c3cda5c 100644 --- 
a/nexus/reconfigurator/planning/tests/output/planner_expunge_clickhouse_zones_after_policy_is_changed_3_4.txt +++ b/nexus/reconfigurator/planning/tests/output/planner_expunge_clickhouse_zones_after_policy_is_changed_3_4.txt @@ -338,6 +338,7 @@ to: blueprint d895ef50-9978-454c-bdfb-b8dbe2c9a918 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) diff --git a/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_1_2.txt b/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_1_2.txt index 99ccd504aaf..13f6efd866f 100644 --- a/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_1_2.txt +++ b/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_1_2.txt @@ -373,6 +373,7 @@ to: blueprint 9f71f5d3-a272-4382-9154-6ea2e171a6c6 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) diff --git a/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_2_2a.txt b/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_2_2a.txt index 8bd822a364c..162e4c1ad69 100644 --- a/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_2_2a.txt +++ b/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_2_2a.txt @@ -349,6 +349,9 @@ mismatched zone type: after: Nexus( }, external_tls: false, external_dns_servers: [], + nexus_generation: Generation( + 1, + ), }, ) @@ -368,6 +371,7 @@ mismatched zone type: after: InternalNtp( internal DNS version::: 1 (unchanged) * external DNS version::: 1 -> 2 target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) diff --git a/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_bp2.txt b/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_bp2.txt index 77c19780bed..3ccbc731707 100644 --- a/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_bp2.txt +++ b/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_bp2.txt @@ -510,6 +510,7 @@ parent: 4d4e6c38-cd95-4c4e-8f45-6af4d686964b internal DNS version::: 1 external DNS version::: 1 target release min gen: 1 + nexus gen:::::::::::::: 1 PENDING MGS-MANAGED UPDATES: 0 @@ -518,7 +519,11 @@ chicken switches: add zones with mupdate override: false * discretionary zones placed: - * 3 zones on sled 75bc286f-2b4b-482c-9431-59272af529da: nexus, nexus, nexus - * 3 zones on sled affab35f-600a-4109-8ea0-34a067a4e0bc: nexus, nexus, nexus + * nexus zone on sled 75bc286f-2b4b-482c-9431-59272af529da from source install dataset + * nexus zone on sled 75bc286f-2b4b-482c-9431-59272af529da from source install dataset + * nexus zone on sled 75bc286f-2b4b-482c-9431-59272af529da from source install dataset + * nexus zone on sled affab35f-600a-4109-8ea0-34a067a4e0bc from source install dataset + * nexus zone on sled affab35f-600a-4109-8ea0-34a067a4e0bc from source install dataset + * nexus zone on sled affab35f-600a-4109-8ea0-34a067a4e0bc from source install dataset * zone updates waiting on discretionary zones diff --git a/nexus/reconfigurator/planning/tests/output/zone_image_source_change_1.txt b/nexus/reconfigurator/planning/tests/output/zone_image_source_change_1.txt index 440e7e28e51..1f229c2ec10 100644 --- 
a/nexus/reconfigurator/planning/tests/output/zone_image_source_change_1.txt +++ b/nexus/reconfigurator/planning/tests/output/zone_image_source_change_1.txt @@ -122,6 +122,7 @@ to: blueprint 1481141d-a5cf-4103-8344-738967e0f110 internal DNS version::: 1 (unchanged) external DNS version::: 1 (unchanged) target release min gen: 1 (unchanged) + nexus gen:::::::::::::: 1 (unchanged) OXIMETER SETTINGS: generation: 1 (unchanged) diff --git a/nexus/reconfigurator/preparation/src/lib.rs b/nexus/reconfigurator/preparation/src/lib.rs index 49751223fc1..fb26e2657aa 100644 --- a/nexus/reconfigurator/preparation/src/lib.rs +++ b/nexus/reconfigurator/preparation/src/lib.rs @@ -94,6 +94,7 @@ impl PlanningInputFromDb<'_> { opctx: &OpContext, datastore: &DataStore, chicken_switches: PlannerChickenSwitches, + current_nexus_zone_id: Option, ) -> Result { opctx.check_complex_operations_allowed()?; // Note we list *all* rows here including the ones for decommissioned @@ -234,13 +235,16 @@ impl PlanningInputFromDb<'_> { old_repo, chicken_switches, } - .build() + .build(current_nexus_zone_id) .internal_context("assembling planning_input")?; Ok(planning_input) } - pub fn build(&self) -> Result { + pub fn build( + &self, + current_nexus_zone_id: Option, + ) -> Result { let service_ip_pool_ranges = self.ip_pool_range_rows.iter().map(IpRange::from).collect(); let policy = Policy { @@ -265,6 +269,7 @@ impl PlanningInputFromDb<'_> { self.internal_dns_version.into(), self.external_dns_version.into(), self.cockroachdb_settings.clone(), + current_nexus_zone_id, ); let mut zpools_by_sled_id = { @@ -373,6 +378,7 @@ impl PlanningInputFromDb<'_> { pub async fn reconfigurator_state_load( opctx: &OpContext, datastore: &DataStore, + current_nexus_zone_id: Option, ) -> Result { opctx.check_complex_operations_allowed()?; let chicken_switches = datastore @@ -381,9 +387,13 @@ pub async fn reconfigurator_state_load( .map_or_else(PlannerChickenSwitches::default, |switches| { switches.switches.planner_switches }); - let planning_input = - PlanningInputFromDb::assemble(opctx, datastore, chicken_switches) - .await?; + let planning_input = PlanningInputFromDb::assemble( + opctx, + datastore, + chicken_switches, + current_nexus_zone_id, + ) + .await?; let collection_ids = datastore .inventory_collections() .await diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 51b6d1d8658..78f1ef56617 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -499,6 +499,7 @@ impl BackgroundTasksInitializer { // target blueprint. 
let blueprint_planner = blueprint_planner::BlueprintPlanner::new( datastore.clone(), + args.nexus_id, chicken_switches_watcher.clone(), inventory_watcher.clone(), rx_blueprint.clone(), diff --git a/nexus/src/app/background/tasks/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs index 2ac61471e7c..27315202719 100644 --- a/nexus/src/app/background/tasks/blueprint_execution.rs +++ b/nexus/src/app/background/tasks/blueprint_execution.rs @@ -278,6 +278,7 @@ mod test { internal_dns_version: dns_version, external_dns_version: dns_version, target_release_minimum_generation: Generation::new(), + nexus_generation: Generation::new(), cockroachdb_fingerprint: String::new(), clickhouse_cluster_config: None, oximeter_read_version: Generation::new(), diff --git a/nexus/src/app/background/tasks/blueprint_load.rs b/nexus/src/app/background/tasks/blueprint_load.rs index d2d9c7c380e..7b7f546388d 100644 --- a/nexus/src/app/background/tasks/blueprint_load.rs +++ b/nexus/src/app/background/tasks/blueprint_load.rs @@ -225,6 +225,7 @@ mod test { internal_dns_version: Generation::new(), external_dns_version: Generation::new(), target_release_minimum_generation: Generation::new(), + nexus_generation: Generation::new(), cockroachdb_fingerprint: String::new(), clickhouse_cluster_config: None, oximeter_read_version: Generation::new(), diff --git a/nexus/src/app/background/tasks/blueprint_planner.rs b/nexus/src/app/background/tasks/blueprint_planner.rs index d6519fdad3b..9ae27a227fe 100644 --- a/nexus/src/app/background/tasks/blueprint_planner.rs +++ b/nexus/src/app/background/tasks/blueprint_planner.rs @@ -19,6 +19,7 @@ use nexus_types::internal_api::background::BlueprintPlannerStatus; use omicron_common::api::external::LookupType; use omicron_uuid_kinds::CollectionUuid; use omicron_uuid_kinds::GenericUuid as _; +use omicron_uuid_kinds::OmicronZoneUuid; use serde_json::json; use std::sync::Arc; use tokio::sync::watch::{self, Receiver, Sender}; @@ -26,6 +27,7 @@ use tokio::sync::watch::{self, Receiver, Sender}; /// Background task that runs the update planner. pub struct BlueprintPlanner { datastore: Arc, + nexus_id: OmicronZoneUuid, rx_chicken_switches: Receiver, rx_inventory: Receiver>, rx_blueprint: Receiver>>, @@ -35,6 +37,7 @@ pub struct BlueprintPlanner { impl BlueprintPlanner { pub fn new( datastore: Arc, + nexus_id: OmicronZoneUuid, rx_chicken_switches: Receiver, rx_inventory: Receiver>, rx_blueprint: Receiver>>, @@ -42,6 +45,7 @@ impl BlueprintPlanner { let (tx_blueprint, _) = watch::channel(None); Self { datastore, + nexus_id, rx_chicken_switches, rx_inventory, rx_blueprint, @@ -118,6 +122,7 @@ impl BlueprintPlanner { opctx, &self.datastore, switches.switches.planner_switches, + Some(self.nexus_id), ) .await { @@ -341,6 +346,7 @@ mod test { // Finally, spin up the planner background task. 
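        // The planner is constructed with the ID of the Nexus zone it runs in
        // (here the test's own `nexus.id`) so that the planning input it
        // assembles can record which Nexus instance is executing the plan.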
let mut planner = BlueprintPlanner::new( datastore.clone(), + nexus.id, chicken_switches_collector_rx, rx_collector, rx_loader.clone(), diff --git a/nexus/src/app/deployment.rs b/nexus/src/app/deployment.rs index 43f6f558a4b..0956e07ba60 100644 --- a/nexus/src/app/deployment.rs +++ b/nexus/src/app/deployment.rs @@ -139,9 +139,13 @@ impl super::Nexus { switches.switches.planner_switches }); - let planning_input = - PlanningInputFromDb::assemble(opctx, datastore, chicken_switches) - .await?; + let planning_input = PlanningInputFromDb::assemble( + opctx, + datastore, + chicken_switches, + Some(self.id), + ) + .await?; // The choice of which inventory collection to use here is not // necessarily trivial. Inventory collections may be incomplete due to diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index e36238be78e..e61b2815362 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -882,6 +882,7 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { vni: Vni::SERVICES_VNI, transit_ips: vec![], }, + nexus_generation: Generation::new(), }), image_source: BlueprintZoneImageSource::InstallDataset, }); @@ -967,6 +968,7 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { internal_dns_version: dns_config.generation, external_dns_version: Generation::new(), target_release_minimum_generation: Generation::new(), + nexus_generation: Generation::new(), cockroachdb_fingerprint: String::new(), cockroachdb_setting_preserve_downgrade: CockroachDbPreserveDowngrade::DoNotModify, diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index a7080e3e877..23094feb17b 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -124,12 +124,14 @@ pub use planning_input::TufRepoContentsError; pub use planning_input::TufRepoPolicy; pub use planning_input::ZpoolFilter; pub use planning_report::CockroachdbUnsafeToShutdown; +pub use planning_report::NexusGenerationBumpWaitingOn; pub use planning_report::PlanningAddStepReport; pub use planning_report::PlanningCockroachdbSettingsStepReport; pub use planning_report::PlanningDecommissionStepReport; pub use planning_report::PlanningExpungeStepReport; pub use planning_report::PlanningMgsUpdatesStepReport; pub use planning_report::PlanningMupdateOverrideStepReport; +pub use planning_report::PlanningNexusGenerationBumpReport; pub use planning_report::PlanningNoopImageSourceSkipSledReason; pub use planning_report::PlanningNoopImageSourceSkipZoneReason; pub use planning_report::PlanningNoopImageSourceStepReport; @@ -226,6 +228,12 @@ pub struct Blueprint { /// driving the system to the target release. pub target_release_minimum_generation: Generation, + /// The generation of the active group of Nexuses + /// + /// If a Nexus instance notices it has a nexus_generation less than + /// this value, it will start to quiesce (see: RFD 588). + pub nexus_generation: Generation, + /// CockroachDB state fingerprint when this blueprint was created // See `nexus/db-queries/src/db/datastore/cockroachdb_settings.rs` for more // on this. 
@@ -274,6 +282,7 @@ impl Blueprint { external_dns_version: self.external_dns_version, target_release_minimum_generation: self .target_release_minimum_generation, + nexus_generation: self.nexus_generation, cockroachdb_fingerprint: self.cockroachdb_fingerprint.clone(), cockroachdb_setting_preserve_downgrade: Some( self.cockroachdb_setting_preserve_downgrade, @@ -608,6 +617,7 @@ impl BlueprintDisplay<'_> { .target_release_minimum_generation .to_string(), ), + (NEXUS_GENERATION, self.blueprint.nexus_generation.to_string()), ], ) } @@ -650,6 +660,7 @@ impl fmt::Display for BlueprintDisplay<'_> { // These six fields are handled by `make_metadata_table()`, called // below. target_release_minimum_generation: _, + nexus_generation: _, internal_dns_version: _, external_dns_version: _, time_created: _, @@ -2072,6 +2083,10 @@ pub struct BlueprintMetadata { /// /// See [`Blueprint::target_release_minimum_generation`]. pub target_release_minimum_generation: Generation, + /// The Nexus generation number + /// + /// See [`Blueprint::nexus_generation`]. + pub nexus_generation: Generation, /// CockroachDB state fingerprint when this blueprint was created pub cockroachdb_fingerprint: String, /// Whether to set `cluster.preserve_downgrade_option` and what to set it to diff --git a/nexus/types/src/deployment/blueprint_diff.rs b/nexus/types/src/deployment/blueprint_diff.rs index a29cb57317f..6a1646ddd48 100644 --- a/nexus/types/src/deployment/blueprint_diff.rs +++ b/nexus/types/src/deployment/blueprint_diff.rs @@ -64,6 +64,7 @@ impl<'a> BlueprintDiffSummary<'a> { pending_mgs_updates, clickhouse_cluster_config, target_release_minimum_generation, + nexus_generation, // Metadata fields for which changes don't reflect semantic // changes from one blueprint to the next. id: _, @@ -112,6 +113,11 @@ impl<'a> BlueprintDiffSummary<'a> { return true; } + // Did the nexus generation change? + if nexus_generation.before != nexus_generation.after { + return true; + } + // All fields checked or ignored; if we get here, there are no // meaningful changes. false @@ -1834,6 +1840,7 @@ impl<'diff, 'b> BlueprintDiffDisplay<'diff, 'b> { target_release_minimum_generation, TARGET_RELEASE_MIN_GEN ), + diff_row!(nexus_generation, NEXUS_GENERATION), ], ), ] diff --git a/nexus/types/src/deployment/blueprint_display.rs b/nexus/types/src/deployment/blueprint_display.rs index e0dc0080f95..dec9ce3e699 100644 --- a/nexus/types/src/deployment/blueprint_display.rs +++ b/nexus/types/src/deployment/blueprint_display.rs @@ -44,6 +44,7 @@ pub mod constants { pub const EXTERNAL_DNS_VERSION: &str = "external DNS version"; // Keep this a bit short to not make the key column too wide. pub const TARGET_RELEASE_MIN_GEN: &str = "target release min gen"; + pub const NEXUS_GENERATION: &str = "nexus gen"; pub const COMMENT: &str = "comment"; pub const UNCHANGED_PARENS: &str = "(unchanged)"; diff --git a/nexus/types/src/deployment/planning_input.rs b/nexus/types/src/deployment/planning_input.rs index e5c3f499360..6d4ee2213ff 100644 --- a/nexus/types/src/deployment/planning_input.rs +++ b/nexus/types/src/deployment/planning_input.rs @@ -123,6 +123,12 @@ pub struct PlanningInput { /// mark under the assumption that they may appear to be impossible because /// they're currently in progress. ignore_impossible_mgs_updates_since: DateTime, + + /// ID of the currently running Nexus zone + /// + /// This is used to identify which Nexus is currently executing the planning + /// operation, which is needed for safe shutdown decisions during handoff. 
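A minimal sketch of how this value is supplied when the planning input is assembled (hedged; `opctx`, `datastore`, `chicken_switches`, and `my_nexus_zone_id` are assumed bindings, and the real call sites are the planner background task and Nexus deployment code later in this patch):

    // Assemble a PlanningInput that records which Nexus zone is running the
    // planner; callers with no single running Nexus can pass None instead.
    let planning_input = PlanningInputFromDb::assemble(
        opctx,
        datastore,
        chicken_switches,
        Some(my_nexus_zone_id),
    )
    .await?;
    assert_eq!(
        planning_input.current_nexus_zone_id(),
        Some(my_nexus_zone_id),
    );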
+ current_nexus_zone_id: Option, } impl PlanningInput { @@ -240,6 +246,15 @@ impl PlanningInput { self.policy.oximeter_read_policy.mode.single_node_enabled() } + /// ID of the currently running Nexus zone + pub fn current_nexus_zone_id(&self) -> Option { + self.current_nexus_zone_id + } + + pub fn set_current_nexus_zone_id(&mut self, id: OmicronZoneUuid) { + self.current_nexus_zone_id = Some(id); + } + pub fn all_sleds( &self, filter: SledFilter, @@ -318,6 +333,7 @@ impl PlanningInput { network_resources: self.network_resources, ignore_impossible_mgs_updates_since: self .ignore_impossible_mgs_updates_since, + current_nexus_zone_id: self.current_nexus_zone_id, } } } @@ -1260,6 +1276,7 @@ pub struct PlanningInputBuilder { sleds: BTreeMap, network_resources: OmicronZoneNetworkResources, ignore_impossible_mgs_updates_since: DateTime, + current_nexus_zone_id: Option, } impl PlanningInputBuilder { @@ -1288,6 +1305,7 @@ impl PlanningInputBuilder { sleds: BTreeMap::new(), network_resources: OmicronZoneNetworkResources::new(), ignore_impossible_mgs_updates_since: Utc::now(), + current_nexus_zone_id: None, } } @@ -1296,6 +1314,7 @@ impl PlanningInputBuilder { internal_dns_version: Generation, external_dns_version: Generation, cockroachdb_settings: CockroachDbSettings, + current_nexus_zone_id: Option, ) -> Self { Self { policy, @@ -1306,6 +1325,7 @@ impl PlanningInputBuilder { network_resources: OmicronZoneNetworkResources::new(), ignore_impossible_mgs_updates_since: Utc::now() - MGS_UPDATE_SETTLE_TIMEOUT, + current_nexus_zone_id, } } @@ -1401,6 +1421,13 @@ impl PlanningInputBuilder { self.cockroachdb_settings = cockroachdb_settings; } + pub fn set_current_nexus_zone_id( + &mut self, + current_nexus_zone_id: Option, + ) { + self.current_nexus_zone_id = current_nexus_zone_id; + } + pub fn build(self) -> PlanningInput { PlanningInput { policy: self.policy, @@ -1411,6 +1438,7 @@ impl PlanningInputBuilder { network_resources: self.network_resources, ignore_impossible_mgs_updates_since: self .ignore_impossible_mgs_updates_since, + current_nexus_zone_id: self.current_nexus_zone_id, } } } diff --git a/nexus/types/src/deployment/planning_report.rs b/nexus/types/src/deployment/planning_report.rs index 9511b97b631..a872cb6bbac 100644 --- a/nexus/types/src/deployment/planning_report.rs +++ b/nexus/types/src/deployment/planning_report.rs @@ -12,6 +12,7 @@ use super::PendingMgsUpdates; use super::PlannerChickenSwitches; use daft::Diffable; +use omicron_common::api::external::Generation; use omicron_common::policy::COCKROACHDB_REDUNDANCY; use omicron_uuid_kinds::BlueprintUuid; use omicron_uuid_kinds::MupdateOverrideUuid; @@ -61,6 +62,7 @@ pub struct PlanningReport { pub mgs_updates: PlanningMgsUpdatesStepReport, pub add: PlanningAddStepReport, pub zone_updates: PlanningZoneUpdatesStepReport, + pub nexus_generation_bump: PlanningNexusGenerationBumpReport, pub cockroachdb_settings: PlanningCockroachdbSettingsStepReport, } @@ -77,6 +79,7 @@ impl PlanningReport { ), add: PlanningAddStepReport::new(), zone_updates: PlanningZoneUpdatesStepReport::new(), + nexus_generation_bump: PlanningNexusGenerationBumpReport::new(), cockroachdb_settings: PlanningCockroachdbSettingsStepReport::new(), } } @@ -88,6 +91,7 @@ impl PlanningReport { && self.mgs_updates.is_empty() && self.add.is_empty() && self.zone_updates.is_empty() + && self.nexus_generation_bump.is_empty() && self.cockroachdb_settings.is_empty() } } @@ -110,6 +114,7 @@ impl fmt::Display for PlanningReport { mgs_updates, add, zone_updates, + nexus_generation_bump, 
cockroachdb_settings, } = self; writeln!(f, "planning report for blueprint {blueprint_id}:")?; @@ -126,6 +131,7 @@ impl fmt::Display for PlanningReport { mgs_updates.fmt(f)?; add.fmt(f)?; zone_updates.fmt(f)?; + nexus_generation_bump.fmt(f)?; cockroachdb_settings.fmt(f)?; } Ok(()) @@ -460,6 +466,14 @@ pub struct PlanningAddSufficientZonesExist { pub num_existing: usize, } +#[derive( + Clone, Debug, Deserialize, Serialize, PartialEq, Eq, Diffable, JsonSchema, +)] +pub struct DiscretionaryZonePlacement { + kind: String, + source: String, +} + #[derive( Clone, Debug, Deserialize, Serialize, PartialEq, Eq, Diffable, JsonSchema, )] @@ -507,7 +521,8 @@ pub struct PlanningAddStepReport { /// Sled ID → kinds of discretionary zones placed there // TODO: make `sled_add_zone_*` methods return the added zone config // so that we can report it here. - pub discretionary_zones_placed: BTreeMap>, + pub discretionary_zones_placed: + BTreeMap>, } impl PlanningAddStepReport { @@ -589,11 +604,22 @@ impl PlanningAddStepReport { &mut self, sled_id: SledUuid, zone_kind: &str, + image_source: &BlueprintZoneImageSource, ) { self.discretionary_zones_placed .entry(sled_id) - .and_modify(|kinds| kinds.push(zone_kind.to_owned())) - .or_insert_with(|| vec![zone_kind.to_owned()]); + .and_modify(|kinds| { + kinds.push(DiscretionaryZonePlacement { + kind: zone_kind.to_owned(), + source: image_source.to_string(), + }) + }) + .or_insert_with(|| { + vec![DiscretionaryZonePlacement { + kind: zone_kind.to_owned(), + source: image_source.to_string(), + }] + }); } } @@ -704,13 +730,13 @@ impl fmt::Display for PlanningAddStepReport { if !discretionary_zones_placed.is_empty() { writeln!(f, "* discretionary zones placed:")?; - for (sled_id, kinds) in discretionary_zones_placed.iter() { - let (n, s) = plural_vec(kinds); - writeln!( - f, - " * {n} zone{s} on sled {sled_id}: {}", - kinds.join(", ") - )?; + for (sled_id, placements) in discretionary_zones_placed.iter() { + for DiscretionaryZonePlacement { kind, source } in placements { + writeln!( + f, + " * {kind} zone on sled {sled_id} from source {source}", + )?; + } } } @@ -913,9 +939,21 @@ impl ZoneUpdatesWaitingOn { )] #[serde(rename_all = "snake_case", tag = "type")] pub enum ZoneUnsafeToShutdown { - Cockroachdb { reason: CockroachdbUnsafeToShutdown }, - BoundaryNtp { total_boundary_ntp_zones: usize, synchronized_count: usize }, - InternalDns { total_internal_dns_zones: usize, synchronized_count: usize }, + Cockroachdb { + reason: CockroachdbUnsafeToShutdown, + }, + BoundaryNtp { + total_boundary_ntp_zones: usize, + synchronized_count: usize, + }, + InternalDns { + total_internal_dns_zones: usize, + synchronized_count: usize, + }, + Nexus { + zone_generation: Generation, + current_nexus_generation: Option, + }, } impl fmt::Display for ZoneUnsafeToShutdown { @@ -930,6 +968,96 @@ impl fmt::Display for ZoneUnsafeToShutdown { total_internal_dns_zones: t, synchronized_count: s, } => write!(f, "only {s}/{t} internal DNS zones are synchronized"), + Self::Nexus { zone_generation, current_nexus_generation } => { + match current_nexus_generation { + Some(current) => write!( + f, + "zone gen ({zone_generation}) >= currently-running \ + Nexus gen ({current})" + ), + None => write!( + f, + "zone gen is {zone_generation}, but currently-running \ + Nexus generation is unknown" + ), + } + } + } + } +} + +#[derive( + Clone, Debug, Deserialize, Serialize, PartialEq, Eq, Diffable, JsonSchema, +)] +pub struct PlanningNexusGenerationBumpReport { + /// What are we waiting on to increase the 
generation number? + pub waiting_on: Option, + + pub next_generation: Option, +} + +impl PlanningNexusGenerationBumpReport { + pub fn new() -> Self { + Self { waiting_on: None, next_generation: None } + } + + pub fn is_empty(&self) -> bool { + self.waiting_on.is_none() && self.next_generation.is_none() + } + + pub fn set_waiting_on(&mut self, why: NexusGenerationBumpWaitingOn) { + self.waiting_on = Some(why); + } + + pub fn set_next_generation(&mut self, next_generation: Generation) { + self.next_generation = Some(next_generation); + } +} + +impl fmt::Display for PlanningNexusGenerationBumpReport { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let PlanningNexusGenerationBumpReport { waiting_on, next_generation } = + self; + + match (waiting_on, next_generation) { + (Some(why), _) => { + writeln!( + f, + "* waiting to update top-level nexus_generation: {}", + why.as_str() + )?; + } + (None, Some(gen)) => { + writeln!(f, "* updating top-level nexus_generation to: {gen}")?; + } + // Nothing to report + (None, None) => (), + } + Ok(()) + } +} + +#[derive( + Clone, Debug, Deserialize, Serialize, PartialEq, Eq, Diffable, JsonSchema, +)] +#[serde(rename_all = "snake_case", tag = "type")] +pub enum NexusGenerationBumpWaitingOn { + /// Waiting for non-Nexus zones to finish updating + NonNexusZoneUpdate, + + /// Waiting for enough new Nexus zones to appear + NewNexusBringup, + + /// Waiting for zones to propagate to inventory + ZonePropagation, +} + +impl NexusGenerationBumpWaitingOn { + pub fn as_str(&self) -> &'static str { + match self { + Self::NonNexusZoneUpdate => "pending non-nexus zone updates", + Self::NewNexusBringup => "waiting for new nexus zones", + Self::ZonePropagation => "pending zone reconciliation", } } } diff --git a/nexus/types/src/deployment/zone_type.rs b/nexus/types/src/deployment/zone_type.rs index 31e26c3a994..79cb68fb98a 100644 --- a/nexus/types/src/deployment/zone_type.rs +++ b/nexus/types/src/deployment/zone_type.rs @@ -343,6 +343,7 @@ pub mod blueprint_zone_type { use crate::deployment::OmicronZoneExternalSnatIp; use daft::Diffable; use nexus_sled_agent_shared::inventory::OmicronZoneDataset; + use omicron_common::api::external::Generation; use omicron_common::api::internal::shared::NetworkInterface; use schemars::JsonSchema; use serde::Deserialize; @@ -566,6 +567,10 @@ pub mod blueprint_zone_type { pub external_tls: bool, /// External DNS servers Nexus can use to resolve external hosts. pub external_dns_servers: Vec, + /// Generation number for this Nexus zone. + /// This is used to coordinate handoff between old and new Nexus instances + /// during updates. See RFD 588. 
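The check that consumes this field is a plain generation comparison; a minimal sketch (assuming `zone` is a `blueprint_zone_type::Nexus` and `blueprint` is the `Blueprint` that contains it):

    // Per RFD 588: a Nexus zone should quiesce once the blueprint's top-level
    // nexus_generation has advanced past the generation recorded on the zone.
    let should_quiesce = zone.nexus_generation < blueprint.nexus_generation;

`Blueprint::nexus_quiescing`, added later in this patch, performs exactly this comparison after looking the zone up by ID.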
+ pub nexus_generation: Generation, } #[derive( diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 5fc544306fc..0ae5f90fc0c 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -2568,6 +2568,14 @@ } ] }, + "nexus_generation": { + "description": "The generation of the active group of Nexuses\n\nIf a Nexus instance notices it has a nexus_generation less than this value, it will start to quiesce (see: RFD 588).", + "allOf": [ + { + "$ref": "#/components/schemas/Generation" + } + ] + }, "oximeter_read_mode": { "description": "Whether oximeter should read from a single node or a cluster", "allOf": [ @@ -2638,6 +2646,7 @@ "external_dns_version", "id", "internal_dns_version", + "nexus_generation", "oximeter_read_mode", "oximeter_read_version", "pending_mgs_updates", @@ -2862,6 +2871,14 @@ } ] }, + "nexus_generation": { + "description": "The Nexus generation number\n\nSee [`Blueprint::nexus_generation`].", + "allOf": [ + { + "$ref": "#/components/schemas/Generation" + } + ] + }, "parent_blueprint_id": { "nullable": true, "description": "which blueprint this blueprint is based on", @@ -2892,6 +2909,7 @@ "external_dns_version", "id", "internal_dns_version", + "nexus_generation", "target_release_minimum_generation", "time_created" ] @@ -3517,6 +3535,14 @@ "description": "The address at which the internal nexus server is reachable.", "type": "string" }, + "nexus_generation": { + "description": "Generation number for this Nexus zone. This is used to coordinate handoff between old and new Nexus instances during updates. See RFD 588.", + "allOf": [ + { + "$ref": "#/components/schemas/Generation" + } + ] + }, "nic": { "description": "The service vNIC providing external connectivity using OPTE.", "allOf": [ @@ -3537,6 +3563,7 @@ "external_ip", "external_tls", "internal_address", + "nexus_generation", "nic", "type" ] @@ -4192,6 +4219,21 @@ "saga_id" ] }, + "DiscretionaryZonePlacement": { + "type": "object", + "properties": { + "kind": { + "type": "string" + }, + "source": { + "type": "string" + } + }, + "required": [ + "kind", + "source" + ] + }, "DiskIdentity": { "description": "Uniquely identifies a disk.", "type": "object", @@ -6583,7 +6625,7 @@ "additionalProperties": { "type": "array", "items": { - "type": "string" + "$ref": "#/components/schemas/DiscretionaryZonePlacement" } } }, @@ -9184,6 +9226,32 @@ "total_internal_dns_zones", "type" ] + }, + { + "type": "object", + "properties": { + "current_nexus_generation": { + "nullable": true, + "allOf": [ + { + "$ref": "#/components/schemas/Generation" + } + ] + }, + "type": { + "type": "string", + "enum": [ + "nexus" + ] + }, + "zone_generation": { + "$ref": "#/components/schemas/Generation" + } + }, + "required": [ + "type", + "zone_generation" + ] } ] }, diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index f87d11e0903..d3122aa18da 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4510,7 +4510,10 @@ CREATE TABLE IF NOT EXISTS omicron.public.blueprint ( -- driving the system to the target release. -- -- This is set to 1 by default in application code. 
- target_release_minimum_generation INT8 NOT NULL + target_release_minimum_generation INT8 NOT NULL, + + -- The generation of the active group of Nexus instances + nexus_generation INT8 NOT NULL ); -- table describing both the current and historical target blueprints of the @@ -4720,6 +4723,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.bp_omicron_zone ( image_source omicron.public.bp_zone_image_source NOT NULL, image_artifact_sha256 STRING(64), + -- Generation for Nexus zones + nexus_generation INT8, + PRIMARY KEY (blueprint_id, id), CONSTRAINT expunged_disposition_properties CHECK ( @@ -6550,7 +6556,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '181.0.0', NULL) + (TRUE, NOW(), NOW(), '182.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/nexus-generation/up01.sql b/schema/crdb/nexus-generation/up01.sql new file mode 100644 index 00000000000..42d87c2f6f7 --- /dev/null +++ b/schema/crdb/nexus-generation/up01.sql @@ -0,0 +1 @@ +ALTER TABLE omicron.public.bp_omicron_zone ADD COLUMN IF NOT EXISTS nexus_generation INT8; diff --git a/schema/crdb/nexus-generation/up02.sql b/schema/crdb/nexus-generation/up02.sql new file mode 100644 index 00000000000..53429df8ebe --- /dev/null +++ b/schema/crdb/nexus-generation/up02.sql @@ -0,0 +1,5 @@ +SET LOCAL disallow_full_table_scans = off; + +UPDATE omicron.public.bp_omicron_zone +SET nexus_generation = 1 +WHERE zone_type = 'nexus'; diff --git a/schema/crdb/nexus-generation/up03.sql b/schema/crdb/nexus-generation/up03.sql new file mode 100644 index 00000000000..d7623a84c80 --- /dev/null +++ b/schema/crdb/nexus-generation/up03.sql @@ -0,0 +1 @@ +ALTER TABLE omicron.public.blueprint ADD COLUMN IF NOT EXISTS nexus_generation INT8 NOT NULL DEFAULT 1; diff --git a/schema/crdb/nexus-generation/up04.sql b/schema/crdb/nexus-generation/up04.sql new file mode 100644 index 00000000000..072231d9b01 --- /dev/null +++ b/schema/crdb/nexus-generation/up04.sql @@ -0,0 +1 @@ +ALTER TABLE omicron.public.blueprint ALTER COLUMN nexus_generation DROP DEFAULT; diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index 37c74805c3c..76a0b35d08c 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -26,7 +26,7 @@ use omicron_common::address::{ RSS_RESERVED_ADDRESSES, ReservedRackSubnet, SLED_PREFIX, get_sled_address, get_switch_zone_address, }; -use omicron_common::api::external::{MacAddr, Vni}; +use omicron_common::api::external::{Generation, MacAddr, Vni}; use omicron_common::api::internal::shared::{ NetworkInterface, NetworkInterfaceKind, SourceNatConfig, SourceNatConfigError, @@ -570,6 +570,7 @@ impl Plan { // development that it might not be. external_tls: !config.external_certificates.is_empty(), external_dns_servers: config.dns_servers.clone(), + nexus_generation: Generation::new(), }, ), filesystem_pool, diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index e2eaac58500..f18e205b6f7 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -1631,6 +1631,7 @@ pub(crate) fn build_initial_blueprint_from_sled_configs( // (including creating the recovery silo). external_dns_version: Generation::new(), target_release_minimum_generation: Generation::new(), + nexus_generation: Generation::new(), // Nexus will fill in the CockroachDB values during initialization. 
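The nexus-generation migration above follows the usual expand/backfill/constrain sequence: add a nullable column (up01), backfill existing Nexus zone rows (up02), add the blueprint column with a default of 1 (up03), then drop the default (up04). A hypothetical sanity check one might run after up02.sql, not part of the migration itself:

    -- Hypothetical check: every Nexus zone row should have been backfilled.
    SELECT count(*)
      FROM omicron.public.bp_omicron_zone
     WHERE zone_type = 'nexus' AND nexus_generation IS NULL;
    -- expected result: 0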
cockroachdb_fingerprint: String::new(), cockroachdb_setting_preserve_downgrade: diff --git a/sled-agent/src/sim/server.rs b/sled-agent/src/sim/server.rs index f252e327834..324c0c45106 100644 --- a/sled-agent/src/sim/server.rs +++ b/sled-agent/src/sim/server.rs @@ -447,6 +447,7 @@ pub async fn run_standalone_server( }, external_tls: false, external_dns_servers: vec![], + nexus_generation: Generation::new(), }), filesystem_pool: get_random_zpool(), image_source: BlueprintZoneImageSource::InstallDataset, From 3ce28b35b8a6d60737ff06663cbf83fed3fa606a Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 21 Aug 2025 14:09:48 -0700 Subject: [PATCH 02/22] fix openapi-manager --- openapi/nexus-internal.json | 75 +++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 0ae5f90fc0c..0a6ae8c075b 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -5728,6 +5728,55 @@ "description": "Password hashes must be in PHC (Password Hashing Competition) string format. Passwords must be hashed with Argon2id. Password hashes may be rejected if the parameters appear not to be secure enough.", "type": "string" }, + "NexusGenerationBumpWaitingOn": { + "oneOf": [ + { + "description": "Waiting for non-Nexus zones to finish updating", + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "non_nexus_zone_update" + ] + } + }, + "required": [ + "type" + ] + }, + { + "description": "Waiting for enough new Nexus zones to appear", + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "new_nexus_bringup" + ] + } + }, + "required": [ + "type" + ] + }, + { + "description": "Waiting for zones to propagate to inventory", + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "zone_propagation" + ] + } + }, + "required": [ + "type" + ] + } + ] + }, "NodeName": { "description": "Unique name for a saga [`Node`]\n\nEach node requires a string name that's unique within its DAG. The name is used to identify its output. 
Nodes that depend on a given node (either directly or indirectly) can access the node's output using its name.", "type": "string" @@ -6787,6 +6836,28 @@ "pending_mgs_updates" ] }, + "PlanningNexusGenerationBumpReport": { + "type": "object", + "properties": { + "next_generation": { + "nullable": true, + "allOf": [ + { + "$ref": "#/components/schemas/Generation" + } + ] + }, + "waiting_on": { + "nullable": true, + "description": "What are we waiting on to increase the generation number?", + "allOf": [ + { + "$ref": "#/components/schemas/NexusGenerationBumpWaitingOn" + } + ] + } + } + }, "PlanningNoopImageSourceConvertedZones": { "description": "How many of the total install-dataset zones were noop-converted to use the artifact store on a particular sled.", "type": "object", @@ -7043,6 +7114,9 @@ "mgs_updates": { "$ref": "#/components/schemas/PlanningMgsUpdatesStepReport" }, + "nexus_generation_bump": { + "$ref": "#/components/schemas/PlanningNexusGenerationBumpReport" + }, "noop_image_source": { "$ref": "#/components/schemas/PlanningNoopImageSourceStepReport" }, @@ -7058,6 +7132,7 @@ "decommission", "expunge", "mgs_updates", + "nexus_generation_bump", "noop_image_source", "zone_updates" ] From 051ecaf3e2dce13e62bb2dd32327697c06b0a301 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 20 Aug 2025 14:47:03 -0700 Subject: [PATCH 03/22] update quiesce states to reflect RFD 588 --- nexus/reconfigurator/execution/src/lib.rs | 72 ++-- nexus/src/app/background/init.rs | 5 +- .../background/tasks/blueprint_execution.rs | 66 +++- .../app/background/tasks/blueprint_planner.rs | 13 +- nexus/src/app/mod.rs | 23 +- nexus/src/app/quiesce.rs | 177 ++++++--- nexus/types/src/deployment.rs | 22 ++ nexus/types/src/internal_api/views.rs | 78 ++-- nexus/types/src/quiesce.rs | 364 +++++++++++------- 9 files changed, 522 insertions(+), 298 deletions(-) diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index 43c09485557..b9f83912bc8 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -611,50 +611,36 @@ fn register_reassign_sagas_step<'a>( .into(); }; - // Re-assign sagas, but only if we're allowed to. If Nexus is - // quiescing, we don't want to assign any new sagas to - // ourselves. - let result = saga_quiesce.reassign_if_possible(async || { - // For any expunged Nexus zones, re-assign in-progress sagas - // to some other Nexus. If this fails for some reason, it - // doesn't affect anything else. - let sec_id = nexus_db_model::SecId::from(nexus_id); - let reassigned = sagas::reassign_sagas_from_expunged( - opctx, datastore, blueprint, sec_id, - ) - .await - .context("failed to re-assign sagas"); - match reassigned { - Ok(needs_saga_recovery) => ( - StepSuccess::new(needs_saga_recovery).build(), - needs_saga_recovery, - ), - Err(error) => { - // It's possible that we failed after having - // re-assigned sagas in the database. - let maybe_reassigned = true; - ( - StepWarning::new(false, error.to_string()) - .build(), - maybe_reassigned, - ) + // Re-assign sagas. + Ok(saga_quiesce + .reassign_sagas(async || { + // For any expunged Nexus zones, re-assign in-progress + // sagas to some other Nexus. If this fails for some + // reason, it doesn't affect anything else. 
+ let sec_id = nexus_db_model::SecId::from(nexus_id); + let reassigned = sagas::reassign_sagas_from_expunged( + opctx, datastore, blueprint, sec_id, + ) + .await + .context("failed to re-assign sagas"); + match reassigned { + Ok(needs_saga_recovery) => ( + StepSuccess::new(needs_saga_recovery).build(), + needs_saga_recovery, + ), + Err(error) => { + // It's possible that we failed after having + // re-assigned sagas in the database. + let maybe_reassigned = true; + ( + StepWarning::new(false, error.to_string()) + .build(), + maybe_reassigned, + ) + } } - } - }); - - match result.await { - // Re-assignment is allowed, and we did try. It may or may - // not have succeeded. Either way, that's reflected in - // `step_result`. - Ok(step_result) => Ok(step_result), - // Re-assignment is disallowed. Report this step skipped - // with an explanation of why. - Err(error) => StepSkipped::new( - false, - InlineErrorChain::new(&error).to_string(), - ) - .into(), - } + }) + .await) }, ) .register() diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 78f1ef56617..495930b0b72 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -131,6 +131,7 @@ use super::tasks::vpc_routes; use super::tasks::webhook_deliverator; use crate::Nexus; use crate::app::oximeter::PRODUCER_LEASE_DURATION; +use crate::app::quiesce::NexusQuiesceHandle; use crate::app::saga::StartSaga; use nexus_background_task_interface::Activator; use nexus_background_task_interface::BackgroundTasks; @@ -437,7 +438,7 @@ impl BackgroundTasksInitializer { nexus_id, task_saga_recovery.clone(), args.mgs_updates_tx, - args.saga_recovery.quiesce.clone(), + args.nexus_quiesce, ); let rx_blueprint_exec = blueprint_executor.watcher(); driver.register(TaskDefinition { @@ -1029,6 +1030,8 @@ pub struct BackgroundTasksData { pub webhook_delivery_client: reqwest::Client, /// Channel for configuring pending MGS updates pub mgs_updates_tx: watch::Sender, + /// handle for controlling Nexus quiesce + pub nexus_quiesce: NexusQuiesceHandle, } /// Starts the three DNS-propagation-related background tasks for either diff --git a/nexus/src/app/background/tasks/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs index 27315202719..b930fc51079 100644 --- a/nexus/src/app/background/tasks/blueprint_execution.rs +++ b/nexus/src/app/background/tasks/blueprint_execution.rs @@ -4,7 +4,10 @@ //! 
Background task for realizing a plan blueprint -use crate::app::background::{Activator, BackgroundTask}; +use crate::app::{ + background::{Activator, BackgroundTask}, + quiesce::NexusQuiesceHandle, +}; use futures::FutureExt; use futures::future::BoxFuture; use internal_dns_resolver::Resolver; @@ -13,14 +16,12 @@ use nexus_db_queries::db::DataStore; use nexus_reconfigurator_execution::{ RealizeBlueprintOutput, RequiredRealizeArgs, }; -use nexus_types::{ - deployment::{ - Blueprint, BlueprintTarget, PendingMgsUpdates, execution::EventBuffer, - }, - quiesce::SagaQuiesceHandle, +use nexus_types::deployment::{ + Blueprint, BlueprintTarget, PendingMgsUpdates, execution::EventBuffer, }; use omicron_uuid_kinds::OmicronZoneUuid; use serde_json::json; +use slog_error_chain::InlineErrorChain; use std::sync::Arc; use tokio::sync::watch; use update_engine::NestedError; @@ -35,7 +36,7 @@ pub struct BlueprintExecutor { tx: watch::Sender, saga_recovery: Activator, mgs_update_tx: watch::Sender, - saga_quiesce: SagaQuiesceHandle, + nexus_quiesce: NexusQuiesceHandle, } impl BlueprintExecutor { @@ -48,7 +49,7 @@ impl BlueprintExecutor { nexus_id: OmicronZoneUuid, saga_recovery: Activator, mgs_update_tx: watch::Sender, - saga_quiesce: SagaQuiesceHandle, + nexus_quiesce: NexusQuiesceHandle, ) -> BlueprintExecutor { let (tx, _) = watch::channel(0); BlueprintExecutor { @@ -59,7 +60,7 @@ impl BlueprintExecutor { tx, saga_recovery, mgs_update_tx, - saga_quiesce, + nexus_quiesce, } } @@ -87,6 +88,47 @@ impl BlueprintExecutor { }; let (bp_target, blueprint) = &*update; + + // Regardless of anything else: propagate whatever this blueprint + // says about our quiescing state. + // + // During startup under normal operation, the blueprint will reflect + // that we're not quiescing. Propagating this will enable sagas to + // be created elsewhere in Nexus. + // + // At some point during an upgrade, we'll encounter a blueprint that + // reflects that we are quiescing. Propagating this will disable sagas + // from being created. + // + // In all other cases, this will have no effect. + // + // We do this now, before doing anything else, for two reasons: (1) + // during startup, we want to do this ASAP to minimize unnecessary saga + // creation failures (i.e., don't wait until we try to execute the + // blueprint before enabling sagas, since we already know if we're + // quiescing or not); and (2) because we want to do it even if blueprint + // execution is disabled. + match blueprint.nexus_quiescing(self.nexus_id) { + Ok(quiescing) => { + debug!( + &opctx.log, + "blueprint execution: quiesce check"; + "quiescing" => quiescing + ); + self.nexus_quiesce.set_quiescing(quiescing); + } + Err(error) => { + // This should be impossible. But it doesn't really affect + // anything else so there's no reason to stop execution. 
+ error!( + &opctx.log, + "blueprint execution: failed to determine if this Nexus \ + is quiescing"; + InlineErrorChain::new(&*error) + ); + } + }; + if !bp_target.enabled { warn!(&opctx.log, "Blueprint execution: skipped"; @@ -119,7 +161,7 @@ impl BlueprintExecutor { blueprint, sender, mgs_updates: self.mgs_update_tx.clone(), - saga_quiesce: self.saga_quiesce.clone(), + saga_quiesce: self.nexus_quiesce.sagas(), } .as_nexus(self.nexus_id), ) @@ -181,6 +223,7 @@ impl BackgroundTask for BlueprintExecutor { mod test { use super::BlueprintExecutor; use crate::app::background::{Activator, BackgroundTask}; + use crate::app::quiesce::NexusQuiesceHandle; use httptest::Expectation; use httptest::matchers::{not, request}; use httptest::responders::status_code; @@ -207,7 +250,6 @@ mod test { PlanningReport, blueprint_zone_type, }; use nexus_types::external_api::views::SledState; - use nexus_types::quiesce::SagaQuiesceHandle; use omicron_common::api::external; use omicron_common::api::external::Generation; use omicron_common::zpool_name::ZpoolName; @@ -390,7 +432,7 @@ mod test { OmicronZoneUuid::new_v4(), Activator::new(), dummy_tx, - SagaQuiesceHandle::new(opctx.log.clone()), + NexusQuiesceHandle::new(&opctx.log, datastore.clone()), ); // Now we're ready. diff --git a/nexus/src/app/background/tasks/blueprint_planner.rs b/nexus/src/app/background/tasks/blueprint_planner.rs index 9ae27a227fe..752632e4730 100644 --- a/nexus/src/app/background/tasks/blueprint_planner.rs +++ b/nexus/src/app/background/tasks/blueprint_planner.rs @@ -278,18 +278,15 @@ impl BackgroundTask for BlueprintPlanner { #[cfg(test)] mod test { use super::*; - use crate::app::background::Activator; use crate::app::background::tasks::blueprint_execution::BlueprintExecutor; use crate::app::background::tasks::blueprint_load::TargetBlueprintLoader; use crate::app::background::tasks::inventory_collection::InventoryCollector; + use crate::app::{background::Activator, quiesce::NexusQuiesceHandle}; use nexus_inventory::now_db_precision; use nexus_test_utils_macros::nexus_test; - use nexus_types::{ - deployment::{ - PendingMgsUpdates, PlannerChickenSwitches, - ReconfiguratorChickenSwitches, - }, - quiesce::SagaQuiesceHandle, + use nexus_types::deployment::{ + PendingMgsUpdates, PlannerChickenSwitches, + ReconfiguratorChickenSwitches, }; use omicron_uuid_kinds::OmicronZoneUuid; @@ -429,7 +426,7 @@ mod test { OmicronZoneUuid::new_v4(), Activator::new(), dummy_tx, - SagaQuiesceHandle::new(opctx.log.clone()), + NexusQuiesceHandle::new(&opctx.log, datastore.clone()), ); let value = executor.activate(&opctx).await; let value = value.as_object().expect("response is not a JSON object"); diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index cb3ca045cf9..c8abeac6e05 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -27,7 +27,6 @@ use nexus_db_queries::db; use nexus_mgs_updates::ArtifactCache; use nexus_mgs_updates::MgsUpdateDriver; use nexus_types::deployment::PendingMgsUpdates; -use nexus_types::quiesce::SagaQuiesceHandle; use omicron_common::address::DENDRITE_PORT; use omicron_common::address::MGD_PORT; use omicron_common::address::MGS_PORT; @@ -111,11 +110,11 @@ pub(crate) mod sagas; // TODO: When referring to API types, we should try to include // the prefix unless it is unambiguous. 
+use crate::app::quiesce::NexusQuiesceHandle; pub(crate) use nexus_db_model::MAX_NICS_PER_INSTANCE; pub(crate) use nexus_db_queries::db::queries::disk::MAX_DISKS_PER_INSTANCE; use nexus_mgs_updates::DEFAULT_RETRY_TIMEOUT; use nexus_types::internal_api::views::MgsUpdateDriverStatus; -use nexus_types::internal_api::views::QuiesceState; use sagas::demo::CompletingDemoSagas; // XXX: Might want to recast as max *floating* IPs, we have at most one @@ -280,11 +279,8 @@ pub struct Nexus { #[allow(dead_code)] repo_depot_resolver: Box, - /// whether Nexus is quiescing, and how far it's gotten - quiesce: watch::Sender, - - /// details about saga quiescing - saga_quiesce: SagaQuiesceHandle, + /// state of overall Nexus quiesce activity + quiesce: NexusQuiesceHandle, } impl Nexus { @@ -336,6 +332,8 @@ impl Nexus { sec_store, )); + let quiesce = NexusQuiesceHandle::new(&log, db_datastore.clone()); + // It's a bit of a red flag to use an unbounded channel. // // This particular channel is used to send a Uuid from the saga executor @@ -360,14 +358,11 @@ impl Nexus { // task. If someone changed the config, they'd have to remember to // update this here. This doesn't seem worth it. let (saga_create_tx, saga_recovery_rx) = mpsc::unbounded_channel(); - let saga_quiesce = SagaQuiesceHandle::new( - log.new(o!("component" => "SagaQuiesceHandle")), - ); let sagas = Arc::new(SagaExecutor::new( Arc::clone(&sec_client), log.new(o!("component" => "SagaExecutor")), saga_create_tx, - saga_quiesce.clone(), + quiesce.sagas(), )); // Create a channel for replicating repository artifacts. 16 is a @@ -465,8 +460,6 @@ impl Nexus { let mgs_update_status_rx = mgs_update_driver.status_rx(); let _mgs_driver_task = tokio::spawn(mgs_update_driver.run()); - let (quiesce, _) = watch::channel(QuiesceState::running()); - let nexus = Nexus { id: config.deployment.id, rack_id, @@ -520,7 +513,6 @@ impl Nexus { mgs_resolver, repo_depot_resolver, quiesce, - saga_quiesce, }; // TODO-cleanup all the extra Arcs here seems wrong @@ -570,6 +562,7 @@ impl Nexus { webhook_delivery_client: task_nexus .webhook_delivery_client .clone(), + nexus_quiesce: task_nexus.quiesce.clone(), saga_recovery: SagaRecoveryHelpers { recovery_opctx: saga_recovery_opctx, @@ -577,7 +570,7 @@ impl Nexus { sec_client: sec_client.clone(), registry: sagas::ACTION_REGISTRY.clone(), sagas_started_rx: saga_recovery_rx, - quiesce: task_nexus.saga_quiesce.clone(), + quiesce: task_nexus.quiesce.sagas(), }, tuf_artifact_replication_rx, mgs_updates_tx, diff --git a/nexus/src/app/quiesce.rs b/nexus/src/app/quiesce.rs index 6c8fe05decd..a4f6e18fdfc 100644 --- a/nexus/src/app/quiesce.rs +++ b/nexus/src/app/quiesce.rs @@ -14,6 +14,7 @@ use nexus_types::internal_api::views::QuiesceStatus; use nexus_types::quiesce::SagaQuiesceHandle; use omicron_common::api::external::LookupResult; use omicron_common::api::external::UpdateResult; +use slog::Logger; use std::sync::Arc; use std::time::Instant; use tokio::sync::watch; @@ -21,26 +22,7 @@ use tokio::sync::watch; impl super::Nexus { pub async fn quiesce_start(&self, opctx: &OpContext) -> UpdateResult<()> { opctx.authorize(authz::Action::Modify, &authz::QUIESCE_STATE).await?; - let started = self.quiesce.send_if_modified(|q| { - if let QuiesceState::Running = q { - let time_requested = Utc::now(); - let time_waiting_for_sagas = Instant::now(); - *q = QuiesceState::WaitingForSagas { - time_requested, - time_waiting_for_sagas, - }; - true - } else { - false - } - }); - if started { - tokio::spawn(do_quiesce( - self.quiesce.clone(), - 
self.saga_quiesce.clone(), - self.datastore().clone(), - )); - } + self.quiesce.set_quiescing(true); Ok(()) } @@ -49,56 +31,163 @@ impl super::Nexus { opctx: &OpContext, ) -> LookupResult { opctx.authorize(authz::Action::Read, &authz::QUIESCE_STATE).await?; - let state = self.quiesce.borrow().clone(); - let sagas_pending = self.saga_quiesce.sagas_pending(); + let state = self.quiesce.state(); + let sagas_pending = self.quiesce.sagas().sagas_pending(); let db_claims = self.datastore().claims_held(); Ok(QuiesceStatus { state, sagas_pending, db_claims }) } } -async fn do_quiesce( - quiesce: watch::Sender, - saga_quiesce: SagaQuiesceHandle, +/// Describes the configuration and state around quiescing Nexus +#[derive(Clone)] +pub struct NexusQuiesceHandle { + log: Logger, datastore: Arc, -) { - assert_matches!(*quiesce.borrow(), QuiesceState::WaitingForSagas { .. }); - saga_quiesce.quiesce(); - saga_quiesce.wait_for_quiesced().await; - quiesce.send_modify(|q| { - let QuiesceState::WaitingForSagas { + sagas: SagaQuiesceHandle, + state: watch::Sender, +} + +impl NexusQuiesceHandle { + pub fn new(log: &Logger, datastore: Arc) -> NexusQuiesceHandle { + let my_log = log.new(o!("component" => "NexusQuiesceHandle")); + let saga_quiesce_log = log.new(o!("component" => "SagaQuiesceHandle")); + let sagas = SagaQuiesceHandle::new(saga_quiesce_log); + let (state, _) = watch::channel(QuiesceState::Undetermined); + NexusQuiesceHandle { log: my_log, datastore, sagas, state } + } + + pub fn sagas(&self) -> SagaQuiesceHandle { + self.sagas.clone() + } + + pub fn state(&self) -> QuiesceState { + self.state.borrow().clone() + } + + pub fn set_quiescing(&self, quiescing: bool) { + let new_state = if quiescing { + let time_requested = Utc::now(); + let time_draining_sagas = Instant::now(); + QuiesceState::DrainingSagas { time_requested, time_draining_sagas } + } else { + QuiesceState::Running + }; + + let changed = self.state.send_if_modified(|q| { + match q { + QuiesceState::Undetermined => { + info!(&self.log, "initial state"; "state" => ?new_state); + *q = new_state; + true + } + QuiesceState::Running if quiescing => { + info!(&self.log, "quiesce starting"); + *q = new_state; + true + } + _ => { + // All other cases are either impossible or no-ops. + false + } + } + }); + + if changed && quiescing { + // Immediately quiesce sagas. + self.sagas.set_quiescing(quiescing); + // Asynchronously complete the rest of the quiesce process. + if quiescing { + tokio::spawn(do_quiesce(self.clone())); + } + } + } +} + +async fn do_quiesce(quiesce: NexusQuiesceHandle) { + let saga_quiesce = quiesce.sagas.clone(); + let datastore = quiesce.datastore.clone(); + + // NOTE: This sequence will change as we implement RFD 588. + // We will need to use the datastore to report our saga drain status and + // also to see when other Nexus instances have finished draining their + // sagas. For now, this implementation begins quiescing its database as + // soon as its sagas are locally drained. + assert_matches!( + *quiesce.state.borrow(), + QuiesceState::DrainingSagas { .. } + ); + + // TODO per RFD 588, this is where we will enter a loop, pausing either on + // timeout or when our local quiesce state changes. At each pause: if we + // need to update our db_metadata_nexus record, do so. Then load the + // current blueprint and check the records for all nexus instances. + // + // For now, we skip the cross-Nexus coordination and simply wait for our own + // Nexus to finish what it's doing. 
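    // State progression from here: the handle is already in DrainingSagas
    // (set_quiescing(true) moved it there before spawning this task); once
    // sagas are drained we move to DrainingDb, then RecordingQuiesce, and
    // finally Quiesced below.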
+ saga_quiesce.wait_for_drained().await; + + quiesce.state.send_modify(|q| { + let QuiesceState::DrainingSagas { time_requested, - time_waiting_for_sagas, + time_draining_sagas, } = *q else { panic!("wrong state in do_quiesce(): {:?}", q); }; - *q = QuiesceState::WaitingForDb { + let time_draining_db = Instant::now(); + *q = QuiesceState::DrainingDb { time_requested, - time_waiting_for_sagas, - duration_waiting_for_sagas: time_waiting_for_sagas.elapsed(), - time_waiting_for_db: Instant::now(), + time_draining_sagas, + duration_draining_sagas: time_draining_db - time_draining_sagas, + time_draining_db, }; }); datastore.quiesce(); datastore.wait_for_quiesced().await; - quiesce.send_modify(|q| { - let QuiesceState::WaitingForDb { + quiesce.state.send_modify(|q| { + let QuiesceState::DrainingDb { time_requested, - time_waiting_for_sagas, - duration_waiting_for_sagas, - time_waiting_for_db, + time_draining_sagas, + duration_draining_sagas, + time_draining_db, + } = *q + else { + panic!("wrong state in do_quiesce(): {:?}", q); + }; + let time_recording_quiesce = Instant::now(); + *q = QuiesceState::RecordingQuiesce { + time_requested, + time_draining_sagas, + duration_draining_sagas, + duration_draining_db: time_recording_quiesce - time_draining_db, + time_recording_quiesce, + }; + }); + + // TODO per RFD 588, this is where we will enter a loop trying to update our + // database record for the last time. + + quiesce.state.send_modify(|q| { + let QuiesceState::RecordingQuiesce { + time_requested, + time_draining_sagas, + duration_draining_sagas, + duration_draining_db, + time_recording_quiesce, } = *q else { panic!("wrong state in do_quiesce(): {:?}", q); }; + let finished = Instant::now(); *q = QuiesceState::Quiesced { time_requested, - duration_waiting_for_sagas, - duration_waiting_for_db: finished - time_waiting_for_db, - duration_total: finished - time_waiting_for_sagas, time_quiesced: Utc::now(), + duration_draining_sagas, + duration_draining_db, + duration_recording_quiesce: finished - time_recording_quiesce, + duration_total: finished - time_draining_sagas, }; }); } diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 23094feb17b..92fcbb24382 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -76,6 +76,8 @@ mod planning_report; mod zone_type; use crate::inventory::BaseboardId; +use anyhow::anyhow; +use anyhow::bail; pub use blueprint_diff::BlueprintDiffSummary; use blueprint_display::BpPendingMgsUpdates; pub use chicken_switches::PlannerChickenSwitches; @@ -383,6 +385,26 @@ impl Blueprint { pub fn display(&self) -> BlueprintDisplay<'_> { BlueprintDisplay { blueprint: self } } + + /// Returns whether the given Nexus instance should be quiescing or quiesced + /// in preparation for handoff to the next generation + pub fn nexus_quiescing( + &self, + nexus_id: OmicronZoneUuid, + ) -> Result { + let zone = self + .all_omicron_zones(|_z| true) + .find(|(_sled_id, zone_config)| zone_config.id == nexus_id) + .ok_or_else(|| { + anyhow!("zone {} does not exist in blueprint", nexus_id) + })? 
+ .1; + let BlueprintZoneType::Nexus(zone_config) = &zone.zone_type else { + bail!("zone {} is not a Nexus zone", nexus_id); + }; + + Ok(zone_config.nexus_generation < self.nexus_generation) + } } /// Wrapper to display a table of a `BlueprintSledConfig`'s host phase 2 diff --git a/nexus/types/src/internal_api/views.rs b/nexus/types/src/internal_api/views.rs index f7db6d86612..972a0b92df2 100644 --- a/nexus/types/src/internal_api/views.rs +++ b/nexus/types/src/internal_api/views.rs @@ -739,19 +739,28 @@ pub struct QuiesceStatus { /// At any given time, Nexus is always in one of these states: /// /// ```text +/// Undetermined (have not loaded persistent state; don't know yet) +/// | +/// | load persistent state and find we're not quiescing +/// v /// Running (normal operation) /// | /// | quiesce starts /// v -/// WaitingForSagas (no new sagas are allowed, but some are still running) +/// DrainingSagas (no new sagas are allowed, but some are still running) /// | /// | no more sagas running /// v -/// WaitingForDb (no sagas running; no new db connections may be -/// acquired by Nexus at-large, but some are still held) +/// DrainingDb (no sagas running; no new db connections may be +/// | acquired by Nexus at-large, but some are still held) /// | /// | no more database connections held /// v +/// RecordingQuiesce (everything is quiesced aside from one connection being +/// | used to record our final quiesced state) +/// | +/// | finish recording quiesce state in database +/// v /// Quiesced (no sagas running, no database connections in use) /// ``` /// @@ -762,58 +771,51 @@ pub struct QuiesceStatus { #[serde(rename_all = "snake_case")] #[serde(tag = "state", content = "quiesce_details")] pub enum QuiesceState { + /// We have not yet determined based on persistent state if we're supposed + /// to be quiesced or not + Undetermined, /// Normal operation Running, - /// New sagas disallowed, but some are still running. - WaitingForSagas { + /// New sagas disallowed, but some are still running on some Nexus instances + DrainingSagas { + time_requested: DateTime, + #[serde(skip)] + time_draining_sagas: Instant, + }, + /// No sagas running on any Nexus instances + /// + /// No new database connections may be claimed, but some database + /// connections are still held. + DrainingDb { time_requested: DateTime, #[serde(skip)] - time_waiting_for_sagas: Instant, + time_draining_sagas: Instant, + duration_draining_sagas: Duration, + #[serde(skip)] + time_draining_db: Instant, }, - /// No sagas running, no new database connections may be claimed, but some - /// database connections are still held. 
- WaitingForDb { + /// No database connections in use except to record the final "quiesced" + /// state + RecordingQuiesce { time_requested: DateTime, #[serde(skip)] - time_waiting_for_sagas: Instant, - duration_waiting_for_sagas: Duration, + time_draining_sagas: Instant, + duration_draining_sagas: Duration, + duration_draining_db: Duration, #[serde(skip)] - time_waiting_for_db: Instant, + time_recording_quiesce: Instant, }, /// Nexus has no sagas running and is not using the database Quiesced { time_requested: DateTime, time_quiesced: DateTime, - duration_waiting_for_sagas: Duration, - duration_waiting_for_db: Duration, + duration_draining_sagas: Duration, + duration_draining_db: Duration, + duration_recording_quiesce: Duration, duration_total: Duration, }, } -impl QuiesceState { - pub fn running() -> QuiesceState { - QuiesceState::Running - } - - pub fn quiescing(&self) -> bool { - match self { - QuiesceState::Running => false, - QuiesceState::WaitingForSagas { .. } - | QuiesceState::WaitingForDb { .. } - | QuiesceState::Quiesced { .. } => true, - } - } - - pub fn fully_quiesced(&self) -> bool { - match self { - QuiesceState::Running - | QuiesceState::WaitingForSagas { .. } - | QuiesceState::WaitingForDb { .. } => false, - QuiesceState::Quiesced { .. } => true, - } - } -} - /// Describes a pending saga (for debugging why quiesce is stuck) #[derive(Debug, Clone, Serialize, JsonSchema)] pub struct PendingSagaInfo { diff --git a/nexus/types/src/quiesce.rs b/nexus/types/src/quiesce.rs index 76df318d80d..378011be2cf 100644 --- a/nexus/types/src/quiesce.rs +++ b/nexus/types/src/quiesce.rs @@ -12,6 +12,7 @@ use iddqd::IdOrdMap; use omicron_common::api::external::Error; use omicron_common::api::external::Generation; use slog::Logger; +use slog::error; use slog::info; use slog::o; use slog_error_chain::InlineErrorChain; @@ -27,15 +28,20 @@ use tokio::sync::watch; enum SagasAllowed { /// New sagas may be started (normal condition) Allowed, - /// New sagas may not be started (happens during quiesce) - Disallowed, + /// New sagas may not be started because we're quiescing or quiesced + DisallowedQuiesce, + /// New sagas may not be started because we just started up and haven't + /// determined if we're quiescing yet + DisallowedUnknown, } #[derive(Debug, Error)] -#[error( - "saga creation and reassignment are disallowed (Nexus quiescing/quiesced)" -)] -pub struct NoSagasAllowedError; +pub enum NoSagasAllowedError { + #[error("saga creation is disallowed (quiescing/quiesced)")] + Quiescing, + #[error("saga creation is disallowed (unknown yet if we're quiescing)")] + Unknown, +} impl From for Error { fn from(value: NoSagasAllowedError) -> Self { Error::unavail(&value.to_string()) @@ -80,7 +86,7 @@ pub struct SagaQuiesceHandle { // mutate the data, using it to protect data and not code. // // (2) `watch::Receiver` provides a really handy `wait_for()` method` that - // we use in `wait_for_quiesced()`. Besides being convenient, this + // we use in `wait_for_drained()`. Besides being convenient, this // would be surprisingly hard for us to implement ourselves with a // `Mutex`. Traditionally, you'd use a combination Mutex/Condvar for // this. 
But we'd want to use a `std` Mutex (since tokio Mutex's @@ -140,7 +146,7 @@ struct SagaQuiesceInner { impl SagaQuiesceHandle { pub fn new(log: Logger) -> SagaQuiesceHandle { let (inner, _) = watch::channel(SagaQuiesceInner { - new_sagas_allowed: SagasAllowed::Allowed, + new_sagas_allowed: SagasAllowed::DisallowedUnknown, sagas_pending: IdOrdMap::new(), first_recovery_complete: false, reassignment_generation: Generation::new(), @@ -151,26 +157,65 @@ impl SagaQuiesceHandle { SagaQuiesceHandle { log, inner } } - /// Disallow new sagas from being started or re-assigned to this Nexus + /// Set the intended quiescing state /// - /// This is currently a one-way trip. Sagas cannot be un-quiesced. - pub fn quiesce(&self) { - // Log this before changing the config to make sure this message - // appears before messages from code paths that saw this change. - info!(&self.log, "starting saga quiesce"); - self.inner - .send_modify(|q| q.new_sagas_allowed = SagasAllowed::Disallowed); + /// Quiescing is currently a one-way trip. Once we start quiescing, we + /// cannot then re-enable sagas. + pub fn set_quiescing(&self, quiescing: bool) { + self.inner.send_if_modified(|q| { + let new_state = if quiescing { + SagasAllowed::DisallowedQuiesce + } else { + SagasAllowed::Allowed + }; + + match q.new_sagas_allowed { + SagasAllowed::DisallowedUnknown => { + info!( + &self.log, + "initial quiesce state"; + "initial_state" => ?new_state + ); + q.new_sagas_allowed = new_state; + true + } + SagasAllowed::Allowed if quiescing => { + info!(&self.log, "saga quiesce starting"); + q.new_sagas_allowed = SagasAllowed::DisallowedQuiesce; + true + } + SagasAllowed::DisallowedQuiesce if !quiescing => { + // This should be impossible. Report a problem. + error!( + &self.log, + "asked to stop quiescing after previously quiescing" + ); + false + } + _ => { + // There's no transition happening in these cases: + // - SagasAllowed::Allowed and we're not quiescing + // - SagasAllowed::DisallowedQuiesce and we're now quiescing + false + } + } + }); } - /// Returns whether sagas are fully quiesced - pub fn is_fully_quiesced(&self) -> bool { - self.inner.borrow().is_fully_quiesced() + /// Returns whether sagas are fully drained + /// + /// Note that this state can change later if new sagas get assigned to this + /// Nexus. + pub fn is_fully_drained(&self) -> bool { + self.inner.borrow().is_fully_drained() } - /// Wait for sagas to be quiesced - pub async fn wait_for_quiesced(&self) { - let _ = - self.inner.subscribe().wait_for(|q| q.is_fully_quiesced()).await; + /// Wait for sagas to become drained + /// + /// Note that new sagas can still be assigned to this Nexus, resulting in it + /// no longer being fully drained. + pub async fn wait_for_drained(&self) { + let _ = self.inner.subscribe().wait_for(|q| q.is_fully_drained()).await; } /// Returns information about running sagas (involves a clone) @@ -180,13 +225,10 @@ impl SagaQuiesceHandle { /// Record an operation that might assign sagas to this Nexus /// - /// If reassignment is currently allowed, `f` will be invoked to potentially - /// re-assign sagas. `f` returns `(T, bool)`, where `T` is whatever value - /// you want and is returned back from this function. The boolean indicates - /// whether any sagas may have been assigned to the current Nexus. - /// - /// If reassignment is currently disallowed (because Nexus is quiescing), - /// `f` is not invoked and an error describing this condition is returned. + /// `f` will be invoked to potentially re-assign sagas. 
`f` returns `(T, + /// bool)`, where `T` is whatever value you want and is returned back from + /// this function. The boolean indicates whether any sagas may have been + /// assigned to the current Nexus. /// /// Only one of these may be outstanding at a time. It should not be called /// concurrently. This is easy today because this is only invoked by a few @@ -204,27 +246,22 @@ impl SagaQuiesceHandle { // mis-use (e.g., by forgetting to call `reassignment_done()`). But we keep // the other two functions around because it's easier to write tests against // those. - pub async fn reassign_if_possible( - &self, - f: F, - ) -> Result + pub async fn reassign_sagas(&self, f: F) -> T where F: AsyncFnOnce() -> (T, bool), { - let in_progress = self.reassignment_start()?; + let in_progress = self.reassignment_start(); let (result, maybe_reassigned) = f().await; in_progress.reassignment_done(maybe_reassigned); - Ok(result) + result } /// Record that we've begun a re-assignment operation. /// /// Only one of these may be outstanding at a time. The caller must call /// `reassignment_done()` before starting another one of these. - fn reassignment_start( - &self, - ) -> Result { - let okay = self.inner.send_if_modified(|q| { + fn reassignment_start(&self) -> SagaReassignmentInProgress { + self.inner.send_modify(|q| { assert!( !q.reassignment_pending, "two calls to reassignment_start() without intervening call \ @@ -232,21 +269,11 @@ impl SagaQuiesceHandle { reassign_if_possible()?)" ); - if q.new_sagas_allowed != SagasAllowed::Allowed { - return false; - } - q.reassignment_pending = true; - true }); - if okay { - info!(&self.log, "allowing saga re-assignment pass"); - Ok(SagaReassignmentInProgress { q: self.clone() }) - } else { - info!(&self.log, "disallowing saga re-assignment pass"); - Err(NoSagasAllowedError) - } + info!(&self.log, "starting saga re-assignment pass"); + SagaReassignmentInProgress { q: self.clone() } } /// Record that we've finished an operation that might assign new sagas to @@ -262,10 +289,10 @@ impl SagaQuiesceHandle { q.reassignment_pending = false; // If we may have assigned new sagas to ourselves, bump the - // generation number. We won't quiesce until a recovery pass has - // finished that *started* with this generation number. So this - // ensures that we won't quiesce until any sagas that may have been - // assigned to us have been recovered. + // generation number. We won't report being drained until a + // recovery pass has finished that *started* with this generation + // number. So this ensures that we won't report being drained until + // any sagas that may have been assigned to us have been recovered. if maybe_reassigned { q.reassignment_generation = q.reassignment_generation.next(); } @@ -344,7 +371,7 @@ impl SagaQuiesceHandle { /// Report that a saga has started running /// - /// This fails if sagas are quiesced. + /// This fails if sagas are quiescing or quiesced. /// /// Callers must also call `saga_completion_future()` to make sure it's /// recorded when this saga finishes. 
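Taken together, the pieces above give saga creators a simple contract: ask the quiesce handle before starting a saga, and make sure completion gets recorded so `wait_for_drained()` can observe it. The sketch below is illustrative only and is not part of this patch: the `SagaQuiesceHandle` methods and `NoSagasAllowedError` are from this change, while the function name, the `registration` variable, and the elided saga-execution step are assumptions for the example.

    // Sketch only: `start_saga_if_allowed` is a hypothetical caller; the
    // quiesce-handle calls are the ones introduced in this patch.
    async fn start_saga_if_allowed(
        quiesce: &SagaQuiesceHandle,
        saga_id: steno::SagaId,
        saga_name: &steno::SagaName,
    ) -> Result<(), NoSagasAllowedError> {
        // Refused with `Quiescing` once quiesce has begun, or with `Unknown`
        // before the initial determination has been made at startup.
        let registration = quiesce.saga_create(saga_id, saga_name)?;
        // ... hand the saga off to the executor here (elided) ...
        // The registration (together with the completion future mentioned in
        // the doc comment above) is what lets `wait_for_drained()` notice
        // when this saga finishes; per the tests below, dropping it also
        // counts as completion.
        drop(registration);
        Ok(())
    }
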
@@ -353,9 +380,18 @@ impl SagaQuiesceHandle { saga_id: steno::SagaId, saga_name: &steno::SagaName, ) -> Result { + let mut error: Option = None; let okay = self.inner.send_if_modified(|q| { - if q.new_sagas_allowed != SagasAllowed::Allowed { - return false; + match q.new_sagas_allowed { + SagasAllowed::Allowed => (), + SagasAllowed::DisallowedQuiesce => { + error = Some(NoSagasAllowedError::Quiescing); + return false; + } + SagasAllowed::DisallowedUnknown => { + error = Some(NoSagasAllowedError::Unknown); + return false; + } } q.sagas_pending @@ -379,12 +415,15 @@ impl SagaQuiesceHandle { init_finished: false, }) } else { + let error = + error.expect("error is always set when disallowing sagas"); info!( &self.log, "disallowing saga creation"; - "saga_id" => saga_id.to_string() + "saga_id" => saga_id.to_string(), + InlineErrorChain::new(&error), ); - Err(NoSagasAllowedError) + Err(error) } } @@ -403,8 +442,8 @@ impl SagaQuiesceHandle { /// sagas that might possibly have finished already.) /// /// Unlike `saga_created()`, this cannot fail as a result of sagas being - /// quiesced. That's because a saga that *needs* to be recovered is a - /// blocker for quiesce, whether it's running or not. So we need to + /// quiescing/quiesced. That's because a saga that *needs* to be recovered + /// is a blocker for quiesce, whether it's running or not. So we need to /// actually run and finish it. We do still want to prevent ourselves from /// taking on sagas needing recovery -- that's why we fail /// `reassign_if_possible()` when saga creation is disallowed. @@ -438,10 +477,13 @@ impl SagaQuiesceHandle { } impl SagaQuiesceInner { - /// Returns whether sagas are fully and permanently quiesced - pub fn is_fully_quiesced(&self) -> bool { + /// Returns whether sagas are fully drained + /// + /// This condition is not permanent. New sagas can be re-assigned to this + /// Nexus. + pub fn is_fully_drained(&self) -> bool { // No new sagas may be created - self.new_sagas_allowed == SagasAllowed::Disallowed + self.new_sagas_allowed == SagasAllowed::DisallowedQuiesce // and there are none currently running && self.sagas_pending.is_empty() // and there are none from a previous lifetime that still need to be @@ -640,32 +682,30 @@ mod test { // Set up a new handle. Complete the first saga recovery immediately so // that that doesn't block quiescing. let qq = SagaQuiesceHandle::new(log.clone()); + qq.set_quiescing(false); let recovery = qq.recovery_start(); recovery.recovery_done(true); - // It's still not fully quiesced because we haven't asked it to quiesce + // It's still not fully drained because we haven't asked it to quiesce // yet. assert!(qq.sagas_pending().is_empty()); - assert!(!qq.is_fully_quiesced()); + assert!(!qq.is_fully_drained()); - // Now start quiescing. It should immediately report itself as - // quiesced. There's nothing asynchronous in this path. (It would be - // okay if there were.) - qq.quiesce(); - assert!(qq.is_fully_quiesced()); + // Now start quiescing. It should immediately report itself as drained. + // There's nothing asynchronous in this path. (It would be okay if + // there were.) + qq.set_quiescing(true); + assert!(qq.is_fully_drained()); - // It's not allowed to create sagas or begin re-assignment after - // quiescing has started, let alone finished. + // It's not allowed to create sagas after quiescing has started, let + // alone finished. 
let _ = qq .saga_create(*SAGA_ID, &SAGA_NAME) .expect_err("cannot create saga after quiescing started"); - let _ = qq - .reassignment_start() - .expect_err("cannot start re-assignment after quiescing started"); - // Waiting for quiesce should complete immediately. - qq.wait_for_quiesced().await; - assert!(qq.is_fully_quiesced()); + // Waiting for drain should complete immediately. + qq.wait_for_drained().await; + assert!(qq.is_fully_drained()); logctx.cleanup_successful(); } @@ -680,6 +720,7 @@ mod test { // Set up a new handle. Complete the first saga recovery immediately so // that that doesn't block quiescing. let qq = SagaQuiesceHandle::new(log.clone()); + qq.set_quiescing(false); let recovery = qq.recovery_start(); recovery.recovery_done(true); @@ -690,15 +731,15 @@ mod test { assert!(!qq.sagas_pending().is_empty()); // Start quiescing. - qq.quiesce(); - assert!(!qq.is_fully_quiesced()); + qq.set_quiescing(true); + assert!(!qq.is_fully_drained()); // Dropping the returned handle is as good as completing the saga. drop(started); assert!(qq.sagas_pending().is_empty()); - assert!(qq.is_fully_quiesced()); - qq.wait_for_quiesced().await; - assert!(qq.is_fully_quiesced()); + assert!(qq.is_fully_drained()); + qq.wait_for_drained().await; + assert!(qq.is_fully_drained()); logctx.cleanup_successful(); } @@ -715,6 +756,7 @@ mod test { // Set up a new handle. Complete the first saga recovery immediately so // that that doesn't block quiescing. let qq = SagaQuiesceHandle::new(log.clone()); + qq.set_quiescing(false); let recovery = qq.recovery_start(); recovery.recovery_done(true); @@ -730,8 +772,8 @@ mod test { assert!(!qq.sagas_pending().is_empty()); // Quiesce should block on the saga finishing. - qq.quiesce(); - assert!(!qq.is_fully_quiesced()); + qq.set_quiescing(true); + assert!(!qq.is_fully_drained()); // "Finish" the saga. tx.send(saga_result()).unwrap(); @@ -740,15 +782,15 @@ mod test { // able to notice that the saga finished yet. It's not that important // to assert this but it emphasizes that it really is waiting for // something to happen. - assert!(!qq.is_fully_quiesced()); + assert!(!qq.is_fully_drained()); // The consumer's completion future ought to be unblocked now. let _ = consumer_completion.await; // Wait for quiescing to finish. This should be immediate. - qq.wait_for_quiesced().await; + qq.wait_for_drained().await; assert!(qq.sagas_pending().is_empty()); - assert!(qq.is_fully_quiesced()); + assert!(qq.is_fully_drained()); logctx.cleanup_successful(); } @@ -761,24 +803,25 @@ mod test { // Set up a new handle. let qq = SagaQuiesceHandle::new(log.clone()); + qq.set_quiescing(false); - // Quiesce should block on recovery having completed successfully once. - qq.quiesce(); - assert!(!qq.is_fully_quiesced()); + // Drain should block on recovery having completed successfully once. + qq.set_quiescing(true); + assert!(!qq.is_fully_drained()); // Act like the first recovery failed. Quiescing should still be // blocked. let recovery = qq.recovery_start(); recovery.recovery_done(false); - assert!(!qq.is_fully_quiesced()); + assert!(!qq.is_fully_drained()); // Finish a normal saga recovery. Quiescing should proceed. // This happens synchronously (though it doesn't have to). 
let recovery = qq.recovery_start(); recovery.recovery_done(true); - assert!(qq.is_fully_quiesced()); - qq.wait_for_quiesced().await; - assert!(qq.is_fully_quiesced()); + assert!(qq.is_fully_drained()); + qq.wait_for_drained().await; + assert!(qq.is_fully_drained()); logctx.cleanup_successful(); } @@ -792,25 +835,25 @@ mod test { // Set up a new handle. Complete the first saga recovery immediately so // that that doesn't block quiescing. let qq = SagaQuiesceHandle::new(log.clone()); + qq.set_quiescing(false); let recovery = qq.recovery_start(); recovery.recovery_done(true); // Begin saga re-assignment. - let reassignment = - qq.reassignment_start().expect("can re-assign when not quiescing"); + let reassignment = qq.reassignment_start(); // Begin quiescing. - qq.quiesce(); + qq.set_quiescing(true); // Quiescing is blocked. - assert!(!qq.is_fully_quiesced()); + assert!(!qq.is_fully_drained()); // When re-assignment finishes *without* having re-assigned anything, // then we're immediately all set. reassignment.reassignment_done(false); - assert!(qq.is_fully_quiesced()); - qq.wait_for_quiesced().await; - assert!(qq.is_fully_quiesced()); + assert!(qq.is_fully_drained()); + qq.wait_for_drained().await; + assert!(qq.is_fully_drained()); logctx.cleanup_successful(); } @@ -826,36 +869,36 @@ mod test { // Set up a new handle. Complete the first saga recovery immediately so // that that doesn't block quiescing. let qq = SagaQuiesceHandle::new(log.clone()); + qq.set_quiescing(false); let recovery = qq.recovery_start(); recovery.recovery_done(true); // Begin saga re-assignment. - let reassignment = - qq.reassignment_start().expect("can re-assign when not quiescing"); + let reassignment = qq.reassignment_start(); // Begin quiescing. - qq.quiesce(); + qq.set_quiescing(true); // Quiescing is blocked. - assert!(!qq.is_fully_quiesced()); + assert!(!qq.is_fully_drained()); // When re-assignment finishes and re-assigned sagas, we're still // blocked. reassignment.reassignment_done(true); - assert!(!qq.is_fully_quiesced()); + assert!(!qq.is_fully_drained()); // If the next recovery pass fails, we're still blocked. let recovery = qq.recovery_start(); recovery.recovery_done(false); - assert!(!qq.is_fully_quiesced()); + assert!(!qq.is_fully_drained()); // Once a recovery pass succeeds, we're good. let recovery = qq.recovery_start(); recovery.recovery_done(true); - assert!(qq.is_fully_quiesced()); + assert!(qq.is_fully_drained()); - qq.wait_for_quiesced().await; - assert!(qq.is_fully_quiesced()); + qq.wait_for_drained().await; + assert!(qq.is_fully_drained()); logctx.cleanup_successful(); } @@ -874,18 +917,18 @@ mod test { // Set up a new handle. Complete the first saga recovery immediately so // that that doesn't block quiescing. let qq = SagaQuiesceHandle::new(log.clone()); + qq.set_quiescing(false); let recovery = qq.recovery_start(); recovery.recovery_done(true); // Begin saga re-assignment. - let reassignment = - qq.reassignment_start().expect("can re-assign when not quiescing"); + let reassignment = qq.reassignment_start(); // Begin quiescing. - qq.quiesce(); + qq.set_quiescing(true); // Quiescing is blocked. - assert!(!qq.is_fully_quiesced()); + assert!(!qq.is_fully_drained()); // Start a recovery pass. let recovery = qq.recovery_start(); @@ -893,25 +936,25 @@ mod test { // When re-assignment finishes and re-assigned sagas, we're still // blocked. 
reassignment.reassignment_done(true); - assert!(!qq.is_fully_quiesced()); + assert!(!qq.is_fully_drained()); // Even if this recovery pass succeeds, we're still blocked, because it // started before re-assignment finished and so isn't guaranteed to have // seen all the re-assigned sagas. recovery.recovery_done(true); - assert!(!qq.is_fully_quiesced()); + assert!(!qq.is_fully_drained()); // If the next pass fails, we're still blocked. let recovery = qq.recovery_start(); recovery.recovery_done(false); - assert!(!qq.is_fully_quiesced()); + assert!(!qq.is_fully_drained()); // Finally, we have a successful pass that unblocks us. let recovery = qq.recovery_start(); recovery.recovery_done(true); - assert!(qq.is_fully_quiesced()); - qq.wait_for_quiesced().await; - assert!(qq.is_fully_quiesced()); + assert!(qq.is_fully_drained()); + qq.wait_for_drained().await; + assert!(qq.is_fully_drained()); logctx.cleanup_successful(); } @@ -930,23 +973,23 @@ mod test { // Set up a new handle. Complete the first saga recovery immediately so // that that doesn't block quiescing. let qq = SagaQuiesceHandle::new(log.clone()); + qq.set_quiescing(false); let recovery = qq.recovery_start(); recovery.recovery_done(true); // Begin saga re-assignment. - let reassignment = - qq.reassignment_start().expect("can re-assign when not quiescing"); + let reassignment = qq.reassignment_start(); // Begin quiescing. - qq.quiesce(); + qq.set_quiescing(true); // Quiescing is blocked. - assert!(!qq.is_fully_quiesced()); + assert!(!qq.is_fully_drained()); // When re-assignment finishes and re-assigned sagas, we're still // blocked because we haven't run recovery. reassignment.reassignment_done(true); - assert!(!qq.is_fully_quiesced()); + assert!(!qq.is_fully_drained()); // Start a recovery pass. Pretend like we found something. let recovery = qq.recovery_start(); @@ -958,7 +1001,7 @@ mod test { recovery.recovery_done(true); // We're still not quiesced because that saga is still running. - assert!(!qq.is_fully_quiesced()); + assert!(!qq.is_fully_drained()); // Finish the recovered saga. That should unblock quiesce. tx.send(saga_result()).unwrap(); @@ -966,8 +1009,8 @@ mod test { // The consumer's completion future ought to be unblocked now. let _ = consumer_completion.await; - qq.wait_for_quiesced().await; - assert!(qq.is_fully_quiesced()); + qq.wait_for_drained().await; + assert!(qq.is_fully_drained()); logctx.cleanup_successful(); } @@ -983,6 +1026,7 @@ mod test { // Set up a new handle. let qq = SagaQuiesceHandle::new(log.clone()); + qq.set_quiescing(false); // Start a recovery pass. Pretend like we found something. let recovery = qq.recovery_start(); let pending = recovery.record_saga_recovery(*SAGA_ID, &SAGA_NAME); @@ -993,20 +1037,66 @@ mod test { recovery.recovery_done(true); // Begin quiescing. - qq.quiesce(); + qq.set_quiescing(true); // Quiescing is blocked. - assert!(!qq.is_fully_quiesced()); + assert!(!qq.is_fully_drained()); - // Finish the recovered saga. That should unblock quiesce. + // Finish the recovered saga. That should unblock drain. tx.send(saga_result()).unwrap(); - qq.wait_for_quiesced().await; - assert!(qq.is_fully_quiesced()); + qq.wait_for_drained().await; + assert!(qq.is_fully_drained()); // The consumer's completion future ought to be unblocked now. 
let _ = consumer_completion.await; logctx.cleanup_successful(); } + + /// Tests that sagas are disabled at the start + #[tokio::test] + async fn test_quiesce_sagas_disabled_on_startup() { + let logctx = test_setup_log("test_quiesce_block_on_recovered_sagas"); + let log = &logctx.log; + + let qq = SagaQuiesceHandle::new(log.clone()); + assert!(!qq.is_fully_drained()); + let _ = qq + .saga_create(*SAGA_ID, &SAGA_NAME) + .expect_err("cannot create saga in initial state"); + qq.recovery_start().recovery_done(true); + qq.set_quiescing(true); + assert!(qq.is_fully_drained()); + let _ = qq + .saga_create(*SAGA_ID, &SAGA_NAME) + .expect_err("cannot create saga after quiescing"); + + // It's allowed to start a new re-assignment pass. That prevents us + // from being drained. + let reassignment = qq.reassignment_start(); + assert!(!qq.is_fully_drained()); + reassignment.reassignment_done(false); + // We're fully drained as soon as this one is done, since we know we + // didn't assign any sagas. + assert!(qq.is_fully_drained()); + + // Try again. This time, we'll act like we did reassign sagas. + let reassignment = qq.reassignment_start(); + assert!(!qq.is_fully_drained()); + reassignment.reassignment_done(true); + assert!(!qq.is_fully_drained()); + // Do a failed recovery pass. We still won't be fully drained. + let recovery = qq.recovery_start(); + assert!(!qq.is_fully_drained()); + recovery.recovery_done(false); + assert!(!qq.is_fully_drained()); + // Do a successful recovery pass. We'll be drained again. + let recovery = qq.recovery_start(); + assert!(!qq.is_fully_drained()); + recovery.recovery_done(true); + assert!(qq.is_fully_drained()); + + logctx.cleanup_successful(); + } } From a8862d59bc38862f809285bbed97224a6af628f0 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 20 Aug 2025 16:07:49 -0700 Subject: [PATCH 04/22] self-review + regenerate API spec --- dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs | 44 +++++++++-- .../background/tasks/blueprint_execution.rs | 2 +- nexus/src/app/quiesce.rs | 21 ++--- nexus/types/src/deployment.rs | 2 +- openapi/nexus-internal.json | 79 ++++++++++++++++--- 5 files changed, 117 insertions(+), 31 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs b/dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs index b27c2c22fb1..76c0a229c3c 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs @@ -61,10 +61,13 @@ async fn quiesce_show( .context("fetching quiesce state")? .into_inner(); match quiesce.state { + QuiesceState::Undetermined => { + println!("has not yet determined if it is quiescing"); + } QuiesceState::Running => { println!("running normally (not quiesced, not quiescing)"); } - QuiesceState::WaitingForSagas { time_requested } => { + QuiesceState::DrainingSagas { time_requested } => { println!( "quiescing since {} ({} ago)", humantime::format_rfc3339_millis(time_requested.into()), @@ -72,9 +75,9 @@ async fn quiesce_show( ); println!("details: waiting for running sagas to finish"); } - QuiesceState::WaitingForDb { + QuiesceState::DrainingDb { time_requested, - duration_waiting_for_sagas, + duration_draining_sagas, .. } => { println!( @@ -87,13 +90,34 @@ async fn quiesce_show( ); println!( " previously: waiting for sagas took {}", - format_duration_ms(duration_waiting_for_sagas.into()), + format_duration_ms(duration_draining_sagas.into()), + ); + } + QuiesceState::RecordingQuiesce { + time_requested, + duration_draining_sagas, + duration_draining_db, + .. 
+ } => { + println!( + "quiescing since {} ({} ago)", + humantime::format_rfc3339_millis(time_requested.into()), + format_time_delta(now - time_requested), + ); + println!( + " waiting for sagas took {}", + format_duration_ms(duration_draining_sagas.into()), + ); + println!( + " waiting for db quiesce took {}", + format_duration_ms(duration_draining_db.into()), ); } QuiesceState::Quiesced { time_quiesced, - duration_waiting_for_sagas, - duration_waiting_for_db, + duration_draining_sagas, + duration_draining_db, + duration_recording_quiesce, duration_total, .. } => { @@ -104,11 +128,15 @@ async fn quiesce_show( ); println!( " waiting for sagas took {}", - format_duration_ms(duration_waiting_for_sagas.into()), + format_duration_ms(duration_draining_sagas.into()), ); println!( " waiting for db quiesce took {}", - format_duration_ms(duration_waiting_for_db.into()), + format_duration_ms(duration_draining_db.into()), + ); + println!( + " recording quiesce took {}", + format_duration_ms(duration_recording_quiesce.into()), ); println!( " total quiesce time: {}", diff --git a/nexus/src/app/background/tasks/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs index b930fc51079..443ec5eec18 100644 --- a/nexus/src/app/background/tasks/blueprint_execution.rs +++ b/nexus/src/app/background/tasks/blueprint_execution.rs @@ -108,7 +108,7 @@ impl BlueprintExecutor { // blueprint before enabling sagas, since we already know if we're // quiescing or not); and (2) because we want to do it even if blueprint // execution is disabled. - match blueprint.nexus_quiescing(self.nexus_id) { + match blueprint.is_nexus_quiescing(self.nexus_id) { Ok(quiescing) => { debug!( &opctx.log, diff --git a/nexus/src/app/quiesce.rs b/nexus/src/app/quiesce.rs index a4f6e18fdfc..17b28e7fc9b 100644 --- a/nexus/src/app/quiesce.rs +++ b/nexus/src/app/quiesce.rs @@ -250,24 +250,27 @@ mod test { let QuiesceState::Quiesced { time_requested, time_quiesced, - duration_waiting_for_sagas, - duration_waiting_for_db, + duration_draining_sagas, + duration_draining_db, + duration_recording_quiesce, duration_total, } = status.state else { panic!("not quiesced"); }; let duration_total = Duration::from(duration_total); - let duration_waiting_for_sagas = - Duration::from(duration_waiting_for_sagas); - let duration_waiting_for_db = Duration::from(duration_waiting_for_db); + let duration_draining_sagas = Duration::from(duration_draining_sagas); + let duration_draining_db = Duration::from(duration_draining_db); + let duration_recording_quiesce = + Duration::from(duration_recording_quiesce); assert!(time_requested >= before); assert!(time_requested <= after); assert!(time_quiesced >= before); assert!(time_quiesced <= after); assert!(time_quiesced >= time_requested); - assert!(duration_total >= duration_waiting_for_sagas); - assert!(duration_total >= duration_waiting_for_db); + assert!(duration_total >= duration_draining_sagas); + assert!(duration_total >= duration_draining_db); + assert!(duration_total >= duration_recording_quiesce); assert!(duration_total <= (after - before).to_std().unwrap()); assert!(status.sagas_pending.is_empty()); assert!(status.db_claims.is_empty()); @@ -341,7 +344,7 @@ mod test { debug!(log, "found quiesce status"; "status" => ?quiesce_status); assert_matches!( quiesce_status.state, - QuiesceState::WaitingForSagas { .. } + QuiesceState::DrainingSagas { .. 
} ); assert!(quiesce_status.sagas_pending.contains_key(&demo_saga.saga_id)); // We should see at least one held database claim from the one we took @@ -404,7 +407,7 @@ mod test { .map_err(|e| CondCheckError::Failed(e))? .into_inner(); debug!(log, "found quiesce state"; "state" => ?rv); - if !matches!(rv.state, QuiesceState::WaitingForDb { .. }) { + if !matches!(rv.state, QuiesceState::DrainingDb { .. }) { return Err(CondCheckError::::NotYet); } assert!(rv.sagas_pending.is_empty()); diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 92fcbb24382..b6fd344adf7 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -388,7 +388,7 @@ impl Blueprint { /// Returns whether the given Nexus instance should be quiescing or quiesced /// in preparation for handoff to the next generation - pub fn nexus_quiescing( + pub fn is_nexus_quiescing( &self, nexus_id: OmicronZoneUuid, ) -> Result { diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 0a6ae8c075b..ea14b4c2d57 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -7477,8 +7477,23 @@ ] }, "QuiesceState": { - "description": "See [`QuiesceStatus`] for more on Nexus quiescing.\n\nAt any given time, Nexus is always in one of these states:\n\n```text Running (normal operation) | | quiesce starts v WaitingForSagas (no new sagas are allowed, but some are still running) | | no more sagas running v WaitingForDb (no sagas running; no new db connections may be acquired by Nexus at-large, but some are still held) | | no more database connections held v Quiesced (no sagas running, no database connections in use) ```\n\nQuiescing is (currently) a one-way trip: once a Nexus process starts quiescing, it will never go back to normal operation. It will never go back to an earlier stage, either.", + "description": "See [`QuiesceStatus`] for more on Nexus quiescing.\n\nAt any given time, Nexus is always in one of these states:\n\n```text Undetermined (have not loaded persistent state; don't know yet) | | load persistent state and find we're not quiescing v Running (normal operation) | | quiesce starts v DrainingSagas (no new sagas are allowed, but some are still running) | | no more sagas running v DrainingDb (no sagas running; no new db connections may be | acquired by Nexus at-large, but some are still held) | | no more database connections held v RecordingQuiesce (everything is quiesced aside from one connection being | used to record our final quiesced state) | | finish recording quiesce state in database v Quiesced (no sagas running, no database connections in use) ```\n\nQuiescing is (currently) a one-way trip: once a Nexus process starts quiescing, it will never go back to normal operation. 
It will never go back to an earlier stage, either.", "oneOf": [ + { + "description": "We have not yet determined based on persistent state if we're supposed to be quiesced or not", + "type": "object", + "properties": { + "state": { + "type": "string", + "enum": [ + "undetermined" + ] + } + }, + "required": [ + "state" + ] + }, { "description": "Normal operation", "type": "object", @@ -7495,7 +7510,7 @@ ] }, { - "description": "New sagas disallowed, but some are still running.", + "description": "New sagas disallowed, but some are still running on some Nexus instances", "type": "object", "properties": { "quiesce_details": { @@ -7513,7 +7528,7 @@ "state": { "type": "string", "enum": [ - "waiting_for_sagas" + "draining_sagas" ] } }, @@ -7523,13 +7538,13 @@ ] }, { - "description": "No sagas running, no new database connections may be claimed, but some database connections are still held.", + "description": "No sagas running on any Nexus instances\n\nNo new database connections may be claimed, but some database connections are still held.", "type": "object", "properties": { "quiesce_details": { "type": "object", "properties": { - "duration_waiting_for_sagas": { + "duration_draining_sagas": { "$ref": "#/components/schemas/Duration" }, "time_requested": { @@ -7538,14 +7553,50 @@ } }, "required": [ - "duration_waiting_for_sagas", + "duration_draining_sagas", "time_requested" ] }, "state": { "type": "string", "enum": [ - "waiting_for_db" + "draining_db" + ] + } + }, + "required": [ + "quiesce_details", + "state" + ] + }, + { + "description": "No database connections in use except to record the final \"quiesced\" state", + "type": "object", + "properties": { + "quiesce_details": { + "type": "object", + "properties": { + "duration_draining_db": { + "$ref": "#/components/schemas/Duration" + }, + "duration_draining_sagas": { + "$ref": "#/components/schemas/Duration" + }, + "time_requested": { + "type": "string", + "format": "date-time" + } + }, + "required": [ + "duration_draining_db", + "duration_draining_sagas", + "time_requested" + ] + }, + "state": { + "type": "string", + "enum": [ + "recording_quiesce" ] } }, @@ -7561,13 +7612,16 @@ "quiesce_details": { "type": "object", "properties": { - "duration_total": { + "duration_draining_db": { "$ref": "#/components/schemas/Duration" }, - "duration_waiting_for_db": { + "duration_draining_sagas": { "$ref": "#/components/schemas/Duration" }, - "duration_waiting_for_sagas": { + "duration_recording_quiesce": { + "$ref": "#/components/schemas/Duration" + }, + "duration_total": { "$ref": "#/components/schemas/Duration" }, "time_quiesced": { @@ -7580,9 +7634,10 @@ } }, "required": [ + "duration_draining_db", + "duration_draining_sagas", + "duration_recording_quiesce", "duration_total", - "duration_waiting_for_db", - "duration_waiting_for_sagas", "time_quiesced", "time_requested" ] From 356d60b7681c6382da883bfaeee566b2a5e07fda Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 20 Aug 2025 16:29:50 -0700 Subject: [PATCH 05/22] tests need to wait for sagas to be enabled --- nexus/src/app/mod.rs | 8 ++++++++ nexus/src/app/quiesce.rs | 10 +++++----- nexus/src/lib.rs | 19 +++++++++++++++++++ nexus/types/src/quiesce.rs | 12 ++++++++++++ 4 files changed, 44 insertions(+), 5 deletions(-) diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index c8abeac6e05..c9c9e151614 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -621,6 +621,14 @@ impl Nexus { } } + // Waits for Nexus to determine whether sagas are supposed to be quiesced + // + // This is 
used by the test suite because most tests assume that sagas are + // operational as soon as they start. + pub(crate) async fn wait_for_saga_determination(&self) { + self.quiesce.sagas().wait_for_determination().await; + } + pub(crate) async fn external_tls_config( &self, tls_enabled: bool, diff --git a/nexus/src/app/quiesce.rs b/nexus/src/app/quiesce.rs index 17b28e7fc9b..cd5e3242ce5 100644 --- a/nexus/src/app/quiesce.rs +++ b/nexus/src/app/quiesce.rs @@ -92,13 +92,13 @@ impl NexusQuiesceHandle { } }); + // Immediately (synchronously) update the saga quiesce status. It's + // okay to do this even if there wasn't a change. + self.sagas.set_quiescing(quiescing); + if changed && quiescing { - // Immediately quiesce sagas. - self.sagas.set_quiescing(quiescing); // Asynchronously complete the rest of the quiesce process. - if quiescing { - tokio::spawn(do_quiesce(self.clone())); - } + tokio::spawn(do_quiesce(self.clone())); } } } diff --git a/nexus/src/lib.rs b/nexus/src/lib.rs index fc32a4824f5..49010a87bfb 100644 --- a/nexus/src/lib.rs +++ b/nexus/src/lib.rs @@ -138,6 +138,15 @@ impl Server { // the external server we're about to start. apictx.context.nexus.await_ip_allowlist_plumbing().await; + // Wait until Nexus has determined if sagas are supposed to be quiesced. + // This is not strictly necessary. The goal here is to prevent 503 + // errors to clients that reach this Nexus while it's starting up and + // before it's figured out that it doesn't need to quiesce. The risk of + // doing this is that Nexus gets stuck here, but that should only happen + // if it's unable to load the current blueprint, in which case + // something's pretty wrong and it's likely pretty stuck anyway. + apictx.context.nexus.wait_for_saga_determination().await; + // Launch the external server. let tls_config = apictx .context @@ -332,6 +341,16 @@ impl nexus_test_interface::NexusServer for Server { .await .expect("Could not initialize rack"); + // Now that we have a blueprint, determination of whether sagas are + // quiesced can complete. Wait for that so that tests can assume they + // can immediately kick off sagas. + internal_server + .apictx + .context + .nexus + .wait_for_saga_determination() + .await; + // Start the Nexus external API. Server::start(internal_server).await.unwrap() } diff --git a/nexus/types/src/quiesce.rs b/nexus/types/src/quiesce.rs index 378011be2cf..072a4ca3b1d 100644 --- a/nexus/types/src/quiesce.rs +++ b/nexus/types/src/quiesce.rs @@ -218,6 +218,18 @@ impl SagaQuiesceHandle { let _ = self.inner.subscribe().wait_for(|q| q.is_fully_drained()).await; } + /// Wait for the initial determination to be made about whether sagas are + /// allowed or not. 
+ pub async fn wait_for_determination(&self) { + let _ = self + .inner + .subscribe() + .wait_for(|q| { + q.new_sagas_allowed != SagasAllowed::DisallowedUnknown + }) + .await; + } + /// Returns information about running sagas (involves a clone) pub fn sagas_pending(&self) -> IdOrdMap { self.inner.borrow().sagas_pending.clone() From 5f43b6042c6e8df23abba9ed3b0c51a2b0b53da1 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Thu, 21 Aug 2025 11:47:43 -0700 Subject: [PATCH 06/22] need to activate blueprint loader after inserting initial blueprint --- nexus/src/app/rack.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nexus/src/app/rack.rs b/nexus/src/app/rack.rs index 6f71055e8dc..55e8ecc3603 100644 --- a/nexus/src/app/rack.rs +++ b/nexus/src/app/rack.rs @@ -740,7 +740,8 @@ impl super::Nexus { // We've potentially updated the list of DNS servers and the DNS // configuration for both internal and external DNS, plus the Silo - // certificates. Activate the relevant background tasks. + // certificates and target blueprint. Activate the relevant background + // tasks. for task in &[ &self.background_tasks.task_internal_dns_config, &self.background_tasks.task_internal_dns_servers, @@ -748,6 +749,7 @@ impl super::Nexus { &self.background_tasks.task_external_dns_servers, &self.background_tasks.task_external_endpoints, &self.background_tasks.task_inventory_collection, + &self.background_tasks.task_blueprint_loader, ] { self.background_tasks.activate(task); } From 127d5a805de428b0f0a0884d16e9868f130b4eb0 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Thu, 21 Aug 2025 14:38:51 -0700 Subject: [PATCH 07/22] add the "second" Nexus to the test suite blueprint; fix omdb tests --- common/src/address.rs | 26 +++- dev-tools/omdb/src/bin/omdb/db.rs | 12 +- dev-tools/omdb/tests/successes.out | 182 +++++++++++++++--------- dev-tools/omdb/tests/test_all_output.rs | 42 +++--- nexus/test-utils/src/lib.rs | 99 +++++++++---- 5 files changed, 238 insertions(+), 123 deletions(-) diff --git a/common/src/address.rs b/common/src/address.rs index 92863c44a42..0efc485ae89 100644 --- a/common/src/address.rs +++ b/common/src/address.rs @@ -373,7 +373,9 @@ pub fn get_64_subnet( /// /// The first address in the range is guaranteed to be no greater than the last /// address. -#[derive(Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize)] +#[derive( + Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize, Ord, PartialOrd, +)] #[serde(untagged)] pub enum IpRange { V4(Ipv4Range), @@ -507,7 +509,16 @@ impl From for IpRange { /// /// The first address must be less than or equal to the last address. #[derive( - Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema, + Clone, + Copy, + Debug, + PartialEq, + Eq, + Deserialize, + Serialize, + JsonSchema, + PartialOrd, + Ord, )] #[serde(try_from = "AnyIpv4Range")] pub struct Ipv4Range { @@ -571,7 +582,16 @@ impl TryFrom for Ipv4Range { /// /// The first address must be less than or equal to the last address. #[derive( - Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema, + PartialOrd, + Ord, + Clone, + Copy, + Debug, + PartialEq, + Eq, + Deserialize, + Serialize, + JsonSchema, )] #[serde(try_from = "AnyIpv6Range")] pub struct Ipv6Range { diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 8290739a734..1ca878908bd 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -5016,7 +5016,7 @@ async fn cmd_db_dns_diff( // Load the added and removed items. 
use nexus_db_schema::schema::dns_name::dsl; - let added = dsl::dns_name + let mut added = dsl::dns_name .filter(dsl::dns_zone_id.eq(zone.id)) .filter(dsl::version_added.eq(version.version)) .limit(i64::from(u32::from(limit))) @@ -5026,7 +5026,7 @@ async fn cmd_db_dns_diff( .context("loading added names")?; check_limit(&added, limit, || "loading added names"); - let removed = dsl::dns_name + let mut removed = dsl::dns_name .filter(dsl::dns_zone_id.eq(zone.id)) .filter(dsl::version_removed.eq(version.version)) .limit(i64::from(u32::from(limit))) @@ -5042,6 +5042,11 @@ async fn cmd_db_dns_diff( ); println!(""); + // This is kind of stupid-expensive, but there aren't a lot of records + // here and it's helpful for this output to be stable. + added.sort_by_cached_key(|k| format!("{} {:?}", k.name, k.records())); + removed.sort_by_cached_key(|k| format!("{} {:?}", k.name, k.records())); + for a in added { print_name("+", &a.name, a.records().context("parsing records")); } @@ -5097,7 +5102,8 @@ async fn cmd_db_dns_names( } }); - for (name, records) in names { + for (name, mut records) in names { + records.sort(); print_name("", &name, Ok(records)); } } diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 82577f26f53..bb8f36d4ef9 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -32,7 +32,9 @@ changes: names added: 3, names removed: 0 + @ NS ns1.oxide-dev.test + ns1 AAAA ::1 -+ test-suite-silo.sys A 127.0.0.1 ++ test-suite-silo.sys (records: 2) ++ A 127.0.0.1 ++ AAAA 100::1 --------------------------------------------- stderr: note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable @@ -46,7 +48,9 @@ External zone: oxide-dev.test NAME RECORDS @ NS ns1.oxide-dev.test ns1 AAAA ::1 - test-suite-silo.sys A 127.0.0.1 + test-suite-silo.sys (records: 2) + A 127.0.0.1 + AAAA 100::1 --------------------------------------------- stderr: note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable @@ -489,15 +493,21 @@ task: "nat_garbage_collector" task: "blueprint_loader" configured period: every m s - last completed activation: , triggered by a periodic timer firing + last completed activation: , triggered by an explicit signal started at (s ago) and ran for ms - last completion reported error: failed to read target blueprint: Internal Error: no target blueprint set + target blueprint: ............. + execution: disabled + created at: + status: first target blueprint task: "blueprint_executor" configured period: every m - last completed activation: , triggered by a periodic timer firing + last completed activation: , triggered by a dependent task completing started at (s ago) and ran for ms - last completion reported error: no blueprint + target blueprint: ............. + execution: disabled + status: (no event report found) + error: (none) task: "abandoned_vmm_reaper" configured period: every m @@ -531,7 +541,18 @@ task: "blueprint_rendezvous" configured period: every m last completed activation: , triggered by a dependent task completing started at (s ago) and ran for ms - last completion reported error: no blueprint + target blueprint: ............. + inventory collection: ..................... 
+ debug_dataset rendezvous counts: + num_inserted: 0 + num_already_exist: 0 + num_not_in_inventory: 0 + num_tombstoned: 0 + num_already_tombstoned: 0 + crucible_dataset rendezvous counts: + num_inserted: 0 + num_already_exist: 0 + num_not_in_inventory: 0 task: "chicken_switches_watcher" configured period: every s @@ -541,9 +562,9 @@ warning: unknown background task: "chicken_switches_watcher" (don't know how to task: "crdb_node_id_collector" configured period: every m - last completed activation: , triggered by a periodic timer firing + last completed activation: , triggered by a dependent task completing started at (s ago) and ran for ms - last completion reported error: no blueprint +warning: unknown background task: "crdb_node_id_collector" (don't know how to interpret details: Object {"errors": Array [Object {"err": String("failed to fetch node ID for zone ..................... at http://[::1]:REDACTED_PORT: Communication Error: error sending request for url (http://[::1]:REDACTED_PORT/node/id): error sending request for url (http://[::1]:REDACTED_PORT/node/id): client error (Connect): tcp connect error: Connection refused (os error 146)"), "zone_id": String(".....................")}], "nsuccess": Number(0)}) task: "decommissioned_disk_cleaner" configured period: every m @@ -844,16 +865,22 @@ stdout: task: "blueprint_loader" configured period: every m s currently executing: no - last completed activation: , triggered by a periodic timer firing + last completed activation: , triggered by an explicit signal started at (s ago) and ran for ms - last completion reported error: failed to read target blueprint: Internal Error: no target blueprint set + target blueprint: ............. + execution: disabled + created at: + status: first target blueprint task: "blueprint_executor" configured period: every m currently executing: no - last completed activation: , triggered by a periodic timer firing + last completed activation: , triggered by a dependent task completing started at (s ago) and ran for ms - last completion reported error: no blueprint + target blueprint: ............. + execution: disabled + status: (no event report found) + error: (none) --------------------------------------------- stderr: @@ -1001,16 +1028,22 @@ task: "nat_garbage_collector" task: "blueprint_loader" configured period: every m s currently executing: no - last completed activation: , triggered by a periodic timer firing + last completed activation: , triggered by an explicit signal started at (s ago) and ran for ms - last completion reported error: failed to read target blueprint: Internal Error: no target blueprint set + target blueprint: ............. + execution: disabled + created at: + status: first target blueprint task: "blueprint_executor" configured period: every m currently executing: no - last completed activation: , triggered by a periodic timer firing + last completed activation: , triggered by a dependent task completing started at (s ago) and ran for ms - last completion reported error: no blueprint + target blueprint: ............. + execution: disabled + status: (no event report found) + error: (none) task: "abandoned_vmm_reaper" configured period: every m @@ -1049,7 +1082,18 @@ task: "blueprint_rendezvous" currently executing: no last completed activation: , triggered by a dependent task completing started at (s ago) and ran for ms - last completion reported error: no blueprint + target blueprint: ............. + inventory collection: ..................... 
+ debug_dataset rendezvous counts: + num_inserted: 0 + num_already_exist: 0 + num_not_in_inventory: 0 + num_tombstoned: 0 + num_already_tombstoned: 0 + crucible_dataset rendezvous counts: + num_inserted: 0 + num_already_exist: 0 + num_not_in_inventory: 0 task: "chicken_switches_watcher" configured period: every s @@ -1061,9 +1105,9 @@ warning: unknown background task: "chicken_switches_watcher" (don't know how to task: "crdb_node_id_collector" configured period: every m currently executing: no - last completed activation: , triggered by a periodic timer firing + last completed activation: , triggered by a dependent task completing started at (s ago) and ran for ms - last completion reported error: no blueprint +warning: unknown background task: "crdb_node_id_collector" (don't know how to interpret details: Object {"errors": Array [Object {"err": String("failed to fetch node ID for zone ..................... at http://[::1]:REDACTED_PORT: Communication Error: error sending request for url (http://[::1]:REDACTED_PORT/node/id): error sending request for url (http://[::1]:REDACTED_PORT/node/id): client error (Connect): tcp connect error: Connection refused (os error 146)"), "zone_id": String(".....................")}], "nsuccess": Number(0)}) task: "decommissioned_disk_cleaner" configured period: every m @@ -1355,53 +1399,6 @@ task: "webhook_deliverator" stderr: note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ ============================================= -EXECUTING COMMAND: omdb ["nexus", "chicken-switches", "show", "current"] -termination: Exited(0) ---------------------------------------------- -stdout: -No chicken switches enabled ---------------------------------------------- -stderr: -note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ -============================================= -EXECUTING COMMAND: omdb ["-w", "nexus", "chicken-switches", "set", "--planner-enabled", "true"] -termination: Exited(0) ---------------------------------------------- -stdout: -chicken switches updated to version 1: - planner enabled: true - planner switches: - add zones with mupdate override: true ---------------------------------------------- -stderr: -note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ -============================================= -EXECUTING COMMAND: omdb ["-w", "nexus", "chicken-switches", "set", "--add-zones-with-mupdate-override", "false"] -termination: Exited(0) ---------------------------------------------- -stdout: -chicken switches updated to version 2: - planner enabled: true (unchanged) - planner switches: - * add zones with mupdate override: true -> false ---------------------------------------------- -stderr: -note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ -============================================= -EXECUTING COMMAND: omdb ["nexus", "chicken-switches", "show", "current"] -termination: Exited(0) ---------------------------------------------- -stdout: -Reconfigurator chicken switches: - version: 2 - modified time: - planner enabled: true - planner switches: - add zones with mupdate override: false ---------------------------------------------- -stderr: -note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ -============================================= EXECUTING COMMAND: omdb ["nexus", "sagas", "list"] termination: Exited(0) --------------------------------------------- @@ -1538,6 +1535,7 @@ parent: oxp_...................../crypt/zone/oxz_external_dns_..................... ..................... 
in service none none off oxp_...................../crypt/zone/oxz_internal_dns_..................... ..................... in service none none off oxp_...................../crypt/zone/oxz_nexus_..................... ..................... in service none none off + oxp_...................../crypt/zone/oxz_nexus_..................... ..................... in service none none off oxp_...................../crypt/zone/oxz_ntp_..................... ..................... in service none none off @@ -1552,6 +1550,7 @@ parent: external_dns ..................... install dataset in service ::1 internal_dns ..................... install dataset in service ::1 nexus ..................... install dataset in service ::ffff:127.0.0.1 + nexus ..................... install dataset in service ::1 COCKROACHDB SETTINGS: @@ -1662,6 +1661,7 @@ parent: oxp_...................../crypt/zone/oxz_external_dns_..................... ..................... in service none none off oxp_...................../crypt/zone/oxz_internal_dns_..................... ..................... in service none none off oxp_...................../crypt/zone/oxz_nexus_..................... ..................... in service none none off + oxp_...................../crypt/zone/oxz_nexus_..................... ..................... in service none none off oxp_...................../crypt/zone/oxz_ntp_..................... ..................... in service none none off @@ -1676,6 +1676,7 @@ parent: external_dns ..................... install dataset in service ::1 internal_dns ..................... install dataset in service ::1 nexus ..................... install dataset in service ::ffff:127.0.0.1 + nexus ..................... install dataset in service ::1 COCKROACHDB SETTINGS: @@ -1739,6 +1740,53 @@ stderr: note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ Error: `blueprint2_id` was not specified and blueprint1 has no parent ============================================= +EXECUTING COMMAND: omdb ["nexus", "chicken-switches", "show", "current"] +termination: Exited(0) +--------------------------------------------- +stdout: +No chicken switches enabled +--------------------------------------------- +stderr: +note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ +============================================= +EXECUTING COMMAND: omdb ["-w", "nexus", "chicken-switches", "set", "--planner-enabled", "true"] +termination: Exited(0) +--------------------------------------------- +stdout: +chicken switches updated to version 1: + planner enabled: true + planner switches: + add zones with mupdate override: true +--------------------------------------------- +stderr: +note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ +============================================= +EXECUTING COMMAND: omdb ["-w", "nexus", "chicken-switches", "set", "--add-zones-with-mupdate-override", "false"] +termination: Exited(0) +--------------------------------------------- +stdout: +chicken switches updated to version 2: + planner enabled: true (unchanged) + planner switches: + * add zones with mupdate override: true -> false +--------------------------------------------- +stderr: +note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ +============================================= +EXECUTING COMMAND: omdb ["nexus", "chicken-switches", "show", "current"] +termination: Exited(0) +--------------------------------------------- +stdout: +Reconfigurator chicken switches: + version: 2 + modified time: + planner enabled: true + planner switches: + add zones with mupdate override: 
false +--------------------------------------------- +stderr: +note: using Nexus URL http://127.0.0.1:REDACTED_PORT/ +============================================= EXECUTING COMMAND: omdb ["reconfigurator", "export", ""] termination: Exited(0) --------------------------------------------- diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index a46f2184720..2dc7bdbb2a5 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -208,27 +208,6 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { &["nexus", "background-tasks", "show", "dns_internal"], &["nexus", "background-tasks", "show", "dns_external"], &["nexus", "background-tasks", "show", "all"], - // chicken switches: show and set - &["nexus", "chicken-switches", "show", "current"], - &[ - "-w", - "nexus", - "chicken-switches", - "set", - "--planner-enabled", - "true", - ], - &[ - "-w", - "nexus", - "chicken-switches", - "set", - "--add-zones-with-mupdate-override", - "false", - ], - // After the set commands above, we should see chicken switches - // populated. - &["nexus", "chicken-switches", "show", "current"], &["nexus", "sagas", "list"], &["--destructive", "nexus", "sagas", "demo-create"], &["nexus", "sagas", "list"], @@ -251,6 +230,27 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { ], // This one should fail because it has no parent. &["nexus", "blueprints", "diff", &initial_blueprint_id], + // chicken switches: show and set + &["nexus", "chicken-switches", "show", "current"], + &[ + "-w", + "nexus", + "chicken-switches", + "set", + "--planner-enabled", + "true", + ], + &[ + "-w", + "nexus", + "chicken-switches", + "set", + "--add-zones-with-mupdate-override", + "false", + ], + // After the set commands above, we should see chicken switches + // populated. + &["nexus", "chicken-switches", "show", "current"], &["reconfigurator", "export", tmppath.as_str()], // We can't easily test the sled agent output because that's only // provided by a real sled agent, which is not available in the diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index e61b2815362..01b4e83ee7e 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -303,8 +303,6 @@ pub fn load_test_config() -> NexusConfig { // - the CockroachDB TCP listen port be 0, and // - if the log will go to a file then the path must be the sentinel value // "UNUSED". - // - each Nexus created for testing gets its own id so they don't see each - // others sagas and try to recover them // // (See LogContext::new() for details.) Given these restrictions, it may // seem barely worth reading a config file at all. However, developers can @@ -312,10 +310,8 @@ pub fn load_test_config() -> NexusConfig { // configuration options, we expect many of those can be usefully configured // (and reconfigured) for the test suite. 
let config_file_path = Utf8Path::new("tests/config.test.toml"); - let mut config = NexusConfig::from_file(config_file_path) - .expect("failed to load config.test.toml"); - config.deployment.id = OmicronZoneUuid::new_v4(); - config + NexusConfig::from_file(config_file_path) + .expect("failed to load config.test.toml") } pub async fn test_setup( @@ -835,47 +831,97 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { 0, ); - let mac = self - .rack_init_builder - .mac_addrs - .next() - .expect("ran out of MAC addresses"); - let external_address = - self.config.deployment.dropshot_external.dropshot.bind_address.ip(); - let nexus_id = self.config.deployment.id; self.rack_init_builder.add_service_to_dns( - nexus_id, + self.config.deployment.id, address, ServiceName::Nexus, ); + self.record_nexus_zone(self.config.clone(), address, 0); + self.nexus_internal = Some(nexus_internal); + self.nexus_internal_addr = Some(nexus_internal_addr); + + // Besides the Nexus that we just started, add an entry in the blueprint + // for the Nexus that developers can start using + // nexus/examples/config-second.toml. + // + // The details in its BlueprintZoneType mostly don't matter because + // those are mostly used for DNS (which we don't usually need here) and + // to tell sled agent how to start the zone (which isn't what's going on + // here). But it does need to be present for it to be able to determine + // on startup if it needs to quiesce. + let second_nexus_config_path = + Utf8Path::new(env!("CARGO_MANIFEST_DIR")) + .join("../examples/config-second.toml"); + let mut second_nexus_config = + NexusConfig::from_file(&second_nexus_config_path).unwrap(); + // Okay, this is particularly awful. The system does not allow multiple + // zones to use the same external IP -- makes sense. But it actually is + // fine here because the IP is localhost and we're using host + // networking, and we've already ensured that the ports will be unique. + // Avoid tripping up the validation by using some other IP. This won't + // be used for anything. Pick something that's not in use anywhere + // else. This range is guaranteed by RFC 6666 to discard traffic. 
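A quick standalone check of that claim (plain std only, separate from the patch): "100::1" parses as an IPv6 address whose leading 64 bits are the discard-only prefix 100::/64 defined by RFC 6666, so nothing will ever route traffic sent to it.

    use std::net::Ipv6Addr;

    fn main() {
        // "100::1" lies inside 100::/64, the discard-only prefix from RFC 6666.
        let addr: Ipv6Addr = "100::1".parse().unwrap();
        let segments = addr.segments();
        // 100::/64 means the leading 16-bit group is 0x0100 and the next
        // three groups are zero.
        assert_eq!(segments[0], 0x0100);
        assert!(segments[1..4].iter().all(|&group| group == 0));
        println!("{addr} is inside the discard-only prefix 100::/64");
    }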
+ second_nexus_config + .deployment + .dropshot_external + .dropshot + .bind_address + .set_ip("100::1".parse().unwrap()); + let SocketAddr::V6(second_internal_address) = + second_nexus_config.deployment.dropshot_internal.bind_address + else { + panic!( + "expected IPv6 address for dropshot_internal in \ + nexus/examples/config-second.toml" + ); + }; + self.record_nexus_zone(second_nexus_config, second_internal_address, 1); + Ok(()) + } + fn record_nexus_zone( + &mut self, + config: NexusConfig, + internal_address: SocketAddrV6, + which: usize, + ) { + let id = config.deployment.id; + let mac = self + .rack_init_builder + .mac_addrs + .next() + .expect("ran out of MAC addresses"); self.blueprint_zones.push(BlueprintZoneConfig { disposition: BlueprintZoneDisposition::InService, - id: nexus_id, + id, filesystem_pool: ZpoolName::new_external(ZpoolUuid::new_v4()), zone_type: BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { - external_dns_servers: self - .config + external_dns_servers: config .deployment .external_dns_servers .clone(), external_ip: OmicronZoneExternalFloatingIp { id: ExternalIpUuid::new_v4(), - ip: external_address, + ip: config + .deployment + .dropshot_external + .dropshot + .bind_address + .ip(), }, - external_tls: self.config.deployment.dropshot_external.tls, - internal_address: address, + external_tls: config.deployment.dropshot_external.tls, + internal_address, nic: NetworkInterface { id: Uuid::new_v4(), ip: NEXUS_OPTE_IPV4_SUBNET - .nth(NUM_INITIAL_RESERVED_IP_ADDRESSES + 1) + .nth(NUM_INITIAL_RESERVED_IP_ADDRESSES + 1 + which) .unwrap() .into(), kind: NetworkInterfaceKind::Service { - id: nexus_id.into_untyped_uuid(), + id: id.into_untyped_uuid(), }, mac, - name: format!("nexus-{}", nexus_id).parse().unwrap(), + name: format!("nexus-{}", id).parse().unwrap(), primary: true, slot: 0, subnet: (*NEXUS_OPTE_IPV4_SUBNET).into(), @@ -886,11 +932,6 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { }), image_source: BlueprintZoneImageSource::InstallDataset, }); - - self.nexus_internal = Some(nexus_internal); - self.nexus_internal_addr = Some(nexus_internal_addr); - - Ok(()) } pub async fn populate_internal_dns(&mut self) { From fe85a10acfd930e668640e657348f9ccbba2df27 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 22 Aug 2025 16:27:02 -0700 Subject: [PATCH 08/22] review feedback --- nexus/reconfigurator/execution/src/lib.rs | 5 +- nexus/src/app/quiesce.rs | 21 +++++-- nexus/types/src/quiesce.rs | 67 +++++++++++------------ 3 files changed, 51 insertions(+), 42 deletions(-) diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index b9f83912bc8..a4f802b3c97 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -615,8 +615,9 @@ fn register_reassign_sagas_step<'a>( Ok(saga_quiesce .reassign_sagas(async || { // For any expunged Nexus zones, re-assign in-progress - // sagas to some other Nexus. If this fails for some - // reason, it doesn't affect anything else. + // sagas to `nexus_id` (which, in practice, is + // ourselves). If this fails for some reason, it + // doesn't affect anything else. 
let sec_id = nexus_db_model::SecId::from(nexus_id); let reassigned = sagas::reassign_sagas_from_expunged( opctx, datastore, blueprint, sec_id, diff --git a/nexus/src/app/quiesce.rs b/nexus/src/app/quiesce.rs index cd5e3242ce5..3b36ec97991 100644 --- a/nexus/src/app/quiesce.rs +++ b/nexus/src/app/quiesce.rs @@ -80,13 +80,22 @@ impl NexusQuiesceHandle { *q = new_state; true } - QuiesceState::Running if quiescing => { - info!(&self.log, "quiesce starting"); - *q = new_state; - true + QuiesceState::Running => { + if quiescing { + info!(&self.log, "quiesce starting"); + *q = new_state; + true + } else { + // We're not quiescing and not being asked to quiesce. + // Nothing to do. + false + } } - _ => { - // All other cases are either impossible or no-ops. + QuiesceState::DrainingSagas { .. } + | QuiesceState::DrainingDb { .. } + | QuiesceState::RecordingQuiesce { .. } + | QuiesceState::Quiesced { .. } => { + // Once we start quiescing, we never go back. false } } diff --git a/nexus/types/src/quiesce.rs b/nexus/types/src/quiesce.rs index 072a4ca3b1d..7c2b3ad42dd 100644 --- a/nexus/types/src/quiesce.rs +++ b/nexus/types/src/quiesce.rs @@ -163,14 +163,13 @@ impl SagaQuiesceHandle { /// cannot then re-enable sagas. pub fn set_quiescing(&self, quiescing: bool) { self.inner.send_if_modified(|q| { - let new_state = if quiescing { - SagasAllowed::DisallowedQuiesce - } else { - SagasAllowed::Allowed - }; - match q.new_sagas_allowed { SagasAllowed::DisallowedUnknown => { + let new_state = if quiescing { + SagasAllowed::DisallowedQuiesce + } else { + SagasAllowed::Allowed + }; info!( &self.log, "initial quiesce state"; @@ -179,23 +178,25 @@ impl SagaQuiesceHandle { q.new_sagas_allowed = new_state; true } - SagasAllowed::Allowed if quiescing => { - info!(&self.log, "saga quiesce starting"); - q.new_sagas_allowed = SagasAllowed::DisallowedQuiesce; - true + SagasAllowed::Allowed => { + if quiescing { + info!(&self.log, "saga quiesce starting"); + q.new_sagas_allowed = SagasAllowed::DisallowedQuiesce; + true + } else { + false + } } - SagasAllowed::DisallowedQuiesce if !quiescing => { - // This should be impossible. Report a problem. - error!( - &self.log, - "asked to stop quiescing after previously quiescing" - ); - false - } - _ => { - // There's no transition happening in these cases: - // - SagasAllowed::Allowed and we're not quiescing - // - SagasAllowed::DisallowedQuiesce and we're now quiescing + SagasAllowed::DisallowedQuiesce => { + if !quiescing { + // This should be impossible. Report a problem. + error!( + &self.log, + "asked to stop quiescing after previously quiescing" + ); + } + + // Either way, we're not changing anything. 
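The rule these match arms enforce is a one-way latch: `set_quiescing` only ever moves into quiescing, never back out, and every other combination is a no-op. A minimal sketch of the same rule on a bare tokio watch channel (hypothetical `State` type, not the Nexus handles in this patch; assumes tokio with the "sync" feature):

    use tokio::sync::watch;

    #[derive(Clone, Copy, Debug, PartialEq)]
    enum State {
        Running,
        Quiescing,
    }

    // Returns true only on the Running -> Quiescing transition. Asking to
    // quiesce again, or asking to un-quiesce, changes nothing.
    fn set_quiescing(tx: &watch::Sender<State>, quiescing: bool) -> bool {
        tx.send_if_modified(|state| match (*state, quiescing) {
            (State::Running, true) => {
                *state = State::Quiescing;
                true
            }
            _ => false,
        })
    }

    fn main() {
        let (tx, rx) = watch::channel(State::Running);
        assert!(!set_quiescing(&tx, false)); // not quiescing, not asked to
        assert!(set_quiescing(&tx, true));   // the one allowed transition
        assert!(!set_quiescing(&tx, false)); // once quiescing, never go back
        assert_eq!(*rx.borrow(), State::Quiescing);
    }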
false } } @@ -393,7 +394,7 @@ impl SagaQuiesceHandle { saga_name: &steno::SagaName, ) -> Result { let mut error: Option = None; - let okay = self.inner.send_if_modified(|q| { + self.inner.send_if_modified(|q| { match q.new_sagas_allowed { SagasAllowed::Allowed => (), SagasAllowed::DisallowedQuiesce => { @@ -417,7 +418,15 @@ impl SagaQuiesceHandle { true }); - if okay { + if let Some(error) = error { + info!( + &self.log, + "disallowing saga creation"; + "saga_id" => saga_id.to_string(), + InlineErrorChain::new(&error), + ); + Err(error) + } else { let log = self.log.new(o!("saga_id" => saga_id.to_string())); info!(&log, "tracking newly created saga"); Ok(NewlyPendingSagaRef { @@ -426,16 +435,6 @@ impl SagaQuiesceHandle { saga_id, init_finished: false, }) - } else { - let error = - error.expect("error is always set when disallowing sagas"); - info!( - &self.log, - "disallowing saga creation"; - "saga_id" => saga_id.to_string(), - InlineErrorChain::new(&error), - ); - Err(error) } } From b8d3ee30a1196de33497fa7eab055404765fdb88 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 22 Aug 2025 17:03:18 -0700 Subject: [PATCH 09/22] fix tests on GNU/Linux --- dev-tools/omdb/tests/successes.out | 4 ++-- dev-tools/omdb/tests/test_all_output.rs | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index bb8f36d4ef9..3c47bcc6c53 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -564,7 +564,7 @@ task: "crdb_node_id_collector" configured period: every m last completed activation: , triggered by a dependent task completing started at (s ago) and ran for ms -warning: unknown background task: "crdb_node_id_collector" (don't know how to interpret details: Object {"errors": Array [Object {"err": String("failed to fetch node ID for zone ..................... at http://[::1]:REDACTED_PORT: Communication Error: error sending request for url (http://[::1]:REDACTED_PORT/node/id): error sending request for url (http://[::1]:REDACTED_PORT/node/id): client error (Connect): tcp connect error: Connection refused (os error 146)"), "zone_id": String(".....................")}], "nsuccess": Number(0)}) +warning: unknown background task: "crdb_node_id_collector" (don't know how to interpret details: Object {"errors": Array [Object {"err": String("failed to fetch node ID for zone ..................... at http://[::1]:REDACTED_PORT: Communication Error: error sending request for url (http://[::1]:REDACTED_PORT/node/id): error sending request for url (http://[::1]:REDACTED_PORT/node/id): client error (Connect): tcp connect error: Connection refused (os error )"), "zone_id": String(".....................")}], "nsuccess": Number(0)}) task: "decommissioned_disk_cleaner" configured period: every m @@ -1107,7 +1107,7 @@ task: "crdb_node_id_collector" currently executing: no last completed activation: , triggered by a dependent task completing started at (s ago) and ran for ms -warning: unknown background task: "crdb_node_id_collector" (don't know how to interpret details: Object {"errors": Array [Object {"err": String("failed to fetch node ID for zone ..................... 
at http://[::1]:REDACTED_PORT: Communication Error: error sending request for url (http://[::1]:REDACTED_PORT/node/id): error sending request for url (http://[::1]:REDACTED_PORT/node/id): client error (Connect): tcp connect error: Connection refused (os error 146)"), "zone_id": String(".....................")}], "nsuccess": Number(0)}) +warning: unknown background task: "crdb_node_id_collector" (don't know how to interpret details: Object {"errors": Array [Object {"err": String("failed to fetch node ID for zone ..................... at http://[::1]:REDACTED_PORT: Communication Error: error sending request for url (http://[::1]:REDACTED_PORT/node/id): error sending request for url (http://[::1]:REDACTED_PORT/node/id): client error (Connect): tcp connect error: Connection refused (os error )"), "zone_id": String(".....................")}], "nsuccess": Number(0)}) task: "decommissioned_disk_cleaner" configured period: every m diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index 2dc7bdbb2a5..11db63156a1 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -264,7 +264,9 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) { .extra_variable_length( "cockroachdb_fingerprint", &initial_blueprint.cockroachdb_fingerprint, - ); + ) + // Error numbers vary between operating systems. + .field("os error", r"\d+"); let crdb_version = initial_blueprint.cockroachdb_setting_preserve_downgrade.to_string(); From a6f2f63b07005b46a0e159bfef1f85cd01bfe899 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 22 Aug 2025 17:43:43 -0700 Subject: [PATCH 10/22] fix end to end dns test --- dev-tools/omicron-dev/src/main.rs | 4 ++-- nexus/test-utils/src/lib.rs | 23 ++++++++++++++++++++++- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/dev-tools/omicron-dev/src/main.rs b/dev-tools/omicron-dev/src/main.rs index 456154243d4..6c8f22f9473 100644 --- a/dev-tools/omicron-dev/src/main.rs +++ b/dev-tools/omicron-dev/src/main.rs @@ -95,8 +95,8 @@ impl RunAllArgs { println!("omicron-dev: services are running."); // Print out basic information about what was started. - // NOTE: The stdout strings here are not intended to be stable, but they are - // used by the test suite. + // NOTE: The stdout strings here are not intended to be stable, but they + // are used by the test suite. let addr = cptestctx.external_client.bind_address; println!("omicron-dev: nexus external API: {:?}", addr); println!( diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 01b4e83ee7e..38ec064de88 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -839,7 +839,12 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { self.record_nexus_zone(self.config.clone(), address, 0); self.nexus_internal = Some(nexus_internal); self.nexus_internal_addr = Some(nexus_internal_addr); + Ok(()) + } + pub async fn configure_second_nexus(&mut self) { + let log = &self.logctx.log; + debug!(log, "Configuring second Nexus (not to run)"); // Besides the Nexus that we just started, add an entry in the blueprint // for the Nexus that developers can start using // nexus/examples/config-second.toml. 
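The `os error` redaction added to test_all_output.rs above exists because errno values differ per platform (Connection refused is 146 on illumos but 111 on Linux). A standalone sketch of the same normalization using the regex crate directly (the real harness uses its own redaction helper, not this function):

    use regex::Regex;

    // Strip the platform-specific errno so expected output compares equal
    // on any OS, producing the "(os error )" form seen in successes.out.
    fn redact_os_error(s: &str) -> String {
        Regex::new(r"os error \d+")
            .unwrap()
            .replace_all(s, "os error ")
            .into_owned()
    }

    fn main() {
        let raw = "tcp connect error: Connection refused (os error 146)";
        assert_eq!(
            redact_os_error(raw),
            "tcp connect error: Connection refused (os error )"
        );
    }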
@@ -876,7 +881,6 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> { ); }; self.record_nexus_zone(second_nexus_config, second_internal_address, 1); - Ok(()) } fn record_nexus_zone( @@ -1656,6 +1660,7 @@ pub async fn omicron_dev_setup_with_config( sim::SimMode::Auto, None, extra_sled_agents, + true, ) .await) } @@ -1675,6 +1680,7 @@ pub async fn test_setup_with_config( sim_mode, initial_cert, extra_sled_agents, + false, ) .await } @@ -1685,6 +1691,7 @@ async fn setup_with_config_impl( sim_mode: sim::SimMode, initial_cert: Option, extra_sled_agents: u16, + second_nexus: bool, ) -> ControlPlaneTestContext { const STEP_TIMEOUT: Duration = Duration::from_secs(60); @@ -1818,6 +1825,20 @@ async fn setup_with_config_impl( ) .await; + if second_nexus { + builder + .init_with_steps( + vec![( + "configure_second_nexus", + Box::new(|builder| { + builder.configure_second_nexus().boxed() + }), + )], + STEP_TIMEOUT, + ) + .await; + } + // The first and second sled agents have special UUIDs, and any extra ones // after that are random. From b09b83f3470923bf00dcea052f18120b5aad2050 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Sat, 23 Aug 2025 14:38:37 -0700 Subject: [PATCH 11/22] fix omdb test --- dev-tools/omdb/tests/successes.out | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 3c47bcc6c53..a5cdb0c7910 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -32,9 +32,7 @@ changes: names added: 3, names removed: 0 + @ NS ns1.oxide-dev.test + ns1 AAAA ::1 -+ test-suite-silo.sys (records: 2) -+ A 127.0.0.1 -+ AAAA 100::1 ++ test-suite-silo.sys A 127.0.0.1 --------------------------------------------- stderr: note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable @@ -48,9 +46,7 @@ External zone: oxide-dev.test NAME RECORDS @ NS ns1.oxide-dev.test ns1 AAAA ::1 - test-suite-silo.sys (records: 2) - A 127.0.0.1 - AAAA 100::1 + test-suite-silo.sys A 127.0.0.1 --------------------------------------------- stderr: note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable @@ -1535,7 +1531,6 @@ parent: oxp_...................../crypt/zone/oxz_external_dns_..................... ..................... in service none none off oxp_...................../crypt/zone/oxz_internal_dns_..................... ..................... in service none none off oxp_...................../crypt/zone/oxz_nexus_..................... ..................... in service none none off - oxp_...................../crypt/zone/oxz_nexus_..................... ..................... in service none none off oxp_...................../crypt/zone/oxz_ntp_..................... ..................... in service none none off @@ -1550,7 +1545,6 @@ parent: external_dns ..................... install dataset in service ::1 internal_dns ..................... install dataset in service ::1 nexus ..................... install dataset in service ::ffff:127.0.0.1 - nexus ..................... install dataset in service ::1 COCKROACHDB SETTINGS: @@ -1661,7 +1655,6 @@ parent: oxp_...................../crypt/zone/oxz_external_dns_..................... ..................... in service none none off oxp_...................../crypt/zone/oxz_internal_dns_..................... ..................... in service none none off oxp_...................../crypt/zone/oxz_nexus_..................... ..................... 
in service none none off - oxp_...................../crypt/zone/oxz_nexus_..................... ..................... in service none none off oxp_...................../crypt/zone/oxz_ntp_..................... ..................... in service none none off @@ -1676,7 +1669,6 @@ parent: external_dns ..................... install dataset in service ::1 internal_dns ..................... install dataset in service ::1 nexus ..................... install dataset in service ::ffff:127.0.0.1 - nexus ..................... install dataset in service ::1 COCKROACHDB SETTINGS: From 912ea8f2040ce38e9df0483e29f6049de6c3a6ad Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 25 Aug 2025 10:53:24 -0700 Subject: [PATCH 12/22] add test that Nexus quiesces when reading a blueprint saying so --- nexus/tests/integration_tests/mod.rs | 1 + nexus/tests/integration_tests/quiesce.rs | 139 +++++++++++++++++++++++ 2 files changed, 140 insertions(+) create mode 100644 nexus/tests/integration_tests/quiesce.rs diff --git a/nexus/tests/integration_tests/mod.rs b/nexus/tests/integration_tests/mod.rs index 497585cceb8..c0ea06dcb7d 100644 --- a/nexus/tests/integration_tests/mod.rs +++ b/nexus/tests/integration_tests/mod.rs @@ -35,6 +35,7 @@ mod pantry; mod password_login; mod probe; mod projects; +mod quiesce; mod quotas; mod rack; mod role_assignments; diff --git a/nexus/tests/integration_tests/quiesce.rs b/nexus/tests/integration_tests/quiesce.rs new file mode 100644 index 00000000000..d5ef6bb7e1c --- /dev/null +++ b/nexus/tests/integration_tests/quiesce.rs @@ -0,0 +1,139 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use anyhow::{Context, anyhow}; +use nexus_auth::context::OpContext; +use nexus_client::types::QuiesceState; +use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; +use nexus_reconfigurator_planning::planner::PlannerRng; +use nexus_reconfigurator_preparation::PlanningInputFromDb; +use nexus_test_interface::NexusServer; +use nexus_test_utils_macros::nexus_test; +use nexus_types::deployment::BlueprintTargetSet; +use nexus_types::deployment::PlannerChickenSwitches; +use omicron_common::api::external::Error; +use omicron_test_utils::dev::poll::CondCheckError; +use omicron_test_utils::dev::poll::wait_for_condition; +use omicron_uuid_kinds::GenericUuid; +use std::time::Duration; + +type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + +/// Tests that Nexus quiesces when the blueprint says that it should +#[nexus_test] +async fn test_quiesce(cptestctx: &ControlPlaneTestContext) { + let log = &cptestctx.logctx.log; + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests(log.clone(), datastore.clone()); + let nexus_internal_url = format!( + "http://{}", + cptestctx.server.get_http_server_internal_address().await + ); + let nexus_client = + nexus_client::Client::new(&nexus_internal_url, log.clone()); + + // Collect what we need to modify the blueprint. 
+ let collection = wait_for_condition( + || async { + let collection = datastore + .inventory_get_latest_collection(&opctx) + .await + .map_err(CondCheckError::Failed)?; + match collection { + Some(s) => Ok(s), + None => Err(CondCheckError::::NotYet), + } + }, + &Duration::from_secs(1), + &Duration::from_secs(60), + ) + .await + .expect("initial inventory collection"); + + let chicken_switches = datastore + .reconfigurator_chicken_switches_get_latest(&opctx) + .await + .expect("obtained latest chicken switches") + .map_or_else(PlannerChickenSwitches::default, |cs| { + cs.switches.planner_switches + }); + let planning_input = PlanningInputFromDb::assemble( + &opctx, + &datastore, + chicken_switches, + None, + ) + .await + .expect("planning input"); + let target_blueprint = nexus + .blueprint_target_view(&opctx) + .await + .expect("fetch current target config"); + let blueprint1 = nexus + .blueprint_view(&opctx, *target_blueprint.target_id.as_untyped_uuid()) + .await + .expect("fetch current target blueprint"); + + // Now, update the target blueprint to reflect that Nexus should quiesce. + // We don't need it to be enabled to still reflect quiescing. + let mut builder = BlueprintBuilder::new_based_on( + log, + &blueprint1, + &planning_input, + &collection, + "test-suite", + PlannerRng::from_entropy(), + ) + .expect("creating BlueprintBuilder"); + builder + .set_nexus_generation( + blueprint1.nexus_generation, + blueprint1.nexus_generation.next(), + ) + .expect("failed to set blueprint's Nexus generation"); + let blueprint2 = builder.build(); + nexus + .blueprint_import(&opctx, blueprint2.clone()) + .await + .expect("importing new blueprint"); + nexus + .blueprint_target_set( + &opctx, + BlueprintTargetSet { enabled: false, target_id: blueprint2.id }, + ) + .await + .expect("setting new target"); + + // Wait for Nexus to quiesce. + let _ = wait_for_condition( + || async { + let quiesce = nexus_client + .quiesce_get() + .await + .context("fetching quiesce state") + .map_err(CondCheckError::Failed)? + .into_inner(); + eprintln!("quiesce state: {:#?}\n", quiesce); + match quiesce.state { + QuiesceState::Undetermined => { + Err(CondCheckError::Failed(anyhow!( + "quiesce state should have been determined before \ + test started" + ))) + } + QuiesceState::Running => Err(CondCheckError::NotYet), + QuiesceState::DrainingSagas { .. } + | QuiesceState::DrainingDb { .. } + | QuiesceState::RecordingQuiesce { .. } + | QuiesceState::Quiesced { .. 
} => Ok(()), + } + }, + &Duration::from_millis(50), + &Duration::from_secs(30), + ) + .await + .expect("Nexus should have quiesced"); +} From 5ce4870599ce7fa04218d655254d6ef69adb6132 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 25 Aug 2025 12:22:27 -0700 Subject: [PATCH 13/22] Remove sled_add_zone_nexus_internal, use config-based version --- nexus/db-queries/src/db/datastore/vpc.rs | 21 +++- .../planning/src/blueprint_builder/builder.rs | 106 +++++------------- nexus/reconfigurator/planning/src/example.rs | 11 +- 3 files changed, 54 insertions(+), 84 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/vpc.rs b/nexus/db-queries/src/db/datastore/vpc.rs index 0bfe2e3bdfe..399a595e9f8 100644 --- a/nexus/db-queries/src/db/datastore/vpc.rs +++ b/nexus/db-queries/src/db/datastore/vpc.rs @@ -3330,12 +3330,16 @@ mod tests { ) .expect("ensured disks"); } - let must_have_nexus_zones = false; + let external_tls = false; + let external_dns_servers = vec![]; + let nexus_generation = builder.parent_blueprint().nexus_generation; builder - .sled_add_zone_nexus_internal( + .sled_add_zone_nexus_with_config( sled_ids[2], + external_tls, + external_dns_servers, BlueprintZoneImageSource::InstallDataset, - must_have_nexus_zones, + nexus_generation, ) .expect("added nexus to third sled"); builder.build() @@ -3405,12 +3409,17 @@ mod tests { ) .expect("created blueprint builder"); for &sled_id in &sled_ids { - let must_have_nexus_zones = false; + let external_tls = false; + let external_dns_servers = vec![]; + let nexus_generation = + builder.parent_blueprint().nexus_generation; builder - .sled_add_zone_nexus_internal( + .sled_add_zone_nexus_with_config( sled_id, + external_tls, + external_dns_servers, BlueprintZoneImageSource::InstallDataset, - must_have_nexus_zones, + nexus_generation, ) .expect("added nexus to third sled"); } diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index 6d987482152..0c76a7e6ad3 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -1526,19 +1526,6 @@ impl<'a> BlueprintBuilder<'a> { Ok(Ensure::Added) } - pub fn sled_add_zone_nexus( - &mut self, - sled_id: SledUuid, - image_source: BlueprintZoneImageSource, - ) -> Result<(), Error> { - let must_have_nexus_zones = true; - self.sled_add_zone_nexus_internal( - sled_id, - image_source, - must_have_nexus_zones, - ) - } - // Determines TLS and DNS server configuration from existing Nexus zones. // // Returns `Some((external_tls, external_dns_servers))` if existing Nexus @@ -1649,18 +1636,10 @@ impl<'a> BlueprintBuilder<'a> { } /// Adds a nexus zone on this sled. - /// - /// If `must_have_nexus_zones` is true, then other Nexus zones - /// are used to determine configuration settings (e.g., TLS, - /// DNS servers, generation number). - /// - /// If `must_have_nexus_zones` is false, then these settings - /// are permitted to use default values. 
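With the `must_have_nexus_zones` flag removed, callers that previously passed `false` now supply the configuration explicitly. The shape of those call sites, excerpted from the vpc.rs and example.rs hunks in this patch (not a standalone program; `builder` and `sled_id` come from the surrounding test code):

    let external_tls = false;
    let external_dns_servers = vec![];
    let nexus_generation = builder.parent_blueprint().nexus_generation;
    builder
        .sled_add_zone_nexus_with_config(
            sled_id,
            external_tls,
            external_dns_servers,
            BlueprintZoneImageSource::InstallDataset,
            nexus_generation,
        )
        .expect("added nexus zone");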
- pub fn sled_add_zone_nexus_internal( + pub fn sled_add_zone_nexus( &mut self, sled_id: SledUuid, image_source: BlueprintZoneImageSource, - must_have_nexus_zones: bool, ) -> Result<(), Error> { // Whether Nexus should use TLS and what the external DNS servers it // should use are currently provided at rack-setup time, and should be @@ -1677,11 +1656,7 @@ impl<'a> BlueprintBuilder<'a> { match self.determine_nexus_tls_dns_config() { Some(config) => config, None => { - if must_have_nexus_zones { - return Err(Error::NoNexusZonesInParentBlueprint); - } else { - (false, Vec::new()) - } + return Err(Error::NoNexusZonesInParentBlueprint); } }; @@ -1689,13 +1664,7 @@ impl<'a> BlueprintBuilder<'a> { match self.determine_nexus_generation(&image_source)? { Some(generation) => generation, None => { - if must_have_nexus_zones { - return Err(Error::NoNexusZonesInParentBlueprint); - } else { - // If there are no existing Nexus zones, start with whatever the top-level - // blueprint value happens to be. - self.parent_blueprint.nexus_generation - } + return Err(Error::NoNexusZonesInParentBlueprint); } }; @@ -1708,7 +1677,10 @@ impl<'a> BlueprintBuilder<'a> { ) } - fn sled_add_zone_nexus_with_config( + /// Add a Nexus zone on this sled with a specific configuration. + /// + /// If possible, callers should prefer to use [Self::sled_add_zone_nexus] + pub fn sled_add_zone_nexus_with_config( &mut self, sled_id: SledUuid, external_tls: bool, @@ -3889,7 +3861,13 @@ pub mod test { // Add first Nexus zone - should get generation 1 builder - .sled_add_zone_nexus_internal(sled_id, image_source.clone(), false) + .sled_add_zone_nexus_with_config( + sled_id, + false, + vec![], + image_source.clone(), + builder.parent_blueprint().nexus_generation, + ) .expect("failed to add nexus zone"); let blueprint1 = builder.build(); @@ -3963,10 +3941,12 @@ pub mod test { // Add another Nexus zone with same image source - should reuse generation builder - .sled_add_zone_nexus_internal( + .sled_add_zone_nexus_with_config( second_sled_id, - image_source.clone(), false, + vec![], + image_source.clone(), + builder.parent_blueprint().nexus_generation, ) .expect("failed to add nexus zone"); @@ -4046,11 +4026,7 @@ pub mod test { // Add another Nexus zone with different image source - should increment generation builder - .sled_add_zone_nexus_internal( - second_sled_id, - different_image_source.clone(), - false, - ) + .sled_add_zone_nexus(second_sled_id, different_image_source.clone()) .expect("failed to add nexus zone"); let blueprint2 = builder.build(); @@ -4141,18 +4117,10 @@ pub mod test { // 1. One zone with image source A (should reuse existing generation) // 2. 
One zone with image source B (should get existing generation + 1) builder - .sled_add_zone_nexus_internal( - sled_ids[1], - image_source_a.clone(), - false, - ) + .sled_add_zone_nexus(sled_ids[1], image_source_a.clone()) .expect("failed to add nexus zone with image source A"); builder - .sled_add_zone_nexus_internal( - sled_ids[2], - image_source_b.clone(), - false, - ) + .sled_add_zone_nexus(sled_ids[2], image_source_b.clone()) .expect("failed to add nexus zone with image source B"); let blueprint2 = builder.build(); @@ -4192,7 +4160,11 @@ pub mod test { logctx.cleanup_successful(); } - /// Test nexus generation validation against blueprint generation + /// Test that the validation which normally occurs as a part of + /// "sled_add_zone_nexus" - namely, the invocation of + /// "determine_nexus_generation" - throws expected errors when the + /// "next Nexus zone" generation does not match the parent blueprint's + /// value of "nexus generation". #[test] fn test_nexus_generation_blueprint_validation() { static TEST_NAME: &str = "test_nexus_generation_blueprint_validation"; @@ -4210,7 +4182,7 @@ pub mod test { // Set the top-level nexus_generation to 2, but keep the zone generation at 1 blueprint.nexus_generation = Generation::new().next(); - let mut builder = BlueprintBuilder::new_based_on( + let builder = BlueprintBuilder::new_based_on( &logctx.log, &blueprint, &example_system.input, @@ -4220,19 +4192,11 @@ pub mod test { ) .expect("failed to create builder"); - let sled_ids: Vec<_> = example_system - .input - .all_sled_ids(SledFilter::Commissioned) - .collect(); let image_source = BlueprintZoneImageSource::InstallDataset; // Same as existing // Try to add another Nexus zone with same image source // This should fail because existing zone has generation 1 but blueprint has generation 2 - let result = builder.sled_add_zone_nexus_internal( - sled_ids[1], - image_source, - false, - ); + let result = builder.determine_nexus_generation(&image_source); match result { Err(Error::OldImageNexusGenerationMismatch { @@ -4271,7 +4235,7 @@ pub mod test { // the new image source logic would expect blueprint.nexus_generation = Generation::new().next().next(); // Set to generation 3 - let mut builder = BlueprintBuilder::new_based_on( + let builder = BlueprintBuilder::new_based_on( &logctx.log, &blueprint, &example_system.input, @@ -4281,11 +4245,6 @@ pub mod test { ) .expect("failed to create builder"); - let sled_ids: Vec<_> = example_system - .input - .all_sled_ids(SledFilter::Commissioned) - .collect(); - // Use a different image source (this should get existing generation + 1 = 2) let different_image_source = BlueprintZoneImageSource::Artifact { version: BlueprintArtifactVersion::Available { @@ -4296,11 +4255,8 @@ pub mod test { // Try to add a Nexus zone with different image source // This should fail because the calculated generation (2) doesn't match blueprint generation + 1 (4) - let result = builder.sled_add_zone_nexus_internal( - sled_ids[1], - different_image_source, - false, - ); + let result = + builder.determine_nexus_generation(&different_image_source); match result { Err(Error::NewImageNexusGenerationMismatch { diff --git a/nexus/reconfigurator/planning/src/example.rs b/nexus/reconfigurator/planning/src/example.rs index 67ac9f79a2a..783bde303ff 100644 --- a/nexus/reconfigurator/planning/src/example.rs +++ b/nexus/reconfigurator/planning/src/example.rs @@ -480,12 +480,17 @@ impl ExampleSystemBuilder { for _ in 0..nexus_count .on(discretionary_ix, discretionary_sled_count) { - let 
must_have_nexus_zones = false; + let external_tls = false; + let external_dns_servers = vec![]; + let nexus_generation = + builder.parent_blueprint().nexus_generation; builder - .sled_add_zone_nexus_internal( + .sled_add_zone_nexus_with_config( sled_id, + external_tls, + external_dns_servers, image_source.clone(), - must_have_nexus_zones, + nexus_generation, ) .unwrap(); } From 3be10b8e80ea08c606a9c42c19b7f0253e72070b Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 25 Aug 2025 14:48:47 -0700 Subject: [PATCH 14/22] Update comment about image sources --- nexus/reconfigurator/planning/src/planner.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index e6d4e943428..68bed471a25 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -937,6 +937,13 @@ impl<'a> Planner<'a> { DiscretionaryOmicronZone::Nexus, DiscretionaryOmicronZone::Oximeter, ] { + // Our goal here is to make sure that if we have less redundancy for + // discretionary zones than needed, we deploy additional zones. + // + // For most zone types, we only care about the total count of that + // kind of zone, regardless of image. In contrast, for Nexus, we may + // need to reach a minimum redundancy count for multiple zone images + // (new and old) during a handoff. let image_sources = match zone_kind { DiscretionaryOmicronZone::Nexus => { let old_image = self From 55ebd1c12b62096aec41db2fcd1471c5a68a8fd8 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 25 Aug 2025 14:53:36 -0700 Subject: [PATCH 15/22] update names, comments, for zone propagation --- nexus/reconfigurator/planning/src/planner.rs | 50 ++++++++++++-------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index 68bed471a25..b20c51c0dff 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -112,24 +112,33 @@ const NUM_CONCURRENT_MGS_UPDATES: usize = 1; /// A receipt that `check_input_validity` has been run prior to planning. struct InputChecked; +// Details of why a zone has not yet propagated from blueprint to sled inventory #[derive(Debug)] #[expect(dead_code)] -struct ZoneCurrentlyUpdating<'a> { +struct ZonePropagationIncomplete<'a> { zone_id: OmicronZoneUuid, zone_kind: ZoneKind, - reason: UpdatingReason<'a>, + reason: ZonePropagationStatus<'a>, } #[derive(Debug)] #[expect(dead_code)] -enum UpdatingReason<'a> { +enum ZonePropagationStatus<'a> { + // The current blueprint and the sled inventory disagree + // about the image source for a zone. + // + // This can mean that the sled inventory is out-of-date, or + // that a different blueprint has been applied. ImageSourceMismatch { bp_image_source: &'a BlueprintZoneImageSource, inv_image_source: &'a OmicronZoneImageSource, }, + // Although this zone appears in the blueprint, it does + // not exist on the sled's inventory. MissingInInventory { bp_image_source: &'a BlueprintZoneImageSource, }, + // The last reconciliation attempt for this zone failed ReconciliationError { bp_image_source: &'a BlueprintZoneImageSource, inv_image_source: &'a OmicronZoneImageSource, @@ -1290,7 +1299,7 @@ impl<'a> Planner<'a> { fn get_zones_not_yet_propagated_to_inventory( &self, - ) -> Vec> { + ) -> Vec> { // We are only interested in non-decommissioned sleds. 
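The planner comment above describes the Nexus special case: during a handoff, redundancy is counted per image source (old and new) rather than in total. One way to express that bookkeeping, using hypothetical types rather than the planner's real ones:

    use std::collections::HashMap;

    #[derive(Clone, PartialEq, Eq, Hash, Debug)]
    enum NexusImage {
        Old,
        New,
    }

    // How many more Nexus zones of each image are needed to reach the
    // per-image redundancy target during a handoff.
    fn shortfall_per_image(
        existing: &[NexusImage],
        target_per_image: usize,
    ) -> HashMap<NexusImage, usize> {
        let mut counts: HashMap<NexusImage, usize> = HashMap::new();
        for image in existing {
            *counts.entry(image.clone()).or_default() += 1;
        }
        [NexusImage::Old, NexusImage::New]
            .into_iter()
            .map(|image| {
                let have = counts.get(&image).copied().unwrap_or(0);
                (image, target_per_image.saturating_sub(have))
            })
            .collect()
    }

    fn main() {
        // Mid-handoff: three old-image Nexus zones, one new-image zone,
        // and a target of three of each until handoff completes.
        let existing = [
            NexusImage::Old,
            NexusImage::Old,
            NexusImage::Old,
            NexusImage::New,
        ];
        let need = shortfall_per_image(&existing, 3);
        assert_eq!(need[&NexusImage::Old], 0);
        assert_eq!(need[&NexusImage::New], 2);
    }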
let sleds = self .input @@ -1336,13 +1345,14 @@ impl<'a> Planner<'a> { ConfigReconcilerInventoryResult::Ok, )) => { // The inventory and blueprint image sources differ. - Some(ZoneCurrentlyUpdating { + Some(ZonePropagationIncomplete { zone_id: zone.id, zone_kind: zone.kind(), - reason: UpdatingReason::ImageSourceMismatch { - bp_image_source: &zone.image_source, - inv_image_source, - }, + reason: + ZonePropagationStatus::ImageSourceMismatch { + bp_image_source: &zone.image_source, + inv_image_source, + }, }) } Some(( @@ -1352,24 +1362,26 @@ impl<'a> Planner<'a> { // The inventory reports this zone but there was an // error reconciling it (most likely an error // starting the zone). - Some(ZoneCurrentlyUpdating { + Some(ZonePropagationIncomplete { zone_id: zone.id, zone_kind: zone.kind(), - reason: UpdatingReason::ReconciliationError { - bp_image_source: &zone.image_source, - inv_image_source, - message, - }, + reason: + ZonePropagationStatus::ReconciliationError { + bp_image_source: &zone.image_source, + inv_image_source, + message, + }, }) } None => { // The blueprint has a zone that inventory does not have. - Some(ZoneCurrentlyUpdating { + Some(ZonePropagationIncomplete { zone_id: zone.id, zone_kind: zone.kind(), - reason: UpdatingReason::MissingInInventory { - bp_image_source: &zone.image_source, - }, + reason: + ZonePropagationStatus::MissingInInventory { + bp_image_source: &zone.image_source, + }, }) } } From 4d7235b54b97d51ac520bf661e6c2806b684eef8 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 26 Aug 2025 12:49:44 -0700 Subject: [PATCH 16/22] Finish merging (adding nexus_generation to structs) --- nexus/db-queries/src/db/datastore/rack.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index c67da273e19..e2c4f62ffb3 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -2062,6 +2062,7 @@ mod test { slot: 0, transit_ips: vec![], }, + nexus_generation: *Generation::new(), }, ), image_source: BlueprintZoneImageSource::InstallDataset, @@ -2113,6 +2114,7 @@ mod test { creator: "test suite".to_string(), comment: "test blueprint".to_string(), report: PlanningReport::new(blueprint_id), + nexus_generation: *Generation::new(), }; let rack = datastore From da595c9e2918c742ebaf4c9e793e2bfe205249f5 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Tue, 26 Aug 2025 17:37:39 -0700 Subject: [PATCH 17/22] quiesce needs to keep track of blueprint ids --- nexus/reconfigurator/execution/src/lib.rs | 21 ++- nexus/types/src/quiesce.rs | 185 ++++++++++++++++++---- 2 files changed, 168 insertions(+), 38 deletions(-) diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index a4f802b3c97..ced20480e70 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -22,6 +22,7 @@ use nexus_types::deployment::execution::{ StepHandle, StepResult, UpdateEngine, }; use nexus_types::quiesce::SagaQuiesceHandle; +use nexus_types::quiesce::SagaReassignmentDone; use omicron_uuid_kinds::OmicronZoneUuid; use slog::info; use slog_error_chain::InlineErrorChain; @@ -627,18 +628,16 @@ fn register_reassign_sagas_step<'a>( match reassigned { Ok(needs_saga_recovery) => ( StepSuccess::new(needs_saga_recovery).build(), - needs_saga_recovery, + SagaReassignmentDone::ReassignedAllAsOf( + blueprint.id, + needs_saga_recovery, + ), + ), + Err(error) => ( + StepWarning::new(false, error.to_string()) + 
.build(), + SagaReassignmentDone::Indeterminate, ), - Err(error) => { - // It's possible that we failed after having - // re-assigned sagas in the database. - let maybe_reassigned = true; - ( - StepWarning::new(false, error.to_string()) - .build(), - maybe_reassigned, - ) - } } }) .await) diff --git a/nexus/types/src/quiesce.rs b/nexus/types/src/quiesce.rs index 7c2b3ad42dd..63e969bbac7 100644 --- a/nexus/types/src/quiesce.rs +++ b/nexus/types/src/quiesce.rs @@ -11,6 +11,7 @@ use futures::future::BoxFuture; use iddqd::IdOrdMap; use omicron_common::api::external::Error; use omicron_common::api::external::Generation; +use omicron_uuid_kinds::BlueprintUuid; use slog::Logger; use slog::error; use slog::info; @@ -48,6 +49,13 @@ impl From for Error { } } +/// Describes the result of a saga re-assignment +#[derive(Debug)] +pub enum SagaReassignmentDone { + Indeterminate, + ReassignedAllAsOf(BlueprintUuid, bool), +} + /// Describes both the configuration (whether sagas are allowed to be created) /// and the state (how many sagas are pending) for the purpose of quiescing /// Nexus. @@ -124,6 +132,13 @@ struct SagaQuiesceInner { /// we've recovered all sagas that could be assigned to us. reassignment_generation: Generation, + /// blueprint id associated with last successful saga reassignment + /// + /// Similar to the generation number, this is used to track whether we've + /// accounted for all sagas for all expungements up through this target + /// blueprint. + reassignment_blueprint_id: Option, + /// whether there is a saga reassignment operation happening /// /// These operatinos may assign new sagas to Nexus that must be recovered @@ -138,9 +153,37 @@ struct SagaQuiesceInner { /// given reassignment pass. See `reassignment_done()` for details. recovered_reassignment_generation: Generation, - /// whether a saga recovery operation is ongoing, and if one is, what - /// `reassignment_generation` was when it started - recovery_pending: Option, + /// blueprint id that saga recovery has "caught up to" + /// + /// This means that we have finished recovering any sagas that were + /// re-assigned to us due to expungements of other Nexus zones up through + /// this blueprint. Put differently: we know that we will never be assigned + /// more sagas due to expungement unless the target blueprint changes past + /// this one. + /// + /// This does not mean that we've fully drained all sagas up through this + /// blueprint. There may still be sagas running. + recovered_blueprint_id: Option, + + /// blueprint id that we're "fully drained up to" + /// + /// If this value is non-`None`, that means that: + /// + /// - saga creation is disallowed + /// - no sagas are running + /// - we have re-assigned sagas from other Nexus instances expunged in this + /// blueprint or earlier + /// - we have finished recovery for all those sagas (that had been assigned + /// to us as of the re-assignment pass for this blueprint id) + /// + /// This means that the only way we can wind up running another saga is if + /// there's a new blueprint that expunges a different Nexus zone. 
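The conditions in that comment are the same ones `is_fully_drained()` checks further down in this patch. A condensed restatement with plain parameters instead of the real struct fields:

    // Fully drained: sagas are disallowed for quiesce, none are running,
    // at least one recovery pass has completed, recovery has caught up to
    // the last re-assignment pass, and no re-assignment is in flight.
    fn is_fully_drained(
        new_sagas_disallowed_for_quiesce: bool,
        sagas_pending: usize,
        first_recovery_complete: bool,
        recovered_reassignment_generation: u64,
        reassignment_generation: u64,
        reassignment_pending: bool,
    ) -> bool {
        new_sagas_disallowed_for_quiesce
            && sagas_pending == 0
            && first_recovery_complete
            && recovered_reassignment_generation >= reassignment_generation
            && !reassignment_pending
    }

    fn main() {
        // Recovery has caught up to the last re-assignment pass: drained.
        assert!(is_fully_drained(true, 0, true, 5, 5, false));
        // A re-assignment pass bumped the generation and recovery has not
        // caught up yet: not drained.
        assert!(!is_fully_drained(true, 0, true, 4, 5, false));
    }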
+ drained_blueprint_id: Option, + + /// whether a saga recovery operation is ongoing, and if one is: + /// - what `reassignment_generation` was when it started + /// - which blueprint id we'll be fully caught up to upon completion + recovery_pending: Option<(Generation, Option)>, } impl SagaQuiesceHandle { @@ -153,6 +196,9 @@ impl SagaQuiesceHandle { reassignment_pending: false, recovered_reassignment_generation: Generation::new(), recovery_pending: None, + reassignment_blueprint_id: None, + recovered_blueprint_id: None, + drained_blueprint_id: None, }); SagaQuiesceHandle { log, inner } } @@ -163,7 +209,7 @@ impl SagaQuiesceHandle { /// cannot then re-enable sagas. pub fn set_quiescing(&self, quiescing: bool) { self.inner.send_if_modified(|q| { - match q.new_sagas_allowed { + let changed = match q.new_sagas_allowed { SagasAllowed::DisallowedUnknown => { let new_state = if quiescing { SagasAllowed::DisallowedQuiesce @@ -199,10 +245,18 @@ impl SagaQuiesceHandle { // Either way, we're not changing anything. false } - } + }; + + q.latch_drained_blueprint_id(); + changed }); } + /// Returns the blueprint id as of which sagas are fully drained + pub fn fully_drained_blueprint(&self) -> Option { + self.inner.borrow().drained_blueprint_id + } + /// Returns whether sagas are fully drained /// /// Note that this state can change later if new sagas get assigned to this @@ -261,7 +315,7 @@ impl SagaQuiesceHandle { // those. pub async fn reassign_sagas(&self, f: F) -> T where - F: AsyncFnOnce() -> (T, bool), + F: AsyncFnOnce() -> (T, SagaReassignmentDone), { let in_progress = self.reassignment_start(); let (result, maybe_reassigned) = f().await; @@ -291,24 +345,62 @@ impl SagaQuiesceHandle { /// Record that we've finished an operation that might assign new sagas to /// ourselves. - fn reassignment_done(&self, maybe_reassigned: bool) { + fn reassignment_done(&self, result: SagaReassignmentDone) { info!( &self.log, "saga re-assignment pass finished"; - "maybe_reassigned" => maybe_reassigned + "result" => ?result ); self.inner.send_modify(|q| { assert!(q.reassignment_pending); q.reassignment_pending = false; - // If we may have assigned new sagas to ourselves, bump the - // generation number. We won't report being drained until a - // recovery pass has finished that *started* with this generation - // number. So this ensures that we won't report being drained until - // any sagas that may have been assigned to us have been recovered. - if maybe_reassigned { - q.reassignment_generation = q.reassignment_generation.next(); + match result { + SagaReassignmentDone::ReassignedAllAsOf( + blueprint_id, + reassigned_any, + ) => { + if reassigned_any { + // If we assigned new sagas to ourselves, bump the + // generation number. We won't report being drained + // until a recovery pass has finished that *started* + // with this generation number. This ensures that we + // won't report being drained until any sagas that may + // have been assigned to us have been recovered. + q.reassignment_generation = + q.reassignment_generation.next(); + } + + // Record that we've completed assignments of all sagas from + // all Nexus instances expunged as of this blueprint. The + // only way we could re-assign ourselves more sagas is if + // the target blueprint changes. + q.reassignment_blueprint_id = Some(blueprint_id); + } + SagaReassignmentDone::Indeterminate => { + // This means the caller doesn't know for sure whether they + // re-assigned us any sagas. 
(This can happen if there's a + // network error talking to the database. We don't know if + // that happened before or after the database transaction + // committed.) + // + // The comment above about the reassignment_generation + // applies in this case. We must assume in this case that + // there may be sagas that we need to recover before we + // consider ourselves drained. That means we need another + // recovery pass, which means bumping this generation + // number. + // + // However, once we *do* finish that, we won't know that + // we've finished recovering all sagas associated with Nexus + // instances expunged in this blueprint. So we *don't* + // update `reassignment_blueprint_id`. + q.reassignment_generation = + q.reassignment_generation.next(); + } } + + q.latch_drained_blueprint_id(); }); } @@ -353,7 +445,8 @@ impl SagaQuiesceHandle { "recovery_start() called twice without intervening \ recovery_done() (concurrent calls to recover()?)", ); - q.recovery_pending = Some(q.reassignment_generation); + q.recovery_pending = + Some((q.reassignment_generation, q.reassignment_blueprint_id)); }); info!(&self.log, "saga recovery pass starting"); @@ -364,7 +457,8 @@ impl SagaQuiesceHandle { fn recovery_done(&self, success: bool) { let log = self.log.clone(); self.inner.send_modify(|q| { - let Some(generation) = q.recovery_pending.take() else { + let Some((generation, blueprint_id)) = q.recovery_pending.take() + else { panic!("cannot finish saga recovery when it was not running"); }; @@ -372,10 +466,13 @@ impl SagaQuiesceHandle { info!( &log, "saga recovery pass finished"; - "generation" => generation.to_string() + "generation" => generation.to_string(), + "blueprint_id" => ?blueprint_id, ); + q.recovered_blueprint_id = blueprint_id; q.recovered_reassignment_generation = generation; q.first_recovery_complete = true; + q.latch_drained_blueprint_id(); } else { info!(&log, "saga recovery pass failed"); } @@ -492,7 +589,7 @@ impl SagaQuiesceInner { /// /// This condition is not permanent. New sagas can be re-assigned to this /// Nexus. - pub fn is_fully_drained(&self) -> bool { + fn is_fully_drained(&self) -> bool { // No new sagas may be created self.new_sagas_allowed == SagasAllowed::DisallowedQuiesce // and there are none currently running @@ -507,6 +604,23 @@ impl SagaQuiesceInner { // and blueprint execution is not currently re-assigning stuff to us && !self.reassignment_pending } + + /// Invoked whenever the quiesce state changes to determine if we are + /// currently fully drained up to a given blueprint id + /// + /// We want to keep track of this even if the target blueprint moves beyond + /// this blueprint and we start re-assigning new sagas to ourselves as a + /// result of that blueprint. The rest of our bookkeeping would reflect + /// that we're not fully drained, which is true, but we still want to be + /// able to report that we were fully drained _as of this blueprint_. + fn latch_drained_blueprint_id(&mut self) { + if self.is_fully_drained() { + // If we've recovered up through a given blueprint id and are now + // fully drained, then we have definitely fully drained up through + // that blueprint id. 
+ self.drained_blueprint_id = self.recovered_blueprint_id; + } + } } /// Handle used to ensure that we clean up records for a pending saga @@ -568,6 +682,7 @@ impl NewlyPendingSagaRef { q.sagas_pending .remove(&saga_id) .expect("saga should have been running"); + q.latch_drained_blueprint_id(); }); rv }); @@ -651,21 +766,25 @@ struct SagaReassignmentInProgress { } impl SagaReassignmentInProgress { - fn reassignment_done(self, maybe_reassigned: bool) { - self.q.reassignment_done(maybe_reassigned) + fn reassignment_done(self, result: SagaReassignmentDone) { + self.q.reassignment_done(result); } } #[cfg(test)] mod test { use crate::quiesce::SagaQuiesceHandle; + use crate::quiesce::SagaReassignmentDone; use futures::FutureExt; use omicron_test_utils::dev::test_setup_log; + use omicron_uuid_kinds::BlueprintUuid; use std::sync::LazyLock; use uuid::Uuid; static SAGA_ID: LazyLock = LazyLock::new(|| steno::SagaId(Uuid::new_v4())); + static BLUEPRINT_ID: LazyLock = + LazyLock::new(|| BlueprintUuid::new_v4()); static SAGA_NAME: LazyLock = LazyLock::new(|| steno::SagaName::new("test-saga")); @@ -861,7 +980,9 @@ mod test { // When re-assignment finishes *without* having re-assigned anything, // then we're immediately all set. - reassignment.reassignment_done(false); + reassignment.reassignment_done( + SagaReassignmentDone::ReassignedAllAsOf(*BLUEPRINT_ID, false), + ); assert!(qq.is_fully_drained()); qq.wait_for_drained().await; assert!(qq.is_fully_drained()); @@ -895,7 +1016,9 @@ mod test { // When re-assignment finishes and re-assigned sagas, we're still // blocked. - reassignment.reassignment_done(true); + reassignment.reassignment_done( + SagaReassignmentDone::ReassignedAllAsOf(*BLUEPRINT_ID, true), + ); assert!(!qq.is_fully_drained()); // If the next recovery pass fails, we're still blocked. @@ -946,7 +1069,9 @@ mod test { // When re-assignment finishes and re-assigned sagas, we're still // blocked. - reassignment.reassignment_done(true); + reassignment.reassignment_done( + SagaReassignmentDone::ReassignedAllAsOf(*BLUEPRINT_ID, true), + ); assert!(!qq.is_fully_drained()); // Even if this recovery pass succeeds, we're still blocked, because it @@ -999,7 +1124,9 @@ mod test { // When re-assignment finishes and re-assigned sagas, we're still // blocked because we haven't run recovery. - reassignment.reassignment_done(true); + reassignment.reassignment_done( + SagaReassignmentDone::ReassignedAllAsOf(*BLUEPRINT_ID, true), + ); assert!(!qq.is_fully_drained()); // Start a recovery pass. Pretend like we found something. @@ -1087,7 +1214,9 @@ mod test { // from being drained. let reassignment = qq.reassignment_start(); assert!(!qq.is_fully_drained()); - reassignment.reassignment_done(false); + reassignment.reassignment_done( + SagaReassignmentDone::ReassignedAllAsOf(*BLUEPRINT_ID, false), + ); // We're fully drained as soon as this one is done, since we know we // didn't assign any sagas. assert!(qq.is_fully_drained()); @@ -1095,7 +1224,9 @@ mod test { // Try again. This time, we'll act like we did reassign sagas. let reassignment = qq.reassignment_start(); assert!(!qq.is_fully_drained()); - reassignment.reassignment_done(true); + reassignment.reassignment_done( + SagaReassignmentDone::ReassignedAllAsOf(*BLUEPRINT_ID, true), + ); assert!(!qq.is_fully_drained()); // Do a failed recovery pass. We still won't be fully drained. 
let recovery = qq.recovery_start(); From d586ab51db63979107656b4b68eed02e1299e303 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 27 Aug 2025 10:02:04 -0700 Subject: [PATCH 18/22] add test --- nexus/types/src/quiesce.rs | 159 +++++++++++++++++++++++++++++++++++-- 1 file changed, 153 insertions(+), 6 deletions(-) diff --git a/nexus/types/src/quiesce.rs b/nexus/types/src/quiesce.rs index 63e969bbac7..30d15b3cf2d 100644 --- a/nexus/types/src/quiesce.rs +++ b/nexus/types/src/quiesce.rs @@ -360,6 +360,12 @@ impl SagaQuiesceHandle { blueprint_id, reassigned_any, ) => { + // Record that we've completed assignments of all sagas from + // all Nexus instances expunged as of this blueprint. The + // only way we could re-assign ourselves more sagas is if + // the target blueprint changes. + q.reassignment_blueprint_id = Some(blueprint_id); + if reassigned_any { // If we assigned new sagas to ourselves, bump the // generation number. We won't report being drained @@ -369,13 +375,15 @@ impl SagaQuiesceHandle { // have been assigned to us have been recovered. q.reassignment_generation = q.reassignment_generation.next(); + } else if q.reassignment_generation + <= q.recovered_reassignment_generation + && q.first_recovery_complete + { + // If recovery has caught up to the current reassignment + // generation, then we can also say that we're recovered + // up to this blueprint. + q.recovered_blueprint_id = q.reassignment_blueprint_id; } - - // Record that we've completed assignments of all sagas from - // all Nexus instances expunged as of this blueprint. The - // only way we could re-assign ourselves more sagas is if - // the target blueprint changes. - q.reassignment_blueprint_id = Some(blueprint_id); } SagaReassignmentDone::Indeterminate => { // This means the caller doesn't know for sure whether they @@ -1241,4 +1249,143 @@ mod test { logctx.cleanup_successful(); } + + /// Tests tracking of the drained blueprint id + #[tokio::test] + async fn test_drained_blueprint() { + let logctx = test_setup_log("test_drained_blueprint"); + let log = &logctx.log; + + let qq = SagaQuiesceHandle::new(log.clone()); + assert!(qq.fully_drained_blueprint().is_none()); + + // Basic tests where we're *not* fully drained + + // Recovery by itself does not mean we're fully drained. + qq.recovery_start().recovery_done(true); + assert!(qq.fully_drained_blueprint().is_none()); + + // Even if we're quiescing now, we're not fully drained. + qq.set_quiescing(true); + assert!(qq.fully_drained_blueprint().is_none()); + + // Recovery still isn't enough. We haven't done a re-assignment pass. + // We are currently drained, though. + qq.recovery_start().recovery_done(true); + assert!(qq.fully_drained_blueprint().is_none()); + assert!(qq.is_fully_drained()); + + // No change after an indeterminate re-assignment. + let reassignment = qq.reassignment_start(); + reassignment.reassignment_done(SagaReassignmentDone::Indeterminate); + assert!(qq.fully_drained_blueprint().is_none()); + assert!(!qq.is_fully_drained()); + + // Fully drained case 1: saga re-assignment causes us to become fully + // drained. + // + // First, recover whatever we may have just assigned ourselves. + qq.recovery_start().recovery_done(true); + + // Now if we do a re-assignment pass that assigns no sagas, then we + // finally are fully drained up through this blueprint. This does not + // require recovery since no sagas were re-assigned. 
+ let blueprint1_id = BlueprintUuid::new_v4(); + let reassignment = qq.reassignment_start(); + reassignment.reassignment_done( + SagaReassignmentDone::ReassignedAllAsOf(blueprint1_id, false), + ); + assert!(qq.is_fully_drained()); + assert_eq!(qq.fully_drained_blueprint(), Some(blueprint1_id)); + + // Next, test that even if we become no-longer-drained because we do + // another reassignment, we still record that we're fully drained as of + // the older blueprint. + + // Start another re-assignment pass. + let blueprint2_id = BlueprintUuid::new_v4(); + let reassignment = qq.reassignment_start(); + assert_eq!(qq.fully_drained_blueprint(), Some(blueprint1_id)); + // Act like we assigned some sagas. + reassignment.reassignment_done( + SagaReassignmentDone::ReassignedAllAsOf(blueprint2_id, true), + ); + // We're not fully drained because we haven't recovered those sagas. + assert!(!qq.is_fully_drained()); + // So the fully drained blueprint is the one from before. + assert_eq!(qq.fully_drained_blueprint(), Some(blueprint1_id)); + + // Start a recovery pass. Pretend like we found a saga. + // We'll use a oneshot channel to emulate the saga completion future. + let (tx, rx) = tokio::sync::oneshot::channel(); + let recovery = qq.recovery_start(); + let pending = recovery.record_saga_recovery(*SAGA_ID, &SAGA_NAME); + let consumer_completion = pending.saga_completion_future( + async { rx.await.expect("cannot drop this before dropping tx") } + .boxed(), + ); + recovery.recovery_done(true); + + // We're still not fully drained because we haven't finished that saga. + assert!(!qq.is_fully_drained()); + // So the fully drained blueprint is the one from before. + assert_eq!(qq.fully_drained_blueprint(), Some(blueprint1_id)); + + // Fully drained case 2: saga completion causes us to become fully + // drained. + // + // Complete the saga. + tx.send(saga_result()).unwrap(); + let _ = consumer_completion.await; + // Now, we should be fully drained up to the new blueprint. + assert!(qq.is_fully_drained()); + assert_eq!(qq.fully_drained_blueprint(), Some(blueprint2_id)); + + // Fully drained case 3: saga recovery causes us to become fully + // drained. + // + // For this case, imagine that we think we may have re-assigned + // ourselves some sagas, but recovery completes with no sagas + // outstanding. + let blueprint3_id = BlueprintUuid::new_v4(); + let reassignment = qq.reassignment_start(); + assert_eq!(qq.fully_drained_blueprint(), Some(blueprint2_id)); + // Act like we assigned some sagas. + reassignment.reassignment_done( + SagaReassignmentDone::ReassignedAllAsOf(blueprint3_id, true), + ); + assert!(!qq.is_fully_drained()); + assert_eq!(qq.fully_drained_blueprint(), Some(blueprint2_id)); + + // Quick check: failed recovery changes nothing. + qq.recovery_start().recovery_done(false); + assert!(!qq.is_fully_drained()); + assert_eq!(qq.fully_drained_blueprint(), Some(blueprint2_id)); + + // Successful recovery with no sagas running means we're fully drained + // as of the new blueprint. + qq.recovery_start().recovery_done(true); + assert_eq!(qq.fully_drained_blueprint(), Some(blueprint3_id)); + assert!(qq.is_fully_drained()); + + // Fully drained case 3: quiescing itself causes us to immediately + // become fully drained. + // + // This case requires a fresh handle, since the current one is already + // quiesced. 
+ let blueprint4_id = BlueprintUuid::new_v4(); + let qq = SagaQuiesceHandle::new(log.clone()); + qq.reassignment_start().reassignment_done( + SagaReassignmentDone::ReassignedAllAsOf(blueprint4_id, true), + ); + qq.recovery_start().recovery_done(true); + assert!(qq.fully_drained_blueprint().is_none()); + assert!(!qq.is_fully_drained()); + + qq.set_quiescing(true); + assert_eq!(qq.fully_drained_blueprint(), Some(blueprint4_id)); + assert!(qq.is_fully_drained()); + + logctx.cleanup_successful(); + } } From 1f328a9c0328a7d6ab2d3ee2d9386a3451cd3559 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 27 Aug 2025 10:10:00 -0700 Subject: [PATCH 19/22] is_fully_drained() can be more private --- nexus/types/src/quiesce.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nexus/types/src/quiesce.rs b/nexus/types/src/quiesce.rs index 30d15b3cf2d..cd5c29201cf 100644 --- a/nexus/types/src/quiesce.rs +++ b/nexus/types/src/quiesce.rs @@ -261,7 +261,8 @@ impl SagaQuiesceHandle { /// /// Note that this state can change later if new sagas get assigned to this /// Nexus. - pub fn is_fully_drained(&self) -> bool { + #[cfg(test)] + fn is_fully_drained(&self) -> bool { self.inner.borrow().is_fully_drained() } From b34523684f043cd8fe66cb651662fb8095832ae0 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 27 Aug 2025 10:49:46 -0700 Subject: [PATCH 20/22] update omdb --- dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs | 70 +++++++-- nexus/src/app/quiesce.rs | 12 +- nexus/types/src/internal_api/views.rs | 9 +- nexus/types/src/quiesce.rs | 30 ++-- openapi/nexus-internal.json | 144 ++++++++++++++++--- 5 files changed, 217 insertions(+), 48 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs b/dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs index 76c0a229c3c..a886c6bc582 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs @@ -12,6 +12,8 @@ use chrono::Utc; use clap::Args; use clap::Subcommand; use nexus_client::types::QuiesceState; +use nexus_client::types::QuiesceStatus; +use nexus_client::types::SagaQuiesceStatus; use std::time::Duration; #[derive(Debug, Args)] @@ -31,9 +33,9 @@ pub enum QuiesceCommands { #[derive(Debug, Args)] pub struct QuiesceShowArgs { - /// Show details about held database connections + /// Show stack traces for held database connections #[clap(short, long, default_value_t = false)] - verbose: bool, + stacks: bool, } pub async fn cmd_nexus_quiesce( @@ -60,7 +62,10 @@ async fn quiesce_show( .await .context("fetching quiesce state")? 
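Condensing the lifecycle that `test_drained_blueprint` above walks through: a handle that is quiescing, has recovered successfully at least once, and then completes a reassignment pass that hands it nothing is drained as of that pass's blueprint. A minimal sketch under those assumptions (crate paths and visibility assumed from this file's imports; `log` is any `slog::Logger`):

    use nexus_types::quiesce::{SagaQuiesceHandle, SagaReassignmentDone};
    use omicron_uuid_kinds::BlueprintUuid;

    fn drain_lifecycle(log: &slog::Logger) {
        let qq = SagaQuiesceHandle::new(log.clone());
        qq.set_quiescing(true);

        // At least one successful recovery pass is required before the handle
        // will ever report itself drained.
        qq.recovery_start().recovery_done(true);

        // A reassignment pass that assigns us nothing latches the drained
        // blueprint immediately; no further recovery is needed.
        let blueprint_id = BlueprintUuid::new_v4();
        qq.reassignment_start().reassignment_done(
            SagaReassignmentDone::ReassignedAllAsOf(blueprint_id, false),
        );
        assert_eq!(qq.fully_drained_blueprint(), Some(blueprint_id));
    }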
.into_inner(); - match quiesce.state { + + let QuiesceStatus { db_claims, sagas, state } = quiesce; + + match state { QuiesceState::Undetermined => { println!("has not yet determined if it is quiescing"); } @@ -145,25 +150,70 @@ async fn quiesce_show( } } - println!("sagas running: {}", quiesce.sagas_pending.len()); - for saga in &quiesce.sagas_pending { + let SagaQuiesceStatus { + sagas_pending, + drained_blueprint_id, + first_recovery_complete, + new_sagas_allowed, + reassignment_blueprint_id, + reassignment_generation, + reassignment_pending, + recovered_blueprint_id, + recovered_reassignment_generation, + } = sagas; + + println!("saga quiesce: {:?}", new_sagas_allowed); + println!( + "drained as of blueprint: {}", + drained_blueprint_id + .map(|s| s.to_string()) + .as_deref() + .unwrap_or("none") + ); + println!( + " blueprint for last recovery pass: {}", + recovered_blueprint_id + .map(|s| s.to_string()) + .as_deref() + .unwrap_or("none") + ); + println!( + " blueprint for last reassignment pass: {}", + reassignment_blueprint_id + .map(|s| s.to_string()) + .as_deref() + .unwrap_or("none") + ); + println!( + " reassignment generation: {} (pass running: {})", + reassignment_generation, + if reassignment_pending { "yes" } else { "no " } + ); + println!(" recovered generation: {}", recovered_reassignment_generation); + println!( + " recovered at least once successfully: {}", + if first_recovery_complete { "yes" } else { "no" }, + ); + + println!(" sagas running: {}", sagas_pending.len()); + for saga in &sagas_pending { println!( - " saga {} pending since {} ({})", + " saga {} pending since {} ({})", saga.saga_id, humantime::format_rfc3339_millis(saga.time_pending.into()), saga.saga_name ); } - println!("database connections held: {}", quiesce.db_claims.len()); - for claim in &quiesce.db_claims { + println!("database connections held: {}", db_claims.len()); + for claim in &db_claims { println!( " claim {} held since {} ({} ago)", claim.id, claim.held_since, format_time_delta(Utc::now() - claim.held_since), ); - if args.verbose { + if args.stacks { println!(" acquired by:"); println!("{}", textwrap::indent(&claim.debug, " ")); } @@ -177,7 +227,7 @@ async fn quiesce_start( _token: DestructiveOperationToken, ) -> Result<(), anyhow::Error> { client.quiesce_start().await.context("quiescing Nexus")?; - quiesce_show(client, &QuiesceShowArgs { verbose: false }).await + quiesce_show(client, &QuiesceShowArgs { stacks: false }).await } fn format_duration_ms(duration: Duration) -> String { diff --git a/nexus/src/app/quiesce.rs b/nexus/src/app/quiesce.rs index 3b36ec97991..22704521227 100644 --- a/nexus/src/app/quiesce.rs +++ b/nexus/src/app/quiesce.rs @@ -32,9 +32,9 @@ impl super::Nexus { ) -> LookupResult { opctx.authorize(authz::Action::Read, &authz::QUIESCE_STATE).await?; let state = self.quiesce.state(); - let sagas_pending = self.quiesce.sagas().sagas_pending(); + let sagas = self.quiesce.sagas().status(); let db_claims = self.datastore().claims_held(); - Ok(QuiesceStatus { state, sagas_pending, db_claims }) + Ok(QuiesceStatus { state, sagas, db_claims }) } } @@ -281,7 +281,7 @@ mod test { assert!(duration_total >= duration_draining_db); assert!(duration_total >= duration_recording_quiesce); assert!(duration_total <= (after - before).to_std().unwrap()); - assert!(status.sagas_pending.is_empty()); + assert!(status.sagas.sagas_pending.is_empty()); assert!(status.db_claims.is_empty()); } @@ -355,7 +355,9 @@ mod test { quiesce_status.state, QuiesceState::DrainingSagas { .. 
} ); - assert!(quiesce_status.sagas_pending.contains_key(&demo_saga.saga_id)); + assert!( + quiesce_status.sagas.sagas_pending.contains_key(&demo_saga.saga_id) + ); // We should see at least one held database claim from the one we took // above. assert!(!quiesce_status.db_claims.is_empty()); @@ -419,7 +421,7 @@ mod test { if !matches!(rv.state, QuiesceState::DrainingDb { .. }) { return Err(CondCheckError::::NotYet); } - assert!(rv.sagas_pending.is_empty()); + assert!(rv.sagas.sagas_pending.is_empty()); // The database claim we took is still held. assert!(!rv.db_claims.is_empty()); Ok(()) diff --git a/nexus/types/src/internal_api/views.rs b/nexus/types/src/internal_api/views.rs index 972a0b92df2..df5a0ef8eda 100644 --- a/nexus/types/src/internal_api/views.rs +++ b/nexus/types/src/internal_api/views.rs @@ -8,6 +8,7 @@ use crate::inventory::BaseboardId; use crate::inventory::Caboose; use crate::inventory::CabooseWhich; use crate::inventory::Collection; +use crate::quiesce::SagaQuiesceStatus; use chrono::DateTime; use chrono::SecondsFormat; use chrono::Utc; @@ -721,12 +722,8 @@ pub struct QuiesceStatus { /// what stage of quiescing is Nexus at pub state: QuiesceState, - /// what sagas are currently running or known needing to be recovered - /// - /// This should only be non-empty when state is `Running` or - /// `WaitingForSagas`. Entries here prevent transitioning from - /// `WaitingForSagas` to `WaitingForDb`. - pub sagas_pending: IdOrdMap, + /// information about saga quiescing + pub sagas: SagaQuiesceStatus, /// what database claims are currently held (by any part of Nexus) /// diff --git a/nexus/types/src/quiesce.rs b/nexus/types/src/quiesce.rs index cd5c29201cf..94e5d8b1f7b 100644 --- a/nexus/types/src/quiesce.rs +++ b/nexus/types/src/quiesce.rs @@ -12,6 +12,8 @@ use iddqd::IdOrdMap; use omicron_common::api::external::Error; use omicron_common::api::external::Generation; use omicron_uuid_kinds::BlueprintUuid; +use schemars::JsonSchema; +use serde::Serialize; use slog::Logger; use slog::error; use slog::info; @@ -25,7 +27,8 @@ use tokio::sync::watch; /// /// This is used by Nexus quiesce to disallow creation of new sagas when we're /// trying to quiesce Nexus. -#[derive(Debug, Clone, Copy, Eq, PartialEq)] +#[derive(Debug, Clone, Copy, Eq, PartialEq, JsonSchema, Serialize)] +#[serde(rename_all = "snake_case")] enum SagasAllowed { /// New sagas may be started (normal condition) Allowed, @@ -101,11 +104,11 @@ pub struct SagaQuiesceHandle { // cancellation behavior is abysmal), but we don't want to block on a // std `Condvar` in an async thread. There are options here (e.g., // `block_on`), but they're not pleasant. - inner: watch::Sender, + inner: watch::Sender, } -#[derive(Debug, Clone)] -struct SagaQuiesceInner { +#[derive(Debug, Clone, Serialize, JsonSchema)] +pub struct SagaQuiesceStatus { /// current policy: are we allowed to *create* new sagas? 
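The new `Serialize`/`JsonSchema` derives on `SagasAllowed`, together with `rename_all = "snake_case"`, are what produce the `allowed` / `disallowed_quiesce` / `disallowed_unknown` strings that appear in the OpenAPI document later in this series. A self-contained sketch of that serde behavior (stand-in enum, not the real type):

    use serde::Serialize;

    #[derive(Serialize)]
    #[serde(rename_all = "snake_case")]
    enum SagasAllowedShape {
        Allowed,
        DisallowedQuiesce,
        DisallowedUnknown,
    }

    fn main() {
        // Unit variants serialize as plain snake_case strings.
        assert_eq!(
            serde_json::to_string(&SagasAllowedShape::Allowed).unwrap(),
            r#""allowed""#
        );
        assert_eq!(
            serde_json::to_string(&SagasAllowedShape::DisallowedQuiesce).unwrap(),
            r#""disallowed_quiesce""#
        );
    }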
/// /// This also affects re-assigning sagas from expunged Nexus instances to @@ -183,12 +186,13 @@ struct SagaQuiesceInner { /// whether a saga recovery operation is ongoing, and if one is: /// - what `reassignment_generation` was when it started /// - which blueprint id we'll be fully caught up to upon completion + #[serde(skip)] // XXX-dap recovery_pending: Option<(Generation, Option)>, } impl SagaQuiesceHandle { pub fn new(log: Logger) -> SagaQuiesceHandle { - let (inner, _) = watch::channel(SagaQuiesceInner { + let (inner, _) = watch::channel(SagaQuiesceStatus { new_sagas_allowed: SagasAllowed::DisallowedUnknown, sagas_pending: IdOrdMap::new(), first_recovery_complete: false, @@ -253,6 +257,10 @@ impl SagaQuiesceHandle { } /// Returns the blueprint id as of which sagas are fully drained + /// + /// We may become un-drained if another re-assignment pass starts for a + /// subsequent blueprint, but this fact will still be true that we *were* + /// fully drained as of expungements included up through this blueprint. pub fn fully_drained_blueprint(&self) -> Option { self.inner.borrow().drained_blueprint_id } @@ -286,8 +294,14 @@ impl SagaQuiesceHandle { .await; } + /// Returns a summary of internal state for debugging (involves a clone) + pub fn status(&self) -> SagaQuiesceStatus { + self.inner.borrow().clone() + } + /// Returns information about running sagas (involves a clone) - pub fn sagas_pending(&self) -> IdOrdMap { + #[cfg(test)] + fn sagas_pending(&self) -> IdOrdMap { self.inner.borrow().sagas_pending.clone() } @@ -593,7 +607,7 @@ impl SagaQuiesceHandle { } } -impl SagaQuiesceInner { +impl SagaQuiesceStatus { /// Returns whether sagas are fully drained /// /// This condition is not permanent. New sagas can be re-assigned to this @@ -647,7 +661,7 @@ impl SagaQuiesceInner { #[must_use = "must record the saga completion future once the saga is running"] pub struct NewlyPendingSagaRef { log: Logger, - quiesce: watch::Sender, + quiesce: watch::Sender, saga_id: steno::SagaId, init_finished: bool, } diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index dc31e3d3ca6..b3aa66a8bd6 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -7726,24 +7726,13 @@ }, "uniqueItems": true }, - "sagas_pending": { - "title": "IdOrdMap", - "description": "what sagas are currently running or known needing to be recovered\n\nThis should only be non-empty when state is `Running` or `WaitingForSagas`. 
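Because the handle keeps its state in a `tokio::sync::watch::Sender`, mutations go through `send_modify` and the new `status()` method is simply a clone of the current value. A minimal illustration of that pattern (stand-in struct, not the real `SagaQuiesceStatus`):

    use tokio::sync::watch;

    #[derive(Clone, Debug)]
    struct StatusShape {
        recovery_passes: u64,
    }

    fn main() {
        let (tx, _rx) = watch::channel(StatusShape { recovery_passes: 0 });

        // Mutate in place, notifying any watchers (cf. the `send_modify`
        // calls in `recovery_done()` and friends).
        tx.send_modify(|s| s.recovery_passes += 1);

        // Snapshot for reporting (cf. `SagaQuiesceHandle::status()`).
        let snapshot = tx.borrow().clone();
        assert_eq!(snapshot.recovery_passes, 1);
    }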
Entries here prevent transitioning from `WaitingForSagas` to `WaitingForDb`.", - "x-rust-type": { - "crate": "iddqd", - "parameters": [ - { - "$ref": "#/components/schemas/PendingSagaInfo" - } - ], - "path": "iddqd::IdOrdMap", - "version": "*" - }, - "type": "array", - "items": { - "$ref": "#/components/schemas/PendingSagaInfo" - }, - "uniqueItems": true + "sagas": { + "description": "information about saga quiescing", + "allOf": [ + { + "$ref": "#/components/schemas/SagaQuiesceStatus" + } + ] }, "state": { "description": "what stage of quiescing is Nexus at", @@ -7756,7 +7745,7 @@ }, "required": [ "db_claims", - "sagas_pending", + "sagas", "state" ] }, @@ -8250,6 +8239,97 @@ } ] }, + "SagaQuiesceStatus": { + "type": "object", + "properties": { + "drained_blueprint_id": { + "nullable": true, + "description": "blueprint id that we're \"fully drained up to\"\n\nIf this value is non-`None`, that means that:\n\n- saga creation is disallowed - no sagas are running - we have re-assigned sagas from other Nexus instances expunged in this blueprint or earlier - we have finished recovery for all those sagas (that had been assigned to us as of the re-assignment pass for this blueprint id)\n\nThis means that the only way we can wind up running another saga is if there's a new blueprint that expunges a different Nexus zone.", + "allOf": [ + { + "$ref": "#/components/schemas/TypedUuidForBlueprintKind" + } + ] + }, + "first_recovery_complete": { + "description": "whether at least one recovery pass has successfully completed\n\nWe have to track this because we can't quiesce until we know we've recovered all outstanding sagas.", + "type": "boolean" + }, + "new_sagas_allowed": { + "description": "current policy: are we allowed to *create* new sagas?\n\nThis also affects re-assigning sagas from expunged Nexus instances to ourselves. It does **not** affect saga recovery.", + "allOf": [ + { + "$ref": "#/components/schemas/SagasAllowed" + } + ] + }, + "reassignment_blueprint_id": { + "nullable": true, + "description": "blueprint id associated with last successful saga reassignment\n\nSimilar to the generation number, this is used to track whether we've accounted for all sagas for all expungements up through this target blueprint.", + "allOf": [ + { + "$ref": "#/components/schemas/TypedUuidForBlueprintKind" + } + ] + }, + "reassignment_generation": { + "description": "generation number for the saga reassignment\n\nThis gets bumped whenever a saga reassignment operation completes that may have re-assigned us some sagas. It's used to keep track of when we've recovered all sagas that could be assigned to us.", + "allOf": [ + { + "$ref": "#/components/schemas/Generation" + } + ] + }, + "reassignment_pending": { + "description": "whether there is a saga reassignment operation happening\n\nThese operatinos may assign new sagas to Nexus that must be recovered and completed before quiescing can finish.", + "type": "boolean" + }, + "recovered_blueprint_id": { + "nullable": true, + "description": "blueprint id that saga recovery has \"caught up to\"\n\nThis means that we have finished recovering any sagas that were re-assigned to us due to expungements of other Nexus zones up through this blueprint. Put differently: we know that we will never be assigned more sagas due to expungement unless the target blueprint changes past this one.\n\nThis does not mean that we've fully drained all sagas up through this blueprint. 
There may still be sagas running.", + "allOf": [ + { + "$ref": "#/components/schemas/TypedUuidForBlueprintKind" + } + ] + }, + "recovered_reassignment_generation": { + "description": "\"saga reassignment generation number\" that was \"caught up to\" by the last recovery pass\n\nThis is used with `reassignment_generation` to help us know when we've recovered all the sagas that may have been assigned to us during a given reassignment pass. See `reassignment_done()` for details.", + "allOf": [ + { + "$ref": "#/components/schemas/Generation" + } + ] + }, + "sagas_pending": { + "title": "IdOrdMap", + "description": "list of sagas we need to wait to complete before quiescing\n\nThese are basically running sagas. They may have been created in this Nexus process lifetime or created in another process and then recovered in this one.", + "x-rust-type": { + "crate": "iddqd", + "parameters": [ + { + "$ref": "#/components/schemas/PendingSagaInfo" + } + ], + "path": "iddqd::IdOrdMap", + "version": "*" + }, + "type": "array", + "items": { + "$ref": "#/components/schemas/PendingSagaInfo" + }, + "uniqueItems": true + } + }, + "required": [ + "first_recovery_complete", + "new_sagas_allowed", + "reassignment_generation", + "reassignment_pending", + "recovered_reassignment_generation", + "sagas_pending" + ] + }, "SagaResultsPage": { "description": "A single page of results", "type": "object", @@ -8357,6 +8437,32 @@ } ] }, + "SagasAllowed": { + "description": "Policy determining whether new sagas are allowed to be started\n\nThis is used by Nexus quiesce to disallow creation of new sagas when we're trying to quiesce Nexus.", + "oneOf": [ + { + "description": "New sagas may be started (normal condition)", + "type": "string", + "enum": [ + "allowed" + ] + }, + { + "description": "New sagas may not be started because we're quiescing or quiesced", + "type": "string", + "enum": [ + "disallowed_quiesce" + ] + }, + { + "description": "New sagas may not be started because we just started up and haven't determined if we're quiescing yet", + "type": "string", + "enum": [ + "disallowed_unknown" + ] + } + ] + }, "ServerId": { "description": "A unique ID for a Clickhouse Server", "type": "integer", From 7e271d12e5479e258bc4f44183f11d1907657468 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Wed, 27 Aug 2025 15:41:53 -0700 Subject: [PATCH 21/22] omdb output tweaks --- dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs b/dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs index a886c6bc582..e0aabe25c5b 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs @@ -162,9 +162,10 @@ async fn quiesce_show( recovered_reassignment_generation, } = sagas; - println!("saga quiesce: {:?}", new_sagas_allowed); + println!("saga quiesce:"); + println!(" new sagas: {:?}", new_sagas_allowed); println!( - "drained as of blueprint: {}", + " drained as of blueprint: {}", drained_blueprint_id .map(|s| s.to_string()) .as_deref() @@ -187,7 +188,7 @@ async fn quiesce_show( println!( " reassignment generation: {} (pass running: {})", reassignment_generation, - if reassignment_pending { "yes" } else { "no " } + if reassignment_pending { "yes" } else { "no" } ); println!(" recovered generation: {}", recovered_reassignment_generation); println!( From 73fff4950746fc7af6eb15089a8df4ebd501f17c Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Thu, 28 Aug 2025 09:40:14 -0700 Subject: [PATCH 22/22] 
review feedback --- dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs | 15 +++++++- nexus/types/src/quiesce.rs | 36 +++++++++++++------- openapi/nexus-internal.json | 35 +++++++++++++++++++ 3 files changed, 72 insertions(+), 14 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs b/dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs index e0aabe25c5b..73720272521 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus/quiesce.rs @@ -11,6 +11,7 @@ use chrono::TimeDelta; use chrono::Utc; use clap::Args; use clap::Subcommand; +use nexus_client::types::PendingRecovery; use nexus_client::types::QuiesceState; use nexus_client::types::QuiesceStatus; use nexus_client::types::SagaQuiesceStatus; @@ -160,6 +161,7 @@ async fn quiesce_show( reassignment_pending, recovered_blueprint_id, recovered_reassignment_generation, + recovery_pending, } = sagas; println!("saga quiesce:"); @@ -172,7 +174,7 @@ async fn quiesce_show( .unwrap_or("none") ); println!( - " blueprint for last recovery pass: {}", + " blueprint for last completed recovery pass: {}", recovered_blueprint_id .map(|s| s.to_string()) .as_deref() @@ -195,6 +197,17 @@ async fn quiesce_show( " recovered at least once successfully: {}", if first_recovery_complete { "yes" } else { "no" }, ); + print!(" recovery pending: "); + if let Some(PendingRecovery { generation, blueprint_id }) = recovery_pending + { + println!( + "yes (generation {}, blueprint id {})", + generation, + blueprint_id.map(|s| s.to_string()).as_deref().unwrap_or("none") + ); + } else { + println!("no"); + } println!(" sagas running: {}", sagas_pending.len()); for saga in &sagas_pending { diff --git a/nexus/types/src/quiesce.rs b/nexus/types/src/quiesce.rs index 94e5d8b1f7b..532322917a1 100644 --- a/nexus/types/src/quiesce.rs +++ b/nexus/types/src/quiesce.rs @@ -183,11 +183,18 @@ pub struct SagaQuiesceStatus { /// there's a new blueprint that expunges a different Nexus zone. 
drained_blueprint_id: Option, - /// whether a saga recovery operation is ongoing, and if one is: - /// - what `reassignment_generation` was when it started - /// - which blueprint id we'll be fully caught up to upon completion - #[serde(skip)] // XXX-dap - recovery_pending: Option<(Generation, Option)>, + /// If a recovery pass is ongoing, a snapshot of reassignment state when it + /// started (which reflects what we'll be caught up to when it finishes) + recovery_pending: Option, +} + +/// Snapshot of reassignment state when a recovery pass started +#[derive(Debug, Clone, Serialize, JsonSchema)] +struct PendingRecovery { + /// what `reassignment_generation` was when this recovery started + generation: Generation, + /// which blueprint id we'd be fully caught up to upon completion + blueprint_id: Option, } impl SagaQuiesceHandle { @@ -251,7 +258,7 @@ impl SagaQuiesceHandle { } }; - q.latch_drained_blueprint_id(); + q.latch_blueprint_if_drained(); changed }); } @@ -423,7 +430,7 @@ impl SagaQuiesceHandle { } } - q.latch_drained_blueprint_id(); + q.latch_blueprint_if_drained(); }); } @@ -468,8 +475,10 @@ impl SagaQuiesceHandle { "recovery_start() called twice without intervening \ recovery_done() (concurrent calls to recover()?)", ); - q.recovery_pending = - Some((q.reassignment_generation, q.reassignment_blueprint_id)); + q.recovery_pending = Some(PendingRecovery { + generation: q.reassignment_generation, + blueprint_id: q.reassignment_blueprint_id, + }); }); info!(&self.log, "saga recovery pass starting"); @@ -480,7 +489,8 @@ impl SagaQuiesceHandle { fn recovery_done(&self, success: bool) { let log = self.log.clone(); self.inner.send_modify(|q| { - let Some((generation, blueprint_id)) = q.recovery_pending.take() + let Some(PendingRecovery { generation, blueprint_id }) = + q.recovery_pending.take() else { panic!("cannot finish saga recovery when it was not running"); }; @@ -495,7 +505,7 @@ impl SagaQuiesceHandle { q.recovered_blueprint_id = blueprint_id; q.recovered_reassignment_generation = generation; q.first_recovery_complete = true; - q.latch_drained_blueprint_id(); + q.latch_blueprint_if_drained(); } else { info!(&log, "saga recovery pass failed"); } @@ -636,7 +646,7 @@ impl SagaQuiesceStatus { /// result of that blueprint. The rest of our bookkeeping would reflect /// that we're not fully drained, which is true, but we still want to be /// able to report that we were fully drained _as of this blueprint_. 
- fn latch_drained_blueprint_id(&mut self) { + fn latch_blueprint_if_drained(&mut self) { if self.is_fully_drained() { // If we've recovered up through a given blueprint id and are now // fully drained, then we have definitely fully drained up through @@ -705,7 +715,7 @@ impl NewlyPendingSagaRef { q.sagas_pending .remove(&saga_id) .expect("saga should have been running"); - q.latch_drained_blueprint_id(); + q.latch_blueprint_if_drained(); }); rv }); diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index b3aa66a8bd6..e54c975e8ed 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -6527,6 +6527,32 @@ "by_baseboard" ] }, + "PendingRecovery": { + "description": "Snapshot of reassignment state when a recovery pass started", + "type": "object", + "properties": { + "blueprint_id": { + "nullable": true, + "description": "which blueprint id we'd be fully caught up to upon completion", + "allOf": [ + { + "$ref": "#/components/schemas/TypedUuidForBlueprintKind" + } + ] + }, + "generation": { + "description": "what `reassignment_generation` was when this recovery started", + "allOf": [ + { + "$ref": "#/components/schemas/Generation" + } + ] + } + }, + "required": [ + "generation" + ] + }, "PendingSagaInfo": { "description": "Describes a pending saga (for debugging why quiesce is stuck)", "type": "object", @@ -8301,6 +8327,15 @@ } ] }, + "recovery_pending": { + "nullable": true, + "description": "If a recovery pass is ongoing, a snapshot of reassignment state when it started (which reflects what we'll be caught up to when it finishes)", + "allOf": [ + { + "$ref": "#/components/schemas/PendingRecovery" + } + ] + }, "sagas_pending": { "title": "IdOrdMap", "description": "list of sagas we need to wait to complete before quiescing\n\nThese are basically running sagas. They may have been created in this Nexus process lifetime or created in another process and then recovered in this one.",
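Per the schema above, `generation` is the only required field of `PendingRecovery`; `blueprint_id` is nullable (typically because no reassignment pass has yet recorded a blueprint), and `recovery_pending` itself is nullable. A round-trip sketch of that JSON shape using stand-in types (illustrative only, not the generated client):

    use serde::Deserialize;

    #[derive(Deserialize, Debug)]
    struct PendingRecoveryShape {
        generation: u64,
        #[serde(default)]
        blueprint_id: Option<String>,
    }

    fn main() {
        // A missing and an explicitly-null blueprint id both deserialize to
        // `None`; only `generation` is required.
        let a: PendingRecoveryShape =
            serde_json::from_str(r#"{"generation": 3}"#).unwrap();
        let b: PendingRecoveryShape =
            serde_json::from_str(r#"{"generation": 3, "blueprint_id": null}"#)
                .unwrap();
        assert!(a.blueprint_id.is_none());
        assert!(b.blueprint_id.is_none());
        assert_eq!(a.generation, 3);
    }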