Skip to content

Commit 174b829

Browse files
authored
[39/n] prepare sled-agent reconciler for honoring mupdate overrides (#8688)
This PR prepares the sled-agent config reconciler to honor mupdate overrides. In particular, the logic that requires bouncing zones needs to be updated to consider zone image locations after mupdate overrides have been considered, not before. Doing this required some refactoring. The biggest one is that looking up zone image sources is no longer done through the image resolver and/or within sled-agent's `services.rs`, but rather by gathering and querying the resolver status within the config reconciler. This is a nice improvement overall, particularly because it means we grab the lock once per reconciler run rather than each time we need to look up a zone's image source. We do not actually honor the mupdate override yet, though all the pieces are now in place. We'll combine the PRs to honor the override and update the blueprint logic into one, within #8456.
1 parent 1107dd7 commit 174b829

File tree

21 files changed

+1349
-453
lines changed

21 files changed

+1349
-453
lines changed

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

common/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ pub mod progenitor_operation_retry;
3131
pub mod snake_case_result;
3232
pub mod update;
3333
pub mod vlan;
34+
pub mod zone_images;
3435
pub mod zpool_name;
3536

3637
/// A type that allows adding file and line numbers to log messages

common/src/zone_images.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
use camino::Utf8PathBuf;
6+
7+
/// Places to look for a zone's image.
8+
#[derive(Clone, Debug, PartialEq, Eq)]
9+
pub struct ZoneImageFileSource {
10+
/// The file name to look for.
11+
pub file_name: String,
12+
13+
/// The paths to look for the zone image in.
14+
///
15+
/// This represents a high-confidence belief, but not a guarantee, that the
16+
/// zone image will be found in one of these locations.
17+
pub search_paths: Vec<Utf8PathBuf>,
18+
}

illumos-utils/src/running_zone.rs

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ use camino_tempfile::Utf8TempDir;
1919
use debug_ignore::DebugIgnore;
2020
use ipnetwork::IpNetwork;
2121
use omicron_common::backoff;
22+
use omicron_common::zone_images::ZoneImageFileSource;
2223
use omicron_uuid_kinds::OmicronZoneUuid;
2324
pub use oxlog::is_oxide_smf_log_file;
2425
use slog::{Logger, error, info, o, warn};
@@ -1338,19 +1339,6 @@ impl<'a> ZoneBuilder<'a> {
13381339
}
13391340
}
13401341

1341-
/// Places to look for a zone's image.
1342-
#[derive(Clone, Debug, PartialEq, Eq)]
1343-
pub struct ZoneImageFileSource {
1344-
/// The file name to look for.
1345-
pub file_name: String,
1346-
1347-
/// The paths to look for the zone image in.
1348-
///
1349-
/// This represents a high-confidence belief, but not a guarantee, that the
1350-
/// zone image will be found in one of these locations.
1351-
pub search_paths: Vec<Utf8PathBuf>,
1352-
}
1353-
13541342
/// Return true if the service with the given FMRI appears to be an
13551343
/// Oxide-managed service.
13561344
pub fn is_oxide_smf_service(fmri: impl AsRef<str>) -> bool {

nexus/inventory/src/examples.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -889,6 +889,7 @@ pub fn zone_image_resolver(
889889
},
890890
},
891891
},
892+
image_directory_override: None,
892893
};
893894

894895
status.to_inventory()

sled-agent/config-reconciler/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ proptest.workspace = true
5454
schemars.workspace = true
5555
scopeguard.workspace = true
5656
serde_json.workspace = true
57+
sled-agent-zone-images-examples.workspace = true
5758
sled-storage = { workspace = true, features = ["testing"] }
5859
test-strategy.workspace = true
5960
xshell.workspace = true

sled-agent/config-reconciler/src/host_phase_2.rs

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
//! partitions.
77
88
use crate::InternalDisks;
9+
use crate::ResolverStatusExt;
910
use crate::SledAgentArtifactStore;
1011
use camino::Utf8Path;
1112
use camino::Utf8PathBuf;
@@ -15,9 +16,11 @@ use nexus_sled_agent_shared::inventory::BootPartitionDetails;
1516
use nexus_sled_agent_shared::inventory::HostPhase2DesiredContents;
1617
use nexus_sled_agent_shared::inventory::HostPhase2DesiredSlots;
1718
use omicron_common::disk::M2Slot;
19+
use sled_agent_types::zone_images::ResolverStatus;
1820
use sled_hardware::PooledDiskError;
1921
use slog::Logger;
2022
use slog::info;
23+
use slog::o;
2124
use slog::warn;
2225
use slog_error_chain::InlineErrorChain;
2326
use std::io;
@@ -128,27 +131,35 @@ pub(crate) struct BootPartitionReconciler {
128131
impl BootPartitionReconciler {
129132
pub(crate) async fn reconcile<T: SledAgentArtifactStore>(
130133
&mut self,
134+
resolver_status: &ResolverStatus,
131135
internal_disks: &InternalDisks,
132136
desired: &HostPhase2DesiredSlots,
133137
artifact_store: &T,
134-
// TODO We should also consider the mupdate override, once that work
135-
// lands. Never overwrite the boot partitions while we're in that state.
136138
log: &Logger,
137139
) -> BootPartitionContents {
140+
let prepared_slot_a = resolver_status.prepare_host_phase_2_contents(
141+
&log.new(o!("slot" => "A")),
142+
&desired.slot_a,
143+
);
144+
let prepared_slot_b = resolver_status.prepare_host_phase_2_contents(
145+
&log.new(o!("slot" => "B")),
146+
&desired.slot_b,
147+
);
148+
138149
let (slot_a, slot_b) = futures::join!(
139150
Self::reconcile_slot::<_, RawDiskReader, RawDiskWriter>(
140151
M2Slot::A,
141152
internal_disks,
142153
&mut self.cached_slot_a,
143-
&desired.slot_a,
154+
prepared_slot_a.desired_contents(),
144155
artifact_store,
145156
log,
146157
),
147158
Self::reconcile_slot::<_, RawDiskReader, RawDiskWriter>(
148159
M2Slot::B,
149160
internal_disks,
150161
&mut self.cached_slot_b,
151-
&desired.slot_b,
162+
prepared_slot_b.desired_contents(),
152163
artifact_store,
153164
log,
154165
),
@@ -337,6 +348,27 @@ impl BootPartitionReconciler {
337348
}
338349
}
339350

351+
pub enum HostPhase2PreparedContents<'a> {
352+
/// No mupdate override was found, so the desired host phase 2 contents were
353+
/// used.
354+
NoMupdateOverride(&'a HostPhase2DesiredContents),
355+
356+
/// A mupdate override was found, so the contents are always set to
357+
/// `CurrentContents`.
358+
WithMupdateOverride,
359+
}
360+
361+
impl<'a> HostPhase2PreparedContents<'a> {
362+
pub fn desired_contents(&self) -> &'a HostPhase2DesiredContents {
363+
match self {
364+
Self::NoMupdateOverride(contents) => contents,
365+
Self::WithMupdateOverride => {
366+
&HostPhase2DesiredContents::CurrentContents
367+
}
368+
}
369+
}
370+
}
371+
340372
/// Traits to allow unit tests to run without accessing real raw disks; these
341373
/// are trivially implemented by the real types below and by unit tests inside
342374
/// the test module.

sled-agent/config-reconciler/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ mod handle;
5353
mod host_phase_2;
5454
mod internal_disks;
5555
mod ledger;
56+
mod mupdate_override;
5657
mod raw_disks;
5758
mod reconciler_task;
5859
mod sled_agent_facilities;
@@ -74,6 +75,7 @@ pub use internal_disks::InternalDisksWithBootDisk;
7475
pub use ledger::LedgerArtifactConfigError;
7576
pub use ledger::LedgerNewConfigError;
7677
pub use ledger::LedgerTaskError;
78+
pub use mupdate_override::ResolverStatusExt;
7779
pub use raw_disks::RawDisksSender;
7880
pub use reconciler_task::CurrentlyManagedZpools;
7981
pub use reconciler_task::CurrentlyManagedZpoolsReceiver;
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
//! Behaviors of the config reconciler in the presence of mupdate overrides.
6+
7+
use crate::InternalDisks;
8+
use crate::host_phase_2::HostPhase2PreparedContents;
9+
use camino::Utf8PathBuf;
10+
use nexus_sled_agent_shared::inventory::HostPhase2DesiredContents;
11+
use nexus_sled_agent_shared::inventory::OmicronZoneConfig;
12+
use nexus_sled_agent_shared::inventory::OmicronZoneImageSource;
13+
use nexus_sled_agent_shared::inventory::ZoneKind;
14+
use omicron_common::zone_images::ZoneImageFileSource;
15+
use sled_agent_types::zone_images::OmicronZoneFileSource;
16+
use sled_agent_types::zone_images::OmicronZoneImageLocation;
17+
use sled_agent_types::zone_images::PreparedOmicronZone;
18+
use sled_agent_types::zone_images::RAMDISK_IMAGE_PATH;
19+
use sled_agent_types::zone_images::ResolverStatus;
20+
use sled_agent_types::zone_images::ZoneImageLocationError;
21+
use slog::error;
22+
use slog_error_chain::InlineErrorChain;
23+
use tufaceous_artifact::ArtifactHash;
24+
25+
/// An extension trait for `ResolverStatus`.
26+
///
27+
/// This trait refers to types that aren't available within `sled-agent-types`.
28+
pub trait ResolverStatusExt {
29+
/// Look up the file source for an Omicron zone.
30+
fn omicron_file_source(
31+
&self,
32+
log: &slog::Logger,
33+
zone_kind: ZoneKind,
34+
image_source: &OmicronZoneImageSource,
35+
internal_disks: &InternalDisks,
36+
) -> OmicronZoneFileSource;
37+
38+
/// Prepare an Omicron zone for installation.
39+
fn prepare_omicron_zone<'a>(
40+
&self,
41+
log: &slog::Logger,
42+
zone_config: &'a OmicronZoneConfig,
43+
internal_disks: &InternalDisks,
44+
) -> PreparedOmicronZone<'a> {
45+
let file_source = self.omicron_file_source(
46+
log,
47+
zone_config.zone_type.kind(),
48+
&zone_config.image_source,
49+
internal_disks,
50+
);
51+
PreparedOmicronZone::new(zone_config, file_source)
52+
}
53+
54+
fn prepare_host_phase_2_contents<'a>(
55+
&self,
56+
log: &slog::Logger,
57+
desired: &'a HostPhase2DesiredContents,
58+
) -> HostPhase2PreparedContents<'a>;
59+
}
60+
61+
impl ResolverStatusExt for ResolverStatus {
62+
fn omicron_file_source(
63+
&self,
64+
log: &slog::Logger,
65+
zone_kind: ZoneKind,
66+
image_source: &OmicronZoneImageSource,
67+
internal_disks: &InternalDisks,
68+
) -> OmicronZoneFileSource {
69+
match image_source {
70+
OmicronZoneImageSource::InstallDataset => {
71+
let file_name = zone_kind.artifact_in_install_dataset();
72+
73+
// There's always at least one image path (the RAM disk below).
74+
let mut search_paths = Vec::with_capacity(1);
75+
76+
// Inject an image path if requested by a test.
77+
if let Some(path) = &self.image_directory_override {
78+
search_paths.push(path.clone());
79+
};
80+
81+
// Any zones not part of the RAM disk are managed via the zone
82+
// manifest.
83+
let hash = install_dataset_hash(
84+
log,
85+
self,
86+
zone_kind,
87+
internal_disks,
88+
|path| search_paths.push(path),
89+
);
90+
91+
// Look for the image in the RAM disk as a fallback. Note that
92+
// install dataset images are not stored on the RAM disk in
93+
// production, just in development or test workflows.
94+
search_paths.push(Utf8PathBuf::from(RAMDISK_IMAGE_PATH));
95+
96+
OmicronZoneFileSource {
97+
location: OmicronZoneImageLocation::InstallDataset { hash },
98+
file_source: ZoneImageFileSource {
99+
file_name: file_name.to_owned(),
100+
search_paths,
101+
},
102+
}
103+
}
104+
OmicronZoneImageSource::Artifact { hash } => {
105+
// TODO: implement mupdate override here.
106+
//
107+
// Search both artifact datasets. This iterator starts with the
108+
// dataset for the boot disk (if it exists), and then is followed
109+
// by all other disks.
110+
let search_paths =
111+
internal_disks.all_artifact_datasets().collect();
112+
OmicronZoneFileSource {
113+
// TODO: with mupdate overrides, return InstallDataset here
114+
location: OmicronZoneImageLocation::Artifact {
115+
hash: Ok(*hash),
116+
},
117+
file_source: ZoneImageFileSource {
118+
file_name: hash.to_string(),
119+
search_paths,
120+
},
121+
}
122+
}
123+
}
124+
}
125+
126+
fn prepare_host_phase_2_contents<'a>(
127+
&self,
128+
#[expect(unused)] log: &slog::Logger,
129+
desired: &'a HostPhase2DesiredContents,
130+
) -> HostPhase2PreparedContents<'a> {
131+
// TODO: Implement mupdate override logic.
132+
HostPhase2PreparedContents::NoMupdateOverride(desired)
133+
}
134+
}
135+
136+
fn install_dataset_hash<F>(
137+
log: &slog::Logger,
138+
resolver_status: &ResolverStatus,
139+
zone_kind: ZoneKind,
140+
internal_disks: &InternalDisks,
141+
mut search_paths_cb: F,
142+
) -> Result<ArtifactHash, ZoneImageLocationError>
143+
where
144+
F: FnMut(Utf8PathBuf),
145+
{
146+
// XXX: we ask for the boot zpool to be passed in here. But
147+
// `ResolverStatus` also caches the boot zpool. How should we
148+
// reconcile the two?
149+
let hash = if let Some(path) = internal_disks.boot_disk_install_dataset() {
150+
let hash = resolver_status.zone_manifest.zone_hash(zone_kind);
151+
match hash {
152+
Ok(hash) => {
153+
search_paths_cb(path);
154+
Ok(hash)
155+
}
156+
Err(error) => {
157+
error!(
158+
log,
159+
"zone {} not found in the boot disk zone manifest, \
160+
not returning it as a source",
161+
zone_kind.report_str();
162+
"file_name" => zone_kind.artifact_in_install_dataset(),
163+
"error" => InlineErrorChain::new(&error),
164+
);
165+
Err(ZoneImageLocationError::ZoneHash(error))
166+
}
167+
}
168+
} else {
169+
// The boot disk is not available, so we cannot add the
170+
// install dataset path from it.
171+
error!(
172+
log,
173+
"boot disk install dataset not available, \
174+
not returning it as a source";
175+
"zone_kind" => zone_kind.report_str(),
176+
);
177+
Err(ZoneImageLocationError::BootDiskMissing)
178+
};
179+
hash
180+
}
181+
182+
// Tests for this module live inside sled-agent-zone-images. (This is a bit
183+
// weird and should probably be addressed at some point.)

0 commit comments

Comments
 (0)