Skip to content

Commit 1ee1183

Browse files
authored
[reconfigurator] Fix race in chicken switch loader (#8998)
The chicken switch loading task exposes a watch channel that downstream consumers can use to read the current value. It initializes that channel to the `::default()` value of chicken switches and then reads the most recent value from the db when it's activated, but that introduces a race where consumers can see an incorrect value: if there is a chicken switch value in the db, they can see the `::default()` _before_ the first activation, allowing them to ignore the actual value in the db in favor of whatever the default set is. This PR closes the race window by setting the initial watch channel value to `NotYetLoaded`, and then replacing that during the first activation. This fell out of trying to fix up tests on #8980, but is a legit bug and easily separable from that work. So here it is!
1 parent c1e0c5b commit 1ee1183

File tree

2 files changed

+110
-31
lines changed

2 files changed

+110
-31
lines changed

nexus/src/app/background/tasks/blueprint_planner.rs

Lines changed: 28 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
//! Background task for automatic update planning.
66
7+
use super::chicken_switches::ReconfiguratorChickenSwitchesLoaderState;
78
use crate::app::background::BackgroundTask;
89
use chrono::Utc;
910
use futures::future::BoxFuture;
@@ -13,7 +14,6 @@ use nexus_db_queries::db::DataStore;
1314
use nexus_reconfigurator_planning::planner::Planner;
1415
use nexus_reconfigurator_planning::planner::PlannerRng;
1516
use nexus_reconfigurator_preparation::PlanningInputFromDb;
16-
use nexus_types::deployment::ReconfiguratorChickenSwitchesView;
1717
use nexus_types::deployment::{Blueprint, BlueprintTarget};
1818
use nexus_types::internal_api::background::BlueprintPlannerStatus;
1919
use omicron_common::api::external::LookupType;
@@ -26,7 +26,7 @@ use tokio::sync::watch::{self, Receiver, Sender};
2626
/// Background task that runs the update planner.
2727
pub struct BlueprintPlanner {
2828
datastore: Arc<DataStore>,
29-
rx_chicken_switches: Receiver<ReconfiguratorChickenSwitchesView>,
29+
rx_chicken_switches: Receiver<ReconfiguratorChickenSwitchesLoaderState>,
3030
rx_inventory: Receiver<Option<CollectionUuid>>,
3131
rx_blueprint: Receiver<Option<Arc<(BlueprintTarget, Blueprint)>>>,
3232
tx_blueprint: Sender<Option<Arc<(BlueprintTarget, Blueprint)>>>,
@@ -35,7 +35,7 @@ pub struct BlueprintPlanner {
3535
impl BlueprintPlanner {
3636
pub fn new(
3737
datastore: Arc<DataStore>,
38-
rx_chicken_switches: Receiver<ReconfiguratorChickenSwitchesView>,
38+
rx_chicken_switches: Receiver<ReconfiguratorChickenSwitchesLoaderState>,
3939
rx_inventory: Receiver<Option<CollectionUuid>>,
4040
rx_blueprint: Receiver<Option<Arc<(BlueprintTarget, Blueprint)>>>,
4141
) -> Self {
@@ -59,7 +59,21 @@ impl BlueprintPlanner {
5959
/// If it is different from the current target blueprint,
6060
/// save it and make it the current target.
6161
pub async fn plan(&mut self, opctx: &OpContext) -> BlueprintPlannerStatus {
62-
let switches = self.rx_chicken_switches.borrow_and_update().clone();
62+
// Refuse to run if we haven't had a chance to load the chicken switches
63+
// from the database yet. (There might not be any in the db, which is
64+
// fine! But the loading task needs to have a chance to check.)
65+
let switches = match &*self.rx_chicken_switches.borrow_and_update() {
66+
ReconfiguratorChickenSwitchesLoaderState::NotYetLoaded => {
67+
debug!(
68+
opctx.log,
69+
"chicken switches not yet loaded; doing nothing"
70+
);
71+
return BlueprintPlannerStatus::Disabled;
72+
}
73+
ReconfiguratorChickenSwitchesLoaderState::Loaded(switches) => {
74+
switches.clone()
75+
}
76+
};
6377
if !switches.switches.planner_enabled {
6478
debug!(&opctx.log, "blueprint planning disabled, doing nothing");
6579
return BlueprintPlannerStatus::Disabled;
@@ -281,7 +295,7 @@ mod test {
281295
use nexus_test_utils_macros::nexus_test;
282296
use nexus_types::deployment::{
283297
PendingMgsUpdates, PlannerChickenSwitches,
284-
ReconfiguratorChickenSwitches,
298+
ReconfiguratorChickenSwitches, ReconfiguratorChickenSwitchesView,
285299
};
286300
use omicron_uuid_kinds::OmicronZoneUuid;
287301

@@ -326,14 +340,16 @@ mod test {
326340

327341
// Enable the planner
328342
let (_tx, chicken_switches_collector_rx) =
329-
watch::channel(ReconfiguratorChickenSwitchesView {
330-
version: 1,
331-
switches: ReconfiguratorChickenSwitches {
332-
planner_enabled: true,
333-
planner_switches: PlannerChickenSwitches::default(),
343+
watch::channel(ReconfiguratorChickenSwitchesLoaderState::Loaded(
344+
ReconfiguratorChickenSwitchesView {
345+
version: 1,
346+
switches: ReconfiguratorChickenSwitches {
347+
planner_enabled: true,
348+
planner_switches: PlannerChickenSwitches::default(),
349+
},
350+
time_modified: now_db_precision(),
334351
},
335-
time_modified: now_db_precision(),
336-
});
352+
));
337353

338354
// Finally, spin up the planner background task.
339355
let mut planner = BlueprintPlanner::new(

nexus/src/app/background/tasks/chicken_switches.rs

Lines changed: 82 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -15,24 +15,32 @@ use serde_json::json;
1515
use std::sync::Arc;
1616
use tokio::sync::watch;
1717

18+
/// Enum that allows downstream tasks to know whether this task has had a chance
19+
/// to read the current chicken switches from the database.
20+
#[derive(Debug, Clone, PartialEq, Eq)]
21+
pub enum ReconfiguratorChickenSwitchesLoaderState {
22+
NotYetLoaded,
23+
Loaded(ReconfiguratorChickenSwitchesView),
24+
}
25+
1826
/// Background task that tracks reconfigurator chicken switches from the DB
1927
pub struct ChickenSwitchesLoader {
2028
datastore: Arc<DataStore>,
21-
tx: watch::Sender<ReconfiguratorChickenSwitchesView>,
22-
rx: watch::Receiver<ReconfiguratorChickenSwitchesView>,
29+
tx: watch::Sender<ReconfiguratorChickenSwitchesLoaderState>,
2330
}
2431

2532
impl ChickenSwitchesLoader {
2633
pub fn new(datastore: Arc<DataStore>) -> Self {
27-
let (tx, rx) =
28-
watch::channel(ReconfiguratorChickenSwitchesView::default());
29-
Self { datastore, tx, rx }
34+
let (tx, _rx) = watch::channel(
35+
ReconfiguratorChickenSwitchesLoaderState::NotYetLoaded,
36+
);
37+
Self { datastore, tx }
3038
}
3139

3240
pub fn watcher(
3341
&self,
34-
) -> watch::Receiver<ReconfiguratorChickenSwitchesView> {
35-
self.rx.clone()
42+
) -> watch::Receiver<ReconfiguratorChickenSwitchesLoaderState> {
43+
self.tx.subscribe()
3644
}
3745
}
3846

@@ -55,15 +63,21 @@ impl BackgroundTask for ChickenSwitchesLoader {
5563
json!({ "error": message })
5664
}
5765
Ok(switches) => {
58-
let switches = switches.unwrap_or_default();
66+
let switches =
67+
ReconfiguratorChickenSwitchesLoaderState::Loaded(
68+
switches.unwrap_or_default(),
69+
);
5970
let updated = self.tx.send_if_modified(|s| {
6071
if *s != switches {
61-
*s = switches;
72+
*s = switches.clone();
6273
return true;
6374
}
6475
false
6576
});
66-
debug!(opctx.log, "chicken switches load complete");
77+
debug!(
78+
opctx.log, "chicken switches load complete";
79+
"switches" => ?switches,
80+
);
6781
json!({ "chicken_switches_updated": updated })
6882
}
6983
}
@@ -94,14 +108,35 @@ mod test {
94108
);
95109

96110
let mut task = ChickenSwitchesLoader::new(datastore.clone());
111+
112+
// Initial state should be `NotYetLoaded`.
113+
let mut rx = task.watcher();
114+
assert_eq!(
115+
*rx.borrow_and_update(),
116+
ReconfiguratorChickenSwitchesLoaderState::NotYetLoaded
117+
);
118+
119+
// We haven't inserted anything into the DB, so the initial activation
120+
// should populate the channel with our default values.
121+
let default_switches = ReconfiguratorChickenSwitchesView::default();
97122
let out = task.activate(&opctx).await;
98-
assert_eq!(out["chicken_switches_updated"], false);
123+
assert_eq!(out["chicken_switches_updated"], true);
124+
assert!(rx.has_changed().unwrap());
125+
assert_eq!(
126+
*rx.borrow_and_update(),
127+
ReconfiguratorChickenSwitchesLoaderState::Loaded(
128+
default_switches.clone()
129+
)
130+
);
131+
132+
// Insert an initial set of switches.
133+
let expected_switches = ReconfiguratorChickenSwitches {
134+
planner_enabled: !default_switches.switches.planner_enabled,
135+
planner_switches: PlannerChickenSwitches::default(),
136+
};
99137
let switches = ReconfiguratorChickenSwitchesParam {
100138
version: 1,
101-
switches: ReconfiguratorChickenSwitches {
102-
planner_enabled: true,
103-
planner_switches: PlannerChickenSwitches::default(),
104-
},
139+
switches: expected_switches,
105140
};
106141
datastore
107142
.reconfigurator_chicken_switches_insert_latest_version(
@@ -111,14 +146,31 @@ mod test {
111146
.unwrap();
112147
let out = task.activate(&opctx).await;
113148
assert_eq!(out["chicken_switches_updated"], true);
149+
assert!(rx.has_changed().unwrap());
150+
{
151+
let view = match rx.borrow_and_update().clone() {
152+
ReconfiguratorChickenSwitchesLoaderState::NotYetLoaded => {
153+
panic!("unexpected value")
154+
}
155+
ReconfiguratorChickenSwitchesLoaderState::Loaded(view) => view,
156+
};
157+
assert_eq!(view.version, 1);
158+
assert_eq!(view.switches, expected_switches);
159+
}
160+
161+
// Activating again should not change things.
114162
let out = task.activate(&opctx).await;
115163
assert_eq!(out["chicken_switches_updated"], false);
164+
assert!(!rx.has_changed().unwrap());
165+
166+
// Insert a new version.
167+
let expected_switches = ReconfiguratorChickenSwitches {
168+
planner_enabled: !expected_switches.planner_enabled,
169+
planner_switches: PlannerChickenSwitches::default(),
170+
};
116171
let switches = ReconfiguratorChickenSwitchesParam {
117172
version: 2,
118-
switches: ReconfiguratorChickenSwitches {
119-
planner_enabled: false,
120-
planner_switches: PlannerChickenSwitches::default(),
121-
},
173+
switches: expected_switches,
122174
};
123175
datastore
124176
.reconfigurator_chicken_switches_insert_latest_version(
@@ -128,5 +180,16 @@ mod test {
128180
.unwrap();
129181
let out = task.activate(&opctx).await;
130182
assert_eq!(out["chicken_switches_updated"], true);
183+
assert!(rx.has_changed().unwrap());
184+
{
185+
let view = match rx.borrow_and_update().clone() {
186+
ReconfiguratorChickenSwitchesLoaderState::NotYetLoaded => {
187+
panic!("unexpected value")
188+
}
189+
ReconfiguratorChickenSwitchesLoaderState::Loaded(view) => view,
190+
};
191+
assert_eq!(view.version, 2);
192+
assert_eq!(view.switches, expected_switches);
193+
}
131194
}
132195
}

0 commit comments

Comments
 (0)