Commit ca91632

Bugfix (8206) - Don't fail switch slot resolution when one switch zone is unresponsive (#8914)
This PR addresses two issues that impact users when a switch zone is unavailable:

1. When resolving which switch slot is managed by which switch zone, we would continuously retry whenever there was a communication error. This causes RPWs to stop running and sagas to get stuck whenever a switch zone becomes unavailable but still has entries in DNS.
2. In the `dpd_ensure` node of the `instance_start` saga, we were bailing out if we encountered any errors while notifying dpd of NAT changes. This means that if one of our switch zones is unavailable, we would no longer allow users to start instances, which is probably a bit too strict.

This PR makes the following adjustments to the switch slot resolution behavior:

* Log an error instead of a warning whenever a DNS entry exists for a switch zone service but communication fails.
* Return the Slot -> Address mappings only for the switch zones that we were able to resolve successfully.
* Let the caller decide whether to retry or proceed (see the sketch after this description). After checking all of the current call sites, it appears that the existing logic already accommodates scenarios where switch zone mappings are missing.

This PR makes the following changes to the NAT configuration steps for instance-related sagas:

* Do not bail if we fail to notify a dendrite service of NAT changes. Log a warning and rely on the NAT RPW to catch us back up whenever the switch zone becomes available again. Still return a `Result` from the `notify` function so that callers can decide whether to act on the error.

This PR makes the following change to the behavior of NAT garbage collection:

* Pause GC (exit early) if we detect that both switches cannot be reached.

Before now, due to a configuration issue, the instance start saga tests weren't actually verifying that NAT configurations eventually landed on the dendrite instances. It also turns out that Nexus in the `TestContext` could not actually communicate with dendrite because the port values were hard-coded and our resolution logic relied on comparing the different addresses for MGS (and all MGS instances listen on localhost in testing). This PR introduces the following changes:

* Use an updated version of Dendrite that allows us to provide an MGS `SocketAddr` on startup so it can use a simulated MGS to learn switch slot information.
* Adjust the `TestContext` builder logic to provide MGS information to dendrite.
* Have Nexus ask dendrite for its switch slot information instead of having Nexus attempt to resolve it via MGS.
* Change the test configuration to create a NIC for the instance so we actually exercise the NAT creation logic in the `dpd_ensure` saga node.
* Introduce a test that ensures we still land configurations on the remaining switch even when one of our switches is unavailable.

Related
---
#8206 #6896 #8999

Depends On
---
oxidecomputer/dendrite#117

---------

Co-authored-by: David Pacheco <[email protected]>
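The following is a minimal, hedged sketch of the "return whatever resolved, let the caller decide" behavior described above; the `SwitchLocation` enum, the `resolve_switch_zone` helper, and the `String` error type are illustrative stand-ins rather than the actual omicron implementation.

```rust
use std::collections::HashMap;
use std::net::Ipv6Addr;

#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
enum SwitchLocation {
    Switch0,
    Switch1,
}

/// Hypothetical per-zone lookup; the real code resolves switch zone
/// addresses via internal DNS and learns slot information from dendrite.
async fn resolve_switch_zone(loc: SwitchLocation) -> Result<Ipv6Addr, String> {
    Err(format!("{loc:?} unreachable"))
}

/// Return mappings only for the switch zones that resolved successfully,
/// instead of retrying until both are reachable. Callers inspect the map
/// and decide whether a missing entry is fatal (NAT GC pauses) or
/// tolerable (instance start proceeds and the NAT RPW catches up later).
async fn switch_zone_address_mappings() -> HashMap<SwitchLocation, Ipv6Addr> {
    let mut mappings = HashMap::new();
    for loc in [SwitchLocation::Switch0, SwitchLocation::Switch1] {
        match resolve_switch_zone(loc).await {
            Ok(addr) => {
                mappings.insert(loc, addr);
            }
            // Log the failure and keep going rather than retrying forever.
            Err(e) => eprintln!("failed to resolve switch zone: {e}"),
        }
    }
    mappings
}
```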
1 parent 362c4f7 commit ca91632

16 files changed: +552 additions, -177 deletions

nexus/src/app/background/tasks/nat_cleanup.rs
12 additions, 6 deletions

@@ -6,16 +6,16 @@
 //! Responsible for cleaning up soft deleted entries once they
 //! have been propagated to running dpd instances.
 
-use crate::app::switch_zone_address_mappings;
+use crate::app::dpd_clients;
 
-use super::networking::build_dpd_clients;
 use crate::app::background::BackgroundTask;
 use chrono::{Duration, Utc};
 use futures::FutureExt;
 use futures::future::BoxFuture;
 use internal_dns_resolver::Resolver;
 use nexus_db_queries::context::OpContext;
 use nexus_db_queries::db::DataStore;
+use omicron_common::api::internal::shared::SwitchLocation;
 use serde_json::json;
 use std::sync::Arc;
 

@@ -64,8 +64,8 @@ impl BackgroundTask for Ipv4NatGarbageCollector {
             }
         };
 
-        let mappings = match
-            switch_zone_address_mappings(&self.resolver, log).await
+        let dpd_clients = match
+            dpd_clients(&self.resolver, log).await
         {
             Ok(mappings) => mappings,
             Err(e) => {

@@ -83,9 +83,15 @@ impl BackgroundTask for Ipv4NatGarbageCollector {
             }
         };
 
-        let dpd_clients = build_dpd_clients(&mappings, log);
+        for location in [SwitchLocation::Switch0, SwitchLocation::Switch1] {
+            if !dpd_clients.contains_key(&location) {
+                let message = format!("dendrite for {location} is unavailable, cannot perform nat cleanup");
+                error!(log, "{message}");
+                return json!({"error": message});
+            }
+        }
 
-        for (_location, client) in dpd_clients {
+        for client in dpd_clients.values() {
             let response = client.ipv4_nat_generation().await;
             match response {
                 Ok(gen) => min_gen = std::cmp::min(min_gen, *gen),

nexus/src/app/background/tasks/networking.rs
0 additions, 27 deletions

@@ -8,7 +8,6 @@ use dpd_client::types::{
 };
 use nexus_db_model::{SwitchLinkFec, SwitchLinkSpeed};
 use nexus_db_queries::db;
-use omicron_common::address::DENDRITE_PORT;
 use omicron_common::{address::MGD_PORT, api::external::SwitchLocation};
 use std::{collections::HashMap, net::SocketAddrV6};
 

@@ -30,32 +29,6 @@ pub(crate) fn build_mgd_clients(
     clients.into_iter().collect::<HashMap<_, _>>()
 }
 
-pub(crate) fn build_dpd_clients(
-    mappings: &HashMap<SwitchLocation, std::net::Ipv6Addr>,
-    log: &slog::Logger,
-) -> HashMap<SwitchLocation, dpd_client::Client> {
-    let dpd_clients: HashMap<SwitchLocation, dpd_client::Client> = mappings
-        .iter()
-        .map(|(location, addr)| {
-            let port = DENDRITE_PORT;
-
-            let client_state = dpd_client::ClientState {
-                tag: String::from("nexus"),
-                log: log.new(o!(
-                    "component" => "DpdClient"
-                )),
-            };
-
-            let dpd_client = dpd_client::Client::new(
-                &format!("http://[{addr}]:{port}"),
-                client_state,
-            );
-            (*location, dpd_client)
-        })
-        .collect();
-    dpd_clients
-}
-
 pub(crate) fn api_to_dpd_port_settings(
     settings: &SwitchPortSettingsCombinedResult,
 ) -> Result<PortSettings, String> {

nexus/src/app/background/tasks/sync_service_zone_nat.rs
2 additions, 7 deletions

@@ -5,9 +5,8 @@
 //! Background task for detecting changes to service zone locations and
 //! updating the NAT rpw table accordingly
 
-use crate::app::switch_zone_address_mappings;
+use crate::app::dpd_clients;
 
-use super::networking::build_dpd_clients;
 use crate::app::background::BackgroundTask;
 use anyhow::Context;
 use futures::FutureExt;

@@ -261,9 +260,7 @@ impl BackgroundTask for ServiceZoneNatTracker {
         // notify dpd if we've added any new records
         if result > 0 {
 
-            let mappings = match
-                switch_zone_address_mappings(&self.resolver, log).await
-            {
+            let dpd_clients = match dpd_clients(&self.resolver, log).await {
                 Ok(mappings) => mappings,
                 Err(e) => {
                     error!(log, "failed to resolve addresses for Dendrite services"; "error" => %e);

@@ -277,8 +274,6 @@ impl BackgroundTask for ServiceZoneNatTracker {
                 },
             };
 
-            let dpd_clients = build_dpd_clients(&mappings, log);
-
             for (_location, client) in dpd_clients {
                 if let Err(e) = client.ipv4_nat_trigger_update().await {
                     error!(

nexus/src/app/background/tasks/sync_switch_configuration.rs
20 additions, 10 deletions

@@ -7,9 +7,9 @@
 
 use crate::app::{
     background::tasks::networking::{
-        api_to_dpd_port_settings, build_dpd_clients, build_mgd_clients,
+        api_to_dpd_port_settings, build_mgd_clients,
     },
-    switch_zone_address_mappings,
+    dpd_clients, switch_zone_address_mappings,
 };
 use oxnet::Ipv4Net;
 use slog::o;

@@ -328,19 +328,29 @@ impl BackgroundTask for SwitchPortSettingsManager {
                 Err(e) => {
                     error!(
                         log,
-                        "failed to resolve addresses for Dendrite services";
+                        "failed to resolve addresses for switch services";
                         "error" => %e);
                     continue;
                 },
             };
 
             // TODO https://github.com/oxidecomputer/omicron/issues/5201
-            // build sled agent clients
-            let sled_agent_clients = build_sled_agent_clients(&mappings, &log);
+            // build sled agent clients for sleds that are connected to the switches
+            let scrimlet_sled_agent_clients = build_sled_agent_clients(&mappings, &log);
 
             // TODO https://github.com/oxidecomputer/omicron/issues/5201
-            // build dpd clients
-            let dpd_clients = match
+            let dpd_clients = match
+                dpd_clients(&self.resolver, &log).await
+            {
+                Ok(mappings) => mappings,
+                Err(e) => {
+                    error!(
+                        log,
+                        "failed to resolve addresses for Dendrite";
+                        "error" => %e);
+                    continue;
+                },
+            };
 
             // TODO https://github.com/oxidecomputer/omicron/issues/5201
             // build mgd clients

@@ -455,7 +465,7 @@ impl BackgroundTask for SwitchPortSettingsManager {
             // yeet the messages
             for (location, config) in &uplinks {
                 let client: &sled_agent_client::Client =
-                    match sled_agent_clients.get(location) {
+                    match scrimlet_sled_agent_clients.get(location) {
                         Some(client) => client,
                         None => {
                             error!(log, "sled-agent client is missing, cannot send updates"; "location" => %location);

@@ -844,7 +854,7 @@ impl BackgroundTask for SwitchPortSettingsManager {
 
             // Since we update the first scrimlet we can reach (we failover to the second one
             // if updating the first one fails) we need to check them both.
-            for (_location, client) in &sled_agent_clients {
+            for (_location, client) in &scrimlet_sled_agent_clients {
                 let scrimlet_cfg = match client.read_network_bootstore_config_cache().await {
                     Ok(config) => config,
                     Err(e) => {

@@ -1286,7 +1296,7 @@ impl BackgroundTask for SwitchPortSettingsManager {
             // push the updates to both scrimlets
             // if both scrimlets are down, bootstore updates aren't happening anyway
             let mut one_succeeded = false;
-            for (location, client) in &sled_agent_clients {
+            for (location, client) in &scrimlet_sled_agent_clients {
                 if let Err(e) = client.write_network_bootstore_config(&desired_config).await {
                     error!(
                         log,

nexus/src/app/instance_network.rs
22 additions, 25 deletions

@@ -408,15 +408,26 @@ pub(crate) async fn instance_ensure_dpd_config(
         return Err(e);
     }
 
-    notify_dendrite_nat_state(
+    // We should not bail out if there is an error while notifying dendrite.
+    // If there is an error communicating with one dendrite instance but the
+    // other is operational and we bail here, it will prevent users from starting
+    // an instance. Dendrite should still catch back up via a RPW if we fail to
+    // notify it here.
+    if let Err(e) = notify_dendrite_nat_state(
         datastore,
         log,
         resolver,
         opctx_alloc,
         Some(instance_id),
-        true,
     )
-    .await?;
+    .await
+    {
+        warn!(
+            log,
+            "error encountered when notifying dendrite";
+            "error" => %e
+        )
+    };
 
     Ok(nat_entries)
 }

@@ -559,7 +570,6 @@ pub(crate) async fn instance_delete_dpd_config(
         resolver,
         opctx_alloc,
         Some(instance_id),
-        false,
     )
     .await
 }

@@ -684,15 +694,7 @@ pub(crate) async fn delete_dpd_config_by_entry(
         },
     }
 
-    notify_dendrite_nat_state(
-        datastore,
-        log,
-        resolver,
-        opctx_alloc,
-        None,
-        false,
-    )
-    .await
+    notify_dendrite_nat_state(datastore, log, resolver, opctx_alloc, None).await
 }
 
 /// Soft-delete an individual external IP from the NAT RPW, without

@@ -724,30 +726,29 @@ async fn external_ip_delete_dpd_config_inner(
 /// Informs all available boundary switches that the set of NAT entries
 /// has changed.
 ///
-/// When `fail_fast` is set, this function will return on any error when
-/// acquiring a handle to a DPD client. Otherwise, it will attempt to notify
-/// all clients and then finally return the first error.
+/// It will attempt to notify all dpd daemons and then finally return the first error,
+/// if any errors were encountered.
 async fn notify_dendrite_nat_state(
     datastore: &DataStore,
     log: &slog::Logger,
     resolver: &internal_dns_resolver::Resolver,
    opctx_alloc: &OpContext,
     instance_id: Option<InstanceUuid>,
-    fail_fast: bool,
 ) -> Result<(), Error> {
     // Querying boundary switches also requires fleet access and the use of the
     // instance allocator context.
     let boundary_switches = boundary_switches(datastore, opctx_alloc).await?;
 
+    let clients = super::dpd_clients(resolver, log).await.map_err(|e| {
+        Error::internal_error(&format!("failed to get dpd clients: {e}"))
+    })?;
+
     let mut errors = vec![];
     for switch in &boundary_switches {
         debug!(log, "notifying dendrite of updates";
             "instance_id" => ?instance_id,
             "switch" => switch.to_string());
 
-        let clients = super::dpd_clients(resolver, log).await.map_err(|e| {
-            Error::internal_error(&format!("failed to get dpd clients: {e}"))
-        })?;
         let client_result = clients.get(switch).ok_or_else(|| {
             Error::internal_error(&format!(
                 "unable to find dendrite client for {switch}"

@@ -758,11 +759,7 @@ async fn notify_dendrite_nat_state(
             Ok(client) => client,
             Err(new_error) => {
                 errors.push(new_error);
-                if fail_fast {
-                    break;
-                } else {
-                    continue;
-                }
+                continue;
             }
         };
 
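For context on the hunks above, here is a hedged sketch of the "notify every boundary switch, return the first error at the end" flow that `notify_dendrite_nat_state` now follows; the `DpdClient` stub and `String` errors are simplified stand-ins for the real dpd-client API and Nexus error types.

```rust
use std::collections::HashMap;

#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
enum SwitchLocation {
    Switch0,
    Switch1,
}

/// Simplified stand-in for the dendrite client.
struct DpdClient;

impl DpdClient {
    async fn ipv4_nat_trigger_update(&self) -> Result<(), String> {
        Ok(())
    }
}

/// Attempt to notify every boundary switch; record errors as we go and
/// surface the first one at the end so callers can decide how to react
/// (e.g. the `dpd_ensure` saga node logs a warning and proceeds).
async fn notify_all_switches(
    clients: &HashMap<SwitchLocation, DpdClient>,
    switches: &[SwitchLocation],
) -> Result<(), String> {
    let mut errors = Vec::new();
    for switch in switches {
        // A missing client (unresolved switch zone) is recorded, not fatal.
        let Some(client) = clients.get(switch) else {
            errors.push(format!("no dendrite client for {switch:?}"));
            continue;
        };
        // Keep notifying the remaining switches even if one call fails.
        if let Err(e) = client.ipv4_nat_trigger_update().await {
            errors.push(e);
        }
    }
    match errors.into_iter().next() {
        Some(e) => Err(e),
        None => Ok(()),
    }
}
```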
