Skip to content

Commit 42f4233

Browse files
authored
Reconfigurator: Bump MGS client timeout (#9135)
See #9133 for context; sidecar resets can take longer than 15 seconds, so the default timing us was causing us to get stuck in an infinite loop. Also changes one error log to include the full chain.
1 parent 4d5bdc6 commit 42f4233

File tree

2 files changed

+12
-28
lines changed

2 files changed

+12
-28
lines changed

nexus/mgs-updates/src/driver_update.rs

Lines changed: 9 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -194,34 +194,18 @@ pub enum ApplyUpdateError {
194194
WaitError(#[source] PrecheckError),
195195
}
196196

197-
/// Construct an MGS client.
198-
//
199-
// We split this into a prod version and a test version to work around timeout
200-
// issues in tests. In practice we expect basically all requests to MGS to be
201-
// fulfilled very quickly, even in tests, but our test infrastructure allows us
202-
// to _pause_ an update for extended periods of time. If we start an update then
203-
// happen to pause it as its making an MGS request, leave it paused for more
204-
// than 15 seconds (the default progenitor client timeout), then try to resume
205-
// it, we'll immediately fail with a timeout (even though MGS is perfectly
206-
// responsive!).
207-
#[cfg(not(test))]
208-
fn make_mgs_clients(backends: &AllBackends, log: &slog::Logger) -> MgsClients {
209-
MgsClients::from_clients(backends.iter().map(|(backend_name, backend)| {
210-
gateway_client::Client::new(
211-
&format!("http://{}", backend.address),
212-
log.new(o!(
213-
"mgs_backend_name" => backend_name.0.to_string(),
214-
"mgs_backend_addr" => backend.address.to_string(),
215-
)),
216-
)
217-
}))
218-
}
219-
#[cfg(test)]
197+
/// Construct a set of MGS clients.
220198
fn make_mgs_clients(backends: &AllBackends, log: &slog::Logger) -> MgsClients {
221199
MgsClients::from_clients(backends.iter().map(|(backend_name, backend)| {
200+
// MGS has its own timeouts it applies to communications on our behalf
201+
// to SPs. The longest of these timeouts is currently set to 60 seconds
202+
// (specifically: MGS will wait up to 60 seconds for devices to reset,
203+
// because we've seen sidecars take 20+ seconds to reset). We should
204+
// therefore wait at least as long as its timeouts; we'll add a buffer
205+
// here to leave plenty of time for minor weather between us and MGS.
222206
let client = reqwest::ClientBuilder::new()
223-
.connect_timeout(Duration::from_secs(60))
224-
.timeout(Duration::from_secs(60))
207+
.connect_timeout(Duration::from_secs(75))
208+
.timeout(Duration::from_secs(75))
225209
.build()
226210
.unwrap();
227211

nexus/mgs-updates/src/mgs_clients.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ use gateway_client::HostPhase1HashError;
1010
use gateway_client::types::SpComponentResetError;
1111
use slog::Logger;
1212
use slog::warn;
13+
use slog_error_chain::InlineErrorChain;
1314
use std::collections::VecDeque;
14-
use std::fmt;
1515
use std::sync::Arc;
1616

1717
pub(super) type GatewayClientError =
@@ -24,7 +24,7 @@ pub(super) type GatewaySpComponentResetError =
2424
///
2525
/// For each error type, we need to know whether we should retry the request (by
2626
/// sending it to the next MGS instance).
27-
pub(super) trait RetryableMgsError: fmt::Display {
27+
pub(super) trait RetryableMgsError: std::error::Error {
2828
fn should_try_next_mgs(&self) -> bool;
2929
}
3030

@@ -131,7 +131,7 @@ impl MgsClients {
131131
log, "retryable error with MGS; \
132132
will try next client";
133133
"mgs_addr" => client.baseurl(),
134-
"err" => %err,
134+
InlineErrorChain::new(&err),
135135
);
136136
}
137137
last_err = Some(err);

0 commit comments

Comments
 (0)