Skip to content

Commit 9bbac47

Browse files
authored
Make SP reset attempt count (and therefore timeout) configurable (#285)
1 parent ca9f20e commit 9bbac47

File tree

3 files changed

+129
-71
lines changed

3 files changed

+129
-71
lines changed

faux-mgs/src/main.rs

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ use gateway_sp_comms::InMemoryHostPhase2Provider;
3434
use gateway_sp_comms::SharedSocket;
3535
use gateway_sp_comms::SingleSp;
3636
use gateway_sp_comms::SpComponentDetails;
37+
use gateway_sp_comms::SpRetryConfig;
3738
use gateway_sp_comms::SwitchPortConfig;
3839
use gateway_sp_comms::VersionedSpState;
3940
use gateway_sp_comms::MGS_PORT;
@@ -100,10 +101,16 @@ struct Args {
100101
#[clap(long, required = true)]
101102
interface: Vec<String>,
102103

103-
/// Maximum number of attempts to make when sending requests to the SP.
104+
/// Maximum number of attempts to make when sending general (non-reset)
105+
/// requests to the SP.
104106
#[clap(long, default_value = "5")]
105107
max_attempts: usize,
106108

109+
/// Maximum number of attempts to make when sending reset requests to the
110+
/// SP.
111+
#[clap(long, default_value = "30")]
112+
max_attempts_reset: usize,
113+
107114
/// Timeout (in milliseconds) for each attempt.
108115
#[clap(long, default_value = "2000")]
109116
per_attempt_timeout_millis: u64,
@@ -631,8 +638,13 @@ async fn main() -> Result<()> {
631638
let (log, log_guard) =
632639
build_logger(args.log_level, args.logfile.as_deref())?;
633640

634-
let per_attempt_timeout =
635-
Duration::from_millis(args.per_attempt_timeout_millis);
641+
let retry_config = SpRetryConfig {
642+
per_attempt_timeout: Duration::from_millis(
643+
args.per_attempt_timeout_millis,
644+
),
645+
max_attempts_reset: args.max_attempts,
646+
max_attempts_general: args.max_attempts_reset,
647+
};
636648

637649
let listen_port =
638650
args.listen_port.unwrap_or_else(|| args.command.default_listen_port());
@@ -661,8 +673,7 @@ async fn main() -> Result<()> {
661673
discovery_addr: args.discovery_addr,
662674
interface,
663675
},
664-
args.max_attempts,
665-
per_attempt_timeout,
676+
retry_config,
666677
)
667678
.await,
668679
);

gateway-sp-comms/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ pub use single_sp::SingleSp;
4141
pub use single_sp::SpComponentDetails;
4242
pub use single_sp::SpDevice;
4343
pub use single_sp::SpInventory;
44+
pub use single_sp::SpRetryConfig;
4445

4546
const SP_TO_MGS_MULTICAST_ADDR: Ipv6Addr =
4647
Ipv6Addr::new(0xff02, 0, 0, 0, 0, 0, 0x1de, 1);

gateway-sp-comms/src/single_sp.rs

Lines changed: 112 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -105,15 +105,6 @@ const DISCOVERY_INTERVAL_IDLE: Duration = Duration::from_secs(60);
105105
// will require an MGS update.
106106
const TLV_RPC_TOTAL_ITEMS_DOS_LIMIT: u32 = 1024;
107107

108-
// We allow our client to specify the max RPC attempts and the
109-
// per-attempt timeout; however, it's very easy to set a timeout that is
110-
// too low for the "reset the SP" request, especially if the SP being
111-
// reset is a sidecar (which means it won't be able to respond until it
112-
// brings the management network back online). We will override the max
113-
// attempt count for only that message to ensure we give SPs ample time
114-
// to reset.
115-
const SP_RESET_TIME_ALLOWED: Duration = Duration::from_secs(30);
116-
117108
type Result<T, E = CommunicationError> = std::result::Result<T, E>;
118109

119110
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
@@ -143,13 +134,60 @@ pub struct SpComponentDetails {
143134
pub entries: Vec<ComponentDetails>,
144135
}
145136

137+
#[derive(Debug, Clone, Copy)]
138+
pub struct SpRetryConfig {
139+
/// Timeout between retries (applies to all request types).
140+
pub per_attempt_timeout: Duration,
141+
142+
/// Maximum number of retries for requests that attempt to reset the SP.
143+
///
144+
/// The overall timeout for a reset attempt is this count multiplied by
145+
/// `per_attempt_timeout`. We have seen sidecar resets take nearly 30
146+
/// seconds (https://github.com/oxidecomputer/hubris/issues/1867), so this
147+
/// value should be high enough to allow for resets at least that long with
148+
/// some headroom.
149+
pub max_attempts_reset: usize,
150+
151+
/// Maximum number of retries for general requests (currently, all requests
152+
/// _other_ than resets, which are governed my `max_attempts_reset`).
153+
///
154+
/// The overall timeout for requests is this count multiplied by
155+
/// `per_attempt_timeout`.
156+
pub max_attempts_general: usize,
157+
}
158+
159+
impl SpRetryConfig {
160+
fn reset_watchdog_timeout_ms(&self) -> u32 {
161+
// Calculate our total timeout for resets in ms. We'll use
162+
// `saturating_mul`; we're calculating a u128 so should never hit that
163+
// unless we're configured with `Duration::MAX` or something silly.
164+
let reset_timeout_ms = self
165+
.per_attempt_timeout
166+
.as_millis()
167+
.saturating_mul(self.max_attempts_reset as u128);
168+
169+
// We'll set the watchdog timer to 50% longer than the total reset
170+
// timeout; this means that if things fail, the watchdog will reset the
171+
// SP **after** the MGS timeout expires, so we won't have a
172+
// false-positive success in this function.
173+
//
174+
// We use saturating_mul again and then blindly divide by two; if we
175+
// saturated a u128, half that will still result in us returning
176+
// u32::MAX below.
177+
let inflated_reset_timeout_ms = reset_timeout_ms.saturating_mul(3) / 2;
178+
179+
u32::try_from(inflated_reset_timeout_ms).unwrap_or(u32::MAX)
180+
}
181+
}
182+
146183
#[derive(Debug)]
147184
pub struct SingleSp {
148185
interface: String,
149186
cmds_tx: mpsc::Sender<InnerCommand>,
150187
sp_addr_rx: watch::Receiver<Option<(SocketAddrV6, SpPort)>>,
151188
inner_task: JoinHandle<()>,
152189
log: Logger,
190+
reset_watchdog_timeout_ms: u32,
153191
}
154192

155193
impl Drop for SingleSp {
@@ -175,31 +213,18 @@ impl SingleSp {
175213
/// determined by the previous step). If this bind fails (e.g., because
176214
/// `config.listen_addr` is invalid), the returned `SingleSp` will return
177215
/// a "UDP bind failed" error from all methods forever.
178-
///
179-
/// Note that `max_attempts_per_rpc` may be overridden for certain kinds of
180-
/// requests. Today, the only request that overrides this value is resetting
181-
/// an SP, which (particularly for sidecars) can take much longer than any
182-
/// other request. `SingleSp` will internally use a higher max attempt count
183-
/// for these messages (but will still respect `per_attempt_timeout`).
184216
pub async fn new(
185217
shared_socket: &SharedSocket,
186218
config: SwitchPortConfig,
187-
max_attempts_per_rpc: usize,
188-
per_attempt_timeout: Duration,
219+
retry_config: SpRetryConfig,
189220
) -> Self {
190221
let handle = shared_socket
191222
.single_sp_handler(&config.interface, config.discovery_addr)
192223
.await;
193224

194225
let log = handle.log().clone();
195226

196-
Self::new_impl(
197-
handle,
198-
config.interface,
199-
max_attempts_per_rpc,
200-
per_attempt_timeout,
201-
log,
202-
)
227+
Self::new_impl(handle, config.interface, retry_config, log)
203228
}
204229

205230
/// Create a new `SingleSp` instance specifically for testing (i.e.,
@@ -212,8 +237,7 @@ impl SingleSp {
212237
pub fn new_direct_socket_for_testing(
213238
socket: UdpSocket,
214239
discovery_addr: SocketAddrV6,
215-
max_attempts_per_rpc: usize,
216-
per_attempt_timeout: Duration,
240+
retry_config: SpRetryConfig,
217241
log: Logger,
218242
) -> Self {
219243
let wrapper =
@@ -222,8 +246,7 @@ impl SingleSp {
222246
Self::new_impl(
223247
wrapper,
224248
"(direct socket handle)".to_string(),
225-
max_attempts_per_rpc,
226-
per_attempt_timeout,
249+
retry_config,
227250
log,
228251
)
229252
}
@@ -234,8 +257,7 @@ impl SingleSp {
234257
fn new_impl<T: InnerSocket + Send + 'static>(
235258
socket: T,
236259
interface: String,
237-
max_attempts_per_rpc: usize,
238-
per_attempt_timeout: Duration,
260+
retry_config: SpRetryConfig,
239261
log: Logger,
240262
) -> Self {
241263
// SPs don't support pipelining, so any command we send to
@@ -247,17 +269,25 @@ impl SingleSp {
247269
let (cmds_tx, cmds_rx) = mpsc::channel(8);
248270
let (sp_addr_tx, sp_addr_rx) = watch::channel(None);
249271

250-
let inner = Inner::new(
251-
socket,
252-
sp_addr_tx,
253-
max_attempts_per_rpc,
254-
per_attempt_timeout,
255-
cmds_rx,
256-
);
272+
// `retry_config` is primarily for `Inner`, but we need to know the
273+
// reset watchdog timeout so we know how to construct
274+
// reset-with-watchdog requests to _send_ to inner. Stash that here,
275+
// then give the rest of the config to Inner.
276+
let reset_watchdog_timeout_ms =
277+
retry_config.reset_watchdog_timeout_ms();
278+
279+
let inner = Inner::new(socket, sp_addr_tx, retry_config, cmds_rx);
257280

258281
let inner_task = tokio::spawn(inner.run());
259282

260-
Self { interface, cmds_tx, sp_addr_rx, inner_task, log }
283+
Self {
284+
interface,
285+
cmds_tx,
286+
sp_addr_rx,
287+
inner_task,
288+
log,
289+
reset_watchdog_timeout_ms,
290+
}
261291
}
262292

263293
fn log(&self) -> &Logger {
@@ -841,14 +871,11 @@ impl SingleSp {
841871
}
842872

843873
let reset_command = if use_watchdog {
844-
// We'll set the watchdog timer to slightly longer than
845-
// SP_RESET_TIME_ALLOWED; this means that if things fail, the
846-
// watchdog will reset the SP **after** the MGS timeout expires, so
847-
// we won't have a false-positive success in this function.
848-
let time_ms =
849-
u32::try_from(SP_RESET_TIME_ALLOWED.as_millis()).unwrap() * 3
850-
/ 2;
851-
info!(self.log, "using watchdog during reset");
874+
let time_ms = self.reset_watchdog_timeout_ms;
875+
info!(
876+
self.log, "using watchdog during reset";
877+
"watchdog_timeout_ms" => time_ms,
878+
);
852879
MgsRequest::ResetComponentTriggerWithWatchdog { component, time_ms }
853880
} else {
854881
MgsRequest::ResetComponentTrigger { component }
@@ -1663,8 +1690,7 @@ impl InnerSocket for InnerSocketWrapper {
16631690
struct Inner<T> {
16641691
socket_handle: T,
16651692
sp_addr_tx: watch::Sender<Option<(SocketAddrV6, SpPort)>>,
1666-
max_attempts_per_rpc: usize,
1667-
per_attempt_timeout: Duration,
1693+
retry_config: SpRetryConfig,
16681694
serial_console_tx: Option<mpsc::Sender<(u64, Vec<u8>)>>,
16691695
cmds_rx: mpsc::Receiver<InnerCommand>,
16701696
message_id: u32,
@@ -1679,15 +1705,13 @@ impl<T: InnerSocket> Inner<T> {
16791705
fn new(
16801706
socket_handle: T,
16811707
sp_addr_tx: watch::Sender<Option<(SocketAddrV6, SpPort)>>,
1682-
max_attempts_per_rpc: usize,
1683-
per_attempt_timeout: Duration,
1708+
retry_config: SpRetryConfig,
16841709
cmds_rx: mpsc::Receiver<InnerCommand>,
16851710
) -> Self {
16861711
Self {
16871712
socket_handle,
16881713
sp_addr_tx,
1689-
max_attempts_per_rpc,
1690-
per_attempt_timeout,
1714+
retry_config,
16911715
serial_console_tx: None,
16921716
cmds_rx,
16931717
message_id: 0,
@@ -1976,22 +2000,17 @@ impl<T: InnerSocket> Inner<T> {
19762000
};
19772001
let outgoing_buf = &outgoing_buf[..n];
19782002

1979-
// See comment on `SP_RESET_TIME_ALLOWED` above; bump up the retry count
1980-
// if we're trying to trigger an SP reset.
1981-
let calc_reset_attempts = || {
1982-
let time_desired = SP_RESET_TIME_ALLOWED.as_millis();
1983-
let per_attempt = self.per_attempt_timeout.as_millis().max(1);
1984-
((time_desired + per_attempt - 1) / per_attempt) as usize
1985-
};
19862003
let max_attempts = match &request.kind {
19872004
MessageKind::MgsRequest(MgsRequest::ResetComponentTrigger {
19882005
component,
1989-
}) if *component == SpComponent::SP_ITSELF => calc_reset_attempts(),
2006+
}) if *component == SpComponent::SP_ITSELF => {
2007+
self.retry_config.max_attempts_reset
2008+
}
19902009
MessageKind::MgsRequest(
19912010
MgsRequest::ResetTrigger
19922011
| MgsRequest::ResetComponentTriggerWithWatchdog { .. },
1993-
) => calc_reset_attempts(),
1994-
_ => self.max_attempts_per_rpc,
2012+
) => self.retry_config.max_attempts_reset,
2013+
_ => self.retry_config.max_attempts_general,
19952014
};
19962015

19972016
for attempt in 1..=max_attempts {
@@ -2010,7 +2029,7 @@ impl<T: InnerSocket> Inner<T> {
20102029
}
20112030
}
20122031

2013-
Err(CommunicationError::ExhaustedNumAttempts(self.max_attempts_per_rpc))
2032+
Err(CommunicationError::ExhaustedNumAttempts(max_attempts))
20142033
}
20152034

20162035
async fn rpc_call_one_attempt(
@@ -2037,7 +2056,8 @@ impl<T: InnerSocket> Inner<T> {
20372056
// can loop _without_ resending (and therefore without resetting this
20382057
// interval) - this allows us to still time out even if we're getting a
20392058
// steady stream of out-of-band messages.
2040-
let mut timeout = tokio::time::interval(self.per_attempt_timeout);
2059+
let mut timeout =
2060+
tokio::time::interval(self.retry_config.per_attempt_timeout);
20412061

20422062
loop {
20432063
if resend_request {
@@ -2313,8 +2333,11 @@ mod tests {
23132333
let mut inner = Inner::new(
23142334
socket,
23152335
sp_addr_tx,
2316-
1,
2317-
Duration::from_millis(200),
2336+
SpRetryConfig {
2337+
per_attempt_timeout: Duration::from_millis(200),
2338+
max_attempts_reset: 1,
2339+
max_attempts_general: 1,
2340+
},
23182341
cmds_rx,
23192342
);
23202343

@@ -2365,4 +2388,27 @@ mod tests {
23652388
}
23662389
}
23672390
}
2391+
2392+
#[tokio::test]
2393+
async fn test_watchdog_timeout_calculation() {
2394+
let retry_config = SpRetryConfig {
2395+
per_attempt_timeout: Duration::from_millis(2000),
2396+
max_attempts_reset: 15,
2397+
max_attempts_general: 1,
2398+
};
2399+
2400+
// Total reset is 2 sec * 15 = 30 sec, and that should be inflated by
2401+
// 50% for the watchdog.
2402+
assert_eq!(retry_config.reset_watchdog_timeout_ms(), 45_000);
2403+
2404+
// For an absurdly large timeout value, we should get back a u32::MAX
2405+
// and not panic from overflowing arithmetic.
2406+
let retry_config = SpRetryConfig {
2407+
per_attempt_timeout: Duration::MAX,
2408+
max_attempts_reset: 3,
2409+
max_attempts_general: 1,
2410+
};
2411+
2412+
assert_eq!(retry_config.reset_watchdog_timeout_ms(), u32::MAX);
2413+
}
23682414
}

0 commit comments

Comments
 (0)