@@ -105,15 +105,6 @@ const DISCOVERY_INTERVAL_IDLE: Duration = Duration::from_secs(60);
105
105
// will require an MGS update.
106
106
const TLV_RPC_TOTAL_ITEMS_DOS_LIMIT : u32 = 1024 ;
107
107
108
- // We allow our client to specify the max RPC attempts and the
109
- // per-attempt timeout; however, it's very easy to set a timeout that is
110
- // too low for the "reset the SP" request, especially if the SP being
111
- // reset is a sidecar (which means it won't be able to respond until it
112
- // brings the management network back online). We will override the max
113
- // attempt count for only that message to ensure we give SPs ample time
114
- // to reset.
115
- const SP_RESET_TIME_ALLOWED : Duration = Duration :: from_secs ( 30 ) ;
116
-
117
108
type Result < T , E = CommunicationError > = std:: result:: Result < T , E > ;
118
109
119
110
#[ derive( Debug , Clone , Copy , PartialEq , Eq ) ]
@@ -143,13 +134,60 @@ pub struct SpComponentDetails {
143
134
pub entries : Vec < ComponentDetails > ,
144
135
}
145
136
137
/// Retry and timeout policy applied to requests sent to an SP.
#[derive(Debug, Clone, Copy)]
pub struct SpRetryConfig {
    /// Timeout applied to each individual attempt (applies to all request
    /// types).
    pub per_attempt_timeout: Duration,

    /// Maximum number of attempts for requests that try to reset the SP.
    ///
    /// The overall time allowed for a reset is this count multiplied by
    /// `per_attempt_timeout`. Sidecar resets have been observed to take
    /// nearly 30 seconds
    /// (<https://github.com/oxidecomputer/hubris/issues/1867>), so choose a
    /// value that covers at least that long, plus some headroom.
    pub max_attempts_reset: usize,

    /// Maximum number of attempts for general requests (currently, every
    /// request _other_ than a reset; resets are governed by
    /// `max_attempts_reset`).
    ///
    /// The overall time allowed for such a request is this count multiplied
    /// by `per_attempt_timeout`.
    pub max_attempts_general: usize,
}

impl SpRetryConfig {
    /// Returns the watchdog timeout, in milliseconds, to send along with a
    /// reset-with-watchdog request.
    ///
    /// The result is 1.5x the total time allowed for a reset
    /// (`per_attempt_timeout * max_attempts_reset`), clamped to `u32::MAX`.
    /// Making the watchdog outlast the overall reset timeout means that, if
    /// the reset fails, the watchdog fires only **after** the reset attempt
    /// has already timed out, so a watchdog-triggered reset cannot be
    /// mistaken for a successful one.
    fn reset_watchdog_timeout_ms(&self) -> u32 {
        // usize -> u128 is a lossless widening, so `as` cannot truncate.
        let attempts = self.max_attempts_reset as u128;

        // Total time allowed for a reset, in ms. Saturation is only
        // reachable with absurd inputs (e.g. `Duration::MAX` combined with
        // an enormous attempt count), but it keeps the arithmetic
        // panic-free regardless of configuration.
        let total_ms =
            self.per_attempt_timeout.as_millis().saturating_mul(attempts);

        // Inflate by 50%. If the multiplication saturates at `u128::MAX`,
        // half of that is still far above `u32::MAX`, so the conversion
        // below clamps as intended.
        let padded_ms = total_ms.saturating_mul(3) / 2;

        u32::try_from(padded_ms).unwrap_or(u32::MAX)
    }
}
182
+
146
183
#[ derive( Debug ) ]
147
184
pub struct SingleSp {
148
185
interface : String ,
149
186
cmds_tx : mpsc:: Sender < InnerCommand > ,
150
187
sp_addr_rx : watch:: Receiver < Option < ( SocketAddrV6 , SpPort ) > > ,
151
188
inner_task : JoinHandle < ( ) > ,
152
189
log : Logger ,
190
+ reset_watchdog_timeout_ms : u32 ,
153
191
}
154
192
155
193
impl Drop for SingleSp {
@@ -175,31 +213,18 @@ impl SingleSp {
175
213
/// determined by the previous step). If this bind fails (e.g., because
176
214
/// `config.listen_addr` is invalid), the returned `SingleSp` will return
177
215
/// a "UDP bind failed" error from all methods forever.
178
- ///
179
- /// Note that `max_attempts_per_rpc` may be overridden for certain kinds of
180
- /// requests. Today, the only request that overrides this value is resetting
181
- /// an SP, which (particularly for sidecars) can take much longer than any
182
- /// other request. `SingleSp` will internally use a higher max attempt count
183
- /// for these messages (but will still respect `per_attempt_timeout`).
184
216
pub async fn new (
185
217
shared_socket : & SharedSocket ,
186
218
config : SwitchPortConfig ,
187
- max_attempts_per_rpc : usize ,
188
- per_attempt_timeout : Duration ,
219
+ retry_config : SpRetryConfig ,
189
220
) -> Self {
190
221
let handle = shared_socket
191
222
. single_sp_handler ( & config. interface , config. discovery_addr )
192
223
. await ;
193
224
194
225
let log = handle. log ( ) . clone ( ) ;
195
226
196
- Self :: new_impl (
197
- handle,
198
- config. interface ,
199
- max_attempts_per_rpc,
200
- per_attempt_timeout,
201
- log,
202
- )
227
+ Self :: new_impl ( handle, config. interface , retry_config, log)
203
228
}
204
229
205
230
/// Create a new `SingleSp` instance specifically for testing (i.e.,
@@ -212,8 +237,7 @@ impl SingleSp {
212
237
pub fn new_direct_socket_for_testing (
213
238
socket : UdpSocket ,
214
239
discovery_addr : SocketAddrV6 ,
215
- max_attempts_per_rpc : usize ,
216
- per_attempt_timeout : Duration ,
240
+ retry_config : SpRetryConfig ,
217
241
log : Logger ,
218
242
) -> Self {
219
243
let wrapper =
@@ -222,8 +246,7 @@ impl SingleSp {
222
246
Self :: new_impl (
223
247
wrapper,
224
248
"(direct socket handle)" . to_string ( ) ,
225
- max_attempts_per_rpc,
226
- per_attempt_timeout,
249
+ retry_config,
227
250
log,
228
251
)
229
252
}
@@ -234,8 +257,7 @@ impl SingleSp {
234
257
fn new_impl < T : InnerSocket + Send + ' static > (
235
258
socket : T ,
236
259
interface : String ,
237
- max_attempts_per_rpc : usize ,
238
- per_attempt_timeout : Duration ,
260
+ retry_config : SpRetryConfig ,
239
261
log : Logger ,
240
262
) -> Self {
241
263
// SPs don't support pipelining, so any command we send to
@@ -247,17 +269,25 @@ impl SingleSp {
247
269
let ( cmds_tx, cmds_rx) = mpsc:: channel ( 8 ) ;
248
270
let ( sp_addr_tx, sp_addr_rx) = watch:: channel ( None ) ;
249
271
250
- let inner = Inner :: new (
251
- socket,
252
- sp_addr_tx,
253
- max_attempts_per_rpc,
254
- per_attempt_timeout,
255
- cmds_rx,
256
- ) ;
272
+ // `retry_config` is primarily for `Inner`, but we need to know the
273
+ // reset watchdog timeout so we know how to construct
274
+ // reset-with-watchdog requests to _send_ to inner. Stash that here,
275
+ // then give the rest of the config to Inner.
276
+ let reset_watchdog_timeout_ms =
277
+ retry_config. reset_watchdog_timeout_ms ( ) ;
278
+
279
+ let inner = Inner :: new ( socket, sp_addr_tx, retry_config, cmds_rx) ;
257
280
258
281
let inner_task = tokio:: spawn ( inner. run ( ) ) ;
259
282
260
- Self { interface, cmds_tx, sp_addr_rx, inner_task, log }
283
+ Self {
284
+ interface,
285
+ cmds_tx,
286
+ sp_addr_rx,
287
+ inner_task,
288
+ log,
289
+ reset_watchdog_timeout_ms,
290
+ }
261
291
}
262
292
263
293
fn log ( & self ) -> & Logger {
@@ -841,14 +871,11 @@ impl SingleSp {
841
871
}
842
872
843
873
let reset_command = if use_watchdog {
844
- // We'll set the watchdog timer to slightly longer than
845
- // SP_RESET_TIME_ALLOWED; this means that if things fail, the
846
- // watchdog will reset the SP **after** the MGS timeout expires, so
847
- // we won't have a false-positive success in this function.
848
- let time_ms =
849
- u32:: try_from ( SP_RESET_TIME_ALLOWED . as_millis ( ) ) . unwrap ( ) * 3
850
- / 2 ;
851
- info ! ( self . log, "using watchdog during reset" ) ;
874
+ let time_ms = self . reset_watchdog_timeout_ms ;
875
+ info ! (
876
+ self . log, "using watchdog during reset" ;
877
+ "watchdog_timeout_ms" => time_ms,
878
+ ) ;
852
879
MgsRequest :: ResetComponentTriggerWithWatchdog { component, time_ms }
853
880
} else {
854
881
MgsRequest :: ResetComponentTrigger { component }
@@ -1663,8 +1690,7 @@ impl InnerSocket for InnerSocketWrapper {
1663
1690
struct Inner < T > {
1664
1691
socket_handle : T ,
1665
1692
sp_addr_tx : watch:: Sender < Option < ( SocketAddrV6 , SpPort ) > > ,
1666
- max_attempts_per_rpc : usize ,
1667
- per_attempt_timeout : Duration ,
1693
+ retry_config : SpRetryConfig ,
1668
1694
serial_console_tx : Option < mpsc:: Sender < ( u64 , Vec < u8 > ) > > ,
1669
1695
cmds_rx : mpsc:: Receiver < InnerCommand > ,
1670
1696
message_id : u32 ,
@@ -1679,15 +1705,13 @@ impl<T: InnerSocket> Inner<T> {
1679
1705
fn new (
1680
1706
socket_handle : T ,
1681
1707
sp_addr_tx : watch:: Sender < Option < ( SocketAddrV6 , SpPort ) > > ,
1682
- max_attempts_per_rpc : usize ,
1683
- per_attempt_timeout : Duration ,
1708
+ retry_config : SpRetryConfig ,
1684
1709
cmds_rx : mpsc:: Receiver < InnerCommand > ,
1685
1710
) -> Self {
1686
1711
Self {
1687
1712
socket_handle,
1688
1713
sp_addr_tx,
1689
- max_attempts_per_rpc,
1690
- per_attempt_timeout,
1714
+ retry_config,
1691
1715
serial_console_tx : None ,
1692
1716
cmds_rx,
1693
1717
message_id : 0 ,
@@ -1976,22 +2000,17 @@ impl<T: InnerSocket> Inner<T> {
1976
2000
} ;
1977
2001
let outgoing_buf = & outgoing_buf[ ..n] ;
1978
2002
1979
- // See comment on `SP_RESET_TIME_ALLOWED` above; bump up the retry count
1980
- // if we're trying to trigger an SP reset.
1981
- let calc_reset_attempts = || {
1982
- let time_desired = SP_RESET_TIME_ALLOWED . as_millis ( ) ;
1983
- let per_attempt = self . per_attempt_timeout . as_millis ( ) . max ( 1 ) ;
1984
- ( ( time_desired + per_attempt - 1 ) / per_attempt) as usize
1985
- } ;
1986
2003
let max_attempts = match & request. kind {
1987
2004
MessageKind :: MgsRequest ( MgsRequest :: ResetComponentTrigger {
1988
2005
component,
1989
- } ) if * component == SpComponent :: SP_ITSELF => calc_reset_attempts ( ) ,
2006
+ } ) if * component == SpComponent :: SP_ITSELF => {
2007
+ self . retry_config . max_attempts_reset
2008
+ }
1990
2009
MessageKind :: MgsRequest (
1991
2010
MgsRequest :: ResetTrigger
1992
2011
| MgsRequest :: ResetComponentTriggerWithWatchdog { .. } ,
1993
- ) => calc_reset_attempts ( ) ,
1994
- _ => self . max_attempts_per_rpc ,
2012
+ ) => self . retry_config . max_attempts_reset ,
2013
+ _ => self . retry_config . max_attempts_general ,
1995
2014
} ;
1996
2015
1997
2016
for attempt in 1 ..=max_attempts {
@@ -2010,7 +2029,7 @@ impl<T: InnerSocket> Inner<T> {
2010
2029
}
2011
2030
}
2012
2031
2013
- Err ( CommunicationError :: ExhaustedNumAttempts ( self . max_attempts_per_rpc ) )
2032
+ Err ( CommunicationError :: ExhaustedNumAttempts ( max_attempts ) )
2014
2033
}
2015
2034
2016
2035
async fn rpc_call_one_attempt (
@@ -2037,7 +2056,8 @@ impl<T: InnerSocket> Inner<T> {
2037
2056
// can loop _without_ resending (and therefore without resetting this
2038
2057
// interval) - this allows us to still time out even if we're getting a
2039
2058
// steady stream of out-of-band messages.
2040
- let mut timeout = tokio:: time:: interval ( self . per_attempt_timeout ) ;
2059
+ let mut timeout =
2060
+ tokio:: time:: interval ( self . retry_config . per_attempt_timeout ) ;
2041
2061
2042
2062
loop {
2043
2063
if resend_request {
@@ -2313,8 +2333,11 @@ mod tests {
2313
2333
let mut inner = Inner :: new (
2314
2334
socket,
2315
2335
sp_addr_tx,
2316
- 1 ,
2317
- Duration :: from_millis ( 200 ) ,
2336
+ SpRetryConfig {
2337
+ per_attempt_timeout : Duration :: from_millis ( 200 ) ,
2338
+ max_attempts_reset : 1 ,
2339
+ max_attempts_general : 1 ,
2340
+ } ,
2318
2341
cmds_rx,
2319
2342
) ;
2320
2343
@@ -2365,4 +2388,27 @@ mod tests {
2365
2388
}
2366
2389
}
2367
2390
}
2391
+
2392
/// Checks `SpRetryConfig::reset_watchdog_timeout_ms` for a typical
/// configuration and for an absurdly large one.
///
/// Nothing here awaits, so this is a plain synchronous `#[test]`; no async
/// runtime is needed.
#[test]
fn test_watchdog_timeout_calculation() {
    let retry_config = SpRetryConfig {
        per_attempt_timeout: Duration::from_millis(2000),
        max_attempts_reset: 15,
        max_attempts_general: 1,
    };

    // Total reset is 2 sec * 15 = 30 sec, and that should be inflated by
    // 50% for the watchdog.
    assert_eq!(retry_config.reset_watchdog_timeout_ms(), 45_000);

    // For an absurdly large timeout value, we should get back a u32::MAX
    // and not panic from overflowing arithmetic.
    let retry_config = SpRetryConfig {
        per_attempt_timeout: Duration::MAX,
        max_attempts_reset: 3,
        max_attempts_general: 1,
    };

    assert_eq!(retry_config.reset_watchdog_timeout_ms(), u32::MAX);
}
2368
2414
}
0 commit comments