Skip to content

Commit 3214fd9

Browse files
committed
Replace linear retry backoff with full jitter exponential backoff (#5440)
The retry logic in gRPC client requests, gRPC subscription reconnection, and cross-chain message forwarding all used linear backoff without jitter (`delay * retry_count`). This creates a [thundering herd](https://en.wikipedia.org/wiki/Thundering_herd_problem) risk: when a validator goes down and comes back up, all clients that were retrying wake up at nearly the same time and hit the recovering validator simultaneously, potentially bringing it down again. This happens because linear backoff is deterministic — every client on the same retry count sleeps the exact same duration, so their retries synchronize into bursts. Replace all three retry sites with [Full Jitter exponential backoff](https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/), the industry-standard approach recommended by [AWS](https://docs.aws.amazon.com/sdkref/latest/guide/feature-retry-behavior.html), [Google Cloud](https://cloud.google.com/storage/docs/retry-rategy), and the [gRPC spec](https://github.com/grpc/grpc/blob/master/doc/connection-backoff.md). The formula is: `sleep = random(0, min(cap, base * 2^attempt))`. - Exponential growth spaces retries further apart over time - Full randomization (jitter across the entire range, not just an additive offset) desynchronizes clients so they spread their retries evenly instead of clustering - A fixed 30s cap prevents excessive wait times (Google Cloud uses 30s, AWS uses 20s) - CI - These changes should be backported to the latest `testnet` branch, then - be released in a validator hotfix.
1 parent 7670c9b commit 3214fd9

File tree

16 files changed

+117
-12
lines changed

16 files changed

+117
-12
lines changed

CLI.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,9 @@ Client implementation and command-line tool for the Linera blockchain
151151
* `--max-retries <MAX_RETRIES>` — Number of times to retry connecting to a validator
152152

153153
Default value: `10`
154+
* `--max-backoff-ms <MAX_BACKOFF>` — Maximum backoff delay for retrying to connect to a validator
155+
156+
Default value: `30000`
154157
* `--wait-for-outgoing-messages` — Whether to wait until a quorum of validators has confirmed that all sent cross-chain messages have been delivered
155158
* `--allow-fast-blocks` — Whether to allow creating blocks in the fast round. Fast blocks have lower latency but must be used carefully so that there are never any conflicting fast block proposals
156159
* `--long-lived-services` — (EXPERIMENTAL) Whether application services can persist in some cases between queries
@@ -1203,6 +1206,9 @@ Start a Local Linera Network
12031206
* `--cross-chain-retry-delay-ms <RETRY_DELAY_MS>` — Delay before retrying of cross-chain message
12041207

12051208
Default value: `2000`
1209+
* `--cross-chain-max-backoff-ms <MAX_BACKOFF_MS>` — Maximum backoff delay for cross-chain message retries
1210+
1211+
Default value: `30000`
12061212
* `--cross-chain-sender-delay-ms <SENDER_DELAY_MS>` — Introduce a delay before sending every cross-chain message (e.g. for testing purpose)
12071213

12081214
Default value: `0`

linera-client/src/client_context.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,7 @@ pub struct ClientContext<Env: Environment> {
209209
pub recv_timeout: Duration,
210210
pub retry_delay: Duration,
211211
pub max_retries: u32,
212+
pub max_backoff: Duration,
212213
pub chain_listeners: JoinSet,
213214
// TODO(#5082): move this into the upstream UI layers (maybe just the CLI)
214215
pub default_chain: Option<ChainId>,
@@ -282,6 +283,7 @@ where
282283
recv_timeout: options.recv_timeout,
283284
retry_delay: options.retry_delay,
284285
max_retries: options.max_retries,
286+
max_backoff: options.max_backoff,
285287
});
286288
let chain_modes: Vec<_> = wallet
287289
.items()
@@ -331,6 +333,7 @@ where
331333
recv_timeout: options.recv_timeout,
332334
retry_delay: options.retry_delay,
333335
max_retries: options.max_retries,
336+
max_backoff: options.max_backoff,
334337
chain_listeners: JoinSet::default(),
335338
#[cfg(not(web))]
336339
client_metrics,
@@ -386,6 +389,7 @@ impl<Env: Environment> ClientContext<Env> {
386389
recv_timeout: self.recv_timeout,
387390
retry_delay: self.retry_delay,
388391
max_retries: self.max_retries,
392+
max_backoff: self.max_backoff,
389393
}
390394
}
391395

linera-client/src/client_options.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,14 @@ pub struct Options {
9797
#[arg(long, default_value = "10")]
9898
pub max_retries: u32,
9999

100+
/// Maximum backoff delay for retrying to connect to a validator.
101+
#[arg(
102+
long = "max-backoff-ms",
103+
default_value = "30000",
104+
value_parser = util::parse_millis
105+
)]
106+
pub max_backoff: Duration,
107+
100108
/// Whether to wait until a quorum of validators has confirmed that all sent cross-chain
101109
/// messages have been delivered.
102110
#[arg(long)]

linera-rpc/src/config.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ pub struct CrossChainConfig {
2525
#[arg(long = "cross-chain-retry-delay-ms", default_value = "2000")]
2626
pub(crate) retry_delay_ms: u64,
2727

28+
/// Maximum backoff delay for cross-chain message retries.
29+
#[arg(long = "cross-chain-max-backoff-ms", default_value = "30000")]
30+
pub(crate) max_backoff_ms: u64,
31+
2832
/// Introduce a delay before sending every cross-chain message (e.g. for testing purpose).
2933
#[arg(long = "cross-chain-sender-delay-ms", default_value = "0")]
3034
pub(crate) sender_delay_ms: u64,
@@ -49,6 +53,8 @@ impl CrossChainConfig {
4953
self.max_retries.to_string(),
5054
"--cross-chain-retry-delay-ms".to_string(),
5155
self.retry_delay_ms.to_string(),
56+
"--cross-chain-max-backoff-ms".to_string(),
57+
self.max_backoff_ms.to_string(),
5258
"--cross-chain-sender-delay-ms".to_string(),
5359
self.sender_delay_ms.to_string(),
5460
"--cross-chain-sender-failure-rate".to_string(),

linera-rpc/src/cross_chain_message_queue.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ use linera_core::data_types::CrossChainRequest;
1919
use rand::Rng as _;
2020
use tracing::{trace, warn};
2121

22-
use crate::config::ShardId;
22+
use crate::{config::ShardId, full_jitter_delay};
2323

2424
#[cfg(with_metrics)]
2525
mod metrics {
@@ -51,6 +51,7 @@ pub(crate) async fn forward_cross_chain_queries<F, G>(
5151
nickname: String,
5252
cross_chain_max_retries: u32,
5353
cross_chain_retry_delay: Duration,
54+
cross_chain_max_backoff: Duration,
5455
cross_chain_sender_delay: Duration,
5556
cross_chain_sender_failure_rate: f32,
5657
this_shard: ShardId,
@@ -104,7 +105,12 @@ pub(crate) async fn forward_cross_chain_queries<F, G>(
104105
}
105106

106107
Action::Retry => {
107-
linera_base::time::timer::sleep(cross_chain_retry_delay * state.retries).await;
108+
let delay = full_jitter_delay(
109+
cross_chain_retry_delay,
110+
state.retries,
111+
cross_chain_max_backoff,
112+
);
113+
linera_base::time::timer::sleep(delay).await;
108114
Action::Proceed { id: state.id }
109115
}
110116
},

linera-rpc/src/grpc/client.rs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,8 @@ use super::{
4343
#[cfg(feature = "opentelemetry")]
4444
use crate::propagation::{get_context_with_traffic_type, inject_context};
4545
use crate::{
46-
grpc::api::RawCertificate, HandleConfirmedCertificateRequest, HandleLiteCertRequest,
47-
HandleTimeoutCertificateRequest, HandleValidatedCertificateRequest,
46+
full_jitter_delay, grpc::api::RawCertificate, HandleConfirmedCertificateRequest,
47+
HandleLiteCertRequest, HandleTimeoutCertificateRequest, HandleValidatedCertificateRequest,
4848
};
4949

5050
#[derive(Clone)]
@@ -53,6 +53,7 @@ pub struct GrpcClient {
5353
client: ValidatorNodeClient<transport::Channel>,
5454
retry_delay: Duration,
5555
max_retries: u32,
56+
max_backoff: Duration,
5657
}
5758

5859
impl GrpcClient {
@@ -61,6 +62,7 @@ impl GrpcClient {
6162
channel: transport::Channel,
6263
retry_delay: Duration,
6364
max_retries: u32,
65+
max_backoff: Duration,
6466
) -> Self {
6567
let client = ValidatorNodeClient::new(channel)
6668
.max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE)
@@ -70,6 +72,7 @@ impl GrpcClient {
7072
client,
7173
retry_delay,
7274
max_retries,
75+
max_backoff,
7376
}
7477
}
7578

@@ -137,7 +140,7 @@ impl GrpcClient {
137140
inject_context(&get_context_with_traffic_type(), request.metadata_mut());
138141
match f(self.client.clone(), request).await {
139142
Err(s) if Self::is_retryable(&s) && retry_count < self.max_retries => {
140-
let delay = self.retry_delay.saturating_mul(retry_count);
143+
let delay = full_jitter_delay(self.retry_delay, retry_count, self.max_backoff);
141144
retry_count += 1;
142145
linera_base::time::timer::sleep(delay).await;
143146
continue;
@@ -295,6 +298,7 @@ impl ValidatorNode for GrpcClient {
295298
async fn subscribe(&self, chains: Vec<ChainId>) -> Result<Self::NotificationStream, NodeError> {
296299
let retry_delay = self.retry_delay;
297300
let max_retries = self.max_retries;
301+
let max_backoff = self.max_backoff;
298302
// Use shared atomic counter so unfold can reset it on successful reconnection.
299303
let retry_count = Arc::new(AtomicU32::new(0));
300304
let subscription_request = SubscriptionRequest {
@@ -362,7 +366,7 @@ impl ValidatorNode for GrpcClient {
362366
{
363367
return future::Either::Left(future::ready(false));
364368
}
365-
let delay = retry_delay.saturating_mul(current_retry_count);
369+
let delay = full_jitter_delay(retry_delay, current_retry_count, max_backoff);
366370
retry_count.fetch_add(1, Ordering::Relaxed);
367371
future::Either::Right(async move {
368372
linera_base::time::timer::sleep(delay).await;

linera-rpc/src/grpc/node_provider.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,21 @@ pub struct GrpcNodeProvider {
1818
pool: GrpcConnectionPool,
1919
retry_delay: Duration,
2020
max_retries: u32,
21+
max_backoff: Duration,
2122
}
2223

2324
impl GrpcNodeProvider {
2425
pub fn new(options: NodeOptions) -> Self {
2526
let transport_options = transport::Options::from(&options);
2627
let retry_delay = options.retry_delay;
2728
let max_retries = options.max_retries;
29+
let max_backoff = options.max_backoff;
2830
let pool = GrpcConnectionPool::new(transport_options);
2931
Self {
3032
pool,
3133
retry_delay,
3234
max_retries,
35+
max_backoff,
3336
}
3437
}
3538
}
@@ -56,6 +59,7 @@ impl ValidatorNodeProvider for GrpcNodeProvider {
5659
channel,
5760
self.retry_delay,
5861
self.max_retries,
62+
self.max_backoff,
5963
))
6064
}
6165
}

linera-rpc/src/grpc/server.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -396,6 +396,7 @@ where
396396
internal_network.clone(),
397397
cross_chain_config.max_retries,
398398
Duration::from_millis(cross_chain_config.retry_delay_ms),
399+
Duration::from_millis(cross_chain_config.max_backoff_ms),
399400
Duration::from_millis(cross_chain_config.sender_delay_ms),
400401
cross_chain_config.sender_failure_rate,
401402
shard_id,
@@ -610,6 +611,7 @@ where
610611
network: ValidatorInternalNetworkConfig,
611612
cross_chain_max_retries: u32,
612613
cross_chain_retry_delay: Duration,
614+
cross_chain_max_backoff: Duration,
613615
cross_chain_sender_delay: Duration,
614616
cross_chain_sender_failure_rate: f32,
615617
this_shard: ShardId,
@@ -633,6 +635,7 @@ where
633635
nickname,
634636
cross_chain_max_retries,
635637
cross_chain_retry_delay,
638+
cross_chain_max_backoff,
636639
cross_chain_sender_delay,
637640
cross_chain_sender_failure_rate,
638641
this_shard,

linera-rpc/src/lib.rs

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ pub mod grpc;
2323

2424
pub use client::Client;
2525
pub use message::RpcMessage;
26-
pub use node_provider::{NodeOptions, NodeProvider};
26+
pub use node_provider::{NodeOptions, NodeProvider, DEFAULT_MAX_BACKOFF};
2727

2828
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
2929
#[cfg_attr(with_testing, derive(Eq, PartialEq))]
@@ -57,3 +57,21 @@ pub const FILE_DESCRIPTOR_SET: &[u8] = tonic::include_file_descriptor_set!("file
5757
pub const CERT_PEM: &str = include_str!(concat!(env!("OUT_DIR"), "/self_signed_cert.pem"));
5858
#[cfg(not(target_arch = "wasm32"))]
5959
pub const KEY_PEM: &str = include_str!(concat!(env!("OUT_DIR"), "/private_key.pem"));
60+
61+
/// Computes a Full Jitter delay for exponential backoff.
62+
///
63+
/// Uses the AWS-recommended formula: `sleep = random(0, min(cap, base * 2^attempt))`.
64+
/// Reference: <https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/>
65+
pub(crate) fn full_jitter_delay(
66+
base_delay: std::time::Duration,
67+
attempt: u32,
68+
max_backoff: std::time::Duration,
69+
) -> std::time::Duration {
70+
use rand::Rng as _;
71+
let exponential_delay =
72+
base_delay.saturating_mul(1u32.checked_shl(attempt).unwrap_or(u32::MAX));
73+
let capped_delay = exponential_delay.min(max_backoff);
74+
std::time::Duration::from_millis(
75+
rand::thread_rng().gen_range(0..=capped_delay.as_millis() as u64),
76+
)
77+
}

linera-rpc/src/node_provider.rs

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,30 @@ impl ValidatorNodeProvider for NodeProvider {
4646
}
4747
}
4848

49-
#[derive(Copy, Clone, Default)]
49+
/// Default maximum backoff delay (30 seconds), following Google Cloud's recommendation.
50+
/// References:
51+
/// - <https://cloud.google.com/storage/docs/retry-strategy>
52+
/// - <https://docs.aws.amazon.com/sdkref/latest/guide/feature-retry-behavior.html>
53+
/// - <https://github.com/grpc/grpc/blob/master/doc/connection-backoff.md>
54+
pub const DEFAULT_MAX_BACKOFF: Duration = Duration::from_secs(30);
55+
56+
#[derive(Copy, Clone)]
5057
pub struct NodeOptions {
5158
pub send_timeout: Duration,
5259
pub recv_timeout: Duration,
5360
pub retry_delay: Duration,
5461
pub max_retries: u32,
62+
pub max_backoff: Duration,
63+
}
64+
65+
impl Default for NodeOptions {
66+
fn default() -> Self {
67+
Self {
68+
send_timeout: Duration::ZERO,
69+
recv_timeout: Duration::ZERO,
70+
retry_delay: Duration::ZERO,
71+
max_retries: 0,
72+
max_backoff: DEFAULT_MAX_BACKOFF,
73+
}
74+
}
5575
}

0 commit comments

Comments
 (0)