Skip to content

Commit 8244a6e

Browse files
committed
Replace linear retry backoff with full jitter exponential backoff
1 parent 8805a12 commit 8244a6e

File tree

16 files changed

+119
-12
lines changed

16 files changed

+119
-12
lines changed

CLI.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,9 @@ Client implementation and command-line tool for the Linera blockchain
151151
* `--max-retries <MAX_RETRIES>` — Number of times to retry connecting to a validator
152152

153153
Default value: `10`
154+
* `--max-backoff-ms <MAX_BACKOFF>` — Maximum backoff delay for retrying to connect to a validator
155+
156+
Default value: `30000`
154157
* `--wait-for-outgoing-messages` — Whether to wait until a quorum of validators has confirmed that all sent cross-chain messages have been delivered
155158
* `--allow-fast-blocks` — Whether to allow creating blocks in the fast round. Fast blocks have lower latency but must be used carefully so that there are never any conflicting fast block proposals
156159
* `--long-lived-services` — (EXPERIMENTAL) Whether application services can persist in some cases between queries
@@ -1212,6 +1215,9 @@ Start a Local Linera Network
12121215
* `--cross-chain-retry-delay-ms <RETRY_DELAY_MS>` — Delay before retrying of cross-chain message
12131216

12141217
Default value: `2000`
1218+
* `--cross-chain-max-backoff-ms <MAX_BACKOFF_MS>` — Maximum backoff delay for cross-chain message retries
1219+
1220+
Default value: `30000`
12151221
* `--cross-chain-sender-delay-ms <SENDER_DELAY_MS>` — Introduce a delay before sending every cross-chain message (e.g. for testing purpose)
12161222

12171223
Default value: `0`

linera-client/src/client_context.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,7 @@ pub struct ClientContext<Env: Environment> {
209209
pub recv_timeout: Duration,
210210
pub retry_delay: Duration,
211211
pub max_retries: u32,
212+
pub max_backoff: Duration,
212213
pub chain_listeners: JoinSet,
213214
// TODO(#5082): move this into the upstream UI layers (maybe just the CLI)
214215
pub default_chain: Option<ChainId>,
@@ -284,6 +285,7 @@ where
284285
recv_timeout: options.recv_timeout,
285286
retry_delay: options.retry_delay,
286287
max_retries: options.max_retries,
288+
max_backoff: options.max_backoff,
287289
});
288290
let chain_modes: Vec<_> = wallet
289291
.items()
@@ -342,6 +344,7 @@ where
342344
recv_timeout: options.recv_timeout,
343345
retry_delay: options.retry_delay,
344346
max_retries: options.max_retries,
347+
max_backoff: options.max_backoff,
345348
chain_listeners: JoinSet::default(),
346349
#[cfg(not(web))]
347350
client_metrics,
@@ -397,6 +400,7 @@ impl<Env: Environment> ClientContext<Env> {
397400
recv_timeout: self.recv_timeout,
398401
retry_delay: self.retry_delay,
399402
max_retries: self.max_retries,
403+
max_backoff: self.max_backoff,
400404
}
401405
}
402406

linera-client/src/client_options.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,14 @@ pub struct Options {
9393
#[arg(long, default_value = "10")]
9494
pub max_retries: u32,
9595

96+
/// Maximum backoff delay for retrying to connect to a validator.
97+
#[arg(
98+
long = "max-backoff-ms",
99+
default_value = "30000",
100+
value_parser = util::parse_millis
101+
)]
102+
pub max_backoff: Duration,
103+
96104
/// Whether to wait until a quorum of validators has confirmed that all sent cross-chain
97105
/// messages have been delivered.
98106
#[arg(long)]

linera-rpc/src/config.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ pub struct CrossChainConfig {
2525
#[arg(long = "cross-chain-retry-delay-ms", default_value = "2000")]
2626
pub(crate) retry_delay_ms: u64,
2727

28+
/// Maximum backoff delay for cross-chain message retries.
29+
#[arg(long = "cross-chain-max-backoff-ms", default_value = "30000")]
30+
pub(crate) max_backoff_ms: u64,
31+
2832
/// Introduce a delay before sending every cross-chain message (e.g. for testing purpose).
2933
#[arg(long = "cross-chain-sender-delay-ms", default_value = "0")]
3034
pub(crate) sender_delay_ms: u64,
@@ -49,6 +53,8 @@ impl CrossChainConfig {
4953
self.max_retries.to_string(),
5054
"--cross-chain-retry-delay-ms".to_string(),
5155
self.retry_delay_ms.to_string(),
56+
"--cross-chain-max-backoff-ms".to_string(),
57+
self.max_backoff_ms.to_string(),
5258
"--cross-chain-sender-delay-ms".to_string(),
5359
self.sender_delay_ms.to_string(),
5460
"--cross-chain-sender-failure-rate".to_string(),

linera-rpc/src/cross_chain_message_queue.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ use linera_core::data_types::CrossChainRequest;
1919
use rand::Rng as _;
2020
use tracing::{trace, warn};
2121

22-
use crate::config::ShardId;
22+
use crate::{config::ShardId, full_jitter_delay};
2323

2424
#[cfg(with_metrics)]
2525
mod metrics {
@@ -51,6 +51,7 @@ pub(crate) async fn forward_cross_chain_queries<F, G>(
5151
nickname: String,
5252
cross_chain_max_retries: u32,
5353
cross_chain_retry_delay: Duration,
54+
cross_chain_max_backoff: Duration,
5455
cross_chain_sender_delay: Duration,
5556
cross_chain_sender_failure_rate: f32,
5657
this_shard: ShardId,
@@ -104,7 +105,12 @@ pub(crate) async fn forward_cross_chain_queries<F, G>(
104105
}
105106

106107
Action::Retry => {
107-
linera_base::time::timer::sleep(cross_chain_retry_delay * state.retries).await;
108+
let delay = full_jitter_delay(
109+
cross_chain_retry_delay,
110+
state.retries,
111+
cross_chain_max_backoff,
112+
);
113+
linera_base::time::timer::sleep(delay).await;
108114
Action::Proceed { id: state.id }
109115
}
110116
},

linera-rpc/src/grpc/client.rs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,8 @@ use super::{
4343
#[cfg(feature = "opentelemetry")]
4444
use crate::propagation::{get_context_with_traffic_type, inject_context};
4545
use crate::{
46-
grpc::api::RawCertificate, HandleConfirmedCertificateRequest, HandleLiteCertRequest,
47-
HandleTimeoutCertificateRequest, HandleValidatedCertificateRequest,
46+
full_jitter_delay, grpc::api::RawCertificate, HandleConfirmedCertificateRequest,
47+
HandleLiteCertRequest, HandleTimeoutCertificateRequest, HandleValidatedCertificateRequest,
4848
};
4949

5050
#[derive(Clone)]
@@ -53,6 +53,7 @@ pub struct GrpcClient {
5353
client: ValidatorNodeClient<transport::Channel>,
5454
retry_delay: Duration,
5555
max_retries: u32,
56+
max_backoff: Duration,
5657
}
5758

5859
impl GrpcClient {
@@ -61,6 +62,7 @@ impl GrpcClient {
6162
channel: transport::Channel,
6263
retry_delay: Duration,
6364
max_retries: u32,
65+
max_backoff: Duration,
6466
) -> Self {
6567
let client = ValidatorNodeClient::new(channel)
6668
.max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE)
@@ -70,6 +72,7 @@ impl GrpcClient {
7072
client,
7173
retry_delay,
7274
max_retries,
75+
max_backoff,
7376
}
7477
}
7578

@@ -137,7 +140,7 @@ impl GrpcClient {
137140
inject_context(&get_context_with_traffic_type(), request.metadata_mut());
138141
match f(self.client.clone(), request).await {
139142
Err(s) if Self::is_retryable(&s) && retry_count < self.max_retries => {
140-
let delay = self.retry_delay.saturating_mul(retry_count);
143+
let delay = full_jitter_delay(self.retry_delay, retry_count, self.max_backoff);
141144
retry_count += 1;
142145
linera_base::time::timer::sleep(delay).await;
143146
continue;
@@ -295,6 +298,7 @@ impl ValidatorNode for GrpcClient {
295298
async fn subscribe(&self, chains: Vec<ChainId>) -> Result<Self::NotificationStream, NodeError> {
296299
let retry_delay = self.retry_delay;
297300
let max_retries = self.max_retries;
301+
let max_backoff = self.max_backoff;
298302
// Use shared atomic counter so unfold can reset it on successful reconnection.
299303
let retry_count = Arc::new(AtomicU32::new(0));
300304
let subscription_request = SubscriptionRequest {
@@ -362,7 +366,7 @@ impl ValidatorNode for GrpcClient {
362366
{
363367
return future::Either::Left(future::ready(false));
364368
}
365-
let delay = retry_delay.saturating_mul(current_retry_count);
369+
let delay = full_jitter_delay(retry_delay, current_retry_count, max_backoff);
366370
retry_count.fetch_add(1, Ordering::Relaxed);
367371
future::Either::Right(async move {
368372
linera_base::time::timer::sleep(delay).await;

linera-rpc/src/grpc/node_provider.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,21 @@ pub struct GrpcNodeProvider {
1818
pool: GrpcConnectionPool,
1919
retry_delay: Duration,
2020
max_retries: u32,
21+
max_backoff: Duration,
2122
}
2223

2324
impl GrpcNodeProvider {
2425
pub fn new(options: NodeOptions) -> Self {
2526
let transport_options = transport::Options::from(&options);
2627
let retry_delay = options.retry_delay;
2728
let max_retries = options.max_retries;
29+
let max_backoff = options.max_backoff;
2830
let pool = GrpcConnectionPool::new(transport_options);
2931
Self {
3032
pool,
3133
retry_delay,
3234
max_retries,
35+
max_backoff,
3336
}
3437
}
3538
}
@@ -56,6 +59,7 @@ impl ValidatorNodeProvider for GrpcNodeProvider {
5659
channel,
5760
self.retry_delay,
5861
self.max_retries,
62+
self.max_backoff,
5963
))
6064
}
6165
}

linera-rpc/src/grpc/server.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -396,6 +396,7 @@ where
396396
internal_network.clone(),
397397
cross_chain_config.max_retries,
398398
Duration::from_millis(cross_chain_config.retry_delay_ms),
399+
Duration::from_millis(cross_chain_config.max_backoff_ms),
399400
Duration::from_millis(cross_chain_config.sender_delay_ms),
400401
cross_chain_config.sender_failure_rate,
401402
shard_id,
@@ -609,6 +610,7 @@ where
609610
network: ValidatorInternalNetworkConfig,
610611
cross_chain_max_retries: u32,
611612
cross_chain_retry_delay: Duration,
613+
cross_chain_max_backoff: Duration,
612614
cross_chain_sender_delay: Duration,
613615
cross_chain_sender_failure_rate: f32,
614616
this_shard: ShardId,
@@ -632,6 +634,7 @@ where
632634
nickname,
633635
cross_chain_max_retries,
634636
cross_chain_retry_delay,
637+
cross_chain_max_backoff,
635638
cross_chain_sender_delay,
636639
cross_chain_sender_failure_rate,
637640
this_shard,

linera-rpc/src/lib.rs

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ pub mod propagation;
2424

2525
pub use client::Client;
2626
pub use message::{RpcMessage, ShardInfo};
27-
pub use node_provider::{NodeOptions, NodeProvider};
27+
pub use node_provider::{NodeOptions, NodeProvider, DEFAULT_MAX_BACKOFF};
2828

2929
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
3030
#[cfg_attr(with_testing, derive(Eq, PartialEq))]
@@ -58,3 +58,23 @@ pub const FILE_DESCRIPTOR_SET: &[u8] = tonic::include_file_descriptor_set!("file
5858
pub const CERT_PEM: &str = include_str!(concat!(env!("OUT_DIR"), "/self_signed_cert.pem"));
5959
#[cfg(not(target_arch = "wasm32"))]
6060
pub const KEY_PEM: &str = include_str!(concat!(env!("OUT_DIR"), "/private_key.pem"));
61+
62+
/// Maximum bit-shift exponent for u32: `1u32 << 31` is the largest valid shift before overflow.
63+
const MAX_SHIFT_EXPONENT: u32 = 31;
64+
65+
/// Computes a Full Jitter delay for exponential backoff.
66+
///
67+
/// Uses the AWS-recommended formula: `sleep = random(0, min(cap, base * 2^attempt))`.
68+
/// Reference: <https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/>
69+
pub(crate) fn full_jitter_delay(
70+
base_delay: std::time::Duration,
71+
attempt: u32,
72+
max_backoff: std::time::Duration,
73+
) -> std::time::Duration {
74+
use rand::Rng as _;
75+
let exponential_delay = base_delay.saturating_mul(1u32 << attempt.min(MAX_SHIFT_EXPONENT));
76+
let capped_delay = exponential_delay.min(max_backoff);
77+
std::time::Duration::from_millis(
78+
rand::thread_rng().gen_range(0..=capped_delay.as_millis() as u64),
79+
)
80+
}

linera-rpc/src/node_provider.rs

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,30 @@ impl ValidatorNodeProvider for NodeProvider {
4646
}
4747
}
4848

49-
#[derive(Copy, Clone, Default)]
49+
/// Default maximum backoff delay (30 seconds), following Google Cloud's recommendation.
50+
/// References:
51+
/// - <https://cloud.google.com/storage/docs/retry-strategy>
52+
/// - <https://docs.aws.amazon.com/sdkref/latest/guide/feature-retry-behavior.html>
53+
/// - <https://github.com/grpc/grpc/blob/master/doc/connection-backoff.md>
54+
pub const DEFAULT_MAX_BACKOFF: Duration = Duration::from_secs(30);
55+
56+
#[derive(Copy, Clone)]
5057
pub struct NodeOptions {
5158
pub send_timeout: Duration,
5259
pub recv_timeout: Duration,
5360
pub retry_delay: Duration,
5461
pub max_retries: u32,
62+
pub max_backoff: Duration,
63+
}
64+
65+
impl Default for NodeOptions {
66+
fn default() -> Self {
67+
Self {
68+
send_timeout: Duration::ZERO,
69+
recv_timeout: Duration::ZERO,
70+
retry_delay: Duration::ZERO,
71+
max_retries: 0,
72+
max_backoff: DEFAULT_MAX_BACKOFF,
73+
}
74+
}
5575
}

0 commit comments

Comments
 (0)