When first request fails, start subsequent ones in parallel with increasing delay. (#4913)

deuszx · web-flow · commit e7192561b427 · 2025-11-07T18:52:09.000Z
## Motivation We observed that when the first request failed, the failure was broadcast to all waiting peers. This slowed down the process and didn't use alternative peers we expect to have the data. ## Proposal Detect if a request fails and, if so, try all alternative peers before erroring. We spawn a retry operation for every alternative peer with ever-increasing delay, delayed by `150ms` by default. ## Test Plan CI (a test was added for this case). ## Release Plan - These changes should be backported to the latest `testnet` branch, then - be released in a new SDK. ## Links - [reviewer checklist](https://github.com/linera-io/linera-protocol/blob/main/CONTRIBUTING.md#reviewer-checklist)
diff --git a/CLI.md b/CLI.md
@@ -201,6 +201,9 @@ Client implementation and command-line tool for the Linera blockchain
 * `--alpha <ALPHA>` — Smoothing factor for Exponential Moving Averages (0 < alpha < 1). Higher values give more weight to recent observations. Typical values are between 0.01 and 0.5. A value of 0.1 means that 10% of the new observation is considered and 90% of the previous average is retained
 
   Default value: `0.1`
+* `--alternative-peers-retry-delay-ms <ALTERNATIVE_PEERS_RETRY_DELAY_MS>` — Delay in milliseconds between starting requests to different peers. This helps to stagger requests and avoid overwhelming the network
+
+  Default value: `150`
 * `--storage <STORAGE_CONFIG>` — Storage configuration for the blockchain history
 * `--storage-max-concurrent-queries <STORAGE_MAX_CONCURRENT_QUERIES>` — The maximal number of simultaneous queries to the database
 * `--storage-max-stream-queries <STORAGE_MAX_STREAM_QUERIES>` — The maximal number of simultaneous stream queries to the database
diff --git a/linera-client/src/client_options.rs b/linera-client/src/client_options.rs
@@ -230,6 +230,15 @@ pub struct ClientContextOptions {
         env = "LINERA_REQUESTS_SCHEDULER_ALPHA"
     )]
     pub alpha: f64,
+
+    /// Delay in milliseconds between starting requests to different peers.
+    /// This helps to stagger requests and avoid overwhelming the network.
+    #[arg(
+        long,
+        default_value_t = linera_core::client::requests_scheduler::STAGGERED_DELAY_MS,
+        env = "LINERA_REQUESTS_SCHEDULER_ALTERNATIVE_PEERS_RETRY_DELAY_MS"
+    )]
+    pub alternative_peers_retry_delay_ms: u64,
 }
 
 impl ClientContextOptions {
@@ -273,6 +282,7 @@ impl ClientContextOptions {
             cache_max_size: self.cache_max_size,
             max_request_ttl_ms: self.max_request_ttl_ms,
             alpha: self.alpha,
+            retry_delay_ms: self.alternative_peers_retry_delay_ms,
         }
     }
 }
diff --git a/linera-core/src/client/requests_scheduler/in_flight_tracker.rs b/linera-core/src/client/requests_scheduler/in_flight_tracker.rs
@@ -173,6 +173,21 @@ impl<N: Clone> InFlightTracker<N> {
         let peers = entry.alternative_peers.read().await;
         Some(peers.clone())
     }
+
+    /// Removes a specific peer from the alternative peers list of a tracked request.
+    ///
+    /// # Arguments
+    /// - `key`: The request key to look up; nothing happens if it has no entry
+    /// - `peer`: The peer to remove; every alternative equal to it is dropped
+    pub(super) async fn remove_alternative_peer(&self, key: &RequestKey, peer: &N)
+    where
+        N: PartialEq + Eq,
+    {
+        if let Some(entry) = self.entries.read().await.get(key) {
+            let mut alt_peers = entry.alternative_peers.write().await;
+            alt_peers.retain(|p| p != peer);
+        }
+    }
 }
 
 /// Type of in-flight request match found.
diff --git a/linera-core/src/client/requests_scheduler/mod.rs b/linera-core/src/client/requests_scheduler/mod.rs
@@ -21,6 +21,7 @@ pub const CACHE_TTL_MS: u64 = 2000;
 pub const CACHE_MAX_SIZE: usize = 1000;
 pub const MAX_REQUEST_TTL_MS: u64 = 200;
 pub const ALPHA_SMOOTHING_FACTOR: f64 = 0.1;
+pub const STAGGERED_DELAY_MS: u64 = 150;
 
 /// Configuration for the `RequestsScheduler`.
 #[derive(Debug, Clone)]
@@ -35,6 +36,8 @@ pub struct RequestsSchedulerConfig {
     pub max_request_ttl_ms: u64,
     /// Smoothing factor for Exponential Moving Averages (0 < alpha < 1)
     pub alpha: f64,
+    /// Delay in milliseconds between starting requests to different peers.
+    pub retry_delay_ms: u64,
 }
 
 impl Default for RequestsSchedulerConfig {
@@ -45,6 +48,7 @@ impl Default for RequestsSchedulerConfig {
             cache_max_size: CACHE_MAX_SIZE,
             max_request_ttl_ms: MAX_REQUEST_TTL_MS,
             alpha: ALPHA_SMOOTHING_FACTOR,
+            retry_delay_ms: STAGGERED_DELAY_MS,
         }
     }
 }
diff --git a/linera-core/src/client/requests_scheduler/scheduler.rs b/linera-core/src/client/requests_scheduler/scheduler.rs
diff --git a/linera-web/src/lib.rs b/linera-web/src/lib.rs

Original file line number	Diff line number	Diff line change
`@@ -230,6 +230,15 @@ pub struct ClientContextOptions {`
`230`	`230`	`env = "LINERA_REQUESTS_SCHEDULER_ALPHA"`
`231`	`231`	`)]`
`232`	`232`	`pub alpha: f64,`
	`233`	`+`
	`234`	`+ /// Delay in milliseconds between starting requests to different peers.`
	`235`	`+ /// This helps to stagger requests and avoid overwhelming the network.`
	`236`	`+ #[arg(`
	`237`	`+ long,`
	`238`	`+ default_value_t = linera_core::client::requests_scheduler::STAGGERED_DELAY_MS,`
	`239`	`+ env = "LINERA_REQUESTS_SCHEDULER_ALTERNATIVE_PEERS_RETRY_DELAY_MS"`
	`240`	`+ )]`
	`241`	`+ pub alternative_peers_retry_delay_ms: u64,`
`233`	`242`	`}`
`234`	`243`
`235`	`244`	`impl ClientContextOptions {`
`@@ -273,6 +282,7 @@ impl ClientContextOptions {`
`273`	`282`	`cache_max_size: self.cache_max_size,`
`274`	`283`	`max_request_ttl_ms: self.max_request_ttl_ms,`
`275`	`284`	`alpha: self.alpha,`
	`285`	`+ retry_delay_ms: self.alternative_peers_retry_delay_ms,`
`276`	`286`	`}`
`277`	`287`	`}`
`278`	`288`	`}`
Original file line number	Diff line number	Diff line change
`@@ -21,6 +21,7 @@ pub const CACHE_TTL_MS: u64 = 2000;`
`21`	`21`	`pub const CACHE_MAX_SIZE: usize = 1000;`
`22`	`22`	`pub const MAX_REQUEST_TTL_MS: u64 = 200;`
`23`	`23`	`pub const ALPHA_SMOOTHING_FACTOR: f64 = 0.1;`
	`24`	`+pub const STAGGERED_DELAY_MS: u64 = 150;`
`24`	`25`
`25`	`26`	/// Configuration for the `RequestsScheduler`.
`26`	`27`	`#[derive(Debug, Clone)]`
`@@ -35,6 +36,8 @@ pub struct RequestsSchedulerConfig {`
`35`	`36`	`pub max_request_ttl_ms: u64,`
`36`	`37`	`/// Smoothing factor for Exponential Moving Averages (0 < alpha < 1)`
`37`	`38`	`pub alpha: f64,`
	`39`	`+ /// Delay in milliseconds between starting requests to different peers.`
	`40`	`+ pub retry_delay_ms: u64,`
`38`	`41`	`}`
`39`	`42`
`40`	`43`	`impl Default for RequestsSchedulerConfig {`
`@@ -45,6 +48,7 @@ impl Default for RequestsSchedulerConfig {`
`45`	`48`	`cache_max_size: CACHE_MAX_SIZE,`
`46`	`49`	`max_request_ttl_ms: MAX_REQUEST_TTL_MS,`
`47`	`50`	`alpha: ALPHA_SMOOTHING_FACTOR,`
	`51`	`+ retry_delay_ms: STAGGERED_DELAY_MS,`
`48`	`52`	`}`
`49`	`53`	`}`
`50`	`54`	`}`