Skip to content

Commit 30b8f86

Browse files
deuszxafckndr-ds
authored
Retry certificate reads for up to a second before erroring. (#4903)
## Motivation In the Testnet deployment we see that the block processor fails to find the certificate in the DB. ## Proposal Scylla is an eventually consistent database, and the processor may try to read the certificate before Scylla has fully committed it. Retry the certificate read for up to a second after the first failure. ## Test Plan CI/manual. ## Release Plan - These changes should be released to the testnet deployment of the exporter, and - If successful, backported to `main`. ## Links - [reviewer checklist](https://github.com/linera-io/linera-protocol/blob/main/CONTRIBUTING.md#reviewer-checklist) --------- Signed-off-by: deuszx <[email protected]> Co-authored-by: Andreas Fackler <[email protected]> Co-authored-by: Andre da Silva <[email protected]>
1 parent 6939851 commit 30b8f86

File tree

1 file changed

+32
-1
lines changed
  • linera-service/src/exporter/runloops/block_processor

1 file changed

+32
-1
lines changed

linera-service/src/exporter/runloops/block_processor/mod.rs

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22
// SPDX-License-Identifier: Apache-2.0
33

44
use std::{
5+
collections::HashMap,
56
future::{Future, IntoFuture},
6-
time::Duration,
7+
time::{Duration, Instant},
78
};
89

10+
use linera_base::crypto::CryptoHash;
911
use linera_execution::committee::Committee;
1012
use linera_service::config::DestinationId;
1113
use linera_storage::Storage;
@@ -27,6 +29,10 @@ where
2729
storage: BlockProcessorStorage<T>,
2830
new_block_queue: NewBlockQueue,
2931
committee_destination_update: bool,
32+
// Temporary solution.
33+
// Tracks certificates that failed to be read from storage, along with
34+
// the retry count and the time of the first failure, to avoid retrying for too long.
35+
retried_certs: HashMap<CryptoHash, (u8, Instant)>,
3036
}
3137

3238
impl<S, T> BlockProcessor<S, T>
@@ -46,6 +52,7 @@ where
4652
exporters_tracker,
4753
committee_destination_update,
4854
new_block_queue,
55+
retried_certs: HashMap::new(),
4956
}
5057
}
5158

@@ -120,6 +127,30 @@ where
120127
self.new_block_queue.push_back(next_block_notification);
121128
},
122129

130+
Err(ExporterError::ReadCertificateError(hash)) => {
131+
match self.retried_certs.remove(&hash) {
132+
// We keep retrying while fewer than 3 retries have been made or less
133+
// than 1 second has elapsed since the first attempt. The assumption
134+
// is that Scylla cannot be inconsistent for too long.
135+
Some((retries, first_attempt)) => {
136+
let elapsed = Instant::now().duration_since(first_attempt);
137+
if retries < 3 || elapsed < Duration::from_secs(1) {
138+
tracing::warn!(?hash, retry=retries+1, "retrying to read certificate");
139+
self.retried_certs.insert(hash, (retries + 1, first_attempt));
140+
self.new_block_queue.push_back(next_block_notification);
141+
} else {
142+
tracing::error!(?hash, "certificate is missing from the database");
143+
return Err(ExporterError::ReadCertificateError(hash));
144+
}
145+
},
146+
None => {
147+
tracing::warn!(?hash, retry=1, "retrying to read certificate");
148+
self.retried_certs.insert(hash, (1, Instant::now()));
149+
self.new_block_queue.push_back(next_block_notification);
150+
}
151+
}
152+
},
153+
123154
Err(e @ (ExporterError::UnprocessedChain
124155
| ExporterError::BadInitialization
125156
| ExporterError::ChainAlreadyExists(_))

0 commit comments

Comments
 (0)