Skip to content

Commit 9726267

Browse files
committed
retry when artifact creation failed
1 parent 76c6930 commit 9726267

File tree

4 files changed

+134
-31
lines changed

4 files changed

+134
-31
lines changed

mithril-aggregator/src/runtime/error.rs

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use std::error::Error as StdError;
1+
use mithril_common::StdError;
22
use thiserror::Error;
33

44
/// Error encountered or produced by the Runtime.
@@ -13,7 +13,7 @@ pub enum RuntimeError {
1313
message: String,
1414

1515
/// Eventual caught error
16-
nested_error: Option<Box<dyn StdError + Sync + Send>>,
16+
nested_error: Option<StdError>,
1717
},
1818
/// A Critical error means the Runtime stops and the software exits with an
1919
/// error code.
@@ -23,30 +23,39 @@ pub enum RuntimeError {
2323
message: String,
2424

2525
/// Eventual caught error
26-
nested_error: Option<Box<dyn StdError + Sync + Send>>,
26+
nested_error: Option<StdError>,
27+
},
28+
/// An error that needs to re-initialize the state machine.
29+
#[error("An error occured: {message}. The state machine will be re-initialized. Nested error: {nested_error:#?}")]
30+
ReInit {
31+
/// error message
32+
message: String,
33+
34+
/// Eventual caught error
35+
nested_error: Option<StdError>,
2736
},
2837
}
2938

3039
impl RuntimeError {
3140
/// Create a new KeepState error
32-
pub fn keep_state(message: &str, error: Option<Box<dyn StdError + Sync + Send>>) -> Self {
41+
pub fn keep_state(message: &str, error: Option<StdError>) -> Self {
3342
Self::KeepState {
3443
message: message.to_string(),
3544
nested_error: error,
3645
}
3746
}
3847

3948
/// Create a new Critical error
40-
pub fn critical(message: &str, error: Option<Box<dyn StdError + Sync + Send>>) -> Self {
49+
pub fn critical(message: &str, error: Option<StdError>) -> Self {
4150
Self::Critical {
4251
message: message.to_string(),
4352
nested_error: error,
4453
}
4554
}
4655
}
4756

48-
impl From<Box<dyn StdError + Sync + Send>> for RuntimeError {
49-
fn from(value: Box<dyn StdError + Sync + Send>) -> Self {
57+
impl From<StdError> for RuntimeError {
58+
fn from(value: StdError) -> Self {
5059
Self::KeepState {
5160
message: "Error caught, state preserved, will retry to cycle.".to_string(),
5261
nested_error: Some(value),

mithril-aggregator/src/runtime/runner.rs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -197,26 +197,32 @@ impl AggregatorRunnerTrait for AggregatorRunner {
197197
"RUNNER: get_current_non_certified_open_message_for_signed_entity_type: existing open message found, not already certified";
198198
"signed_entity_type" => ?signed_entity_type
199199
);
200+
200201
Some(existing_open_message)
201202
}
202203
Some(_) => {
203204
info!(
204205
"RUNNER: get_current_non_certified_open_message_for_signed_entity_type: existing open message found, already certified";
205206
"signed_entity_type" => ?signed_entity_type
206207
);
208+
207209
None
208210
}
209211
None => {
210212
info!(
211213
"RUNNER: get_current_non_certified_open_message_for_signed_entity_type: no open message found, a new one will be created";
212214
"signed_entity_type" => ?signed_entity_type
213215
);
214-
// In order to fix flakiness in the end to end tests, we are compelled to update the beacon (of the multi-signer)
215-
// This avoids the computation of the next AVK on the wrong epoch (in 'compute_protocol_message')
216-
// TODO: remove 'current_beacon' from the multi-signer state which will avoid this fix
216+
// In order to fix flakiness in the end to end tests, we are
217+
// compelled to update the beacon (of the multi-signer) This
218+
// avoids the computation of the next AVK on the wrong epoch (in
219+
// 'compute_protocol_message')
220+
// TODO: remove 'current_beacon' from the multi-signer state
221+
// which will avoid this fix
217222
let chain_beacon: Beacon = self.get_beacon_from_chain().await?;
218223
self.update_beacon(&chain_beacon).await?;
219224
let protocol_message = self.compute_protocol_message(signed_entity_type).await?;
225+
220226
Some(
221227
self.create_open_message(signed_entity_type, &protocol_message)
222228
.await?,

mithril-aggregator/src/runtime/state_machine.rs

Lines changed: 95 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,9 @@ impl Display for AggregatorState {
5151

5252
/// The AggregatorRuntime responsibility is to create a state machine to handle
5353
/// all actions required by the process of getting multi-signatures.
54-
/// See the [documentation](https://mithril.network/doc/mithril/mithril-network/aggregator#under-the-hood) for more explanations about the Aggregator state machine.
54+
/// See the
55+
/// [documentation](https://mithril.network/doc/mithril/mithril-network/aggregator#under-the-hood)
56+
/// for more explanations about the Aggregator state machine.
5557
pub struct AggregatorRuntime {
5658
/// the internal state of the automate
5759
state: AggregatorState,
@@ -104,6 +106,8 @@ impl AggregatorRuntime {
104106

105107
loop {
106108
if let Err(e) = self.cycle().await {
109+
warn!("State machine issued an error: {e}");
110+
107111
match &e {
108112
RuntimeError::Critical {
109113
message: _,
@@ -113,8 +117,32 @@ impl AggregatorRuntime {
113117

114118
return Err(e);
115119
}
116-
_ => {
117-
warn!("state machine: cycle returned an error: '{e}'.");
120+
RuntimeError::KeepState {
121+
message,
122+
nested_error,
123+
} => {
124+
warn!(
125+
"KeepState Error: {message}. Nested error: «{}».",
126+
nested_error
127+
.as_ref()
128+
.map(|e| format!("{e}"))
129+
.unwrap_or("None".into())
130+
);
131+
}
132+
RuntimeError::ReInit {
133+
message,
134+
nested_error,
135+
} => {
136+
warn!(
137+
"ReInit Error: {message}. Nested error: «{}».",
138+
nested_error
139+
.as_ref()
140+
.map(|e| format!("{e}"))
141+
.unwrap_or("None".into())
142+
);
143+
self.state = AggregatorState::Idle(IdleState {
144+
current_beacon: None,
145+
});
118146
}
119147
}
120148
}
@@ -280,23 +308,21 @@ impl AggregatorRuntime {
280308
message: "not enough signature yet to create a certificate, waiting…".to_string(),
281309
nested_error: None,
282310
})?;
283-
284-
// Every error raised below this comment must be raised to critical in order to trigger an aggregator restart which will avoid a dead loop in the state machine
285-
// TODO: Implement a graceful retry mechanism for artifact creation and label open messages with `is_artifact_created` (to keep track of certified but without artifacts open messages)
286-
self.runner.drop_pending_certificate().await.map_err(|e| {
287-
RuntimeError::critical(
288-
"transiting SIGNING → READY: failed dropping pending certificate",
289-
Some(e),
290-
)
291-
})?;
311+
self.runner
312+
.drop_pending_certificate()
313+
.await
314+
.map_err(|e| RuntimeError::ReInit {
315+
message: "transiting SIGNING → READY: failed to drop pending certificate"
316+
.to_string(),
317+
nested_error: Some(e),
318+
})?;
292319
self.runner
293320
.create_artifact(&state.open_message.signed_entity_type, &certificate)
294321
.await
295-
.map_err(|e| {
296-
RuntimeError::critical(
297-
"transiting SIGNING → READY: failed creating artifact",
298-
Some(e),
299-
)
322+
.map_err(|e| RuntimeError::ReInit {
323+
message: "transiting SIGNING → READY: failed to create artifact. Retrying…"
324+
.to_string(),
325+
nested_error: Some(e),
300326
})?;
301327

302328
Ok(ReadyState {
@@ -653,11 +679,62 @@ mod tests {
653679
open_message: OpenMessage::dummy(),
654680
};
655681
let mut runtime = init_runtime(Some(AggregatorState::Signing(state)), runner).await;
656-
runtime
682+
let err = runtime
657683
.cycle()
658684
.await
659685
.expect_err("cycle should have returned an error");
660686

687+
match err {
688+
RuntimeError::KeepState {
689+
message: _,
690+
nested_error: _,
691+
} => (),
692+
_ => panic!("KeepState error expected, got {err:?}."),
693+
};
694+
695+
assert_eq!("signing".to_string(), runtime.get_state());
696+
}
697+
698+
#[tokio::test]
699+
async fn signing_artifact_not_created() {
700+
let mut runner = MockAggregatorRunner::new();
701+
runner
702+
.expect_get_beacon_from_chain()
703+
.once()
704+
.returning(|| Ok(fake_data::beacon()));
705+
runner
706+
.expect_get_current_non_certified_open_message_for_signed_entity_type()
707+
.once()
708+
.returning(|_| Ok(Some(OpenMessage::dummy())));
709+
runner
710+
.expect_create_certificate()
711+
.return_once(move |_| Ok(Some(fake_data::certificate("whatever".to_string()))));
712+
runner
713+
.expect_drop_pending_certificate()
714+
.once()
715+
.returning(|| Ok(Some(fake_data::certificate_pending())));
716+
runner
717+
.expect_create_artifact()
718+
.once()
719+
.returning(|_, _| Err("whatever".into()));
720+
let state = SigningState {
721+
current_beacon: fake_data::beacon(),
722+
open_message: OpenMessage::dummy(),
723+
};
724+
let mut runtime = init_runtime(Some(AggregatorState::Signing(state)), runner).await;
725+
let err = runtime
726+
.cycle()
727+
.await
728+
.expect_err("cycle should have returned an error");
729+
730+
match err {
731+
RuntimeError::ReInit {
732+
message: _,
733+
nested_error: _,
734+
} => (),
735+
_ => panic!("ReInit error expected, got {err:?}."),
736+
};
737+
661738
assert_eq!("signing".to_string(), runtime.get_state());
662739
}
663740

mithril-aggregator/src/signed_entity_service.rs

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -120,9 +120,20 @@ impl SignedEntityService for MithrilSignedEntityService {
120120
"certificate_hash" => &certificate.hash
121121
);
122122

123-
let artifact = self
124-
.compute_artifact(signed_entity_type.clone(), certificate)
125-
.await?;
123+
let mut remaining_retries = 3;
124+
let artifact = loop {
125+
remaining_retries -= 1;
126+
127+
match self
128+
.compute_artifact(signed_entity_type.clone(), certificate)
129+
.await
130+
{
131+
Err(error) if remaining_retries == 0 => break Err(error),
132+
Err(_error) => (),
133+
Ok(artifact) => break Ok(artifact),
134+
};
135+
}?;
136+
126137
let signed_entity = SignedEntityRecord {
127138
signed_entity_id: artifact.get_id(),
128139
signed_entity_type,

0 commit comments

Comments
 (0)