Skip to content

Commit 5ddc3e1

Browse files
authored
Add exponential backoff retry for non deterministic errors (#2988)
* deployment: Have just one is_failed function for SubgraphHealth * writable_store: Add method to get health of deployment * instance_manager: Use subgraph health to decide unfailure * instance_manager: Add exponential backoff retry for non-deterministic errors * store: Make health function async
1 parent 1744e46 commit 5ddc3e1

File tree

8 files changed

+161
-55
lines changed

8 files changed

+161
-55
lines changed

core/src/subgraph/instance_manager.rs

Lines changed: 90 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,13 @@ use graph::data::store::EntityVersion;
88
use graph::data::subgraph::{UnifiedMappingApiVersion, MAX_SPEC_VERSION};
99
use graph::prelude::TryStreamExt;
1010
use graph::prelude::{SubgraphInstanceManager as SubgraphInstanceManagerTrait, *};
11-
use graph::util::lfu_cache::LfuCache;
11+
use graph::util::{backoff::ExponentialBackoff, lfu_cache::LfuCache};
1212
use graph::{blockchain::block_stream::BlockStreamMetrics, components::store::WritableStore};
1313
use graph::{blockchain::block_stream::BlockWithTriggers, data::subgraph::SubgraphFeature};
1414
use graph::{
1515
blockchain::NodeCapabilities,
1616
blockchain::TriggersAdapter,
17-
data::subgraph::schema::{SubgraphError, POI_OBJECT},
17+
data::subgraph::schema::{SubgraphError, SubgraphHealth, POI_OBJECT},
1818
};
1919
use graph::{
2020
blockchain::{block_stream::BlockStreamEvent, Blockchain, TriggerFilter as _},
@@ -27,9 +27,11 @@ use graph::{
2727
use lazy_static::lazy_static;
2828
use std::collections::{BTreeSet, HashMap};
2929
use std::sync::{Arc, RwLock};
30-
use std::time::Instant;
30+
use std::time::{Duration, Instant};
3131
use tokio::task;
3232

33+
const MINUTE: Duration = Duration::from_secs(60);
34+
3335
lazy_static! {
3436
/// Size limit of the entity LFU cache, in bytes.
3537
// Multiplied by 1000 because the env var is in KB.
@@ -43,6 +45,14 @@ lazy_static! {
4345
// Used for testing Graph Node itself.
4446
pub static ref DISABLE_FAIL_FAST: bool =
4547
std::env::var("GRAPH_DISABLE_FAIL_FAST").is_ok();
48+
49+
/// Ceiling for the backoff retry of non-deterministic errors, in seconds.
50+
pub static ref SUBGRAPH_ERROR_RETRY_CEIL_SECS: Duration =
51+
std::env::var("GRAPH_SUBGRAPH_ERROR_RETRY_CEIL_SECS")
52+
.unwrap_or((MINUTE * 30).as_secs().to_string())
53+
.parse::<u64>()
54+
.map(Duration::from_secs)
55+
.expect("invalid GRAPH_SUBGRAPH_ERROR_RETRY_CEIL_SECS");
4656
}
4757

4858
type SharedInstanceKeepAliveMap = Arc<RwLock<HashMap<DeploymentId, CancelGuard>>>;
@@ -462,6 +472,10 @@ where
462472
let mut should_try_unfail_deterministic = true;
463473
let mut should_try_unfail_non_deterministic = true;
464474

475+
// Exponential backoff that starts with two minutes and keeps
476+
// increasing its timeout exponentially until it reaches the ceiling.
477+
let mut backoff = ExponentialBackoff::new(MINUTE * 2, *SUBGRAPH_ERROR_RETRY_CEIL_SECS);
478+
465479
loop {
466480
debug!(logger, "Starting or restarting subgraph");
467481

@@ -634,18 +648,29 @@ where
634648

635649
match res {
636650
Ok(needs_restart) => {
637-
// Runs only once
651+
// Keep trying to unfail subgraph every time it advances block(s) until its
652+
// health is not Failed anymore.
638653
if should_try_unfail_non_deterministic {
639-
should_try_unfail_non_deterministic = false;
640-
641654
// If the deployment head advanced, we can unfail
642655
// the non-deterministic error (if there's any).
643656
ctx.inputs
644657
.store
645658
.unfail_non_deterministic_error(&block_ptr)?;
646-
}
647659

648-
deployment_failed.set(0.0);
660+
match ctx.inputs.store.health(&ctx.inputs.deployment.hash).await? {
661+
SubgraphHealth::Failed => {
662+
// If the unfail call didn't change the subgraph health, we keep
663+
// `should_try_unfail_non_deterministic` as `true` until it's
664+
// actually unfailed.
665+
}
666+
SubgraphHealth::Healthy | SubgraphHealth::Unhealthy => {
667+
// Stop trying to unfail.
668+
should_try_unfail_non_deterministic = false;
669+
deployment_failed.set(0.0);
670+
backoff.reset();
671+
}
672+
};
673+
}
649674

650675
// Notify the BlockStream implementation that a block was successfully consumed
651676
// and that its internal cursoring mechanism can be saved to memory.
@@ -674,24 +699,74 @@ where
674699

675700
// Handle unexpected stream errors by marking the subgraph as failed.
676701
Err(e) => {
702+
deployment_failed.set(1.0);
703+
677704
let message = format!("{:#}", e).replace("\n", "\t");
678705
let err = anyhow!("{}, code: {}", message, LogCode::SubgraphSyncingFailure);
706+
let deterministic = e.is_deterministic();
679707

680708
let error = SubgraphError {
681709
subgraph_id: id_for_err.clone(),
682710
message,
683711
block_ptr: Some(block_ptr),
684712
handler: None,
685-
deterministic: e.is_deterministic(),
713+
deterministic,
686714
};
687-
deployment_failed.set(1.0);
688715

689-
store_for_err
690-
.fail_subgraph(error)
691-
.await
692-
.context("Failed to set subgraph status to `failed`")?;
716+
match deterministic {
717+
true => {
718+
// Fail subgraph:
719+
// - Change status/health.
720+
// - Save the error to the database.
721+
store_for_err
722+
.fail_subgraph(error)
723+
.await
724+
.context("Failed to set subgraph status to `failed`")?;
725+
726+
return Err(err);
727+
}
728+
false => {
729+
// Shouldn't fail subgraph if it's already failed for non-deterministic
730+
// reasons.
731+
//
732+
// If we don't do this check we would keep adding the same error to the
733+
// database.
734+
let should_fail_subgraph =
735+
ctx.inputs.store.health(&ctx.inputs.deployment.hash).await?
736+
!= SubgraphHealth::Failed;
737+
738+
if should_fail_subgraph {
739+
// Fail subgraph:
740+
// - Change status/health.
741+
// - Save the error to the database.
742+
store_for_err
743+
.fail_subgraph(error)
744+
.await
745+
.context("Failed to set subgraph status to `failed`")?;
746+
}
747+
748+
// Retry logic below:
749+
750+
// Cancel the stream for real.
751+
ctx.state
752+
.instances
753+
.write()
754+
.unwrap()
755+
.remove(&ctx.inputs.deployment.id);
693756

694-
return Err(err);
757+
error!(logger, "Subgraph failed for non-deterministic error: {}", e;
758+
"attempt" => backoff.attempt,
759+
"retry_delay_s" => backoff.delay().as_secs());
760+
761+
// Sleep before restarting.
762+
backoff.sleep_async().await;
763+
764+
should_try_unfail_non_deterministic = true;
765+
766+
// And restart the subgraph.
767+
break;
768+
}
769+
}
695770
}
696771
}
697772
}

graph/src/components/store.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1074,6 +1074,8 @@ pub trait WritableStore: Send + Sync + 'static {
10741074
/// Report the name of the shard in which the subgraph is stored. This
10751075
/// should only be used for reporting and monitoring
10761076
fn shard(&self) -> &str;
1077+
1078+
async fn health(&self, id: &DeploymentHash) -> Result<SubgraphHealth, StoreError>;
10771079
}
10781080

10791081
#[async_trait]
@@ -1264,6 +1266,10 @@ impl WritableStore for MockStore {
12641266
fn shard(&self) -> &str {
12651267
unimplemented!()
12661268
}
1269+
1270+
async fn health(&self, _: &DeploymentHash) -> Result<SubgraphHealth, StoreError> {
1271+
unimplemented!()
1272+
}
12671273
}
12681274

12691275
pub trait BlockStore: Send + Sync + 'static {

graph/src/data/subgraph/schema.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,8 @@ impl SubgraphHealth {
4444

4545
pub fn is_failed(&self) -> bool {
4646
match self {
47-
SubgraphHealth::Healthy => false,
48-
SubgraphHealth::Unhealthy => false,
4947
SubgraphHealth::Failed => true,
48+
SubgraphHealth::Healthy | SubgraphHealth::Unhealthy => false,
5049
}
5150
}
5251
}

graph/src/util/backoff.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,4 +45,8 @@ impl ExponentialBackoff {
4545
self.attempt += 1;
4646
delay
4747
}
48+
49+
pub fn reset(&mut self) {
50+
self.attempt = 0;
51+
}
4852
}

store/postgres/src/deployment.rs

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,9 @@ pub enum SubgraphHealth {
3636

3737
impl SubgraphHealth {
3838
fn is_failed(&self) -> bool {
39-
match self {
40-
Self::Failed => true,
41-
Self::Healthy | Self::Unhealthy => false,
42-
}
39+
use graph::data::subgraph::schema::SubgraphHealth as H;
40+
41+
H::from(*self).is_failed()
4342
}
4443
}
4544

@@ -627,6 +626,19 @@ fn check_health(
627626
.map_err(|e| e.into())
628627
}
629628

629+
pub(crate) fn health(
630+
conn: &PgConnection,
631+
id: &DeploymentHash,
632+
) -> Result<SubgraphHealth, StoreError> {
633+
use subgraph_deployment as d;
634+
635+
d::table
636+
.filter(d::deployment.eq(id.as_str()))
637+
.select(d::health)
638+
.get_result(conn)
639+
.map_err(|e| e.into())
640+
}
641+
630642
/// Reverts the errors and updates the subgraph health if necessary.
631643
pub(crate) fn revert_subgraph_errors(
632644
conn: &PgConnection,

store/postgres/src/deployment_store.rs

Lines changed: 32 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1321,48 +1321,43 @@ impl DeploymentStore {
13211321
// Deployment head (current_ptr) advanced more than the error.
13221322
// That means it's healthy, and the non-deterministic error got
13231323
// solved (didn't happen on another try).
1324-
//
1325-
// This should be the scenario where the unfail happens, however
1326-
// for now we unfail in all cases that non-deterministic errors
1327-
// were found and the deployment head advanced.
13281324
(Bound::Included(error_block_number), _)
13291325
if current_ptr.number >= error_block_number =>
13301326
{
1327+
info!(
1328+
self.logger,
1329+
"Unfailing the deployment status";
1330+
"subgraph_id" => deployment_id,
1331+
);
1332+
1333+
// Unfail the deployment.
1334+
deployment::update_deployment_status(
1335+
conn,
1336+
deployment_id,
1337+
deployment::SubgraphHealth::Healthy,
1338+
None,
1339+
)?;
1340+
1341+
// Delete the fatal error.
1342+
deployment::delete_error(conn, &subgraph_error.id)?;
1343+
1344+
Ok(())
13311345
}
1332-
// The deployment head is still before where non-deterministic error happened.
1333-
//
1334-
// Technically we shouldn't unfail the subgraph and delete the error
1335-
// until it's head actually passed the error block range. But for
1336-
// now we'll only log this and keep the old behavior.
1346+
// NOOP, the deployment head is still before where non-deterministic error happened.
13371347
block_range => {
13381348
info!(
13391349
self.logger,
1340-
"Subgraph error is still ahead of deployment head";
1350+
"Subgraph error is still ahead of deployment head, nothing to unfail";
13411351
"subgraph_id" => deployment_id,
13421352
"block_number" => format!("{}", current_ptr.number),
13431353
"block_hash" => format!("{}", current_ptr.hash),
13441354
"error_block_range" => format!("{:?}", block_range),
13451355
"error_block_hash" => subgraph_error.block_hash.as_ref().map(|hash| format!("0x{}", hex::encode(hash))),
13461356
);
1347-
}
1348-
};
13491357

1350-
info!(
1351-
self.logger,
1352-
"Unfailing the deployment status";
1353-
"subgraph_id" => deployment_id,
1354-
);
1355-
1356-
// Unfail the deployment.
1357-
deployment::update_deployment_status(
1358-
conn,
1359-
deployment_id,
1360-
deployment::SubgraphHealth::Healthy,
1361-
None,
1362-
)?;
1363-
1364-
// Delete the fatal error.
1365-
deployment::delete_error(conn, &subgraph_error.id)
1358+
Ok(())
1359+
}
1360+
}
13661361
})
13671362
}
13681363

@@ -1379,4 +1374,13 @@ impl DeploymentStore {
13791374
"shard" => self.pool.shard.as_str())
13801375
});
13811376
}
1377+
1378+
pub(crate) async fn health(
1379+
&self,
1380+
id: &DeploymentHash,
1381+
) -> Result<deployment::SubgraphHealth, StoreError> {
1382+
let id = id.clone();
1383+
self.with_conn(move |conn, _| deployment::health(&conn, &id).map_err(Into::into))
1384+
.await
1385+
}
13821386
}

store/postgres/src/subgraph_store.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ use graph::{
1616
},
1717
constraint_violation,
1818
data::query::QueryTarget,
19-
data::subgraph::schema::SubgraphError,
19+
data::subgraph::schema::{self, SubgraphError},
2020
data::{
2121
store::{EntityVersion, Vid},
2222
subgraph::status,
@@ -1325,6 +1325,13 @@ impl WritableStoreTrait for WritableStore {
13251325
fn shard(&self) -> &str {
13261326
self.site.shard.as_str()
13271327
}
1328+
1329+
async fn health(&self, id: &DeploymentHash) -> Result<schema::SubgraphHealth, StoreError> {
1330+
self.retry_async("health", || async {
1331+
self.writable.health(id).await.map(Into::into)
1332+
})
1333+
.await
1334+
}
13281335
}
13291336

13301337
fn same_subgraph(mods: &Vec<EntityModification>, id: &DeploymentHash) -> bool {

store/postgres/tests/subgraph.rs

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -966,15 +966,14 @@ fn fail_unfail_non_deterministic_error_noop() {
966966
// Fail the subgraph with a non-deterministic error, but with an advanced block.
967967
writable.fail_subgraph(error).await.unwrap();
968968

969-
// Since the block range of the block won't match the deployment head, this would be NOOP,
970-
// but we're skipping the confidence check for now.
969+
// Since the block range of the block won't match the deployment head, this will be NOOP.
971970
writable.unfail_non_deterministic_error(&BLOCKS[1]).unwrap();
972971

973-
// Unfail ocurrs as expected.
974-
assert_eq!(count(), 1);
972+
// State continues the same besides a new error added to the database.
973+
assert_eq!(count(), 2);
975974
let vi = get_version_info(&store, NAME);
976975
assert_eq!(&*NAME, vi.deployment_id.as_str());
977-
assert_eq!(false, vi.failed);
976+
assert_eq!(true, vi.failed);
978977
assert_eq!(Some(1), vi.latest_ethereum_block_number);
979978

980979
test_store::remove_subgraphs();

0 commit comments

Comments
 (0)