Skip to content

Commit 1e7732c

Browse files
authored
core: do not allow multiple active runners for a subgraph (#5715)
1 parent 6b48bfd commit 1e7732c

File tree

4 files changed

+82
-12
lines changed

4 files changed

+82
-12
lines changed

core/src/subgraph/context/mod.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,10 @@ impl SubgraphKeepAlive {
5858
self.sg_metrics.running_count.inc();
5959
}
6060
}
61+
62+
pub fn contains(&self, deployment_id: &DeploymentId) -> bool {
63+
self.alive_map.read().unwrap().contains_key(deployment_id)
64+
}
6165
}
6266

6367
// The context keeps track of mutable in-memory state that is retained across blocks.

core/src/subgraph/instance_manager.rs

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
use std::sync::atomic::AtomicU64;
2+
use std::sync::atomic::Ordering;
3+
14
use crate::polling_monitor::{ArweaveService, IpfsService};
25
use crate::subgraph::context::{IndexingContext, SubgraphKeepAlive};
36
use crate::subgraph::inputs::IndexingInputs;
@@ -22,6 +25,7 @@ use tokio::task;
2225

2326
use super::context::OffchainMonitor;
2427
use super::SubgraphTriggerProcessor;
28+
use crate::subgraph::runner::SubgraphRunnerError;
2529

2630
#[derive(Clone)]
2731
pub struct SubgraphInstanceManager<S: SubgraphStore> {
@@ -35,6 +39,18 @@ pub struct SubgraphInstanceManager<S: SubgraphStore> {
3539
arweave_service: ArweaveService,
3640
static_filters: bool,
3741
env_vars: Arc<EnvVars>,
42+
43+
/// By design, there should be only one subgraph runner process per subgraph, but the current
44+
/// implementation does not completely prevent multiple runners from being active at the same
45+
/// time, and we have already had a [bug][0] due to this limitation. Investigating the problem
46+
/// was quite complicated because there was no way to know that the logs were coming from two
47+
/// different processes because all the logs looked the same. Ideally, the implementation
48+
/// should be refactored to make it more strict, but until then, we keep this counter, which
49+
/// is incremented each time a new runner is started, and the previous count is embedded in
50+
/// each log of the started runner, to make debugging future issues easier.
51+
///
52+
/// [0]: https://github.com/graphprotocol/graph-node/issues/5452
53+
subgraph_start_counter: Arc<AtomicU64>,
3854
}
3955

4056
#[async_trait]
@@ -45,7 +61,11 @@ impl<S: SubgraphStore> SubgraphInstanceManagerTrait for SubgraphInstanceManager<
4561
manifest: serde_yaml::Mapping,
4662
stop_block: Option<BlockNumber>,
4763
) {
64+
let runner_index = self.subgraph_start_counter.fetch_add(1, Ordering::SeqCst);
65+
4866
let logger = self.logger_factory.subgraph_logger(&loc);
67+
let logger = logger.new(o!("runner_index" => runner_index));
68+
4969
let err_logger = logger.clone();
5070
let instance_manager = self.cheap_clone();
5171

@@ -185,6 +205,7 @@ impl<S: SubgraphStore> SubgraphInstanceManager<S> {
185205
static_filters,
186206
env_vars,
187207
arweave_service,
208+
subgraph_start_counter: Arc::new(AtomicU64::new(0)),
188209
}
189210
}
190211

@@ -491,13 +512,18 @@ impl<S: SubgraphStore> SubgraphInstanceManager<S> {
491512
// it has a dedicated OS thread so the OS will handle the preemption. See
492513
// https://github.com/tokio-rs/tokio/issues/3493.
493514
graph::spawn_thread(deployment.to_string(), move || {
494-
if let Err(e) = graph::block_on(task::unconstrained(runner.run())) {
495-
error!(
496-
&logger,
497-
"Subgraph instance failed to run: {}",
498-
format!("{:#}", e)
499-
);
515+
match graph::block_on(task::unconstrained(runner.run())) {
516+
Ok(()) => {}
517+
Err(SubgraphRunnerError::Duplicate) => {
518+
// We do not need to unregister metrics because they are unique per subgraph
519+
// and another runner is still active.
520+
return;
521+
}
522+
Err(err) => {
523+
error!(&logger, "Subgraph instance failed to run: {:#}", err);
524+
}
500525
}
526+
501527
subgraph_metrics_unregister.unregister(registry);
502528
});
503529

core/src/subgraph/runner.rs

Lines changed: 44 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,15 @@ where
5252
pub metrics: RunnerMetrics,
5353
}
5454

55+
#[derive(Debug, thiserror::Error)]
56+
pub enum SubgraphRunnerError {
57+
#[error("subgraph runner terminated because a newer one was active")]
58+
Duplicate,
59+
60+
#[error(transparent)]
61+
Unknown(#[from] Error),
62+
}
63+
5564
impl<C, T> SubgraphRunner<C, T>
5665
where
5766
C: Blockchain,
@@ -109,7 +118,7 @@ where
109118

110119
#[cfg(debug_assertions)]
111120
pub async fn run_for_test(self, break_on_restart: bool) -> Result<Self, Error> {
112-
self.run_inner(break_on_restart).await
121+
self.run_inner(break_on_restart).await.map_err(Into::into)
113122
}
114123

115124
fn is_static_filters_enabled(&self) -> bool {
@@ -166,11 +175,11 @@ where
166175
self.build_filter()
167176
}
168177

169-
pub async fn run(self) -> Result<(), Error> {
178+
pub async fn run(self) -> Result<(), SubgraphRunnerError> {
170179
self.run_inner(false).await.map(|_| ())
171180
}
172181

173-
async fn run_inner(mut self, break_on_restart: bool) -> Result<Self, Error> {
182+
async fn run_inner(mut self, break_on_restart: bool) -> Result<Self, SubgraphRunnerError> {
174183
// If a subgraph failed for deterministic reasons, before start indexing, we first
175184
// revert the deployment head. It should lead to the same result since the error was
176185
// deterministic.
@@ -246,15 +255,39 @@ where
246255
// TODO: move cancel handle to the Context
247256
// This will require some code refactor in how the BlockStream is created
248257
let block_start = Instant::now();
249-
match self
258+
259+
let action = self
250260
.handle_stream_event(event, &block_stream_cancel_handle)
251261
.await
252262
.map(|res| {
253263
self.metrics
254264
.subgraph
255265
.observe_block_processed(block_start.elapsed(), res.block_finished());
256266
res
257-
})? {
267+
})?;
268+
269+
// It is possible that the subgraph was unassigned, but the runner was in
270+
// a retry delay state and did not observe the cancel signal.
271+
if block_stream_cancel_handle.is_canceled() {
272+
// It is also possible that the runner was in a retry delay state while
273+
// the subgraph was reassigned and a new runner was started.
274+
if self.ctx.instances.contains(&self.inputs.deployment.id) {
275+
warn!(
276+
self.logger,
277+
"Terminating the subgraph runner because a newer one is active. \
278+
Possible reassignment detected while the runner was in a non-cancellable pending state",
279+
);
280+
return Err(SubgraphRunnerError::Duplicate);
281+
}
282+
283+
warn!(
284+
self.logger,
285+
"Terminating the subgraph runner because subgraph was unassigned",
286+
);
287+
return Ok(self);
288+
}
289+
290+
match action {
258291
Action::Continue => continue,
259292
Action::Stop => {
260293
info!(self.logger, "Stopping subgraph");
@@ -1579,6 +1612,12 @@ where
15791612
}
15801613
}
15811614

1615+
impl From<StoreError> for SubgraphRunnerError {
1616+
fn from(err: StoreError) -> Self {
1617+
Self::Unknown(err.into())
1618+
}
1619+
}
1620+
15821621
/// Transform the proof of indexing changes into entity updates that will be
15831622
/// inserted when as_modifications is called.
15841623
async fn update_proof_of_indexing(

store/postgres/src/writable.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ use graph::prelude::{
1717
SubgraphStore as _, BLOCK_NUMBER_MAX,
1818
};
1919
use graph::schema::{EntityKey, EntityType, InputSchema};
20-
use graph::slog::{info, warn};
20+
use graph::slog::{debug, info, warn};
2121
use graph::tokio::select;
2222
use graph::tokio::sync::Notify;
2323
use graph::tokio::task::JoinHandle;
@@ -936,6 +936,7 @@ impl Queue {
936936
// Graceful shutdown. We also handled the request
937937
// successfully
938938
queue.queue.pop().await;
939+
debug!(logger, "Subgraph writer has processed a stop request");
939940
return;
940941
}
941942
Ok(Err(e)) => {

0 commit comments

Comments (0)