diff --git a/Cargo.lock b/Cargo.lock index 2a244d99504b6..f31bf5ab7d5e6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13460,6 +13460,7 @@ dependencies = [ "sui-data-store", "sui-execution", "sui-faucet", + "sui-futures", "sui-genesis-builder", "sui-indexer-alt", "sui-indexer-alt-consistent-store", @@ -13492,7 +13493,6 @@ dependencies = [ "thiserror 1.0.69", "tikv-jemalloc-ctl", "tokio", - "tokio-util 0.7.13", "toml 0.7.4", "tower 0.5.2", "tower-http", @@ -13921,7 +13921,6 @@ dependencies = [ "sui-indexer-alt-metrics", "telemetry-subscribers", "tokio", - "tokio-util 0.7.13", "tracing", "url", ] @@ -13962,7 +13961,6 @@ dependencies = [ "sui-rpc", "sui-types", "tokio", - "tokio-util 0.7.13", "tracing", "tracing-subscriber", "url", @@ -14849,7 +14847,6 @@ dependencies = [ "telemetry-subscribers", "tempfile", "tokio", - "tokio-util 0.7.13", "toml 0.7.4", "tracing", "url", @@ -14910,7 +14907,6 @@ dependencies = [ "thiserror 1.0.69", "tokio", "tokio-stream", - "tokio-util 0.7.13", "toml 0.7.4", "tonic 0.14.2", "tonic-health", @@ -14944,6 +14940,7 @@ dependencies = [ "shared-crypto", "simulacrum", "sui-field-count", + "sui-futures", "sui-indexer-alt", "sui-indexer-alt-consistent-api", "sui-indexer-alt-consistent-store", @@ -14968,7 +14965,6 @@ dependencies = [ "tempfile", "test-cluster", "tokio", - "tokio-util 0.7.13", "tonic 0.14.2", "tracing", "url", @@ -14983,6 +14979,7 @@ dependencies = [ "axum 0.8.3", "backoff", "bb8", + "bytes", "chrono", "clap", "dashmap", @@ -15013,7 +15010,6 @@ dependencies = [ "thiserror 1.0.69", "tokio", "tokio-stream", - "tokio-util 0.7.13", "tonic 0.14.2", "tracing", "tracing-subscriber", @@ -15068,6 +15064,7 @@ dependencies = [ "shared-crypto", "sui-default-config", "sui-display", + "sui-futures", "sui-indexer-alt-metrics", "sui-indexer-alt-reader", "sui-indexer-alt-schema", @@ -15083,7 +15080,6 @@ dependencies = [ "telemetry-subscribers", "thiserror 1.0.69", "tokio", - "tokio-util 0.7.13", "toml 0.7.4", "tonic 0.14.2", "tower-http", @@ -15122,6 +15118,7 @@ dependencies = [ "serde_with", "sui-default-config", "sui-display", + "sui-futures", "sui-indexer-alt-metrics", "sui-indexer-alt-reader", "sui-indexer-alt-schema", @@ -15139,7 +15136,6 @@ dependencies = [ "telemetry-subscribers", "thiserror 1.0.69", "tokio", - "tokio-util 0.7.13", "toml 0.7.4", "tower 0.4.13", "tower-http", @@ -15157,9 +15153,9 @@ dependencies = [ "clap", "prometheus", "prometheus-closure-metric", + "sui-futures", "sui-pg-db", "tokio", - "tokio-util 0.7.13", "tracing", ] @@ -15194,6 +15190,7 @@ dependencies = [ "prometheus", "prost-types 0.14.1", "serde_json", + "sui-futures", "sui-indexer-alt-consistent-api", "sui-indexer-alt-metrics", "sui-indexer-alt-schema", @@ -15207,7 +15204,6 @@ dependencies = [ "sui-types", "thiserror 1.0.69", "tokio", - "tokio-util 0.7.13", "tonic 0.14.2", "tracing", "url", diff --git a/crates/sui-bridge-indexer-alt/Cargo.toml b/crates/sui-bridge-indexer-alt/Cargo.toml index cdda182338f28..1783d260d15fa 100644 --- a/crates/sui-bridge-indexer-alt/Cargo.toml +++ b/crates/sui-bridge-indexer-alt/Cargo.toml @@ -8,7 +8,6 @@ edition = "2024" [dependencies] tokio.workspace = true -tokio-util.workspace = true anyhow.workspace = true sui-bridge.workspace = true sui-bridge-schema.workspace = true diff --git a/crates/sui-bridge-indexer-alt/src/main.rs b/crates/sui-bridge-indexer-alt/src/main.rs index 3e9196cb5e1c2..02d2c381d0837 100644 --- a/crates/sui-bridge-indexer-alt/src/main.rs +++ b/crates/sui-bridge-indexer-alt/src/main.rs @@ -12,9 +12,9 @@ use 
sui_bridge_indexer_alt::metrics::BridgeIndexerMetrics; use sui_bridge_schema::MIGRATIONS; use sui_indexer_alt_framework::ingestion::{ClientArgs, ingestion_client::IngestionClientArgs}; use sui_indexer_alt_framework::postgres::DbArgs; +use sui_indexer_alt_framework::service::Error; use sui_indexer_alt_framework::{Indexer, IndexerArgs}; use sui_indexer_alt_metrics::{MetricsArgs, MetricsService}; -use tokio_util::sync::CancellationToken; use url::Url; #[derive(Parser)] @@ -49,18 +49,14 @@ async fn main() -> Result<(), anyhow::Error> { remote_store_url, } = Args::parse(); - let cancel = CancellationToken::new(); + let is_bounded_job = indexer_args.last_checkpoint.is_some(); let registry = Registry::new_custom(Some("bridge".into()), None) .context("Failed to create Prometheus registry.")?; // Initialize bridge-specific metrics let bridge_metrics = BridgeIndexerMetrics::new(®istry); - let metrics = MetricsService::new( - MetricsArgs { metrics_address }, - registry, - cancel.child_token(), - ); + let metrics = MetricsService::new(MetricsArgs { metrics_address }, registry); let metrics_prefix = None; let mut indexer = Indexer::new_from_pg( @@ -78,7 +74,6 @@ async fn main() -> Result<(), anyhow::Error> { Some(&MIGRATIONS), metrics_prefix, metrics.registry(), - cancel.clone(), ) .await?; @@ -104,11 +99,23 @@ async fn main() -> Result<(), anyhow::Error> { .concurrent_pipeline(ErrorTransactionHandler, Default::default()) .await?; - let h_indexer = indexer.run().await?; - let h_metrics = metrics.run().await?; + let s_indexer = indexer.run().await?; + let s_metrics = metrics.run().await?; - let _ = h_indexer.await; - cancel.cancel(); - let _ = h_metrics.await; - Ok(()) + match s_indexer.attach(s_metrics).main().await { + Ok(()) => Ok(()), + Err(Error::Terminated) => { + if is_bounded_job { + std::process::exit(1); + } else { + Ok(()) + } + } + Err(Error::Aborted) => { + std::process::exit(1); + } + Err(Error::Task(_)) => { + std::process::exit(2); + } + } } diff --git a/crates/sui-checkpoint-blob-indexer/Cargo.toml b/crates/sui-checkpoint-blob-indexer/Cargo.toml index 43868ddd514ca..4b36317b12421 100644 --- a/crates/sui-checkpoint-blob-indexer/Cargo.toml +++ b/crates/sui-checkpoint-blob-indexer/Cargo.toml @@ -31,7 +31,6 @@ tracing.workspace = true tracing-subscriber.workspace = true url.workspace = true prometheus.workspace = true -tokio-util.workspace = true async-trait.workspace = true object_store.workspace = true serde_json.workspace = true diff --git a/crates/sui-checkpoint-blob-indexer/src/main.rs b/crates/sui-checkpoint-blob-indexer/src/main.rs index b8e958bc547f5..5d7f5bd4f9780 100644 --- a/crates/sui-checkpoint-blob-indexer/src/main.rs +++ b/crates/sui-checkpoint-blob-indexer/src/main.rs @@ -8,6 +8,7 @@ use object_store::{ gcp::GoogleCloudStorageBuilder, http::HttpBuilder, local::LocalFileSystem, }; use sui_checkpoint_blob_indexer::{CheckpointBlobPipeline, EpochsPipeline}; +use sui_indexer_alt_framework::service::Error; use sui_indexer_alt_framework::{Indexer, IndexerArgs, ingestion::ClientArgs}; use sui_indexer_alt_metrics::MetricsArgs; use sui_indexer_alt_object_store::ObjectStore; @@ -124,14 +125,9 @@ async fn main() -> anyhow::Result<()> { let store = ObjectStore::new(object_store); - let cancel = tokio_util::sync::CancellationToken::new(); - let registry = prometheus::Registry::new_custom(Some("checkpoint_blob".into()), None)?; - let metrics_service = sui_indexer_alt_metrics::MetricsService::new( - args.metrics_args, - registry.clone(), - cancel.clone(), - ); + let metrics_service = + 
sui_indexer_alt_metrics::MetricsService::new(args.metrics_args, registry.clone()); let config = ConcurrentConfig { committer: CommitterConfig { @@ -149,7 +145,6 @@ async fn main() -> anyhow::Result<()> { IngestionConfig::default(), None, ®istry, - cancel.clone(), ) .await?; @@ -166,67 +161,23 @@ async fn main() -> anyhow::Result<()> { .concurrent_pipeline(EpochsPipeline, config.clone()) .await?; - let h_metrics = metrics_service.run().await?; - let mut h_indexer = indexer.run().await?; - - enum ExitReason { - Completed, - UserInterrupt, // SIGINT / Ctrl-C - Terminated, // SIGTERM (i.e. from K8s) - } - - let exit_reason = tokio::select! { - res = &mut h_indexer => { - tracing::info!("Indexer completed successfully"); - res?; - ExitReason::Completed + let s_metrics = metrics_service.run().await?; + let s_indexer = indexer.run().await?; + + match s_indexer.attach(s_metrics).main().await { + Ok(()) => Ok(()), + Err(Error::Terminated) => { + if is_bounded_job { + std::process::exit(1); + } else { + Ok(()) + } } - _ = tokio::signal::ctrl_c() => { - tracing::info!("Received SIGINT, shutting down..."); - ExitReason::UserInterrupt - } - _ = wait_for_sigterm() => { - tracing::info!("Received SIGTERM, shutting down..."); - ExitReason::Terminated - } - }; - - cancel.cancel(); - tracing::info!("Waiting for graceful shutdown..."); - let _ = h_indexer.await; - let _ = h_metrics.await; - - // Determine exit code based on exit reason and job type - match exit_reason { - ExitReason::Completed => { - // Job finished all work successfully - Ok(()) - } - ExitReason::UserInterrupt => { - // User manually stopped it - treat as success - Ok(()) - } - ExitReason::Terminated if is_bounded_job => { - // Bounded job interrupted by K8s - work incomplete, trigger restart + Err(Error::Aborted) => { std::process::exit(1); } - ExitReason::Terminated => { - // Continuous indexer - normal shutdown - Ok(()) + Err(Error::Task(_)) => { + std::process::exit(2); } } } - -#[cfg(unix)] -async fn wait_for_sigterm() { - tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()) - .expect("Failed to install SIGTERM handler") - .recv() - .await; -} - -#[cfg(not(unix))] -async fn wait_for_sigterm() { - // SIGTERM doesn't exist on Windows, so wait forever - std::future::pending::<()>().await -} diff --git a/crates/sui-indexer-alt-consistent-store/Cargo.toml b/crates/sui-indexer-alt-consistent-store/Cargo.toml index abab408b90607..21b70ff7b3d45 100644 --- a/crates/sui-indexer-alt-consistent-store/Cargo.toml +++ b/crates/sui-indexer-alt-consistent-store/Cargo.toml @@ -34,7 +34,6 @@ telemetry-subscribers.workspace = true thiserror.workspace = true tokio.workspace = true tokio-stream.workspace = true -tokio-util.workspace = true toml.workspace = true tonic.workspace = true tonic-health.workspace = true diff --git a/crates/sui-indexer-alt-consistent-store/src/indexer.rs b/crates/sui-indexer-alt-consistent-store/src/indexer.rs index c7447071d4804..e921067f2711c 100644 --- a/crates/sui-indexer-alt-consistent-store/src/indexer.rs +++ b/crates/sui-indexer-alt-consistent-store/src/indexer.rs @@ -9,9 +9,8 @@ use sui_indexer_alt_framework::{ self as framework, IndexerArgs, ingestion::{ClientArgs, IngestionConfig}, pipeline::sequential::{self, SequentialConfig}, + service::Service, }; -use tokio::task::JoinHandle; -use tokio_util::sync::CancellationToken; use crate::{ config::ConsistencyConfig, @@ -59,7 +58,6 @@ impl Indexer { ingestion_config: IngestionConfig, db_config: DbConfig, registry: &Registry, - cancel: CancellationToken, ) 
-> anyhow::Result<Self> {
         let store = Store::open(
             path,
@@ -74,7 +72,6 @@
             consistency_config.stride,
             consistency_config.buffer_size,
             indexer_args.first_checkpoint,
-            cancel.child_token(),
         );
 
         let metrics_prefix = Some("consistent_indexer");
@@ -85,7 +82,6 @@
             ingestion_config,
             metrics_prefix,
             registry,
-            cancel.child_token(),
         )
         .await
         .context("Failed to create indexer")?;
@@ -135,16 +131,15 @@
     /// Start ingesting checkpoints, consuming the indexer in the process.
     ///
     /// See [`framework::Indexer::run`] for details.
-    pub(crate) async fn run(self) -> anyhow::Result<JoinHandle<()>> {
+    pub(crate) async fn run(self) -> anyhow::Result<Service> {
         // Associate the indexer's store with the synchronizer. This spins up a separate task for
         // each pipeline that was registered, and installs the write queues that talk to those
         // tasks into the store, so that when a write arrives to the store for a particular
         // pipeline, it can make its way to the right task.
-        let h_sync = self.indexer.store().sync(self.sync)?;
-        let h_indexer = self.indexer.run();
-        Ok(tokio::spawn(async move {
-            let (_, _) = futures::join!(h_sync, h_indexer);
-        }))
+        let s_sync = self.indexer.store().sync(self.sync)?;
+        let s_indexer = self.indexer.run().await?;
+
+        Ok(s_indexer.attach(s_sync))
     }
 }
 
@@ -241,7 +236,6 @@ mod tests {
             IngestionConfig::default(),
             DbConfig::default(),
             &prometheus::Registry::new(),
-            CancellationToken::new(),
         )
         .await
         .unwrap();
@@ -274,7 +268,6 @@
             IngestionConfig::default(),
             DbConfig::default(),
             &prometheus::Registry::new(),
-            CancellationToken::new(),
         )
         .await
         .unwrap();
diff --git a/crates/sui-indexer-alt-consistent-store/src/lib.rs b/crates/sui-indexer-alt-consistent-store/src/lib.rs
index a86d9db3847bb..7b326e2d0681c 100644
--- a/crates/sui-indexer-alt-consistent-store/src/lib.rs
+++ b/crates/sui-indexer-alt-consistent-store/src/lib.rs
@@ -43,10 +43,8 @@ use sui_indexer_alt_consistent_api::proto::{
 };
 use sui_indexer_alt_framework::{
     IndexerArgs, ingestion::ClientArgs, pipeline::CommitterConfig,
-    pipeline::sequential::SequentialConfig,
+    pipeline::sequential::SequentialConfig, service::Service,
 };
-use tokio::task::JoinHandle;
-use tokio_util::sync::CancellationToken;
 
 pub mod args;
 pub mod config;
@@ -60,8 +58,7 @@ pub(crate) mod schema;
 mod store;
 
 /// Set-up and run the Indexer and RPC service, using the provided arguments (expected to be
-/// extracted from the command-line). The service will continue to run until the cancellation token
-/// is triggered, and will signal cancellation on the token when it is shutting down.
+/// extracted from the command-line).
 ///
 /// `path` is the path to the RocksDB database,which will be created if it does not exist.
 /// `indexer_args` and `client_args` control the behavior of the Indexer, while `rpc_args` controls
@@ -77,8 +74,7 @@
     version: &'static str,
     config: ServiceConfig,
     registry: &Registry,
-    cancel: CancellationToken,
-) -> anyhow::Result<JoinHandle<()>> {
+) -> anyhow::Result<Service> {
     let ServiceConfig {
         ingestion,
         consistency,
@@ -103,7 +99,6 @@
         ingestion.into(),
         rocksdb,
         registry,
-        cancel.child_token(),
     )
     .await?;
 
@@ -113,7 +108,7 @@
         consistency_config: Arc::new(consistency),
     };
 
-    let rpc = RpcService::new(rpc_args, version, registry, cancel.child_token())
+    let rpc = RpcService::new(rpc_args, version, registry)
         .await?
.register_encoded_file_descriptor_set(proto::rpc::consistent::v1alpha::FILE_DESCRIPTOR_SET) .add_service(ConsistentServiceServer::new(state.clone())); @@ -138,10 +133,8 @@ pub async fn start_service( add_sequential!(ObjectByOwner, object_by_owner); add_sequential!(ObjectByType, object_by_type); - let h_rpc = rpc.run().await?; - let h_indexer = indexer.run().await?; + let s_rpc = rpc.run().await?; + let s_indexer = indexer.run().await?; - Ok(tokio::spawn(async move { - let (_, _) = futures::join!(h_rpc, h_indexer); - })) + Ok(s_rpc.merge(s_indexer)) } diff --git a/crates/sui-indexer-alt-consistent-store/src/main.rs b/crates/sui-indexer-alt-consistent-store/src/main.rs index 1696c89d4b6ae..c7935507b73eb 100644 --- a/crates/sui-indexer-alt-consistent-store/src/main.rs +++ b/crates/sui-indexer-alt-consistent-store/src/main.rs @@ -5,6 +5,7 @@ use std::path::PathBuf; use anyhow::Context; use clap::Parser; +use futures::TryFutureExt as _; use prometheus::Registry; use sui_indexer_alt_consistent_store::{ args::{Args, Command}, @@ -12,10 +13,9 @@ use sui_indexer_alt_consistent_store::{ restore::start_restorer, start_service, }; +use sui_indexer_alt_framework::service::Error; use sui_indexer_alt_metrics::{MetricsService, uptime}; -use tokio::{fs, signal, task::JoinHandle}; -use tokio_util::sync::CancellationToken; -use tracing::info; +use tokio::fs; // Define the `GIT_REVISION` const bin_version::git_revision!(); @@ -49,19 +49,17 @@ async fn main() -> anyhow::Result<()> { config, } => { let config = read_config(config).await?; - let cancel = CancellationToken::new(); let registry = Registry::new_custom(Some("consistent_store".into()), None) .context("Failed to create Prometheus registry")?; - let metrics = MetricsService::new(metrics_args, registry, cancel.child_token()); - let h_ctrl_c = handle_interrupt(cancel.clone()); + let metrics = MetricsService::new(metrics_args, registry); metrics .registry() .register(uptime(VERSION)?) .context("Failed to register uptime metric")?; - let h_service = start_service( + let s_service = start_service( database_path, indexer_args, client_args, @@ -69,16 +67,22 @@ async fn main() -> anyhow::Result<()> { VERSION, config, metrics.registry(), - cancel.child_token(), ) .await?; - let h_metrics = metrics.run().await?; + let s_metrics = metrics.run().await?; - let _ = h_service.await; - cancel.cancel(); - let _ = h_metrics.await; - let _ = h_ctrl_c.await; + match s_service.attach(s_metrics).main().await { + Ok(()) | Err(Error::Terminated) => {} + + Err(Error::Aborted) => { + std::process::exit(1); + } + + Err(Error::Task(_)) => { + std::process::exit(2); + } + } } Command::Restore { @@ -91,19 +95,17 @@ async fn main() -> anyhow::Result<()> { config, } => { let config = read_config(config).await?; - let cancel = CancellationToken::new(); let registry = Registry::new_custom(Some("consistent_store".into()), None) .context("Failed to create Prometheus registry")?; - let metrics = MetricsService::new(metrics_args, registry, cancel.child_token()); - let h_ctrl_c = handle_interrupt(cancel.clone()); + let metrics = MetricsService::new(metrics_args, registry); metrics .registry() .register(uptime(VERSION)?) 
.context("Failed to register uptime metric")?; - let h_restorer = start_restorer( + let (s_restorer, finalizer) = start_restorer( database_path, formal_snapshot_args, storage_connection_args, @@ -111,16 +113,29 @@ async fn main() -> anyhow::Result<()> { pipeline.into_iter().collect(), config.rocksdb, metrics.registry(), - cancel.child_token(), ) .await?; - let h_metrics = metrics.run().await?; - - let _ = h_restorer.await; - cancel.cancel(); - let _ = h_metrics.await; - let _ = h_ctrl_c.await; + let s_metrics = metrics.run().await?; + + match s_restorer + .attach(s_metrics) + .main() + .and_then(|_| finalizer.run().main()) + .await + { + Ok(()) => {} + + // We can only guarantee that the restorer succeeded if it is allowed to complete + // without being instructed to exit or abort. + Err(Error::Terminated | Error::Aborted) => { + std::process::exit(1); + } + + Err(Error::Task(_)) => { + std::process::exit(2); + } + } } Command::GenerateConfig => { @@ -146,15 +161,3 @@ async fn read_config(path: Option) -> anyhow::Result { Ok(ServiceConfig::default()) } } - -fn handle_interrupt(cancel: CancellationToken) -> JoinHandle<()> { - tokio::spawn(async move { - tokio::select! { - _ = cancel.cancelled() => {} - _ = signal::ctrl_c() => { - info!("Received Ctrl-C, shutting down..."); - cancel.cancel(); - } - } - }) -} diff --git a/crates/sui-indexer-alt-consistent-store/src/restore/broadcaster.rs b/crates/sui-indexer-alt-consistent-store/src/restore/broadcaster.rs index 05f379f9e10f6..7bffbfacc85cd 100644 --- a/crates/sui-indexer-alt-consistent-store/src/restore/broadcaster.rs +++ b/crates/sui-indexer-alt-consistent-store/src/restore/broadcaster.rs @@ -13,16 +13,15 @@ use std::{ use anyhow::Context as _; use backoff::{Error as BE, ExponentialBackoff}; use futures::{future::try_join_all, stream}; -use sui_futures::future::with_slow_future_monitor; -use sui_futures::stream::TrySpawnStreamExt; -use tokio::{sync::mpsc, task::JoinHandle}; -use tokio_util::sync::CancellationToken; +use sui_futures::stream::{Break, TrySpawnStreamExt}; +use sui_futures::{future::with_slow_future_monitor, service::Service}; +use tokio::sync::mpsc; use tracing::{error, info, warn}; use crate::db::Db; use super::{ - Break, FormalSnapshot, LiveObjects, RestorerMetrics, + FormalSnapshot, LiveObjects, RestorerMetrics, format::{EpochManifest, FileMetadata, FileType}, }; @@ -36,32 +35,24 @@ const SLOW_FETCH_THRESHOLD: Duration = Duration::from_secs(600); /// fetching object files from the formal snapshot and disseminating them to all subscribers in /// `subscribers`. /// -/// The task will shut down if the `cancel` token is signalled, or if all object files have been -/// restored across all subscribers. Returns `Ok(_)` if all object files were successfully fetched -/// and passed to subscribers, or `Err(_)` otherwise. +/// The task will shut down if all object files have been restored across all subscribers. Returns +/// `Ok(_)` if all object files were successfully fetched and passed to subscribers, or `Err(_)` +/// otherwise. 
pub(super) fn broadcaster( object_file_concurrency: usize, subscribers: BTreeMap>>, db: Arc, snapshot: FormalSnapshot, metrics: Arc, - cancel: CancellationToken, -) -> JoinHandle> { - tokio::spawn(async move { +) -> Service { + Service::new().spawn_aborting(async move { info!("Starting broadcaster"); - let manifest = match snapshot + let manifest = snapshot .manifest() .await .and_then(|bs| EpochManifest::read(&bs)) - { - Ok(manifest) => manifest, - Err(e) => { - error!("Failed to read snapshot manifest: {e:#}"); - cancel.cancel(); - return Err(()); - } - }; + .context("Failed to read snapshot manifest")?; let metadata: Vec<_> = manifest .metadata() @@ -80,9 +71,6 @@ pub(super) fn broadcaster( let snapshot = snapshot.clone(); let metrics = metrics.clone(); - let supervisor_cancel = cancel.clone(); - let cancel = cancel.clone(); - async move { let restored = db .is_restored( @@ -90,7 +78,8 @@ pub(super) fn broadcaster( metadata.partition, subscribers.keys().map(|p| p.as_str()), ) - .context("Failed to check restored markers")?; + .context("Failed to check restored markers") + .map_err(Break::Err)?; // If all the pipelines have restored this partition, it can be skipped. if restored.iter().all(|r| *r) { @@ -110,12 +99,11 @@ pub(super) fn broadcaster( } // Download the object file. - let objects = tokio::select! { - objects = fetch_objects(snapshot, metadata, metrics.as_ref()) => Arc::new(objects?), - _ = cancel.cancelled() => { - return Err(Break::Cancel); - } - }; + let objects = Arc::new( + fetch_objects(snapshot, metadata, metrics.as_ref()) + .await + .map_err(Break::Err)?, + ); // Send it to all subscribers who are not restored yet. let futures = subscribers @@ -126,8 +114,7 @@ pub(super) fn broadcaster( if try_join_all(futures).await.is_err() { info!("Subscription dropped, signalling shutdown"); - supervisor_cancel.cancel(); - Err(Break::Cancel) + Err(Break::Break) } else { metrics.total_partitions_broadcast.inc(); Ok(()) @@ -141,14 +128,14 @@ pub(super) fn broadcaster( Ok(()) } - Err(Break::Cancel) => { - info!("Shutdown received, stopping broadcaster"); - Err(()) + Err(Break::Break) => { + info!("Channel closed, stopping broadcaster"); + Ok(()) } Err(Break::Err(e)) => { error!("Error from broadcaster: {e:#}"); - Err(()) + Err(e) } } }) diff --git a/crates/sui-indexer-alt-consistent-store/src/restore/mod.rs b/crates/sui-indexer-alt-consistent-store/src/restore/mod.rs index 66d4fae05990f..b849f3ba82b3b 100644 --- a/crates/sui-indexer-alt-consistent-store/src/restore/mod.rs +++ b/crates/sui-indexer-alt-consistent-store/src/restore/mod.rs @@ -8,11 +8,10 @@ use std::{ }; use anyhow::{Context as _, ensure}; -use futures::future; use prometheus::Registry; +use sui_indexer_alt_framework::service::Service; use sui_indexer_alt_framework::{pipeline::Processor, types::object::Object}; -use tokio::{sync::mpsc, task::JoinHandle}; -use tokio_util::sync::CancellationToken; +use tokio::sync::mpsc; use tracing::{info, warn}; use crate::{ @@ -76,28 +75,19 @@ pub(crate) struct Restorer { /// Channels to send live object partitions down, one per pipeline being restored. restore_tx: BTreeMap>>, - /// Handles for tasks spawned by the restorer. They return `Ok(_)` if they complete - /// successfully, or `Err(_)` otherwise (they encountered an error, or they were cancelled). - handles: Vec>>, + /// Services spawned by the restorer, for individual pipelines. 
+    workers: Vec<Service>,
 
     /// Number of object files to download concurrenctly
     object_file_concurrency: usize,
 
     /// Maximum size of the backlog of object files waiting to be processed by one worker.
     object_file_buffer_size: usize,
-
-    /// Cancellation token shared among all continuous tasks in the service.
-    cancel: CancellationToken,
 }
 
-/// Internal type used by tasks to propagate errors or shutdown signals.
-#[derive(thiserror::Error, Debug)]
-enum Break {
-    #[error("Shutdown received")]
-    Cancel,
-
-    #[error(transparent)]
-    Err(#[from] anyhow::Error),
+pub struct Finalizer {
+    db: Arc<Db>,
+    pipelines: Vec<String>,
 }
 
 impl Restorer {
@@ -114,7 +104,6 @@
         config: DbConfig,
         metrics_prefix: Option<&str>,
         registry: &Registry,
-        cancel: CancellationToken,
     ) -> anyhow::Result<Self> {
         let RestoreArgs {
             object_file_concurrency,
@@ -144,10 +133,9 @@
             snapshot,
             metrics,
             restore_tx: BTreeMap::new(),
-            handles: vec![],
+            workers: vec![],
             object_file_concurrency,
             object_file_buffer_size,
-            cancel,
         })
     }
 
@@ -165,12 +153,11 @@
         let watermark = self.snapshot.watermark();
         self.db.restore_at(R::NAME, watermark)?;
 
-        self.handles.push(worker::(
+        self.workers.push(worker::(
             rx,
             self.db.clone(),
             self.schema.clone(),
             self.metrics.clone(),
-            self.cancel.clone(),
         ));
 
         Ok(())
@@ -179,41 +166,43 @@
     /// Start restoring live objects across all registered pipelines. The service will run until it
     /// can confirm that every registered pipeline has been fully restored, at which point, it will
     /// clean up the restoration state and set the watermark for the restored pipeline.
-    fn run(mut self) -> JoinHandle<()> {
+    fn run(self) -> (Service, Finalizer) {
         // Remember the pipelines being restored for the clean-up process.
-        let pipelines: Vec<_> = self.restore_tx.keys().cloned().collect();
-        info!(?pipelines, "Starting restoration");
+        let finalizer = Finalizer {
+            db: self.db.clone(),
+            pipelines: self.restore_tx.keys().cloned().collect(),
+        };
 
-        let broadcaster_h = broadcaster(
+        info!(pipelines = ?finalizer.pipelines, "Starting restoration");
+        let mut service = broadcaster(
             self.object_file_concurrency,
             self.restore_tx,
-            self.db.clone(),
+            self.db,
             self.snapshot,
             self.metrics,
-            self.cancel,
         );
 
-        self.handles.push(broadcaster_h);
-
-        tokio::spawn(async move {
-            // Wait for the broadcaster and all workers to wind down gracefully, and then clean up.
-            if future::join_all(self.handles)
-                .await
-                .into_iter()
-                .any(|r| matches!(r, Err(_) | Ok(Err(_))))
-            {
-                warn!("Restoration did not complete successfully, not updating watermarks");
-                return;
-            };
-
-            // Mark each pipeline as restored, setting its watermark.
-            for pipeline in pipelines {
+        for worker in self.workers {
+            service = service.merge(worker);
+        }
+
+        (service, finalizer)
+    }
+}
+
+impl Finalizer {
+    pub fn run(self) -> Service {
+        Service::new().spawn_aborting(async move {
+            // Clean up restoration state for each pipeline.
+            for pipeline in self.pipelines {
                 if let Err(e) = self.db.complete_restore(&pipeline) {
-                    warn!(pipeline, "Failed to finalize restoration: {e:#}");
+                    warn!(pipeline, "Failed to clear restoration state: {e:#}");
                 } else {
-                    info!(pipeline, "Restoration complete");
+                    info!(pipeline, "Restoration state cleared");
                 }
             }
+
+            Ok(())
         })
     }
 }
@@ -228,8 +217,7 @@ impl Default for RestoreArgs {
     }
 }
 
 /// Set-up and run the Restorer, using the provided arguments (expected to be extracted from the
-/// command-line). The service will run until restoration complete, or until the cancellation token
-/// is triggered.
+/// command-line). The service will run until restoration complete. /// /// `path` is the path to the RocksDB database to restore into. It will be created if it does not /// exist. `formal_snapshot_args` describes the formal snapshot source, `connection_args` controls @@ -243,8 +231,7 @@ pub async fn start_restorer( mut pipelines: BTreeSet, config: DbConfig, registry: &Registry, - cancel: CancellationToken, -) -> anyhow::Result> { +) -> anyhow::Result<(Service, Finalizer)> { let mut restorer: Restorer = Restorer::new( path, formal_snapshot_args, @@ -253,7 +240,6 @@ pub async fn start_restorer( config, Some("restorer"), registry, - cancel.child_token(), ) .await?; diff --git a/crates/sui-indexer-alt-consistent-store/src/restore/worker.rs b/crates/sui-indexer-alt-consistent-store/src/restore/worker.rs index cb79ffc963e9f..6806f2c2e63e0 100644 --- a/crates/sui-indexer-alt-consistent-store/src/restore/worker.rs +++ b/crates/sui-indexer-alt-consistent-store/src/restore/worker.rs @@ -4,27 +4,23 @@ use std::sync::Arc; use anyhow::Context as _; -use sui_futures::stream::TrySpawnStreamExt as _; -use tokio::{sync::mpsc, task::JoinHandle}; +use sui_futures::{service::Service, stream::TrySpawnStreamExt as _}; +use tokio::sync::mpsc; use tokio_stream::wrappers::ReceiverStream; -use tokio_util::sync::CancellationToken; use tracing::{error, info}; use crate::{db::Db, store::Schema}; -use super::{Break, LiveObjects, Restore, RestorerMetrics}; +use super::{LiveObjects, Restore, RestorerMetrics}; /// A worker that processes live objects from a single bucket and partition, for a given pipeline. -/// -/// Returns `Ok(_)` if it was able to process all live objects it was given, or `Err(_)` otherwise. pub(super) fn worker>( rx: mpsc::Receiver>, db: Arc, schema: Arc, metrics: Arc, - cancel: CancellationToken, -) -> JoinHandle> { - tokio::spawn(async move { +) -> Service { + Service::new().spawn_aborting(async move { info!(pipeline = R::NAME, "Starting worker"); match ReceiverStream::new(rx) @@ -32,7 +28,6 @@ pub(super) fn worker>( let db = db.clone(); let schema = schema.clone(); let metrics = metrics.clone(); - let cancel = cancel.clone(); async move { info!( @@ -60,9 +55,7 @@ pub(super) fn worker>( let mut batch = rocksdb::WriteBatch::default(); for object in &objects.objects { - if cancel.is_cancelled() { - return Err(Break::Cancel); - } + tokio::task::yield_now().await; R::restore(&schema, object, &mut batch).with_context(|| { format!( @@ -111,15 +104,9 @@ pub(super) fn worker>( Ok(()) } - Err(Break::Cancel) => { - info!(pipeline = R::NAME, "Shutdown received, stopping worker"); - Err(()) - } - - Err(Break::Err(e)) => { + Err(e) => { error!(pipeline = R::NAME, "Error from worker: {e:#}"); - cancel.cancel(); - Err(()) + Err(e) } } }) diff --git a/crates/sui-indexer-alt-consistent-store/src/rpc/mod.rs b/crates/sui-indexer-alt-consistent-store/src/rpc/mod.rs index a4afd281e8472..9428359276b33 100644 --- a/crates/sui-indexer-alt-consistent-store/src/rpc/mod.rs +++ b/crates/sui-indexer-alt-consistent-store/src/rpc/mod.rs @@ -11,20 +11,17 @@ use axum::extract::Request; use axum::response::IntoResponse; use axum_server::Handle; use axum_server::tls_rustls::RustlsConfig; -use futures::future::OptionFuture; use metrics::RpcMetrics; use middleware::metrics::MakeMetricsHandler; use middleware::version::Version; use mysten_network::callback::CallbackLayer; use prometheus::Registry; -use tokio::join; +use sui_futures::service::Service; use tokio::net::TcpListener; -use tokio::task::JoinHandle; -use 
tokio_util::sync::CancellationToken; +use tokio::sync::oneshot; use tonic::server::NamedService; use tonic_health::ServingStatus; -use tower::Service; -use tracing::{error, info}; +use tracing::info; pub(crate) mod consistent_service; mod error; @@ -88,9 +85,6 @@ pub(crate) struct RpcService<'d> { /// Metrics for the RPC service. metrics: Arc, - - /// Cancellation token controls lifecycle for all RPC-related services. - cancel: CancellationToken, } pub type BoxError = Box; @@ -100,7 +94,6 @@ impl<'d> RpcService<'d> { args: RpcArgs, version: &'static str, registry: &Registry, - cancel: CancellationToken, ) -> anyhow::Result { let RpcArgs { rpc_listen_address, @@ -133,7 +126,6 @@ impl<'d> RpcService<'d> { service_names: vec![], router: Router::new(), metrics: Arc::new(RpcMetrics::new(registry)), - cancel, }) } @@ -151,7 +143,7 @@ impl<'d> RpcService<'d> { where S: Clone + Send + Sync + 'static, S: NamedService, - S: Service, + S: tower::Service, S::Future: Send + 'static, S::Error: Send + Into, { @@ -161,7 +153,7 @@ impl<'d> RpcService<'d> { } /// Run the RPC service. This binds the listener and exposes handlers for the RPC service. - pub(crate) async fn run(self) -> anyhow::Result> { + pub(crate) async fn run(self) -> anyhow::Result { let Self { rpc_listen_address, rpc_tls_listen_address, @@ -172,7 +164,6 @@ impl<'d> RpcService<'d> { mut service_names, mut router, metrics, - cancel, } = self; let reflection_v1 = reflection_v1 @@ -209,31 +200,30 @@ impl<'d> RpcService<'d> { .await; } + let mut service = Service::new(); + // Start HTTPS server if TLS is configured - let https_service: OptionFuture<_> = - if let (Some(listen_address), Some(config)) = (rpc_tls_listen_address, tls_config) { - info!("Starting Consistent RPC TLS service on {listen_address}"); + if let (Some(listen_address), Some(config)) = (rpc_tls_listen_address, tls_config) { + info!("Starting Consistent RPC TLS service on {listen_address}"); + let handle = Handle::new(); + let tls_router = router.clone(); - // Handle graceful shutdown for TLS service - let handle = Handle::new(); - tokio::spawn({ + service = service + .with_shutdown_signal({ let handle = handle.clone(); - let cancel = cancel.clone(); async move { - cancel.cancelled().await; handle.graceful_shutdown(None); } - }); - - Some( + }) + .spawn(async move { axum_server::bind_rustls(listen_address, config) .handle(handle) - .serve(router.clone().into_make_service()), - ) - } else { - None - } - .into(); + .serve(tls_router.into_make_service()) + .await + .context("Consistent RPC TLS service failed")?; + Ok(()) + }); + } // Start HTTP server info!("Starting Consistent RPC service on {rpc_listen_address}"); @@ -241,28 +231,21 @@ impl<'d> RpcService<'d> { .await .context("Failed to bind Consistent RPC to listen address")?; - let http_service = axum::serve(listener, router.clone()).with_graceful_shutdown({ - let cancel = cancel.clone(); - async move { - cancel.cancelled().await; - info!("Shutting down Consistent RPC HTTP service"); - } - }); - - // Return a single task that waits for all servers - Ok(tokio::spawn(async move { - let (https, http) = join!(https_service, http_service); - - if let Err(e) = https.transpose() { - error!("Failed to start Consistent RPC TLS service: {e:?}"); - cancel.cancel(); - } - - if let Err(e) = http { - error!("Failed to start Consistent RPC service: {e:?}"); - cancel.cancel(); - } - })) + let (stx, srx) = oneshot::channel::<()>(); + service = service + .with_shutdown_signal(async move { + let _ = stx.send(()); + }) + .spawn(async move { + 
axum::serve(listener, router) + .with_graceful_shutdown(async move { + let _ = srx.await; + }) + .await + .context("Consistent RPC HTTP service failed") + }); + + Ok(service) } } @@ -283,7 +266,7 @@ fn add_service(router: Router, s: S) -> Router where S: Clone + Send + Sync + 'static, S: NamedService, - S: Service, + S: tower::Service, S::Future: Send + 'static, S::Error: Send + Into, { diff --git a/crates/sui-indexer-alt-consistent-store/src/store/mod.rs b/crates/sui-indexer-alt-consistent-store/src/store/mod.rs index 19e371ade7d11..f7b743d4b88f1 100644 --- a/crates/sui-indexer-alt-consistent-store/src/store/mod.rs +++ b/crates/sui-indexer-alt-consistent-store/src/store/mod.rs @@ -7,9 +7,9 @@ use std::{path::Path, sync::Arc, time::Duration}; use anyhow::{Context as _, anyhow, bail}; use prometheus::Registry; use scoped_futures::ScopedBoxFuture; +use sui_indexer_alt_framework::service::Service; use sui_indexer_alt_framework::store::{self, CommitterWatermark, Store as _}; use synchronizer::Queue; -use tokio::task::JoinHandle; use crate::db::config::DbConfig; use crate::db::{Db, Watermark}; @@ -105,13 +105,13 @@ impl Store { /// Run the provided synchronizer, and register its queue with the store. This will fail if the /// store already has a synchronizer running. - pub(crate) fn sync(&self, s: Synchronizer) -> anyhow::Result> { - let (handle, queue) = s.run()?; + pub(crate) fn sync(&self, s: Synchronizer) -> anyhow::Result { + let (service, queue) = s.run()?; self.0 .queue .set(queue) .map_err(|_| anyhow!("Store already has synchronizer"))?; - Ok(handle) + Ok(service) } } @@ -225,7 +225,6 @@ mod tests { use scoped_futures::ScopedFutureExt; use sui_indexer_alt_framework::store::{Connection as _, TransactionalStore}; use tokio::time::{self, error::Elapsed}; - use tokio_util::sync::CancellationToken; use crate::db::map::DbMap; @@ -330,17 +329,10 @@ mod tests { let stride = 1; let buffer_size = 10; let first_checkpoint = None; - let cancel = CancellationToken::new(); - let mut sync = Synchronizer::new( - store.db().clone(), - stride, - buffer_size, - first_checkpoint, - cancel.clone(), - ); + let mut sync = Synchronizer::new(store.db().clone(), stride, buffer_size, first_checkpoint); sync.register_pipeline("test").unwrap(); - let h_sync = store.sync(sync).unwrap(); + let _svc = store.sync(sync).unwrap(); write(&store, "test", 0, |s, b| { s.a.insert("x".to_owned(), 42, b)?; @@ -357,9 +349,6 @@ mod tests { let s = store.schema(); assert_eq!(s.a.get(0, "x".to_owned()).unwrap(), Some(42)); assert_eq!(s.b.get(0, 42).unwrap(), Some("x".to_owned())); - - cancel.cancel(); - h_sync.await.unwrap(); } #[tokio::test] @@ -371,18 +360,11 @@ mod tests { let stride = 1; let buffer_size = 10; let first_checkpoint = None; - let cancel = CancellationToken::new(); - let mut sync = Synchronizer::new( - store.db().clone(), - stride, - buffer_size, - first_checkpoint, - cancel.clone(), - ); + let mut sync = Synchronizer::new(store.db().clone(), stride, buffer_size, first_checkpoint); sync.register_pipeline("a").unwrap(); sync.register_pipeline("b").unwrap(); - let h_sync = store.sync(sync).unwrap(); + let _svc = store.sync(sync).unwrap(); write(&store, "a", 0, |s, b| { s.a.insert("x".to_owned(), 42, b)?; @@ -411,9 +393,6 @@ mod tests { let s = store.schema(); assert_eq!(s.a.get(0, "x".to_owned()).unwrap(), Some(42)); assert_eq!(s.b.get(0, 42).unwrap(), Some("x".to_owned())); - - cancel.cancel(); - h_sync.await.unwrap(); } #[tokio::test] @@ -448,26 +427,16 @@ mod tests { let stride = 1; let buffer_size = 10; let 
first_checkpoint = None; - let cancel = CancellationToken::new(); - let mut sync = Synchronizer::new( - store.db().clone(), - stride, - buffer_size, - first_checkpoint, - cancel.clone(), - ); + let mut sync = Synchronizer::new(store.db().clone(), stride, buffer_size, first_checkpoint); sync.register_pipeline("b").unwrap(); - let h_sync = store.sync(sync).unwrap(); + let _svc = store.sync(sync).unwrap(); // When there is existing data, the synchronizer will take a snapshot to make it available // before the store sees any writes. wait_until(|| async { has_range(&store, None, Some(0)) }) .await .unwrap(); - - cancel.cancel(); - h_sync.await.unwrap(); } #[tokio::test] @@ -507,27 +476,17 @@ mod tests { let stride = 1; let buffer_size = 10; let first_checkpoint = None; - let cancel = CancellationToken::new(); - let mut sync = Synchronizer::new( - store.db().clone(), - stride, - buffer_size, - first_checkpoint, - cancel.clone(), - ); + let mut sync = Synchronizer::new(store.db().clone(), stride, buffer_size, first_checkpoint); sync.register_pipeline("a").unwrap(); sync.register_pipeline("b").unwrap(); - let h_sync = store.sync(sync).unwrap(); + let _svc = store.sync(sync).unwrap(); // When there is existing data, the synchronizer will take a snapshot to make it available // before the store sees any writes. wait_until(|| async { has_range(&store, None, Some(0)) }) .await .unwrap(); - - cancel.cancel(); - h_sync.await.unwrap(); } #[tokio::test] @@ -562,18 +521,11 @@ mod tests { let stride = 1; let buffer_size = 10; let first_checkpoint = None; - let cancel = CancellationToken::new(); - let mut sync = Synchronizer::new( - store.db().clone(), - stride, - buffer_size, - first_checkpoint, - cancel.clone(), - ); + let mut sync = Synchronizer::new(store.db().clone(), stride, buffer_size, first_checkpoint); sync.register_pipeline("a").unwrap(); sync.register_pipeline("b").unwrap(); - let h_sync = store.sync(sync).unwrap(); + let _svc = store.sync(sync).unwrap(); // The pipelines are not in sync to begin with, so the synchronizer is waiting for the // writes for the other pipeline in order to take a snapshot. @@ -620,9 +572,6 @@ mod tests { let s = store.schema(); assert_eq!(s.a.get(1, "x".to_owned()).unwrap(), Some(42)); assert_eq!(s.b.get(1, 42).unwrap(), Some("y".to_owned())); - - cancel.cancel(); - h_sync.await.unwrap(); } #[tokio::test] @@ -635,18 +584,11 @@ mod tests { let stride = 1; let buffer_size = 10; let first_checkpoint = None; - let cancel = CancellationToken::new(); - let mut sync = Synchronizer::new( - store.db().clone(), - stride, - buffer_size, - first_checkpoint, - cancel.clone(), - ); + let mut sync = Synchronizer::new(store.db().clone(), stride, buffer_size, first_checkpoint); // Register a different pipeline, but not "test" sync.register_pipeline("other").unwrap(); - let h_sync = store.sync(sync).unwrap(); + let _svc = store.sync(sync).unwrap(); let err = write(&store, "test", 0, |_, _| Ok(())) .await @@ -656,9 +598,6 @@ mod tests { // If pipelines are not registered with the synchronizer before it is associated with the // store, writes to them will fail. 
assert!(err.contains("No \"test\" synchronizer queue"), "{err}"); - - cancel.cancel(); - h_sync.await.unwrap(); } #[tokio::test] @@ -671,14 +610,7 @@ mod tests { let stride = 1; let buffer_size = 10; let first_checkpoint = None; - let cancel = CancellationToken::new(); - let sync = Synchronizer::new( - store.db().clone(), - stride, - buffer_size, - first_checkpoint, - cancel.clone(), - ); + let sync = Synchronizer::new(store.db().clone(), stride, buffer_size, first_checkpoint); // Don't register any pipelines let err = store.sync(sync).unwrap_err().to_string(); @@ -698,17 +630,10 @@ mod tests { let stride = 1; let buffer_size = 10; let first_checkpoint = Some(100); - let cancel = CancellationToken::new(); - let mut sync = Synchronizer::new( - store.db().clone(), - stride, - buffer_size, - first_checkpoint, - cancel.clone(), - ); + let mut sync = Synchronizer::new(store.db().clone(), stride, buffer_size, first_checkpoint); sync.register_pipeline("test").unwrap(); - let h_sync = store.sync(sync).unwrap(); + let _svc = store.sync(sync).unwrap(); write(&store, "test", 100, |s, b| { s.a.insert("x".to_owned(), 42, b)?; @@ -727,9 +652,6 @@ mod tests { let s = store.schema(); assert_eq!(s.a.get(100, "x".to_owned()).unwrap(), Some(42)); assert_eq!(s.b.get(100, 42).unwrap(), Some("x".to_owned())); - - cancel.cancel(); - h_sync.await.unwrap(); } #[tokio::test] @@ -742,17 +664,10 @@ mod tests { let stride = 3; let buffer_size = 10; let first_checkpoint = None; - let cancel = CancellationToken::new(); - let mut sync = Synchronizer::new( - store.db().clone(), - stride, - buffer_size, - first_checkpoint, - cancel.clone(), - ); + let mut sync = Synchronizer::new(store.db().clone(), stride, buffer_size, first_checkpoint); sync.register_pipeline("test").unwrap(); - let h_sync = store.sync(sync).unwrap(); + let _svc = store.sync(sync).unwrap(); // Write a run of checkpoints. for cp in 0..=10 { @@ -794,9 +709,6 @@ mod tests { // Going back beyond the first checkpoint results in an empty range. assert_eq!(None, d.snapshot_range(1)); - - cancel.cancel(); - h_sync.await.unwrap(); } #[tokio::test] @@ -809,17 +721,10 @@ mod tests { let stride = 1; let buffer_size = 10; let first_checkpoint = Some(100); - let cancel = CancellationToken::new(); - let mut sync = Synchronizer::new( - store.db().clone(), - stride, - buffer_size, - first_checkpoint, - cancel.clone(), - ); + let mut sync = Synchronizer::new(store.db().clone(), stride, buffer_size, first_checkpoint); sync.register_pipeline("test").unwrap(); - let h_sync = store.sync(sync).unwrap(); + let _svc = store.sync(sync).unwrap(); let err = store .transaction(|c| { @@ -838,9 +743,6 @@ mod tests { .to_string(); assert!(err.contains("No watermark set during transaction"), "{err}"); - - cancel.cancel(); - h_sync.await.unwrap(); } #[tokio::test] @@ -853,17 +755,10 @@ mod tests { let stride = 1; let buffer_size = 10; let first_checkpoint = None; - let cancel = CancellationToken::new(); - let mut sync = Synchronizer::new( - store.db().clone(), - stride, - buffer_size, - first_checkpoint, - cancel.clone(), - ); + let mut sync = Synchronizer::new(store.db().clone(), stride, buffer_size, first_checkpoint); sync.register_pipeline("test").unwrap(); - let h_sync = store.sync(sync).unwrap(); + let mut svc = store.sync(sync).unwrap(); write(&store, "test", 0, |s, b| { s.a.insert("x".to_owned(), 42, b)?; @@ -880,11 +775,11 @@ mod tests { .unwrap(); // The out of order batch will appear to succeed, but the synchronizer will detect the - // situation and stop gracefully. 
- time::timeout(Duration::from_millis(500), h_sync) + // situation and stop gracefully, with an error. + time::timeout(Duration::from_millis(500), svc.join()) .await .unwrap() - .unwrap(); + .unwrap_err(); // The first write made it through, but the second one did not. let s = store.schema(); diff --git a/crates/sui-indexer-alt-consistent-store/src/store/synchronizer.rs b/crates/sui-indexer-alt-consistent-store/src/store/synchronizer.rs index e48ceaef7bff7..fa69345c9c293 100644 --- a/crates/sui-indexer-alt-consistent-store/src/store/synchronizer.rs +++ b/crates/sui-indexer-alt-consistent-store/src/store/synchronizer.rs @@ -3,15 +3,10 @@ use std::{cmp::Ordering, collections::HashMap, sync::Arc, time::Duration}; -use anyhow::{Context, bail}; -use futures::future; -use sui_futures::future::with_slow_future_monitor; -use tokio::{ - sync::{Barrier, mpsc}, - task::JoinHandle, -}; -use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, warn}; +use anyhow::{Context, bail, ensure}; +use sui_futures::{future::with_slow_future_monitor, service::Service}; +use tokio::sync::{Barrier, mpsc}; +use tracing::{debug, info, warn}; use crate::db::{Db, Watermark}; @@ -38,9 +33,6 @@ pub(crate) struct Synchronizer { /// The size of queues that feed each synchronizer task. buffer_size: usize, - - // Cancellation token shared among all synchronizer tasks. - cancel: CancellationToken, } /// Write access to each pipeline's synchronizer task. @@ -55,14 +47,13 @@ impl Synchronizer { /// `first_checkpoint` is the first checkpoint the service expects to see written to for /// completely new pipelines. If `None`, the first checkpoint is assumed to be `0`. /// - /// The service must be started by calling [Self::run], and it will stop if signalled on - /// `cancel`, if it has been instructed to write data out-of-order, or if a write fails. + /// The service must be started by calling [Self::run], and it will stop if it has been + /// instructed to write data out-of-order, or if a write fails. pub(crate) fn new( db: Arc, stride: u64, buffer_size: usize, first_checkpoint: Option, - cancel: CancellationToken, ) -> Self { Self { db, @@ -70,7 +61,6 @@ impl Synchronizer { first_checkpoint: first_checkpoint.unwrap_or(0), stride, buffer_size, - cancel, } } @@ -91,12 +81,10 @@ impl Synchronizer { } /// Start the service, accepting writes for registered pipelines. This consumes the service and - /// returns a `JoinHandle<()>` that will complete when all its tasks have completed, and the - /// `queue` data structure which gives access to the write side of the channels feeding each - /// task. - pub(super) fn run(self) -> anyhow::Result<(JoinHandle<()>, Queue)> { + /// returns a `Service` that will complete when all its tasks have completed, and the `queue` + /// data structure which gives access to the write side of the channels feeding each task. + pub(super) fn run(self) -> anyhow::Result<(Service, Queue)> { let mut queue = Queue::new(); - let mut tasks = Vec::new(); let pre_snap = Arc::new(Barrier::new(self.last_watermarks.len())); let post_snap = Arc::new(Barrier::new(self.last_watermarks.len())); @@ -115,11 +103,12 @@ impl Synchronizer { // checkpoint. 
let next_snapshot_checkpoint = ((init_checkpoint / self.stride) + 1) * self.stride; + let mut service = Service::new(); for (pipeline, last_watermark) in self.last_watermarks { let (tx, rx) = mpsc::channel(self.buffer_size); queue.insert(pipeline, tx); - tasks.push(synchronizer( + service = service.spawn_aborting(synchronizer( self.db.clone(), rx, pipeline, @@ -129,16 +118,10 @@ impl Synchronizer { last_watermark, pre_snap.clone(), post_snap.clone(), - self.cancel.child_token(), )); } - let handle = tokio::spawn(async move { - future::join_all(tasks).await; - info!("Synchronizer gracefully shut down"); - }); - - Ok((handle, queue)) + Ok((service, queue)) } } @@ -153,9 +136,9 @@ impl Synchronizer { /// `pre_snap` before a snapshot is to be taken, and on `post_snap` after the snapshot is taken /// (and data from future checkpoints can be written). /// -/// The task will stop when it receives a shutdown signal on `cancel`, or if it detects an issue -/// during writes (an out-of-order batch, or an error during writes). -fn synchronizer( +/// The task will stop if it detects an issue during writes (an out-of-order batch, or an error +/// during writes). +async fn synchronizer( db: Arc, mut rx: mpsc::Receiver<(Watermark, rocksdb::WriteBatch)>, pipeline: &'static str, @@ -165,93 +148,78 @@ fn synchronizer( mut current_watermark: Option, pre_snap: Arc, post_snap: Arc, - cancel: CancellationToken, -) -> JoinHandle<()> { - tokio::spawn(async move { - loop { - let next_checkpoint = current_watermark - .as_ref() - .map(|w| w.checkpoint_hi_inclusive + 1) - .unwrap_or(first_checkpoint); - - match next_snapshot_checkpoint.cmp(&next_checkpoint) { - // The next checkpoint should be included in the next snapshot, so allow it to be - // written. - Ordering::Greater => {} - - // If the next checkpoint is more than one checkpoint ahead of the next snapshot, - // something has gone wrong. - Ordering::Less => { - error!( - pipeline, - next_snapshot_checkpoint, next_checkpoint, "Missed snapshot" - ); - break; - } - - // The next checkpoint does not belong in the next snapshot, so wait for other - // synchronizers to reach this point, and take the snapshot before proceeding. - // - // One arbitrary task (the "leader") is responsible for taking the snapshot, the others - // just bump their own synchronization point and wait. - Ordering::Equal => { - tokio::select! { - w = with_slow_future_monitor(pre_snap.wait(), SLOW_SYNC_WARNING_THRESHOLD, || { - warn!(pipeline, "Synchronizer stuck, pre-snapshot") - }) => if w.is_leader() { - if let Some(watermark) = current_watermark { - db.take_snapshot(watermark); - } else { - error!(pipeline, next_snapshot_checkpoint, "No watermark available for snapshot"); - break; - } - }, - - _ = cancel.cancelled() => { - info!(pipeline, "Shutdown received before pre-snapshot barrier"); - break; - } - } +) -> anyhow::Result<()> { + loop { + let next_checkpoint = current_watermark + .as_ref() + .map(|w| w.checkpoint_hi_inclusive + 1) + .unwrap_or(first_checkpoint); + + match next_snapshot_checkpoint.cmp(&next_checkpoint) { + // The next checkpoint should be included in the next snapshot, so allow it to be + // written. + Ordering::Greater => {} + + // If the next checkpoint is more than one checkpoint ahead of the next snapshot, + // something has gone wrong. + Ordering::Less => { + bail!( + "Missed snapshot {next_snapshot_checkpoint} for {pipeline}, got {next_checkpoint}" + ); + } - next_snapshot_checkpoint += stride; - tokio::select! 
{ - _ = with_slow_future_monitor(post_snap.wait(), SLOW_SYNC_WARNING_THRESHOLD, || { - warn!(pipeline, "Synchronizer stuck, post-snapshot") - }) => {} - _ = cancel.cancelled() => { - info!(pipeline, "Shutdown received before post-snapshot barrier"); - break; - } + // The next checkpoint does not belong in the next snapshot, so wait for other + // synchronizers to reach this point, and take the snapshot before proceeding. + // + // One arbitrary task (the "leader") is responsible for taking the snapshot, the others + // just bump their own synchronization point and wait. + Ordering::Equal => { + let take_snapshot = + with_slow_future_monitor(pre_snap.wait(), SLOW_SYNC_WARNING_THRESHOLD, || { + warn!(pipeline, "Synchronizer stuck, pre-snapshot") + }) + .await + .is_leader(); + + if take_snapshot { + if let Some(watermark) = current_watermark { + db.take_snapshot(watermark); + } else { + bail!( + "{pipeline} has no watermark available for snapshot at {next_snapshot_checkpoint}" + ); } } + + next_snapshot_checkpoint += stride; + with_slow_future_monitor(post_snap.wait(), SLOW_SYNC_WARNING_THRESHOLD, || { + warn!(pipeline, "Synchronizer stuck, post-snapshot") + }) + .await; } + } - tokio::select! { - _ = cancel.cancelled() => { - info!(pipeline, "Shutdown received waiting for batch"); - break; - } + let Some((watermark, batch)) = rx.recv().await else { + info!(pipeline, "Synchronizer channel closed"); + break; + }; - Some((watermark, batch)) = rx.recv() => { - debug!(pipeline, ?watermark, "Received batch"); - if watermark.checkpoint_hi_inclusive != next_checkpoint { - error!(pipeline, ?watermark, next_checkpoint, "Out-of-order batch"); - break; - } + debug!(pipeline, ?watermark, "Received batch"); + ensure!( + watermark.checkpoint_hi_inclusive == next_checkpoint, + "Out-of-order batch for {pipeline}: expected {next_checkpoint}, got {watermark:?}", + ); - if let Err(e) = db.write(pipeline, watermark, batch) { - error!(pipeline, ?e, "Failed to write batch"); - break; - } + db.write(pipeline, watermark, batch) + .with_context(|| format!("Failed to write batch for {pipeline} at {watermark:?}"))?; - current_watermark = Some(watermark); - } - } - } + current_watermark = Some(watermark); + } - info!( - pipeline, - next_snapshot_checkpoint, watermark = ?current_watermark, "Stopping sync" - ); - }) + info!( + pipeline, + next_snapshot_checkpoint, watermark = ?current_watermark, "Stopping sync" + ); + + Ok(()) } diff --git a/crates/sui-indexer-alt-e2e-tests/Cargo.toml b/crates/sui-indexer-alt-e2e-tests/Cargo.toml index 67e551d6edecb..49d8ab68587d7 100644 --- a/crates/sui-indexer-alt-e2e-tests/Cargo.toml +++ b/crates/sui-indexer-alt-e2e-tests/Cargo.toml @@ -27,10 +27,10 @@ serde_json.workspace = true shared-crypto.workspace = true tempfile.workspace = true tokio.workspace = true -tokio-util.workspace = true url.workspace = true simulacrum.workspace = true +sui-futures.workspace = true sui-indexer-alt.workspace = true sui-indexer-alt-consistent-api.workspace = true sui-indexer-alt-consistent-store.workspace = true diff --git a/crates/sui-indexer-alt-e2e-tests/src/lib.rs b/crates/sui-indexer-alt-e2e-tests/src/lib.rs index 5fde3ffb85b27..437aeafaf1ec2 100644 --- a/crates/sui-indexer-alt-e2e-tests/src/lib.rs +++ b/crates/sui-indexer-alt-e2e-tests/src/lib.rs @@ -15,6 +15,7 @@ use diesel_async::RunQueryDsl; use reqwest::Client; use serde_json::{Value, json}; use simulacrum::Simulacrum; +use sui_futures::service::Service; use sui_indexer_alt::{BootstrapGenesis, config::IndexerConfig, setup_indexer}; use 
sui_indexer_alt_consistent_api::proto::rpc::consistent::v1alpha::{ AvailableRangeRequest, consistent_service_client::ConsistentServiceClient, @@ -56,11 +57,9 @@ use sui_types::{ }; use tempfile::TempDir; use tokio::{ - task::JoinHandle, time::{error::Elapsed, interval}, try_join, }; -use tokio_util::sync::CancellationToken; use url::Url; pub mod coin_registry; @@ -104,21 +103,10 @@ pub struct OffchainCluster { /// The pipelines that the indexer is populating. pipelines: Vec<&'static str>, - /// A handle to the indexer task -- it will stop when the `cancel` token is triggered (or - /// earlier of its own accord). - indexer: JoinHandle<()>, - - /// A handle to the consistent store task -- it will stop when the `cancel` token is triggered - /// (or earlier of its own accord). - consistent_store: JoinHandle<()>, - - /// A handle to the JSON-RPC server task -- it will stop when the `cancel` token is triggered - /// (or earlier of its own accord). - jsonrpc: JoinHandle<()>, - - /// A handle to the GraphQL server task -- it will stop when the `cancel` token is triggered - /// (or earlier of its own accord). - graphql: JoinHandle<()>, + /// Handles to all running services. Held on to so the services are not dropped (and therefore + /// aborted) until the cluster is stopped. + #[allow(unused)] + services: Service, /// Hold on to the database so it doesn't get dropped until the cluster is stopped. #[allow(unused)] @@ -128,9 +116,6 @@ pub struct OffchainCluster { /// doesn't get cleaned up until the cluster is stopped. #[allow(unused)] dir: TempDir, - - /// This token controls the clean up of the cluster. - cancel: CancellationToken, } pub struct OffchainClusterConfig { @@ -152,7 +137,6 @@ impl FullCluster { Simulacrum::new(), OffchainClusterConfig::default(), &prometheus::Registry::new(), - CancellationToken::new(), ) .await } @@ -164,12 +148,11 @@ impl FullCluster { mut executor: Simulacrum, offchain_cluster_config: OffchainClusterConfig, registry: &prometheus::Registry, - cancel: CancellationToken, ) -> anyhow::Result { let (client_args, temp_dir) = local_ingestion_client_args(); executor.set_data_ingestion_path(temp_dir.path().to_owned()); - let offchain = OffchainCluster::new(client_args, offchain_cluster_config, registry, cancel) + let offchain = OffchainCluster::new(client_args, offchain_cluster_config, registry) .await .context("Failed to create off-chain cluster")?; @@ -296,12 +279,6 @@ impl FullCluster { ) -> Result<(), Elapsed> { self.offchain.wait_for_graphql(checkpoint, timeout).await } - - /// Triggers cancellation of all downstream services, waits for them to stop, cleans up the - /// temporary database, and the temporary directory used for ingestion. 
- pub async fn stopped(self) { - self.offchain.stopped().await; - } } impl OffchainCluster { @@ -325,7 +302,6 @@ impl OffchainCluster { bootstrap_genesis, }: OffchainClusterConfig, registry: &prometheus::Registry, - cancel: CancellationToken, ) -> anyhow::Result { let consistent_port = get_available_port(); let consistent_listen_address = @@ -370,7 +346,6 @@ impl OffchainCluster { indexer_config, bootstrap_genesis, registry, - cancel.child_token(), ) .await .context("Failed to setup indexer")?; @@ -386,7 +361,6 @@ impl OffchainCluster { "0.0.0", consistent_config, registry, - cancel.child_token(), ) .await .context("Failed to start Consistent Store")?; @@ -401,7 +375,6 @@ impl OffchainCluster { SystemPackageTaskArgs::default(), jsonrpc_config, registry, - cancel.child_token(), ) .await .context("Failed to start JSON-RPC server")?; @@ -425,24 +398,24 @@ impl OffchainCluster { graphql_config, pipelines.iter().map(|p| p.to_string()).collect(), registry, - cancel.child_token(), ) .await .context("Failed to start GraphQL server")?; + let services = indexer + .merge(consistent_store) + .merge(jsonrpc) + .merge(graphql); + Ok(Self { consistent_listen_address, jsonrpc_listen_address, graphql_listen_address, db, pipelines, - indexer, - consistent_store, - jsonrpc, - graphql, + services, database, dir, - cancel, }) } @@ -664,16 +637,6 @@ impl OffchainCluster { }) .await } - - /// Triggers cancellation of all downstream services, waits for them to stop, and cleans up the - /// temporary database. - pub async fn stopped(self) { - self.cancel.cancel(); - let _ = self.indexer.await; - let _ = self.consistent_store.await; - let _ = self.jsonrpc.await; - let _ = self.graphql.await; - } } impl Default for OffchainClusterConfig { diff --git a/crates/sui-indexer-alt-e2e-tests/tests/consistent_store_list_owned_objects_tests.rs b/crates/sui-indexer-alt-e2e-tests/tests/consistent_store_list_owned_objects_tests.rs index 6142a90e54e96..c09822e3f4892 100644 --- a/crates/sui-indexer-alt-e2e-tests/tests/consistent_store_list_owned_objects_tests.rs +++ b/crates/sui-indexer-alt-e2e-tests/tests/consistent_store_list_owned_objects_tests.rs @@ -21,7 +21,6 @@ use sui_types::{ programmable_transaction_builder::ProgrammableTransactionBuilder, transaction::{Argument, Command, Transaction, TransactionData}, }; -use tokio_util::sync::CancellationToken; /// 5 SUI gas budget const DEFAULT_GAS_BUDGET: u64 = 5_000_000_000; @@ -553,7 +552,6 @@ async fn test_coin_balance_change_cleanup() { Simulacrum::new_with_protocol_version(OsRng, 27.into()), Default::default(), &Registry::new(), - CancellationToken::new(), ) .await .unwrap(); diff --git a/crates/sui-indexer-alt-e2e-tests/tests/graphql_available_range_tests.rs b/crates/sui-indexer-alt-e2e-tests/tests/graphql_available_range_tests.rs index c0c4bdc0c58ee..c377c1296848c 100644 --- a/crates/sui-indexer-alt-e2e-tests/tests/graphql_available_range_tests.rs +++ b/crates/sui-indexer-alt-e2e-tests/tests/graphql_available_range_tests.rs @@ -9,7 +9,6 @@ use move_core_types::ident_str; use reqwest::Client; use serde_json::{Value, json}; use simulacrum::Simulacrum; -use tokio_util::sync::CancellationToken; use sui_indexer_alt::config::{ConcurrentLayer, IndexerConfig, PipelineLayer, PrunerLayer}; use sui_indexer_alt_e2e_tests::{FullCluster, OffchainClusterConfig, find::address_owned}; @@ -138,8 +137,6 @@ async fn test_available_range_with_pipelines() { let (first, last) = collect_sequence_numbers(&transasction_available_range); assert_eq!(first, 1); assert_eq!(last, 10); - - 
cluster.stopped().await; } /// Test that querying available range for a pipeline that is not enabled returns an error @@ -171,7 +168,6 @@ async fn test_available_range_pipeline_unavailable() { } } ]"###); - cluster.stopped().await; } /// Test available range queries with retention configurations @@ -236,8 +232,6 @@ async fn test_transaction_pagination_pruning() { let transactions_in_range = query_transactions(&cluster, b).await; let actual = collect_digests(&transactions_in_range); assert_eq!(&a_txs[6..], &actual); - - cluster.stopped().await; } #[tokio::test] @@ -276,8 +270,6 @@ async fn test_events_pagination_pruning() { let events_in_range = query_events(&cluster, json!({ "type": pkg.to_string() })).await; let actual = collect_digests(&events_in_range); assert_eq!(&tx_digests, &actual); - - cluster.stopped().await; } #[tokio::test] @@ -314,8 +306,6 @@ async fn test_checkpoint_pagination_pruning() { checkpoints[3]["sequenceNumber"].as_u64().unwrap(), cp_sequence_numbers[7] ); - - cluster.stopped().await; } /// Set-up a cluster with a custom configuration for pipelines. @@ -336,7 +326,6 @@ async fn cluster_with_pipelines(pipeline: PipelineLayer) -> FullCluster { ..Default::default() }, &prometheus::Registry::new(), - CancellationToken::new(), ) .await .expect("Failed to create cluster") diff --git a/crates/sui-indexer-alt-e2e-tests/tests/graphql_epoch_tests.rs b/crates/sui-indexer-alt-e2e-tests/tests/graphql_epoch_tests.rs index 6d969eb2f5ce8..99f8f1f1b4fe5 100644 --- a/crates/sui-indexer-alt-e2e-tests/tests/graphql_epoch_tests.rs +++ b/crates/sui-indexer-alt-e2e-tests/tests/graphql_epoch_tests.rs @@ -27,7 +27,6 @@ use sui_types::{ }, test_checkpoint_data_builder::{AdvanceEpochConfig, TestCheckpointBuilder}, }; -use tokio_util::sync::CancellationToken; const SAFE_MODE_QUERY: &str = r#" query { @@ -226,7 +225,6 @@ async fn test_graphql( ..Default::default() }, &prometheus::Registry::new(), - CancellationToken::new(), ) .await?; @@ -243,8 +241,6 @@ async fn test_graphql( let request = client.post(offchain.graphql_url()).json(&query); let response = request.send().await?; - offchain.stopped().await; - let value: Value = response.json().await?; let Some(epoch) = value.pointer("/data/epoch") else { return Err(anyhow::Error::msg("epoch not found")); diff --git a/crates/sui-indexer-alt-e2e-tests/tests/graphql_execute_transaction_tests.rs b/crates/sui-indexer-alt-e2e-tests/tests/graphql_execute_transaction_tests.rs index d8c9c5fe90650..c13529a407835 100644 --- a/crates/sui-indexer-alt-e2e-tests/tests/graphql_execute_transaction_tests.rs +++ b/crates/sui-indexer-alt-e2e-tests/tests/graphql_execute_transaction_tests.rs @@ -20,8 +20,7 @@ use sui_pg_db::{DbArgs, temp::get_available_port}; use sui_test_transaction_builder::make_transfer_sui_transaction; use sui_types::{gas_coin::GasCoin, transaction::SharedObjectMutability}; -use tokio::task::JoinHandle; -use tokio_util::sync::CancellationToken; +use sui_futures::service::Service; use url::Url; use sui_types::{base_types::SuiAddress, transaction::ObjectArg}; @@ -101,8 +100,10 @@ struct ConsensusObject { struct GraphQlTestCluster { url: Url, - handle: JoinHandle<()>, - cancel: CancellationToken, + /// Hold on to the service so it doesn't get dropped (and therefore aborted) until the cluster + /// goes out of scope. 
+ #[allow(unused)] + service: Service, } impl GraphQlTestCluster { @@ -128,11 +129,6 @@ impl GraphQlTestCluster { Ok(body) } - - async fn stopped(self) { - self.cancel.cancel(); - let _ = self.handle.await; - } } #[derive(Debug, Deserialize)] @@ -164,10 +160,8 @@ async fn create_graphql_test_cluster(validator_cluster: &TestCluster) -> GraphQl fullnode_rpc_url: Some(validator_cluster.rpc_url().to_string()), }; - let cancel = CancellationToken::new(); - // Start GraphQL server that connects directly to TestCluster's RPC - let graphql_handle = start_graphql( + let service = start_graphql( None, // No database - GraphQL will use fullnode RPC for executeTransaction fullnode_args, DbArgs::default(), @@ -179,7 +173,6 @@ async fn create_graphql_test_cluster(validator_cluster: &TestCluster) -> GraphQl GraphQlConfig::default(), vec![], // No pipelines since we're not using database &Registry::new(), - cancel.child_token(), ) .await .expect("Failed to start GraphQL server"); @@ -187,11 +180,7 @@ async fn create_graphql_test_cluster(validator_cluster: &TestCluster) -> GraphQl let url = Url::parse(&format!("http://{}/graphql", graphql_listen_address)) .expect("Failed to parse GraphQL URL"); - GraphQlTestCluster { - url, - handle: graphql_handle, - cancel, - } + GraphQlTestCluster { url, service } } #[sim_test] @@ -264,8 +253,6 @@ async fn test_execute_transaction_mutation_schema() { for (returned, original) in transaction.signatures.iter().zip(signatures.iter()) { assert_eq!(returned.signature_bytes, original.encoded()); } - - graphql_cluster.stopped().await; } #[sim_test] @@ -296,8 +283,6 @@ async fn test_execute_transaction_input_validation() { .unwrap(); assert!(result.get("errors").is_some()); - - graphql_cluster.stopped().await; } #[sim_test] @@ -375,8 +360,6 @@ async fn test_execute_transaction_with_events() { "Event sender should match transaction sender" ); } - - graphql_cluster.stopped().await; } #[sim_test] @@ -432,8 +415,6 @@ async fn test_execute_transaction_grpc_errors() { ); let error_array = errors.as_array().unwrap(); assert!(!error_array.is_empty()); - - graphql_cluster.stopped().await; } #[sim_test] @@ -530,8 +511,6 @@ async fn test_execute_transaction_unchanged_consensus_objects() { let object = first_edge.node.object.as_ref().unwrap(); assert_eq!(object.address, sui_types::SUI_CLOCK_OBJECT_ID.to_string()); assert!(object.version > 0, "Version should be greater than 0"); - - graphql_cluster.stopped().await; } #[sim_test] diff --git a/crates/sui-indexer-alt-e2e-tests/tests/graphql_name_service_tests.rs b/crates/sui-indexer-alt-e2e-tests/tests/graphql_name_service_tests.rs index e7a6a94f1affe..e03b97bca62b3 100644 --- a/crates/sui-indexer-alt-e2e-tests/tests/graphql_name_service_tests.rs +++ b/crates/sui-indexer-alt-e2e-tests/tests/graphql_name_service_tests.rs @@ -18,7 +18,6 @@ use sui_types::{ programmable_transaction_builder::ProgrammableTransactionBuilder, transaction::{ObjectArg, SharedObjectMutability, Transaction, TransactionData}, }; -use tokio_util::sync::CancellationToken; /// 5 SUI gas budget const DEFAULT_GAS_BUDGET: u64 = 5_000_000_000; @@ -89,8 +88,6 @@ async fn test_resolve_domain() { assert_resolved!(target, c.resolve_address("foo.sui").await.unwrap()); assert_resolved!(target, c.resolve_address("@foo").await.unwrap()); assert_reverse!("foo.sui", c.resolve_name(target).await.unwrap()); - - c.cluster.stopped().await; } /// If a domain name exists but has no target, we can't resolve it, but it's not an error. 
@@ -108,8 +105,6 @@ async fn test_resolve_domain_no_target() { let resp = c.resolve_address("foo.sui").await.unwrap(); assert!(resp["data"]["suinsName"].is_null()); assert!(resp["errors"].is_null()); - - c.cluster.stopped().await; } /// Set-up a domain with an expiry, and confirm that it exists, then advance time on-chain until it @@ -136,8 +131,6 @@ async fn test_resolve_domain_expiry() { assert_not_resolved!(c.resolve_address("foo.sui").await.unwrap()); assert_no_reverse!(c.resolve_name(target).await.unwrap()); - - c.cluster.stopped().await; } #[tokio::test] @@ -146,8 +139,6 @@ async fn test_resolve_nonexistent_domain() { c.cluster.create_checkpoint().await; assert_not_resolved!(c.resolve_address("foo.sui").await.unwrap()); - - c.cluster.stopped().await; } /// Test resolving a valid sub-domain (which requires both the sub-domain and its parent to exist @@ -172,8 +163,6 @@ async fn test_resolve_subdomain() { assert_resolved!(target, c.resolve_address("bar.foo.sui").await.unwrap()); assert_resolved!(target, c.resolve_address("bar@foo").await.unwrap()); assert_reverse!("bar.foo.sui", c.resolve_name(target).await.unwrap()); - - c.cluster.stopped().await; } /// Like the parent domain case, but a sub-domain's expiry is controlled by its parent's expiry @@ -203,8 +192,6 @@ async fn test_resolve_subdomain_parent_expiry() { assert_not_resolved!(c.resolve_address("bar.foo.sui").await.unwrap()); assert_no_reverse!(c.resolve_name(target).await.unwrap()); - - c.cluster.stopped().await; } /// A sub-domain that has its own expiry, in addition to (and before) the parent's expiry. @@ -236,8 +223,6 @@ async fn test_resolve_subdomain_expiry() { assert_not_resolved!(c.resolve_address("bar.foo.sui").await.unwrap()); assert_no_reverse!(c.resolve_name(target).await.unwrap()); - - c.cluster.stopped().await; } /// A sub-domain where the parent domain's NFT is different from the sub-domain's NFT, is @@ -264,8 +249,6 @@ async fn test_resolve_subdomain_bad_parent() { assert_not_resolved!(c.resolve_address("bar.foo.sui").await.unwrap()); assert_no_reverse!(c.resolve_name(target).await.unwrap()); - - c.cluster.stopped().await; } /// The parent domain record does not exist, so the sub-domain is considered expired. 
@@ -284,8 +267,6 @@ async fn test_resolve_subdomain_no_parent() { assert_not_resolved!(c.resolve_address("bar.foo.sui").await.unwrap()); assert_no_reverse!(c.resolve_name(target).await.unwrap()); - - c.cluster.stopped().await; } struct SuiNSCluster { @@ -424,7 +405,6 @@ impl SuiNSCluster { ..Default::default() }, &prometheus::Registry::new(), - CancellationToken::new(), ) .await .expect("Failed to set-up cluster"); diff --git a/crates/sui-indexer-alt-e2e-tests/tests/graphql_request_id.rs b/crates/sui-indexer-alt-e2e-tests/tests/graphql_request_id.rs index e1e77b41ff080..ac6d8a5cba6f4 100644 --- a/crates/sui-indexer-alt-e2e-tests/tests/graphql_request_id.rs +++ b/crates/sui-indexer-alt-e2e-tests/tests/graphql_request_id.rs @@ -37,8 +37,6 @@ async fn request_id(query: &str) -> Result { let request = client.post(cluster.graphql_url()).json(&query); let response = request.send().await?; - cluster.stopped().await; - let request_id = response .headers() .get(REQUEST_ID_HEADER) diff --git a/crates/sui-indexer-alt-e2e-tests/tests/graphql_simulate_transaction_tests.rs b/crates/sui-indexer-alt-e2e-tests/tests/graphql_simulate_transaction_tests.rs index 8c40fbbd0ce72..fbbe623a99266 100644 --- a/crates/sui-indexer-alt-e2e-tests/tests/graphql_simulate_transaction_tests.rs +++ b/crates/sui-indexer-alt-e2e-tests/tests/graphql_simulate_transaction_tests.rs @@ -29,8 +29,7 @@ use sui_pg_db::{ use sui_test_transaction_builder::make_transfer_sui_transaction; use sui_types::gas_coin::GasCoin; -use tokio::task::JoinHandle; -use tokio_util::sync::CancellationToken; +use sui_futures::service::Service; use url::Url; use sui_types::base_types::SuiAddress; @@ -141,9 +140,10 @@ struct FieldLayout { struct GraphQlTestCluster { url: Url, - handle: JoinHandle<()>, - cancel: CancellationToken, - indexer_handle: JoinHandle<()>, + /// Hold on to the service so it doesn't get dropped (and therefore aborted) until the cluster + /// goes out of scope. + #[allow(unused)] + service: Service, /// Hold on to the database so it doesn't get dropped until the cluster is stopped. 
#[allow(unused)] database: TempDb, @@ -153,7 +153,6 @@ impl GraphQlTestCluster { async fn new(validator_cluster: &TestCluster) -> Self { let graphql_port = get_available_port(); let graphql_listen_address = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), graphql_port); - let cancel = CancellationToken::new(); let database = TempDb::new().expect("Failed to create temp database"); let database_url = database.database().url().clone(); @@ -179,15 +178,14 @@ impl GraphQlTestCluster { IndexerConfig::for_test(), None, &Registry::new(), - cancel.child_token(), ) .await .expect("Failed to setup indexer"); let pipelines: Vec = indexer.pipelines().map(|s| s.to_string()).collect(); - let indexer_handle = indexer.run().await.expect("Failed to start indexer"); + let s_indexer = indexer.run().await.expect("Failed to start indexer"); - let graphql_handle = start_graphql( + let s_graphql = start_graphql( Some(database_url), fullnode_args, DbArgs::default(), @@ -202,7 +200,6 @@ impl GraphQlTestCluster { GraphQlConfig::default(), pipelines, &Registry::new(), - cancel.child_token(), ) .await .expect("Failed to start GraphQL server"); @@ -212,9 +209,7 @@ impl GraphQlTestCluster { Self { url, - handle: graphql_handle, - cancel, - indexer_handle, + service: s_graphql.merge(s_indexer), database, } } @@ -241,12 +236,6 @@ impl GraphQlTestCluster { Ok(body) } - - async fn stopped(self) { - self.cancel.cancel(); - let _ = self.handle.await; - let _ = self.indexer_handle.await; - } } #[tokio::test] @@ -312,8 +301,6 @@ async fn test_simulate_transaction_basic() { // For simulation, signatures should be empty since we don't provide them assert_eq!(transaction.signatures.len(), 0); - - graphql_cluster.stopped().await; } #[tokio::test] @@ -350,7 +337,7 @@ async fn test_simulate_transaction_with_events() { modules { nodes { name - } + } } } name @@ -408,8 +395,6 @@ async fn test_simulate_transaction_with_events() { "error": null } "#); - - graphql_cluster.stopped().await; } #[tokio::test] @@ -441,8 +426,6 @@ async fn test_simulate_transaction_input_validation() { // Should return GraphQL errors for invalid input assert!(result.get("errors").is_some()); - - graphql_cluster.stopped().await; } #[tokio::test] @@ -570,8 +553,6 @@ async fn test_simulate_transaction_object_changes() { .as_str() .unwrap(); assert_eq!(created_type, sui_coin_type); - - graphql_cluster.stopped().await; } #[tokio::test] @@ -780,8 +761,6 @@ async fn test_simulate_transaction_command_results() { _ => panic!("Unexpected command index: {}", i), } } - - graphql_cluster.stopped().await; } #[tokio::test] @@ -889,8 +868,6 @@ async fn test_simulate_transaction_json_transfer() { // For simulation, signatures should be empty since we don't provide them assert_eq!(transaction.signatures.len(), 0); - - graphql_cluster.stopped().await; } #[tokio::test] @@ -1002,6 +979,4 @@ async fn test_package_resolver_finds_newly_published_package() { .unwrap() .contains("::resolver_test::NestedObject") ); - - graphql_cluster.stopped().await; } diff --git a/crates/sui-indexer-alt-e2e-tests/tests/graphql_zklogin_tests.rs b/crates/sui-indexer-alt-e2e-tests/tests/graphql_zklogin_tests.rs index 30f23f642e51d..98bc985bee880 100644 --- a/crates/sui-indexer-alt-e2e-tests/tests/graphql_zklogin_tests.rs +++ b/crates/sui-indexer-alt-e2e-tests/tests/graphql_zklogin_tests.rs @@ -22,7 +22,6 @@ use sui_types::{ use tempfile::TempDir; use test_cluster::{TestCluster, TestClusterBuilder}; use tokio::time::interval; -use tokio_util::sync::CancellationToken; const QUERY: &str = r#" query ($bytes: 
Base64!, $signature: Base64!, $scope: ZkLoginIntentScope!, $author: SuiAddress!) { @@ -88,7 +87,6 @@ impl FullCluster { ..Default::default() }, &Registry::new(), - CancellationToken::new(), ) .await?; diff --git a/crates/sui-indexer-alt-e2e-tests/tests/grpc_streaming_integration_tests.rs b/crates/sui-indexer-alt-e2e-tests/tests/grpc_streaming_integration_tests.rs index aa1ad51ea0b86..4af799dab7ef2 100644 --- a/crates/sui-indexer-alt-e2e-tests/tests/grpc_streaming_integration_tests.rs +++ b/crates/sui-indexer-alt-e2e-tests/tests/grpc_streaming_integration_tests.rs @@ -5,11 +5,13 @@ use std::time::Duration; use diesel::QueryDsl; use diesel_async::RunQueryDsl; +use prometheus::Registry; use sui_field_count::FieldCount; use sui_indexer_alt_framework::{ - cluster::IndexerCluster, + Indexer, IndexerArgs, ingestion::{ - ClientArgs, ingestion_client::IngestionClientArgs, streaming_client::StreamingClientArgs, + ClientArgs, IngestionConfig, ingestion_client::IngestionClientArgs, + streaming_client::StreamingClientArgs, }, pipeline::{ Processor, @@ -148,8 +150,10 @@ async fn test_indexer_cluster_with_grpc_streaming() { let reader = Db::for_read(url.clone(), DbArgs::default()).await.unwrap(); let writer = Db::for_write(url.clone(), DbArgs::default()).await.unwrap(); - // Create the tx_counts table + // Create the schema for the test. { + writer.run_migrations(None).await.unwrap(); + let mut conn = writer.connect().await.unwrap(); diesel::sql_query( r#" @@ -164,13 +168,17 @@ async fn test_indexer_cluster_with_grpc_streaming() { .unwrap(); } - // Build the indexer cluster with gRPC streaming configuration - let mut indexer = IndexerCluster::builder() - .with_database_url(url.clone()) - .with_client_args(client_args) - .build() - .await - .unwrap(); + // Build the indexer with gRPC streaming configuration + let mut indexer = Indexer::new( + writer, + IndexerArgs::default(), + client_args, + IngestionConfig::default(), + None, + &Registry::new(), + ) + .await + .unwrap(); // Add the tx_counts pipeline indexer @@ -178,11 +186,10 @@ async fn test_indexer_cluster_with_grpc_streaming() { .await .unwrap(); - let cancel = indexer.cancel().clone(); - // Run the indexer - it will use gRPC streaming from TestCluster - // and fall back to local ingestion if needed - let handle = indexer.run().await.unwrap(); + // and fall back to local ingestion if needed. Hold on to the service so it doesn't get + // dropped (and therefore aborted) until the test ends. 
+ let _service = indexer.run().await.unwrap(); // Poll every 100ms with a 5s timeout for the sum of user transactions to reach 5 tokio::time::timeout(Duration::from_secs(5), async { @@ -203,7 +210,4 @@ async fn test_indexer_cluster_with_grpc_streaming() { }) .await .expect("Timeout: Expected sum of user transactions to reach 5 within 5 seconds"); - - cancel.cancel(); - handle.await.unwrap(); } diff --git a/crates/sui-indexer-alt-e2e-tests/tests/jsonrpc_compound_object_filter_tests.rs b/crates/sui-indexer-alt-e2e-tests/tests/jsonrpc_compound_object_filter_tests.rs index f8aff526e34d2..3aaf0f63ab5a5 100644 --- a/crates/sui-indexer-alt-e2e-tests/tests/jsonrpc_compound_object_filter_tests.rs +++ b/crates/sui-indexer-alt-e2e-tests/tests/jsonrpc_compound_object_filter_tests.rs @@ -19,7 +19,6 @@ use sui_types::{ programmable_transaction_builder::ProgrammableTransactionBuilder, transaction::{Transaction, TransactionData}, }; -use tokio_util::sync::CancellationToken; /// 5 SUI gas budget const DEFAULT_GAS_BUDGET: u64 = 5_000_000_000; @@ -329,7 +328,6 @@ async fn setup_cluster(config: ObjectsConfig) -> FullCluster { ..Default::default() }, &prometheus::Registry::new(), - CancellationToken::new(), ) .await .expect("Failed to set-up cluster") diff --git a/crates/sui-indexer-alt-e2e-tests/tests/jsonrpc_fn_delegation_tests.rs b/crates/sui-indexer-alt-e2e-tests/tests/jsonrpc_fn_delegation_tests.rs index 0f4c061e11f7d..3589961ae5051 100644 --- a/crates/sui-indexer-alt-e2e-tests/tests/jsonrpc_fn_delegation_tests.rs +++ b/crates/sui-indexer-alt-e2e-tests/tests/jsonrpc_fn_delegation_tests.rs @@ -8,6 +8,7 @@ use anyhow::Context; use prometheus::Registry; use reqwest::Client; use serde_json::{Value, json}; +use sui_futures::service::Service; use sui_indexer_alt_jsonrpc::{ NodeArgs, RpcArgs, args::SystemPackageTaskArgs, config::RpcConfig, start_rpc, }; @@ -18,16 +19,16 @@ use sui_swarm_config::genesis_config::AccountConfig; use sui_test_transaction_builder::{make_publish_transaction, make_staking_transaction}; use sui_types::{base_types::SuiAddress, transaction::TransactionDataAPI}; use test_cluster::{TestCluster, TestClusterBuilder}; -use tokio::task::JoinHandle; -use tokio_util::sync::CancellationToken; use url::Url; struct FnDelegationTestCluster { onchain_cluster: TestCluster, rpc_url: Url, - rpc_handle: JoinHandle<()>, + /// Hold on to the service so it doesn't get dropped (and therefore aborted) until the cluster + /// goes out of scope. + #[allow(unused)] + service: Service, client: Client, - cancel: CancellationToken, } impl FnDelegationTestCluster { @@ -49,8 +50,6 @@ impl FnDelegationTestCluster { // Unwrap since we know the URL should be valid. 
let fullnode_rpc_url = Url::parse(onchain_cluster.rpc_url())?; - let cancel = CancellationToken::new(); - let rpc_listen_address = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), get_available_port()); let rpc_url = Url::parse(&format!("http://{}/", rpc_listen_address)) @@ -64,7 +63,7 @@ impl FnDelegationTestCluster { ..Default::default() }; - let rpc_handle = start_rpc( + let service = start_rpc( None, None, DbArgs::default(), @@ -76,7 +75,6 @@ impl FnDelegationTestCluster { SystemPackageTaskArgs::default(), RpcConfig::default(), ®istry, - cancel.child_token(), ) .await .expect("Failed to start JSON-RPC server"); @@ -84,9 +82,8 @@ impl FnDelegationTestCluster { Ok(Self { onchain_cluster, rpc_url, - rpc_handle, + service, client: Client::new(), - cancel, }) } @@ -163,11 +160,6 @@ impl FnDelegationTestCluster { Ok(body) } - - async fn stopped(self) { - self.cancel.cancel(); - let _ = self.rpc_handle.await; - } } #[sim_test] @@ -211,8 +203,6 @@ async fn test_execution() { assert!(response["result"]["events"].is_array()); assert!(response["result"]["objectChanges"].is_array()); assert!(response["result"]["balanceChanges"].is_array()); - - test_cluster.stopped().await; } #[sim_test] @@ -245,8 +235,6 @@ async fn test_execution_with_deprecated_mode() { response["error"]["message"], "Invalid Params: WaitForLocalExecution mode is deprecated" ); - - test_cluster.stopped().await; } #[sim_test] @@ -280,8 +268,6 @@ async fn test_execution_with_no_sigs() { .unwrap() .starts_with("missing field `signatures`") ); - - test_cluster.stopped().await; } #[sim_test] @@ -313,8 +299,6 @@ async fn test_execution_with_empty_sigs() { response["error"]["message"], "Invalid user signature: Expect 1 signer signatures but got 0" ); - - test_cluster.stopped().await; } #[sim_test] @@ -345,8 +329,6 @@ async fn test_execution_with_aborted_tx() { tracing::info!("execution rpc response is {:?}", response); assert_eq!(response["result"]["effects"]["status"]["status"], "failure"); - - test_cluster.stopped().await; } #[sim_test] @@ -368,8 +350,6 @@ async fn test_dry_run() { .unwrap(); assert_eq!(response["result"]["effects"]["status"]["status"], "success"); - - test_cluster.stopped().await; } #[sim_test] @@ -396,7 +376,6 @@ async fn test_dry_run_with_invalid_tx() { .unwrap() .starts_with("Invalid value was given to the function") ); - test_cluster.stopped().await; } #[sim_test] @@ -416,7 +395,6 @@ async fn test_get_all_balances() { // Only check that FN can return a valid response and not check the contents; // the contents is FN logic and thus should be tested on the FN side. assert_eq!(response["result"][0]["coinType"], "0x2::sui::SUI"); - test_cluster.stopped().await; } #[sim_test] @@ -442,8 +420,6 @@ async fn test_get_all_balances_with_invalid_address() { .unwrap() .contains("Deserialization failed") ); - - test_cluster.stopped().await; } #[sim_test] @@ -492,7 +468,6 @@ async fn test_get_stakes_and_by_ids() { // Two responses should match. 
assert_eq!(get_stakes_response, get_stakes_by_ids_response); - test_cluster.stopped().await; } #[sim_test] @@ -535,8 +510,6 @@ async fn test_get_stakes_invalid_params() { .unwrap() .contains("AccountAddressParseError") ); - - test_cluster.stopped().await; } #[sim_test] @@ -556,7 +529,6 @@ async fn test_get_validators_apy() { response["result"]["apys"][0]["address"], validator_address.to_string() ); - test_cluster.stopped().await; } #[sim_test] @@ -636,5 +608,4 @@ async fn test_get_balance() { .unwrap() .contains("Invalid params") ); - test_cluster.stopped().await; } diff --git a/crates/sui-indexer-alt-e2e-tests/tests/jsonrpc_name_service_tests.rs b/crates/sui-indexer-alt-e2e-tests/tests/jsonrpc_name_service_tests.rs index 7d97b776fdbb6..9b3dc38ab1c81 100644 --- a/crates/sui-indexer-alt-e2e-tests/tests/jsonrpc_name_service_tests.rs +++ b/crates/sui-indexer-alt-e2e-tests/tests/jsonrpc_name_service_tests.rs @@ -18,7 +18,6 @@ use sui_types::{ programmable_transaction_builder::ProgrammableTransactionBuilder, transaction::{ObjectArg, SharedObjectMutability, Transaction, TransactionData}, }; -use tokio_util::sync::CancellationToken; /// 5 SUI gas budget const DEFAULT_GAS_BUDGET: u64 = 5_000_000_000; @@ -102,8 +101,6 @@ async fn test_resolve_domain() { assert_resolved!(target, c.resolve_address("foo.sui").await.unwrap()); assert_resolved!(target, c.resolve_address("@foo").await.unwrap()); assert_reverse!("foo.sui", c.resolve_name(target).await.unwrap()); - - c.cluster.stopped().await; } /// If a domain name exists but has no target, we can't resolve it, but it's not an error. @@ -121,8 +118,6 @@ async fn test_resolve_domain_no_target() { let resp = c.resolve_address("foo.sui").await.unwrap(); assert!(resp["result"].is_null()); assert!(resp["error"].is_null()); - - c.cluster.stopped().await; } /// Set-up a domain with an expiry, and confirm that it exists, then advance time on-chain until it @@ -149,8 +144,6 @@ async fn test_resolve_domain_expiry() { assert_invalid_params!(c.resolve_address("foo.sui").await.unwrap()); assert_no_reverse!(c.resolve_name(target).await.unwrap()); - - c.cluster.stopped().await; } #[tokio::test] @@ -159,8 +152,6 @@ async fn test_resolve_nonexistent_domain() { c.cluster.create_checkpoint().await; assert_invalid_params!(c.resolve_address("foo.sui").await.unwrap()); - - c.cluster.stopped().await; } /// Test resolving a valid sub-domain (which requires both the sub-domain and its parent to exist @@ -185,8 +176,6 @@ async fn test_resolve_subdomain() { assert_resolved!(target, c.resolve_address("bar.foo.sui").await.unwrap()); assert_resolved!(target, c.resolve_address("bar@foo").await.unwrap()); assert_reverse!("bar.foo.sui", c.resolve_name(target).await.unwrap()); - - c.cluster.stopped().await; } /// Like the parent domain case, but a sub-domain's expiry is controlled by its parent's expiry @@ -216,8 +205,6 @@ async fn test_resolve_subdomain_parent_expiry() { assert_invalid_params!(c.resolve_address("bar.foo.sui").await.unwrap()); assert_no_reverse!(c.resolve_name(target).await.unwrap()); - - c.cluster.stopped().await; } /// A sub-domain that has its own expiry, in addition to (and before) the parent's expiry. 
@@ -249,8 +236,6 @@ async fn test_resolve_subdomain_expiry() { assert_invalid_params!(c.resolve_address("bar.foo.sui").await.unwrap()); assert_no_reverse!(c.resolve_name(target).await.unwrap()); - - c.cluster.stopped().await; } /// A sub-domain where the parent domain's NFT is different from the sub-domain's NFT, is @@ -277,8 +262,6 @@ async fn test_resolve_subdomain_bad_parent() { assert_invalid_params!(c.resolve_address("bar.foo.sui").await.unwrap()); assert_no_reverse!(c.resolve_name(target).await.unwrap()); - - c.cluster.stopped().await; } /// The parent domain record does not exist, so the sub-domain is considered expired. @@ -297,8 +280,6 @@ async fn test_resolve_subdomain_no_parent() { assert_invalid_params!(c.resolve_address("bar.foo.sui").await.unwrap()); assert_no_reverse!(c.resolve_name(target).await.unwrap()); - - c.cluster.stopped().await; } struct SuiNSCluster { @@ -437,7 +418,6 @@ impl SuiNSCluster { ..Default::default() }, &prometheus::Registry::new(), - CancellationToken::new(), ) .await .expect("Failed to set-up cluster"); diff --git a/crates/sui-indexer-alt-e2e-tests/tests/jsonrpc_transaction_pruning_tests.rs b/crates/sui-indexer-alt-e2e-tests/tests/jsonrpc_transaction_pruning_tests.rs index f6ba07e833b56..68dec2dba95b4 100644 --- a/crates/sui-indexer-alt-e2e-tests/tests/jsonrpc_transaction_pruning_tests.rs +++ b/crates/sui-indexer-alt-e2e-tests/tests/jsonrpc_transaction_pruning_tests.rs @@ -20,7 +20,6 @@ use sui_types::{ programmable_transaction_builder::ProgrammableTransactionBuilder, transaction::{Transaction, TransactionData}, }; -use tokio_util::sync::CancellationToken; /// 5 SUI gas budget const DEFAULT_GAS_BUDGET: u64 = 5_000_000_000; @@ -217,7 +216,6 @@ async fn cluster_with_pipelines(pipeline: PipelineLayer) -> FullCluster { ..Default::default() }, &prometheus::Registry::new(), - CancellationToken::new(), ) .await .expect("Failed to create cluster") diff --git a/crates/sui-indexer-alt-e2e-tests/tests/jsonrpc_type_limit_tests.rs b/crates/sui-indexer-alt-e2e-tests/tests/jsonrpc_type_limit_tests.rs index f10b15890e4db..d9120b776c7e6 100644 --- a/crates/sui-indexer-alt-e2e-tests/tests/jsonrpc_type_limit_tests.rs +++ b/crates/sui-indexer-alt-e2e-tests/tests/jsonrpc_type_limit_tests.rs @@ -18,7 +18,6 @@ use sui_types::{ programmable_transaction_builder::ProgrammableTransactionBuilder, transaction::{Transaction, TransactionData}, }; -use tokio_util::sync::CancellationToken; /// 5 SUI gas budget const DEFAULT_GAS_BUDGET: u64 = 5_000_000_000; @@ -36,8 +35,6 @@ async fn test_within_limits() { result["result"]["data"]["content"].is_object(), "Result: {result:#?}" ); - - c.cluster.stopped().await; } /// If we set a limit on how deeply nested some type arguments can be, then trying to fetch an @@ -59,7 +56,6 @@ async fn test_type_argument_depth() { }; assert!(err.contains("Type parameter nesting exceeded"), "{err}"); - c.cluster.stopped().await; } /// There is also a limit on how many type parameters a single type can have. @@ -80,7 +76,6 @@ async fn test_type_argument_width() { }; assert!(err.contains("Expected at most 3 type parameters"), "{err}"); - c.cluster.stopped().await; } /// This limit controls the number of types that need to be loaded to resolve the layout of a type. @@ -104,8 +99,6 @@ async fn test_type_nodes() { err.contains("More than 3 struct definitions required to resolve type"), "{err}" ); - - c.cluster.stopped().await; } /// This limit controls the depth of the resulting value layout. 
@@ -129,8 +122,6 @@ async fn test_value_depth() { err.contains("Type layout nesting exceeded limit of 3"), "{err}" ); - - c.cluster.stopped().await; } struct TypeLimitCluster { @@ -165,7 +156,6 @@ impl TypeLimitCluster { ..Default::default() }, &prometheus::Registry::new(), - CancellationToken::new(), ) .await .expect("Failed to set-up cluster"); diff --git a/crates/sui-indexer-alt-e2e-tests/tests/transactional_tests.rs b/crates/sui-indexer-alt-e2e-tests/tests/transactional_tests.rs index 5118b0c1b5308..05d11bfccf0a0 100644 --- a/crates/sui-indexer-alt-e2e-tests/tests/transactional_tests.rs +++ b/crates/sui-indexer-alt-e2e-tests/tests/transactional_tests.rs @@ -24,7 +24,6 @@ use sui_transactional_test_runner::{ test_adapter::{OffChainConfig, PRE_COMPILED, SuiTestAdapter}, }; use tokio::join; -use tokio_util::sync::CancellationToken; struct OffchainReader { cluster: Arc, @@ -172,7 +171,6 @@ async fn cluster(config: &OffChainConfig) -> Arc { ..Default::default() }, &prometheus::Registry::new(), - CancellationToken::new(), ) .await .expect("Failed to create off-chain cluster"), @@ -198,12 +196,5 @@ async fn run_test(path: &Path) -> Result<(), Box> { // run the tasks in the test run_tasks_with_adapter(path, adapter, output, None).await?; - - // clean-up the off-chain cluster - Arc::try_unwrap(c) - .unwrap_or_else(|_| panic!("Failed to unwrap off-chain cluster")) - .stopped() - .await; - Ok(()) } diff --git a/crates/sui-indexer-alt-framework/Cargo.toml b/crates/sui-indexer-alt-framework/Cargo.toml index c1d4f93c36c3f..6caee7dc85698 100644 --- a/crates/sui-indexer-alt-framework/Cargo.toml +++ b/crates/sui-indexer-alt-framework/Cargo.toml @@ -12,6 +12,7 @@ async-trait.workspace = true axum.workspace = true backoff.workspace = true bb8 = "0.8.5" +bytes.workspace = true chrono.workspace = true clap = { workspace = true, features = ["env"] } diesel = { workspace = true, features = ["chrono"] } @@ -27,7 +28,6 @@ tempfile.workspace = true thiserror.workspace = true tokio.workspace = true tokio-stream.workspace = true -tokio-util.workspace = true tonic.workspace = true tracing.workspace = true tracing-subscriber = { workspace = true, optional = true } diff --git a/crates/sui-indexer-alt-framework/src/cluster.rs b/crates/sui-indexer-alt-framework/src/cluster.rs index 153b318d04fc6..b2a7f58ba8ba8 100644 --- a/crates/sui-indexer-alt-framework/src/cluster.rs +++ b/crates/sui-indexer-alt-framework/src/cluster.rs @@ -9,10 +9,8 @@ use std::{ use anyhow::Context; use diesel_migrations::EmbeddedMigrations; use prometheus::Registry; +use sui_futures::service::Service; use sui_indexer_alt_metrics::{MetricsArgs, MetricsService}; -use tokio::{signal, task::JoinHandle}; -use tokio_util::sync::CancellationToken; -use tracing::info; use url::Url; use crate::{ @@ -46,9 +44,6 @@ pub struct Args { pub struct IndexerCluster { indexer: Indexer, metrics: MetricsService, - - /// Cancelling this token signals cancellation to both the indexer and metrics service. 
-    cancel: CancellationToken,
 }

 /// Builder for creating an IndexerCluster with a fluent API
@@ -152,9 +147,8 @@ impl IndexerClusterBuilder {

         tracing_subscriber::fmt::init();

-        let cancel = CancellationToken::new();
         let registry = Registry::new();
-        let metrics = MetricsService::new(self.args.metrics_args, registry, cancel.child_token());
+        let metrics = MetricsService::new(self.args.metrics_args, registry);

         let client_args = self.args.client_args.context("client_args is required")?;
         let indexer = Indexer::new_from_pg(
@@ -166,15 +160,10 @@
             self.migrations,
             self.metrics_prefix.as_deref(),
             metrics.registry(),
-            cancel.child_token(),
         )
         .await?;

-        Ok(IndexerCluster {
-            indexer,
-            metrics,
-            cancel,
-        })
+        Ok(IndexerCluster { indexer, metrics })
     }
 }

@@ -196,38 +185,14 @@ impl IndexerCluster {
         self.indexer.ingestion_metrics()
     }

-    /// This token controls stopping the indexer and metrics service. Clone it before calling
-    /// [Self::run] to retain the ability to stop the service after it has started.
-    pub fn cancel(&self) -> &CancellationToken {
-        &self.cancel
-    }
-
-    /// Starts the indexer and metrics service, returning a handle to `await` the service's exit.
+    /// Starts the indexer and metrics service, returning a handle over the service's tasks.
     /// The service will exit when the indexer has finished processing all the checkpoints it was
-    /// configured to process, or when it receives an interrupt signal.
-    pub async fn run(self) -> Result> {
-        let h_ctrl_c = tokio::spawn({
-            let cancel = self.cancel.clone();
-            async move {
-                tokio::select! {
-                    _ = cancel.cancelled() => {}
-                    _ = signal::ctrl_c() => {
-                        info!("Received Ctrl-C, shutting down...");
-                        cancel.cancel();
-                    }
-                }
-            }
-        });
-
-        let h_metrics = self.metrics.run().await?;
-        let h_indexer = self.indexer.run().await?;
+    /// configured to process, or when it is instructed to shut down.
+    pub async fn run(self) -> Result {
+        let s_indexer = self.indexer.run().await?;
+        let s_metrics = self.metrics.run().await?;

-        Ok(tokio::spawn(async move {
-            let _ = h_indexer.await;
-            self.cancel.cancel();
-            let _ = h_metrics.await;
-            let _ = h_ctrl_c.await;
-        }))
+        Ok(s_indexer.attach(s_metrics))
     }
 }

@@ -384,7 +349,7 @@ mod tests {

         // Run the indexer until it signals completion. We have configured it to stop after
         // ingesting 10 checkpoints, so it should shut itself down.
-        indexer.run().await.unwrap().await.unwrap();
+        indexer.run().await.unwrap().join().await.unwrap();

         // Check that the results were all written out.
         {
diff --git a/crates/sui-indexer-alt-framework/src/ingestion/broadcaster.rs b/crates/sui-indexer-alt-framework/src/ingestion/broadcaster.rs
index cd1afa2e5f037..912cf0d30d4ef 100644
--- a/crates/sui-indexer-alt-framework/src/ingestion/broadcaster.rs
+++ b/crates/sui-indexer-alt-framework/src/ingestion/broadcaster.rs
@@ -1,16 +1,17 @@
 // Copyright (c) Mysten Labs, Inc.
 // SPDX-License-Identifier: Apache-2.0

-use std::{collections::HashMap, pin::Pin, sync::Arc, time::Duration};
+use std::{collections::HashMap, marker::Unpin, pin::Pin, sync::Arc, time::Duration};

+use anyhow::{Context, anyhow};
 use futures::{Stream, StreamExt, future::try_join_all, stream};
-use sui_futures::stream::TrySpawnStreamExt;
-use tokio::{
-    sync::{mpsc, watch},
-    task::JoinHandle,
+use sui_futures::{
+    service::Service,
+    stream::{Break, TrySpawnStreamExt},
+    task::TaskGuard,
 };
-use tokio_util::sync::CancellationToken;
-use tracing::{debug, error, info, warn};
+use tokio::sync::{mpsc, watch};
+use tracing::{debug, info, warn};

 use crate::{
     ingestion::{error::Error, streaming_client::CheckpointStreamingClient},
@@ -34,7 +35,7 @@ use super::{IngestionConfig, ingestion_client::IngestionClient};
 /// 4. Both the ingest_and_broadcast_range and stream_and_broadcast_range tasks wait on the watch
 ///    channel when they hit the ingest_hi limit.
 ///
-/// The task will shut down if the `cancel` token is signalled, or if the `checkpoints` range completes.
+/// The task will shut down if the `checkpoints` range completes.
 pub(super) fn broadcaster(
     checkpoints: R,
     initial_commit_hi: Option,
@@ -44,13 +45,12 @@
     mut commit_hi_rx: mpsc::UnboundedReceiver<(&'static str, u64)>,
     subscribers: Vec>>,
     metrics: Arc,
-    cancel: CancellationToken,
-) -> JoinHandle<()>
+) -> Service
 where
     R: std::ops::RangeBounds + Send + 'static,
     S: CheckpointStreamingClient + Send + 'static,
 {
-    tokio::spawn(async move {
+    Service::new().spawn_aborting(async move {
         info!("Starting broadcaster");

         // Extract start and end from the range bounds
@@ -98,7 +98,7 @@ where
             // latest checkpoint we get from a success streaming connection.
             // The ingestion task fill up the gap from checkpoint_hi to ingestion_end (exclusive) while the streaming
             // task covers from ingestion_end to end_cp.
-            let (streaming_handle, ingestion_end) = setup_streaming_task(
+            let (stream_guard, ingestion_end) = setup_streaming_task(
                 &mut streaming_client,
                 checkpoint_hi,
                 end_cp,
@@ -107,13 +107,12 @@
                 &config,
                 &client,
                 &subscribers,
                 &ingest_hi_watch_rx,
                 &metrics,
-                &cancel,
             )
             .await;

             // Spawn a broadcaster task for this range.
             // It will exit when the range is complete or if it is cancelled.
-            let ingestion_handle = tokio::spawn(ingest_and_broadcast_range(
+            let ingest_guard = ingest_and_broadcast_range(
                 checkpoint_hi,
                 ingestion_end,
                 config.retry_interval(),
@@ -121,18 +120,12 @@
                 ingest_concurrency,
                 ingest_hi_watch_rx.clone(),
                 client.clone(),
                 subscribers.clone(),
-                cancel.clone(),
-            ));
+            );

-            let mut join_future = futures::future::join(streaming_handle, ingestion_handle);
+            let mut ingest_and_broadcast = futures::future::join(stream_guard, ingest_guard);

             loop {
                 tokio::select! {
-                    _ = cancel.cancelled() => {
-                        info!("Shutdown received, stopping ingestion");
-                        break 'outer;
-                    }
-
                     // Subscriber watermark update
                     // docs::#regulator (see docs/content/guides/developer/advanced/custom-indexer.mdx)
                     Some((name, hi)) = commit_hi_rx.recv() => {
@@ -147,31 +140,30 @@ where
                     // docs::/#regulator

                     // Handle both streaming and ingestion completion
-                    (streaming_result, ingestion_result) = &mut join_future => {
-                        // Check ingestion result, cancel on any error
-                        match ingestion_result {
-                            Ok(Ok(())) => {} // Success, continue
-                            Ok(Err(e)) => {
-                                error!("Ingestion task failed: {}", e);
-                                cancel.cancel();
+                    (streaming_result, ingestion_result) = &mut ingest_and_broadcast => {
+                        // Check ingestion result, exit on any error.
+                        match ingestion_result
+                            .context("Ingestion task panicked, stopping broadcaster")?
+                        {
+                            Ok(()) => {},
+
+
+                            // Ingestion stopped because one of its channels was closed. The
+                            // overall broadcaster should also shutdown.
+                            Err(Break::Break) => {
                                 break 'outer;
                             }
-                            Err(e) => {
-                                error!("Ingestion task panicked: {}", e);
-                                cancel.cancel();
-                                break 'outer;
+
+                            // Ingestion failed with an error of some kind, surface this as an
+                            // overall error from the broadcaster.
+                            Err(Break::Err(e)) => {
+                                return Err(anyhow!(e).context("Ingestion task failed, stopping broadcaster"));
                             }
                         }

-                        // Update checkpoint_hi from streaming, or cancel on error
-                        checkpoint_hi = match streaming_result {
-                            Ok(w) => w,
-                            Err(e) => {
-                                error!("Streaming task panicked: {}", e);
-                                cancel.cancel();
-                                break 'outer;
-                            }
-                        };
+                        // Update checkpoint_hi from streaming, or shutdown on error
+                        checkpoint_hi = streaming_result
+                            .context("Streaming task panicked, stopping broadcaster")?;

                         info!(checkpoint_hi, "Both tasks completed, moving on to next range");
                         break;
@@ -180,15 +172,15 @@
             }
         }

-        info!("Broadcaster finished");
+        info!("Checkpoints done, stopping broadcaster");
+        Ok(())
     })
 }

-/// Fetch and broadcasts checkpoints from a range [start..end) to subscribers.
-/// This task is ingest_hi-aware and will wait if it encounters a checkpoint
-/// beyond the current ingest_hi, resuming when ingest_hi advances to currently
-/// ingesting checkpoints.
-async fn ingest_and_broadcast_range(
+/// Fetch and broadcasts checkpoints from a range [start..end) to subscribers. This task is
+/// ingest_hi-aware and will wait if it encounters a checkpoint beyond the current ingest_hi,
+/// resuming when ingest_hi advances to currently ingesting checkpoints.
+fn ingest_and_broadcast_range(
     start: u64,
     end: u64,
     retry_interval: Duration,
@@ -196,60 +188,45 @@
     ingest_hi_rx: watch::Receiver>,
     client: IngestionClient,
     subscribers: Arc>>>,
-    cancel: CancellationToken,
-) -> Result<(), Error> {
-    stream::iter(start..end)
-        .try_for_each_spawned(ingest_concurrency, |cp| {
-            let mut ingest_hi_rx = ingest_hi_rx.clone();
-            let client = client.clone();
-            let subscribers = subscribers.clone();
-
-            // One clone is for the supervisor to signal a cancel if it detects a
-            // subscriber that wants to wind down ingestion, and the other is to pass to
-            // each worker to detect cancellation.
-            // let supervisor_cancel = cancel.clone();
-            let cancel = cancel.clone();
-
-            async move {
-                // docs::#bound (see docs/content/guides/developer/advanced/custom-indexer.mdx)
-                // Wait until ingest_hi allows processing this checkpoint.
-                // None means no backpressure limit. If we get Some(hi) we wait until cp < hi.
-                // wait_for only errors if the sender is dropped (main broadcaster shut down) so
-                // we treat an error returned here as cancellation too.
-                if tokio::select! {
-                    result = ingest_hi_rx.wait_for(|hi| hi.is_none_or(|h| cp < h)) => result.is_err(),
-                    _ = cancel.cancelled() => true
-                } {
-                    return Err(Error::Cancelled);
-                }
-                // docs::/#bound
-
-                // Fetch the checkpoint or stop if cancelled.
-                let checkpoint = tokio::select!
{ - cp = client.wait_for(cp, retry_interval) => cp?, - _ = cancel.cancelled() => { - return Err(Error::Cancelled); +) -> TaskGuard>> { + TaskGuard::new(tokio::spawn(async move { + stream::iter(start..end) + .try_for_each_spawned(ingest_concurrency, |cp| { + let mut ingest_hi_rx = ingest_hi_rx.clone(); + let client = client.clone(); + let subscribers = subscribers.clone(); + + async move { + // docs::#bound (see docs/content/guides/developer/advanced/custom-indexer.mdx) + // Wait until ingest_hi allows processing this checkpoint. + // None means no backpressure limit. If we get Some(hi) we wait until cp < hi. + // wait_for only errors if the sender is dropped (main broadcaster shut down) so + // we treat an error returned here as a shutdown signal. + if ingest_hi_rx + .wait_for(|hi| hi.is_none_or(|hi| cp < hi)) + .await + .is_err() + { + return Err(Break::Break); + } + // docs::/#bound + + // Fetch the checkpoint or stop if cancelled. + let checkpoint = client.wait_for(cp, retry_interval).await?; + + // Send checkpoint to all subscribers. + if send_checkpoint(checkpoint, &subscribers).await.is_ok() { + debug!(checkpoint = cp, "Broadcasted checkpoint"); + Ok(()) + } else { + // An error is returned meaning some subscriber channel has closed, which + // we consider a shutdown signal for ingestion. + Err(Break::Break) } - }; - - // Send checkpoint to all subscribers. - tokio::select! { - result = send_checkpoint(checkpoint, &subscribers) => { - if result.is_ok() { - debug!(checkpoint = cp, "Broadcasted checkpoint"); - Ok(()) - } else { - // An error is returned meaning some subscriber channel has closed, - // which we consider a cancellation signal for the entire ingestion. - cancel.cancel(); - Err(Error::Cancelled) - } - }, - _ = cancel.cancelled() => Err(Error::Cancelled), } - } - }) - .await + }) + .await + })) } /// Sets up either a noop or real streaming task based on network state and proximity to @@ -264,8 +241,7 @@ async fn setup_streaming_task( subscribers: &Arc>>>, ingest_hi_watch_rx: &watch::Receiver>, metrics: &Arc, - cancel: &CancellationToken, -) -> (JoinHandle, u64) +) -> (TaskGuard, u64) where S: CheckpointStreamingClient, { @@ -277,7 +253,7 @@ where let backoff_batch_size = *streaming_backoff_batch_size; // Convenient closure to handle streaming fallback logic due to connection or peek failure. - let mut handle_streaming_fallback = |reason: &str| { + let mut fallback = |reason: &str| { let ingestion_end = (checkpoint_hi + backoff_batch_size).min(end_cp); warn!( checkpoint_hi, @@ -290,22 +266,19 @@ where }; let Ok(stream) = streaming_client.connect().await else { - return handle_streaming_fallback("Streaming connection failed"); + return fallback("Streaming connection failed"); }; let mut peekable_stream = Box::pin(stream).peekable(); - let Some(Ok(peeked_checkpoint)) = Pin::new(&mut peekable_stream).peek().await else { - return handle_streaming_fallback("Failed to peek latest checkpoint"); + return fallback("Failed to peek latest checkpoint"); }; // We have successfully connected and peeked, reset backoff batch size. 
*streaming_backoff_batch_size = config.streaming_backoff_initial_batch_size as u64; let network_latest_cp = *peeked_checkpoint.summary.sequence_number(); - let ingestion_end = network_latest_cp.min(end_cp); - if network_latest_cp > checkpoint_hi + config.checkpoint_buffer_size as u64 { info!( network_latest_cp, @@ -319,87 +292,83 @@ where checkpoint_hi, "Within buffer size, starting streaming" ); - let streaming_handle = tokio::spawn(stream_and_broadcast_range( + let stream_guard = TaskGuard::new(tokio::spawn(stream_and_broadcast_range( network_latest_cp.max(checkpoint_hi), end_cp, peekable_stream, subscribers.clone(), ingest_hi_watch_rx.clone(), metrics.clone(), - cancel.clone(), - )); + ))); - (streaming_handle, ingestion_end) + (stream_guard, ingestion_end) } -/// Streams and broadcasts checkpoints from a range [start, end) to subscribers. -/// This task is ingest_hi-aware, for each checkpoint this task will wait until -/// `checkpoint_hi < ingest_hi` before advancing to the next checkpoint. -/// If we encounter any streaming error or out-of-order checkpoint greater than -/// the current checkpoint_hi, we stop streaming and return checkpoint_hi so that -/// the main loop can reconnect and fill in the gap using ingestion. +/// Streams and broadcasts checkpoints from a range [start, end) to subscribers. This task is +/// ingest_hi-aware, for each checkpoint this task will wait until `checkpoint_hi < ingest_hi` +/// before advancing to the next checkpoint. If we encounter any streaming error or out-of-order +/// checkpoint greater than the current checkpoint_hi, we stop streaming and return checkpoint_hi +/// so that the main loop can reconnect and fill in the gap using ingestion. async fn stream_and_broadcast_range( mut lo: u64, hi: u64, - mut stream: impl Stream> + std::marker::Unpin, + mut stream: impl Stream> + Unpin, subscribers: Arc>>>, mut ingest_hi_rx: watch::Receiver>, metrics: Arc, - cancel: CancellationToken, ) -> u64 { while lo < hi { - tokio::select! { - _ = cancel.cancelled() => { - info!(lo, "Shutdown received, stopping streaming"); + let Some(item) = stream.next().await else { + warn!(lo, "Streaming ended unexpectedly"); + break; + }; + + let checkpoint = match item { + Ok(checkpoint) => checkpoint, + Err(e) => { + warn!(lo, "Streaming error: {e}"); break; } - item = stream.next() => { - match item { - Some(Ok(checkpoint)) => { - let sequence_number = *checkpoint.summary.sequence_number(); - - if sequence_number < lo { - debug!(checkpoint = sequence_number, lo, "Skipping already processed checkpoint"); - continue; - } + }; - if sequence_number > lo { - warn!(checkpoint = sequence_number, lo, "Out-of-order checkpoint"); - // Return to main loop to fill up the gap. - break; - } + let sequence_number = *checkpoint.summary.sequence_number(); - assert_eq!(sequence_number, lo); + if sequence_number < lo { + debug!( + checkpoint = sequence_number, + lo, "Skipping already processed checkpoint" + ); + continue; + } - // Wait until ingest_hi allows processing this checkpoint. - if tokio::select! { - result = ingest_hi_rx.wait_for(|hi| hi.is_none_or(|h| lo < h)) => result.is_err(), - _ = cancel.cancelled() => true, - } { - break; - } + if sequence_number > lo { + warn!(checkpoint = sequence_number, lo, "Out-of-order checkpoint"); + // Return to main loop to fill up the gap. + break; + } - // Send checkpoint to all subscribers and break on any error. 
- if send_checkpoint(Arc::new(checkpoint), &subscribers).await.is_err() { - break; - } + assert_eq!(sequence_number, lo); + if ingest_hi_rx + .wait_for(|hi| hi.is_none_or(|hi| lo < hi)) + .await + .is_err() + { + // Channel closed, treat as cancellation to avoid letting a checkpoint slip through as + // the indexer winds down. + break; + } - info!(checkpoint = lo, "Streamed checkpoint"); - metrics.total_streamed_checkpoints.inc(); - metrics.latest_streamed_checkpoint.set(lo as i64); - lo += 1; - } - Some(Err(e)) => { - warn!(lo, "Streaming error: {}", e); - break; - } - None => { - warn!(lo, "Streaming ended unexpectedly"); - break; - } - } - } + if send_checkpoint(Arc::new(checkpoint), &subscribers) + .await + .is_err() + { + break; } + + debug!(checkpoint = lo, "Streamed checkpoint"); + metrics.total_streamed_checkpoints.inc(); + metrics.latest_streamed_checkpoint.set(lo as i64); + lo += 1; } // We exit the loop either due to cancellation, error or completion of the range, @@ -420,8 +389,8 @@ async fn send_checkpoint( // A noop streaming task that just returns the provided checkpoint_hi, used to simplify // join logic when streaming is not used. -fn noop_streaming_task(checkpoint_hi: u64) -> JoinHandle { - tokio::spawn(async move { checkpoint_hi }) +fn noop_streaming_task(checkpoint_hi: u64) -> TaskGuard { + TaskGuard::new(tokio::spawn(async move { checkpoint_hi })) } #[cfg(test)] @@ -512,11 +481,10 @@ mod tests { async fn finite_list_of_checkpoints() { let (_, hi_rx) = mpsc::unbounded_channel(); let (subscriber_tx, mut subscriber_rx) = mpsc::channel(1); - let cancel = CancellationToken::new(); let cps = 0..5; let metrics = test_ingestion_metrics(); - let h_broadcaster = broadcaster::<_, MockStreamingClient>( + let mut svc = broadcaster::<_, MockStreamingClient>( cps, None, None, @@ -525,7 +493,6 @@ mod tests { hi_rx, vec![subscriber_tx], metrics, - cancel.clone(), ); assert_eq!( @@ -533,18 +500,16 @@ mod tests { BTreeSet::from_iter(0..5) ); - cancel.cancel(); - h_broadcaster.await.unwrap(); + svc.join().await.unwrap(); } #[tokio::test] async fn shutdown_on_sender_closed() { let (_, hi_rx) = mpsc::unbounded_channel(); let (subscriber_tx, mut subscriber_rx) = mpsc::channel(1); - let cancel = CancellationToken::new(); let metrics = test_ingestion_metrics(); - let h_broadcaster = broadcaster::<_, MockStreamingClient>( + let mut svc = broadcaster::<_, MockStreamingClient>( 0.., None, None, @@ -553,7 +518,6 @@ mod tests { hi_rx, vec![subscriber_tx], metrics, - cancel.clone(), ); assert_eq!( @@ -562,17 +526,16 @@ mod tests { ); drop(subscriber_rx); - h_broadcaster.await.unwrap(); + svc.join().await.unwrap(); } #[tokio::test] - async fn shutdown_on_cancel() { + async fn shutdown() { let (_, hi_rx) = mpsc::unbounded_channel(); let (subscriber_tx, mut subscriber_rx) = mpsc::channel(1); - let cancel = CancellationToken::new(); let metrics = test_ingestion_metrics(); - let h_broadcaster = broadcaster::<_, MockStreamingClient>( + let svc = broadcaster::<_, MockStreamingClient>( 0.., None, None, @@ -581,7 +544,6 @@ mod tests { hi_rx, vec![subscriber_tx], metrics, - cancel.clone(), ); assert_eq!( @@ -589,21 +551,19 @@ mod tests { BTreeSet::from_iter(0..5) ); - cancel.cancel(); - h_broadcaster.await.unwrap(); + svc.shutdown().await.unwrap(); } #[tokio::test] async fn halted() { let (_, hi_rx) = mpsc::unbounded_channel(); let (subscriber_tx, mut subscriber_rx) = mpsc::channel(1); - let cancel = CancellationToken::new(); let mut config = test_config(); config.checkpoint_buffer_size = 0; // No buffer let 
metrics = test_ingestion_metrics(); - let h_broadcaster = broadcaster::<_, MockStreamingClient>( + let _svc = broadcaster::<_, MockStreamingClient>( 0.., Some(4), None, @@ -612,7 +572,6 @@ mod tests { hi_rx, vec![subscriber_tx], metrics, - cancel.clone(), ); assert_eq!( @@ -622,22 +581,18 @@ mod tests { // Regulator stopped because of watermark. expect_timeout(&mut subscriber_rx).await; - - cancel.cancel(); - h_broadcaster.await.unwrap(); } #[tokio::test] async fn halted_buffered() { let (_, hi_rx) = mpsc::unbounded_channel(); let (subscriber_tx, mut subscriber_rx) = mpsc::channel(1); - let cancel = CancellationToken::new(); let mut config = test_config(); config.checkpoint_buffer_size = 2; // Buffer of 2 let metrics = test_ingestion_metrics(); - let h_broadcaster = broadcaster::<_, MockStreamingClient>( + let _svc = broadcaster::<_, MockStreamingClient>( 0.., Some(2), None, @@ -646,7 +601,6 @@ mod tests { hi_rx, vec![subscriber_tx], metrics, - cancel.clone(), ); assert_eq!( @@ -656,22 +610,18 @@ mod tests { // Regulator stopped because of watermark (plus buffering). expect_timeout(&mut subscriber_rx).await; - - cancel.cancel(); - h_broadcaster.await.unwrap(); } #[tokio::test] async fn resumption() { let (hi_tx, hi_rx) = mpsc::unbounded_channel(); let (subscriber_tx, mut subscriber_rx) = mpsc::channel(1); - let cancel = CancellationToken::new(); let mut config = test_config(); config.checkpoint_buffer_size = 0; // No buffer let metrics = test_ingestion_metrics(); - let h_broadcaster = broadcaster::<_, MockStreamingClient>( + let _svc = broadcaster::<_, MockStreamingClient>( 0.., Some(2), None, @@ -680,7 +630,6 @@ mod tests { hi_rx, vec![subscriber_tx], metrics, - cancel.clone(), ); assert_eq!( @@ -699,16 +648,12 @@ mod tests { // Halted again. expect_timeout(&mut subscriber_rx).await; - - cancel.cancel(); - h_broadcaster.await.unwrap(); } #[tokio::test] async fn multiple_subscribers() { let (hi_tx, hi_rx) = mpsc::unbounded_channel(); let (subscriber_tx, mut subscriber_rx) = mpsc::channel(1); - let cancel = CancellationToken::new(); hi_tx.send(("a", 2)).unwrap(); hi_tx.send(("b", 3)).unwrap(); @@ -718,7 +663,7 @@ mod tests { let cps = 0..10; let metrics = test_ingestion_metrics(); - let h_broadcaster = broadcaster::<_, MockStreamingClient>( + let _svc = broadcaster::<_, MockStreamingClient>( cps, Some(2), None, @@ -727,7 +672,6 @@ mod tests { hi_rx, vec![subscriber_tx], metrics, - cancel.clone(), ); assert_eq!( @@ -758,9 +702,6 @@ mod tests { // But another update to "a" will now not make a difference, because "b" is still behind. 
hi_tx.send(("a", 5)).unwrap(); expect_timeout(&mut subscriber_rx).await; - - cancel.cancel(); - h_broadcaster.await.unwrap(); } #[tokio::test] @@ -768,10 +709,9 @@ mod tests { let (_, hi_rx) = mpsc::unbounded_channel(); let (subscriber_tx1, mut subscriber_rx1) = mpsc::channel(1); let (subscriber_tx2, mut subscriber_rx2) = mpsc::channel(1); - let cancel = CancellationToken::new(); let metrics = test_ingestion_metrics(); - let h_broadcaster = broadcaster::<_, MockStreamingClient>( + let mut svc = broadcaster::<_, MockStreamingClient>( 0.., None, None, @@ -780,7 +720,6 @@ mod tests { hi_rx, vec![subscriber_tx1, subscriber_tx2], metrics, - cancel.clone(), ); // Both subscribers should receive checkpoints @@ -797,14 +736,13 @@ mod tests { drop(subscriber_rx1); // The broadcaster should shut down gracefully - h_broadcaster.await.unwrap(); + svc.join().await.unwrap(); } #[tokio::test] async fn start_from_non_zero() { let (hi_tx, hi_rx) = mpsc::unbounded_channel(); let (subscriber_tx, mut subscriber_rx) = mpsc::channel(1); - let cancel = CancellationToken::new(); // Set watermark before starting hi_tx.send(("test", 1005)).unwrap(); @@ -813,7 +751,7 @@ mod tests { config.checkpoint_buffer_size = 0; // No buffer let metrics = test_ingestion_metrics(); - let h_broadcaster = broadcaster::<_, MockStreamingClient>( + let mut svc = broadcaster::<_, MockStreamingClient>( 1000..1010, Some(1005), None, @@ -822,7 +760,6 @@ mod tests { hi_rx, vec![subscriber_tx], metrics, - cancel.clone(), ); // Should receive checkpoints starting from 1000 @@ -842,8 +779,7 @@ mod tests { BTreeSet::from_iter(1005..1010) ); - cancel.cancel(); - h_broadcaster.await.unwrap(); + svc.join().await.unwrap(); } // =============== Streaming Tests ================== @@ -854,13 +790,12 @@ mod tests { async fn streaming_only() { let (_, hi_rx) = mpsc::unbounded_channel(); let (subscriber_tx, mut subscriber_rx) = mpsc::channel(10); - let cancel = CancellationToken::new(); // Create a mock streaming service with checkpoints 0..5 let streaming_client = MockStreamingClient::new(0..5); let metrics = test_ingestion_metrics(); - let h_broadcaster = broadcaster( + let mut svc = broadcaster( 0..5, // Bounded range None, Some(streaming_client), @@ -869,7 +804,6 @@ mod tests { hi_rx, vec![subscriber_tx], metrics.clone(), - cancel.clone(), ); // Should receive all checkpoints from the stream in order @@ -880,22 +814,20 @@ mod tests { assert_eq!(metrics.total_ingested_checkpoints.get(), 0); assert_eq!(metrics.latest_streamed_checkpoint.get(), 4); - cancel.cancel(); - h_broadcaster.await.unwrap(); + svc.join().await.unwrap(); } #[tokio::test] async fn streaming_with_transition() { let (_, hi_rx) = mpsc::unbounded_channel(); let (subscriber_tx, mut subscriber_rx) = mpsc::channel(100); - let cancel = CancellationToken::new(); // Create a mock streaming service that starts at checkpoint 50 // This simulates streaming being ahead of ingestion let streaming_client = MockStreamingClient::new(49..60); let metrics = test_ingestion_metrics(); - let h_broadcaster = broadcaster( + let mut svc = broadcaster( 0..60, None, Some(streaming_client), @@ -904,7 +836,6 @@ mod tests { hi_rx, vec![subscriber_tx], metrics.clone(), - cancel.clone(), ); assert_eq!( @@ -917,8 +848,7 @@ mod tests { assert_eq!(metrics.total_streamed_checkpoints.get(), 10); // [50..60) assert_eq!(metrics.latest_streamed_checkpoint.get(), 59); - cancel.cancel(); - h_broadcaster.await.unwrap(); + svc.join().await.unwrap(); } // =============== Part 2: Edge Cases ================== @@ -928,13 
+858,12 @@ mod tests { // Test scenario where streaming service starts beyond the requested end checkpoint. let (_, hi_rx) = mpsc::unbounded_channel(); let (subscriber_tx, mut subscriber_rx) = mpsc::channel(30); - let cancel = CancellationToken::new(); // Streaming starts at checkpoint 100, but we only want 0..30. let streaming_client = MockStreamingClient::new(100..110); let metrics = test_ingestion_metrics(); - let h_broadcaster = broadcaster( + let mut svc = broadcaster( 0..30, None, Some(streaming_client), @@ -943,7 +872,6 @@ mod tests { hi_rx, vec![subscriber_tx], metrics.clone(), - cancel.clone(), ); // Should use only ingestion since streaming is beyond end_cp @@ -956,8 +884,7 @@ mod tests { assert_eq!(metrics.total_streamed_checkpoints.get(), 0); assert_eq!(metrics.total_ingested_checkpoints.get(), 30); - cancel.cancel(); - h_broadcaster.await.unwrap(); + svc.join().await.unwrap(); } #[tokio::test] @@ -965,13 +892,12 @@ mod tests { // Test scenario where streaming starts before the requested start checkpoint. let (_, hi_rx) = mpsc::unbounded_channel(); let (subscriber_tx, mut subscriber_rx) = mpsc::channel(30); - let cancel = CancellationToken::new(); // Streaming starts at checkpoint 0 but indexing starts at 30. let streaming_client = MockStreamingClient::new(0..100); let metrics = test_ingestion_metrics(); - let h_broadcaster = broadcaster( + let mut svc = broadcaster( 30..100, None, Some(streaming_client), @@ -980,7 +906,6 @@ mod tests { hi_rx, vec![subscriber_tx], metrics.clone(), - cancel.clone(), ); assert_eq!( @@ -993,8 +918,7 @@ mod tests { assert_eq!(metrics.total_ingested_checkpoints.get(), 0); assert_eq!(metrics.latest_streamed_checkpoint.get(), 99); - cancel.cancel(); - h_broadcaster.await.unwrap(); + svc.join().await.unwrap(); } #[tokio::test] @@ -1003,7 +927,6 @@ mod tests { // which should be skipped. let (_, hi_rx) = mpsc::unbounded_channel(); let (subscriber_tx, mut subscriber_rx) = mpsc::channel(50); - let cancel = CancellationToken::new(); // Create streaming client that returns some checkpoints behind the watermark let mut streaming_client = MockStreamingClient::new(0..15); @@ -1013,7 +936,7 @@ mod tests { streaming_client.insert_checkpoint_range(15..20); let metrics = test_ingestion_metrics(); - let h_broadcaster = broadcaster( + let mut svc = broadcaster( 0..20, None, Some(streaming_client), @@ -1022,7 +945,6 @@ mod tests { hi_rx, vec![subscriber_tx], metrics.clone(), - cancel.clone(), ); // Should receive all checkpoints exactly once (no duplicates) from streaming. @@ -1035,8 +957,7 @@ mod tests { assert_eq!(metrics.total_ingested_checkpoints.get(), 0); assert_eq!(metrics.latest_streamed_checkpoint.get(), 19); - cancel.cancel(); - h_broadcaster.await.unwrap(); + svc.join().await.unwrap(); } #[tokio::test] @@ -1045,14 +966,13 @@ mod tests { // requiring fallback to ingestion to fill the gap. 
let (_, hi_rx) = mpsc::unbounded_channel(); let (subscriber_tx, mut subscriber_rx) = mpsc::channel(50); - let cancel = CancellationToken::new(); // Create streaming client that has a gap (checkpoint ahead of expected watermark) let mut streaming_client = MockStreamingClient::new(0..3); streaming_client.insert_checkpoint_range(6..10); // Gap: skips checkpoints 3 - 5 let metrics = test_ingestion_metrics(); - let h_broadcaster = broadcaster( + let mut svc = broadcaster( 0..10, None, Some(streaming_client), @@ -1061,7 +981,6 @@ mod tests { hi_rx, vec![subscriber_tx], metrics.clone(), - cancel.clone(), ); // Should receive first three checkpoints from streaming in order @@ -1078,8 +997,7 @@ mod tests { assert_eq!(metrics.total_ingested_checkpoints.get(), 4); assert_eq!(metrics.latest_streamed_checkpoint.get(), 9); - cancel.cancel(); - h_broadcaster.await.unwrap(); + svc.join().await.unwrap(); } #[tokio::test] @@ -1088,7 +1006,6 @@ mod tests { let (hi_tx, hi_rx) = mpsc::unbounded_channel(); let (subscriber_tx, mut subscriber_rx) = mpsc::channel(30); - let cancel = CancellationToken::new(); let streaming_client = MockStreamingClient::new(0..20); @@ -1098,7 +1015,7 @@ mod tests { }; let metrics = test_ingestion_metrics(); - let h_broadcaster = broadcaster( + let mut svc = broadcaster( 0..20, Some(5), // initial watermark to trigger backpressure Some(streaming_client), @@ -1107,7 +1024,6 @@ mod tests { hi_rx, vec![subscriber_tx], metrics.clone(), - cancel.clone(), ); // Should receive first 10 checkpoints (0..10) from streaming @@ -1129,11 +1045,9 @@ mod tests { Vec::from_iter(10..20) ); assert_eq!(metrics.latest_streamed_checkpoint.get(), 19); - assert_eq!(metrics.total_streamed_checkpoints.get(), 20); - cancel.cancel(); - h_broadcaster.await.unwrap(); + svc.join().await.unwrap(); } // =============== Part 3: Streaming Errors ================== @@ -1142,7 +1056,6 @@ mod tests { async fn streaming_error_during_streaming() { let (_, hi_rx) = mpsc::unbounded_channel(); let (subscriber_tx, mut subscriber_rx) = mpsc::channel(20); - let cancel = CancellationToken::new(); // Create streaming client with error injected mid-stream let mut streaming_client = MockStreamingClient::new(0..5); @@ -1150,7 +1063,7 @@ mod tests { streaming_client.insert_checkpoint_range(10..15); let metrics = test_ingestion_metrics(); - let h_broadcaster = broadcaster( + let mut svc = broadcaster( 0..15, None, Some(streaming_client), @@ -1159,7 +1072,6 @@ mod tests { hi_rx, vec![subscriber_tx], metrics.clone(), - cancel.clone(), ); // Should receive first 5 checkpoints from streaming in order @@ -1178,15 +1090,13 @@ mod tests { // The last checkpoint should come from streaming after recovery. 
assert_eq!(metrics.latest_streamed_checkpoint.get(), 14); - cancel.cancel(); - h_broadcaster.await.unwrap(); + svc.join().await.unwrap(); } #[tokio::test] async fn streaming_multiple_errors_with_recovery() { let (_, hi_rx) = mpsc::unbounded_channel(); let (subscriber_tx, mut subscriber_rx) = mpsc::channel(50); - let cancel = CancellationToken::new(); // Create streaming client with multiple errors injected let mut streaming_client = MockStreamingClient::new(0..5); @@ -1196,7 +1106,7 @@ mod tests { streaming_client.insert_checkpoint_range(10..20); let metrics = test_ingestion_metrics(); - let h_broadcaster = broadcaster( + let mut svc = broadcaster( 0..20, None, Some(streaming_client), @@ -1205,7 +1115,6 @@ mod tests { hi_rx, vec![subscriber_tx], metrics.clone(), - cancel.clone(), ); // Should eventually receive all checkpoints despite errors from streaming. @@ -1219,34 +1128,30 @@ mod tests { assert_eq!(metrics.total_ingested_checkpoints.get(), 0); assert_eq!(metrics.total_stream_disconnections.get(), 3); // 2 errors + 1 completion - cancel.cancel(); - h_broadcaster.await.unwrap(); + svc.join().await.unwrap(); } #[tokio::test] async fn streaming_start_failure_fallback_to_ingestion() { let (_, hi_rx) = mpsc::unbounded_channel(); let (subscriber_tx, mut subscriber_rx) = mpsc::channel(20); - let cancel = CancellationToken::new(); // Streaming service that fails to start let streaming_service = MockStreamingClient::new(0..20).fail_connection_times(1); let metrics = test_ingestion_metrics(); - let config = IngestionConfig { - streaming_backoff_initial_batch_size: 5, - ..test_config() - }; - let h_broadcaster = broadcaster( + let mut svc = broadcaster( 0..20, None, Some(streaming_service), - config, + IngestionConfig { + streaming_backoff_initial_batch_size: 5, + ..test_config() + }, mock_client(metrics.clone()), hi_rx, vec![subscriber_tx], metrics.clone(), - cancel.clone(), ); // Should fallback to ingestion for initial batch size checkpoints @@ -1264,15 +1169,13 @@ mod tests { assert_eq!(metrics.total_ingested_checkpoints.get(), 5); assert_eq!(metrics.total_streamed_checkpoints.get(), 15); - cancel.cancel(); - h_broadcaster.await.unwrap(); + svc.join().await.unwrap(); } #[tokio::test] async fn streaming_peek_failure_fallback_to_ingestion() { let (_, hi_rx) = mpsc::unbounded_channel(); let (subscriber_tx, mut subscriber_rx) = mpsc::channel(20); - let cancel = CancellationToken::new(); // Streaming service where peek fails on first attempt let mut streaming_client = MockStreamingClient::new(vec![]); @@ -1280,20 +1183,18 @@ mod tests { streaming_client.insert_checkpoint_range(0..20); let metrics = test_ingestion_metrics(); - let config = IngestionConfig { - streaming_backoff_initial_batch_size: 5, - ..test_config() - }; - let h_broadcaster = broadcaster( + let mut svc = broadcaster( 0..20, None, Some(streaming_client), - config, + IngestionConfig { + streaming_backoff_initial_batch_size: 5, + ..test_config() + }, mock_client(metrics.clone()), hi_rx, vec![subscriber_tx], metrics.clone(), - cancel.clone(), ); // Should fallback to ingestion for first 10 checkpoints @@ -1312,22 +1213,19 @@ mod tests { assert_eq!(metrics.total_ingested_checkpoints.get(), 5); assert_eq!(metrics.total_streamed_checkpoints.get(), 15); - cancel.cancel(); - h_broadcaster.await.unwrap(); + svc.join().await.unwrap(); } #[tokio::test] async fn streaming_connection_retry_with_backoff() { let (_, hi_rx) = mpsc::unbounded_channel(); let (subscriber_tx, mut subscriber_rx) = mpsc::channel(50); - let cancel = 
CancellationToken::new(); // Streaming client where connection always fails (never recovers) let streaming_client = MockStreamingClient::new(0..50).fail_connection_times(usize::MAX); let metrics = test_ingestion_metrics(); - - let h_broadcaster = broadcaster( + let mut svc = broadcaster( 0..50, None, Some(streaming_client), @@ -1336,7 +1234,6 @@ mod tests { hi_rx, vec![subscriber_tx], metrics.clone(), - cancel.clone(), ); // Should fallback to ingestion for all checkpoints @@ -1352,8 +1249,7 @@ mod tests { assert_eq!(metrics.total_ingested_checkpoints.get(), 50); assert_eq!(metrics.total_streamed_checkpoints.get(), 0); - cancel.cancel(); - h_broadcaster.await.unwrap(); + svc.join().await.unwrap(); } #[tokio::test] @@ -1362,7 +1258,6 @@ mod tests { let (_, hi_rx) = mpsc::unbounded_channel(); let (subscriber_tx, mut subscriber_rx) = mpsc::channel(50); - let cancel = CancellationToken::new(); let mut streaming_client = MockStreamingClient::new(0..40).fail_connection_times(4); streaming_client.insert_error(); // First error to get back to main loop @@ -1370,8 +1265,7 @@ mod tests { streaming_client.insert_checkpoint_range(40..50); // Complete the rest let metrics = test_ingestion_metrics(); - - let h_broadcaster = broadcaster( + let mut svc = broadcaster( 0..50, None, Some(streaming_client), @@ -1380,7 +1274,6 @@ mod tests { hi_rx, vec![subscriber_tx], metrics.clone(), - cancel.clone(), ); // Should fallback to ingestion for first 2 + 4 + 8 + 16 = 30 checkpoints @@ -1414,7 +1307,6 @@ mod tests { assert_eq!(metrics.total_ingested_checkpoints.get(), 32); assert_eq!(metrics.total_streamed_checkpoints.get(), 18); - cancel.cancel(); - h_broadcaster.await.unwrap(); + svc.join().await.unwrap(); } } diff --git a/crates/sui-indexer-alt-framework/src/ingestion/error.rs b/crates/sui-indexer-alt-framework/src/ingestion/error.rs index ade0d021874ee..f3c671d7aedd6 100644 --- a/crates/sui-indexer-alt-framework/src/ingestion/error.rs +++ b/crates/sui-indexer-alt-framework/src/ingestion/error.rs @@ -20,9 +20,6 @@ pub enum Error { #[error("No subscribers for ingestion service")] NoSubscribers, - #[error("Shutdown signal received, stopping ingestion service")] - Cancelled, - #[error(transparent)] RpcClientError(#[from] tonic::Status), diff --git a/crates/sui-indexer-alt-framework/src/ingestion/ingestion_client.rs b/crates/sui-indexer-alt-framework/src/ingestion/ingestion_client.rs index 86f2a51dea7f4..f770198c21a62 100644 --- a/crates/sui-indexer-alt-framework/src/ingestion/ingestion_client.rs +++ b/crates/sui-indexer-alt-framework/src/ingestion/ingestion_client.rs @@ -9,11 +9,11 @@ use async_trait::async_trait; use backoff::Error as BE; use backoff::ExponentialBackoff; use backoff::backoff::Constant; +use bytes::Bytes; use sui_futures::future::with_slow_future_monitor; use sui_rpc::Client; use sui_rpc::client::HeadersInterceptor; use sui_storage::blob::Blob; -use tokio_util::bytes::Bytes; use tracing::{debug, error, warn}; use url::Url; @@ -322,7 +322,6 @@ mod tests { use std::sync::Arc; use std::time::Duration; use tokio::time::timeout; - use tokio_util::bytes::Bytes; use crate::ingestion::test_utils::test_checkpoint_data; diff --git a/crates/sui-indexer-alt-framework/src/ingestion/mod.rs b/crates/sui-indexer-alt-framework/src/ingestion/mod.rs index c4eb77dd75769..a1479af8d1495 100644 --- a/crates/sui-indexer-alt-framework/src/ingestion/mod.rs +++ b/crates/sui-indexer-alt-framework/src/ingestion/mod.rs @@ -10,8 +10,8 @@ use std::{sync::Arc, time::Duration}; use prometheus::Registry; use serde::{Deserialize, 
Serialize}; -use tokio::{sync::mpsc, task::JoinHandle}; -use tokio_util::sync::CancellationToken; +use sui_futures::service::Service; +use tokio::sync::mpsc; use crate::ingestion::broadcaster::broadcaster; use crate::ingestion::error::{Error, Result}; @@ -69,7 +69,6 @@ pub struct IngestionService { commit_hi_rx: mpsc::UnboundedReceiver<(&'static str, u64)>, subscribers: Vec>>, metrics: Arc, - cancel: CancellationToken, } impl IngestionConfig { @@ -93,7 +92,6 @@ impl IngestionService { config: IngestionConfig, metrics_prefix: Option<&str>, registry: &Registry, - cancel: CancellationToken, ) -> Result { let metrics = IngestionMetrics::new(metrics_prefix, registry); let ingestion_client = IngestionClient::new(args.ingestion, metrics.clone())?; @@ -109,7 +107,6 @@ impl IngestionService { commit_hi_rx, subscribers, metrics, - cancel, }) } @@ -157,11 +154,7 @@ impl IngestionService { /// If ingestion reaches the leading edge of the network, it will encounter checkpoints that do /// not exist yet. These will be retried repeatedly on a fixed `retry_interval` until they /// become available. - pub async fn run( - self, - checkpoints: R, - initial_commit_hi: Option, - ) -> Result> + pub async fn run(self, checkpoints: R, initial_commit_hi: Option) -> Result where R: std::ops::RangeBounds + Send + 'static, { @@ -173,14 +166,13 @@ impl IngestionService { commit_hi_rx, subscribers, metrics, - cancel, } = self; if subscribers.is_empty() { return Err(Error::NoSubscribers); } - let broadcaster = broadcaster( + Ok(broadcaster( checkpoints, initial_commit_hi, streaming_client, @@ -189,10 +181,7 @@ impl IngestionService { commit_hi_rx, subscribers, metrics, - cancel.clone(), - ); - - Ok(broadcaster) + )) } } @@ -213,6 +202,7 @@ mod tests { use std::sync::Mutex; use reqwest::StatusCode; + use sui_futures::task::TaskGuard; use url::Url; use wiremock::{MockServer, Request}; @@ -225,7 +215,6 @@ mod tests { uri: String, checkpoint_buffer_size: usize, ingest_concurrency: usize, - cancel: CancellationToken, ) -> IngestionService { let registry = Registry::new(); IngestionService::new( @@ -243,7 +232,6 @@ mod tests { }, None, ®istry, - cancel, ) .unwrap() } @@ -251,48 +239,40 @@ mod tests { async fn test_subscriber( stop_after: usize, mut rx: mpsc::Receiver>, - cancel: CancellationToken, - ) -> JoinHandle> { - tokio::spawn(async move { + ) -> TaskGuard> { + TaskGuard::new(tokio::spawn(async move { let mut seqs = vec![]; for _ in 0..stop_after { - tokio::select! { - _ = cancel.cancelled() => break, - Some(checkpoint) = rx.recv() => { - seqs.push(checkpoint.summary.sequence_number); - } - } + let Some(checkpoint) = rx.recv().await else { + break; + }; + + seqs.push(checkpoint.summary.sequence_number); } - rx.close(); seqs - }) + })) } /// If the ingestion service has no subscribers, it will fail fast (before fetching any /// checkpoints). #[tokio::test] async fn fail_on_no_subscribers() { - telemetry_subscribers::init_for_testing(); - // The mock server will repeatedly return 404, so if the service does try to fetch a // checkpoint, it will be stuck repeatedly retrying. 
let server = MockServer::start().await; respond_with(&server, status(StatusCode::NOT_FOUND)).await; - let cancel = CancellationToken::new(); - let ingestion_service = test_ingestion(server.uri(), 1, 1, cancel.clone()).await; + let ingestion_service = test_ingestion(server.uri(), 1, 1).await; - let err = ingestion_service.run(0.., None).await.unwrap_err(); - assert!(matches!(err, Error::NoSubscribers)); + let res = ingestion_service.run(0.., None).await; + assert!(matches!(res, Err(Error::NoSubscribers))); } /// The subscriber has no effective limit, and the mock server will always return checkpoint - /// information, but the ingestion service can still be stopped using the cancellation token. + /// information, but the ingestion service can still be stopped by shutting it down. #[tokio::test] - async fn shutdown_on_cancel() { - telemetry_subscribers::init_for_testing(); - + async fn shutdown() { let server = MockServer::start().await; respond_with( &server, @@ -300,24 +280,20 @@ mod tests { ) .await; - let cancel = CancellationToken::new(); - let mut ingestion_service = test_ingestion(server.uri(), 1, 1, cancel.clone()).await; + let mut ingestion_service = test_ingestion(server.uri(), 1, 1).await; let (rx, _) = ingestion_service.subscribe(); - let subscriber = test_subscriber(usize::MAX, rx, cancel.clone()).await; - let broadcaster = ingestion_service.run(0.., None).await.unwrap(); + let subscriber = test_subscriber(usize::MAX, rx).await; + let svc = ingestion_service.run(0.., None).await.unwrap(); - cancel.cancel(); + svc.shutdown().await.unwrap(); subscriber.await.unwrap(); - broadcaster.await.unwrap(); } /// The subscriber will stop after receiving a single checkpoint, and this will trigger the /// ingestion service to stop as well, even if there are more checkpoints to fetch. #[tokio::test] async fn shutdown_on_subscriber_drop() { - telemetry_subscribers::init_for_testing(); - let server = MockServer::start().await; respond_with( &server, @@ -325,24 +301,20 @@ mod tests { ) .await; - let cancel = CancellationToken::new(); - let mut ingestion_service = test_ingestion(server.uri(), 1, 1, cancel.clone()).await; + let mut ingestion_service = test_ingestion(server.uri(), 1, 1).await; let (rx, _) = ingestion_service.subscribe(); - let subscriber = test_subscriber(1, rx, cancel.clone()).await; - let broadcaster = ingestion_service.run(0.., None).await.unwrap(); + let subscriber = test_subscriber(1, rx).await; + let mut svc = ingestion_service.run(0.., None).await.unwrap(); - cancel.cancelled().await; - subscriber.await.unwrap(); - broadcaster.await.unwrap(); + drop(subscriber); + svc.join().await.unwrap(); } /// The service will retry fetching a checkpoint that does not exist, in this test, the 4th /// checkpoint will return 404 a couple of times, before eventually succeeding. 
#[tokio::test] async fn retry_on_not_found() { - telemetry_subscribers::init_for_testing(); - let server = MockServer::start().await; let times: Mutex = Mutex::new(0); respond_with(&server, move |_: &Request| { @@ -356,25 +328,19 @@ mod tests { }) .await; - let cancel = CancellationToken::new(); - let mut ingestion_service = test_ingestion(server.uri(), 1, 1, cancel.clone()).await; + let mut ingestion_service = test_ingestion(server.uri(), 1, 1).await; let (rx, _) = ingestion_service.subscribe(); - let subscriber = test_subscriber(5, rx, cancel.clone()).await; - let broadcaster = ingestion_service.run(0.., None).await.unwrap(); + let subscriber = test_subscriber(5, rx).await; + let _svc = ingestion_service.run(0.., None).await.unwrap(); - cancel.cancelled().await; let seqs = subscriber.await.unwrap(); - broadcaster.await.unwrap(); - assert_eq!(seqs, vec![1, 2, 3, 6, 7]); } /// Similar to the previous test, but now it's a transient error that causes the retry. #[tokio::test] async fn retry_on_transient_error() { - telemetry_subscribers::init_for_testing(); - let server = MockServer::start().await; let times: Mutex = Mutex::new(0); respond_with(&server, move |_: &Request| { @@ -388,17 +354,13 @@ mod tests { }) .await; - let cancel = CancellationToken::new(); - let mut ingestion_service = test_ingestion(server.uri(), 1, 1, cancel.clone()).await; + let mut ingestion_service = test_ingestion(server.uri(), 1, 1).await; let (rx, _) = ingestion_service.subscribe(); - let subscriber = test_subscriber(5, rx, cancel.clone()).await; - let broadcaster = ingestion_service.run(0.., None).await.unwrap(); + let subscriber = test_subscriber(5, rx).await; + let _svc = ingestion_service.run(0.., None).await.unwrap(); - cancel.cancelled().await; let seqs = subscriber.await.unwrap(); - broadcaster.await.unwrap(); - assert_eq!(seqs, vec![1, 2, 3, 6, 7]); } @@ -407,8 +369,6 @@ mod tests { /// can keep processing checkpoints that were buffered for the slow one. #[tokio::test] async fn back_pressure_and_buffering() { - telemetry_subscribers::init_for_testing(); - let server = MockServer::start().await; let times: Mutex = Mutex::new(0); respond_with(&server, move |_: &Request| { @@ -418,9 +378,7 @@ mod tests { }) .await; - let cancel = CancellationToken::new(); - let mut ingestion_service = - test_ingestion(server.uri(), /* buffer */ 3, 1, cancel.clone()).await; + let mut ingestion_service = test_ingestion(server.uri(), 3, 1).await; // This subscriber will take its sweet time processing checkpoints. let (mut laggard, _) = ingestion_service.subscribe(); @@ -430,8 +388,8 @@ mod tests { } let (rx, _) = ingestion_service.subscribe(); - let subscriber = test_subscriber(5, rx, cancel.clone()).await; - let broadcaster = ingestion_service.run(0.., None).await.unwrap(); + let subscriber = test_subscriber(5, rx).await; + let _svc = ingestion_service.run(0.., None).await.unwrap(); // At this point, the service will have been able to pass 3 checkpoints to the non-lagging // subscriber, while the laggard's buffer fills up. 
Now the laggard will pull two @@ -440,10 +398,7 @@ mod tests { assert_eq!(unblock(&mut laggard).await, 1); assert_eq!(unblock(&mut laggard).await, 2); - cancel.cancelled().await; let seqs = subscriber.await.unwrap(); - broadcaster.await.unwrap(); - assert_eq!(seqs, vec![1, 2, 3, 4, 5]); } } diff --git a/crates/sui-indexer-alt-framework/src/ingestion/regulator.rs b/crates/sui-indexer-alt-framework/src/ingestion/regulator.rs deleted file mode 100644 index cd45ad3ecb5f9..0000000000000 --- a/crates/sui-indexer-alt-framework/src/ingestion/regulator.rs +++ /dev/null @@ -1,295 +0,0 @@ -// Copyright (c) Mysten Labs, Inc. -// SPDX-License-Identifier: Apache-2.0 - -use std::collections::HashMap; - -use tokio::{sync::mpsc, task::JoinHandle}; -use tokio_util::sync::CancellationToken; -use tracing::info; - -/// The regulator task is responsible for writing out checkpoint sequence numbers from the -/// `checkpoints` iterator to `checkpoint_tx`, bounded by the high watermark dictated by -/// subscribers. -/// -/// Subscribers can share their commit high (exclusive) on `commit_hi_rx`. -/// The regulator remembers these, and stops serving checkpoints if they are over the minimum -/// subscriber commit_hi plus the ingestion `buffer_size`. -/// -/// This offers a form of back-pressure that is sensitive to ordering, which is useful for -/// subscribers that need to commit information in order: Without it, those subscribers may need to -/// buffer unboundedly many updates from checkpoints while they wait for the checkpoint that they -/// need to commit. -/// -/// Note that back-pressure is optional, and will only be applied if a subscriber provides a -/// commit_hi, at which point it must keep updating the commit_hi to allow the ingestion service to -/// continue making progress. -/// -/// The `initial_commit_hi` parameter allows the regulator to start with an initial bound -/// on how far it can ingest. This is useful for scenarios where the indexer is restarted and -/// already has some checkpoints ingested, but subscribers may not yet have reported their commit_hi. -/// -/// The task will shut down if the `cancel` token is signalled, or if the `checkpoints` iterator -/// runs out. -pub(super) fn regulator( - checkpoints: I, - buffer_size: usize, - initial_commit_hi: Option, - mut commit_hi_rx: mpsc::UnboundedReceiver<(&'static str, u64)>, - checkpoint_tx: mpsc::Sender, - cancel: CancellationToken, -) -> JoinHandle<()> -where - I: IntoIterator + Send + Sync + 'static, - I::IntoIter: Send + Sync + 'static, -{ - tokio::spawn(async move { - let mut commit_hi = initial_commit_hi; - let mut subscribers_hi = HashMap::new(); - let mut checkpoints = checkpoints.into_iter().peekable(); - - info!("Starting ingestion regulator"); - - loop { - let Some(cp) = checkpoints.peek() else { - info!("Checkpoints done, stopping regulator"); - break; - }; - - tokio::select! 
{ - _ = cancel.cancelled() => { - info!("Shutdown received, stopping regulator"); - break; - } - - // docs::#regulator (see docs/content/guides/developer/advanced/custom-indexer.mdx) - Some((name, hi)) = commit_hi_rx.recv() => { - subscribers_hi.insert(name, hi); - commit_hi = subscribers_hi.values().copied().min(); - } - // docs::/#regulator - // docs::#bound (see docs/content/guides/developer/advanced/custom-indexer.mdx) - res = checkpoint_tx.send(*cp), if commit_hi.is_none_or(|hi| *cp < hi + buffer_size as u64) => if res.is_ok() { - checkpoints.next(); - } else { - info!("Checkpoint channel closed, stopping regulator"); - break; - } - // docs::/#bound - } - } - }) -} - -#[cfg(test)] -mod tests { - use std::time::Duration; - - use tokio::time::{error::Elapsed, timeout}; - - use super::*; - - /// Wait up to a second for a response on the channel, and return it, expecting this operation - /// to succeed. - async fn expect_recv(rx: &mut mpsc::Receiver) -> Option { - timeout(Duration::from_secs(1), rx.recv()).await.unwrap() - } - - /// Wait up to a second for a response on the channel, but expecting this operation to timeout. - async fn expect_timeout(rx: &mut mpsc::Receiver) -> Elapsed { - timeout(Duration::from_secs(1), rx.recv()) - .await - .unwrap_err() - } - - #[tokio::test] - async fn finite_list_of_checkpoints() { - let (_, commit_hi_rx) = mpsc::unbounded_channel(); - let (cp_tx, mut cp_rx) = mpsc::channel(1); - let cancel = CancellationToken::new(); - - let cps = 0..5; - let h_regulator = regulator(cps, 0, None, commit_hi_rx, cp_tx, cancel.clone()); - - for i in 0..5 { - assert_eq!(Some(i), expect_recv(&mut cp_rx).await); - } - - h_regulator.await.unwrap(); - } - - #[tokio::test] - async fn shutdown_on_sender_closed() { - let (_, hi_rx) = mpsc::unbounded_channel(); - let (cp_tx, mut cp_rx) = mpsc::channel(1); - let cancel = CancellationToken::new(); - - let h_regulator = regulator(0.., 0, None, hi_rx, cp_tx, cancel.clone()); - - for i in 0..5 { - assert_eq!(Some(i), expect_recv(&mut cp_rx).await); - } - - drop(cp_rx); - h_regulator.await.unwrap(); - } - - #[tokio::test] - async fn shutdown_on_cancel() { - let (_, hi_rx) = mpsc::unbounded_channel(); - let (cp_tx, mut cp_rx) = mpsc::channel(1); - let cancel = CancellationToken::new(); - - let h_regulator = regulator(0.., 0, None, hi_rx, cp_tx, cancel.clone()); - - for i in 0..5 { - assert_eq!(Some(i), expect_recv(&mut cp_rx).await); - } - - cancel.cancel(); - h_regulator.await.unwrap(); - } - - #[tokio::test] - async fn halted() { - let (hi_tx, hi_rx) = mpsc::unbounded_channel(); - let (cp_tx, mut cp_rx) = mpsc::channel(1); - let cancel = CancellationToken::new(); - - hi_tx.send(("test", 4)).unwrap(); - - let h_regulator = regulator(0.., 0, None, hi_rx, cp_tx, cancel.clone()); - - for _ in 0..4 { - expect_recv(&mut cp_rx).await; - } - - // Regulator stopped because of commit_hi. - expect_timeout(&mut cp_rx).await; - - cancel.cancel(); - h_regulator.await.unwrap(); - } - - #[tokio::test] - async fn halted_buffered() { - let (hi_tx, hi_rx) = mpsc::unbounded_channel(); - let (cp_tx, mut cp_rx) = mpsc::channel(1); - let cancel = CancellationToken::new(); - - hi_tx.send(("test", 2)).unwrap(); - - let h_regulator = regulator(0.., 2, None, hi_rx, cp_tx, cancel.clone()); - - for i in 0..4 { - assert_eq!(Some(i), expect_recv(&mut cp_rx).await); - } - - // Regulator stopped because of commit_hi (plus buffering). 
- expect_timeout(&mut cp_rx).await; - - cancel.cancel(); - h_regulator.await.unwrap(); - } - - #[tokio::test] - async fn resumption() { - let (hi_tx, hi_rx) = mpsc::unbounded_channel(); - let (cp_tx, mut cp_rx) = mpsc::channel(1); - let cancel = CancellationToken::new(); - - hi_tx.send(("test", 2)).unwrap(); - - let h_regulator = regulator(0.., 0, None, hi_rx, cp_tx, cancel.clone()); - - for i in 0..2 { - assert_eq!(Some(i), expect_recv(&mut cp_rx).await); - } - - // Regulator stopped because of commit_hi, but resumes when that commit_hi is updated. - expect_timeout(&mut cp_rx).await; - hi_tx.send(("test", 4)).unwrap(); - - for i in 2..4 { - assert_eq!(Some(i), expect_recv(&mut cp_rx).await); - } - - // Halted again. - expect_timeout(&mut cp_rx).await; - - cancel.cancel(); - h_regulator.await.unwrap(); - } - - #[tokio::test] - async fn multiple_subscribers() { - let (hi_tx, hi_rx) = mpsc::unbounded_channel(); - let (cp_tx, mut cp_rx) = mpsc::channel(1); - let cancel = CancellationToken::new(); - - hi_tx.send(("a", 2)).unwrap(); - hi_tx.send(("b", 3)).unwrap(); - - let cps = 0..10; - let h_regulator = regulator(cps, 0, None, hi_rx, cp_tx, cancel.clone()); - - for i in 0..2 { - assert_eq!(Some(i), expect_recv(&mut cp_rx).await); - } - - // Regulator stopped because of a's commit_hi. - expect_timeout(&mut cp_rx).await; - - // Updating b's commit_hi doesn't make a difference. - hi_tx.send(("b", 4)).unwrap(); - expect_timeout(&mut cp_rx).await; - - // But updating a's commit_hi does. - hi_tx.send(("a", 3)).unwrap(); - assert_eq!(Some(2), expect_recv(&mut cp_rx).await); - - // ...by one checkpoint. - expect_timeout(&mut cp_rx).await; - - // And we can make more progress by updating it again. - hi_tx.send(("a", 4)).unwrap(); - assert_eq!(Some(3), expect_recv(&mut cp_rx).await); - - // But another update to "a" will now not make a difference, because "b" is still behind. 
- hi_tx.send(("a", 5)).unwrap(); - expect_timeout(&mut cp_rx).await; - - cancel.cancel(); - h_regulator.await.unwrap(); - } - - #[tokio::test] - async fn test_regulator_with_initial_commit_hi() { - let (hi_tx, hi_rx) = mpsc::unbounded_channel(); - let (cp_tx, mut cp_rx) = mpsc::channel(1); - let cancel = CancellationToken::new(); - - // Start regulator with initial_commit_hi = Some(5) and buffer_size = 0 - let h_regulator = regulator(0.., 0, Some(5), hi_rx, cp_tx, cancel.clone()); - - // Regulator should only serve checkpoints 0 through 5 exclusive - for i in 0..5 { - assert_eq!(Some(i), expect_recv(&mut cp_rx).await); - } - - // Should be halted at the initial commit_hi - expect_timeout(&mut cp_rx).await; - - // Sending a subscriber commit_hi should allow progress - hi_tx.send(("test", 7)).unwrap(); - - for i in 5..7 { - assert_eq!(Some(i), expect_recv(&mut cp_rx).await); - } - - // Halted again - expect_timeout(&mut cp_rx).await; - - cancel.cancel(); - h_regulator.await.unwrap(); - } -} diff --git a/crates/sui-indexer-alt-framework/src/lib.rs b/crates/sui-indexer-alt-framework/src/lib.rs index 859fe5c0005c8..4c4d8851ec4e7 100644 --- a/crates/sui-indexer-alt-framework/src/lib.rs +++ b/crates/sui-indexer-alt-framework/src/lib.rs @@ -4,29 +4,28 @@ use std::{collections::BTreeSet, sync::Arc, time::Duration}; use anyhow::{Context, bail, ensure}; -use futures::future; use ingestion::{ClientArgs, IngestionConfig, IngestionService, ingestion_client::IngestionClient}; use metrics::IndexerMetrics; -use pipeline::{ - Processor, - concurrent::{self, ConcurrentConfig}, - sequential::{self, Handler, SequentialConfig}, -}; use prometheus::Registry; use sui_indexer_alt_framework_store_traits::{ Connection, Store, TransactionalStore, pipeline_task, }; -use tokio::task::JoinHandle; -use tokio_util::sync::CancellationToken; use tracing::info; pub use anyhow::Result; pub use sui_field_count::FieldCount; +pub use sui_futures::service; /// External users access the store trait through framework::store pub use sui_indexer_alt_framework_store_traits as store; pub use sui_types as types; use crate::metrics::IngestionMetrics; +use crate::pipeline::{ + Processor, + concurrent::{self, ConcurrentConfig}, + sequential::{self, Handler, SequentialConfig}, +}; +use crate::service::Service; #[cfg(feature = "cluster")] pub mod cluster; @@ -135,9 +134,6 @@ pub struct Indexer { /// with the same name isn't added twice. added_pipelines: BTreeSet<&'static str>, - /// Cancellation token shared among all continuous tasks in the service. - cancel: CancellationToken, - /// The checkpoint for the indexer to start ingesting from. This is derived from the committer /// watermarks of pipelines added to the indexer. Pipelines without watermarks default to 0, /// unless overridden by [Self::default_next_checkpoint]. @@ -147,8 +143,8 @@ pub struct Indexer { /// the regulator to prevent ingestion from running too far ahead of sequential pipelines. next_sequential_checkpoint: Option, - /// The handles for every task spawned by this indexer, used to manage graceful shutdown. - handles: Vec>, + /// The service handles for every pipeline, used to manage lifetimes and graceful shutdown. + pipelines: Vec, } /// Configuration for a tasked indexer. 
@@ -197,7 +193,6 @@ impl Indexer { ingestion_config: IngestionConfig, metrics_prefix: Option<&str>, registry: &Registry, - cancel: CancellationToken, ) -> Result { let IndexerArgs { first_checkpoint, @@ -208,13 +203,8 @@ impl Indexer { let metrics = IndexerMetrics::new(metrics_prefix, registry); - let ingestion_service = IngestionService::new( - client_args, - ingestion_config, - metrics_prefix, - registry, - cancel.clone(), - )?; + let ingestion_service = + IngestionService::new(client_args, ingestion_config, metrics_prefix, registry)?; Ok(Self { store, @@ -222,17 +212,16 @@ impl Indexer { ingestion_service, default_next_checkpoint: first_checkpoint.unwrap_or_default(), last_checkpoint, + task: task.into_task(), enabled_pipelines: if pipeline.is_empty() { None } else { Some(pipeline.into_iter().collect()) }, added_pipelines: BTreeSet::new(), - cancel, first_ingestion_checkpoint: u64::MAX, next_sequential_checkpoint: None, - handles: vec![], - task: task.into_task(), + pipelines: vec![], }) } @@ -290,7 +279,7 @@ impl Indexer { return Ok(()); }; - self.handles.push(concurrent::pipeline::( + self.pipelines.push(concurrent::pipeline::( handler, next_checkpoint, config, @@ -298,7 +287,6 @@ impl Indexer { self.task.clone(), self.ingestion_service.subscribe().0, self.metrics.clone(), - self.cancel.clone(), )); Ok(()) @@ -309,7 +297,7 @@ impl Indexer { /// respective watermarks. /// /// Ingestion will stop after consuming the configured `last_checkpoint` if one is provided. - pub async fn run(mut self) -> Result> { + pub async fn run(self) -> Result { if let Some(enabled_pipelines) = self.enabled_pipelines { ensure!( enabled_pipelines.is_empty(), @@ -322,7 +310,7 @@ impl Indexer { info!(self.first_ingestion_checkpoint, last_checkpoint = ?self.last_checkpoint, "Ingestion range"); - let broadcaster_handle = self + let mut service = self .ingestion_service .run( self.first_ingestion_checkpoint..=last_checkpoint, @@ -331,16 +319,11 @@ impl Indexer { .await .context("Failed to start ingestion service")?; - self.handles.push(broadcaster_handle); + for pipeline in self.pipelines { + service = service.merge(pipeline); + } - Ok(tokio::spawn(async move { - // Wait for the ingestion service and all its related tasks to wind down gracefully: - // If ingestion has been configured to only handle a specific range of checkpoints, we - // want to make sure that tasks are allowed to run to completion before shutting them - // down. - future::join_all(self.handles).await; - info!("Indexing pipeline gracefully shut down"); - })) + Ok(service) } /// Determine the checkpoint for the pipeline to resume processing from. This is either the @@ -429,7 +412,7 @@ impl Indexer { let (checkpoint_rx, watermark_tx) = self.ingestion_service.subscribe(); - self.handles.push(sequential::pipeline::( + self.pipelines.push(sequential::pipeline::( handler, next_checkpoint, config, @@ -437,7 +420,6 @@ impl Indexer { checkpoint_rx, watermark_tx, self.metrics.clone(), - self.cancel.clone(), )); Ok(()) @@ -451,7 +433,6 @@ mod tests { use async_trait::async_trait; use sui_synthetic_ingestion::synthetic_ingestion; use tokio::sync::watch; - use tokio_util::sync::CancellationToken; use crate::FieldCount; use crate::ingestion::ingestion_client::IngestionClientArgs; @@ -600,7 +581,6 @@ mod tests { /// first_ingestion_checkpoint is smallest among existing watermarks + 1. 
#[tokio::test] async fn test_first_ingestion_checkpoint_all_pipelines_have_watermarks() { - let cancel = CancellationToken::new(); let registry = Registry::new(); let store = MockStore::default(); @@ -665,7 +645,6 @@ mod tests { ingestion_config, None, ®istry, - cancel, ) .await .unwrap(); @@ -693,7 +672,6 @@ mod tests { /// first_ingestion_checkpoint is 0 when at least one pipeline has no watermark. #[tokio::test] async fn test_first_ingestion_checkpoint_not_all_pipelines_have_watermarks() { - let cancel = CancellationToken::new(); let registry = Registry::new(); let store = MockStore::default(); @@ -740,7 +718,6 @@ mod tests { ingestion_config, None, ®istry, - cancel, ) .await .unwrap(); @@ -768,7 +745,6 @@ mod tests { /// first_ingestion_checkpoint is 1 when smallest committer watermark is 0. #[tokio::test] async fn test_first_ingestion_checkpoint_smallest_is_0() { - let cancel = CancellationToken::new(); let registry = Registry::new(); let store = MockStore::default(); @@ -827,7 +803,6 @@ mod tests { ingestion_config, None, ®istry, - cancel, ) .await .unwrap(); @@ -856,7 +831,6 @@ mod tests { /// watermark, and first_checkpoint is smallest. #[tokio::test] async fn test_first_ingestion_checkpoint_first_checkpoint_and_no_watermark() { - let cancel = CancellationToken::new(); let registry = Registry::new(); let store = MockStore::default(); @@ -906,7 +880,6 @@ mod tests { ingestion_config, None, ®istry, - cancel, ) .await .unwrap(); @@ -935,7 +908,6 @@ mod tests { /// first_checkpoint but all pipelines have watermarks (ignores first_checkpoint). #[tokio::test] async fn test_first_ingestion_checkpoint_ignore_first_checkpoint() { - let cancel = CancellationToken::new(); let registry = Registry::new(); let store = MockStore::default(); @@ -983,7 +955,6 @@ mod tests { ingestion_config, None, ®istry, - cancel, ) .await .unwrap(); @@ -1005,7 +976,6 @@ mod tests { /// to resume ingesting from. 
#[tokio::test] async fn test_first_ingestion_checkpoint_large_first_checkpoint() { - let cancel = CancellationToken::new(); let registry = Registry::new(); let store = MockStore::default(); @@ -1054,7 +1024,6 @@ mod tests { ingestion_config, None, ®istry, - cancel, ) .await .unwrap(); @@ -1080,7 +1049,6 @@ mod tests { // test ingestion, all pipelines have watermarks, no first_checkpoint provided #[tokio::test] async fn test_indexer_ingestion_existing_watermarks_no_first_checkpoint() { - let cancel = CancellationToken::new(); let registry = Registry::new(); let store = MockStore::default(); @@ -1159,7 +1127,6 @@ mod tests { ingestion_config, None, ®istry, - cancel.clone(), ) .await .unwrap(); @@ -1184,7 +1151,7 @@ mod tests { let ingestion_metrics = indexer.ingestion_metrics().clone(); let indexer_metrics = indexer.indexer_metrics().clone(); - indexer.run().await.unwrap().await.unwrap(); + indexer.run().await.unwrap().join().await.unwrap(); assert_eq!(ingestion_metrics.total_ingested_checkpoints.get(), 24); assert_eq!( @@ -1224,7 +1191,6 @@ mod tests { // test ingestion, no pipelines missing watermarks, first_checkpoint provided #[tokio::test] async fn test_indexer_ingestion_existing_watermarks_ignore_first_checkpoint() { - let cancel = CancellationToken::new(); let registry = Registry::new(); let store = MockStore::default(); @@ -1304,7 +1270,6 @@ mod tests { ingestion_config, None, ®istry, - cancel.clone(), ) .await .unwrap(); @@ -1328,7 +1293,7 @@ mod tests { let ingestion_metrics = indexer.ingestion_metrics().clone(); let metrics = indexer.indexer_metrics().clone(); - indexer.run().await.unwrap().await.unwrap(); + indexer.run().await.unwrap().join().await.unwrap(); assert_eq!(ingestion_metrics.total_ingested_checkpoints.get(), 24); assert_eq!( @@ -1368,7 +1333,6 @@ mod tests { // test ingestion, some pipelines missing watermarks, no first_checkpoint provided #[tokio::test] async fn test_indexer_ingestion_missing_watermarks_no_first_checkpoint() { - let cancel = CancellationToken::new(); let registry = Registry::new(); let store = MockStore::default(); @@ -1438,7 +1402,6 @@ mod tests { ingestion_config, None, ®istry, - cancel.clone(), ) .await .unwrap(); @@ -1462,7 +1425,7 @@ mod tests { let ingestion_metrics = indexer.ingestion_metrics().clone(); let metrics = indexer.indexer_metrics().clone(); - indexer.run().await.unwrap().await.unwrap(); + indexer.run().await.unwrap().join().await.unwrap(); assert_eq!(ingestion_metrics.total_ingested_checkpoints.get(), 30); assert_eq!( @@ -1502,7 +1465,6 @@ mod tests { // test ingestion, some pipelines missing watermarks, use first_checkpoint #[tokio::test] async fn test_indexer_ingestion_use_first_checkpoint() { - let cancel = CancellationToken::new(); let registry = Registry::new(); let store = MockStore::default(); @@ -1573,7 +1535,6 @@ mod tests { ingestion_config, None, ®istry, - cancel.clone(), ) .await .unwrap(); @@ -1597,7 +1558,7 @@ mod tests { let ingestion_metrics = indexer.ingestion_metrics().clone(); let metrics = indexer.indexer_metrics().clone(); - indexer.run().await.unwrap().await.unwrap(); + indexer.run().await.unwrap().join().await.unwrap(); assert_eq!(ingestion_metrics.total_ingested_checkpoints.get(), 20); assert_eq!( @@ -1636,7 +1597,6 @@ mod tests { #[tokio::test] async fn test_multiple_sequential_pipelines_next_checkpoint() { - let cancel = CancellationToken::new(); let registry = Registry::new(); let store = MockStore::default(); @@ -1703,7 +1663,6 @@ mod tests { ingestion_config, None, ®istry, - cancel.clone(), ) .await 
.unwrap(); @@ -1741,7 +1700,7 @@ mod tests { ); // Run indexer to verify it can make progress past the initial hi and finish ingesting. - indexer.run().await.unwrap().await.unwrap(); + indexer.run().await.unwrap().join().await.unwrap(); // Verify each pipeline made some progress independently let watermark1 = conn.committer_watermark(MockHandler::NAME).await.unwrap(); @@ -1759,7 +1718,6 @@ mod tests { /// committing checkpoints less than the main pipeline's reader watermark. #[tokio::test] async fn test_tasked_pipelines_ignore_below_main_reader_lo() { - let cancel = CancellationToken::new(); let registry = Registry::new(); let store = MockStore::default(); @@ -1813,7 +1771,6 @@ mod tests { ingestion_config, None, ®istry, - cancel, ) .await .unwrap(); @@ -1828,7 +1785,7 @@ mod tests { let ingestion_metrics = tasked_indexer.ingestion_metrics().clone(); let metrics = tasked_indexer.indexer_metrics().clone(); - tasked_indexer.run().await.unwrap().await.unwrap(); + tasked_indexer.run().await.unwrap().join().await.unwrap(); assert_eq!(ingestion_metrics.total_ingested_checkpoints.get(), 16); assert_eq!( @@ -1855,7 +1812,6 @@ mod tests { /// Tasked pipelines can run ahead of the main pipeline's committer watermark. #[tokio::test] async fn test_tasked_pipelines_surpass_main_pipeline_committer_hi() { - let cancel = CancellationToken::new(); let registry = Registry::new(); let store = MockStore::default(); @@ -1905,7 +1861,6 @@ mod tests { ingestion_config, None, ®istry, - cancel, ) .await .unwrap(); @@ -1920,7 +1875,7 @@ mod tests { let ingestion_metrics = tasked_indexer.ingestion_metrics().clone(); let metrics = tasked_indexer.indexer_metrics().clone(); - tasked_indexer.run().await.unwrap().await.unwrap(); + tasked_indexer.run().await.unwrap().join().await.unwrap(); assert_eq!(ingestion_metrics.total_ingested_checkpoints.get(), 17); assert_eq!( @@ -1961,7 +1916,6 @@ mod tests { /// reader watermark to the committer. Committer watermark should still advance. #[tokio::test] async fn test_tasked_pipelines_skip_checkpoints_trailing_main_reader_lo() { - let cancel = CancellationToken::new(); let registry = Registry::new(); let store = MockStore::default(); @@ -2000,7 +1954,6 @@ mod tests { ingestion_config, None, ®istry, - cancel, ) .await .unwrap(); @@ -2025,7 +1978,7 @@ mod tests { let metrics = tasked_indexer.indexer_metrics().clone(); let indexer_handle = tokio::spawn(async move { - tasked_indexer.run().await.unwrap().await.unwrap(); + tasked_indexer.run().await.unwrap().join().await.unwrap(); }); store diff --git a/crates/sui-indexer-alt-framework/src/pipeline/concurrent/collector.rs b/crates/sui-indexer-alt-framework/src/pipeline/concurrent/collector.rs index 301c6050ea2ac..24cff5eab1b54 100644 --- a/crates/sui-indexer-alt-framework/src/pipeline/concurrent/collector.rs +++ b/crates/sui-indexer-alt-framework/src/pipeline/concurrent/collector.rs @@ -9,12 +9,11 @@ use std::{ }, }; +use sui_futures::service::Service; use tokio::{ sync::{SetOnce, mpsc}, - task::JoinHandle, time::{MissedTickBehavior, interval}, }; -use tokio_util::sync::CancellationToken; use tracing::{debug, info}; use crate::{ @@ -70,8 +69,7 @@ impl From> for PendingCheckpoint { /// /// The `main_reader_lo` tracks the lowest checkpoint that can be committed by this pipeline. /// -/// This task will shutdown if canceled via the `cancel` token, or if any of its channels are -/// closed. +/// This task will shutdown if any of its channels are closed. 
pub(super) fn collector( handler: Arc, config: CommitterConfig, @@ -79,38 +77,8 @@ pub(super) fn collector( tx: mpsc::Sender>, main_reader_lo: Arc>, metrics: Arc, - cancel: CancellationToken, -) -> JoinHandle<()> { - tokio::spawn(async move { - // Wait for the main reader lo to be initialized before proceeding to the main loop. - // - // TODO: without this init, if the processor shuts down while the collector is still waiting - // for `main_reader_lo` to be initialized, the indexer will stall forever. - let atomic_reader_lo = loop { - tokio::select! { - _ = cancel.cancelled() => { - info!(pipeline = H::NAME, "Shutdown received before main reader lo initialization"); - return; - } - // Until `main_reader_lo` is initialized, periodically check that the checkpoint rx is - // still open. - _ = tokio::time::sleep(std::time::Duration::from_millis(100)) => { - // Only until we enter the main loop does it also make sense to check - // `rx.is_empty()`. Progress can't be made by the collector right now. - if rx.is_closed() { - info!( - pipeline = H::NAME, - "Processor closed channel before main reader lo initialization, stopping collector", - ); - return; - } - } - atomic = main_reader_lo.wait() => { - break atomic; - } - } - }; - +) -> Service { + Service::new().spawn_aborting(async move { // The `poll` interval controls the maximum time to wait between collecting batches, // regardless of number of rows pending. let mut poll = interval(config.collect_interval()); @@ -130,11 +98,6 @@ pub(super) fn collector( loop { tokio::select! { - _ = cancel.cancelled() => { - info!(pipeline = H::NAME, "Shutdown received, stopping collector"); - break; - } - // Time to create another batch and push it to the committer. _ = poll.tick() => { let guard = metrics @@ -221,7 +184,7 @@ pub(super) fn collector( Some(mut indexed) = rx.recv(), if pending_rows < H::MAX_PENDING_ROWS => { // Clear the values of outdated checkpoints, so that we don't commit data to the // store, but can still advance watermarks. 
- if indexed.checkpoint() < atomic_reader_lo.load(Ordering::Relaxed) { + if indexed.checkpoint() < main_reader_lo.wait().await.load(Ordering::Relaxed) { indexed.values.clear(); metrics.total_collector_skipped_checkpoints .with_label_values(&[H::NAME]) @@ -247,6 +210,8 @@ pub(super) fn collector( // docs::/#collector } } + + Ok(()) }) } @@ -348,7 +313,6 @@ mod tests { async fn test_collector_batches_data() { let (processor_tx, processor_rx) = mpsc::channel(10); let (collector_tx, mut collector_rx) = mpsc::channel(10); - let cancel = CancellationToken::new(); let main_reader_lo = Arc::new(SetOnce::new_with(Some(AtomicU64::new(0)))); let handler = Arc::new(TestHandler); @@ -359,7 +323,6 @@ mod tests { collector_tx, main_reader_lo.clone(), test_metrics(), - cancel.clone(), ); let part1_length = TEST_MAX_CHUNK_ROWS / 2; @@ -384,26 +347,22 @@ mod tests { let batch3 = recv_with_timeout(&mut collector_rx, Duration::from_secs(1)).await; assert_eq!(batch3.batch_len, 0); - - cancel.cancel(); } #[tokio::test] async fn test_collector_shutdown() { let (processor_tx, processor_rx) = mpsc::channel(10); let (collector_tx, mut collector_rx) = mpsc::channel(10); - let cancel = CancellationToken::new(); let main_reader_lo = Arc::new(SetOnce::new_with(Some(AtomicU64::new(0)))); let handler = Arc::new(TestHandler); - let collector = collector::( + let mut collector = collector::( handler, CommitterConfig::default(), processor_rx, collector_tx, main_reader_lo, test_metrics(), - cancel.clone(), ); processor_tx @@ -420,52 +379,10 @@ mod tests { drop(processor_tx); // After a short delay, collector should shut down - let _ = tokio::time::timeout(Duration::from_millis(500), collector) + tokio::time::timeout(Duration::from_millis(500), collector.join()) .await - .expect("collector did not shutdown"); - - cancel.cancel(); - } - - /// While waiting for the `main_reader_lo` to be initialized, if the processor shuts down, the - /// collector should also shut down. - #[tokio::test(start_paused = true)] - async fn test_collector_processor_shutdown_before_init() { - let (processor_tx, processor_rx) = mpsc::channel(10); - let (collector_tx, mut collector_rx) = mpsc::channel(10); - let cancel = CancellationToken::new(); - let main_reader_lo = Arc::new(SetOnce::new()); - - let handler = Arc::new(TestHandler); - let collector = collector::( - handler, - CommitterConfig::default(), - processor_rx, - collector_tx, - main_reader_lo.clone(), - test_metrics(), - cancel.clone(), - ); - - processor_tx - .send(IndexedCheckpoint::new(0, 1, 10, 1000, vec![Entry, Entry])) - .await - .unwrap(); - - // Advance time significantly - collector should still be blocked waiting for - // main_reader_lo. 
- tokio::time::advance(Duration::from_secs(100)).await; - assert!(collector_rx.try_recv().is_err()); - - // Drop processor sender to simulate shutdown - drop(processor_tx); - - // After a short delay, collector should shut down - let _ = tokio::time::timeout(Duration::from_millis(500), collector) - .await - .expect("collector did not shutdown"); - - cancel.cancel(); + .expect("collector shutdown timeout") + .expect("collector shutdown failed"); } #[tokio::test] @@ -477,7 +394,6 @@ mod tests { let main_reader_lo = Arc::new(SetOnce::new_with(Some(AtomicU64::new(0)))); let metrics = test_metrics(); - let cancel = CancellationToken::new(); let handler = Arc::new(TestHandler); let _collector = collector::( @@ -487,7 +403,6 @@ mod tests { collector_tx, main_reader_lo.clone(), metrics.clone(), - cancel.clone(), ); // Send more data than MAX_PENDING_ROWS plus collector channel buffer @@ -521,8 +436,6 @@ mod tests { send_result, Err(mpsc::error::TrySendError::Full(_)) )); - - cancel.cancel(); } #[tokio::test] @@ -531,8 +444,6 @@ mod tests { let (collector_tx, mut collector_rx) = mpsc::channel(10); let main_reader_lo = Arc::new(SetOnce::new_with(Some(AtomicU64::new(0)))); - let cancel = CancellationToken::new(); - // Set a very long collect interval (60 seconds) to ensure timing doesn't trigger batching let config = CommitterConfig { collect_interval_ms: 60_000, @@ -546,7 +457,6 @@ mod tests { collector_tx, main_reader_lo.clone(), test_metrics(), - cancel.clone(), ); let start_time = std::time::Instant::now(); @@ -580,8 +490,6 @@ mod tests { // Verify batch was created quickly (much less than 60 seconds) let elapsed = start_time.elapsed(); assert!(elapsed < Duration::from_secs(10)); - - cancel.cancel(); } #[tokio::test] @@ -590,8 +498,6 @@ mod tests { let (collector_tx, mut collector_rx) = mpsc::channel(10); let main_reader_lo = Arc::new(SetOnce::new_with(Some(AtomicU64::new(0)))); - let cancel = CancellationToken::new(); - // Set a very long collect interval (60 seconds) to ensure timing doesn't trigger batching let config = CommitterConfig { collect_interval_ms: 60_000, @@ -605,7 +511,6 @@ mod tests { collector_tx, main_reader_lo.clone(), test_metrics(), - cancel.clone(), ); // The collector starts with an immediate poll tick, creating an empty batch @@ -628,8 +533,6 @@ mod tests { // Verify batch was created quickly (much less than 60 seconds) let elapsed = start_time.elapsed(); assert!(elapsed < Duration::from_secs(10)); - - cancel.cancel(); } #[tokio::test] @@ -638,8 +541,6 @@ mod tests { let (collector_tx, mut collector_rx) = mpsc::channel(10); let main_reader_lo = Arc::new(SetOnce::new_with(Some(AtomicU64::new(0)))); - let cancel = CancellationToken::new(); - // Set a reasonable collect interval for this test (3 seconds). 
let config = CommitterConfig { collect_interval_ms: 3000, @@ -653,7 +554,6 @@ mod tests { collector_tx, main_reader_lo.clone(), test_metrics(), - cancel.clone(), ); // Consume initial empty batch @@ -671,8 +571,6 @@ mod tests { // Should eventually get batch when timer triggers let timer_batch = recv_with_timeout(&mut collector_rx, Duration::from_secs(4)).await; assert_eq!(timer_batch.batch_len, TestHandler::MIN_EAGER_ROWS - 1); - - cancel.cancel(); } /// The collector must wait for `main_reader_lo` to be initialized before attempting to prepare @@ -681,7 +579,6 @@ mod tests { async fn test_collector_waits_for_main_reader_lo_init() { let (processor_tx, processor_rx) = mpsc::channel(10); let (collector_tx, mut collector_rx) = mpsc::channel(10); - let cancel = CancellationToken::new(); let main_reader_lo = Arc::new(SetOnce::new()); let handler = Arc::new(TestHandler); @@ -697,7 +594,6 @@ mod tests { collector_tx, main_reader_lo.clone(), test_metrics(), - cancel.clone(), ); // Send enough data to trigger batching. @@ -720,8 +616,7 @@ mod tests { assert_eq!(batch.batch_len, TestHandler::MIN_EAGER_ROWS + 1); - cancel.cancel(); - collector.await.unwrap(); + collector.shutdown().await.unwrap(); } /// When receiving checkpoints, if they are below the main reader lo, they should be dropped @@ -730,7 +625,6 @@ mod tests { async fn test_collector_drops_checkpoints_immediately_if_le_main_reader_lo() { let (processor_tx, processor_rx) = mpsc::channel(10); let (collector_tx, mut collector_rx) = mpsc::channel(10); - let cancel = CancellationToken::new(); let main_reader_lo = Arc::new(SetOnce::new_with(Some(AtomicU64::new(5)))); let metrics = test_metrics(); @@ -746,7 +640,6 @@ mod tests { collector_tx, main_reader_lo.clone(), metrics.clone(), - cancel.clone(), ); let eager_rows_plus_one = TestHandler::MIN_EAGER_ROWS + 1; @@ -781,8 +674,7 @@ mod tests { // And that we only have values from two checkpoints (5, 6) assert_eq!(batch.batch_len, eager_rows_plus_one * 2); - cancel.cancel(); - collector.await.unwrap(); + collector.shutdown().await.unwrap(); } /// Because a checkpoint may be partially batched before the main reader lo advances past it, @@ -793,7 +685,6 @@ mod tests { async fn test_collector_only_filters_whole_checkpoints() { let (processor_tx, processor_rx) = mpsc::channel(10); let (collector_tx, mut collector_rx) = mpsc::channel(10); - let cancel = CancellationToken::new(); let main_reader_lo = Arc::new(SetOnce::new_with(Some(AtomicU64::new(0)))); let metrics = test_metrics(); @@ -805,7 +696,6 @@ mod tests { collector_tx, main_reader_lo.clone(), metrics.clone(), - cancel.clone(), ); let more_than_max_chunk_rows = TEST_MAX_CHUNK_ROWS + 10; @@ -861,7 +751,6 @@ mod tests { 5 ); - cancel.cancel(); - collector.await.unwrap(); + collector.shutdown().await.unwrap(); } } diff --git a/crates/sui-indexer-alt-framework/src/pipeline/concurrent/commit_watermark.rs b/crates/sui-indexer-alt-framework/src/pipeline/concurrent/commit_watermark.rs index 1df4609319d1a..1cae9fbe53c7b 100644 --- a/crates/sui-indexer-alt-framework/src/pipeline/concurrent/commit_watermark.rs +++ b/crates/sui-indexer-alt-framework/src/pipeline/concurrent/commit_watermark.rs @@ -7,12 +7,11 @@ use std::{ sync::Arc, }; +use sui_futures::service::Service; use tokio::{ sync::mpsc, - task::JoinHandle, time::{MissedTickBehavior, interval}, }; -use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, warn}; use crate::{ @@ -40,8 +39,7 @@ use super::Handler; /// The task regularly traces its progress, outputting at a higher log 
level every /// [LOUD_WATERMARK_UPDATE_INTERVAL]-many checkpoints. /// -/// The task will shutdown if the `cancel` token is signalled, or if the `rx` channel closes and the -/// watermark cannot be progressed. +/// The task will shutdown if the `rx` channel closes and the watermark cannot be progressed. pub(super) fn commit_watermark( mut next_checkpoint: u64, config: CommitterConfig, @@ -49,11 +47,10 @@ pub(super) fn commit_watermark( store: H::Store, task: Option, metrics: Arc, - cancel: CancellationToken, -) -> JoinHandle<()> { +) -> Service { // SAFETY: on indexer instantiation, we've checked that the pipeline name is valid. let pipeline_task = pipeline_task::(H::NAME, task.as_deref()).unwrap(); - tokio::spawn(async move { + Service::new().spawn_aborting(async move { let mut poll = interval(config.watermark_interval()); poll.set_missed_tick_behavior(MissedTickBehavior::Delay); @@ -81,7 +78,6 @@ pub(super) fn commit_watermark( loop { tokio::select! { - _ = cancel.cancelled() => {} _ = poll.tick() => {} Some(parts) = rx.recv() => { for part in parts { @@ -244,11 +240,6 @@ pub(super) fn commit_watermark( } } - if cancel.is_cancelled() { - info!(pipeline = H::NAME, "Shutdown received"); - break; - } - if rx.is_closed() && rx.is_empty() { info!(pipeline = H::NAME, "Committer closed channel"); break; @@ -256,6 +247,7 @@ pub(super) fn commit_watermark( } info!(pipeline = H::NAME, "Stopping committer watermark task"); + Ok(()) }) } @@ -266,7 +258,6 @@ mod tests { use async_trait::async_trait; use sui_types::full_checkpoint_content::Checkpoint; use tokio::sync::mpsc; - use tokio_util::sync::CancellationToken; use crate::{ FieldCount, @@ -319,8 +310,8 @@ mod tests { struct TestSetup { store: MockStore, watermark_tx: mpsc::Sender>, - commit_watermark_handle: JoinHandle<()>, - cancel: CancellationToken, + #[allow(unused)] + commit_watermark: Service, } fn setup_test + 'static>( @@ -330,26 +321,22 @@ mod tests { ) -> TestSetup { let (watermark_tx, watermark_rx) = mpsc::channel(100); let metrics = IndexerMetrics::new(None, &Default::default()); - let cancel = CancellationToken::new(); let store_clone = store.clone(); - let cancel_clone = cancel.clone(); - let commit_watermark_handle = commit_watermark::( + let commit_watermark = commit_watermark::( next_checkpoint, config, watermark_rx, store_clone, None, metrics, - cancel_clone, ); TestSetup { store, watermark_tx, - commit_watermark_handle, - cancel, + commit_watermark, } } @@ -381,10 +368,6 @@ mod tests { // Verify watermark progression let watermark = setup.store.watermark(DataPipeline::NAME).unwrap(); assert_eq!(watermark.checkpoint_hi_inclusive, 3); - - // Clean up - setup.cancel.cancel(); - let _ = setup.commit_watermark_handle.await; } #[tokio::test] @@ -420,10 +403,6 @@ mod tests { // Verify watermark has progressed to 4 let watermark = setup.store.watermark(DataPipeline::NAME).unwrap(); assert_eq!(watermark.checkpoint_hi_inclusive, 4); - - // Clean up - setup.cancel.cancel(); - let _ = setup.commit_watermark_handle.await; } #[tokio::test] @@ -452,10 +431,6 @@ mod tests { // Verify watermark has progressed let watermark = setup.store.watermark(DataPipeline::NAME).unwrap(); assert_eq!(watermark.checkpoint_hi_inclusive, 1); - - // Clean up - setup.cancel.cancel(); - let _ = setup.commit_watermark_handle.await; } #[tokio::test] @@ -489,10 +464,6 @@ mod tests { // Verify watermark is still none let watermark = setup.store.watermark(DataPipeline::NAME); assert!(watermark.is_none()); - - // Clean up - setup.cancel.cancel(); - let _ = 
setup.commit_watermark_handle.await; } #[tokio::test] @@ -534,10 +505,6 @@ mod tests { let watermark = setup.store.watermark(DataPipeline::NAME).unwrap(); assert_eq!(watermark.checkpoint_hi_inclusive, 11); - - // Clean up - setup.cancel.cancel(); - let _ = setup.commit_watermark_handle.await; } #[tokio::test] @@ -579,10 +546,6 @@ mod tests { // Verify watermark has progressed let watermark = setup.store.watermark(DataPipeline::NAME).unwrap(); assert_eq!(watermark.checkpoint_hi_inclusive, 1); - - // Clean up - setup.cancel.cancel(); - let _ = setup.commit_watermark_handle.await; } #[tokio::test] @@ -617,46 +580,5 @@ mod tests { // Verify watermark has progressed let watermark = setup.store.watermark(DataPipeline::NAME).unwrap(); assert_eq!(watermark.checkpoint_hi_inclusive, 1); - - // Clean up - setup.cancel.cancel(); - let _ = setup.commit_watermark_handle.await; - } - - #[tokio::test] - async fn test_final_watermark_sync_on_shutdown() { - let config = CommitterConfig { - // Set to u64::MAX to ensure watermark isn't updated until shutdown. - watermark_interval_ms: u64::MAX, - ..Default::default() - }; - let setup = setup_test::(config, 10, MockStore::default()); - - setup - .watermark_tx - .send(vec![create_watermark_part_for_checkpoint(10)]) - .await - .unwrap(); - setup - .watermark_tx - .send(vec![create_watermark_part_for_checkpoint(11)]) - .await - .unwrap(); - - // Wait until all watermark parts have been received on the channel. - tokio::time::timeout(Duration::from_secs(1), async { - let mut interval = tokio::time::interval(Duration::from_millis(100)); - while setup.watermark_tx.capacity() != setup.watermark_tx.max_capacity() { - interval.tick().await; - } - }) - .await - .unwrap(); - - setup.cancel.cancel(); - let _ = setup.commit_watermark_handle.await; - - let watermark = setup.store.watermark(DataPipeline::NAME).unwrap(); - assert_eq!(watermark.checkpoint_hi_inclusive, 11); } } diff --git a/crates/sui-indexer-alt-framework/src/pipeline/concurrent/committer.rs b/crates/sui-indexer-alt-framework/src/pipeline/concurrent/committer.rs index a191526ed9dbe..3cd964ebfa2bc 100644 --- a/crates/sui-indexer-alt-framework/src/pipeline/concurrent/committer.rs +++ b/crates/sui-indexer-alt-framework/src/pipeline/concurrent/committer.rs @@ -4,15 +4,17 @@ use std::{sync::Arc, time::Duration}; use backoff::ExponentialBackoff; -use sui_futures::stream::TrySpawnStreamExt; -use tokio::{sync::mpsc, task::JoinHandle}; +use sui_futures::{ + service::Service, + stream::{Break, TrySpawnStreamExt}, +}; +use tokio::sync::mpsc; use tokio_stream::wrappers::ReceiverStream; -use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, warn}; use crate::{ metrics::{CheckpointLagMetricReporter, IndexerMetrics}, - pipeline::{Break, CommitterConfig, WatermarkPart}, + pipeline::{CommitterConfig, WatermarkPart}, store::Store, }; @@ -32,8 +34,7 @@ const MAX_RETRY_INTERVAL: Duration = Duration::from_secs(1); /// succeeds. Once the write succeeds, the [WatermarkPart]s for that batch are sent on `tx` to the /// watermark task. /// -/// This task will shutdown via its `cancel`lation token, or if its receiver or sender channels are -/// closed. +/// This task will shutdown if its receiver or sender channels are closed. 
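// Illustrative sketch (not patch content): with the CancellationToken parameter gone, shutdown
// for this task is driven purely by channel closure. The same rule in miniature, using plain
// tokio primitives -- the loop ends when every sender for `rx` has been dropped, or when the
// receiver for `tx` has been dropped:
use tokio::sync::mpsc;

async fn relay(mut rx: mpsc::Receiver<u64>, tx: mpsc::Sender<u64>) {
    // `recv` returns `None` once all `Sender` clones have been dropped.
    while let Some(value) = rx.recv().await {
        // `send` fails once the receiving half has been dropped.
        if tx.send(value).await.is_err() {
            break;
        }
    }
}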
pub(super) fn committer( handler: Arc, config: CommitterConfig, @@ -41,9 +42,8 @@ pub(super) fn committer( tx: mpsc::Sender>, db: H::Store, metrics: Arc, - cancel: CancellationToken, -) -> JoinHandle<()> { - tokio::spawn(async move { +) -> Service { + Service::new().spawn_aborting(async move { info!(pipeline = H::NAME, "Starting committer"); let checkpoint_lag_reporter = CheckpointLagMetricReporter::new_for_pipeline::( &metrics.partially_committed_checkpoint_timestamp_lag, @@ -64,7 +64,6 @@ pub(super) fn committer( let tx = tx.clone(); let db = db.clone(); let metrics = metrics.clone(); - let cancel = cancel.clone(); let checkpoint_lag_reporter = checkpoint_lag_reporter.clone(); // Repeatedly try to get a connection to the DB and write the batch. Use an @@ -180,22 +179,13 @@ pub(super) fn committer( }; async move { - tokio::select! { - _ = cancel.cancelled() => { - return Err(Break::Cancel); - } - - // Double check that the commit actually went through, (this backoff should - // not produce any permanent errors, but if it does, we need to shutdown - // the pipeline). - commit = backoff::future::retry(backoff, commit) => { - let () = commit?; - } - }; - + // Double check that the commit actually went through, (this backoff should + // not produce any permanent errors, but if it does, we need to shutdown + // the pipeline). + backoff::future::retry(backoff, commit).await?; if tx.send(watermark).await.is_err() { info!(pipeline = H::NAME, "Watermark closed channel"); - return Err(Break::Cancel); + return Err(Break::::Break); } Ok(()) @@ -206,15 +196,17 @@ pub(super) fn committer( { Ok(()) => { info!(pipeline = H::NAME, "Batches done, stopping committer"); + Ok(()) } - Err(Break::Cancel) => { - info!(pipeline = H::NAME, "Shutdown received, stopping committer"); + Err(Break::Break) => { + info!(pipeline = H::NAME, "Channels closed, stopping committer"); + Ok(()) } Err(Break::Err(e)) => { error!(pipeline = H::NAME, "Error from committer: {e}"); - cancel.cancel(); + Err(e.context(format!("Error from committer {}", H::NAME))) } } }) @@ -231,7 +223,6 @@ mod tests { use async_trait::async_trait; use sui_types::full_checkpoint_content::Checkpoint; use tokio::sync::mpsc; - use tokio_util::sync::CancellationToken; use crate::{ FieldCount, @@ -323,7 +314,7 @@ mod tests { store: MockStore, batch_tx: mpsc::Sender>, watermark_rx: mpsc::Receiver>, - committer_handle: JoinHandle<()>, + committer: Service, } /// Creates and spawns a committer task with the provided mock store, along with @@ -335,31 +326,26 @@ mod tests { async fn setup_test(store: MockStore) -> TestSetup { let config = CommitterConfig::default(); let metrics = IndexerMetrics::new(None, &Default::default()); - let cancel = CancellationToken::new(); let (batch_tx, batch_rx) = mpsc::channel::>(10); let (watermark_tx, watermark_rx) = mpsc::channel(10); let store_clone = store.clone(); let handler = Arc::new(DataPipeline); - let committer_handle = tokio::spawn(async move { - let _ = committer( - handler, - config, - batch_rx, - watermark_tx, - store_clone, - metrics, - cancel, - ) - .await; - }); + let committer = committer( + handler, + config, + batch_rx, + watermark_tx, + store_clone, + metrics, + ); TestSetup { store, batch_tx, watermark_rx, - committer_handle, + committer, } } @@ -440,10 +426,6 @@ mod tests { assert_eq!(data.get(&2).unwrap().value(), &vec![4, 5, 6]); assert_eq!(data.get(&3).unwrap().value(), &vec![7, 8, 9]); } - - // Clean up - drop(setup.batch_tx); - let _ = setup.committer_handle.await; } #[tokio::test] @@ -500,10 +482,6 @@ 
mod tests { } let watermark = setup.watermark_rx.recv().await.unwrap(); assert_eq!(watermark.len(), 1); - - // Clean up - drop(setup.batch_tx); - let _ = setup.committer_handle.await; } #[tokio::test] @@ -566,10 +544,6 @@ mod tests { } let watermark = setup.watermark_rx.recv().await.unwrap(); assert_eq!(watermark.len(), 1); - - // Clean up - drop(setup.batch_tx); - let _ = setup.committer_handle.await; } #[tokio::test] @@ -607,10 +581,6 @@ mod tests { "No data should be committed for empty batch" ); } - - // Clean up - drop(setup.batch_tx); - let _ = setup.committer_handle.await; } #[tokio::test] @@ -658,10 +628,6 @@ mod tests { // Verify the committer task has terminated due to watermark channel closure // The task should exit gracefully when it can't send watermarks (returns Break::Cancel) - let result = setup.committer_handle.await; - assert!( - result.is_ok(), - "Committer should terminate gracefully when watermark channel is closed" - ); + setup.committer.shutdown().await.unwrap(); } } diff --git a/crates/sui-indexer-alt-framework/src/pipeline/concurrent/main_reader_lo.rs b/crates/sui-indexer-alt-framework/src/pipeline/concurrent/main_reader_lo.rs index 4039fb7d61bc7..9d449437d4d37 100644 --- a/crates/sui-indexer-alt-framework/src/pipeline/concurrent/main_reader_lo.rs +++ b/crates/sui-indexer-alt-framework/src/pipeline/concurrent/main_reader_lo.rs @@ -9,12 +9,11 @@ use std::{ time::Duration, }; +use sui_futures::service::Service; use tokio::{ sync::SetOnce, - task::JoinHandle, time::{MissedTickBehavior, interval}, }; -use tokio_util::sync::CancellationToken; use tracing::{info, warn}; use crate::store::{Connection, Store}; @@ -26,60 +25,48 @@ use super::Handler; pub(super) fn track_main_reader_lo( reader_lo: Arc>, reader_interval: Option, - cancel: CancellationToken, store: H::Store, -) -> JoinHandle<()> { - tokio::spawn(async move { +) -> Service { + Service::new().spawn_aborting(async move { let Some(reader_interval) = reader_interval else { info!( pipeline = H::NAME, "Not a tasked indexer, skipping main reader lo task" ); reader_lo.set(AtomicU64::new(0)).ok(); - return; + return Ok(()); }; - let mut reader_interval = interval(reader_interval); - // If we miss ticks, skip them to ensure we have the latest watermark. + let mut reader_interval = interval(reader_interval); reader_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); loop { - tokio::select! { - _ = cancel.cancelled() => { - info!(pipeline = H::NAME, "Shutdown received"); - break; - } + reader_interval.tick().await; - _ = reader_interval.tick() => { - match store.connect().await { - Ok(mut conn) => { - match conn.reader_watermark(H::NAME).await { - Ok(watermark_opt) => { - // If the reader watermark is not present (either because the - // watermark entry does not exist, or the reader watermark is - // not set), we assume that pruning is not enabled, and - // checkpoints >= 0 are valid. 
- let update = watermark_opt.map_or(0, |wm| wm.reader_lo); - - let current = reader_lo.get(); + let mut conn = match store.connect().await { + Ok(conn) => conn, + Err(e) => { + warn!(pipeline = H::NAME, "Failed to connect to store: {e}"); + continue; + } + }; - if let Some(can_update) = current { - can_update.store(update, Ordering::Relaxed); - } else { - reader_lo.set(AtomicU64::new(update)).ok(); - } - } - Err(e) => { - warn!(pipeline = H::NAME, "Failed to get reader watermark: {e}"); - } - } - } - Err(e) => { - warn!(pipeline = H::NAME, "Failed to connect to store: {e}"); - } - } + let watermark = match conn.reader_watermark(H::NAME).await { + // If the reader watermark is not present (either because the watermark entry does + // not exist, or the reaer watermark is not set), we assume that pruning is not + // enabled and checkpoints >= 0 are valid. + Ok(watermark) => watermark.map_or(0, |wm| wm.reader_lo), + Err(e) => { + warn!(pipeline = H::NAME, "Failed to get reader watermark: {e}"); + continue; } + }; + + if let Some(lo) = reader_lo.get() { + lo.store(watermark, Ordering::Relaxed); + } else { + reader_lo.set(AtomicU64::new(watermark)).ok(); } } }) diff --git a/crates/sui-indexer-alt-framework/src/pipeline/concurrent/mod.rs b/crates/sui-indexer-alt-framework/src/pipeline/concurrent/mod.rs index 16e62a7d6a378..5f0a6a34bf944 100644 --- a/crates/sui-indexer-alt-framework/src/pipeline/concurrent/mod.rs +++ b/crates/sui-indexer-alt-framework/src/pipeline/concurrent/mod.rs @@ -1,18 +1,12 @@ // Copyright (c) Mysten Labs, Inc. // SPDX-License-Identifier: Apache-2.0 -use std::{ - sync::{Arc, atomic::AtomicU64}, - time::Duration, -}; +use std::{sync::Arc, time::Duration}; use async_trait::async_trait; use serde::{Deserialize, Serialize}; -use tokio::{ - sync::{SetOnce, mpsc}, - task::JoinHandle, -}; -use tokio_util::sync::CancellationToken; +use sui_futures::service::Service; +use tokio::sync::{SetOnce, mpsc}; use tracing::info; use crate::{ @@ -206,9 +200,9 @@ impl Default for PrunerConfig { /// watermark below which all data has been committed (modulo pruning). /// /// Checkpoint data is fed into the pipeline through the `checkpoint_rx` channel, and internal -/// channels are created to communicate between its various components. The pipeline can be -/// shutdown using its `cancel` token, and will also shutdown if any of its independent tasks -/// reports an issue. +/// channels are created to communicate between its various components. The pipeline will shutdown +/// if any of its input or output channels close, any of its independent tasks fail, or if it is +/// signalled to shutdown through the returned service handle. pub(crate) fn pipeline( handler: H, next_checkpoint: u64, @@ -217,12 +211,12 @@ pub(crate) fn pipeline( task: Option, checkpoint_rx: mpsc::Receiver>, metrics: Arc, - cancel: CancellationToken, -) -> JoinHandle<()> { +) -> Service { info!( pipeline = H::NAME, - "Starting pipeline with config: {:?}", config + "Starting pipeline with config: {config:#?}", ); + let ConcurrentConfig { committer: committer_config, pruner: pruner_config, @@ -235,82 +229,62 @@ pub(crate) fn pipeline( //docs::/#buff let (committer_tx, watermark_rx) = mpsc::channel(committer_config.write_concurrency + PIPELINE_BUFFER); + let main_reader_lo = Arc::new(SetOnce::new()); - // The pruner is not connected to the rest of the tasks by channels, so it needs to be - // explicitly signalled to shutdown when the other tasks shutdown, in addition to listening to - // the global cancel signal. 
We achieve this by creating a child cancel token that we call - // cancel on once the committer tasks have shutdown. - let pruner_cancel = cancel.child_token(); let handler = Arc::new(handler); - let main_reader_lo = Arc::new(SetOnce::::new()); - - let main_reader_lo_task = track_main_reader_lo::( - main_reader_lo.clone(), - task.as_ref().map(|t| t.reader_interval), - pruner_cancel.clone(), - store.clone(), - ); - - let processor = processor( + let s_processor = processor( handler.clone(), checkpoint_rx, processor_tx, metrics.clone(), - cancel.clone(), ); - let collector = collector::( + let s_collector = collector::( handler.clone(), committer_config.clone(), collector_rx, collector_tx, main_reader_lo.clone(), metrics.clone(), - cancel.clone(), ); - let committer = committer::( + let s_committer = committer::( handler.clone(), committer_config.clone(), committer_rx, committer_tx, store.clone(), metrics.clone(), - cancel.clone(), ); - let commit_watermark = commit_watermark::( + let s_commit_watermark = commit_watermark::( next_checkpoint, committer_config, watermark_rx, store.clone(), task.as_ref().map(|t| t.task.clone()), metrics.clone(), - cancel, ); - let reader_watermark = reader_watermark::( - pruner_config.clone(), + let s_track_reader_lo = track_main_reader_lo::( + main_reader_lo.clone(), + task.as_ref().map(|t| t.reader_interval), store.clone(), - metrics.clone(), - pruner_cancel.clone(), ); - let pruner = pruner( - handler, - pruner_config, - store, - metrics, - pruner_cancel.clone(), - ); + let s_reader_watermark = + reader_watermark::(pruner_config.clone(), store.clone(), metrics.clone()); - tokio::spawn(async move { - let (_, _, _, _) = futures::join!(processor, collector, committer, commit_watermark); + let s_pruner = pruner(handler, pruner_config, store, metrics); - pruner_cancel.cancel(); - let _ = futures::join!(main_reader_lo_task, reader_watermark, pruner); - }) + s_processor + .merge(s_collector) + .merge(s_committer) + .merge(s_commit_watermark) + .attach(s_track_reader_lo) + .attach(s_reader_watermark) + .attach(s_pruner) } #[cfg(test)] @@ -319,7 +293,6 @@ mod tests { use prometheus::Registry; use tokio::{sync::mpsc, time::timeout}; - use tokio_util::sync::CancellationToken; use crate::{ FieldCount, @@ -419,17 +392,16 @@ mod tests { struct TestSetup { store: MockStore, checkpoint_tx: mpsc::Sender>, - pipeline_handle: JoinHandle<()>, - cancel: CancellationToken, + #[allow(unused)] + pipeline: Service, } impl TestSetup { async fn new(config: ConcurrentConfig, store: MockStore, next_checkpoint: u64) -> Self { let (checkpoint_tx, checkpoint_rx) = mpsc::channel(TEST_CHECKPOINT_BUFFER_SIZE); let metrics = IndexerMetrics::new(None, &Registry::default()); - let cancel = CancellationToken::new(); - let pipeline_handle = pipeline( + let pipeline = pipeline( DataPipeline, next_checkpoint, config, @@ -437,14 +409,12 @@ mod tests { None, checkpoint_rx, metrics, - cancel.clone(), ); Self { store, checkpoint_tx, - pipeline_handle, - cancel, + pipeline, } } @@ -460,12 +430,6 @@ mod tests { Ok(()) } - async fn shutdown(self) { - drop(self.checkpoint_tx); - self.cancel.cancel(); - let _ = self.pipeline_handle.await; - } - async fn send_checkpoint_with_timeout( &self, checkpoint: u64, @@ -551,8 +515,6 @@ mod tests { assert!(!data.contains_key(&1)); assert!(!data.contains_key(&2)); }; - - setup.shutdown().await; } #[tokio::test] @@ -598,8 +560,6 @@ mod tests { data.len() }; assert_eq!(total_checkpoints, 10); - - setup.shutdown().await; } #[tokio::test] @@ -631,8 +591,6 @@ mod tests { 
.await; assert_eq!(data, vec![i * 10 + 1, i * 10 + 2]); } - - setup.shutdown().await; } #[tokio::test] @@ -668,8 +626,6 @@ mod tests { .wait_for_watermark(DataPipeline::NAME, 4, TEST_TIMEOUT) .await; assert_eq!(watermark.checkpoint_hi_inclusive, 4); - - setup.shutdown().await; } // ==================== BACK-PRESSURE TESTING ==================== @@ -738,8 +694,6 @@ mod tests { .wait_for_data(DataPipeline::NAME, 0, TEST_TIMEOUT) .await; assert_eq!(data, vec![1, 2]); - - setup.shutdown().await; } #[tokio::test] @@ -817,8 +771,6 @@ mod tests { .send_checkpoint_with_timeout(back_pressure_checkpoint.unwrap(), TEST_TIMEOUT) .await .unwrap(); - - setup.shutdown().await; } // ==================== FAILURE TESTING ==================== @@ -847,8 +799,6 @@ mod tests { .wait_for_data(DataPipeline::NAME, 0, Duration::from_secs(1)) .await; assert_eq!(data, vec![1, 2]); - - setup.shutdown().await; } #[tokio::test] @@ -912,8 +862,6 @@ mod tests { assert!(!data.contains_key(&0)); assert!(!data.contains_key(&1)); }; - - setup.shutdown().await; } #[tokio::test] @@ -952,8 +900,6 @@ mod tests { // Verify that reader watermark was eventually updated despite failures let watermark = setup.store.watermark(DataPipeline::NAME).unwrap(); assert_eq!(watermark.reader_lo, 3); - - setup.shutdown().await; } #[tokio::test] @@ -980,7 +926,5 @@ mod tests { .wait_for_data(DataPipeline::NAME, 0, TEST_TIMEOUT) .await; assert_eq!(data, vec![1, 2]); - - setup.shutdown().await; } } diff --git a/crates/sui-indexer-alt-framework/src/pipeline/concurrent/pruner.rs b/crates/sui-indexer-alt-framework/src/pipeline/concurrent/pruner.rs index e0d7997ea44ec..e443f208e9bb7 100644 --- a/crates/sui-indexer-alt-framework/src/pipeline/concurrent/pruner.rs +++ b/crates/sui-indexer-alt-framework/src/pipeline/concurrent/pruner.rs @@ -5,12 +5,11 @@ use std::{collections::BTreeMap, sync::Arc}; use futures::StreamExt; use futures::stream::FuturesUnordered; +use sui_futures::service::Service; use tokio::{ sync::Semaphore, - task::JoinHandle, time::{MissedTickBehavior, interval}, }; -use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, warn}; use crate::{ @@ -92,19 +91,17 @@ impl PendingRanges { /// The task regularly traces its progress, outputting at a higher log level every /// [LOUD_WATERMARK_UPDATE_INTERVAL]-many checkpoints. /// -/// The task will shutdown if the `cancel` token is signalled. If the `config` is `None`, the task -/// will shutdown immediately. +/// If the `config` is `None`, the task will shutdown immediately. pub(super) fn pruner( handler: Arc, config: Option, store: H::Store, metrics: Arc, - cancel: CancellationToken, -) -> JoinHandle<()> { - tokio::spawn(async move { +) -> Service { + Service::new().spawn_aborting(async move { let Some(config) = config else { info!(pipeline = H::NAME, "Skipping pruner task"); - return; + return Ok(()); }; info!( @@ -128,41 +125,39 @@ pub(super) fn pruner( let mut pending_prune_ranges = PendingRanges::default(); loop { - // (1) Get the latest pruning bounds from the database. - let mut watermark = tokio::select! { - _ = cancel.cancelled() => { - info!(pipeline = H::NAME, "Shutdown received"); - break; - } + poll.tick().await; - _ = poll.tick() => { - let guard = metrics - .watermark_pruner_read_latency - .with_label_values(&[H::NAME]) - .start_timer(); + // (1) Get the latest pruning bounds from the database. 
+ let mut watermark = { + let guard = metrics + .watermark_pruner_read_latency + .with_label_values(&[H::NAME]) + .start_timer(); + + let Ok(mut conn) = store.connect().await else { + warn!( + pipeline = H::NAME, + "Pruner failed to connect, while fetching watermark" + ); + continue; + }; + + match conn.pruner_watermark(H::NAME, config.delay()).await { + Ok(Some(current)) => { + guard.stop_and_record(); + current + } - let Ok(mut conn) = store.connect().await else { - warn!(pipeline = H::NAME, "Pruner failed to connect, while fetching watermark"); + Ok(None) => { + guard.stop_and_record(); + warn!(pipeline = H::NAME, "No watermark for pipeline, skipping"); continue; - }; - - match conn.pruner_watermark(H::NAME, config.delay()).await { - Ok(Some(current)) => { - guard.stop_and_record(); - current - } - - Ok(None) => { - guard.stop_and_record(); - warn!(pipeline = H::NAME, "No watermark for pipeline, skipping"); - continue; - } + } - Err(e) => { - guard.stop_and_record(); - warn!(pipeline = H::NAME, "Failed to get watermark: {e}"); - continue; - } + Err(e) => { + guard.stop_and_record(); + warn!(pipeline = H::NAME, "Failed to get watermark: {e}"); + continue; } } }; @@ -170,13 +165,7 @@ pub(super) fn pruner( // (2) Wait until this information can be acted upon. if let Some(wait_for) = watermark.wait_for() { debug!(pipeline = H::NAME, ?wait_for, "Waiting to prune"); - tokio::select! { - _ = tokio::time::sleep(wait_for) => {} - _ = cancel.cancelled() => { - info!(pipeline = H::NAME, "Shutdown received"); - break; - } - } + tokio::time::sleep(wait_for).await; } // Tracks the current highest `pruner_hi` not yet written to db. This is updated as @@ -204,21 +193,13 @@ pub(super) fn pruner( let mut tasks = FuturesUnordered::new(); for (from, to_exclusive) in pending_prune_ranges.iter() { let semaphore = semaphore.clone(); - let cancel = cancel.child_token(); let metrics = metrics.clone(); let handler = handler.clone(); let db = store.clone(); tasks.push(tokio::spawn(async move { - let _permit = tokio::select! 
{ - permit = semaphore.acquire() => { - permit.unwrap() - } - _ = cancel.cancelled() => { - return ((from, to_exclusive), Err(anyhow::anyhow!("Cancelled"))); - } - }; + let _permit = semaphore.acquire().await.unwrap(); let result = prune_task_impl(metrics, db, handler, from, to_exclusive).await; ((from, to_exclusive), result) })); @@ -291,8 +272,6 @@ pub(super) fn pruner( } } } - - info!(pipeline = H::NAME, "Stopping pruner"); }) } @@ -354,7 +333,6 @@ mod tests { use prometheus::Registry; use sui_types::full_checkpoint_content::Checkpoint; use tokio::time::Duration; - use tokio_util::sync::CancellationToken; use crate::{ FieldCount, @@ -545,7 +523,6 @@ mod tests { }; let registry = Registry::new_custom(Some("test".to_string()), None).unwrap(); let metrics = IndexerMetrics::new(None, ®istry); - let cancel = CancellationToken::new(); // Update data let test_data = HashMap::from([(1, vec![1, 2, 3]), (2, vec![4, 5, 6]), (3, vec![7, 8, 9])]); @@ -570,17 +547,7 @@ mod tests { // Start the pruner let store_clone = store.clone(); - let cancel_clone = cancel.clone(); - let pruner_handle = tokio::spawn(async move { - pruner( - handler, - Some(pruner_config), - store_clone, - metrics, - cancel_clone, - ) - .await - }); + let _pruner = pruner(handler, Some(pruner_config), store_clone, metrics); // Wait a short time within delay_ms tokio::time::sleep(Duration::from_millis(200)).await; @@ -621,10 +588,6 @@ mod tests { "Pruner watermark should be updated" ); } - - // Clean up - cancel.cancel(); - let _ = pruner_handle.await; } #[tokio::test] @@ -639,7 +602,6 @@ mod tests { }; let registry = Registry::new_custom(Some("test".to_string()), None).unwrap(); let metrics = IndexerMetrics::new(None, ®istry); - let cancel = CancellationToken::new(); // Update data let test_data = HashMap::from([(1, vec![1, 2, 3]), (2, vec![4, 5, 6]), (3, vec![7, 8, 9])]); @@ -664,17 +626,7 @@ mod tests { // Start the pruner let store_clone = store.clone(); - let cancel_clone = cancel.clone(); - let pruner_handle = tokio::spawn(async move { - pruner( - handler, - Some(pruner_config), - store_clone, - metrics, - cancel_clone, - ) - .await - }); + let _pruner = pruner(handler, Some(pruner_config), store_clone, metrics); // Because the `pruner_timestamp` is in the past, even with the delay_ms it should be pruned // close to immediately. 
To be safe, sleep for 1000ms before checking, which is well under @@ -697,10 +649,6 @@ mod tests { "Pruner watermark should be updated" ); } - - // Clean up - cancel.cancel(); - let _ = pruner_handle.await; } #[tokio::test] @@ -715,7 +663,6 @@ mod tests { }; let registry = Registry::new_custom(Some("test".to_string()), None).unwrap(); let metrics = IndexerMetrics::new(None, ®istry); - let cancel = CancellationToken::new(); // Set up test data for checkpoints 1-4 let test_data = HashMap::from([ @@ -748,17 +695,7 @@ mod tests { // Start the pruner let store_clone = store.clone(); - let cancel_clone = cancel.clone(); - let pruner_handle = tokio::spawn(async move { - pruner( - handler, - Some(pruner_config), - store_clone, - metrics, - cancel_clone, - ) - .await - }); + let _pruner = pruner(handler, Some(pruner_config), store_clone, metrics); // Wait for first pruning cycle - ranges [2,3) and [3,4) should succeed, [1,2) should fail tokio::time::sleep(Duration::from_millis(500)).await; @@ -793,9 +730,5 @@ mod tests { assert!(!data.contains_key(&3), "Checkpoint 3 should be pruned"); assert!(data.contains_key(&4), "Checkpoint 4 should be preserved"); } - - // Clean up - cancel.cancel(); - let _ = pruner_handle.await; } } diff --git a/crates/sui-indexer-alt-framework/src/pipeline/concurrent/reader_watermark.rs b/crates/sui-indexer-alt-framework/src/pipeline/concurrent/reader_watermark.rs index a724815d057ac..456b8992a7fa4 100644 --- a/crates/sui-indexer-alt-framework/src/pipeline/concurrent/reader_watermark.rs +++ b/crates/sui-indexer-alt-framework/src/pipeline/concurrent/reader_watermark.rs @@ -3,8 +3,8 @@ use std::sync::Arc; -use tokio::{task::JoinHandle, time::interval}; -use tokio_util::sync::CancellationToken; +use sui_futures::service::Service; +use tokio::time::interval; use tracing::{debug, info, warn}; use crate::{ @@ -23,86 +23,78 @@ use super::{Handler, PrunerConfig}; /// last updated that watermark. The timestamp is always fetched from the database (not from the /// indexer or the reader), to avoid issues with drift between clocks. /// -/// If there is no pruner configuration, this task will immediately exit. Otherwise, the task exits -/// when the provided cancellation token is triggered. +/// If there is no pruner configuration, this task will immediately exit. pub(super) fn reader_watermark( config: Option, store: H::Store, metrics: Arc, - cancel: CancellationToken, -) -> JoinHandle<()> { - tokio::spawn(async move { +) -> Service { + Service::new().spawn_aborting(async move { let Some(config) = config else { info!(pipeline = H::NAME, "Skipping reader watermark task"); - return; + return Ok(()); }; let mut poll = interval(config.interval()); loop { - tokio::select! 
{ - _ = cancel.cancelled() => { - info!(pipeline = H::NAME, "Shutdown received"); - break; + poll.tick().await; + + let Ok(mut conn) = store.connect().await else { + warn!( + pipeline = H::NAME, + "Reader watermark task failed to get connection for DB" + ); + continue; + }; + + let current = match conn.reader_watermark(H::NAME).await { + Ok(Some(current)) => current, + + Ok(None) => { + warn!(pipeline = H::NAME, "No watermark for pipeline, skipping"); + continue; } - _ = poll.tick() => { - let Ok(mut conn) = store.connect().await else { - warn!(pipeline = H::NAME, "Reader watermark task failed to get connection for DB"); - continue; - }; - - let current = match conn.reader_watermark(H::NAME).await { - Ok(Some(current)) => current, - - Ok(None) => { - warn!(pipeline = H::NAME, "No watermark for pipeline, skipping"); - continue; - } - - Err(e) => { - warn!(pipeline = H::NAME, "Failed to get current watermark: {e}"); - continue; - } - }; - - // Calculate the new reader watermark based on the current high watermark. - let new_reader_lo = (current.checkpoint_hi_inclusive as u64 + 1) - .saturating_sub(config.retention); - - if new_reader_lo <= current.reader_lo as u64 { - debug!( - pipeline = H::NAME, - current = current.reader_lo, - new = new_reader_lo, - "No change to reader watermark", - ); - continue; - } - - metrics - .watermark_reader_lo - .with_label_values(&[H::NAME]) - .set(new_reader_lo as i64); - - let Ok(updated) = conn.set_reader_watermark(H::NAME, new_reader_lo).await else { - warn!(pipeline = H::NAME, "Failed to update reader watermark"); - continue; - }; - - if updated { - info!(pipeline = H::NAME, new_reader_lo, "Watermark"); - - metrics - .watermark_reader_lo_in_db - .with_label_values(&[H::NAME]) - .set(new_reader_lo as i64); - } + Err(e) => { + warn!(pipeline = H::NAME, "Failed to get current watermark: {e}"); + continue; } + }; + + // Calculate the new reader watermark based on the current high watermark. 
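// Worked example of the retention arithmetic below (illustrative comment, not patch content):
// with checkpoint_hi_inclusive = 1000 and retention = 300, the newest 300 checkpoints
// [701, 1000] remain readable, so new_reader_lo = (1000 + 1) - 300 = 701. If retention exceeds
// the number of checkpoints written so far, `saturating_sub` clamps the result to 0 and nothing
// becomes prunable.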
+ let new_reader_lo = + (current.checkpoint_hi_inclusive as u64 + 1).saturating_sub(config.retention); + + if new_reader_lo <= current.reader_lo as u64 { + debug!( + pipeline = H::NAME, + current = current.reader_lo, + new = new_reader_lo, + "No change to reader watermark", + ); + continue; } - } - info!(pipeline = H::NAME, "Stopping reader watermark task"); + metrics + .watermark_reader_lo + .with_label_values(&[H::NAME]) + .set(new_reader_lo as i64); + + let Ok(updated) = conn.set_reader_watermark(H::NAME, new_reader_lo).await else { + warn!(pipeline = H::NAME, "Failed to update reader watermark"); + continue; + }; + + if updated { + info!(pipeline = H::NAME, new_reader_lo, "Watermark"); + + metrics + .watermark_reader_lo_in_db + .with_label_values(&[H::NAME]) + .set(new_reader_lo as i64); + } + } }) } @@ -113,7 +105,6 @@ mod tests { use sui_pg_db::FieldCount; use sui_types::full_checkpoint_content::Checkpoint; use tokio::time::Duration; - use tokio_util::sync::CancellationToken; use crate::{ metrics::IndexerMetrics, @@ -168,8 +159,8 @@ mod tests { struct TestSetup { store: MockStore, - handle: JoinHandle<()>, - cancel: CancellationToken, + #[allow(unused)] + handle: Service, } async fn setup_test( @@ -192,18 +183,11 @@ mod tests { }; let metrics = IndexerMetrics::new(None, &Default::default()); - let cancel = CancellationToken::new(); let store_clone = store.clone(); - let cancel_clone = cancel.clone(); - let handle = - reader_watermark::(Some(config), store_clone, metrics, cancel_clone); - - TestSetup { - store, - handle, - cancel, - } + let handle = reader_watermark::(Some(config), store_clone, metrics); + + TestSetup { store, handle } } #[tokio::test] @@ -236,10 +220,6 @@ mod tests { let watermarks = setup.store.watermark(DataPipeline::NAME).unwrap(); assert_eq!(watermarks.reader_lo, 6); } - - // Clean up - setup.cancel.cancel(); - let _ = setup.handle.await; } #[tokio::test] @@ -276,10 +256,6 @@ mod tests { "Reader watermark should not be updated when new value is smaller" ); } - - // Clean up - setup.cancel.cancel(); - let _ = setup.handle.await; } #[tokio::test] @@ -332,10 +308,6 @@ mod tests { watermark.reader_lo, 6, "Reader watermark should be updated after retry succeeds" ); - - // Clean up - setup.cancel.cancel(); - let _ = setup.handle.await; } #[tokio::test] @@ -380,9 +352,5 @@ mod tests { let watermarks = setup.store.watermark(DataPipeline::NAME).unwrap(); assert_eq!(watermarks.reader_lo, 6); } - - // Clean up - setup.cancel.cancel(); - let _ = setup.handle.await; } } diff --git a/crates/sui-indexer-alt-framework/src/pipeline/mod.rs b/crates/sui-indexer-alt-framework/src/pipeline/mod.rs index 7c661e2e48f92..6ae3710b03c54 100644 --- a/crates/sui-indexer-alt-framework/src/pipeline/mod.rs +++ b/crates/sui-indexer-alt-framework/src/pipeline/mod.rs @@ -55,17 +55,6 @@ struct WatermarkPart { total_rows: usize, } -/// Internal type used by workers to propagate errors or shutdown signals up to their -/// supervisor. 
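// Illustrative sketch (not patch content): the pipeline-local `Break` below is removed in favour
// of the shared `sui_futures::stream::Break`. From its uses in this diff (`Break::Break` for a
// graceful early exit, `Break::Err(e)` carrying an error that is propagated with `.context(..)`),
// its shape is presumably along these lines; the variant names come from the match arms, the
// generic parameter is an assumption:
enum Break<E> {
    // Stop early without treating it as a failure (e.g. a peer channel closed).
    Break,
    // A real failure that should take the whole pipeline down.
    Err(E),
}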
-#[derive(thiserror::Error, Debug)] -enum Break { - #[error("Shutdown received")] - Cancel, - - #[error(transparent)] - Err(#[from] anyhow::Error), -} - impl CommitterConfig { pub fn collect_interval(&self) -> Duration { Duration::from_millis(self.collect_interval_ms) diff --git a/crates/sui-indexer-alt-framework/src/pipeline/processor.rs b/crates/sui-indexer-alt-framework/src/pipeline/processor.rs index edc420028e283..c52e4b4e1527a 100644 --- a/crates/sui-indexer-alt-framework/src/pipeline/processor.rs +++ b/crates/sui-indexer-alt-framework/src/pipeline/processor.rs @@ -5,17 +5,16 @@ use std::sync::Arc; use std::time::Duration; use backoff::ExponentialBackoff; -use sui_futures::stream::TrySpawnStreamExt; +use sui_futures::{ + service::Service, + stream::{Break, TrySpawnStreamExt}, +}; use sui_types::full_checkpoint_content::Checkpoint; -use tokio::{sync::mpsc, task::JoinHandle}; +use tokio::sync::mpsc; use tokio_stream::wrappers::ReceiverStream; -use tokio_util::sync::CancellationToken; use tracing::{debug, error, info}; -use crate::{ - metrics::{CheckpointLagMetricReporter, IndexerMetrics}, - pipeline::Break, -}; +use crate::metrics::{CheckpointLagMetricReporter, IndexerMetrics}; use super::IndexedCheckpoint; use async_trait::async_trait; @@ -61,16 +60,13 @@ pub trait Processor: Send + Sync + 'static { /// /// Each worker processes a checkpoint into rows and sends them on to the committer using the `tx` /// channel. -/// -/// The task will shutdown if the `cancel` token is cancelled. pub(super) fn processor( processor: Arc
<P>
, rx: mpsc::Receiver>, tx: mpsc::Sender>, metrics: Arc, - cancel: CancellationToken, -) -> JoinHandle<()> { - tokio::spawn(async move { +) -> Service { + Service::new().spawn_aborting(async move { info!(pipeline = P::NAME, "Starting processor"); let checkpoint_lag_reporter = CheckpointLagMetricReporter::new_for_pipeline::
<P>
( &metrics.processed_checkpoint_timestamp_lag, @@ -82,15 +78,10 @@ pub(super) fn processor( .try_for_each_spawned(P::FANOUT, |checkpoint| { let tx = tx.clone(); let metrics = metrics.clone(); - let cancel = cancel.clone(); let checkpoint_lag_reporter = checkpoint_lag_reporter.clone(); let processor = processor.clone(); async move { - if cancel.is_cancelled() { - return Err(Break::Cancel); - } - metrics .total_handler_checkpoints_received .with_label_values(&[P::NAME]) @@ -152,7 +143,7 @@ pub(super) fn processor( values, )) .await - .map_err(|_| Break::Cancel)?; + .map_err(|_| Break::Break)?; Ok(()) } @@ -163,15 +154,17 @@ pub(super) fn processor( info!(pipeline = P::NAME, "Checkpoints done, stopping processor"); } - Err(Break::Cancel) => { - info!(pipeline = P::NAME, "Shutdown received, stopping processor"); + Err(Break::Break) => { + info!(pipeline = P::NAME, "Channel closed, stopping processor"); } Err(Break::Err(e)) => { error!(pipeline = P::NAME, "Error from handler: {e}"); - cancel.cancel(); + return Err(e.context(format!("Error from processor {}", P::NAME))); } }; + + Ok(()) }) } @@ -188,7 +181,6 @@ mod tests { }; use sui_types::test_checkpoint_data_builder::TestCheckpointBuilder; use tokio::{sync::mpsc, time::timeout}; - use tokio_util::sync::CancellationToken; use super::*; @@ -239,10 +231,9 @@ mod tests { let (data_tx, data_rx) = mpsc::channel(2); let (indexed_tx, mut indexed_rx) = mpsc::channel(2); let metrics = IndexerMetrics::new(None, &Default::default()); - let cancel = CancellationToken::new(); // Spawn the processor task - let handle = super::processor(processor, data_rx, indexed_tx, metrics, cancel.clone()); + let _svc = super::processor(processor, data_rx, indexed_tx, metrics); // Send both checkpoints data_tx.send(checkpoint1.clone()).await.unwrap(); @@ -279,10 +270,6 @@ mod tests { timeout_result.is_err(), "Should timeout waiting for more checkpoints" ); - - // Clean up - drop(data_tx); - let _ = handle.await; } #[tokio::test] @@ -298,10 +285,9 @@ mod tests { let (data_tx, data_rx) = mpsc::channel(2); let (indexed_tx, mut indexed_rx) = mpsc::channel(2); let metrics = IndexerMetrics::new(None, &Default::default()); - let cancel = CancellationToken::new(); // Spawn the processor task - let handle = super::processor(processor, data_rx, indexed_tx, metrics, cancel.clone()); + let svc = super::processor(processor, data_rx, indexed_tx, metrics); // Send first checkpoint. data_tx.send(checkpoint1.clone()).await.unwrap(); @@ -313,21 +299,19 @@ mod tests { .expect("Should receive first IndexedCheckpoint"); assert_eq!(indexed1.watermark.checkpoint_hi_inclusive, 1); - // Cancel the processor - cancel.cancel(); + // Shutdown the processor + svc.shutdown().await.unwrap(); - // Send second checkpoint after cancellation - data_tx.send(checkpoint2.clone()).await.unwrap(); + // Sending second checkpoint after shutdown should fail, because the data_rx channel is + // closed. + data_tx.send(checkpoint2.clone()).await.unwrap_err(); // Indexed channel is closed, and indexed_rx receives the last None result. 
let next_result = indexed_rx.recv().await; assert!( next_result.is_none(), - "Channel should be closed after cancellation" + "Channel should be closed after shutdown" ); - - // Clean up - let _ = handle.await; } #[tokio::test] @@ -369,10 +353,9 @@ mod tests { let (indexed_tx, mut indexed_rx) = mpsc::channel(2); let metrics = IndexerMetrics::new(None, &Default::default()); - let cancel = CancellationToken::new(); // Spawn the processor task - let handle = super::processor(processor, data_rx, indexed_tx, metrics, cancel.clone()); + let _svc = super::processor(processor, data_rx, indexed_tx, metrics); // Send and verify first checkpoint (should succeed immediately) data_tx.send(checkpoint1.clone()).await.unwrap(); @@ -393,10 +376,6 @@ mod tests { // Verify that exactly 3 attempts were made (2 failures + 1 success) assert_eq!(attempt_count.load(Ordering::Relaxed), 3); - - // Clean up - drop(data_tx); - let _ = handle.await; } // By default, Rust's async tests run on the single-threaded runtime. @@ -434,10 +413,9 @@ mod tests { let (data_tx, data_rx) = mpsc::channel(10); let (indexed_tx, mut indexed_rx) = mpsc::channel(10); let metrics = IndexerMetrics::new(None, &Default::default()); - let cancel = CancellationToken::new(); // Spawn processor task - let handle = super::processor(processor, data_rx, indexed_tx, metrics, cancel.clone()); + let _svc = super::processor(processor, data_rx, indexed_tx, metrics); // Send all checkpoints and measure time let start = std::time::Instant::now(); @@ -460,8 +438,5 @@ mod tests { // Verify results assert_eq!(received.len(), 5); - - // Clean up - let _ = handle.await; } } diff --git a/crates/sui-indexer-alt-framework/src/pipeline/sequential/committer.rs b/crates/sui-indexer-alt-framework/src/pipeline/sequential/committer.rs index 1deb7f33f6edd..859abc19cbd78 100644 --- a/crates/sui-indexer-alt-framework/src/pipeline/sequential/committer.rs +++ b/crates/sui-indexer-alt-framework/src/pipeline/sequential/committer.rs @@ -4,12 +4,11 @@ use std::{cmp::Ordering, collections::BTreeMap, sync::Arc}; use scoped_futures::ScopedFutureExt; +use sui_futures::service::Service; use tokio::{ sync::mpsc, - task::JoinHandle, time::{MissedTickBehavior, interval}, }; -use tokio_util::sync::CancellationToken; use tracing::{debug, info, warn}; use crate::{ @@ -36,8 +35,6 @@ use super::{Handler, SequentialConfig}; /// /// Upon successful write, the task sends its new watermark back to the ingestion service, to /// unblock its regulator. -/// -/// The task can be shutdown using its `cancel` token or if either of its channels are closed. pub(super) fn committer( handler: Arc, config: SequentialConfig, @@ -46,13 +43,12 @@ pub(super) fn committer( tx: mpsc::UnboundedSender<(&'static str, u64)>, store: H::Store, metrics: Arc, - cancel: CancellationToken, -) -> JoinHandle<()> +) -> Service where H: Handler + Send + Sync + 'static, H::Store: TransactionalStore + 'static, { - tokio::spawn(async move { + Service::new().spawn_aborting(async move { // The `poll` interval controls the maximum time to wait between commits, regardless of the // amount of data available. let mut poll = interval(config.committer.collect_interval()); @@ -71,6 +67,7 @@ where let mut batch = H::Batch::default(); let mut batch_rows = 0; let mut batch_checkpoints = 0; + // The task keeps track of the highest (inclusive) checkpoint it has added to the batch // through `next_checkpoint`, and whether that batch needs to be written out. By extension // it also knows the next checkpoint to expect and add to the batch. 
In case of db txn @@ -97,11 +94,6 @@ where loop { tokio::select! { - _ = cancel.cancelled() => { - info!(pipeline = H::NAME, "Shutdown received"); - break; - } - _ = poll.tick() => { if batch_checkpoints == 0 && rx.is_closed() @@ -369,6 +361,7 @@ where } info!(pipeline = H::NAME, "Stopping committer"); + Ok(()) }) } @@ -406,7 +399,6 @@ mod tests { use std::{sync::Arc, time::Duration}; use sui_types::full_checkpoint_content::Checkpoint; use tokio::sync::mpsc; - use tokio_util::sync::CancellationToken; // Test implementation of Handler #[derive(Default)] @@ -450,14 +442,14 @@ mod tests { store: MockStore, checkpoint_tx: mpsc::Sender>, commit_hi_rx: mpsc::UnboundedReceiver<(&'static str, u64)>, - committer_handle: JoinHandle<()>, + #[allow(unused)] + committer: Service, } /// Emulates adding a sequential pipeline to the indexer. The next_checkpoint is the checkpoint /// for the indexer to ingest from. fn setup_test(next_checkpoint: u64, config: SequentialConfig, store: MockStore) -> TestSetup { let metrics = IndexerMetrics::new(None, &Registry::default()); - let cancel = CancellationToken::new(); let (checkpoint_tx, checkpoint_rx) = mpsc::channel(10); #[allow(clippy::disallowed_methods)] @@ -465,7 +457,7 @@ mod tests { let store_clone = store.clone(); let handler = Arc::new(TestHandler); - let committer_handle = committer( + let committer = committer( handler, config, next_checkpoint, @@ -473,14 +465,13 @@ mod tests { commit_hi_tx, store_clone, metrics, - cancel, ); TestSetup { store, checkpoint_tx, commit_hi_rx, - committer_handle, + committer, } } @@ -532,10 +523,6 @@ mod tests { let commit_hi = setup.commit_hi_rx.recv().await.unwrap(); assert_eq!(commit_hi.0, "test", "Pipeline name should be 'test'"); assert_eq!(commit_hi.1, 3, "commit_hi should be 3 (checkpoint 2 + 1)"); - - // Clean up - drop(setup.checkpoint_tx); - let _ = setup.committer_handle.await; } /// Configure the MockStore with no watermark, and emulate `first_checkpoint` by passing the @@ -577,10 +564,6 @@ mod tests { assert_eq!(watermark.checkpoint_hi_inclusive, 7); assert_eq!(watermark.tx_hi, 7); } - - // Clean up - drop(setup.checkpoint_tx); - let _ = setup.committer_handle.await; } #[tokio::test] @@ -613,10 +596,6 @@ mod tests { let commit_hi = setup.commit_hi_rx.recv().await.unwrap(); assert_eq!(commit_hi.0, "test", "Pipeline name should be 'test'"); assert_eq!(commit_hi.1, 3, "commit_hi should be 3 (checkpoint 2 + 1)"); - - // Clean up - drop(setup.checkpoint_tx); - let _ = setup.committer_handle.await; } #[tokio::test] @@ -650,10 +629,6 @@ mod tests { // Verify data is written in order across batches assert_eq!(setup.store.get_sequential_data(), vec![0, 1, 2, 3]); - - // Clean up - drop(setup.checkpoint_tx); - let _ = setup.committer_handle.await; } #[tokio::test] @@ -687,10 +662,6 @@ mod tests { assert_eq!(setup.store.get_sequential_data(), vec![0, 1, 2]); let commit_hi = setup.commit_hi_rx.recv().await.unwrap(); assert_eq!(commit_hi.1, 3, "commit_hi should be 3 (checkpoint 2 + 1)"); - - // Clean up - drop(setup.checkpoint_tx); - let _ = setup.committer_handle.await; } #[tokio::test] @@ -723,10 +694,6 @@ mod tests { // Verify all checkpoints are written assert_eq!(setup.store.get_sequential_data(), vec![0, 1, 2, 3]); - - // Clean up - drop(setup.checkpoint_tx); - let _ = setup.committer_handle.await; } #[tokio::test] @@ -762,10 +729,6 @@ mod tests { // Verify only checkpoint 0 is written (since it's the only one that satisfies checkpoint_lag) assert_eq!(setup.store.get_sequential_data(), vec![0]); - - // Clean up - 
drop(setup.checkpoint_tx); - let _ = setup.committer_handle.await; } #[tokio::test] @@ -805,9 +768,5 @@ mod tests { commit_hi.1, 11, "commit_hi should be 11 (checkpoint 10 + 1)" ); - - // Clean up - drop(setup.checkpoint_tx); - let _ = setup.committer_handle.await; } } diff --git a/crates/sui-indexer-alt-framework/src/pipeline/sequential/mod.rs b/crates/sui-indexer-alt-framework/src/pipeline/sequential/mod.rs index c0c0a319bba88..6bf848dbfc812 100644 --- a/crates/sui-indexer-alt-framework/src/pipeline/sequential/mod.rs +++ b/crates/sui-indexer-alt-framework/src/pipeline/sequential/mod.rs @@ -4,8 +4,9 @@ use std::sync::Arc; use serde::{Deserialize, Serialize}; -use tokio::{sync::mpsc, task::JoinHandle}; -use tokio_util::sync::CancellationToken; +use sui_futures::service::Service; +use tokio::sync::mpsc; +use tracing::info; use super::{CommitterConfig, PIPELINE_BUFFER, Processor, processor::processor}; @@ -75,7 +76,7 @@ pub trait Handler: Processor { } /// Configuration for a sequential pipeline -#[derive(Serialize, Deserialize, Clone, Default)] +#[derive(Serialize, Deserialize, Debug, Clone, Default)] pub struct SequentialConfig { /// Configuration for the writer, that makes forward progress. pub committer: CommitterConfig, @@ -106,9 +107,9 @@ pub struct SequentialConfig { /// /// Checkpoint data is fed into the pipeline through the `checkpoint_rx` channel, watermark updates /// are communicated to the ingestion service through the `watermark_tx` channel and internal -/// channels are created to communicate between its various components. The pipeline can be -/// shutdown using its `cancel` token, and will also shutdown if any of its input or output -/// channels close, or any of its independent tasks fail. +/// channels are created to communicate between its various components. The pipeline will shutdown +/// if any of its input or output channels close, any of its independent tasks fail, or if it is +/// signalled to shutdown through the returned service handle. 
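// Illustrative note (not patch content): the sequential pipeline now returns a single sui_futures
// Service handle built as `s_processor.merge(s_committer)`, mirroring the concurrent pipeline
// above, which merges its channel-connected tasks and `attach`es the auxiliary watermark and
// pruner tasks. Judging by the call sites in this diff, `merge` couples peer tasks into one unit
// of work, while `attach` ties a helper task's lifetime to the primary one; tests now shut the
// whole thing down explicitly via `service.shutdown().await` instead of cancelling a token and
// awaiting a JoinHandle.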
pub(crate) fn pipeline( handler: H, next_checkpoint: u64, @@ -117,21 +118,24 @@ pub(crate) fn pipeline( checkpoint_rx: mpsc::Receiver>, watermark_tx: mpsc::UnboundedSender<(&'static str, u64)>, metrics: Arc, - cancel: CancellationToken, -) -> JoinHandle<()> { +) -> Service { + info!( + pipeline = H::NAME, + "Starting pipeline with config: {config:#?}", + ); + let (processor_tx, committer_rx) = mpsc::channel(H::FANOUT + PIPELINE_BUFFER); let handler = Arc::new(handler); - let processor = processor( + let s_processor = processor( handler.clone(), checkpoint_rx, processor_tx, metrics.clone(), - cancel.clone(), ); - let committer = committer::( + let s_committer = committer::( handler, config, next_checkpoint, @@ -139,10 +143,7 @@ pub(crate) fn pipeline( watermark_tx, db, metrics.clone(), - cancel.clone(), ); - tokio::spawn(async move { - let (_, _) = futures::join!(processor, committer); - }) + s_processor.merge(s_committer) } diff --git a/crates/sui-indexer-alt-framework/src/postgres/mod.rs b/crates/sui-indexer-alt-framework/src/postgres/mod.rs index b12ff451bde3a..f0c2a42f5a60e 100644 --- a/crates/sui-indexer-alt-framework/src/postgres/mod.rs +++ b/crates/sui-indexer-alt-framework/src/postgres/mod.rs @@ -7,7 +7,6 @@ use prometheus::Registry; use sui_indexer_alt_metrics::db::DbConnectionStatsCollector; use sui_pg_db::temp::TempDb; use tempfile::tempdir; -use tokio_util::sync::CancellationToken; use url::Url; use crate::{ @@ -46,7 +45,6 @@ impl Indexer { migrations: Option<&'static EmbeddedMigrations>, metrics_prefix: Option<&str>, registry: &Registry, - cancel: CancellationToken, ) -> Result { let store = Db::for_write(database_url, db_args) // I guess our store needs a constructor fn .await @@ -70,7 +68,6 @@ impl Indexer { ingestion_config, metrics_prefix, registry, - cancel, ) .await } @@ -98,7 +95,6 @@ impl Indexer { IngestionConfig::default(), None, &Registry::new(), - CancellationToken::new(), ) .await .unwrap(); diff --git a/crates/sui-indexer-alt-graphql/Cargo.toml b/crates/sui-indexer-alt-graphql/Cargo.toml index e16b7fa410f98..f6a6bfbd125c1 100644 --- a/crates/sui-indexer-alt-graphql/Cargo.toml +++ b/crates/sui-indexer-alt-graphql/Cargo.toml @@ -44,7 +44,6 @@ shared-crypto.workspace = true telemetry-subscribers.workspace = true thiserror.workspace = true tokio.workspace = true -tokio-util.workspace = true toml.workspace = true tonic.workspace = true tower-http.workspace = true @@ -60,6 +59,7 @@ move-ir-types.workspace = true bin-version.workspace = true sui-default-config.workspace = true sui-display.workspace = true +sui-futures.workspace = true sui-indexer-alt-metrics.workspace = true sui-indexer-alt-reader.workspace = true sui-indexer-alt-schema.workspace = true diff --git a/crates/sui-indexer-alt-graphql/src/api/query.rs b/crates/sui-indexer-alt-graphql/src/api/query.rs index 7b25ed3c3f8fb..401a594b5837f 100644 --- a/crates/sui-indexer-alt-graphql/src/api/query.rs +++ b/crates/sui-indexer-alt-graphql/src/api/query.rs @@ -6,7 +6,6 @@ use async_graphql::{Context, Object, Result, connection::Connection}; use futures::future::try_join_all; use sui_indexer_alt_reader::fullnode_client::{Error::GrpcExecutionError, FullnodeClient}; use sui_rpc::proto::sui::rpc::v2 as proto; -use sui_types::digests::ChainIdentifier; use crate::{ api::{ @@ -20,6 +19,7 @@ use crate::{ error::{RpcError, bad_user_input, upcast}, pagination::{Page, PaginationConfig}, scope::Scope, + task::chain_identifier::ChainIdentifier, }; use super::{ @@ -82,8 +82,8 @@ impl Query { /// First four bytes of the network's 
genesis checkpoint digest (uniquely identifies the network), hex-encoded. async fn chain_identifier(&self, ctx: &Context<'_>) -> Result { - let chain_id: ChainIdentifier = *ctx.data()?; - Ok(chain_id.to_string()) + let chain_id: &ChainIdentifier = ctx.data()?; + Ok(chain_id.wait().await.to_string()) } /// Fetch a checkpoint by its sequence number, or the latest checkpoint if no sequence number is provided. diff --git a/crates/sui-indexer-alt-graphql/src/lib.rs b/crates/sui-indexer-alt-graphql/src/lib.rs index 2377b88031319..0666388f47725 100644 --- a/crates/sui-indexer-alt-graphql/src/lib.rs +++ b/crates/sui-indexer-alt-graphql/src/lib.rs @@ -28,6 +28,7 @@ use extensions::{ use headers::ContentLength; use health::DbProbe; use prometheus::Registry; +use sui_futures::service::Service; use sui_indexer_alt_reader::pg_reader::db::DbArgs; use sui_indexer_alt_reader::system_package_task::{SystemPackageTask, SystemPackageTaskArgs}; use sui_indexer_alt_reader::{ @@ -43,10 +44,9 @@ use task::{ chain_identifier, watermark::{WatermarkTask, WatermarksLock}, }; -use tokio::{net::TcpListener, task::JoinHandle}; -use tokio_util::sync::CancellationToken; +use tokio::{net::TcpListener, sync::oneshot}; use tower_http::cors; -use tracing::{error, info}; +use tracing::info; use url::Url; use crate::api::{mutation::Mutation, query::Query}; @@ -99,9 +99,6 @@ pub struct RpcService { /// Metrics for the RPC service. metrics: Arc, - - /// Cancellation token controls lifecycle of all RPC-related services. - cancel: CancellationToken, } impl RpcService @@ -115,7 +112,6 @@ where version: &'static str, schema: SchemaBuilder, registry: &Registry, - cancel: CancellationToken, ) -> Self { let RpcArgs { rpc_listen_address, @@ -135,7 +131,6 @@ where router, schema, metrics, - cancel, } } @@ -173,7 +168,7 @@ where /// Run the RPC service. This binds the listener and exposes handlers for the RPC service and IDE /// (if enabled). - pub async fn run(self) -> anyhow::Result> + pub async fn run(self) -> anyhow::Result where Q: ObjectType + 'static, M: ObjectType + 'static, @@ -186,7 +181,6 @@ where mut router, schema, metrics: _, - cancel, } = self; if with_ide { @@ -214,24 +208,23 @@ where .await .context("Failed to bind GraphQL to listen address")?; - let service = axum::serve( - listener, - router.into_make_service_with_connect_info::(), - ) - .with_graceful_shutdown({ - let cancel = cancel.clone(); - async move { - cancel.cancelled().await; - info!("Shutdown received, shutting down GraphQL service"); - } - }); - - Ok(tokio::spawn(async move { - if let Err(e) = service.await.context("Failed to start GraphQL service") { - error!("Failed to start GraphQL service: {e:?}"); - cancel.cancel(); - } - })) + let (stx, srx) = oneshot::channel::<()>(); + Ok(Service::new() + .with_shutdown_signal(async move { + let _ = stx.send(()); + }) + .spawn(async move { + axum::serve( + listener, + router.into_make_service_with_connect_info::(), + ) + .with_graceful_shutdown(async move { + let _ = srx.await; + info!("Shutdown received, shutting down GraphQL service"); + }) + .await + .context("Failed to start GraphQL service") + })) } } @@ -254,8 +247,7 @@ pub fn schema() -> SchemaBuilder { } /// Set-up and run the RPC service, using the provided arguments (expected to be extracted from the -/// command-line). The service will continue to run until the cancellation token is triggered, and -/// will signal cancellation on the token when it is shutting down. +/// command-line). 
/// /// Access to most reads is controlled by the `database_url` -- if it is `None`, those reads will /// not work. KV queries can optionally be served by a Bigtable instance or Ledger gRPC service @@ -279,28 +271,16 @@ pub async fn start_rpc( config: RpcConfig, pg_pipelines: Vec, registry: &Registry, - cancel: CancellationToken, -) -> anyhow::Result> { - let rpc = RpcService::new(args, version, schema(), registry, cancel.child_token()); +) -> anyhow::Result { + let rpc = RpcService::new(args, version, schema(), registry); let metrics = rpc.metrics(); // Create gRPC full node client wrapper - let fullnode_client = FullnodeClient::new( - Some("graphql_fullnode"), - fullnode_args, - registry, - cancel.child_token(), - ) - .await?; - - let pg_reader = PgReader::new( - Some("graphql_db"), - database_url.clone(), - db_args, - registry, - cancel.child_token(), - ) - .await?; + let fullnode_client = + FullnodeClient::new(Some("graphql_fullnode"), fullnode_args, registry).await?; + + let pg_reader = + PgReader::new(Some("graphql_db"), database_url.clone(), db_args, registry).await?; let bigtable_reader = if let Some(instance_id) = kv_args.bigtable_instance.as_ref() { let reader = BigtableReader::new( @@ -325,13 +305,8 @@ pub async fn start_rpc( None }; - let consistent_reader = ConsistentReader::new( - Some("graphql_consistent"), - consistent_reader_args, - registry, - cancel.child_token(), - ) - .await?; + let consistent_reader = + ConsistentReader::new(Some("graphql_consistent"), consistent_reader_args, registry).await?; let pg_loader = Arc::new(pg_reader.as_data_loader()); let kv_loader = if let Some(reader) = bigtable_reader.as_ref() { @@ -348,16 +323,13 @@ pub async fn start_rpc( system_package_task_args, pg_reader.clone(), package_store.clone(), - cancel.child_token(), ); // Fetch and cache the chain identifier from the database. - let chain_identifier = chain_identifier::task( - &pg_reader, + let (chain_identifier, s_chain_id) = chain_identifier::task( + pg_reader.clone(), config.watermark.watermark_polling_interval, - cancel.child_token(), - ) - .await?; + ); let watermark_task = WatermarkTask::new( config.watermark, @@ -367,7 +339,6 @@ pub async fn start_rpc( ledger_grpc_reader, consistent_reader.clone(), metrics.clone(), - cancel.child_token(), ); let rpc = rpc @@ -393,16 +364,14 @@ pub async fn start_rpc( .data(package_store) .data(fullnode_client); - let h_rpc = rpc.run().await?; - let h_system_package_task = system_package_task.run(); - let h_watermark = watermark_task.run(); + let s_rpc = rpc.run().await?; + let s_system_package_task = system_package_task.run(); + let s_watermark = watermark_task.run(); - Ok(tokio::spawn(async move { - let _ = h_rpc.await; - cancel.cancel(); - let _ = h_system_package_task.await; - let _ = h_watermark.await; - })) + Ok(s_rpc + .attach(s_chain_id) + .attach(s_system_package_task) + .attach(s_watermark)) } /// Handler for RPC requests (POST requests making GraphQL queries). 
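// Illustrative sketch (not patch content): start_rpc now hands back a single composed service
// handle instead of a JoinHandle plus a shared CancellationToken. From the match arms in the
// main() changes that follow, the Error variants are taken to mean: Terminated = asked to stop
// and stopped cleanly, Aborted = a task was torn down before completing, Task(_) = a task
// returned an error. A caller's tail end therefore looks roughly like this (exit codes mirror
// the binaries changed in this patch):
//
//     match s_rpc.attach(s_metrics).main().await {
//         Ok(()) | Err(Error::Terminated) => { /* clean shutdown */ }
//         Err(Error::Aborted) => std::process::exit(1),
//         Err(Error::Task(_)) => std::process::exit(2),
//     }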
diff --git a/crates/sui-indexer-alt-graphql/src/main.rs b/crates/sui-indexer-alt-graphql/src/main.rs index 303f3ebc1dc3e..1d8e9eb1272a0 100644 --- a/crates/sui-indexer-alt-graphql/src/main.rs +++ b/crates/sui-indexer-alt-graphql/src/main.rs @@ -4,15 +4,14 @@ use anyhow::Context; use clap::Parser; use prometheus::Registry; +use sui_futures::service::Error; use sui_indexer_alt_graphql::{ args::{Args, Command}, config::{IndexerConfig, RpcLayer}, start_rpc, }; use sui_indexer_alt_metrics::{MetricsService, uptime}; -use tokio::{fs, signal}; -use tokio_util::sync::CancellationToken; -use tracing::info; +use tokio::fs; // Define the `GIT_REVISION` const bin_version::git_revision!(); @@ -75,32 +74,17 @@ async fn main() -> anyhow::Result<()> { pg_pipelines.extend(config.pipelines().map(|p| p.to_owned())); } - let cancel = CancellationToken::new(); - let registry = Registry::new_custom(Some("graphql_alt".into()), None) .context("Failed to create Prometheus registry.")?; - let metrics = MetricsService::new(metrics_args, registry, cancel.child_token()); - - let h_ctrl_c = tokio::spawn({ - let cancel = cancel.clone(); - async move { - tokio::select! { - _ = cancel.cancelled() => {} - _ = signal::ctrl_c() => { - info!("Received Ctrl-C, shutting down..."); - cancel.cancel(); - } - } - } - }); + let metrics = MetricsService::new(metrics_args, registry); metrics .registry() .register(uptime(VERSION)?) .context("Failed to register uptime metric.")?; - let h_rpc = start_rpc( + let s_rpc = start_rpc( Some(database_url), fullnode_args, db_args, @@ -112,16 +96,22 @@ async fn main() -> anyhow::Result<()> { rpc_config, pg_pipelines, metrics.registry(), - cancel.child_token(), ) .await?; - let h_metrics = metrics.run().await?; + let s_metrics = metrics.run().await?; - let _ = h_rpc.await; - cancel.cancel(); - let _ = h_metrics.await; - let _ = h_ctrl_c.await; + match s_rpc.attach(s_metrics).main().await { + Ok(()) | Err(Error::Terminated) => {} + + Err(Error::Aborted) => { + std::process::exit(1); + } + + Err(Error::Task(_)) => { + std::process::exit(2); + } + } } Command::GenerateConfig => { diff --git a/crates/sui-indexer-alt-graphql/src/task/chain_identifier.rs b/crates/sui-indexer-alt-graphql/src/task/chain_identifier.rs index 10497be5643b6..7da1f493ef73b 100644 --- a/crates/sui-indexer-alt-graphql/src/task/chain_identifier.rs +++ b/crates/sui-indexer-alt-graphql/src/task/chain_identifier.rs @@ -1,62 +1,84 @@ // Copyright (c) Mysten Labs, Inc. // SPDX-License-Identifier: Apache-2.0 -use std::time::Duration; +use std::{sync::Arc, time::Duration}; -use anyhow::{Context, bail}; +use anyhow::Context; use diesel::QueryDsl; +use sui_futures::service::Service; use sui_indexer_alt_reader::pg_reader::{Connection, PgReader}; use sui_indexer_alt_schema::schema::kv_genesis; -use sui_types::digests::{ChainIdentifier, CheckpointDigest}; -use tokio::time; -use tokio_util::sync::CancellationToken; +use sui_types::digests::{ChainIdentifier as NativeChainIdentifier, CheckpointDigest}; +use tokio::{sync::SetOnce, time}; use tracing::warn; -/// Repeatedly try to fetch the chain identifier from the database at a given interval, eventually -/// returning it, or an error if the task was cancelled before it had a chance to complete. -/// If no database is available, returns a default ChainIdentifier immediately. 
-pub(crate) async fn task(
-    pg_reader: &PgReader,
-    interval: Duration,
-    cancel: CancellationToken,
-) -> anyhow::Result<ChainIdentifier> {
-    // If no database is available, return default chain identifier
-    if !pg_reader.has_database() {
-        return Ok(ChainIdentifier::default());
+#[derive(Clone, Default)]
+pub(crate) struct ChainIdentifier(Arc<SetOnce<NativeChainIdentifier>>);
+
+impl ChainIdentifier {
+    /// Wait for the chain identifier to be set, and return it.
+    pub(crate) async fn wait(&self) -> NativeChainIdentifier {
+        *self.0.wait().await
+    }
+
+    // Set the chain identifier.
+    fn set(&self, chain: NativeChainIdentifier) -> anyhow::Result<()> {
+        self.0.set(chain).context("Chain Identifier already set")
     }
+}

-    let mut interval = time::interval(interval);
+/// Repeatedly try to fetch the chain identifier from the database at a given interval, eventually
+/// setting it in a `SetOnce` construct. If no database is available, returns a default
+/// ChainIdentifier immediately.
+///
+/// Returns the container that is populated with the chain identifier once it has been fetched, and
+/// a handle to the service performing the fetching.
+pub(crate) fn task(pg_reader: PgReader, interval: Duration) -> (ChainIdentifier, Service) {
+    let crx = ChainIdentifier::default();
+    let ctx = crx.clone();

-    loop {
-        tokio::select! {
-            _ = cancel.cancelled() => {
-                bail!("Shutdown signal received, terminating chain identifier task");
-            }
+    let svc = Service::new().spawn_aborting(async move {
+        if !pg_reader.has_database() {
+            ctx.set(NativeChainIdentifier::default())
+                .context("Failed to set chain identifier")
+                .expect("SAFETY: Only this task sets the chain identifier, immediately before returning.");
+            return Ok(());
+        }

-            _ = interval.tick() => {
-                let mut conn = match pg_reader.connect().await {
-                    Ok(conn) => conn,
-                    Err(e) => {
-                        warn!("Failed to connect to database: {e}");
-                        continue;
-                    }
-                };
-
-                match fetch(&mut conn).await {
-                    Ok(chain_identifier) => return Ok(chain_identifier),
-                    Err(e) => {
-                        warn!("Failed to fetch chain identifier: {e}");
-                        continue;
-                    }
+        let mut interval = time::interval(interval);
+
+        loop {
+            interval.tick().await;
+
+            let mut conn = match pg_reader.connect().await {
+                Ok(conn) => conn,
+                Err(e) => {
+                    warn!("Failed to connect to database: {e}");
+                    continue;
+                }
+            };
+
+            match fetch(&mut conn).await {
+                Ok(chain) => {
+                    ctx.set(chain)
+                        .expect("SAFETY: Only this task sets the chain identifier, immediately before returning.");
+                    return Ok(());
+                }
+
+                Err(e) => {
+                    warn!("Failed to fetch chain identifier: {e}");
+                    continue;
                 }
             }
         }
-    }
+    });
+
+    (crx, svc)
 }

 /// Try to fetch the chain identifier from the database. Fails if the genesis checkpoint digest has
 /// not been written to the database yet, or if it cannot be deserialized as a checkpoint digest.
-async fn fetch(conn: &mut Connection<'_>) -> anyhow::Result { +async fn fetch(conn: &mut Connection<'_>) -> anyhow::Result { use kv_genesis::dsl as g; let digest_bytes: Vec<_> = conn @@ -67,5 +89,5 @@ async fn fetch(conn: &mut Connection<'_>) -> anyhow::Result { let digest = CheckpointDigest::try_from(digest_bytes).context("Failed to deserialize genesis digest")?; - Ok(ChainIdentifier::from(digest)) + Ok(NativeChainIdentifier::from(digest)) } diff --git a/crates/sui-indexer-alt-graphql/src/task/watermark.rs b/crates/sui-indexer-alt-graphql/src/task/watermark.rs index 4ba68c6249522..487cee9e62a49 100644 --- a/crates/sui-indexer-alt-graphql/src/task/watermark.rs +++ b/crates/sui-indexer-alt-graphql/src/task/watermark.rs @@ -14,6 +14,7 @@ use diesel::{ sql_types::{BigInt, Text}, }; use futures::future::OptionFuture; +use sui_futures::service::Service; use sui_indexer_alt_reader::{ bigtable_reader::BigtableReader, consistent_reader::{self, ConsistentReader, proto::AvailableRangeResponse}, @@ -21,9 +22,8 @@ use sui_indexer_alt_reader::{ pg_reader::PgReader, }; use sui_sql_macro::query; -use tokio::{sync::RwLock, task::JoinHandle, time}; -use tokio_util::sync::CancellationToken; -use tracing::{debug, info, warn}; +use tokio::{sync::RwLock, time}; +use tracing::{debug, warn}; use crate::{ api::types::available_range::pipeline_unavailable, config::WatermarkConfig, error::RpcError, @@ -60,9 +60,6 @@ pub(crate) struct WatermarkTask { /// Access to metrics to report watermark updates. metrics: Arc, - - /// Signal to cancel the task. - cancel: CancellationToken, } /// Snapshot of current watermarks. The upperbound is global across all pipelines, and the @@ -125,7 +122,6 @@ impl WatermarkTask { ledger_grpc_reader: Option, consistent_reader: ConsistentReader, metrics: Arc, - cancel: CancellationToken, ) -> Self { let WatermarkConfig { watermark_polling_interval, @@ -140,7 +136,6 @@ impl WatermarkTask { interval: watermark_polling_interval, pg_pipelines, metrics, - cancel, } } @@ -150,11 +145,8 @@ impl WatermarkTask { } /// Start a new task that regularly polls the database for watermarks. - /// - /// This operation consume the `self` and returns a handle to the spawned tokio task. The task - /// will continue to run until its cancellation token is triggered. - pub(crate) fn run(self) -> JoinHandle<()> { - tokio::spawn(async move { + pub(crate) fn run(self) -> Service { + Service::new().spawn_aborting(async move { let Self { watermarks, pg_reader, @@ -164,60 +156,50 @@ impl WatermarkTask { interval, pg_pipelines, metrics, - cancel, } = self; let mut interval = time::interval(interval); loop { - tokio::select! 
{ - biased; + interval.tick().await; - _ = cancel.cancelled() => { - info!("Shutdown signal received, terminating watermark task"); - break; + let rows = match WatermarkRow::read(&pg_reader, bigtable_reader.as_ref(), ledger_grpc_reader.as_ref(), &pg_pipelines).await { + Ok(rows) => rows, + Err(e) => { + warn!("Failed to read watermarks: {e:#}"); + continue; } + }; - _ = interval.tick() => { - let rows = match WatermarkRow::read(&pg_reader, bigtable_reader.as_ref(), ledger_grpc_reader.as_ref(), &pg_pipelines).await { - Ok(rows) => rows, - Err(e) => { - warn!("Failed to read watermarks: {e:#}"); - continue; - } - }; - - let mut w = Watermarks::default(); - for row in rows { - row.record_metrics(&metrics); - w.merge(row); - } - - match watermark_from_consistent(&consistent_reader, w.global_hi.checkpoint as u64).await { - Ok(None) => {} - Ok(Some(consistent_row)) => { - // Merge the consistent store watermark - consistent_row.record_metrics(&metrics); - w.merge(consistent_row); - } - - Err(e) => { - warn!("Failed to get consistent store watermark: {e:#}"); - continue; - } - }; - - debug!( - epoch = w.global_hi.epoch, - checkpoint = w.global_hi.checkpoint, - transaction = w.global_hi.transaction, - timestamp = ?DateTime::from_timestamp_millis(w.timestamp_ms_hi_inclusive).unwrap_or_default(), - "Watermark updated" - ); - - *watermarks.write().await = Arc::new(w); - } + let mut w = Watermarks::default(); + for row in rows { + row.record_metrics(&metrics); + w.merge(row); } + + match watermark_from_consistent(&consistent_reader, w.global_hi.checkpoint as u64).await { + Ok(None) => {} + Ok(Some(consistent_row)) => { + // Merge the consistent store watermark + consistent_row.record_metrics(&metrics); + w.merge(consistent_row); + } + + Err(e) => { + warn!("Failed to get consistent store watermark: {e:#}"); + continue; + } + }; + + debug!( + epoch = w.global_hi.epoch, + checkpoint = w.global_hi.checkpoint, + transaction = w.global_hi.transaction, + timestamp = ?DateTime::from_timestamp_millis(w.timestamp_ms_hi_inclusive).unwrap_or_default(), + "Watermark updated" + ); + + *watermarks.write().await = Arc::new(w); } }) } diff --git a/crates/sui-indexer-alt-jsonrpc/Cargo.toml b/crates/sui-indexer-alt-jsonrpc/Cargo.toml index cab4344cc0d43..02368e66bc510 100644 --- a/crates/sui-indexer-alt-jsonrpc/Cargo.toml +++ b/crates/sui-indexer-alt-jsonrpc/Cargo.toml @@ -38,7 +38,6 @@ serde_with.workspace = true telemetry-subscribers.workspace = true thiserror.workspace = true tokio.workspace = true -tokio-util.workspace = true toml.workspace = true tower-layer.workspace = true # tower.workspace = true @@ -52,6 +51,7 @@ move-core-types.workspace = true sui-default-config.workspace = true sui-display.workspace = true +sui-futures.workspace = true sui-indexer-alt-metrics.workspace = true sui-indexer-alt-reader.workspace = true sui-indexer-alt-schema.workspace = true diff --git a/crates/sui-indexer-alt-jsonrpc/src/context.rs b/crates/sui-indexer-alt-jsonrpc/src/context.rs index 80fb446339de2..7f58d8672cd18 100644 --- a/crates/sui-indexer-alt-jsonrpc/src/context.rs +++ b/crates/sui-indexer-alt-jsonrpc/src/context.rs @@ -13,7 +13,6 @@ use sui_indexer_alt_reader::{ pg_reader::db::DbArgs, }; use sui_package_resolver::Resolver; -use tokio_util::sync::CancellationToken; use url::Url; use crate::{config::RpcConfig, metrics::RpcMetrics}; @@ -57,9 +56,8 @@ impl Context { config: RpcConfig, metrics: Arc, registry: &Registry, - cancel: CancellationToken, ) -> Result { - let pg_reader = PgReader::new(None, database_url, db_args, 
registry, cancel).await?; + let pg_reader = PgReader::new(None, database_url, db_args, registry).await?; let pg_loader = Arc::new(pg_reader.as_data_loader()); let kv_loader = if let Some(instance_id) = bigtable_instance { diff --git a/crates/sui-indexer-alt-jsonrpc/src/lib.rs b/crates/sui-indexer-alt-jsonrpc/src/lib.rs index 3f8cd7e71e70b..1fc40c917681a 100644 --- a/crates/sui-indexer-alt-jsonrpc/src/lib.rs +++ b/crates/sui-indexer-alt-jsonrpc/src/lib.rs @@ -21,13 +21,12 @@ use metrics::RpcMetrics; use metrics::middleware::MetricsLayer; use prometheus::Registry; use serde_json::json; +use sui_futures::service::Service; use sui_indexer_alt_reader::bigtable_reader::BigtableArgs; use sui_indexer_alt_reader::pg_reader::db::DbArgs; use sui_indexer_alt_reader::system_package_task::{SystemPackageTask, SystemPackageTaskArgs}; use sui_open_rpc::Project; use timeout::TimeoutLayer; -use tokio::task::JoinHandle; -use tokio_util::sync::CancellationToken; use tower_layer::Identity; use tracing::{info, warn}; use url::Url; @@ -88,9 +87,6 @@ pub struct RpcService { /// Description of the schema served by this service. schema: Project, - - /// Cancellation token controlling all services. - cancel: CancellationToken, } impl RpcArgs { @@ -109,11 +105,7 @@ impl RpcArgs { impl RpcService { /// Create a new instance of the JSON-RPC service, configured by `rpc_args`. The service will /// not accept connections until [Self::run] is called. - pub fn new( - rpc_args: RpcArgs, - registry: &Registry, - cancel: CancellationToken, - ) -> anyhow::Result { + pub fn new(rpc_args: RpcArgs, registry: &Registry) -> anyhow::Result { let metrics = RpcMetrics::new(registry); let server = ServerBuilder::new() @@ -143,7 +135,6 @@ impl RpcService { slow_request_threshold: rpc_args.slow_request_threshold(), modules: jsonrpsee::RpcModule::new(()), schema, - cancel, }) } @@ -161,9 +152,9 @@ impl RpcService { .context("Failed to add module because of a name conflict") } - /// Start the service (it will accept connections) and return a handle that will resolve when - /// the service stops. - pub async fn run(self) -> anyhow::Result> { + /// Start the service (it will accept connections) and return a handle that tracks the + /// lifecycle of the service. + pub async fn run(self) -> anyhow::Result { let Self { rpc_listen_address, server, @@ -172,7 +163,6 @@ impl RpcService { slow_request_threshold, mut modules, schema, - cancel, } = self; info!("Starting JSON-RPC service on {rpc_listen_address}",); @@ -206,20 +196,15 @@ impl RpcService { .context("Failed to bind JSON-RPC service")? .start(modules); - // Set-up a helper task that will tear down the RPC service when the cancellation token is - // triggered. - let cancel_handle = handle.clone(); - let cancel_cancel = cancel.clone(); - let h_cancel = tokio::spawn(async move { - cancel_cancel.cancelled().await; - cancel_handle.stop() - }); - - Ok(tokio::spawn(async move { - handle.stopped().await; - cancel.cancel(); - let _ = h_cancel.await; - })) + let signal = handle.clone(); + Ok(Service::new() + .with_shutdown_signal(async move { + let _ = signal.stop(); + }) + .spawn(async move { + handle.stopped().await; + Ok(()) + })) } } @@ -243,8 +228,7 @@ pub struct NodeArgs { } /// Set-up and run the RPC service, using the provided arguments (expected to be extracted from the -/// command-line). The service will continue to run until the cancellation token is triggered, and -/// will signal cancellation on the token when it is shutting down. +/// command-line). 
/// /// Access to most reads is controlled by the `database_url` -- if it is `None`, reads will not work. /// The only exceptions are the `DelegationCoins` and `DelegationGovernance` modules, which are controlled @@ -269,10 +253,8 @@ pub async fn start_rpc( system_package_task_args: SystemPackageTaskArgs, rpc_config: RpcConfig, registry: &Registry, - cancel: CancellationToken, -) -> anyhow::Result> { - let mut rpc = RpcService::new(rpc_args, registry, cancel.child_token()) - .context("Failed to create RPC service")?; +) -> anyhow::Result { + let mut rpc = RpcService::new(rpc_args, registry).context("Failed to create RPC service")?; let context = Context::new( database_url, @@ -282,7 +264,6 @@ pub async fn start_rpc( rpc_config, rpc.metrics(), registry, - cancel.child_token(), ) .await?; @@ -290,7 +271,6 @@ pub async fn start_rpc( system_package_task_args, context.pg_reader().clone(), context.package_resolver().package_store().clone(), - cancel.child_token(), ); rpc.add_module(Checkpoints(context.clone()))?; @@ -320,14 +300,10 @@ pub async fn start_rpc( ); } - let h_rpc = rpc.run().await.context("Failed to start RPC service")?; - let h_system_package_task = system_package_task.run(); + let s_rpc = rpc.run().await.context("Failed to start RPC service")?; + let s_system_package_task = system_package_task.run(); - Ok(tokio::spawn(async move { - let _ = h_rpc.await; - cancel.cancel(); - let _ = h_system_package_task.await; - })) + Ok(s_rpc.attach(s_system_package_task)) } #[cfg(test)] @@ -394,21 +370,10 @@ mod tests { #[tokio::test] async fn test_graceful_shutdown() { - let cancel = CancellationToken::new(); - let rpc = RpcService::new( - RpcArgs { - rpc_listen_address: test_listen_address(), - ..Default::default() - }, - &Registry::new(), - cancel.clone(), - ) - .unwrap(); + let rpc = test_service().await; + let svc = rpc.run().await.unwrap(); - let handle = rpc.run().await.unwrap(); - - cancel.cancel(); - tokio::time::timeout(Duration::from_millis(500), handle) + tokio::time::timeout(Duration::from_millis(500), svc.shutdown()) .await .expect("Shutdown should not timeout") .expect("Shutdown should succeed"); @@ -416,25 +381,22 @@ mod tests { #[tokio::test] async fn test_rpc_discovery() { - let cancel = CancellationToken::new(); let rpc_listen_address = test_listen_address(); - let mut rpc = RpcService::new( RpcArgs { rpc_listen_address, ..Default::default() }, &Registry::new(), - cancel.clone(), ) .unwrap(); rpc.add_module(Foo).unwrap(); rpc.add_module(Baz).unwrap(); - let handle = rpc.run().await.unwrap(); + let svc = rpc.run().await.unwrap(); - let url = format!("http://{}/", rpc_listen_address); + let url = format!("http://{rpc_listen_address}/"); let client = Client::new(); let resp: Value = client @@ -490,8 +452,7 @@ mod tests { ]) ); - cancel.cancel(); - tokio::time::timeout(Duration::from_millis(500), handle) + tokio::time::timeout(Duration::from_millis(500), svc.shutdown()) .await .expect("Shutdown should not timeout") .expect("Shutdown should succeed"); @@ -499,25 +460,22 @@ mod tests { #[tokio::test] async fn test_request_metrics() { - let cancel = CancellationToken::new(); let rpc_listen_address = test_listen_address(); - let mut rpc = RpcService::new( RpcArgs { rpc_listen_address, ..Default::default() }, &Registry::new(), - cancel.clone(), ) .unwrap(); rpc.add_module(Foo).unwrap(); let metrics = rpc.metrics(); - let handle = rpc.run().await.unwrap(); + let svc = rpc.run().await.unwrap(); - let url = format!("http://{}/", rpc_listen_address); + let url = 
format!("http://{rpc_listen_address}/"); let client = Client::new(); client @@ -582,8 +540,7 @@ mod tests { 1 ); - cancel.cancel(); - tokio::time::timeout(Duration::from_millis(500), handle) + tokio::time::timeout(Duration::from_millis(500), svc.shutdown()) .await .expect("Shutdown should not timeout") .expect("Shutdown should succeed"); @@ -677,14 +634,12 @@ mod tests { } async fn test_service() -> RpcService { - let cancel = CancellationToken::new(); RpcService::new( RpcArgs { rpc_listen_address: test_listen_address(), ..Default::default() }, &Registry::new(), - cancel, ) .expect("Failed to create test JSON-RPC service") } diff --git a/crates/sui-indexer-alt-jsonrpc/src/main.rs b/crates/sui-indexer-alt-jsonrpc/src/main.rs index b92b31ab48945..357d5f86dd8a1 100644 --- a/crates/sui-indexer-alt-jsonrpc/src/main.rs +++ b/crates/sui-indexer-alt-jsonrpc/src/main.rs @@ -4,15 +4,14 @@ use anyhow::Context; use clap::Parser; use prometheus::Registry; +use sui_futures::service::Error; use sui_indexer_alt_jsonrpc::{ args::{Args, Command}, config::RpcLayer, start_rpc, }; use sui_indexer_alt_metrics::{MetricsService, uptime}; -use tokio::{fs, signal}; -use tokio_util::sync::CancellationToken; -use tracing::info; +use tokio::fs; // Define the `GIT_REVISION` const bin_version::git_revision!(); @@ -59,32 +58,17 @@ async fn main() -> anyhow::Result<()> { } .finish(); - let cancel = CancellationToken::new(); - let registry = Registry::new_custom(Some("jsonrpc_alt".into()), None) .context("Failed to create Prometheus registry.")?; - let metrics = MetricsService::new(metrics_args, registry, cancel.child_token()); - - let h_ctrl_c = tokio::spawn({ - let cancel = cancel.clone(); - async move { - tokio::select! { - _ = cancel.cancelled() => {} - _ = signal::ctrl_c() => { - info!("Received Ctrl-C, shutting down..."); - cancel.cancel(); - } - } - } - }); + let metrics = MetricsService::new(metrics_args, registry); metrics .registry() .register(uptime(VERSION)?) 
.context("Failed to register uptime metric.")?; - let h_rpc = start_rpc( + let s_rpc = start_rpc( Some(database_url), bigtable_instance, db_args, @@ -94,16 +78,22 @@ async fn main() -> anyhow::Result<()> { system_package_task_args, rpc_config, metrics.registry(), - cancel.child_token(), ) .await?; - let h_metrics = metrics.run().await?; + let s_metrics = metrics.run().await?; - let _ = h_rpc.await; - cancel.cancel(); - let _ = h_metrics.await; - let _ = h_ctrl_c.await; + match s_rpc.attach(s_metrics).main().await { + Ok(()) | Err(Error::Terminated) => {} + + Err(Error::Aborted) => { + std::process::exit(1); + } + + Err(Error::Task(_)) => { + std::process::exit(2); + } + } } Command::GenerateConfig => { diff --git a/crates/sui-indexer-alt-metrics/Cargo.toml b/crates/sui-indexer-alt-metrics/Cargo.toml index 50d500cd6522f..0824f2d845a5b 100644 --- a/crates/sui-indexer-alt-metrics/Cargo.toml +++ b/crates/sui-indexer-alt-metrics/Cargo.toml @@ -13,7 +13,7 @@ clap.workspace = true prometheus.workspace = true prometheus-closure-metric.workspace = true tokio.workspace = true -tokio-util.workspace = true tracing.workspace = true +sui-futures.workspace = true sui-pg-db.workspace = true diff --git a/crates/sui-indexer-alt-metrics/src/lib.rs b/crates/sui-indexer-alt-metrics/src/lib.rs index 94fb20c20648d..dd768c5740166 100644 --- a/crates/sui-indexer-alt-metrics/src/lib.rs +++ b/crates/sui-indexer-alt-metrics/src/lib.rs @@ -7,8 +7,8 @@ use anyhow::Context; use axum::{Extension, Router, http::StatusCode, routing::get}; use prometheus::{Registry, TextEncoder, core::Collector}; use prometheus_closure_metric::{ClosureMetric, ValueType}; -use tokio::{net::TcpListener, task::JoinHandle}; -use tokio_util::sync::CancellationToken; +use sui_futures::service::Service; +use tokio::{net::TcpListener, sync::oneshot}; use tracing::info; pub mod db; @@ -25,20 +25,17 @@ pub struct MetricsArgs { pub struct MetricsService { addr: SocketAddr, registry: Registry, - cancel: CancellationToken, } impl MetricsService { /// Create a new instance of the service, listening on the address provided in `args`, serving - /// metrics from the `registry`. The service will shut down if the provided `cancel` token is - /// cancelled. + /// metrics from the `registry`. /// /// The service will not be run until [Self::run] is called. - pub fn new(args: MetricsArgs, registry: Registry, cancel: CancellationToken) -> Self { + pub fn new(args: MetricsArgs, registry: Registry) -> Self { Self { addr: args.metrics_address, registry, - cancel, } } @@ -48,12 +45,8 @@ impl MetricsService { } /// Start the service. The service will run until the cancellation token is triggered. 
- pub async fn run(self) -> anyhow::Result> { - let Self { - addr, - registry, - cancel, - } = self; + pub async fn run(self) -> anyhow::Result { + let Self { addr, registry } = self; let listener = TcpListener::bind(&self.addr) .await @@ -63,16 +56,20 @@ impl MetricsService { .route("/metrics", get(metrics)) .layer(Extension(registry)); - Ok(tokio::spawn(async move { - info!("Starting metrics service on {}", addr); - axum::serve(listener, app) - .with_graceful_shutdown(async move { - cancel.cancelled().await; - info!("Shutdown received, shutting down metrics service"); - }) - .await - .unwrap() - })) + let (stx, srx) = oneshot::channel::<()>(); + Ok(Service::new() + .with_shutdown_signal(async move { + let _ = stx.send(()); + }) + .spawn(async move { + info!("Starting metrics service on {addr}"); + Ok(axum::serve(listener, app) + .with_graceful_shutdown(async move { + let _ = srx.await; + info!("Shutdown received, shutting down metrics service"); + }) + .await?) + })) } } diff --git a/crates/sui-indexer-alt-reader/Cargo.toml b/crates/sui-indexer-alt-reader/Cargo.toml index b37a7caa41e70..37c2e3f481bda 100644 --- a/crates/sui-indexer-alt-reader/Cargo.toml +++ b/crates/sui-indexer-alt-reader/Cargo.toml @@ -23,13 +23,13 @@ prometheus.workspace = true serde_json.workspace = true thiserror.workspace = true tokio.workspace = true -tokio-util.workspace = true tonic = { workspace = true, features = ["tls-native-roots"] } tracing.workspace = true url.workspace = true move-core-types.workspace = true +sui-futures.workspace = true sui-indexer-alt-consistent-api.workspace = true sui-indexer-alt-metrics.workspace = true sui-indexer-alt-schema.workspace = true diff --git a/crates/sui-indexer-alt-reader/src/consistent_reader.rs b/crates/sui-indexer-alt-reader/src/consistent_reader.rs index f17a763d0778b..c9aae359f343d 100644 --- a/crates/sui-indexer-alt-reader/src/consistent_reader.rs +++ b/crates/sui-indexer-alt-reader/src/consistent_reader.rs @@ -16,7 +16,6 @@ use sui_types::{ TypeTag, base_types::{ObjectDigest, ObjectID, ObjectRef, SequenceNumber}, }; -use tokio_util::sync::CancellationToken; use tonic::transport::Channel; use tracing::instrument; use url::Url; @@ -25,13 +24,6 @@ pub use sui_indexer_alt_consistent_api::proto::rpc::consistent::v1alpha as proto use crate::metrics::ConsistentReaderMetrics; -/// Like `anyhow::bail!`, but returns this module's `Error` type, not `anyhow::Error`. -macro_rules! bail { - ($e:expr) => { - return Err(Error::Internal(anyhow!($e))); - }; -} - #[derive(clap::Args, Debug, Clone, Default)] pub struct ConsistentReaderArgs { /// URL of the consistent store gRPC service @@ -49,7 +41,6 @@ pub struct ConsistentReader { client: Option, timeout: Option, metrics: Arc, - cancel: CancellationToken, } /// Response from a paginated query. @@ -92,7 +83,6 @@ impl ConsistentReader { prefix: Option<&str>, args: ConsistentReaderArgs, registry: &Registry, - cancel: CancellationToken, ) -> Result { let client = if let Some(url) = &args.consistent_store_url { let mut endpoint = Channel::from_shared(url.to_string()) @@ -116,7 +106,6 @@ impl ConsistentReader { client, timeout, metrics, - cancel, }) } @@ -377,15 +366,10 @@ impl ConsistentReader { ); } - let response = tokio::select! 
{ - _ = self.cancel.cancelled() => { - bail!("Request cancelled"); - } - - r = response(client, request) => { - r.map(|r| r.into_inner()).map_err(Into::into) - } - }; + let response = response(client, request) + .await + .map(|r| r.into_inner()) + .map_err(Into::into); if response.is_ok() { self.metrics diff --git a/crates/sui-indexer-alt-reader/src/fullnode_client.rs b/crates/sui-indexer-alt-reader/src/fullnode_client.rs index 7a5c5e91df635..6e0d8675203ea 100644 --- a/crates/sui-indexer-alt-reader/src/fullnode_client.rs +++ b/crates/sui-indexer-alt-reader/src/fullnode_client.rs @@ -3,7 +3,7 @@ use std::sync::Arc; -use anyhow::{Context, anyhow}; +use anyhow::Context; use prometheus::Registry; use prost_types::FieldMask; use sui_rpc::field::FieldMaskUtil; @@ -11,19 +11,11 @@ use sui_rpc::proto::sui::rpc::v2 as proto; use sui_rpc::proto::sui::rpc::v2::transaction_execution_service_client::TransactionExecutionServiceClient; use sui_types::signature::GenericSignature; use sui_types::transaction::{Transaction, TransactionData}; -use tokio_util::sync::CancellationToken; use tonic::transport::Channel; use tracing::instrument; use crate::metrics::FullnodeClientMetrics; -/// Like `anyhow::bail!`, but returns this module's `Error` type, not `anyhow::Error`. -macro_rules! bail { - ($e:expr) => { - return Err(Error::Internal(anyhow!($e))); - }; -} - #[derive(clap::Args, Debug, Clone, Default)] pub struct FullnodeArgs { /// gRPC URL for full node operations such as executeTransaction and simulateTransaction. @@ -36,7 +28,6 @@ pub struct FullnodeArgs { pub struct FullnodeClient { execution_client: Option>, metrics: Arc, - cancel: CancellationToken, } #[derive(thiserror::Error, Debug)] @@ -56,7 +47,6 @@ impl FullnodeClient { prefix: Option<&str>, args: FullnodeArgs, registry: &Registry, - cancel: CancellationToken, ) -> Result { let execution_client = if let Some(url) = &args.fullnode_rpc_url { let channel = Channel::from_shared(url.clone()) @@ -73,7 +63,6 @@ impl FullnodeClient { Ok(Self { execution_client, metrics, - cancel, }) } @@ -175,15 +164,10 @@ impl FullnodeClient { .with_label_values(&[method]) .start_timer(); - let response = tokio::select! 
{ - _ = self.cancel.cancelled() => { - bail!("Request cancelled"); - } - - r = response(client) => { - r.map(|r| r.into_inner()).map_err(Error::from) - } - }; + let response = response(client) + .await + .map(|r| r.into_inner()) + .map_err(Into::into); if response.is_ok() { self.metrics diff --git a/crates/sui-indexer-alt-reader/src/object_versions.rs b/crates/sui-indexer-alt-reader/src/object_versions.rs index 28ac39ca4a692..283b7577ce0e9 100644 --- a/crates/sui-indexer-alt-reader/src/object_versions.rs +++ b/crates/sui-indexer-alt-reader/src/object_versions.rs @@ -315,7 +315,6 @@ mod tests { use sui_indexer_alt_schema::{MIGRATIONS, schema::obj_versions}; use sui_pg_db::{Db, DbArgs, temp::TempDb}; use sui_types::digests::ObjectDigest; - use tokio_util::sync::CancellationToken; use super::*; @@ -327,15 +326,9 @@ mod tests { let url = temp_db.database().url(); let writer = Db::for_write(url.clone(), DbArgs::default()).await.unwrap(); - let reader = PgReader::new( - None, - Some(url.clone()), - DbArgs::default(), - ®istry, - CancellationToken::new(), - ) - .await - .unwrap(); + let reader = PgReader::new(None, Some(url.clone()), DbArgs::default(), ®istry) + .await + .unwrap(); writer.run_migrations(Some(&MIGRATIONS)).await.unwrap(); (temp_db, writer, reader) diff --git a/crates/sui-indexer-alt-reader/src/pg_reader.rs b/crates/sui-indexer-alt-reader/src/pg_reader.rs index 9048521880dcd..523118bf06ffd 100644 --- a/crates/sui-indexer-alt-reader/src/pg_reader.rs +++ b/crates/sui-indexer-alt-reader/src/pg_reader.rs @@ -14,7 +14,6 @@ use diesel::query_dsl::methods::LimitDsl; use diesel_async::RunQueryDsl; use prometheus::Registry; use sui_indexer_alt_metrics::db::DbConnectionStatsCollector; -use tokio_util::sync::CancellationToken; use tracing::{debug, warn}; use url::Url; @@ -28,7 +27,6 @@ pub use sui_pg_db as db; pub struct PgReader { db: Option, metrics: Arc, - cancel: CancellationToken, } pub struct Connection<'p> { @@ -46,7 +44,6 @@ impl PgReader { database_url: Option, db_args: db::DbArgs, registry: &Registry, - cancel: CancellationToken, ) -> anyhow::Result { let db = if let Some(database_url) = database_url { let db = db::Db::for_read(database_url, db_args) @@ -67,11 +64,7 @@ impl PgReader { let metrics = DbReaderMetrics::new(prefix, registry); - Ok(Self { - db, - metrics, - cancel, - }) + Ok(Self { db, metrics }) } /// Create a data loader backed by this reader. @@ -84,25 +77,22 @@ impl PgReader { self.db.is_some() } - /// Acquire a connection to the database. This can potentially fail if the service is cancelled - /// while the connection is being acquired. + /// Acquire a connection to the database. This can fail if a database has not been configured + /// to connect to. pub async fn connect(&self) -> anyhow::Result> { let Some(db) = &self.db else { bail!("No database to connect to"); }; - tokio::select! 
{ - _ = self.cancel.cancelled() => { - bail!("Cancelled while connecting to the database"); - } - - conn = db.connect() => { - Ok(Connection { - conn: conn.context("Failed to connect to database")?, - metrics: self.metrics.clone(), - }) - } - } + let conn = db + .connect() + .await + .context("Failed to connect to database")?; + + Ok(Connection { + conn, + metrics: self.metrics.clone(), + }) } } diff --git a/crates/sui-indexer-alt-reader/src/system_package_task.rs b/crates/sui-indexer-alt-reader/src/system_package_task.rs index faf02163582bb..cafc1c988858c 100644 --- a/crates/sui-indexer-alt-reader/src/system_package_task.rs +++ b/crates/sui-indexer-alt-reader/src/system_package_task.rs @@ -8,9 +8,9 @@ use diesel::{ sql_types::{BigInt, Bytea}, }; use move_core_types::account_address::AccountAddress; +use sui_futures::service::Service; use sui_sql_macro::query; -use tokio::{task::JoinHandle, time}; -use tokio_util::sync::CancellationToken; +use tokio::time; use tracing::{error, info}; use crate::{package_resolver::PackageCache, pg_reader::PgReader}; @@ -33,9 +33,6 @@ pub struct SystemPackageTask { /// How long to wait between checks. interval: Duration, - - /// Signal to cancel the task. - cancel: CancellationToken, } impl SystemPackageTaskArgs { @@ -49,13 +46,11 @@ impl SystemPackageTask { args: SystemPackageTaskArgs, pg_reader: PgReader, package_cache: Arc, - cancel: CancellationToken, ) -> Self { Self { pg_reader, package_cache, interval: args.epoch_polling_interval(), - cancel, } } @@ -63,131 +58,118 @@ impl SystemPackageTask { /// packages if it detects that the epoch has changed (which means that a framework upgrade /// could have happened). /// - /// This operation consumes the `self` and returns a handle to the spawned tokio task. The task - /// will continue to run until its cancellation token is triggered. - pub fn run(self) -> JoinHandle<()> { - tokio::spawn(async move { + /// This operation consumes the `self` and returns a Service handle. + pub fn run(self) -> Service { + Service::new().spawn_aborting(async move { let Self { pg_reader, package_cache, interval, - cancel, } = self; let mut last_epoch: i64 = 0; let mut interval = time::interval(interval); loop { - tokio::select! 
{ - _ = cancel.cancelled() => { - info!("Shutdown signal received, terminating system package eviction task"); - break; + interval.tick().await; + + let mut conn = match pg_reader.connect().await { + Ok(conn) => conn, + Err(e) => { + error!("Failed to connect to database: {:?}", e); + continue; + } + }; + + #[derive(QueryableByName, Copy, Clone)] + struct Watermark { + #[diesel(sql_type = BigInt)] + epoch_hi_inclusive: i64, + + #[diesel(sql_type = BigInt)] + checkpoint_hi_inclusive: i64, + } + + let query = query!( + r#" + SELECT + epoch_hi_inclusive, + checkpoint_hi_inclusive + FROM + watermarks + WHERE + pipeline = 'kv_packages' + "# + ); + + let Watermark { + epoch_hi_inclusive: next_epoch, + checkpoint_hi_inclusive, + } = match conn.results(query).await.as_deref() { + Ok([watermark]) => *watermark, + + Ok([]) => { + info!("Package index isn't populated yet, no epoch information"); + continue; } - _ = interval.tick() => { - let mut conn = match pg_reader.connect().await { - Ok(conn) => conn, - Err(e) => { - error!("Failed to connect to database: {:?}", e); - continue; - } - }; - - #[derive(QueryableByName, Copy, Clone)] - struct Watermark { - #[diesel(sql_type = BigInt)] - epoch_hi_inclusive: i64, - - #[diesel(sql_type = BigInt)] - checkpoint_hi_inclusive: i64, - } - - let query = query!( - r#" - SELECT - epoch_hi_inclusive, - checkpoint_hi_inclusive - FROM - watermarks - WHERE - pipeline = 'kv_packages' - "# - ); - - let Watermark { - epoch_hi_inclusive: next_epoch, - checkpoint_hi_inclusive, - } = match conn - .results(query) - .await - .as_deref() - { - Ok([watermark]) => *watermark, - - Ok([]) => { - info!("Package index isn't populated yet, no epoch information"); - continue; - } - - Ok(_) => { - error!("Expected exactly one row from the watermarks table"); - continue; - }, - - Err(e) => { - error!("Failed to fetch latest epoch: {e}"); - continue; - } - }; - - if next_epoch <= last_epoch { - continue; - } - - info!(last_epoch, next_epoch, "Detected epoch boundary"); - last_epoch = next_epoch; - - #[derive(QueryableByName, Clone)] - struct SystemPackage { - #[diesel(sql_type = Bytea)] - original_id: Vec, - } - - let query = query!( - r#" - SELECT DISTINCT - original_id - FROM - kv_packages - WHERE - is_system_package - AND cp_sequence_number <= {BigInt} - "#, - checkpoint_hi_inclusive - ); - - let system_packages: Vec = match conn.results(query).await { - Ok(system_packages) => system_packages, - - Err(e) => { - error!("Failed to fetch system packages: {e}"); - continue; - } - }; - - let Ok(system_packages) = system_packages - .into_iter() - .map(|pkg| AccountAddress::from_bytes(pkg.original_id)) - .collect::, _>>() - else { - error!("Failed to deserialize system package addresses"); - continue; - }; - - info!(system_packages = ?system_packages, "Evicting..."); - package_cache.evict(system_packages) + Ok(_) => { + error!("Expected exactly one row from the watermarks table"); + continue; } + + Err(e) => { + error!("Failed to fetch latest epoch: {e}"); + continue; + } + }; + + if next_epoch <= last_epoch { + continue; } + + info!(last_epoch, next_epoch, "Detected epoch boundary"); + last_epoch = next_epoch; + + #[derive(QueryableByName, Clone)] + struct SystemPackage { + #[diesel(sql_type = Bytea)] + original_id: Vec, + } + + let query = query!( + r#" + SELECT DISTINCT + original_id + FROM + kv_packages + WHERE + is_system_package + AND cp_sequence_number <= {BigInt} + "#, + checkpoint_hi_inclusive + ); + + let system_packages: Vec = match conn.results(query).await { + 
Ok(system_packages) => system_packages, + + Err(e) => { + error!("Failed to fetch system packages: {e}"); + continue; + } + }; + + let Ok(system_packages) = system_packages + .into_iter() + .map(|pkg| AccountAddress::from_bytes(pkg.original_id)) + .collect::, _>>() + else { + error!("Failed to deserialize system package addresses"); + continue; + }; + + info!(system_packages = ?system_packages, "Evicting..."); + package_cache.evict(system_packages) } }) } diff --git a/crates/sui-indexer-alt/Cargo.toml b/crates/sui-indexer-alt/Cargo.toml index c0b050909ff90..4a08bc5ca0a23 100644 --- a/crates/sui-indexer-alt/Cargo.toml +++ b/crates/sui-indexer-alt/Cargo.toml @@ -33,7 +33,6 @@ prometheus.workspace = true serde.workspace = true telemetry-subscribers.workspace = true tokio.workspace = true -tokio-util.workspace = true toml.workspace = true tracing.workspace = true url.workspace = true diff --git a/crates/sui-indexer-alt/src/benchmark.rs b/crates/sui-indexer-alt/src/benchmark.rs index a747a0a003eea..12970cd4305fd 100644 --- a/crates/sui-indexer-alt/src/benchmark.rs +++ b/crates/sui-indexer-alt/src/benchmark.rs @@ -9,12 +9,13 @@ use sui_indexer_alt_framework::{ IndexerArgs, ingestion::{ClientArgs, ingestion_client::IngestionClientArgs}, postgres::{DbArgs, reset_database}, + service::terminate, }; use sui_indexer_alt_schema::MIGRATIONS; use sui_indexer_alt_schema::checkpoints::StoredGenesis; use sui_indexer_alt_schema::epochs::StoredEpochStart; use sui_synthetic_ingestion::synthetic_ingestion::read_ingestion_data; -use tokio_util::sync::CancellationToken; +use tracing::info; use url::Url; #[derive(clap::Args, Debug, Clone)] @@ -63,35 +64,40 @@ pub async fn run_benchmark( }; let cur_time = Instant::now(); + let registry = Registry::new(); + let indexer = tokio::select! { + _ = terminate() => { + info!("Indexer terminated during setup"); + return Ok(()); + } - setup_indexer( - database_url, - db_args, - indexer_args, - client_args, - indexer_config, - Some(BootstrapGenesis { - stored_genesis: StoredGenesis { - genesis_digest: [0u8; 32].to_vec(), - initial_protocol_version: 0, - }, - stored_epoch_start: StoredEpochStart { - epoch: 0, - protocol_version: 0, - cp_lo: 0, - start_timestamp_ms: 0, - reference_gas_price: 0, - system_state: vec![], - }, - }), - &Registry::new(), - CancellationToken::new(), - ) - .await? - .run() - .await? - .await?; + indexer = setup_indexer( + database_url, + db_args, + indexer_args, + client_args, + indexer_config, + Some(BootstrapGenesis { + stored_genesis: StoredGenesis { + genesis_digest: [0u8; 32].to_vec(), + initial_protocol_version: 0, + }, + stored_epoch_start: StoredEpochStart { + epoch: 0, + protocol_version: 0, + cp_lo: 0, + start_timestamp_ms: 0, + reference_gas_price: 0, + system_state: vec![], + }, + }), + ®istry, + ) => { + indexer? 
+ } + }; + indexer.run().await?.join().await?; let elapsed = Instant::now().duration_since(cur_time); println!("Indexed {} transactions in {:?}", num_transactions, elapsed); println!("TPS: {}", num_transactions as f64 / elapsed.as_secs_f64()); diff --git a/crates/sui-indexer-alt/src/bootstrap.rs b/crates/sui-indexer-alt/src/bootstrap.rs index a76c2a826280a..2d00f611a8af6 100644 --- a/crates/sui-indexer-alt/src/bootstrap.rs +++ b/crates/sui-indexer-alt/src/bootstrap.rs @@ -19,7 +19,6 @@ use sui_indexer_alt_schema::{ schema::{kv_epoch_starts, kv_genesis}, }; use sui_types::transaction::TransactionDataAPI; -use tokio_util::sync::CancellationToken; use tracing::info; pub struct BootstrapGenesis { @@ -31,13 +30,9 @@ pub struct BootstrapGenesis { /// the information stored there. If the database has been bootstrapped before, this function will /// simply read the previously bootstrapped information. Otherwise, it will wait until the first /// checkpoint is available and extract the necessary information from there. -/// -/// Can be cancelled via the `cancel` token, or through an interrupt signal (which will also cancel -/// the token). pub async fn bootstrap( indexer: &Indexer, retry_interval: Duration, - cancel: CancellationToken, bootstrap_genesis: Option, ) -> Result { info!("Bootstrapping indexer with genesis information"); @@ -73,13 +68,11 @@ pub async fn bootstrap( // - Get the Genesis system transaction from the genesis checkpoint. // - Get the system state object that was written out by the system transaction. None => { - let genesis_checkpoint = tokio::select! { - cp = indexer.ingestion_client().wait_for(0, retry_interval) => - cp.context("Failed to fetch genesis checkpoint")?, - _ = cancel.cancelled() => { - bail!("Cancelled before genesis checkpoint was available"); - } - }; + let genesis_checkpoint = indexer + .ingestion_client() + .wait_for(0, retry_interval) + .await + .context("Failed to fetch genesis checkpoint")?; let Checkpoint { summary: checkpoint_summary, diff --git a/crates/sui-indexer-alt/src/lib.rs b/crates/sui-indexer-alt/src/lib.rs index 34c5790330b49..cadc5714013aa 100644 --- a/crates/sui-indexer-alt/src/lib.rs +++ b/crates/sui-indexer-alt/src/lib.rs @@ -28,7 +28,6 @@ use sui_indexer_alt_framework::{ }; use sui_indexer_alt_metrics::db::DbConnectionStatsCollector; use sui_indexer_alt_schema::MIGRATIONS; -use tokio_util::sync::CancellationToken; use url::Url; pub mod args; @@ -46,7 +45,6 @@ pub async fn setup_indexer( indexer_config: IndexerConfig, bootstrap_genesis: Option, registry: &Registry, - cancel: CancellationToken, ) -> anyhow::Result> { let IndexerConfig { ingestion, @@ -109,7 +107,6 @@ pub async fn setup_indexer( ingestion, metrics_prefix, registry, - cancel.clone(), ) .await?; @@ -157,7 +154,7 @@ pub async fn setup_indexer( }; } - let genesis = bootstrap(&indexer, retry_interval, cancel.clone(), bootstrap_genesis).await?; + let genesis = bootstrap(&indexer, retry_interval, bootstrap_genesis).await?; // Pipelines that rely on genesis information add_concurrent!(KvFeatureFlags(genesis.clone()), kv_feature_flags); diff --git a/crates/sui-indexer-alt/src/main.rs b/crates/sui-indexer-alt/src/main.rs index 0e9ddb588528d..025d9a2b9eb3e 100644 --- a/crates/sui-indexer-alt/src/main.rs +++ b/crates/sui-indexer-alt/src/main.rs @@ -14,12 +14,12 @@ use sui_indexer_alt::config::IndexerConfig; use sui_indexer_alt::config::Merge; use sui_indexer_alt::setup_indexer; use sui_indexer_alt_framework::postgres::reset_database; +use sui_indexer_alt_framework::service::Error; +use 
sui_indexer_alt_framework::service::terminate; use sui_indexer_alt_metrics::MetricsService; use sui_indexer_alt_metrics::uptime; use sui_indexer_alt_schema::MIGRATIONS; use tokio::fs; -use tokio::signal; -use tokio_util::sync::CancellationToken; use tracing::info; // Define the `GIT_REVISION` const @@ -53,57 +53,57 @@ async fn main() -> Result<()> { metrics_args, config, } => { + let is_bounded = indexer_args.last_checkpoint.is_some(); + let indexer_config = read_config(&config).await?; info!("Starting indexer with config: {:#?}", indexer_config); - let cancel = CancellationToken::new(); - let registry = Registry::new_custom(Some("indexer_alt".into()), None) .context("Failed to create Prometheus registry.")?; - let metrics = MetricsService::new(metrics_args, registry, cancel.child_token()); - - let h_ctrl_c = tokio::spawn({ - let cancel = cancel.clone(); - async move { - tokio::select! { - _ = cancel.cancelled() => {} - _ = signal::ctrl_c() => { - info!("Received Ctrl-C, shutting down..."); - cancel.cancel(); - } - } - } - }); + let metrics = MetricsService::new(metrics_args, registry); metrics .registry() .register(uptime(VERSION)?) .context("Failed to register uptime metric.")?; - let h_indexer = setup_indexer( - database_url, - db_args, - indexer_args, - client_args, - indexer_config, - None, - metrics.registry(), - cancel.child_token(), - ) - .await? - .run() - .await - .context("Failed to start indexer")?; - - let h_metrics = metrics.run().await?; - - // Wait for the indexer to finish, then force the supporting services to shut down - // using the cancellation token. - let _ = h_indexer.await; - cancel.cancel(); - let _ = h_metrics.await; - let _ = h_ctrl_c.await; + let indexer = tokio::select! { + _ = terminate() => { + info!("Indexer terminated during setup"); + return Ok(()); + } + + indexer = setup_indexer( + database_url, + db_args, + indexer_args, + client_args, + indexer_config, + None, + metrics.registry(), + ) => { + indexer? 
+ } + }; + + let s_indexer = indexer.run().await?; + let s_metrics = metrics.run().await?; + + match s_indexer.attach(s_metrics).main().await { + Ok(()) => {} + Err(Error::Terminated) => { + if is_bounded { + std::process::exit(1); + } + } + Err(Error::Aborted) => { + std::process::exit(1); + } + Err(Error::Task(_)) => { + std::process::exit(2); + } + } } Command::GenerateConfig => { diff --git a/crates/sui/Cargo.toml b/crates/sui/Cargo.toml index 03774a1deee5a..a60602cc97e3e 100644 --- a/crates/sui/Cargo.toml +++ b/crates/sui/Cargo.toml @@ -55,6 +55,7 @@ sui-bridge.workspace = true sui-data-store.workspace = true sui-execution.workspace = true sui-faucet.workspace = true +sui-futures.workspace = true sui-swarm-config.workspace = true sui-indexer-alt.workspace = true sui-indexer-alt-framework.workspace = true @@ -76,7 +77,6 @@ sui-package-management.workspace = true sui-protocol-config.workspace = true shared-crypto.workspace = true sui-transaction-builder.workspace = true -tokio-util.workspace = true move-binary-format.workspace = true move-trace-format.workspace = true move-bytecode-source-map.workspace = true diff --git a/crates/sui/src/sui_commands.rs b/crates/sui/src/sui_commands.rs index 705f618211f5b..edf56b42241e3 100644 --- a/crates/sui/src/sui_commands.rs +++ b/crates/sui/src/sui_commands.rs @@ -17,10 +17,12 @@ use futures::future; use move_analyzer::analyzer; use move_command_line_common::files::MOVE_COMPILED_EXTENSION; use move_compiler::editions::Flavor; +use move_core_types::account_address::AccountAddress; use move_package::BuildConfig; use mysten_common::tempdir; use prometheus::Registry; use rand::rngs::OsRng; +use serde_json::json; use std::collections::BTreeMap; use std::io::{Write, stdout}; use std::net::{AddrParseError, IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr}; @@ -44,16 +46,7 @@ use sui_config::{ SUI_BENCHMARK_GENESIS_GAS_KEYSTORE_FILENAME, SUI_GENESIS_FILENAME, SUI_KEYSTORE_FILENAME, }; use sui_faucet::{AppState, FaucetConfig, LocalFaucet, create_wallet_context, start_faucet}; -use sui_json_rpc_types::{SuiObjectDataOptions, SuiRawData}; -use sui_move::summary::PackageSummaryMetadata; -use sui_sdk::SuiClient; -use sui_sdk::apis::ReadApi; -use sui_types::move_package::MovePackage; -use tokio::time::interval; -use tokio_util::sync::CancellationToken; - -use move_core_types::account_address::AccountAddress; -use serde_json::json; +use sui_futures::service::Service; use sui_indexer_alt::{config::IndexerConfig, setup_indexer}; use sui_indexer_alt_consistent_store::{ args::RpcArgs as ConsistentArgs, config::ServiceConfig as ConsistentConfig, @@ -71,10 +64,12 @@ use sui_indexer_alt_reader::{ consistent_reader::ConsistentReaderArgs, fullnode_client::FullnodeArgs, system_package_task::SystemPackageTaskArgs, }; +use sui_json_rpc_types::{SuiObjectDataOptions, SuiRawData}; use sui_keys::key_derive::generate_new_key; use sui_keys::keypair_file::read_key; use sui_keys::keystore::{AccountKeystore, FileBasedKeystore, Keystore}; use sui_move::manage_package::resolve_lock_file_path; +use sui_move::summary::PackageSummaryMetadata; use sui_move::{self, execute_move_command}; use sui_move_build::{ BuildConfig as SuiBuildConfig, SuiPackageHooks, check_conflicting_addresses, @@ -85,6 +80,8 @@ use sui_pg_db::DbArgs; use sui_pg_db::temp::{LocalDatabase, get_available_port}; use sui_protocol_config::Chain; use sui_replay_2 as SR2; +use sui_sdk::SuiClient; +use sui_sdk::apis::ReadApi; use sui_sdk::sui_client_config::{SuiClientConfig, SuiEnv}; use sui_sdk::wallet_context::WalletContext; use 
sui_swarm::memory::Swarm; @@ -95,6 +92,8 @@ use sui_swarm_config::node_config_builder::FullnodeConfigBuilder; use sui_types::base_types::{ObjectID, SuiAddress}; use sui_types::crypto::{SignatureScheme, SuiKeyPair, ToFromBytes}; use sui_types::digests::ChainIdentifier; +use sui_types::move_package::MovePackage; +use tokio::time::interval; use tracing::info; use url::Url; @@ -1055,8 +1054,7 @@ async fn start( info!("Fullnode RPC URL: {fullnode_rpc_url}"); let prometheus_registry = Registry::new(); - let cancel = CancellationToken::new(); - let mut rpc_services = vec![]; + let mut rpc_services = Service::new(); // Set-up the database for the indexer, if needed let (_database, database_url) = match with_indexer { @@ -1106,14 +1104,12 @@ async fn start( IndexerConfig::for_test(), None, &prometheus_registry, - cancel.child_token(), ) .await .context("Failed to setup indexer")?; let pipelines = indexer.pipelines().map(|s| s.to_string()).collect(); - let handle = indexer.run().await.context("Failed to start indexer")?; - rpc_services.push(handle); + rpc_services = rpc_services.merge(indexer.run().await.context("Failed to start indexer")?); info!("Indexer started with pipelines: {pipelines:?}"); pipelines @@ -1138,19 +1134,19 @@ async fn start( ..Default::default() }; - let handle = start_consistent_store( - config_dir.join("consistent_store"), - IndexerArgs::default(), - client_args, - consistent_args, - "0.0.0", - ConsistentConfig::for_test(), - &prometheus_registry, - cancel.child_token(), - ) - .await - .context("Failed to start Consistent Store")?; - rpc_services.push(handle); + rpc_services = rpc_services.merge( + start_consistent_store( + config_dir.join("consistent_store"), + IndexerArgs::default(), + client_args, + consistent_args, + "0.0.0", + ConsistentConfig::for_test(), + &prometheus_registry, + ) + .await + .context("Failed to start Consistent Store")?, + ); info!("Consistent Store started at {address}"); Some(address) @@ -1182,23 +1178,23 @@ async fn start( let mut graphql_config = GraphQlConfig::default(); graphql_config.zklogin.env = sui_indexer_alt_graphql::config::ZkLoginEnv::Test; - let handle = start_graphql( - database_url.clone(), - fullnode_args, - DbArgs::default(), - GraphQlKvArgs::default(), - consistent_reader_args, - graphql_args, - SystemPackageTaskArgs::default(), - "0.0.0", - graphql_config, - pipelines, - &prometheus_registry, - cancel.child_token(), - ) - .await - .context("Failed to start GraphQL server")?; - rpc_services.push(handle); + rpc_services = rpc_services.merge( + start_graphql( + database_url.clone(), + fullnode_args, + DbArgs::default(), + GraphQlKvArgs::default(), + consistent_reader_args, + graphql_args, + SystemPackageTaskArgs::default(), + "0.0.0", + graphql_config, + pipelines, + &prometheus_registry, + ) + .await + .context("Failed to start GraphQL server")?, + ); info!("GraphQL started at {address}"); } @@ -1301,13 +1297,8 @@ async fn start( } } - // Trigger cancellation to shut down all RPC services, and wait for all services to exit - // cleanly. - cancel.cancel(); - // TODO (amnn): The indexer can take some time to shut down if the database it is talking to - // stops responding. Re-enable graceful shutdown once cancel handling is revamped across the - // framework. - // future::join_all(rpc_services).await; + info!("Shutting down RPC services..."); + rpc_services.shutdown().await?; Ok(()) }
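Taken together, the binaries above converge on the same top-level shape: each component's `run()` returns a `Service`, the handles are composed with `attach` (or `merge`), and the composite is driven with `main()`, whose error variants are mapped onto process exit codes. The following condensed sketch is not part of the patch; it uses `Service` and `Error` only as they appear in the hunks above, the `drive` helper is illustrative, and the variant semantics in the comments are inferred from how the binaries handle them.

use sui_futures::service::{Error, Service};

// Hypothetical driver (not in the patch): run two already-started services to completion.
async fn drive(indexer: Service, metrics: Service) -> anyhow::Result<()> {
    match indexer.attach(metrics).main().await {
        // Everything stopped cleanly, or the process was asked to terminate (e.g. Ctrl-C);
        // the unbounded binaries above treat both as success.
        Ok(()) | Err(Error::Terminated) => Ok(()),

        // A service stopped because a sibling was aborted.
        Err(Error::Aborted) => std::process::exit(1),

        // A service's own task failed.
        Err(Error::Task(_)) => std::process::exit(2),
    }
}

This replaces the previous pattern of awaiting each JoinHandle in turn and cancelling a shared token in between.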