Commit 4a965b2

feat(api): Expose replication lag metrics from source Postgres (#374)
1 parent 17547bb commit 4a965b2

15 files changed: +421 additions, -148 deletions

etl-api/src/db/pipelines.rs

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ pub enum PipelinesDbError {
     DestinationsDb(#[from] DestinationsDbError),

     #[error("Slot operation failed: {0}")]
-    SlotError(#[from] slots::SlotError),
+    SlotError(#[from] slots::EtlReplicationSlotError),
 }

 pub async fn count_pipelines_for_tenant<'c, E>(

etl-api/src/routes/pipelines.rs

Lines changed: 51 additions & 13 deletions
@@ -9,7 +9,7 @@ use etl_config::{
     Environment,
     shared::{ReplicatorConfig, SupabaseConfig, TlsConfig},
 };
-use etl_postgres::replication::{TableLookupError, get_table_name_from_oid, health, state};
+use etl_postgres::replication::{TableLookupError, get_table_name_from_oid, health, lag, state};
 use etl_postgres::types::TableId;
 use secrecy::ExposeSecret;
 use serde::{Deserialize, Serialize};
@@ -271,9 +271,7 @@ pub enum SimpleTableReplicationState {
     Queued,
     CopyingTable,
     CopiedTable,
-    FollowingWal {
-        lag: u64,
-    },
+    FollowingWal,
     Error {
         reason: String,
         #[serde(skip_serializing_if = "Option::is_none")]
@@ -301,13 +299,10 @@ impl From<state::TableReplicationState> for SimpleTableReplicationState {
             state::TableReplicationState::Init => SimpleTableReplicationState::Queued,
             state::TableReplicationState::DataSync => SimpleTableReplicationState::CopyingTable,
             state::TableReplicationState::FinishedCopy => SimpleTableReplicationState::CopiedTable,
-            // TODO: add lag metric when available.
             state::TableReplicationState::SyncDone { .. } => {
-                SimpleTableReplicationState::FollowingWal { lag: 0 }
-            }
-            state::TableReplicationState::Ready => {
-                SimpleTableReplicationState::FollowingWal { lag: 0 }
+                SimpleTableReplicationState::FollowingWal
             }
+            state::TableReplicationState::Ready => SimpleTableReplicationState::FollowingWal,
             state::TableReplicationState::Errored {
                 reason,
                 solution,
@@ -340,12 +335,52 @@ pub struct TableReplicationStatus {
     #[schema(example = "public.users")]
     pub table_name: String,
     pub state: SimpleTableReplicationState,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[schema(nullable = true)]
+    pub table_sync_lag: Option<SlotLagMetricsResponse>,
+}
+
+/// Lag metrics reported for replication slots.
+#[derive(Debug, Serialize, Deserialize, ToSchema)]
+pub struct SlotLagMetricsResponse {
+    /// Bytes between the current WAL location and the slot restart LSN.
+    #[schema(example = 1024)]
+    pub restart_lsn_bytes: i64,
+    /// Bytes between the current WAL location and the confirmed flush LSN.
+    #[schema(example = 2048)]
+    pub confirmed_flush_lsn_bytes: i64,
+    /// How many bytes of WAL are still safe to build up before the limit of the slot is reached.
+    #[schema(example = 8192)]
+    pub safe_wal_size_bytes: i64,
+    /// Write lag expressed in milliseconds.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[schema(example = 1500, nullable = true)]
+    pub write_lag: Option<i64>,
+    /// Flush lag expressed in milliseconds.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[schema(example = 1200, nullable = true)]
+    pub flush_lag: Option<i64>,
+}
+
+impl From<lag::SlotLagMetrics> for SlotLagMetricsResponse {
+    fn from(metrics: lag::SlotLagMetrics) -> Self {
+        Self {
+            restart_lsn_bytes: metrics.restart_lsn_bytes,
+            confirmed_flush_lsn_bytes: metrics.confirmed_flush_lsn_bytes,
+            safe_wal_size_bytes: metrics.safe_wal_size_bytes,
+            write_lag: metrics.write_lag_ms,
+            flush_lag: metrics.flush_lag_ms,
+        }
+    }
 }

 #[derive(Debug, Serialize, Deserialize, ToSchema)]
 pub struct GetPipelineReplicationStatusResponse {
     #[schema(example = 1)]
     pub pipeline_id: i64,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[schema(nullable = true)]
+    pub apply_lag: Option<SlotLagMetricsResponse>,
     pub table_statuses: Vec<TableReplicationStatus>,
 }

@@ -924,13 +959,14 @@ pub async fn get_pipeline_replication_status(

     // Fetch replication state for all tables in this pipeline
     let state_rows = state::get_table_replication_state_rows(&source_pool, pipeline_id).await?;
+    let mut lag_metrics = lag::get_pipeline_lag_metrics(&source_pool, pipeline_id as u64).await?;
+    let apply_lag = lag_metrics.apply.map(Into::into);

     // Convert database states to UI-friendly format and fetch table names
     let mut tables: Vec<TableReplicationStatus> = Vec::new();
     for row in state_rows {
-        let table_id = row.table_id.0;
-        let table_name =
-            get_table_name_from_oid(&source_pool, TableId::new(row.table_id.0)).await?;
+        let table_id = TableId::new(row.table_id.0);
+        let table_name = get_table_name_from_oid(&source_pool, table_id).await?;

         // Extract the metadata row from the database
         let table_replication_state = row
@@ -939,14 +975,16 @@ pub async fn get_pipeline_replication_status(
             .ok_or(PipelineError::MissingTableReplicationState)?;

         tables.push(TableReplicationStatus {
-            table_id,
+            table_id: table_id.into_inner(),
             table_name: table_name.to_string(),
             state: table_replication_state.into(),
+            table_sync_lag: lag_metrics.table_sync.remove(&table_id).map(Into::into),
         });
     }

     let response = GetPipelineReplicationStatusResponse {
         pipeline_id,
+        apply_lag,
         table_statuses: tables,
     };

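The two optional response fields only serialize when lag data is present, because of `skip_serializing_if`. Below is a minimal sketch of the resulting JSON shape, assuming the `SlotLagMetricsResponse` struct above and `serde_json` are in scope; the numeric values are purely illustrative:

    let lag = SlotLagMetricsResponse {
        restart_lsn_bytes: 1024,
        confirmed_flush_lsn_bytes: 2048,
        safe_wal_size_bytes: 8192,
        write_lag: Some(1500),
        // None values are omitted from the serialized output entirely.
        flush_lag: None,
    };

    // Prints:
    // {"restart_lsn_bytes":1024,"confirmed_flush_lsn_bytes":2048,"safe_wal_size_bytes":8192,"write_lag":1500}
    println!("{}", serde_json::to_string(&lag).unwrap());
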

etl-api/tests/pipelines.rs

Lines changed: 4 additions & 2 deletions
@@ -1007,6 +1007,7 @@ async fn pipeline_replication_status_returns_table_states_and_names() {

     assert_eq!(response.pipeline_id, pipeline_id);
     assert_eq!(response.table_statuses.len(), 2);
+    assert!(response.apply_lag.is_none());

     // Verify table states
     for (table_oid, table_name) in &tables {
@@ -1017,6 +1018,7 @@ async fn pipeline_replication_status_returns_table_states_and_names() {
             .expect("Table not found in response");

         assert_eq!(table_status.table_id, table_oid.0);
+        assert!(table_status.table_sync_lag.is_none());

         match table_name.as_str() {
             "test.test_table_users" => assert!(matches!(
@@ -1025,7 +1027,7 @@ async fn pipeline_replication_status_returns_table_states_and_names() {
             )),
             "test.test_table_orders" => assert!(matches!(
                 table_status.state,
-                SimpleTableReplicationState::FollowingWal { .. }
+                SimpleTableReplicationState::FollowingWal
             )),
             _ => panic!("Unexpected table name: {table_name}"),
         }
@@ -1069,7 +1071,7 @@ async fn rollback_table_state_succeeds_for_manual_retry_errors() {
     assert_eq!(response.table_id, table_oid.0);
     assert!(matches!(
         response.new_state,
-        SimpleTableReplicationState::FollowingWal { .. }
+        SimpleTableReplicationState::FollowingWal
    ));

    drop_pg_database(&source_db_config).await;

etl-postgres/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -28,6 +28,7 @@ sqlx = { workspace = true, features = [
     "postgres",
     "json",
     "migrate",
+    "time",
 ] }
 thiserror = { workspace = true }
 tokio = { workspace = true, features = ["rt-multi-thread", "macros"] }

etl-postgres/src/replication/lag.rs

Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
+use sqlx::{FromRow, PgPool};
+use std::collections::BTreeMap;
+
+use crate::replication::slots::EtlReplicationSlot;
+use crate::types::TableId;
+
+/// Lag metrics associated with a logical replication slot.
+#[derive(Debug)]
+pub struct SlotLagMetrics {
+    /// The number of bytes between the current WAL LSN and the slot restart LSN.
+    pub restart_lsn_bytes: i64,
+    /// The number of bytes between the current WAL LSN and the confirmed flush LSN.
+    pub confirmed_flush_lsn_bytes: i64,
+    /// How many bytes of WAL are still safe to build up before the limit of the slot is reached.
+    pub safe_wal_size_bytes: i64,
+    /// Write lag in milliseconds relative to the primary.
+    pub write_lag_ms: Option<i64>,
+    /// Flush lag in milliseconds relative to the primary.
+    pub flush_lag_ms: Option<i64>,
+}
+
+/// Lag metrics for pipeline apply and table sync workers.
+#[derive(Debug, Default)]
+pub struct PipelineLagMetrics {
+    /// Lag metrics for the apply worker slot.
+    pub apply: Option<SlotLagMetrics>,
+    /// Lag metrics keyed by table OID for table sync worker slots.
+    pub table_sync: BTreeMap<TableId, SlotLagMetrics>,
+}
+
+/// Database row returned by the replication slot lag query.
+#[derive(Debug, FromRow)]
+struct SlotLagRow {
+    slot_name: String,
+    restart_lsn_bytes: i64,
+    confirmed_flush_lsn_bytes: i64,
+    safe_wal_size_bytes: i64,
+    write_lag_ms: Option<i64>,
+    flush_lag_ms: Option<i64>,
+}
+
+/// Fetches replication lag metrics for the given pipeline by inspecting logical replication slots.
+///
+/// Returns aggregated lag metrics for the apply worker slot and each table sync slot associated
+/// with the pipeline. Slots that are not currently active in `pg_stat_replication` still report
+/// their WAL metrics, while the write and flush lag values remain `None`.
+pub async fn get_pipeline_lag_metrics(
+    pool: &PgPool,
+    pipeline_id: u64,
+) -> sqlx::Result<PipelineLagMetrics> {
+    let Ok(apply_prefix) = EtlReplicationSlot::apply_prefix(pipeline_id) else {
+        return Ok(PipelineLagMetrics::default());
+    };
+    let Ok(table_sync_prefix) = EtlReplicationSlot::table_sync_prefix(pipeline_id) else {
+        return Ok(PipelineLagMetrics::default());
+    };
+
+    let rows: Vec<SlotLagRow> = sqlx::query_as(
+        r#"
+        select
+            s.slot_name,
+            coalesce(pg_wal_lsn_diff(pg_current_wal_lsn(), s.restart_lsn), 0)::bigint as restart_lsn_bytes,
+            coalesce(pg_wal_lsn_diff(pg_current_wal_lsn(), s.confirmed_flush_lsn), 0)::bigint as confirmed_flush_lsn_bytes,
+            coalesce(s.safe_wal_size, 0) as safe_wal_size_bytes,
+            round(extract(epoch from r.write_lag) * 1000)::bigint as write_lag_ms,
+            round(extract(epoch from r.flush_lag) * 1000)::bigint as flush_lag_ms
+        from pg_replication_slots as s
+        left outer join pg_stat_replication as r on s.active_pid = r.pid
+        where s.slot_type = 'logical'
+        and (s.slot_name = $1 or s.slot_name like $2)
+        "#,
+    )
+    .bind(apply_prefix)
+    .bind(format!("{table_sync_prefix}%"))
+    .fetch_all(pool)
+    .await?;
+
+    let mut metrics = PipelineLagMetrics::default();
+
+    for row in rows {
+        let slot_lag_metrics = SlotLagMetrics {
+            restart_lsn_bytes: row.restart_lsn_bytes,
+            confirmed_flush_lsn_bytes: row.confirmed_flush_lsn_bytes,
+            safe_wal_size_bytes: row.safe_wal_size_bytes,
+            write_lag_ms: row.write_lag_ms,
+            flush_lag_ms: row.flush_lag_ms,
+        };
+
+        match EtlReplicationSlot::try_from(row.slot_name.as_str()) {
+            Ok(EtlReplicationSlot::Apply {
+                pipeline_id: slot_pipeline_id,
+            }) if slot_pipeline_id == pipeline_id => {
+                metrics.apply = Some(slot_lag_metrics);
+            }
+            Ok(EtlReplicationSlot::TableSync {
+                pipeline_id: slot_pipeline_id,
+                table_id,
+            }) if slot_pipeline_id == pipeline_id => {
+                metrics.table_sync.insert(table_id, slot_lag_metrics);
+            }
+            _ => {}
+        }
+    }
+
+    Ok(metrics)
+}
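
A minimal caller sketch for the new module, assuming a consumer binary with `etl_postgres`, `sqlx`, and `tokio` as dependencies; the connection string and the pipeline id 1 are placeholders. Active slots populate the millisecond lag fields, while inactive ones only report the WAL byte distances:

    use etl_postgres::replication::lag;
    use sqlx::postgres::PgPoolOptions;

    #[tokio::main]
    async fn main() -> Result<(), Box<dyn std::error::Error>> {
        // Placeholder connection string; point this at the source Postgres instance.
        let pool = PgPoolOptions::new()
            .connect("postgres://postgres:postgres@localhost:5432/postgres")
            .await?;

        let metrics = lag::get_pipeline_lag_metrics(&pool, 1).await?;

        if let Some(apply) = &metrics.apply {
            println!(
                "apply slot: {} bytes behind, write lag: {:?} ms",
                apply.confirmed_flush_lsn_bytes, apply.write_lag_ms
            );
        }
        println!("{} table sync slot(s) reporting lag", metrics.table_sync.len());

        Ok(())
    }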

etl-postgres/src/replication/mod.rs

Lines changed: 1 addition & 0 deletions
@@ -1,5 +1,6 @@
 mod db;
 pub mod health;
+pub mod lag;
 pub mod schema;
 pub mod slots;
 pub mod state;
