
Commit b78ca30

fix(bigquery): Allow missing table schema when truncate is issued
1 parent 2357826

4 files changed: +36 -9 lines changed


etl-destinations/src/bigquery/core.rs

Lines changed: 18 additions & 3 deletions
@@ -654,7 +654,7 @@ where
         }
 
         if !truncate_table_ids.is_empty() {
-            self.process_truncate_for_table_ids(truncate_table_ids.into_iter())
+            self.process_truncate_for_table_ids(truncate_table_ids.into_iter(), true)
                 .await?;
         }
     }
@@ -670,13 +670,28 @@ where
     async fn process_truncate_for_table_ids(
         &self,
         table_ids: impl IntoIterator<Item = TableId>,
+        is_cdc_truncate: bool,
     ) -> EtlResult<()> {
         // We want to lock for the entire processing to ensure that we don't have any race conditions
         // and possible errors are easier to reason about.
         let mut inner = self.inner.lock().await;
 
        for table_id in table_ids {
-            let table_schema = self.store.get_table_schema(&table_id).await?.ok_or_else(|| etl_error!(
+            let table_schema = self.store.get_table_schema(&table_id).await?;
+            // If we are not doing CDC, it means that this truncation has been issued while recovering
+            // from a failed data sync operation. In that case, we could have failed before table schemas
+            // were stored in the schema store, so we just continue and emit a warning. If we are doing
+            // CDC, it's a problem if the schema disappears while streaming, so we error out.
+            if !is_cdc_truncate {
+                warn!(
+                    "the table schema for table {table_id} was not found in the schema store while processing truncate events for BigQuery",
+                    table_id = table_id.to_string()
+                );
+
+                continue;
+            }
+
+            let table_schema = table_schema.ok_or_else(|| etl_error!(
                 ErrorKind::MissingTableSchema,
                 "Table not found in the schema store",
                 format!(
@@ -785,7 +800,7 @@ where
     S: StateStore + SchemaStore + Send + Sync,
 {
     async fn truncate_table(&self, table_id: TableId) -> EtlResult<()> {
-        self.process_truncate_for_table_ids(iter::once(table_id))
+        self.process_truncate_for_table_ids(iter::once(table_id), false)
             .await
     }
 
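
As a reading aid, a minimal sketch of the missing-schema decision described by the comment in this diff (names and types here are illustrative only, not the actual implementation):

// Sketch: what happens when the schema store has no entry for a table.
// is_cdc_truncate = true  -> the truncate came from a CDC TRUNCATE event.
// is_cdc_truncate = false -> the truncate came from recovery after a failed data sync.
fn handle_missing_schema(table_id: u32, is_cdc_truncate: bool) -> Result<(), String> {
    if is_cdc_truncate {
        // Schemas are not expected to disappear while streaming changes: hard error.
        return Err(format!("missing table schema for table {table_id}"));
    }
    // Recovery path: the schema may never have been stored, so warn and skip the table.
    eprintln!("warning: no schema for table {table_id}, skipping truncate");
    Ok(())
}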

etl/src/failpoints.rs

Lines changed: 2 additions & 1 deletion
@@ -8,7 +8,8 @@ use fail::fail_point;
 use crate::bail;
 use crate::error::{ErrorKind, EtlError, EtlResult};
 
-pub const START_TABLE_SYNC__AFTER_DATA_SYNC: &str = "start_table_sync.after_data_sync";
+pub const START_TABLE_SYNC__BEFORE_DATA_SYNC_SLOT_CREATION: &str =
+    "start_table_sync.befor_data_sync_slot_creation";
 pub const START_TABLE_SYNC__DURING_DATA_SYNC: &str = "start_table_sync.during_data_sync";
 
 /// Executes a configurable failpoint for testing error scenarios.
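
For orientation, the constants above name failpoints that tests arm via fail::cfg and that library code evaluates through the etl_fail_point helper documented above. A rough sketch of the underlying fail crate mechanism, as an assumption about how such a helper is typically built (its actual body is not part of this diff):

use fail::fail_point;

// Sketch: a step guarded by a named failpoint. When a test arms this name with
// fail::cfg(name, "1*return(...)"), the closure runs and its value is returned
// early from the enclosing function; when the failpoint is unarmed it is a no-op.
fn guarded_step() -> Result<(), String> {
    fail_point!("start_table_sync.befor_data_sync_slot_creation", |payload| {
        Err(format!("injected failure with payload {payload:?}"))
    });
    Ok(())
}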

etl/src/replication/table_sync.rs

Lines changed: 3 additions & 2 deletions
@@ -18,7 +18,8 @@ use crate::destination::Destination;
 use crate::error::{ErrorKind, EtlError, EtlResult};
 #[cfg(feature = "failpoints")]
 use crate::failpoints::{
-    START_TABLE_SYNC__AFTER_DATA_SYNC, START_TABLE_SYNC__DURING_DATA_SYNC, etl_fail_point,
+    START_TABLE_SYNC__BEFORE_DATA_SYNC_SLOT_CREATION, START_TABLE_SYNC__DURING_DATA_SYNC,
+    etl_fail_point,
 };
 use crate::metrics::{
     ETL_BATCH_SEND_MILLISECONDS_TOTAL, ETL_BATCH_SIZE, ETL_TABLE_SYNC_ROWS_COPIED_TOTAL,
@@ -165,7 +166,7 @@ where
 
         // Fail point to test when the table sync fails before copying data.
         #[cfg(feature = "failpoints")]
-        etl_fail_point(START_TABLE_SYNC__AFTER_DATA_SYNC)?;
+        etl_fail_point(START_TABLE_SYNC__BEFORE_DATA_SYNC_SLOT_CREATION)?;
 
         // We create the slot with a transaction, since we need to have a consistent snapshot of the database
         // before copying the schema and tables.

etl/tests/failpoints/pipeline_test.rs

Lines changed: 13 additions & 3 deletions
@@ -1,6 +1,8 @@
 use etl::destination::memory::MemoryDestination;
 use etl::error::ErrorKind;
-use etl::failpoints::{START_TABLE_SYNC__AFTER_DATA_SYNC, START_TABLE_SYNC__DURING_DATA_SYNC};
+use etl::failpoints::{
+    START_TABLE_SYNC__BEFORE_DATA_SYNC_SLOT_CREATION, START_TABLE_SYNC__DURING_DATA_SYNC,
+};
 use etl::state::table::TableReplicationPhaseType;
 use etl::test_utils::database::spawn_source_database;
 use etl::test_utils::notify::NotifyingStore;
@@ -15,7 +17,11 @@ use rand::random;
 #[tokio::test(flavor = "multi_thread")]
 async fn table_copy_fails_after_data_sync_threw_an_error_with_no_retry() {
     let _scenario = FailScenario::setup();
-    fail::cfg(START_TABLE_SYNC__AFTER_DATA_SYNC, "1*return(no_retry)").unwrap();
+    fail::cfg(
+        START_TABLE_SYNC__BEFORE_DATA_SYNC_SLOT_CREATION,
+        "1*return(no_retry)",
+    )
+    .unwrap();
 
     init_test_tracing();
 
@@ -73,7 +79,11 @@ async fn table_copy_fails_after_data_sync_threw_an_error_with_no_retry() {
 #[tokio::test(flavor = "multi_thread")]
 async fn table_copy_is_consistent_after_data_sync_threw_an_error_with_timed_retry() {
     let _scenario = FailScenario::setup();
-    fail::cfg(START_TABLE_SYNC__AFTER_DATA_SYNC, "1*return(timed_retry)").unwrap();
+    fail::cfg(
+        START_TABLE_SYNC__BEFORE_DATA_SYNC_SLOT_CREATION,
+        "1*return(timed_retry)",
+    )
+    .unwrap();
 
     init_test_tracing();
 
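For reference, the fail crate action string "1*return(no_retry)" used in these tests arms the failpoint to fire exactly once, taking the return action with the payload "no_retry"; "1*return(timed_retry)" does the same with the payload "timed_retry". The payload is what the failpoint hands to its closure, which is presumably how the ETL code selects between the non-retryable and timed-retry error paths.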