
Commit 54a1e6f

chore: don't cache entire table in memory
1 parent ab51706

2 files changed: +42 -50 lines


src/sinks/delta_lake/service.rs

Lines changed: 38 additions & 49 deletions
@@ -47,8 +47,13 @@ impl DriverResponse for DeltaLakeResponse {
 /// This service writes Arrow RecordBatches directly to Delta Lake tables,
 /// avoiding the overhead of Parquet serialization/deserialization round-trips.
 ///
-/// The table is wrapped in RwLock to allow updating the cached snapshot after
-/// successful commits, reducing unnecessary conflict retries.
+/// ## Memory Management
+///
+/// The table is reloaded from storage before each write operation. This bounds
+/// memory usage by starting each write with a fresh snapshot from the latest
+/// checkpoint, rather than accumulating transaction log state across writes.
+/// This adds ~100-200ms latency per write but prevents OOM in long-running
+/// processes with high-version-count tables.
 ///
 /// ## Schema Evolution Support
 ///
@@ -115,12 +120,20 @@ impl Service<DeltaLakeRequest> for DeltaLakeService {
         // Use batches directly from request - no Parquet deserialization needed
         let batches = request.batches;
 
-        // Get a clone of the current table snapshot
+        // Get a clone of the table and reload fresh state from storage.
+        // This bounds memory usage by not accumulating transaction log state
+        // across writes - each write starts with a fresh snapshot loaded from
+        // the latest checkpoint.
         let mut table = {
             let table_guard = table_lock.read().await;
             table_guard.clone()
         };
 
+        // Reload to get fresh state from latest checkpoint
+        table.load().await.map_err(|e| {
+            DeltaTableError::Generic(format!("Failed to load table before write: {}", e))
+        })?;
+
         // Retry loop for handling concurrent transaction conflicts and schema evolution
         // When optimization operations (like z-order) rewrite files, or when schema changes,
         // we need to reload the table snapshot and retry the write
@@ -155,52 +168,28 @@ impl Service<DeltaLakeRequest> for DeltaLakeService {
             // Execute write and commit
             match write_builder.await {
                 Ok(new_table) => {
-                    // Success! Update the cached table to the latest version
-                    // Only update if our committed version is newer than the cached version
-                    // This prevents "race to the bottom" where slower requests could
-                    // overwrite newer state with older state
-                    {
-                        let mut table_guard = table_lock.write().await;
-
-                        let new_version = new_table.version();
-                        let cached_version = table_guard.version();
-
-                        if new_version > cached_version {
-                            // Get schema before moving new_table to avoid unnecessary clone
-                            let new_schema = new_table.schema();
-
-                            // Log schema evolution if new fields were added
-                            let old_schema = shared_schema.load();
-                            let new_fields: Vec<_> = new_schema
-                                .fields()
-                                .iter()
-                                .filter(|f| old_schema.field_with_name(f.name()).is_err())
-                                .map(|f| f.name().as_str())
-                                .collect();
-
-                            if !new_fields.is_empty() {
-                                info!(
-                                    message = "Schema evolution: new fields added to table",
-                                    new_fields = ?new_fields,
-                                    total_fields = new_schema.fields().len(),
-                                    version = new_version,
-                                );
-                            }
-
-                            // Update schema cache while holding table lock to keep them in sync
-                            // TableProvider::schema() returns the Arrow schema directly
-                            shared_schema.store(new_schema);
-
-                            // Update table cache - move instead of clone
-                            *table_guard = new_table;
-                        } else {
-                            debug!(
-                                message =
-                                    "Skipping cache update - cached version is newer or equal",
-                                new_version = new_version,
-                                cached_version = cached_version,
-                            );
-                        }
+                    // Update schema cache for request builder (used for schema evolution)
+                    // We don't cache the full table state since we reload before each write,
+                    // which bounds memory usage by not accumulating transaction log state.
+                    let new_schema = new_table.schema();
+                    let old_schema = shared_schema.load();
+
+                    // Log and update schema if new fields were added
+                    let new_fields: Vec<_> = new_schema
+                        .fields()
+                        .iter()
+                        .filter(|f| old_schema.field_with_name(f.name()).is_err())
+                        .map(|f| f.name().as_str())
+                        .collect();
+
+                    if !new_fields.is_empty() {
+                        info!(
+                            message = "Schema evolution: new fields added to table",
+                            new_fields = ?new_fields,
+                            total_fields = new_schema.fields().len(),
+                            version = new_table.version(),
+                        );
+                        shared_schema.store(new_schema);
                     }
 
                     // Get the byte size from the request (Arrow in-memory size)
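For orientation, below is a minimal sketch of the clone-then-reload flow the hunks above introduce. It assumes the deltalake and tokio crates this sink already depends on; the function names write_with_fresh_snapshot and write_batches, and the exact shape of the table_lock parameter, are illustrative stand-ins rather than the service's actual code, and the real implementation drives a WriteBuilder with conflict retries and schema-cache updates instead of the placeholder shown here.

use deltalake::arrow::record_batch::RecordBatch;
use deltalake::{DeltaTable, DeltaTableError};
use tokio::sync::RwLock;

// Sketch of the per-write pattern from this commit (names are illustrative).
async fn write_with_fresh_snapshot(
    table_lock: &RwLock<DeltaTable>,
    batches: Vec<RecordBatch>,
) -> Result<DeltaTable, DeltaTableError> {
    // Hold the read lock only long enough to clone the table handle; the
    // write runs against the clone, never against the shared cached table.
    let mut table = {
        let table_guard = table_lock.read().await;
        table_guard.clone()
    };

    // Reload from storage so this write starts from the latest checkpoint
    // instead of reusing (and growing) a long-lived in-memory snapshot.
    table.load().await.map_err(|e| {
        DeltaTableError::Generic(format!("Failed to load table before write: {}", e))
    })?;

    // Hypothetical stand-in for the service's write/commit step; the real
    // code builds and awaits a Delta write operation and retries on conflict.
    write_batches(table, batches).await
}

// Placeholder for the actual write path (illustrative only).
async fn write_batches(
    table: DeltaTable,
    _batches: Vec<RecordBatch>,
) -> Result<DeltaTable, DeltaTableError> {
    Ok(table)
}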

src/sources/delta_lake_cdf/source.rs

Lines changed: 4 additions & 1 deletion
@@ -304,7 +304,10 @@ async fn create_cdf_stream(
     table: &DeltaTable,
     start_version: i64,
     end_version: i64,
-) -> Result<impl futures::Stream<Item = Result<deltalake::arrow::record_batch::RecordBatch, DeltaTableError>>, DeltaTableError> {
+) -> Result<
+    impl futures::Stream<Item = Result<deltalake::arrow::record_batch::RecordBatch, DeltaTableError>>,
+    DeltaTableError,
+> {
     // Clone table and create CDF builder
     let cdf_builder = table
         .clone()
