Skip to content

Commit baf7c35

Browse files
committed
[V2 Streaming] Integrate DataFrame-based initial snapshot behind feature flag
1 parent 2fddd1c commit baf7c35

File tree

3 files changed

+168
-0
lines changed

3 files changed

+168
-0
lines changed

spark/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSQLConf.scala

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3130,6 +3130,18 @@ trait DeltaSQLConfBase extends DeltaSQLConfUtils {
31303130
.doc("Maximum number of files allowed in initial snapshot for V2 streaming.")
31313131
.intConf
31323132
.createWithDefault(50000)
3133+
3134+
// Feature flag for the distributed (DataFrame-based) initial snapshot path of the
// V2 streaming connector. Internal and off by default; when enabled, Kernel log
// replay runs on an executor and file ordering uses Spark's distributed sort
// instead of materializing the full file list on the driver.
val DELTA_STREAMING_USE_DATAFRAME_INITIAL_SNAPSHOT =
  buildConf("streaming.distributedInitialSnapshot")
    .internal()
    .doc(
      "When enabled, the V2 streaming connector uses a DataFrame-based approach for " +
      "initial snapshot loading. This avoids driver OOM for large tables by running " +
      "Kernel log replay on an executor and sorting files via Spark's distributed sort " +
      "with MEMORY_AND_DISK persistence."
    )
    .booleanConf
    .createWithDefault(false)
31333145
}
31343146

31353147
object DeltaSQLConf extends DeltaSQLConfBase

spark/v2/src/main/java/io/delta/spark/internal/v2/read/SparkMicroBatchStream.java

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,11 @@ private static class InitialSnapshotCache {
154154
private final AtomicReference<InitialSnapshotCache> cachedInitialSnapshot =
155155
new AtomicReference<>(null);
156156

157+
private final AtomicReference<DataFrameSnapshotCache> cachedDataFrameSnapshot =
158+
new AtomicReference<>(null);
159+
157160
private final int maxInitialSnapshotFiles;
161+
private final boolean useDataFrameBasedInitialSnapshot;
158162

159163
public SparkMicroBatchStream(
160164
DeltaSnapshotManager snapshotManager,
@@ -204,6 +208,12 @@ public SparkMicroBatchStream(
204208
.sessionState()
205209
.conf()
206210
.getConf(DeltaSQLConf.DELTA_STREAMING_INITIAL_SNAPSHOT_MAX_FILES());
211+
this.useDataFrameBasedInitialSnapshot =
212+
(Boolean)
213+
spark
214+
.sessionState()
215+
.conf()
216+
.getConf(DeltaSQLConf.DELTA_STREAMING_USE_DATAFRAME_INITIAL_SNAPSHOT());
207217

208218
boolean isStreamingFromColumnMappingTable =
209219
ColumnMapping.getColumnMappingMode(
@@ -450,6 +460,14 @@ public void commit(Offset end) {
450460
@Override
public void stop() {
  // Release both snapshot caches so a stopped stream does not pin memory: the
  // list-based cache is dropped by clearing the reference, while the DataFrame-based
  // cache must additionally be closed (via invalidateDataFrameCache) to release the
  // resources held by its cached DataFrame.
  cachedInitialSnapshot.set(null);
  invalidateDataFrameCache();
}
465+
466+
private void invalidateDataFrameCache() {
467+
DataFrameSnapshotCache prev = cachedDataFrameSnapshot.getAndSet(null);
468+
if (prev != null) {
469+
prev.close();
470+
}
453471
}
454472

455473
///////////////////////
@@ -1042,6 +1060,10 @@ private long addIndexedFilesAndReturnNextIndex(
10421060
* @return An iterator of IndexedFile representing the snapshot files
10431061
*/
10441062
private CloseableIterator<IndexedFile> getSnapshotFiles(long version) {
1063+
if (useDataFrameBasedInitialSnapshot) {
1064+
return getSnapshotFilesViaDataFrame(version);
1065+
}
1066+
10451067
InitialSnapshotCache cache = cachedInitialSnapshot.get();
10461068

10471069
if (cache != null && cache.version != null && cache.version == version) {
@@ -1056,6 +1078,78 @@ private CloseableIterator<IndexedFile> getSnapshotFiles(long version) {
10561078
return Utils.toCloseableIterator(indexedFiles.iterator());
10571079
}
10581080

1081+
private CloseableIterator<IndexedFile> getSnapshotFilesViaDataFrame(long version) {
1082+
DataFrameSnapshotCache dfCache = cachedDataFrameSnapshot.get();
1083+
if (dfCache != null && dfCache.getVersion() == version) {
1084+
return dataFrameToIndexedFiles(dfCache.getSortedAddFiles(), version);
1085+
}
1086+
1087+
invalidateDataFrameCache();
1088+
1089+
SnapshotImpl snapshot = (SnapshotImpl) snapshotManager.loadSnapshotAt(version);
1090+
io.delta.spark.internal.v2.utils.SerializableReadOnlySnapshot serSnapshot =
1091+
io.delta.spark.internal.v2.utils.SerializableReadOnlySnapshot.fromSnapshot(
1092+
snapshot, hadoopConf);
1093+
1094+
ScanFileRDD rdd = new ScanFileRDD(spark.sparkContext(), serSnapshot);
1095+
org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df =
1096+
spark
1097+
.createDataFrame(rdd, ScanFileRDD.SPARK_SCHEMA)
1098+
.orderBy("modificationTime", "path")
1099+
.persist(org.apache.spark.storage.StorageLevel.MEMORY_AND_DISK());
1100+
1101+
dfCache = new DataFrameSnapshotCache(version, df);
1102+
cachedDataFrameSnapshot.set(dfCache);
1103+
1104+
return dataFrameToIndexedFiles(df, version);
1105+
}
1106+
1107+
/**
1108+
* Converts a sorted DataFrame of AddFile rows into a lazy CloseableIterator of IndexedFiles,
1109+
* wrapped with BEGIN/END sentinels. Uses toLocalIterator() to stream rows from executors to the
1110+
* driver one at a time, avoiding pulling all data into driver memory.
1111+
*/
1112+
private static CloseableIterator<IndexedFile> dataFrameToIndexedFiles(
1113+
org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df, long version) {
1114+
1115+
java.util.Iterator<org.apache.spark.sql.Row> localIter = df.toLocalIterator();
1116+
1117+
return new CloseableIterator<IndexedFile>() {
1118+
private boolean sentBegin = false;
1119+
private boolean sentEnd = false;
1120+
private long index = 0;
1121+
1122+
@Override
1123+
public boolean hasNext() {
1124+
return !sentEnd;
1125+
}
1126+
1127+
@Override
1128+
public IndexedFile next() {
1129+
if (!sentBegin) {
1130+
sentBegin = true;
1131+
return new IndexedFile(version, DeltaSourceOffset.BASE_INDEX(), null);
1132+
}
1133+
1134+
if (localIter.hasNext()) {
1135+
org.apache.spark.sql.Row sparkRow = localIter.next();
1136+
io.delta.kernel.data.Row kernelRow =
1137+
new io.delta.spark.internal.v2.utils.SparkRowToKernelRow(
1138+
sparkRow, AddFile.SCHEMA_WITHOUT_STATS);
1139+
return new IndexedFile(version, index++, new AddFile(kernelRow));
1140+
}
1141+
1142+
sentEnd = true;
1143+
return new IndexedFile(version, DeltaSourceOffset.END_INDEX(), null);
1144+
}
1145+
1146+
@Override
1147+
public void close() throws IOException {
1148+
// toLocalIterator() resources are managed by Spark
1149+
}
1150+
};
1151+
}
1152+
10591153
/** Loads snapshot files at the specified version. */
10601154
private List<IndexedFile> loadAndValidateSnapshot(long version) {
10611155
Snapshot snapshot = snapshotManager.loadSnapshotAt(version);

spark/v2/src/test/java/io/delta/spark/internal/v2/read/SparkMicroBatchStreamTest.java

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2866,4 +2866,66 @@ public void testMemoryProtection_initialSnapshotTooLarge(@TempDir File tempDir)
28662866
spark.conf().unset(configKey);
28672867
}
28682868
}
2869+
2870+
@Test
2871+
public void testDataFrameBasedInitialSnapshot_handlesLargeSnapshot(@TempDir File tempDir)
2872+
throws Exception {
2873+
String testTablePath = tempDir.getAbsolutePath();
2874+
String testTableName = "test_df_snapshot_" + System.nanoTime();
2875+
createEmptyTestTable(testTablePath, testTableName);
2876+
2877+
insertVersions(
2878+
testTableName,
2879+
/* numVersions= */ 10,
2880+
/* rowsPerVersion= */ 5,
2881+
/* includeEmptyVersion= */ false);
2882+
2883+
String maxFilesKey = DeltaSQLConf.DELTA_STREAMING_INITIAL_SNAPSHOT_MAX_FILES().key();
2884+
String dfFlagKey = DeltaSQLConf.DELTA_STREAMING_USE_DATAFRAME_INITIAL_SNAPSHOT().key();
2885+
spark.conf().set(maxFilesKey, "5");
2886+
spark.conf().set(dfFlagKey, "true");
2887+
2888+
try {
2889+
Configuration hadoopConf = spark.sessionState().newHadoopConf();
2890+
PathBasedSnapshotManager snapshotManager =
2891+
new PathBasedSnapshotManager(testTablePath, hadoopConf);
2892+
SparkMicroBatchStream stream =
2893+
createTestStreamWithDefaults(snapshotManager, hadoopConf, emptyDeltaOptions());
2894+
2895+
long version = 5L;
2896+
long fromIndex = DeltaSourceOffset.BASE_INDEX();
2897+
boolean isInitialSnapshot = true;
2898+
2899+
List<IndexedFile> files = new ArrayList<>();
2900+
try (CloseableIterator<IndexedFile> iter =
2901+
stream.getFileChanges(version, fromIndex, isInitialSnapshot, Optional.empty())) {
2902+
while (iter.hasNext()) {
2903+
files.add(iter.next());
2904+
}
2905+
}
2906+
2907+
// Should succeed (no exception) and include BEGIN/END sentinels + data files
2908+
assertTrue(files.size() >= 3, "Should have at least BEGIN, one file, and END sentinels");
2909+
assertEquals(DeltaSourceOffset.BASE_INDEX(), files.get(0).index);
2910+
assertEquals(DeltaSourceOffset.END_INDEX(), files.get(files.size() - 1).index);
2911+
2912+
// Verify data files are sorted by (modificationTime, path)
2913+
for (int i = 2; i < files.size() - 1; i++) {
2914+
IndexedFile prev = files.get(i - 1);
2915+
IndexedFile curr = files.get(i);
2916+
if (prev.getAdd() != null && curr.getAdd() != null) {
2917+
long prevTime = prev.getAdd().getModificationTime();
2918+
long currTime = curr.getAdd().getModificationTime();
2919+
assertTrue(
2920+
prevTime < currTime
2921+
|| (prevTime == currTime
2922+
&& prev.getAdd().getPath().compareTo(curr.getAdd().getPath()) <= 0),
2923+
"Files should be sorted by (modificationTime, path)");
2924+
}
2925+
}
2926+
} finally {
2927+
spark.conf().unset(maxFilesKey);
2928+
spark.conf().unset(dfFlagKey);
2929+
}
2930+
}
28692931
}

0 commit comments

Comments (0)