Skip to content

Commit ecf0b8f

Browse files
committed
Refactor selection vector logic into StreamingHelper.getAddFile
- Add an overloaded getAddFile(FilteredColumnarBatch, rowId) that respects the selection vector
- Selection vectors filter out duplicate files from stats re-collection
- Simplify SparkMicroBatchStream.loadAndValidateSnapshot by delegating to the helper
- Add V2 DeletionVectors streaming tests
1 parent e56e5e0 commit ecf0b8f

File tree

4 files changed

+138
-45
lines changed

4 files changed

+138
-45
lines changed
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
/*
2+
* Copyright (2021) The Delta Lake Project Authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package org.apache.spark.sql.delta.test
18+
19+
import org.apache.spark.sql.delta.{DeltaSourceDeletionVectorTests, DeltaSourceSuiteBase, PersistentDVEnabled}
20+
import org.apache.spark.sql.delta.sources.DeltaSQLConf
21+
22+
/**
 * Test suite that runs DeltaSourceDeletionVectorTests using the V2 connector.
 *
 * Shared streaming DV tests come from [[DeltaSourceDeletionVectorTests]]; this
 * suite forces the V2 read path and classifies each inherited test as
 * expected-pass or expected-fail until V2 reaches feature parity.
 */
class DeltaSourceV2DeletionVectorsSuite extends DeltaSourceSuiteBase
    with DeltaSQLCommandTest
    with DeltaSourceDeletionVectorTests
    with PersistentDVEnabled
    with V2ForceTest {

  override protected def useDsv2: Boolean = true

  /**
   * Runs `sqlText` through the V1 connector. Write operations (DELETE/INSERT)
   * are not yet supported by the V2 connector, so DML issued by the shared
   * tests is temporarily redirected to V1.
   */
  override protected def executeSql(sqlText: String): Unit =
    withSQLConf(DeltaSQLConf.V2_ENABLE_MODE.key -> "NONE") {
      sql(sqlText)
    }

  // Inherited tests expected to pass on the V2 read path.
  // NOTE: names must match the upstream suite byte-for-byte (including the
  // pre-existing "staring" typo).
  private lazy val shouldPassTests = Set(
    "allow to delete files before starting a streaming query",
    "allow to delete files before staring a streaming query without checkpoint",
    "multiple deletion vectors per file with initial snapshot"
  )

  // Inherited tests expected to fail on V2.
  private lazy val shouldFailTests = Set(
    // These tests use ignoreDeletes/ignoreChanges options not yet supported in V2
    "deleting files fails query if ignoreDeletes = false",
    "allow to delete files after staring a streaming query when ignoreFileDeletion is true",
    "allow to delete files after staring a streaming query when ignoreDeletes is true",
    "updating the source table causes failure when ignoreChanges = false - using DELETE",
    "allow to update the source table when ignoreChanges = true - using DELETE",
    "deleting files when ignoreChanges = true doesn't fail the query",
    "updating source table when ignoreDeletes = true fails the query - using DELETE",
    "subsequent DML commands are processed correctly in a batch - DELETE->DELETE - List()",
    "subsequent DML commands are processed correctly in a batch - DELETE->DELETE" +
      " - List((ignoreDeletes,true))",
    "subsequent DML commands are processed correctly in a batch - DELETE->DELETE" +
      " - List((ignoreChanges,true))",
    "subsequent DML commands are processed correctly in a batch - DELETE->DELETE" +
      " - List((skipChangeCommits,true))",
    "subsequent DML commands are processed correctly in a batch - INSERT->DELETE - List()",
    "subsequent DML commands are processed correctly in a batch - INSERT->DELETE" +
      " - List((ignoreDeletes,true))",
    "subsequent DML commands are processed correctly in a batch - INSERT->DELETE" +
      " - List((ignoreChanges,true))",
    "subsequent DML commands are processed correctly in a batch - INSERT->DELETE" +
      " - List((skipChangeCommits,true))",
    "multiple deletion vectors per file - List((ignoreFileDeletion,true))",
    "multiple deletion vectors per file - List((ignoreChanges,true))"
  )

  /**
   * Classifies `testName` as expected-fail. Every inherited test must be
   * listed in exactly one of the two sets; membership in neither (or both)
   * indicates a stale classification and trips an assertion.
   */
  override protected def shouldFail(testName: String): Boolean = {
    val passes = shouldPassTests(testName)
    val fails = shouldFailTests(testName)

    assert(passes || fails, s"Test '$testName' not in shouldPassTests or shouldFailTests")
    assert(!passes || !fails,
      s"Test '$testName' in both shouldPassTests and shouldFailTests")

    fails
  }
}

spark/src/test/scala/org/apache/spark/sql/delta/DeltaSourceDeletionVectorsSuite.scala

Lines changed: 23 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ import java.io.File
2121
import scala.util.control.NonFatal
2222

2323
import org.apache.spark.sql.delta.Relocated.StreamExecution
24-
import org.apache.spark.sql.delta.test.DeltaSQLCommandTest
24+
import org.apache.spark.sql.delta.test.{DeltaSQLCommandTest, DeltaSQLTestUtils}
2525
import org.apache.hadoop.fs.Path
2626
import org.scalatest.concurrent.Eventually
2727
import org.scalatest.concurrent.PatienceConfiguration.Timeout
@@ -30,28 +30,31 @@ import org.apache.spark.sql.streaming.{StreamTest, Trigger}
3030
import org.apache.spark.sql.streaming.util.StreamManualClock
3131

3232
trait DeltaSourceDeletionVectorTests extends StreamTest
33-
with DeletionVectorsTestUtils {
33+
with DeletionVectorsTestUtils
34+
with DeltaSourceConnectorTrait {
35+
self: DeltaSQLTestUtils =>
3436

3537
import testImplicits._
3638

39+
/** Execute SQL statement. Override in V2 tests to use V1 connector for write operations. */
40+
protected def executeSql(sqlText: String): Unit = sql(sqlText)
41+
3742
test("allow to delete files before starting a streaming query") {
3843
withTempDir { inputDir =>
3944
val deltaLog = DeltaLog.forTable(spark, new Path(inputDir.toURI))
4045
(0 until 5).foreach { i =>
4146
val v = Seq(i.toString).toDF
4247
v.write.mode("append").format("delta").save(deltaLog.dataPath.toString)
4348
}
44-
sql(s"DELETE FROM delta.`$inputDir`")
49+
executeSql(s"DELETE FROM delta.`$inputDir`")
4550
(5 until 10).foreach { i =>
4651
val v = Seq(i.toString).toDF
4752
v.write.mode("append").format("delta").save(deltaLog.dataPath.toString)
4853
}
4954
deltaLog.checkpoint()
5055
assert(deltaLog.readLastCheckpointFile().nonEmpty, "this test requires a checkpoint")
5156

52-
val df = spark.readStream
53-
.format("delta")
54-
.load(inputDir.getCanonicalPath)
57+
val df = loadStreamWithOptions(inputDir.getCanonicalPath, Map.empty)
5558

5659
testStream(df)(
5760
AssertOnQuery { q =>
@@ -69,16 +72,14 @@ trait DeltaSourceDeletionVectorTests extends StreamTest
6972
val v = Seq(i.toString).toDF
7073
v.write.mode("append").format("delta").save(deltaLog.dataPath.toString)
7174
}
72-
sql(s"DELETE FROM delta.`$inputDir`")
75+
executeSql(s"DELETE FROM delta.`$inputDir`")
7376
(5 until 7).foreach { i =>
7477
val v = Seq(i.toString).toDF
7578
v.write.mode("append").format("delta").save(deltaLog.dataPath.toString)
7679
}
7780
assert(deltaLog.readLastCheckpointFile().isEmpty, "this test requires no checkpoint")
7881

79-
val df = spark.readStream
80-
.format("delta")
81-
.load(inputDir.getCanonicalPath)
82+
val df = loadStreamWithOptions(inputDir.getCanonicalPath, Map.empty)
8283

8384
testStream(df)(
8485
AssertOnQuery { q =>
@@ -115,7 +116,7 @@ trait DeltaSourceDeletionVectorTests extends StreamTest
115116
Seq(i, i + 1).toDF().coalesce(1).write.format("delta").mode("append").save(inputDir)
116117
}
117118

118-
val df = spark.readStream.format("delta").options(sourceOptions.toMap).load(inputDir)
119+
val df = loadStreamWithOptions(inputDir, sourceOptions.toMap)
119120
val expectDVs = commandShouldProduceDVs.getOrElse(
120121
sqlCommand.toUpperCase().startsWith("DELETE"))
121122

@@ -126,7 +127,7 @@ trait DeltaSourceDeletionVectorTests extends StreamTest
126127
},
127128
CheckAnswer((0 until 10): _*),
128129
AssertOnQuery { q =>
129-
sql(sqlCommand)
130+
executeSql(sqlCommand)
130131
deletionVectorsPresentIfExpected(inputDir, expectDVs)
131132
})
132133

@@ -148,7 +149,7 @@ trait DeltaSourceDeletionVectorTests extends StreamTest
148149
}
149150
val log = DeltaLog.forTable(spark, inputDir)
150151
val commitVersionBeforeDML = log.update().version
151-
val df = spark.readStream.format("delta").options(sourceOptions.toMap).load(inputDir)
152+
val df = loadStreamWithOptions(inputDir, sourceOptions.toMap)
152153
def expectDVsInCommand(shouldProduceDVs: Option[Boolean], command: String): Boolean = {
153154
shouldProduceDVs.getOrElse(command.toUpperCase().startsWith("DELETE"))
154155
}
@@ -177,11 +178,11 @@ trait DeltaSourceDeletionVectorTests extends StreamTest
177178
true
178179
},
179180
AssertOnQuery { q =>
180-
sql(sqlCommand1)
181+
executeSql(sqlCommand1)
181182
deletionVectorsPresentIfExpected(inputDir, expectDVsInCommand1)
182183
},
183184
AssertOnQuery { q =>
184-
sql(sqlCommand2)
185+
executeSql(sqlCommand2)
185186
deletionVectorsPresentIfExpected(inputDir, expectDVsInCommand2)
186187
},
187188
AssertOnQuery { q =>
@@ -416,21 +417,19 @@ trait DeltaSourceDeletionVectorTests extends StreamTest
416417
(0 until 10).toDF("value").coalesce(1).write.format("delta").save(path)
417418

418419
// V1: Delete row 0
419-
sql(s"DELETE FROM delta.`$path` WHERE value = 0")
420+
executeSql(s"DELETE FROM delta.`$path` WHERE value = 0")
420421

421422
// V2: Delete row 1
422-
sql(s"DELETE FROM delta.`$path` WHERE value = 1")
423+
executeSql(s"DELETE FROM delta.`$path` WHERE value = 1")
423424

424425
// V3: Delete row 2
425-
sql(s"DELETE FROM delta.`$path` WHERE value = 2")
426+
executeSql(s"DELETE FROM delta.`$path` WHERE value = 2")
426427

427428
// Verify DVs are present
428429
assert(getFilesWithDeletionVectors(deltaLog).nonEmpty,
429430
"This test requires deletion vectors to be present")
430431

431-
val df = spark.readStream
432-
.format("delta")
433-
.load(path)
432+
val df = loadStreamWithOptions(path, Map.empty)
434433

435434
testStream(df)(
436435
// Process the initial snapshot
@@ -457,10 +456,7 @@ trait DeltaSourceDeletionVectorTests extends StreamTest
457456
// V0: 10 rows in a single file
458457
(0 until 10).toDF("value").coalesce(1).write.format("delta").save(path)
459458

460-
val df = spark.readStream
461-
.format("delta")
462-
.options(sourceOptions.toMap)
463-
.load(path)
459+
val df = loadStreamWithOptions(path, sourceOptions.toMap)
464460

465461
testStream(df)(
466462
AssertOnQuery { q =>
@@ -470,12 +466,12 @@ trait DeltaSourceDeletionVectorTests extends StreamTest
470466
CheckAnswer((0 until 10): _*),
471467
AssertOnQuery { q =>
472468
// V1: Delete row 0 - creates first DV (version 1)
473-
sql(s"DELETE FROM delta.`$path` WHERE value = 0")
469+
executeSql(s"DELETE FROM delta.`$path` WHERE value = 0")
474470
true
475471
},
476472
AssertOnQuery { q =>
477473
// V2: Delete row 1 - updates DV (version 2). DV is cumulative: {0, 1}
478-
sql(s"DELETE FROM delta.`$path` WHERE value = 1")
474+
executeSql(s"DELETE FROM delta.`$path` WHERE value = 1")
479475
true
480476
},
481477
AssertOnQuery { q =>

spark/v2/src/main/java/io/delta/spark/internal/v2/read/SparkMicroBatchStream.java

Lines changed: 5 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
import io.delta.kernel.CommitRange;
2323
import io.delta.kernel.Scan;
2424
import io.delta.kernel.Snapshot;
25-
import io.delta.kernel.data.ColumnVector;
2625
import io.delta.kernel.data.ColumnarBatch;
2726
import io.delta.kernel.data.FilteredColumnarBatch;
2827
import io.delta.kernel.defaults.engine.DefaultEngine;
@@ -1067,24 +1066,13 @@ private List<IndexedFile> loadAndValidateSnapshot(long version) {
10671066
try (CloseableIterator<FilteredColumnarBatch> filesIter = scan.getScanFiles(engine)) {
10681067
while (filesIter.hasNext()) {
10691068
FilteredColumnarBatch filteredBatch = filesIter.next();
1070-
ColumnarBatch batch = filteredBatch.getData();
1071-
Optional<ColumnVector> selectionVector = filteredBatch.getSelectionVector();
10721069

10731070
// Get all AddFiles from the batch. Include both dataChange=true and dataChange=false
1074-
// (checkpoint files) files. Respect the selection vector to filter out duplicate files
1075-
// (e.g., stats re-collection that re-adds files with updated stats).
1076-
for (int rowId = 0; rowId < batch.getSize(); rowId++) {
1077-
// Skip rows that are filtered out by the selection vector
1078-
final int currentRowId = rowId;
1079-
boolean shouldSkip =
1080-
selectionVector
1081-
.map(sv -> sv.isNullAt(currentRowId) || !sv.getBoolean(currentRowId))
1082-
.orElse(false);
1083-
if (shouldSkip) {
1084-
continue;
1085-
}
1086-
1087-
Optional<AddFile> addOpt = StreamingHelper.getAddFile(batch, currentRowId);
1071+
// (checkpoint files) files. StreamingHelper.getAddFile respects the selection vector
1072+
// to filter out duplicate files (e.g., stats re-collection re-adds files with updated
1073+
// stats).
1074+
for (int rowId = 0; rowId < filteredBatch.getData().getSize(); rowId++) {
1075+
Optional<AddFile> addOpt = StreamingHelper.getAddFile(filteredBatch, rowId);
10881076
if (addOpt.isPresent()) {
10891077
addFiles.add(addOpt.get());
10901078

spark/v2/src/main/java/io/delta/spark/internal/v2/utils/StreamingHelper.java

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import io.delta.kernel.CommitActions;
2222
import io.delta.kernel.data.ColumnVector;
2323
import io.delta.kernel.data.ColumnarBatch;
24+
import io.delta.kernel.data.FilteredColumnarBatch;
2425
import io.delta.kernel.data.Row;
2526
import io.delta.kernel.engine.Engine;
2627
import io.delta.kernel.internal.DeltaLogActionUtils;
@@ -64,7 +65,31 @@ public static long getVersion(ColumnarBatch batch) {
6465
return batch.getColumnVector(versionColIdx).getLong(0);
6566
}
6667

67-
/** Get AddFile action from a batch at the specified row, if present. */
68+
/**
69+
* Get AddFile action from a FilteredColumnarBatch at the specified row, if present.
70+
*
71+
* <p>This method respects the selection vector to filter out duplicate files that may appear when
72+
* stats re-collection (e.g., ANALYZE TABLE COMPUTE STATISTICS) re-adds files with updated stats.
73+
* The Kernel uses selection vectors to mark which rows (AddFiles) are logically valid.
74+
*
75+
* @param batch the FilteredColumnarBatch containing AddFile actions
76+
* @param rowId the row index to check
77+
* @return Optional containing the AddFile if present and selected, empty otherwise
78+
*/
79+
public static Optional<AddFile> getAddFile(FilteredColumnarBatch batch, int rowId) {
80+
// Check selection vector first - rows may be filtered out when stats re-collection
81+
// re-adds files with updated stats
82+
Optional<ColumnVector> selectionVector = batch.getSelectionVector();
83+
boolean isFiltered =
84+
selectionVector.map(sv -> sv.isNullAt(rowId) || !sv.getBoolean(rowId)).orElse(false);
85+
if (isFiltered) {
86+
return Optional.empty();
87+
}
88+
89+
return getAddFile(batch.getData(), rowId);
90+
}
91+
92+
/** Get AddFile action from a ColumnarBatch at the specified row, if present. */
6893
public static Optional<AddFile> getAddFile(ColumnarBatch batch, int rowId) {
6994
int addIdx = getFieldIndex(batch, DeltaLogActionUtils.DeltaAction.ADD.colName);
7095
ColumnVector addVector = batch.getColumnVector(addIdx);

0 commit comments

Comments
 (0)