Skip to content

Commit c37d128

Browse files
committed
[Kernel-Spark] Phase 2: Vectorized reader support for Deletion Vectors
1 parent 7e0e050 commit c37d128

File tree

5 files changed

+480
-12
lines changed

5 files changed

+480
-12
lines changed
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
/*
2+
* Copyright (2025) The Delta Lake Project Authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package io.delta.spark.internal.v2.read.deletionvector;
17+
18+
import org.apache.spark.sql.types.Decimal;
19+
import org.apache.spark.sql.types.StructType;
20+
import org.apache.spark.sql.vectorized.ColumnVector;
21+
import org.apache.spark.sql.vectorized.ColumnarArray;
22+
import org.apache.spark.sql.vectorized.ColumnarMap;
23+
import org.apache.spark.unsafe.types.UTF8String;
24+
25+
/**
26+
* A column vector that applies row-level filtering using a row ID mapping.
27+
*
28+
* <p>Wraps an existing column vector and remaps row indices during data access, effectively
29+
* filtering the original data to only expose the live subset of rows without copying data.
30+
*
31+
* <p>Follows Apache Iceberg's ColumnVectorWithFilter pattern.
32+
*/
33+
public class ColumnVectorWithFilter extends ColumnVector {
34+
private final ColumnVector delegate;
35+
private final int[] rowIdMapping;
36+
private volatile ColumnVectorWithFilter[] children = null;
37+
38+
public ColumnVectorWithFilter(ColumnVector delegate, int[] rowIdMapping) {
39+
super(delegate.dataType());
40+
this.delegate = delegate;
41+
this.rowIdMapping = rowIdMapping;
42+
}
43+
44+
@Override
45+
public void close() {
46+
delegate.close();
47+
}
48+
49+
@Override
50+
public boolean hasNull() {
51+
return delegate.hasNull();
52+
}
53+
54+
@Override
55+
public int numNulls() {
56+
// Computing the actual number of nulls with rowIdMapping is expensive.
57+
// It is OK to overestimate and return the number of nulls in the original vector.
58+
return delegate.numNulls();
59+
}
60+
61+
@Override
62+
public boolean isNullAt(int rowId) {
63+
return delegate.isNullAt(rowIdMapping[rowId]);
64+
}
65+
66+
@Override
67+
public boolean getBoolean(int rowId) {
68+
return delegate.getBoolean(rowIdMapping[rowId]);
69+
}
70+
71+
@Override
72+
public byte getByte(int rowId) {
73+
return delegate.getByte(rowIdMapping[rowId]);
74+
}
75+
76+
@Override
77+
public short getShort(int rowId) {
78+
return delegate.getShort(rowIdMapping[rowId]);
79+
}
80+
81+
@Override
82+
public int getInt(int rowId) {
83+
return delegate.getInt(rowIdMapping[rowId]);
84+
}
85+
86+
@Override
87+
public long getLong(int rowId) {
88+
return delegate.getLong(rowIdMapping[rowId]);
89+
}
90+
91+
@Override
92+
public float getFloat(int rowId) {
93+
return delegate.getFloat(rowIdMapping[rowId]);
94+
}
95+
96+
@Override
97+
public double getDouble(int rowId) {
98+
return delegate.getDouble(rowIdMapping[rowId]);
99+
}
100+
101+
@Override
102+
public ColumnarArray getArray(int rowId) {
103+
return delegate.getArray(rowIdMapping[rowId]);
104+
}
105+
106+
@Override
107+
public ColumnarMap getMap(int rowId) {
108+
return delegate.getMap(rowIdMapping[rowId]);
109+
}
110+
111+
@Override
112+
public Decimal getDecimal(int rowId, int precision, int scale) {
113+
return delegate.getDecimal(rowIdMapping[rowId], precision, scale);
114+
}
115+
116+
@Override
117+
public UTF8String getUTF8String(int rowId) {
118+
return delegate.getUTF8String(rowIdMapping[rowId]);
119+
}
120+
121+
@Override
122+
public byte[] getBinary(int rowId) {
123+
return delegate.getBinary(rowIdMapping[rowId]);
124+
}
125+
126+
@Override
127+
public ColumnVector getChild(int ordinal) {
128+
if (children == null) {
129+
synchronized (this) {
130+
if (children == null) {
131+
// Eagerly create all children to avoid race condition on children[ordinal] access
132+
StructType structType = (StructType) dataType();
133+
ColumnVectorWithFilter[] newChildren =
134+
new ColumnVectorWithFilter[structType.fields().length];
135+
for (int i = 0; i < newChildren.length; i++) {
136+
newChildren[i] = new ColumnVectorWithFilter(delegate.getChild(i), rowIdMapping);
137+
}
138+
children = newChildren;
139+
}
140+
}
141+
}
142+
return children[ordinal];
143+
}
144+
}

spark/v2/src/main/java/io/delta/spark/internal/v2/read/deletionvector/DeletionVectorReadFunction.java

Lines changed: 65 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,14 @@
1717

1818
import io.delta.spark.internal.v2.utils.CloseableIterator;
1919
import java.io.Serializable;
20+
import java.util.Arrays;
2021
import org.apache.spark.sql.catalyst.InternalRow;
2122
import org.apache.spark.sql.catalyst.ProjectingInternalRow;
2223
import org.apache.spark.sql.execution.datasources.PartitionedFile;
24+
import org.apache.spark.sql.vectorized.ColumnVector;
25+
import org.apache.spark.sql.vectorized.ColumnarBatch;
2326
import scala.Function1;
2427
import scala.collection.Iterator;
25-
import scala.runtime.AbstractFunction1;
2628

2729
/**
2830
* Wraps a Parquet reader function to apply deletion vector filtering.
@@ -39,7 +41,8 @@
3941
* of the underlying Parquet reader, even when the iterator is not fully consumed.
4042
*/
4143
public class DeletionVectorReadFunction
42-
extends AbstractFunction1<PartitionedFile, Iterator<InternalRow>> implements Serializable {
44+
extends scala.runtime.AbstractFunction1<PartitionedFile, Iterator<InternalRow>>
45+
implements Serializable {
4346

4447
private static final long serialVersionUID = 1L;
4548

@@ -59,22 +62,74 @@ private DeletionVectorReadFunction(
5962
@Override
  public Iterator<InternalRow> apply(PartitionedFile file) {
    // Column index of the synthetic deletion-vector (DV) byte column appended to the read schema.
    int dvColumnIndex = dvSchemaContext.getDvColumnIndex();
    int outputColumnCount = dvSchemaContext.getOutputSchema().fields().length;
    // Use pre-computed ordinals from DeletionVectorSchemaContext to project the DV column out
    // of row-based output.
    ProjectingInternalRow projection =
        ProjectingInternalRow.apply(
            dvSchemaContext.getOutputSchema(), dvSchemaContext.getOutputColumnOrdinals());

    // Wrap the base iterator as CloseableIterator to preserve close() through filter/map.
    // This ensures proper resource cleanup even when the iterator is not fully consumed.
    // Use Object as the element type: in vectorized mode Spark's reader yields ColumnarBatch
    // instances cast to InternalRow, so elements may be either type at runtime.
    @SuppressWarnings("unchecked")
    Iterator<Object> baseIterator = (Iterator<Object>) (Iterator<?>) baseReadFunc.apply(file);

    // Filter step: drops deleted rows in row-based mode; a no-op for vectorized batches
    // (batch-level filtering happens in the map step instead).
    // Map step: projects out the DV column (row-based) or wraps the batch's columns with a
    // live-row mapping (vectorized).
    @SuppressWarnings("unchecked")
    Iterator<InternalRow> result =
        (Iterator<InternalRow>)
            (Iterator<?>)
                CloseableIterator.wrap(baseIterator)
                    .filterCloseable(
                        item -> {
                          if (item instanceof InternalRow) {
                            // Row-based: keep only rows whose DV byte says "not deleted".
                            return ((InternalRow) item).getByte(dvColumnIndex) == ROW_NOT_DELETED;
                          }
                          // Vectorized: pass batches through untouched (filtered in map below).
                          return true;
                        })
                    .mapCloseable(
                        item -> {
                          if (item instanceof ColumnarBatch) {
                            return filterBatch(
                                (ColumnarBatch) item, dvColumnIndex, outputColumnCount);
                          } else {
                            // Row-based: project out the DV column. NOTE(review): the same
                            // mutable `projection` instance is returned for every row —
                            // callers must consume each row before advancing (standard Spark
                            // iterator contract).
                            projection.project((InternalRow) item);
                            return projection;
                          }
                        });
    return result;
  }
106+
107+
/** Filter a ColumnarBatch by building row ID mapping for live rows. */
108+
private static ColumnarBatch filterBatch(
109+
ColumnarBatch batch, int dvColumnIndex, int outputColumnCount) {
110+
int[] liveRows = findLiveRows(batch, dvColumnIndex);
111+
// Build filtered column vectors (excluding DV column)
112+
ColumnVector[] filteredVectors = new ColumnVector[outputColumnCount];
113+
int outIdx = 0;
114+
for (int i = 0; i < batch.numCols(); i++) {
115+
if (i != dvColumnIndex) {
116+
filteredVectors[outIdx++] = new ColumnVectorWithFilter(batch.column(i), liveRows);
117+
}
118+
}
119+
return new ColumnarBatch(filteredVectors, liveRows.length);
120+
}
121+
122+
/** Find indices of rows where DV column is 0 (not deleted). */
123+
private static int[] findLiveRows(ColumnarBatch batch, int dvColumnIndex) {
124+
ColumnVector dvColumn = batch.column(dvColumnIndex);
125+
int[] temp = new int[batch.numRows()];
126+
int count = 0;
127+
for (int i = 0; i < batch.numRows(); i++) {
128+
if (dvColumn.getByte(i) == ROW_NOT_DELETED) {
129+
temp[count++] = i;
130+
}
131+
}
132+
return Arrays.copyOf(temp, count);
78133
}
79134

80135
/** Factory method to wrap a reader function with DV filtering. */

spark/v2/src/main/java/io/delta/spark/internal/v2/utils/PartitionUtils.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -211,9 +211,9 @@ public static PartitionReaderFactory createDeltaParquetReaderFactory(
211211
.map(DeletionVectorSchemaContext::getSchemaWithDvColumn)
212212
.orElse(readDataSchema);
213213

214-
// TODO(https://github.com/delta-io/delta/issues/5859): Enable vectorized reader for DV tables
214+
// Vectorized reader is supported for DV tables using ColumnVectorWithFilter
215215
boolean enableVectorizedReader =
216-
!isTableSupportDv && ParquetUtils.isBatchReadSupportedForSchema(sqlConf, readDataSchema);
216+
ParquetUtils.isBatchReadSupportedForSchema(sqlConf, readDataSchema);
217217
scala.collection.immutable.Map<String, String> optionsWithVectorizedReading =
218218
scalaOptions.$plus(
219219
new Tuple2<>(
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
/*
2+
* Copyright (2025) The Delta Lake Project Authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package io.delta.spark.internal.v2.read.deletionvector;
17+
18+
import static org.junit.jupiter.api.Assertions.*;

import java.nio.charset.StandardCharsets;
import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector;
import org.apache.spark.sql.execution.vectorized.WritableColumnVector;
import org.apache.spark.sql.types.DataTypes;
import org.junit.jupiter.api.Test;
25+
public class ColumnVectorWithFilterTest {
26+
27+
@Test
28+
void testIntegerFiltering() {
29+
try (WritableColumnVector delegate = new OnHeapColumnVector(5, DataTypes.IntegerType)) {
30+
for (int i = 0; i < 5; i++) {
31+
delegate.putInt(i, (i + 1) * 10); // [10, 20, 30, 40, 50]
32+
}
33+
ColumnVectorWithFilter filtered = new ColumnVectorWithFilter(delegate, new int[] {1, 3});
34+
35+
assertEquals(20, filtered.getInt(0));
36+
assertEquals(40, filtered.getInt(1));
37+
}
38+
}
39+
40+
@Test
41+
void testLongFiltering() {
42+
try (WritableColumnVector delegate = new OnHeapColumnVector(4, DataTypes.LongType)) {
43+
for (int i = 0; i < 4; i++) {
44+
delegate.putLong(i, (i + 1) * 100L);
45+
}
46+
ColumnVectorWithFilter filtered = new ColumnVectorWithFilter(delegate, new int[] {0, 2});
47+
48+
assertEquals(100L, filtered.getLong(0));
49+
assertEquals(300L, filtered.getLong(1));
50+
}
51+
}
52+
53+
@Test
54+
void testDoubleFiltering() {
55+
try (WritableColumnVector delegate = new OnHeapColumnVector(3, DataTypes.DoubleType)) {
56+
delegate.putDouble(0, 1.1);
57+
delegate.putDouble(1, 2.2);
58+
delegate.putDouble(2, 3.3);
59+
ColumnVectorWithFilter filtered = new ColumnVectorWithFilter(delegate, new int[] {0, 2});
60+
61+
assertEquals(1.1, filtered.getDouble(0), 0.001);
62+
assertEquals(3.3, filtered.getDouble(1), 0.001);
63+
}
64+
}
65+
66+
@Test
67+
void testBooleanFiltering() {
68+
try (WritableColumnVector delegate = new OnHeapColumnVector(4, DataTypes.BooleanType)) {
69+
delegate.putBoolean(0, true);
70+
delegate.putBoolean(1, false);
71+
delegate.putBoolean(2, true);
72+
delegate.putBoolean(3, false);
73+
ColumnVectorWithFilter filtered = new ColumnVectorWithFilter(delegate, new int[] {1, 2});
74+
75+
assertFalse(filtered.getBoolean(0));
76+
assertTrue(filtered.getBoolean(1));
77+
}
78+
}
79+
80+
@Test
81+
void testStringFiltering() {
82+
try (WritableColumnVector delegate = new OnHeapColumnVector(3, DataTypes.StringType)) {
83+
delegate.putByteArray(0, "alice".getBytes());
84+
delegate.putByteArray(1, "bob".getBytes());
85+
delegate.putByteArray(2, "charlie".getBytes());
86+
ColumnVectorWithFilter filtered = new ColumnVectorWithFilter(delegate, new int[] {2});
87+
88+
assertEquals("charlie", filtered.getUTF8String(0).toString());
89+
}
90+
}
91+
92+
@Test
93+
void testNullHandling() {
94+
try (WritableColumnVector delegate = new OnHeapColumnVector(4, DataTypes.IntegerType)) {
95+
delegate.putInt(0, 10);
96+
delegate.putNull(1);
97+
delegate.putInt(2, 30);
98+
delegate.putNull(3);
99+
ColumnVectorWithFilter filtered =
100+
new ColumnVectorWithFilter(delegate, new int[] {0, 1, 2, 3});
101+
102+
assertFalse(filtered.isNullAt(0));
103+
assertTrue(filtered.isNullAt(1));
104+
assertFalse(filtered.isNullAt(2));
105+
assertTrue(filtered.isNullAt(3));
106+
}
107+
}
108+
109+
@Test
110+
void testEmptyAndIdentityMapping() {
111+
try (WritableColumnVector delegate = new OnHeapColumnVector(3, DataTypes.IntegerType)) {
112+
delegate.putInt(0, 10);
113+
delegate.putInt(1, 20);
114+
delegate.putInt(2, 30);
115+
116+
// Empty mapping
117+
ColumnVectorWithFilter empty = new ColumnVectorWithFilter(delegate, new int[] {});
118+
assertEquals(DataTypes.IntegerType, empty.dataType());
119+
120+
// Identity mapping
121+
ColumnVectorWithFilter identity = new ColumnVectorWithFilter(delegate, new int[] {0, 1, 2});
122+
assertEquals(10, identity.getInt(0));
123+
assertEquals(20, identity.getInt(1));
124+
assertEquals(30, identity.getInt(2));
125+
}
126+
}
127+
}

0 commit comments

Comments
 (0)