huan233usc
diff --git a/‎spark/v2/src/main/java/io/delta/spark/internal/v2/read/DataFrameSnapshotCache.java‎
Lines changed: 48 additions & 0 deletions b/‎spark/v2/src/main/java/io/delta/spark/internal/v2/read/DataFrameSnapshotCache.java‎
Lines changed: 48 additions & 0 deletions
diff --git a/‎spark/v2/src/main/java/io/delta/spark/internal/v2/read/ScanFileRDD.java‎
Lines changed: 161 additions & 0 deletions b/‎spark/v2/src/main/java/io/delta/spark/internal/v2/read/ScanFileRDD.java‎
Lines changed: 161 additions & 0 deletions
diff --git a/‎spark/v2/src/main/java/io/delta/spark/internal/v2/utils/SerializableReadOnlySnapshot.java‎
Lines changed: 174 additions & 0 deletions b/‎spark/v2/src/main/java/io/delta/spark/internal/v2/utils/SerializableReadOnlySnapshot.java‎
Lines changed: 174 additions & 0 deletions
@@ -0,0 +1,48 @@
+/*
+ * Copyright (2025) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.delta.spark.internal.v2.read;
+
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+
+/**
+ * Holder for the sorted AddFile DataFrame cached for one table version.
+ *
+ * <p>The class does no locking of its own; callers synchronize externally.
+ */
+public class DataFrameSnapshotCache implements AutoCloseable {
+
+  private final long version;
+  private Dataset<Row> sortedAddFiles;
+
+  /**
+   * @param version version the DataFrame is keyed by
+   * @param sortedAddFiles the sorted AddFile DataFrame to hold
+   */
+  public DataFrameSnapshotCache(long version, Dataset<Row> sortedAddFiles) {
+    this.sortedAddFiles = sortedAddFiles;
+    this.version = version;
+  }
+
+  /** Returns the version this entry was created with. */
+  public long getVersion() {
+    return this.version;
+  }
+
+  /** Returns the held DataFrame; this is {@code null} after {@link #close()}. */
+  public Dataset<Row> getSortedAddFiles() {
+    return this.sortedAddFiles;
+  }
+
+  /** Unpersists the held DataFrame and drops the reference. Does nothing if already released. */
+  @Override
+  public void close() {
+    if (sortedAddFiles == null) {
+      return;
+    }
+    // Clear the reference only after unpersist() returned, so a failed unpersist can be retried.
+    sortedAddFiles.unpersist();
+    sortedAddFiles = null;
+  }
+}
@@ -0,0 +1,161 @@
+/*
+ * Copyright (2025) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.delta.spark.internal.v2.read;
+
+import io.delta.kernel.Scan;
+import io.delta.kernel.data.FilteredColumnarBatch;
+import io.delta.kernel.defaults.engine.DefaultEngine;
+import io.delta.kernel.engine.Engine;
+import io.delta.kernel.internal.actions.AddFile;
+import io.delta.kernel.utils.CloseableIterator;
+import io.delta.spark.internal.v2.utils.KernelRowToSparkRow;
+import io.delta.spark.internal.v2.utils.SchemaUtils;
+import io.delta.spark.internal.v2.utils.SerializableReadOnlySnapshot;
+import io.delta.spark.internal.v2.utils.StreamingHelper;
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.NoSuchElementException;
+import java.util.Optional;
+import org.apache.spark.Partition;
+import org.apache.spark.SparkContext;
+import org.apache.spark.TaskContext;
+import org.apache.spark.rdd.RDD;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.types.StructType;
+import scala.collection.mutable.ArrayBuffer;
+import scala.reflect.ClassTag$;
+
+/**
+ * A single-partition Spark RDD that reconstructs a Kernel {@link Scan} on the executor from a
+ * {@link SerializableReadOnlySnapshot} and lazily streams {@link AddFile} records as Spark {@link
+ * Row}s. The RDD output schema matches {@link AddFile#SCHEMA_WITHOUT_STATS}.
+ *
+ * <p>Single partition is an intentional limitation; a future version will use Kernel's plan API for
+ * multi-partition replay. The downstream sort is still distributed.
+ *
+ * <p>The Kernel batch iterator opened by {@link #compute} is closed as soon as the returned
+ * iterator is drained. When a {@link TaskContext} is available, a completion listener additionally
+ * closes it for tasks that stop before draining the iterator.
+ */
+public class ScanFileRDD extends RDD<Row> {
+
+  /** Spark schema of the rows produced by this RDD. */
+  public static final StructType SPARK_SCHEMA =
+      SchemaUtils.convertKernelSchemaToSparkSchema(AddFile.SCHEMA_WITHOUT_STATS);
+
+  private final SerializableReadOnlySnapshot serializableSnapshot;
+
+  /**
+   * @param sc Spark context that owns this RDD
+   * @param serializableSnapshot snapshot state used to rebuild the {@link Scan} in {@link #compute}
+   */
+  public ScanFileRDD(SparkContext sc, SerializableReadOnlySnapshot serializableSnapshot) {
+    // The empty buffer is the dependency list: this RDD has no parent RDDs.
+    super(sc, new ArrayBuffer<>(), ClassTag$.MODULE$.apply(Row.class));
+    this.serializableSnapshot = serializableSnapshot;
+  }
+
+  /** The one and only partition of this RDD; its index is always 0. */
+  private static final class SinglePartition implements Partition, Serializable {
+    private static final long serialVersionUID = 1L;
+
+    @Override
+    public int index() {
+      return 0;
+    }
+  }
+
+  @Override
+  public Partition[] getPartitions() {
+    return new Partition[] {new SinglePartition()};
+  }
+
+  /**
+   * Rebuilds the scan from the serialized snapshot, opens its scan files and returns a lazy
+   * iterator over the AddFile rows.
+   *
+   * @param split the partition to compute (not inspected; there is only one)
+   * @param context task context used to register cleanup; may be {@code null}
+   * @return lazy iterator of AddFile rows shaped like {@link #SPARK_SCHEMA}
+   * @throws RuntimeException if opening the scan files fails; the original exception is the cause
+   */
+  @Override
+  public scala.collection.Iterator<Row> compute(Partition split, TaskContext context) {
+    Engine engine = DefaultEngine.create(serializableSnapshot.getHadoopConf());
+    Scan scan = serializableSnapshot.toScan();
+
+    CloseableIterator<FilteredColumnarBatch> batchIter;
+    try {
+      batchIter = scan.getScanFiles(engine);
+    } catch (Exception e) {
+      throw new RuntimeException("Failed to open scan files on executor", e);
+    }
+
+    AddFileLazyIterator lazyIter = new AddFileLazyIterator(batchIter);
+
+    if (context != null) {
+      // Safety net for tasks that finish or fail before the iterator is drained. The lambda is
+      // deliberately block-bodied (void-compatible only) so it cannot also match the
+      // function-typed overload of addTaskCompletionListener.
+      context.addTaskCompletionListener(
+          ctx -> {
+            lazyIter.closeQuietly();
+          });
+    }
+
+    return lazyIter;
+  }
+
+  /**
+   * Lazy Scala iterator that streams AddFile rows one at a time from the underlying Kernel batch
+   * iterator. No eager materialization into a list. The batch iterator is closed once it is
+   * exhausted, so it is released even when no task completion listener was registered.
+   */
+  private static final class AddFileLazyIterator implements scala.collection.Iterator<Row> {
+
+    private final CloseableIterator<FilteredColumnarBatch> batchIter;
+
+    private FilteredColumnarBatch currentBatch;
+    private int currentRowId;
+    private int currentBatchSize;
+    // Row fetched by hasNext() and not yet handed out by next(); null when nothing is buffered.
+    private Row nextRow;
+    // Set once batchIter has been closed; no further reads are attempted afterwards.
+    private boolean closed;
+
+    AddFileLazyIterator(CloseableIterator<FilteredColumnarBatch> batchIter) {
+      this.batchIter = batchIter;
+      this.currentBatch = null;
+      this.currentRowId = 0;
+      this.currentBatchSize = 0;
+      this.nextRow = null;
+      this.closed = false;
+    }
+
+    @Override
+    public boolean hasNext() {
+      if (nextRow != null) {
+        return true;
+      }
+      nextRow = advance();
+      return nextRow != null;
+    }
+
+    @Override
+    public Row next() {
+      if (!hasNext()) {
+        throw new NoSuchElementException();
+      }
+      Row result = nextRow;
+      nextRow = null;
+      return result;
+    }
+
+    /**
+     * Returns the next AddFile row, or {@code null} when the batch iterator is exhausted or has
+     * already been closed. Rows for which no AddFile is present are skipped.
+     */
+    private Row advance() {
+      if (closed) {
+        return null;
+      }
+      while (true) {
+        while (currentRowId < currentBatchSize) {
+          int rowId = currentRowId++;
+          Optional<AddFile> addOpt = StreamingHelper.getAddFile(currentBatch, rowId);
+          if (addOpt.isPresent()) {
+            return new KernelRowToSparkRow(addOpt.get().toRow(), SPARK_SCHEMA);
+          }
+        }
+        if (!batchIter.hasNext()) {
+          // Fully drained: release the underlying iterator now instead of waiting for task end.
+          closeQuietly();
+          return null;
+        }
+        currentBatch = batchIter.next();
+        currentRowId = 0;
+        currentBatchSize = currentBatch.getData().getSize();
+      }
+    }
+
+    /**
+     * Closes the underlying batch iterator at most once and drops the current batch. A row that
+     * was already buffered by {@link #hasNext()} is still returned by {@link #next()}.
+     */
+    void closeQuietly() {
+      if (closed) {
+        return;
+      }
+      closed = true;
+      currentBatch = null;
+      currentRowId = 0;
+      currentBatchSize = 0;
+      try {
+        batchIter.close();
+      } catch (IOException ignored) {
+        // best effort cleanup
+      }
+    }
+  }
+}
@@ -0,0 +1,174 @@
+/*
+ * Copyright (2025) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.delta.spark.internal.v2.utils;
+
+import io.delta.kernel.Scan;
+import io.delta.kernel.defaults.engine.DefaultEngine;
+import io.delta.kernel.engine.Engine;
+import io.delta.kernel.internal.ScanImpl;
+import io.delta.kernel.internal.SnapshotImpl;
+import io.delta.kernel.internal.actions.Metadata;
+import io.delta.kernel.internal.actions.Protocol;
+import io.delta.kernel.internal.checksum.CRCInfo;
+import io.delta.kernel.internal.lang.Lazy;
+import io.delta.kernel.internal.metrics.SnapshotQueryContext;
+import io.delta.kernel.internal.metrics.SnapshotReportImpl;
+import io.delta.kernel.internal.replay.LogReplay;
+import io.delta.kernel.internal.snapshot.LogSegment;
+import io.delta.kernel.metrics.SnapshotReport;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Optional;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.spark.util.SerializableConfiguration;
+
+/**
+ * Serializable bundle of the state needed to rebuild a read-only view of a Delta snapshot.
+ *
+ * <p>An instance is captured on the driver with {@link #fromSnapshot(SnapshotImpl, Configuration)}
+ * and turned back into a Kernel {@link Scan} on the executor with {@link #toScan()} or {@link
+ * #toScan(Configuration)}. Only the {@code Scan} interface is handed out, which keeps write-path
+ * APIs (e.g. {@code Committer}) out of reach of callers.
+ */
+public class SerializableReadOnlySnapshot implements Serializable {
+
+  private static final long serialVersionUID = 1L;
+
+  // Table location and the version this state describes.
+  private final String dataPath;
+  private final String logPath;
+  private final long version;
+
+  // Flattened contents of the snapshot's LogSegment.
+  private final ArrayList<SerializableFileStatus> deltas;
+  private final ArrayList<SerializableFileStatus> compactions;
+  private final ArrayList<SerializableFileStatus> checkpoints;
+  private final SerializableFileStatus deltaAtEndVersion;
+  private final SerializableFileStatus lastSeenChecksum; // nullable
+  private final Long maxPublishedDeltaVersion; // nullable
+
+  // Table protocol/metadata and the Hadoop configuration shipped alongside them.
+  private final Protocol protocol;
+  private final Metadata metadata;
+  private final SerializableConfiguration hadoopConf;
+
+  /** Plain field-by-field constructor; instances are created via {@link #fromSnapshot}. */
+  private SerializableReadOnlySnapshot(
+      String dataPath,
+      String logPath,
+      long version,
+      ArrayList<SerializableFileStatus> deltas,
+      ArrayList<SerializableFileStatus> compactions,
+      ArrayList<SerializableFileStatus> checkpoints,
+      SerializableFileStatus deltaAtEndVersion,
+      SerializableFileStatus lastSeenChecksum,
+      Long maxPublishedDeltaVersion,
+      Protocol protocol,
+      Metadata metadata,
+      SerializableConfiguration hadoopConf) {
+    this.dataPath = dataPath;
+    this.logPath = logPath;
+    this.version = version;
+    this.deltas = deltas;
+    this.compactions = compactions;
+    this.checkpoints = checkpoints;
+    this.deltaAtEndVersion = deltaAtEndVersion;
+    this.lastSeenChecksum = lastSeenChecksum;
+    this.maxPublishedDeltaVersion = maxPublishedDeltaVersion;
+    this.protocol = protocol;
+    this.metadata = metadata;
+    this.hadoopConf = hadoopConf;
+  }
+
+  /**
+   * Driver-side: capture the state of an existing Kernel {@link SnapshotImpl}. Only accessors of
+   * the snapshot and its {@link LogSegment} are called here; this is intended to need no I/O.
+   *
+   * @param snapshot snapshot whose paths, version, log segment, protocol and metadata are copied
+   * @param hadoopConf Hadoop configuration to ship with the snapshot state
+   * @return a serializable copy of the snapshot state
+   */
+  public static SerializableReadOnlySnapshot fromSnapshot(
+      SnapshotImpl snapshot, Configuration hadoopConf) {
+    LogSegment segment = snapshot.getLogSegment();
+
+    String tablePath = snapshot.getDataPath().toString();
+    String segmentLogPath = segment.getLogPath().toString();
+    long snapshotVersion = snapshot.getVersion(null /* engine, unused for SnapshotImpl */);
+
+    // Copy into ArrayList so the field types are concretely serializable.
+    ArrayList<SerializableFileStatus> deltaFiles =
+        new ArrayList<>(SerializableFileStatus.fromList(segment.getDeltas()));
+    ArrayList<SerializableFileStatus> compactionFiles =
+        new ArrayList<>(SerializableFileStatus.fromList(segment.getCompactions()));
+    ArrayList<SerializableFileStatus> checkpointFiles =
+        new ArrayList<>(SerializableFileStatus.fromList(segment.getCheckpoints()));
+
+    SerializableFileStatus endVersionDelta =
+        SerializableFileStatus.from(segment.getDeltaFileAtEndVersion());
+    // Optionals are flattened to nullable fields for serialization.
+    SerializableFileStatus checksumFile =
+        segment.getLastSeenChecksum().map(SerializableFileStatus::from).orElse(null);
+    Long maxPublished = segment.getMaxPublishedDeltaVersion().orElse(null);
+
+    return new SerializableReadOnlySnapshot(
+        tablePath,
+        segmentLogPath,
+        snapshotVersion,
+        deltaFiles,
+        compactionFiles,
+        checkpointFiles,
+        endVersionDelta,
+        checksumFile,
+        maxPublished,
+        snapshot.getProtocol(),
+        snapshot.getMetadata(),
+        new SerializableConfiguration(hadoopConf));
+  }
+
+  /**
+   * Executor-side: rebuild a {@link Scan} from the serialized state, creating the engine from the
+   * given Hadoop configuration instead of the serialized one.
+   *
+   * @param hadoopConfOverride configuration used to create the engine for the scan
+   * @return a scan over the captured snapshot state
+   */
+  public Scan toScan(Configuration hadoopConfOverride) {
+    return buildScan(hadoopConfOverride);
+  }
+
+  /**
+   * Executor-side: rebuild a {@link Scan} using the Hadoop configuration that was serialized with
+   * this object. Convenience overload for when no override is needed.
+   */
+  public Scan toScan() {
+    return buildScan(hadoopConf.value());
+  }
+
+  /** Returns the snapshot version captured by {@link #fromSnapshot}. */
+  public long getVersion() {
+    return this.version;
+  }
+
+  /** Returns the serialized Hadoop configuration for creating an Engine on the executor. */
+  public Configuration getHadoopConf() {
+    return this.hadoopConf.value();
+  }
+
+  // ---- internal reconstruction ----
+
+  /** Assembles a {@link ScanImpl} over the captured state, with an engine created from conf. */
+  private Scan buildScan(Configuration conf) {
+    Engine executorEngine = DefaultEngine.create(conf);
+    io.delta.kernel.utils.Path tableRoot = new io.delta.kernel.utils.Path(dataPath);
+    LogSegment rebuiltSegment = rebuildLogSegment();
+
+    // The log segment is already known, and no CRC info is supplied to the replay.
+    Lazy<LogSegment> segmentSupplier = new Lazy<>(() -> rebuiltSegment);
+    Lazy<Optional<CRCInfo>> noCrcInfo = new Lazy<>(Optional::empty);
+    LogReplay replay = new LogReplay(executorEngine, tableRoot, segmentSupplier, noCrcInfo);
+
+    SnapshotQueryContext versionContext =
+        SnapshotQueryContext.forVersionSnapshot(dataPath, version);
+    versionContext.setResolvedVersion(version);
+    SnapshotReport report = SnapshotReportImpl.forSuccess(versionContext);
+
+    // The table schema is passed twice: once as the snapshot schema and once as the read schema.
+    return new ScanImpl(
+        metadata.getSchema(),
+        metadata.getSchema(),
+        protocol,
+        metadata,
+        replay,
+        Optional.empty(),
+        tableRoot,
+        report);
+  }
+
+  /** Turns the flattened file lists and nullable fields back into a Kernel {@link LogSegment}. */
+  private LogSegment rebuildLogSegment() {
+    io.delta.kernel.utils.Path kernelLogPath = new io.delta.kernel.utils.Path(logPath);
+    return new LogSegment(
+        kernelLogPath,
+        version,
+        SerializableFileStatus.toFileStatusList(deltas),
+        SerializableFileStatus.toFileStatusList(compactions),
+        SerializableFileStatus.toFileStatusList(checkpoints),
+        deltaAtEndVersion.toFileStatus(),
+        Optional.ofNullable(lastSeenChecksum).map(SerializableFileStatus::toFileStatus),
+        Optional.ofNullable(maxPublishedDeltaVersion));
+  }
+}