@@ -127,7 +127,7 @@ public org.apache.iceberg.Schema getRequiredSchema() {
     if (cachedRequiredSchema == null) {
       cachedRequiredSchema =
           resolveSchema(
-              getTable().schema(),
+              IcebergUtils.beamSchemaToIcebergSchema(getSchema()),
              getKeepFields(),
              getDropFields(),
              FilterUtils.getReferencedFieldNames(getFilterString()));
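This hunk resolves the required schema from the scan's configured Beam schema rather than re-fetching the catalog table's live schema, so the projection stays pinned to what was set at configuration time even if the table evolves afterwards. A minimal sketch of the idea using only Iceberg's public schema API; the field names and ids are invented for illustration, not taken from the PR:

import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class PinnedSchemaSketch {
  public static void main(String[] args) {
    // A schema fixed at configuration time. In the PR, resolveSchema(...) applies
    // the keep/drop/filter field names against a schema like this, instead of a
    // freshly fetched table.schema() that may have gained or lost columns.
    Schema pinned =
        new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "name", Types.StringType.get()));
    System.out.println(pinned.asStruct());
  }
}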
@@ -42,11 +42,11 @@
  * source creates a single range, while the unbounded implementation continuously polls for new
  * snapshots at the specified interval.
  */
-class IncrementalScanSource extends PTransform<PBegin, PCollection<Row>> {
+public class IncrementalScanSource extends PTransform<PBegin, PCollection<Row>> {
   private static final Duration DEFAULT_POLL_INTERVAL = Duration.standardSeconds(60);
-  private final IcebergScanConfig scanConfig;
+  protected final IcebergScanConfig scanConfig;
 
-  IncrementalScanSource(IcebergScanConfig scanConfig) {
+  public IncrementalScanSource(IcebergScanConfig scanConfig) {
     this.scanConfig = scanConfig;
   }
 
@@ -74,14 +74,15 @@ public PCollection<Row> expand(PBegin input) {
   }
 
   /** Continuously watches for new snapshots. */
-  private PCollection<KV<String, List<SnapshotInfo>>> unboundedSnapshots(PBegin input) {
+  protected PCollection<KV<String, List<SnapshotInfo>>> unboundedSnapshots(PBegin input) {
     Duration pollInterval =
         MoreObjects.firstNonNull(scanConfig.getPollInterval(), DEFAULT_POLL_INTERVAL);
     return input.apply("Watch for Snapshots", new WatchForSnapshots(scanConfig, pollInterval));
   }
 
   /** Creates a fixed snapshot range. */
-  private PCollection<KV<String, List<SnapshotInfo>>> boundedSnapshots(PBegin input, Table table) {
+  protected PCollection<KV<String, List<SnapshotInfo>>> boundedSnapshots(
+      PBegin input, Table table) {
     checkStateNotNull(
         table.currentSnapshot().snapshotId(),
         "Table %s does not have any snapshots to read from.",
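The visibility changes above (package-private class to public, private members and constructor to protected/public) open IncrementalScanSource up for extension. A minimal sketch of what that enables, assuming only what the diff shows; CustomScanSource is hypothetical and not part of the PR:

// Hypothetical subclass made possible by the widened visibility.
public class CustomScanSource extends IncrementalScanSource {
  public CustomScanSource(IcebergScanConfig scanConfig) {
    super(scanConfig); // constructor is now public
  }
  // scanConfig, unboundedSnapshots(...), and boundedSnapshots(...) are now
  // protected, so a subclass can reuse or override the snapshot-discovery
  // steps that back expand(...).
}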
@@ -25,8 +25,15 @@
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Maps;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.MetadataColumns;
 import org.apache.iceberg.PartitionField;
 import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.StructLike;
+import org.apache.iceberg.types.Type;
+import org.apache.iceberg.types.Types;
 import org.checkerframework.checker.nullness.qual.Nullable;
 
 class PartitionUtils {
@@ -90,4 +97,51 @@ static PartitionSpec toPartitionSpec(
 
     return builder.build();
   }
+
+  /**
+   * Copied over from Apache Iceberg's <a
+   * href="https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/util/PartitionUtil.java">PartitionUtil</a>.
+   */
+  public static Map<Integer, ?> constantsMap(
+      PartitionSpec spec, ContentFile<?> file, BiFunction<Type, Object, Object> convertConstant) {
+    StructLike partitionData = file.partition();
+
+    // use java.util.HashMap because partition data may contain null values
+    Map<Integer, Object> idToConstant = Maps.newHashMap();
+
+    // add first_row_id as _row_id
+    if (file.firstRowId() != null) {
+      idToConstant.put(
+          MetadataColumns.ROW_ID.fieldId(),
+          convertConstant.apply(Types.LongType.get(), file.firstRowId()));
+    }
+
+    idToConstant.put(
+        MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId(),
+        convertConstant.apply(Types.LongType.get(), file.fileSequenceNumber()));
+
+    // add _file
+    idToConstant.put(
+        MetadataColumns.FILE_PATH.fieldId(),
+        convertConstant.apply(Types.StringType.get(), file.location()));
+
+    // add _spec_id
+    idToConstant.put(
+        MetadataColumns.SPEC_ID.fieldId(),
+        convertConstant.apply(Types.IntegerType.get(), file.specId()));
+
+    List<Types.NestedField> partitionFields = spec.partitionType().fields();
+    List<PartitionField> fields = spec.fields();
+    for (int pos = 0; pos < fields.size(); pos += 1) {
+      PartitionField field = fields.get(pos);
+      if (field.transform().isIdentity()) {
+        Object converted =
+            convertConstant.apply(
+                partitionFields.get(pos).type(), partitionData.get(pos, Object.class));
+        idToConstant.put(field.sourceId(), converted);
+      }
+    }
+
+    return idToConstant;
+  }
 }
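For context, constantsMap produces the per-file constants (metadata columns such as _file and _spec_id, plus identity-partition values) that a reader injects into every record rather than reading them from the data file itself. A hedged usage sketch against Iceberg's public API; the schema, partition spec, and data file below are invented for illustration, and the call assumes same-package access to PartitionUtils:

import java.util.Map;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class ConstantsMapSketch {
  public static void main(String[] args) {
    Schema schema =
        new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.required(2, "category", Types.StringType.get()));
    PartitionSpec spec = PartitionSpec.builderFor(schema).identity("category").build();
    DataFile dataFile =
        DataFiles.builder(spec)
            .withPath("/tmp/warehouse/category=books/data.parquet")
            .withPartitionPath("category=books")
            .withFileSizeInBytes(1024L)
            .withRecordCount(100L)
            .build();
    // Identity conversion: keep each constant in its raw Iceberg representation.
    Map<Integer, ?> constants =
        PartitionUtils.constantsMap(spec, dataFile, (type, value) -> value);
    // Maps the _file and _spec_id field ids, plus the "category" source field
    // id, to fixed values for this file.
    System.out.println(constants);
  }
}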
@@ -75,9 +75,7 @@ public void process(
       }
       FileScanTask task = fileScanTasks.get((int) l);
       Schema beamSchema = IcebergUtils.icebergSchemaToBeamSchema(scanConfig.getProjectedSchema());
-      try (CloseableIterable<Record> fullIterable =
-          ReadUtils.createReader(task, table, scanConfig.getRequiredSchema())) {
-        CloseableIterable<Record> reader = ReadUtils.maybeApplyFilter(fullIterable, scanConfig);
+      try (CloseableIterable<Record> reader = ReadUtils.createReader(task, table, scanConfig)) {
 
         for (Record record : reader) {
           Row row = IcebergUtils.icebergRecordToBeamRow(beamSchema, record);
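This last hunk collapses the two-step create-then-filter sequence into a single ReadUtils.createReader(task, table, scanConfig) call, so one try-with-resources owns the whole reader chain. The PR excerpt does not show the new overload's body; the following is an assumed sketch based only on the two calls it replaces, not the actual Beam implementation:

// Assumed shape of the consolidated helper: build the reader for the required
// schema, then fold the residual filter in before returning a single
// closeable iterable.
static CloseableIterable<Record> createReader(
    FileScanTask task, Table table, IcebergScanConfig scanConfig) {
  CloseableIterable<Record> fullIterable =
      ReadUtils.createReader(task, table, scanConfig.getRequiredSchema());
  return ReadUtils.maybeApplyFilter(fullIterable, scanConfig);
}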