
Commit 4e81f78

naming changes; key on snapshot ID instead of ordinal ID
1 parent eb23828 commit 4e81f78

4 files changed (+92, -65 lines)


sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java

Lines changed: 15 additions & 12 deletions

@@ -61,9 +61,9 @@ public class ChangelogScanner
   private static final Counter numDeletedDataFileScanTasks =
       Metrics.counter(ChangelogScanner.class, "numDeletedDataFileScanTasks");
   public static final TupleTag<KV<ChangelogDescriptor, List<SerializableChangelogTask>>>
-      UNIFORM_CHANGES = new TupleTag<>();
+      UNIDIRECTIONAL_CHANGES = new TupleTag<>();
   public static final TupleTag<KV<ChangelogDescriptor, List<SerializableChangelogTask>>>
-      MIXED_CHANGES = new TupleTag<>();
+      BIDIRECTIONAL_CHANGES = new TupleTag<>();
   public static final KvCoder<ChangelogDescriptor, List<SerializableChangelogTask>> OUTPUT_CODER =
       KvCoder.of(ChangelogDescriptor.coder(), ListCoder.of(SerializableChangelogTask.coder()));
   private final IcebergScanConfig scanConfig;
@@ -117,7 +117,7 @@ private void createAndOutputReadTasks(
    Map<Long, Long> cachedSnapshotTimestamps = new HashMap<>();
    // Maintain the same scan task groupings produced by Iceberg's binpacking, for
    // better work load distribution among readers.
-   // Also allows the user to control by setting a `read.split.target-size`:
+   // This allows the user to control load per worker by tuning `read.split.target-size`:
    // https://iceberg.apache.org/docs/latest/configuration/#read-properties
    Map<Integer, List<List<SerializableChangelogTask>>> changelogScanTaskGroups = new HashMap<>();

@@ -126,7 +126,7 @@ private void createAndOutputReadTasks(

    try (CloseableIterable<ScanTaskGroup<ChangelogScanTask>> scanTaskGroups = scan.planTasks()) {
      for (ScanTaskGroup<ChangelogScanTask> scanTaskGroup : scanTaskGroups) {
-       Map<Integer, List<SerializableChangelogTask>> ordinalGroups = new HashMap<>();
+       Map<Integer, List<SerializableChangelogTask>> ordinalTaskGroup = new HashMap<>();

        for (ChangelogScanTask changelogScanTask : scanTaskGroup.tasks()) {
          long snapshotId = changelogScanTask.commitSnapshotId();
@@ -137,7 +137,7 @@ private void createAndOutputReadTasks(

          SerializableChangelogTask task =
              SerializableChangelogTask.from(changelogScanTask, timestampMillis);
-         ordinalGroups.computeIfAbsent(ordinal, (unused) -> new ArrayList<>()).add(task);
+         ordinalTaskGroup.computeIfAbsent(ordinal, (o) -> new ArrayList<>()).add(task);

          changeTypesPerOrdinal
              .computeIfAbsent(ordinal, (o) -> new HashSet<>())
@@ -158,7 +158,7 @@ private void createAndOutputReadTasks(
        }

        for (Map.Entry<Integer, List<SerializableChangelogTask>> ordinalGroup :
-           ordinalGroups.entrySet()) {
+           ordinalTaskGroup.entrySet()) {
          changelogScanTaskGroups
              .computeIfAbsent(ordinalGroup.getKey(), (unused) -> new ArrayList<>())
              .add(ordinalGroup.getValue());
@@ -198,17 +198,20 @@ private void createAndOutputReadTasks(
          KV.of(descriptor, subgroup);

      // Determine where each ordinal's tasks will go, based on the type of changes:
-     // 1. If an ordinal's changes are uniform (i.e. all inserts or all deletes), they should be
+     // 1. If an ordinal's changes are unidirectional (i.e. only inserts or only deletes), they
+     //    should be
      //    processed directly in the fast path.
-     // 2. If an ordinal's changes are mixed (i.e. some inserts and some deletes), they will need
+     // 2. If an ordinal's changes are bidirectional (i.e. both inserts and deletes), they will
+     //    need
      //    more careful processing to determine if any updates have occurred.
      Set<SerializableChangelogTask.Type> changeTypes =
          checkStateNotNull(changeTypesPerOrdinal.get(ordinal));
      TupleTag<KV<ChangelogDescriptor, List<SerializableChangelogTask>>> outputTag;
-     if (changeTypes.contains(ADDED_ROWS) && changeTypes.size() > 1) { // added and deleted rows
-       outputTag = MIXED_CHANGES;
-     } else { // all added or all deleted rows
-       outputTag = UNIFORM_CHANGES;
+     if (changeTypes.contains(ADDED_ROWS)
+         && changeTypes.size() > 1) { // both added and deleted rows
+       outputTag = BIDIRECTIONAL_CHANGES;
+     } else { // only added or only deleted rows
+       outputTag = UNIDIRECTIONAL_CHANGES;
      }

      multiOutputReceiver.get(outputTag).outputWithTimestamp(output, timestamp);
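
Note for reviewers: the rename does not change the shape of the routing rule, only its vocabulary. An ordinal whose change set contains ADDED_ROWS together with any delete type goes to the slow path; everything else goes to the fast path. Below is a minimal, standalone sketch of that rule; the ChangeType enum and its DELETED_ROWS/DELETED_FILE values are simplified stand-ins for SerializableChangelogTask.Type, not identifiers taken from this commit.

import java.util.EnumSet;
import java.util.Set;

public class ChangeRoutingSketch {
  // Simplified stand-in for SerializableChangelogTask.Type.
  enum ChangeType { ADDED_ROWS, DELETED_ROWS, DELETED_FILE }

  // An ordinal whose change set contains ADDED_ROWS plus at least one delete type is
  // "bidirectional" (slow path); otherwise it is "unidirectional" (fast path).
  static String routeFor(Set<ChangeType> changeTypes) {
    if (changeTypes.contains(ChangeType.ADDED_ROWS) && changeTypes.size() > 1) {
      return "BIDIRECTIONAL_CHANGES"; // both inserts and deletes, may contain updates
    }
    return "UNIDIRECTIONAL_CHANGES"; // only inserts or only deletes
  }

  public static void main(String[] args) {
    System.out.println(routeFor(EnumSet.of(ChangeType.ADDED_ROWS)));                            // unidirectional
    System.out.println(routeFor(EnumSet.of(ChangeType.DELETED_ROWS, ChangeType.DELETED_FILE))); // unidirectional
    System.out.println(routeFor(EnumSet.of(ChangeType.ADDED_ROWS, ChangeType.DELETED_ROWS)));   // bidirectional
  }
}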

sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/IncrementalChangelogSource.java

Lines changed: 33 additions & 24 deletions

@@ -17,8 +17,8 @@
  */
 package org.apache.beam.sdk.io.iceberg.cdc;

-import static org.apache.beam.sdk.io.iceberg.cdc.ChangelogScanner.MIXED_CHANGES;
-import static org.apache.beam.sdk.io.iceberg.cdc.ChangelogScanner.UNIFORM_CHANGES;
+import static org.apache.beam.sdk.io.iceberg.cdc.ChangelogScanner.BIDIRECTIONAL_CHANGES;
+import static org.apache.beam.sdk.io.iceberg.cdc.ChangelogScanner.UNIDIRECTIONAL_CHANGES;
 import static org.apache.beam.sdk.io.iceberg.cdc.ReadFromChangelogs.KEYED_DELETES;
 import static org.apache.beam.sdk.io.iceberg.cdc.ReadFromChangelogs.KEYED_INSERTS;
 import static org.apache.beam.sdk.io.iceberg.cdc.ReadFromChangelogs.UNIFORM_ROWS;
@@ -77,25 +77,28 @@ public PCollection<Row> expand(PBegin input) {
        .apply(
            "Create Changelog Tasks",
            ParDo.of(new ChangelogScanner(scanConfig))
-               .withOutputTags(UNIFORM_CHANGES, TupleTagList.of(MIXED_CHANGES)));
+               .withOutputTags(
+                   UNIDIRECTIONAL_CHANGES, TupleTagList.of(BIDIRECTIONAL_CHANGES)));

    // for changelog ordinal groups that have UNIFORM changes (i.e. all deletes, or all inserts),
    // take the fast approach of just reading and emitting CDC records.
-   PCollection<Row> fastPathCdcRows =
-       processUniformChanges(
-           changelogTasks.get(UNIFORM_CHANGES).setCoder(ChangelogScanner.OUTPUT_CODER));
+   PCollection<Row> uniDirectionalCdcRows =
+       processUniDirectionalChanges(
+           changelogTasks.get(UNIDIRECTIONAL_CHANGES).setCoder(ChangelogScanner.OUTPUT_CODER));

-   // changelog ordinal groups that have MIXED changes (i.e. some deletes and some inserts)
-   // will need extra processing to identify any updates
-   PCollection<Row> slowPathCdcRows =
-       processMixedChanges(
-           changelogTasks.get(MIXED_CHANGES).setCoder(ChangelogScanner.OUTPUT_CODER));
+   // changelog ordinal groups that have BIDIRECTIONAL changes (i.e. both deletes and inserts)
+   // will need extra processing (including a shuffle) to identify any updates
+   PCollection<Row> largeBiDirectionalCdcRows =
+       processLargeBiDirectionalChanges(
+           changelogTasks.get(BIDIRECTIONAL_CHANGES).setCoder(ChangelogScanner.OUTPUT_CODER));

-   // Merge UNIFORM and MIXED outputs
-   return PCollectionList.of(fastPathCdcRows).and(slowPathCdcRows).apply(Flatten.pCollections());
+   // Merge UNIDIRECTIONAL and BIDIRECTIONAL outputs
+   return PCollectionList.of(uniDirectionalCdcRows)
+       .and(largeBiDirectionalCdcRows)
+       .apply(Flatten.pCollections());
  }

- private PCollection<Row> processUniformChanges(
+ private PCollection<Row> processUniDirectionalChanges(
      PCollection<KV<ChangelogDescriptor, List<SerializableChangelogTask>>> uniformChangelogs) {
    return uniformChangelogs
        .apply(Redistribute.arbitrarily())
@@ -107,41 +110,47 @@ private PCollection<Row> processUniformChanges(
        .setRowSchema(IcebergUtils.icebergSchemaToBeamSchema(scanConfig.getProjectedSchema()));
  }

- private PCollection<Row> processMixedChanges(
-     PCollection<KV<ChangelogDescriptor, List<SerializableChangelogTask>>> mixedChangelogs) {
-   PCollectionTuple mixedCdcKeyedRows =
-       mixedChangelogs
+ private PCollection<Row> processLargeBiDirectionalChanges(
+     PCollection<KV<ChangelogDescriptor, List<SerializableChangelogTask>>>
+         biDirectionalChangelogs) {
+   PCollectionTuple biDirectionalKeyedRows =
+       biDirectionalChangelogs
           .apply(Redistribute.arbitrarily())
           .apply(
-              "Read Mixed Changes",
+              "Read Large BiDirectional Changes",
               ParDo.of(ReadFromChangelogs.withKeyedOutput(scanConfig))
                   .withOutputTags(KEYED_INSERTS, TupleTagList.of(KEYED_DELETES)));

   // prior to CoGBK, set a windowing strategy to maintain the earliest timestamp in the window
+  // this allows us to emit records downstream that may have larger reified timestamps
   Window<KV<Row, TimestampedValue<Row>>> windowingStrategy =
       Window.<KV<Row, TimestampedValue<Row>>>into(new GlobalWindows())
           .withTimestampCombiner(TimestampCombiner.EARLIEST);

   // preserve the element's timestamp by moving it into the value
+  // in the normal case, this will be a no-op because all CDC rows in an ordinal have the same
+  // commit timestamp.
+  // but this will matter if we add custom watermarking, where record timestamps are
+  // derived from a specified column
   KvCoder<Row, Row> keyedOutputCoder = ReadFromChangelogs.keyedOutputCoder(scanConfig);
   PCollection<KV<Row, TimestampedValue<Row>>> keyedInsertsWithTimestamps =
-      mixedCdcKeyedRows
+      biDirectionalKeyedRows
          .get(KEYED_INSERTS)
          .setCoder(keyedOutputCoder)
-         .apply(Reify.timestampsInValue())
+         .apply("Reify INSERT Timestamps", Reify.timestampsInValue())
          .apply(windowingStrategy);
   PCollection<KV<Row, TimestampedValue<Row>>> keyedDeletesWithTimestamps =
-      mixedCdcKeyedRows
+      biDirectionalKeyedRows
          .get(KEYED_DELETES)
          .setCoder(keyedOutputCoder)
-         .apply(Reify.timestampsInValue())
+         .apply("Reify DELETE Timestamps", Reify.timestampsInValue())
          .apply(windowingStrategy);

   // CoGroup by record ID and emit any (DELETE + INSERT) pairs as updates: (UPDATE_BEFORE,
   // UPDATE_AFTER)
   return KeyedPCollectionTuple.of(INSERTS, keyedInsertsWithTimestamps)
       .and(DELETES, keyedDeletesWithTimestamps)
-      .apply(CoGroupByKey.create())
+      .apply("CoGroupBy Row ID", CoGroupByKey.create())
       .apply("Reconcile Inserts and Deletes", ParDo.of(new ReconcileChanges()))
       .setRowSchema(IcebergUtils.icebergSchemaToBeamSchema(scanConfig.getProjectedSchema()));
  }
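
Note for reviewers: the slow path above is essentially one Beam pattern, namely windowing the keyed insert and delete streams into the global window with an EARLIEST timestamp combiner and then co-grouping them by row ID. The sketch below isolates just that wiring, with String keys and values standing in for the Beam Rows so it stays self-contained; the class and method names here are illustrative and not taken from the commit.

import org.apache.beam.sdk.transforms.join.CoGbkResult;
import org.apache.beam.sdk.transforms.join.CoGroupByKey;
import org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple;
import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
import org.apache.beam.sdk.transforms.windowing.TimestampCombiner;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TupleTag;

class CoGroupSketch {
  static final TupleTag<String> INSERTS = new TupleTag<>();
  static final TupleTag<String> DELETES = new TupleTag<>();

  // Co-groups keyed inserts and deletes so a downstream DoFn can pair them up per key.
  static PCollection<KV<String, CoGbkResult>> coGroupByRowId(
      PCollection<KV<String, String>> keyedInserts,
      PCollection<KV<String, String>> keyedDeletes) {
    // Global window whose output timestamp is the earliest input timestamp, mirroring the
    // windowing strategy applied before the CoGBK in processLargeBiDirectionalChanges.
    Window<KV<String, String>> keepEarliestTimestamp =
        Window.<KV<String, String>>into(new GlobalWindows())
            .withTimestampCombiner(TimestampCombiner.EARLIEST);

    return KeyedPCollectionTuple.of(INSERTS, keyedInserts.apply("Window Inserts", keepEarliestTimestamp))
        .and(DELETES, keyedDeletes.apply("Window Deletes", keepEarliestTimestamp))
        .apply("CoGroupBy Row ID", CoGroupByKey.create());
  }
}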

sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReadFromChangelogs.java

Lines changed: 43 additions & 29 deletions

@@ -66,8 +66,8 @@ public class ReadFromChangelogs<OutT>
  private transient StructProjection recordIdProjection;
  private transient org.apache.iceberg.Schema recordIdSchema;
  private final Schema beamRowSchema;
- private final Schema rowIdWithOrdinalBeamSchema;
- private static final String ORDINAL_FIELD = "__beam__changelog__ordinal__";
+ private final Schema rowAndSnapshotIDBeamSchema;
+ private static final String SNAPSHOT_FIELD = "__beam__changelog__snapshot__id__";

  private ReadFromChangelogs(IcebergScanConfig scanConfig, boolean keyedOutput) {
    this.scanConfig = scanConfig;
@@ -78,13 +78,7 @@ private ReadFromChangelogs(IcebergScanConfig scanConfig, boolean keyedOutput) {
    this.recordIdSchema = recordSchema.select(recordSchema.identifierFieldNames());
    this.recordIdProjection = StructProjection.create(recordSchema, recordIdSchema);

-   Schema rowIdBeamSchema = icebergSchemaToBeamSchema(recordIdSchema);
-   List<Schema.Field> fields =
-       ImmutableList.<Schema.Field>builder()
-           .add(Schema.Field.of(ORDINAL_FIELD, Schema.FieldType.INT32))
-           .addAll(rowIdBeamSchema.getFields())
-           .build();
-   this.rowIdWithOrdinalBeamSchema = new Schema(fields);
+   this.rowAndSnapshotIDBeamSchema = rowAndSnapshotIDBeamSchema(scanConfig);
  }

  static ReadFromChangelogs<Row> of(IcebergScanConfig scanConfig) {
@@ -100,19 +94,24 @@ static ReadFromChangelogs<KV<Row, Row>> withKeyedOutput(IcebergScanCo
   * schema's identifier fields.
   */
  static KvCoder<Row, Row> keyedOutputCoder(IcebergScanConfig scanConfig) {
+   org.apache.iceberg.Schema recordSchema = scanConfig.getProjectedSchema();
+   Schema rowAndSnapshotIDBeamSchema = rowAndSnapshotIDBeamSchema(scanConfig);
+   return KvCoder.of(
+       SchemaCoder.of(rowAndSnapshotIDBeamSchema),
+       SchemaCoder.of(icebergSchemaToBeamSchema(recordSchema)));
+ }
+
+ private static Schema rowAndSnapshotIDBeamSchema(IcebergScanConfig scanConfig) {
    org.apache.iceberg.Schema recordSchema = scanConfig.getProjectedSchema();
    org.apache.iceberg.Schema recordIdSchema =
-       recordSchema.select(recordSchema.identifierFieldNames());
+       recordSchema.select(recordSchema.identifierFieldNames());
    Schema rowIdBeamSchema = icebergSchemaToBeamSchema(recordIdSchema);
    List<Schema.Field> fields =
-       ImmutableList.<Schema.Field>builder()
-           .add(Schema.Field.of(ORDINAL_FIELD, Schema.FieldType.INT32))
-           .addAll(rowIdBeamSchema.getFields())
-           .build();
-   Schema rowIdWithOrdinalBeamSchema = new Schema(fields);
-   return KvCoder.of(
-       SchemaCoder.of(rowIdWithOrdinalBeamSchema),
-       SchemaCoder.of(icebergSchemaToBeamSchema(recordSchema)));
+       ImmutableList.<Schema.Field>builder()
+           .add(Schema.Field.of(SNAPSHOT_FIELD, Schema.FieldType.INT64))
+           .addAll(rowIdBeamSchema.getFields())
+           .build();
+   return new Schema(fields);
  }

  @Setup
@@ -164,12 +163,16 @@ private void processAddedRowsTask(
    try (CloseableIterable<Record> fullIterable = ReadUtils.createReader(task, table, scanConfig)) {
      DeleteFilter<Record> deleteFilter =
          ReadUtils.genericDeleteFilter(
-             table, scanConfig, task.getDataFile().getPath(), task.getExistingDeletes());
+             table, scanConfig, task.getDataFile().getPath(), task.getAddedDeletes());
      CloseableIterable<Record> filtered = deleteFilter.filter(fullIterable);

      for (Record rec : filtered) {
        outputRecord(
-           rec, outputReceiver, task.getOrdinal(), task.getTimestampMillis(), KEYED_INSERTS);
+           rec,
+           outputReceiver,
+           task.getCommitSnapshotId(),
+           task.getTimestampMillis(),
+           KEYED_INSERTS);
      }
    }
    numAddedRowsScanTasksCompleted.inc();
@@ -192,7 +195,11 @@ private void processDeletedRowsTask(
      for (Record rec : newlyDeletedRecords) {
        // TODO: output with DELETE kind
        outputRecord(
-           rec, outputReceiver, task.getOrdinal(), task.getTimestampMillis(), KEYED_DELETES);
+           rec,
+           outputReceiver,
+           task.getCommitSnapshotId(),
+           task.getTimestampMillis(),
+           KEYED_DELETES);
      }
    }
    numDeletedRowsScanTasksCompleted.inc();
@@ -209,7 +216,11 @@ private void processDeletedFileTask(
      for (Record rec : filtered) {
        // TODO: output with DELETE kind
        outputRecord(
-           rec, outputReceiver, task.getOrdinal(), task.getTimestampMillis(), KEYED_DELETES);
+           rec,
+           outputReceiver,
+           task.getCommitSnapshotId(),
+           task.getTimestampMillis(),
+           KEYED_DELETES);
      }
    }
    numDeletedDataFileScanTasksCompleted.inc();
@@ -218,35 +229,38 @@ private void processDeletedFileTask(
  private void outputRecord(
      Record rec,
      MultiOutputReceiver outputReceiver,
-     int ordinal,
+     long snapshotId,
      long timestampMillis,
      TupleTag<KV<Row, Row>> keyedTag) {
    Row row = IcebergUtils.icebergRecordToBeamRow(beamRowSchema, rec);
    Instant timestamp = Instant.ofEpochMilli(timestampMillis);
    if (keyedOutput) { // slow path
      StructProjection recId = recordIdProjection.wrap(rec);
-     // Create a Row ID consisting of record ID columns and the changelog task's ordinal #
-     Row id = structToBeamRow(ordinal, recId, recordIdSchema, rowIdWithOrdinalBeamSchema);
+     // Create a Row ID consisting of:
+     // 1. the task's commit snapshot ID
+     // 2. the record ID column values
+     // This is needed to sufficiently distinguish a record change
+     Row id = structToBeamRow(snapshotId, recId, recordIdSchema, rowAndSnapshotIDBeamSchema);
      outputReceiver.get(keyedTag).outputWithTimestamp(KV.of(id, row), timestamp);
    } else { // fast path
-     System.out.printf("[UNIFORM] -- Output(%s, %s)\n%s%n", ordinal, timestamp, row);
+     System.out.printf("[UNIFORM] -- Output(%s, %s)\n%s%n", snapshotId, timestamp, row);
      outputReceiver.get(UNIFORM_ROWS).outputWithTimestamp(row, timestamp);
    }
  }

  public static Row structToBeamRow(
-     int ordinal, StructLike struct, org.apache.iceberg.Schema schema, Schema beamSchema) {
+     long snapshotId, StructLike struct, org.apache.iceberg.Schema schema, Schema beamSchema) {
    ImmutableMap.Builder<String, Object> values = ImmutableMap.builder();
    List<Types.NestedField> columns = schema.columns();
    for (Types.NestedField column : columns) {
      String name = column.name();
      Object value = schema.accessorForField(column.fieldId()).get(struct);
      values.put(name, value);
    }
-   // Include ordinal as part of the row ID.
+   // Include snapshot ID as part of the row ID.
    // This is essential to ensure that the downstream ReconcileChanges compares rows
    // within the same operation.
-   values.put(ORDINAL_FIELD, ordinal);
+   values.put(SNAPSHOT_FIELD, snapshotId);
    return Row.withSchema(beamSchema).withFieldValues(values.build()).build();
  }

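
Note for reviewers: the net effect of the schema change in this file is that the grouping key is now an INT64 snapshot-ID column prepended to the record's identifier fields. Below is a small sketch of what such a composite key Row looks like, assuming a hypothetical table whose only identifier field is order_id; the field name and the literal snapshot IDs are made up for illustration.

import java.util.Arrays;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.values.Row;

class RowKeySketch {
  private static final String SNAPSHOT_FIELD = "__beam__changelog__snapshot__id__";

  // Builds the composite key: snapshot ID first, then the table's identifier column(s).
  static Row keyFor(long snapshotId, long orderId) {
    Schema keySchema =
        new Schema(
            Arrays.asList(
                // INT64 now, since Iceberg snapshot IDs are longs (the old ordinal was INT32)
                Schema.Field.of(SNAPSHOT_FIELD, Schema.FieldType.INT64),
                Schema.Field.of("order_id", Schema.FieldType.INT64)));
    return Row.withSchema(keySchema).addValues(snapshotId, orderId).build();
  }

  public static void main(String[] args) {
    // Two changes to the same row in different snapshots no longer share a key,
    // so the downstream reconciliation only pairs deletes and inserts from the same commit.
    System.out.println(keyFor(1111L, 42L));
    System.out.println(keyFor(2222L, 42L));
  }
}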

sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ReconcileChanges.java

Lines changed: 1 addition & 0 deletions

@@ -36,6 +36,7 @@ public void processElement(
      @Timestamp Instant timestamp,
      OutputReceiver<Row> out) {
    CoGbkResult result = element.getValue();
+   System.out.println("xxx [MIXED] Process timestamp: " + timestamp);

    // iterables are lazy-loaded from the shuffle service
    Iterable<TimestampedValue<Row>> deletes = result.getAll(DELETES);
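
Note for reviewers: ReconcileChanges itself only gains a debug print in this commit, but the "CoGroup by record ID and emit any (DELETE + INSERT) pairs as updates: (UPDATE_BEFORE, UPDATE_AFTER)" comment in IncrementalChangelogSource describes the contract it implements. The sketch below is an assumed, simplified rendering of that pairing rule on plain String rows; it is not the committed implementation and ignores timestamps and Beam plumbing.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

class ReconcileSketch {
  enum Kind { INSERT, DELETE, UPDATE_BEFORE, UPDATE_AFTER }

  record Change(Kind kind, String row) {}

  // For one co-grouped key: each (delete, insert) pair becomes an update;
  // unmatched changes pass through as plain deletes or inserts.
  static List<Change> reconcile(Iterable<String> deletes, Iterable<String> inserts) {
    List<Change> out = new ArrayList<>();
    Iterator<String> del = deletes.iterator();
    Iterator<String> ins = inserts.iterator();
    while (del.hasNext() && ins.hasNext()) {
      out.add(new Change(Kind.UPDATE_BEFORE, del.next()));
      out.add(new Change(Kind.UPDATE_AFTER, ins.next()));
    }
    del.forEachRemaining(r -> out.add(new Change(Kind.DELETE, r)));
    ins.forEachRemaining(r -> out.add(new Change(Kind.INSERT, r)));
    return out;
  }
}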
