
Commit 4c57d8b

no partition optimization for snapshots with multiple specs; skip duplicate records that may be produced by copy-on-write
1 parent b059ba7


2 files changed (+125 −31 lines)

sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ChangelogScanner.java

Lines changed: 9 additions & 1 deletion
@@ -237,6 +237,7 @@ private void gatherPartitionData(
       Set<Integer> pinnedSpecs,
       Set<Long> snapshotsWithUnpinnedSpecs)
       throws IOException {
+    Map<Long, Set<Integer>> specsInSnapshot = new HashMap<>();
     try (CloseableIterable<ScanTaskGroup<ChangelogScanTask>> groups = scan.planTasks()) {
       for (ScanTaskGroup<ChangelogScanTask> group : groups) {
         for (ChangelogScanTask task : group.tasks()) {
@@ -255,9 +256,16 @@ private void gatherPartitionData(
               .computeIfAbsent(snapshotId, (id) -> new HashMap<>())
               .computeIfAbsent(partition, (p) -> new HashSet<>())
               .add(type);
+          specsInSnapshot.computeIfAbsent(snapshotId, id -> new HashSet<>()).add(specId);
         }
       }
     }
+
+    // snapshots where multiple specs are used are also not safe
+    specsInSnapshot.entrySet().stream()
+        .filter(e -> e.getValue().size() > 1)
+        .map(Map.Entry::getKey)
+        .forEach(snapshotsWithUnpinnedSpecs::add);
   }
 
   private void createAndOutputReadTasks(
@@ -352,7 +360,7 @@ private void createAndOutputReadTasks(
                 checkStateNotNull(changeTypesPerPartitionPerSnapshot.get(snapshotId))
                     .get(partition))))) {
       // TODO: remove debug printing
-      System.out.printf("\tUnidirectional task with partition '%s':\n", partition);
+      System.out.printf("\tBidirectional task with partition '%s':\n", partition);
       System.out.printf(
           "\t\t(%s) DF: %s\n",
           task.getClass().getSimpleName(), name(getDataFile(task).location()));
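
For context on the first change: the new specsInSnapshot map records which partition spec IDs appear in each snapshot's scan tasks, and any snapshot that touches more than one spec is added to snapshotsWithUnpinnedSpecs, i.e. excluded from the partition optimization. A minimal, self-contained sketch of that filtering step (standalone Java with hypothetical snapshot IDs and spec IDs, not the Beam code itself):

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class MultiSpecFilterSketch {
  public static void main(String[] args) {
    // Hypothetical bookkeeping: snapshot ID -> partition spec IDs seen in its scan tasks.
    Map<Long, Set<Integer>> specsInSnapshot = new HashMap<>();
    specsInSnapshot.computeIfAbsent(100L, id -> new HashSet<>()).add(0);
    specsInSnapshot.computeIfAbsent(101L, id -> new HashSet<>()).add(0);
    specsInSnapshot.computeIfAbsent(101L, id -> new HashSet<>()).add(1); // spec evolved mid-snapshot

    // Snapshots that use more than one spec are not safe for the partition optimization.
    Set<Long> snapshotsWithUnpinnedSpecs = new HashSet<>();
    specsInSnapshot.entrySet().stream()
        .filter(e -> e.getValue().size() > 1)
        .map(Map.Entry::getKey)
        .forEach(snapshotsWithUnpinnedSpecs::add);

    System.out.println(snapshotsWithUnpinnedSpecs); // prints [101]
  }
}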

sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/cdc/ResolveChanges.java

Lines changed: 116 additions & 30 deletions
@@ -17,13 +17,18 @@
  */
 package org.apache.beam.sdk.io.iceberg.cdc;
 
-import java.util.Iterator;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import org.apache.beam.sdk.schemas.Schema;
 import org.apache.beam.sdk.transforms.DoFn;
 import org.apache.beam.sdk.transforms.join.CoGbkResult;
 import org.apache.beam.sdk.values.KV;
 import org.apache.beam.sdk.values.Row;
 import org.apache.beam.sdk.values.TimestampedValue;
 import org.apache.beam.sdk.values.TupleTag;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists;
 
 /**
  * Receives inserts and deletes, keyed by snapshot ID and Primary Key, and determines if any updates
@@ -34,62 +39,143 @@
  *
  * <p>Otherwise, records are output as-is: INSERT as INSERT, and DELETE as DELETE.
  *
- * <p>Input elements have their timestamp reified. This is because CoGroupByKey assigns all elements
- * in a window with the same timestamp, erasing individual record timestamps. This DoFn preserves it
- * by outputting records with their reified timestamps.
+ * <p>Note: snapshots written using the Copy-on-Write method will produce tasks where records appear
+ * to be deleted then re-inserted. We perform an initial de-duplication and drop these records to
+ * avoid mistaking them as no-op updates.
+ *
+ * <p>Input elements are prepared by reifying their timestamps. This is because CoGroupByKey assigns
+ * all elements in a window with the same timestamp, erasing individual record timestamps. This DoFn
+ * preserves it by outputting records with their reified timestamps.
  */
 public class ResolveChanges extends DoFn<KV<Row, CoGbkResult>, Row> {
   public static final TupleTag<TimestampedValue<Row>> DELETES = new TupleTag<>() {};
   public static final TupleTag<TimestampedValue<Row>> INSERTS = new TupleTag<>() {};
 
   @DoFn.ProcessElement
   public void processElement(@Element KV<Row, CoGbkResult> element, OutputReceiver<Row> out) {
+    Set<String> pkFields = new HashSet<>(element.getKey().getSchema().getFieldNames());
     CoGbkResult result = element.getValue();
 
-    // iterables are lazy-loaded from the shuffle service
-    Iterable<TimestampedValue<Row>> deletes = result.getAll(DELETES);
-    Iterable<TimestampedValue<Row>> inserts = result.getAll(INSERTS);
+    List<TimestampedValue<Row>> deletes = Lists.newArrayList(result.getAll(DELETES));
+    List<TimestampedValue<Row>> inserts = Lists.newArrayList(result.getAll(INSERTS));
+    deletes.sort(Comparator.comparing(TimestampedValue::getTimestamp));
+    inserts.sort(Comparator.comparing(TimestampedValue::getTimestamp));
+
+    boolean[] duplicateDeletes = new boolean[deletes.size()];
+    boolean[] duplicateInserts = new boolean[inserts.size()];
 
-    boolean hasDeletes = deletes.iterator().hasNext();
-    boolean hasInserts = inserts.iterator().hasNext();
+    boolean hasDeletes = !deletes.isEmpty();
+    boolean hasInserts = !inserts.isEmpty();
 
     if (hasInserts && hasDeletes) {
       // UPDATE: row ID exists in both streams
       // - emit all deletes as 'UPDATE_BEFORE', and all inserts as 'UPDATE_AFTER'
-      // - emit extra inserts as 'UPDATE_AFTER'
-      // - ignore extra deletes (TODO: double check if this is a good decision)
-      Iterator<TimestampedValue<Row>> deletesIterator = deletes.iterator();
-      Iterator<TimestampedValue<Row>> insertsIterator = inserts.iterator();
-      while (deletesIterator.hasNext() && insertsIterator.hasNext()) {
-        // TODO: output as UPDATE_BEFORE kind
-        TimestampedValue<Row> updateBefore = deletesIterator.next();
-        out.outputWithTimestamp(updateBefore.getValue(), updateBefore.getTimestamp());
-        System.out.printf("[BIDIRECTIONAL] -- UpdateBefore%n%s%n", updateBefore);
-
-        // TODO: output as UPDATE_AFTER kind
-        TimestampedValue<Row> updateAfter = insertsIterator.next();
-        out.outputWithTimestamp(updateAfter.getValue(), updateAfter.getTimestamp());
-        System.out.printf("[BIDIRECTIONAL] -- UpdateAfter%n%s%n", updateAfter);
+      // - emit extra deletes as 'DELETE'
+      // - emit extra inserts as 'INSERT'
+
+      // First, loop through both deletes and inserts and deduplicate records.
+      // Duplicates can occur when an Iceberg writer uses CoW method:
+      // Deletes records by rewriting an entire DataFile except for the few records intended for
+      // deletion.
+      // From our perspective, all the records were deleted, and some were inserted back in.
+      // We must ignore these records and not mistake them for "updates".
+      for (int d = 0; d < deletes.size(); d++) {
+        TimestampedValue<Row> delete = deletes.get(d);
+
+        for (int i = 0; i < inserts.size(); i++) {
+          if (duplicateInserts[i]) {
+            continue;
+          }
+
+          TimestampedValue<Row> insert = inserts.get(i);
+          if (isDuplicate(pkFields, delete.getValue(), insert.getValue())) {
+            duplicateDeletes[d] = true;
+            duplicateInserts[i] = true;
+            System.out.printf("[DEDUPE] -- Ignored CoW record: %s%n", delete);
+            break;
+          }
+        }
+      }
+
+      // Second, loop through and output UPDATE pairs
+      int d = 0;
+      int i = 0;
+      while (d < deletes.size() && i < inserts.size()) {
+        // find next unique delete
+        while (d < deletes.size() && duplicateDeletes[d]) {
+          d++;
+        }
+        // find next unique insert
+        while (i < inserts.size() && duplicateInserts[i]) {
+          i++;
+        }
+
+        if (d < deletes.size() && i < inserts.size()) {
+          // UPDATE pair found. output as UpdateBefore/After
+          TimestampedValue<Row> updateBefore = deletes.get(d);
+          TimestampedValue<Row> updateAfter = inserts.get(i);
+
+          // TODO: output as UPDATE_BEFORE and UPDATE_AFTER kind
+          out.outputWithTimestamp(updateBefore.getValue(), updateBefore.getTimestamp());
+          out.outputWithTimestamp(updateAfter.getValue(), updateAfter.getTimestamp());
+          System.out.printf(
+              "[BIDIRECTIONAL] -- UpdateBefore:%n\t%s%n\tUpdateAfter%n\t%s%n",
+              updateBefore, updateAfter);
+
+          d++;
+          i++;
+        }
       }
-      while (insertsIterator.hasNext()) {
-        // TODO: output as UPDATE_AFTER kind
-        TimestampedValue<Row> insert = insertsIterator.next();
-        out.outputWithTimestamp(insert.getValue(), insert.getTimestamp());
-        System.out.printf("[BIDIRECTIONAL] -- Added(extra)%n%s%n", insert);
+
+      // Finally, output extra deletes or inserts
+      while (d < deletes.size()) {
+        // TODO: output as DELETE kind
+        if (!duplicateDeletes[d]) {
+          TimestampedValue<Row> delete = deletes.get(d);
+          out.outputWithTimestamp(delete.getValue(), delete.getTimestamp());
+          System.out.printf("[BIDIRECTIONAL] -- Deleted(extra)%n%s%n", delete);
        }
+        d++;
+      }
+      while (i < inserts.size()) {
+        // TODO: output as INSERT kind
+        if (!duplicateInserts[i]) {
+          TimestampedValue<Row> insert = inserts.get(i);
+          out.outputWithTimestamp(insert.getValue(), insert.getTimestamp());
+          System.out.printf("[BIDIRECTIONAL] -- Inserted(extra)%n%s%n", insert);
+        }
+        i++;
       }
     } else if (hasInserts) {
       // INSERT only
       for (TimestampedValue<Row> rec : inserts) {
-        System.out.printf("[UNIDIRECTIONAL] -- Added%n%s%n", rec);
+        System.out.printf("[BIDIRECTIONAL (only inserts)] -- Added%n%s%n", rec);
        out.outputWithTimestamp(rec.getValue(), rec.getTimestamp());
      }
     } else if (hasDeletes) {
       // DELETE only
       for (TimestampedValue<Row> rec : deletes) {
         // TODO: output as DELETE kind
-        System.out.printf("[UNIDIRECTIONAL] -- Deleted%n%s%n", rec);
+        System.out.printf("[BIDIRECTIONAL (only deletes)] -- Deleted%n%s%n", rec);
         out.outputWithTimestamp(rec.getValue(), rec.getTimestamp());
       }
     }
   }
+
+  /** Compares both records and checks whether they are duplicates or not. */
+  private static boolean isDuplicate(Set<String> pkFields, Row delete, Row insert) {
+    Schema schema = insert.getSchema();
+    for (String field : insert.getSchema().getFieldNames()) {
+      if (pkFields.contains(field)) {
+        // these records are grouped by Primary Key, so we already know PK values are equal
+        continue;
+      }
+      // return early if two values are not equal
+      if (!Row.Equals.deepEquals(
+          insert.getValue(field), delete.getValue(field), schema.getField(field).getType())) {
+        return false;
+      }
+    }
+    return true;
+  }
 }
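
To make the copy-on-write case concrete: a CoW writer that deletes a few rows rewrites the whole data file, so the changelog reports every surviving row as a delete followed by an identical insert within the same snapshot. Comparing the non-key fields of each delete/insert pair is therefore enough to recognize and drop these no-op pairs, which is what isDuplicate does above. A standalone illustration with hypothetical flat records (maps of field name to value, not Beam Rows):

import java.util.Map;
import java.util.Objects;
import java.util.Set;

public class CowDedupeSketch {
  // A delete/insert pair with equal non-key fields is a CoW rewrite artifact, not an update.
  static boolean isNoOpPair(
      Map<String, Object> delete, Map<String, Object> insert, Set<String> pkFields) {
    for (String field : insert.keySet()) {
      if (pkFields.contains(field)) {
        continue; // pairs are already grouped by primary key, so key fields match by construction
      }
      if (!Objects.deepEquals(delete.get(field), insert.get(field))) {
        return false; // a non-key value changed, so this really is an update
      }
    }
    return true;
  }

  public static void main(String[] args) {
    Set<String> pk = Set.of("id");
    // CoW rewrite: row 7 was untouched, but shows up as a delete plus an identical insert.
    System.out.println(
        isNoOpPair(Map.of("id", 7, "name", "a"), Map.of("id", 7, "name", "a"), pk)); // true
    // Genuine update: the payload changed between the delete and the insert.
    System.out.println(
        isNoOpPair(Map.of("id", 8, "name", "a"), Map.of("id", 8, "name", "b"), pk)); // false
  }
}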
