simplify validation

ahmedabu98 · ahmedabu98 · commit 1f4771218f9e · 2025-03-20T08:17:07.000-04:00
diff --git a/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AppendFilesToTables.java b/sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/AppendFilesToTables.java
@@ -22,7 +22,9 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.UUID;
+import java.util.stream.Collectors;
 import org.apache.beam.sdk.coders.KvCoder;
 import org.apache.beam.sdk.coders.StringUtf8Coder;
 import org.apache.beam.sdk.metrics.Counter;
@@ -38,6 +40,8 @@
 import org.apache.beam.sdk.util.Preconditions;
 import org.apache.beam.sdk.values.KV;
 import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Iterables;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Streams;
 import org.apache.iceberg.AppendFiles;
 import org.apache.iceberg.DataFile;
 import org.apache.iceberg.FileFormat;
@@ -128,13 +132,12 @@ public void processElement(
         BoundedWindow window)
         throws IOException {
       String tableStringIdentifier = element.getKey();
+      Table table = getCatalog().loadTable(TableIdentifier.parse(element.getKey()));
       Iterable<FileWriteResult> fileWriteResults = element.getValue();
-      if (!fileWriteResults.iterator().hasNext()) {
+      if (shouldSkip(table, fileWriteResults)) {
         return;
       }
 
-      Table table = getCatalog().loadTable(TableIdentifier.parse(element.getKey()));
-
       // vast majority of the time, we will simply append data files.
       // in the rare case we get a batch that contains multiple partition specs, we will group
       // data into manifest files and append.
@@ -211,5 +214,36 @@ private ManifestWriter<DataFile> createManifestWriter(
                   tableLocation, manifestFilePrefix, uuid, spec.specId()));
       return ManifestFiles.write(spec, io.newOutputFile(location));
     }
+
+    // Decides whether the current batch of files should be skipped instead of committed.
+    // An empty batch is always skipped, even when the table has no snapshot yet.
+    // If the process call fails immediately after a successful commit, it gets retried with
+    // the same data, possibly leading to data duplication. To mitigate, we also skip the
+    // current batch if it matches the most recently committed batch.
+    // TODO(ahmedabu98): This does not cover concurrent writes from other pipelines, where the
+    //  "last successful snapshot" might reflect commits from other sources. Ideally, we would make
+    //  this stateful, but that is update incompatible.
+    // TODO(ahmedabu98): add load test pipelines with intentional periodic crashing
+    private boolean shouldSkip(Table table, Iterable<FileWriteResult> fileWriteResults) {
+      if (!fileWriteResults.iterator().hasNext()) {
+        return true;
+      }
+      if (table.currentSnapshot() == null) {
+        return false;
+      }
+
+      Set<String> filesCommittedLastSnapshot =
+          Streams.stream(table.currentSnapshot().addedDataFiles(table.io()))
+              .map(DataFile::path)
+              .map(CharSequence::toString)
+              .collect(Collectors.toSet());
+
+      // Check if the current batch is identical to the most recently committed batch.
+      // Upstream GBK means we always get the same batch of files on retry,
+      // so a single overlapping file means the whole batch is identical.
+      return Iterables.size(fileWriteResults) == filesCommittedLastSnapshot.size()
+          && filesCommittedLastSnapshot.contains(
+              fileWriteResults.iterator().next().getSerializableDataFile().getPath());
+    }
   }
 }
diff --git a/settings.gradle.kts b/settings.gradle.kts
@@ -363,3 +363,4 @@ include("sdks:java:io:iceberg:bqms")
 findProject(":sdks:java:io:iceberg:bqms")?.name = "bqms"
 include("it:clickhouse")
 findProject(":it:clickhouse")?.name = "clickhouse"
+include("testing")