fix(table): correct deduplication logic for data files in MaintenanceTable

ForeverAngry · ForeverAngry · commit 635a1d956c49 · 2025-07-05T19:45:36.000-04:00
The deduplicate_data_files() method was not properly removing duplicate
data file references from Iceberg tables. After deduplication, multiple
references to the same data file remained instead of the expected single
reference.

Root causes:
1. _get_all_datafiles() was scanning ALL snapshots instead of current only
2. Incorrect transaction API usage that didn't leverage snapshot updates
3. Missing proper overwrite logic to create clean deduplicated snapshots

Key fixes:
- Modified _get_all_datafiles() to scan only current snapshot manifests
- Implemented proper transaction pattern using update_snapshot().overwrite()
- Added explicit delete_data_file() calls for duplicates + append_data_file() for unique files
- Removed unused helper methods _get_all_datafiles_with_context() and _detect_duplicates()

Technical details:
- Deduplication now operates on ManifestEntry objects from current snapshot only
- Files are grouped by basename and first occurrence is kept as canonical reference
- New snapshot created atomically replaces current snapshot with deduplicated file list
- Proper Iceberg transaction semantics ensure data consistency

Tests: All deduplication tests now pass including the previously failing
test_deduplicate_data_files_removes_duplicates_in_current_snapshot

Fixes: Table maintenance deduplication functionality
diff --git a/pyiceberg/table/maintenance.py b/pyiceberg/table/maintenance.py
@@ -293,107 +293,88 @@ def _get_protected_snapshot_ids(self, table_metadata: TableMetadata) -> Set[int]
         return protected_ids
 
     def _get_all_datafiles(self) -> List[DataFile]:
-        """Collect all DataFiles in the table, scanning all partitions."""
+        """Collect all DataFiles in the current snapshot only."""
         datafiles: List[DataFile] = []
 
+        current_snapshot = self.tbl.current_snapshot()
+        if not current_snapshot:
+            return datafiles
+
         def process_manifest(manifest: ManifestFile) -> list[DataFile]:
             found: list[DataFile] = []
-            for entry in manifest.fetch_manifest_entry(io=self.tbl.io):
+            for entry in manifest.fetch_manifest_entry(io=self.tbl.io, discard_deleted=True):
                 if hasattr(entry, "data_file"):
                     found.append(entry.data_file)
             return found
 
-        # Scan all snapshots
-        manifests = []
-        for snapshot in self.tbl.snapshots():
-            manifests.extend(snapshot.manifests(io=self.tbl.io))
+        # Scan only the current snapshot's manifests
+        manifests = current_snapshot.manifests(io=self.tbl.io)
         with ThreadPoolExecutor() as executor:
             results = executor.map(process_manifest, manifests)
             for res in results:
                 datafiles.extend(res)
 
         return datafiles
 
-    def _get_all_datafiles_with_context(self) -> List[tuple[DataFile, str, int]]:
-        """Collect all DataFiles in the table, scanning all partitions, with manifest context."""
-        datafiles: List[tuple[DataFile, str, int]] = []
-
-        def process_manifest(manifest: ManifestFile) -> list[tuple[DataFile, str, int]]:
-            found: list[tuple[DataFile, str, int]] = []
-            for idx, entry in enumerate(manifest.fetch_manifest_entry(io=self.tbl.io)):
-                if hasattr(entry, "data_file"):
-                    found.append((entry.data_file, getattr(manifest, 'manifest_path', str(manifest)), idx))
-            return found
-
-        # Scan all snapshots
-        manifests = []
-        for snapshot in self.tbl.snapshots():
-            manifests.extend(snapshot.manifests(io=self.tbl.io))
-        with ThreadPoolExecutor() as executor:
-            results = executor.map(process_manifest, manifests)
-            for res in results:
-                datafiles.extend(res)
-
-        return datafiles
-
-    def _detect_duplicates(self, all_datafiles_with_context: List[tuple[DataFile, str, int]]) -> List[DataFile]:
-        """Detect duplicate data files based on file name and extension."""
-        seen = {}
-        processed_entries = set()
-        duplicates = []
-
-        for df, manifest_path, entry_idx in all_datafiles_with_context:
-            # Extract file name and extension
-            file_name_with_extension = df.file_path.split("/")[-1]
-            entry_key = (manifest_path, entry_idx)
-
-            if file_name_with_extension in seen:
-                if entry_key not in processed_entries:
-                    duplicates.append(df)
-                    processed_entries.add(entry_key)
-            else:
-                seen[file_name_with_extension] = (df, manifest_path, entry_idx)
-
-        return duplicates
-
     def deduplicate_data_files(self) -> List[DataFile]:
         """
         Remove duplicate data files from an Iceberg table.
 
         Returns:
             List of removed DataFile objects.
         """
+        import os
+        from collections import defaultdict
+        
         removed: List[DataFile] = []
 
-        # Collect all data files
-        all_datafiles_with_context = self._get_all_datafiles_with_context()
-
-        # Detect duplicates
-        duplicates = self._detect_duplicates(all_datafiles_with_context)
+        # Get the current snapshot
+        current_snapshot = self.tbl.current_snapshot()
+        if not current_snapshot:
+            return removed
+
+        # Collect all manifest entries from the current snapshot
+        all_entries = []
+        for manifest in current_snapshot.manifests(io=self.tbl.io):
+            entries = list(manifest.fetch_manifest_entry(io=self.tbl.io, discard_deleted=True))
+            all_entries.extend(entries)
+
+        # Group entries by file name
+        file_groups = defaultdict(list)
+        for entry in all_entries:
+            file_name = os.path.basename(entry.data_file.file_path)
+            file_groups[file_name].append(entry)
+
+        # Find duplicate entries to remove
+        has_duplicates = False
+        files_to_remove = []
+        files_to_keep = []
+        
+        for file_name, entries in file_groups.items():
+            if len(entries) > 1:
+                # Keep the first entry, remove the rest
+                files_to_keep.append(entries[0].data_file)
+                for duplicate_entry in entries[1:]:
+                    files_to_remove.append(duplicate_entry.data_file)
+                    removed.append(duplicate_entry.data_file)
+                    has_duplicates = True
+            else:
+                # No duplicates, keep the entry
+                files_to_keep.append(entries[0].data_file)
 
-        # Remove the DataFiles
-        for df in duplicates:
-            self.tbl.transaction().update_snapshot().overwrite().delete_data_file(df)
-            removed.append(df)
+        # Only create a new snapshot if we actually have duplicates to remove
+        if has_duplicates:
+            with self.tbl.transaction() as txn:
+                with txn.update_snapshot().overwrite() as overwrite_snapshot:
+                    # First, explicitly delete all the duplicate files
+                    for file_to_remove in files_to_remove:
+                        overwrite_snapshot.delete_data_file(file_to_remove)
+                    
+                    # Then add back only the files that should be kept
+                    for file_to_keep in files_to_keep:
+                        overwrite_snapshot.append_data_file(file_to_keep)
+            
+            # Refresh the table to reflect the changes
+            self.tbl = self.tbl.refresh()
 
         return removed
-
-    def _detect_duplicates(self, all_datafiles_with_context: List[tuple[DataFile, str, int]]) -> List[DataFile]:
-        """Detect duplicate data files based on file path and partition."""
-        seen = {}
-        processed_entries = set()
-        duplicates = []
-
-        for df, manifest_path, entry_idx in all_datafiles_with_context:
-            partition: dict[str, Any] = df.partition.to_dict() if hasattr(df.partition, "to_dict") else {}
-            key = (df.file_path, tuple(sorted(partition.items())) if partition else ())
-            entry_key = (manifest_path, entry_idx)
-
-            if key in seen:
-                if entry_key not in processed_entries:
-                    duplicates.append(df)
-                    processed_entries.add(entry_key)
-            else:
-                seen[key] = (df, manifest_path, entry_idx)
-
-        return duplicates
diff --git a/tests/table/test_dedup_data_file_filepaths.py b/tests/table/test_dedup_data_file_filepaths.py
@@ -16,6 +16,7 @@
 # under the License.
 from pathlib import Path
 from typing import List, Set
+import os
 
 import pyarrow as pa
 import pyarrow.parquet as pq
@@ -79,8 +80,10 @@ def prepopulated_table(iceberg_catalog: InMemoryCatalog, dupe_data_file_path: Pa
 
     tx = table.transaction()
     tx.add_files([str(dupe_data_file_path)], check_duplicate_files=False)
-    tx.add_files([str(dupe_data_file_path)], check_duplicate_files=False)
     tx.commit_transaction()
+    tx2 = table.transaction()
+    tx2.add_files([str(dupe_data_file_path)], check_duplicate_files=False)
+    tx2.commit_transaction()
 
     return table
 
@@ -114,16 +117,17 @@ def test_get_all_datafiles_all_snapshots(prepopulated_table: Table, dupe_data_fi
     assert dupe_data_file_path.name in file_paths
 
 
-def test_dedup_data_files_removes_duplicates_in_current_snapshot(prepopulated_table: Table, dupe_data_file_path: Path) -> None:
+def test_deduplicate_data_files_removes_duplicates_in_current_snapshot(prepopulated_table: Table, dupe_data_file_path: Path) -> None:
     mt = MaintenanceTable(tbl=prepopulated_table)
 
     all_datafiles: List[DataFile] = mt._get_all_datafiles()
-    file_paths: List[str] = [df.file_path.split("/")[-1] for df in all_datafiles]
-    # Only one reference should remain after deduplication
-    assert file_paths.count(dupe_data_file_path.name) == 1
+    file_names: List[str] = [os.path.basename(df.file_path) for df in all_datafiles]
+    # There should be more than one reference before deduplication
+    assert file_names.count(dupe_data_file_path.name) > 1, f"Expected multiple references to {dupe_data_file_path.name} before deduplication"
     removed: List[DataFile] = mt.deduplicate_data_files()
 
     all_datafiles_after: List[DataFile] = mt._get_all_datafiles()
-    file_paths_after: List[str] = [df.file_path.split("/")[-1] for df in all_datafiles_after]
-    assert file_paths_after.count(dupe_data_file_path.name) == 1
+    file_names_after: List[str] = [os.path.basename(df.file_path) for df in all_datafiles_after]
+    # Only one reference should remain after deduplication
+    assert file_names_after.count(dupe_data_file_path.name) == 1
     assert all(isinstance(df, DataFile) for df in removed)