
Commit 172f9c0

Cleanup
1 parent e474fda commit 172f9c0

File tree

2 files changed: +163 −44 lines changed

pyiceberg/table/__init__.py

Lines changed: 59 additions & 37 deletions
@@ -2967,6 +2967,13 @@ def __init__(
         super().__init__(operation, transaction, io, commit_uuid, snapshot_properties)
         self._predicate = AlwaysFalse()

+    def _commit(self) -> UpdatesAndRequirements:
+        # Only produce a commit when there is something to delete
+        if self.files_affected:
+            return super()._commit()
+        else:
+            return (), ()
+
     def _build_partition_projection(self, spec_id: int) -> BooleanExpression:
         schema = self._transaction.table_metadata.schema()
         spec = self._transaction.table_metadata.specs()[spec_id]
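
Taken together with the `files_affected` property added further down in this diff, the `_commit` override above makes a no-match delete a true no-op: instead of committing an empty DELETE snapshot, the producer returns empty updates and requirements. A minimal, self-contained sketch of that short-circuit follows; every name in it is an illustrative stand-in, not the pyiceberg API.

    from typing import List, Tuple

    # Stand-in for pyiceberg's UpdatesAndRequirements alias: (updates, requirements)
    UpdatesAndRequirements = Tuple[Tuple[str, ...], Tuple[str, ...]]

    class DeleteProducerSketch:
        def __init__(self, deleted_entries: List[str]) -> None:
            self._deleted_entries = deleted_entries

        @property
        def files_affected(self) -> bool:
            # Mirrors the property added below: is there anything to drop?
            return len(self._deleted_entries) > 0

        def _commit(self) -> UpdatesAndRequirements:
            # Only produce a commit when there is something to delete
            if self.files_affected:
                return ("add-snapshot",), ("assert-table-uuid",)  # stand-ins for real updates
            return (), ()  # no-op: no empty DELETE snapshot is created

    print(DeleteProducerSketch([])._commit())         # ((), ())
    print(DeleteProducerSketch(["entry"])._commit())  # (('add-snapshot',), ('assert-table-uuid',))
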
@@ -2996,6 +3003,13 @@ def delete_by_predicate(self, predicate: BooleanExpression) -> None:

     @cached_property
     def _compute_deletes(self) -> Tuple[List[ManifestFile], List[ManifestEntry], bool]:
+        """Compute all the delete operations and cache the result when nothing changes.
+
+        Returns:
+            - List of existing manifests that are not affected by the delete operation.
+            - The manifest-entries that are deleted based on the metadata.
+            - Flag indicating whether rewrites of data-files are needed.
+        """
         schema = self._transaction.table_metadata.schema()

         def _copy_with_new_status(entry: ManifestEntry, status: ManifestEntryStatus) -> ManifestEntry:
@@ -3016,44 +3030,47 @@ def _copy_with_new_status(entry: ManifestEntry, status: ManifestEntryStatus) ->
         partial_rewrites_needed = False
         if snapshot := self._transaction.table_metadata.current_snapshot():
             for manifest_file in snapshot.manifests(io=self._io):
-                if not manifest_evaluators[manifest_file.partition_spec_id](manifest_file):
-                    # If the manifest isn't relevant, we can just keep it in the manifest-list
-                    existing_manifests.append(manifest_file)
-                else:
-                    # It is relevant, let's check out the content
-                    deleted_entries = []
-                    existing_entries = []
-                    for entry in manifest_file.fetch_manifest_entry(io=self._io, discard_deleted=True):
-                        if strict_metrics_evaluator(entry.data_file) == ROWS_MUST_MATCH:
-                            deleted_entries.append(_copy_with_new_status(entry, ManifestEntryStatus.DELETED))
-                        elif inclusive_metrics_evaluator(entry.data_file) == ROWS_CANNOT_MATCH:
-                            existing_entries.append(_copy_with_new_status(entry, ManifestEntryStatus.EXISTING))
-                        else:
-                            # Based on the metadata, it is unsure to say if the file can be deleted
-                            partial_rewrites_needed = True
-
-                    if len(deleted_entries) > 0:
-                        total_deleted_entries += deleted_entries
-
-                        # Rewrite the manifest
-                        if len(existing_entries) > 0:
-                            output_file_location = _new_manifest_path(
-                                location=self._transaction.table_metadata.location,
-                                num=next(self._manifest_counter),
-                                commit_uuid=self.commit_uuid,
-                            )
-                            with write_manifest(
-                                format_version=self._transaction.table_metadata.format_version,
-                                spec=self._transaction.table_metadata.specs()[manifest_file.partition_spec_id],
-                                schema=self._transaction.table_metadata.schema(),
-                                output_file=self._io.new_output(output_file_location),
-                                snapshot_id=self._snapshot_id,
-                            ) as writer:
-                                for existing_entry in existing_entries:
-                                    writer.add_entry(existing_entry)
-                            existing_manifests.append(writer.to_manifest_file())
-                    else:
+                if manifest_file.content == ManifestContent.DATA:
+                    if not manifest_evaluators[manifest_file.partition_spec_id](manifest_file):
+                        # If the manifest isn't relevant, we can just keep it in the manifest-list
                         existing_manifests.append(manifest_file)
+                    else:
+                        # It is relevant, let's check out the content
+                        deleted_entries = []
+                        existing_entries = []
+                        for entry in manifest_file.fetch_manifest_entry(io=self._io, discard_deleted=True):
+                            if strict_metrics_evaluator(entry.data_file) == ROWS_MUST_MATCH:
+                                deleted_entries.append(_copy_with_new_status(entry, ManifestEntryStatus.DELETED))
+                            elif inclusive_metrics_evaluator(entry.data_file) == ROWS_CANNOT_MATCH:
+                                existing_entries.append(_copy_with_new_status(entry, ManifestEntryStatus.EXISTING))
+                            else:
+                                # Based on the metadata, it is unsure to say if the file can be deleted
+                                partial_rewrites_needed = True
+
+                        if len(deleted_entries) > 0:
+                            total_deleted_entries += deleted_entries
+
+                            # Rewrite the manifest
+                            if len(existing_entries) > 0:
+                                output_file_location = _new_manifest_path(
+                                    location=self._transaction.table_metadata.location,
+                                    num=next(self._manifest_counter),
+                                    commit_uuid=self.commit_uuid,
+                                )
+                                with write_manifest(
+                                    format_version=self._transaction.table_metadata.format_version,
+                                    spec=self._transaction.table_metadata.specs()[manifest_file.partition_spec_id],
+                                    schema=self._transaction.table_metadata.schema(),
+                                    output_file=self._io.new_output(output_file_location),
+                                    snapshot_id=self._snapshot_id,
+                                ) as writer:
+                                    for existing_entry in existing_entries:
+                                        writer.add_entry(existing_entry)
+                                existing_manifests.append(writer.to_manifest_file())
+                        else:
+                            existing_manifests.append(manifest_file)
+                else:
+                    existing_manifests.append(manifest_file)

         return existing_manifests, total_deleted_entries, partial_rewrites_needed
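
The rewritten loop above classifies each data file with two metrics evaluators: if the strict evaluator proves every row matches the delete predicate, the whole file is dropped; if the inclusive evaluator proves no row can match, the file is kept as EXISTING; anything in between flags a partial rewrite. A toy sketch of that three-way split follows; the constants and results are stand-ins with the same meaning as in the code, not the pyiceberg evaluator objects.

    # Three-way classification, mirroring the _compute_deletes loop above.
    ROWS_MUST_MATCH = "ROWS_MUST_MATCH"      # strict evaluator: every row matches
    ROWS_CANNOT_MATCH = "ROWS_CANNOT_MATCH"  # inclusive evaluator: no row can match
    ROWS_MIGHT_MATCH = "ROWS_MIGHT_MATCH"    # undecidable from metadata alone

    def classify(strict_result: str, inclusive_result: str) -> str:
        if strict_result == ROWS_MUST_MATCH:
            return "delete whole file"
        if inclusive_result == ROWS_CANNOT_MATCH:
            return "keep file as EXISTING"
        return "partial rewrite needed"

    assert classify(ROWS_MUST_MATCH, ROWS_MIGHT_MATCH) == "delete whole file"
    assert classify(ROWS_MIGHT_MATCH, ROWS_CANNOT_MATCH) == "keep file as EXISTING"
    assert classify(ROWS_MIGHT_MATCH, ROWS_MIGHT_MATCH) == "partial rewrite needed"
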

@@ -3068,6 +3085,11 @@ def rewrites_needed(self) -> bool:
         """Indicate if data files need to be rewritten."""
         return self._compute_deletes[2]

+    @property
+    def files_affected(self) -> bool:
+        """Indicate if any manifest-entries can be dropped."""
+        return len(self._deleted_entries()) > 0
+

 class FastAppendFiles(_MergingSnapshotProducer):
     def _existing_manifests(self) -> List[ManifestFile]:

tests/integration/test_deletes.py

Lines changed: 104 additions & 7 deletions
@@ -22,13 +22,15 @@

 from pyiceberg.catalog.rest import RestCatalog
 from pyiceberg.expressions import EqualTo
+from pyiceberg.table.snapshots import Operation, Summary


 def run_spark_commands(spark: SparkSession, sqls: List[str]) -> None:
     for sql in sqls:
         spark.sql(sql)


+@pytest.mark.integration
 @pytest.mark.parametrize("format_version", [1, 2])
 def test_partitioned_table_delete_full_file(spark: SparkSession, session_catalog: RestCatalog, format_version: int) -> None:
     identifier = 'default.table_partitioned_delete'
@@ -63,6 +65,7 @@ def test_partitioned_table_delete_full_file(spark: SparkSession, session_catalog
     assert tbl.scan().to_arrow().to_pydict() == {'number_partitioned': [11, 11], 'number': [20, 30]}


+@pytest.mark.integration
 @pytest.mark.parametrize("format_version", [1, 2])
 def test_partitioned_table_rewrite(spark: SparkSession, session_catalog: RestCatalog, format_version: int) -> None:
     identifier = 'default.table_partitioned_delete'
@@ -92,10 +95,12 @@ def test_partitioned_table_rewrite(spark: SparkSession, session_catalog: RestCat
     tbl = session_catalog.load_table(identifier)
     tbl.delete(EqualTo("number", 20))

-    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ['append', 'append', 'delete', 'overwrite']
+    # We don't delete a whole partition, so there is only an overwrite
+    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ['append', 'append', 'overwrite']
     assert tbl.scan().to_arrow().to_pydict() == {'number_partitioned': [11, 10], 'number': [30, 30]}


+@pytest.mark.integration
 @pytest.mark.parametrize("format_version", [1, 2])
 def test_partitioned_table_no_match(spark: SparkSession, session_catalog: RestCatalog, format_version: int) -> None:
     identifier = 'default.table_partitioned_delete'
@@ -123,10 +128,11 @@ def test_partitioned_table_no_match(spark: SparkSession, session_catalog: RestCa
     tbl.delete(EqualTo("number_partitioned", 22))  # Does not affect any data

     # Open for discussion, do we want to create a new snapshot?
-    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ['append', 'delete']
+    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ['append']
     assert tbl.scan().to_arrow().to_pydict() == {'number_partitioned': [10, 10], 'number': [20, 30]}


+@pytest.mark.integration
 def test_partitioned_table_positional_deletes(spark: SparkSession, session_catalog: RestCatalog) -> None:
     identifier = 'default.table_partitioned_delete'
@@ -160,14 +166,105 @@ def test_partitioned_table_positional_deletes(spark: SparkSession, session_catal

     tbl = session_catalog.load_table(identifier)

-    # Assert that there is just a single Parquet file
-    assert len(list(tbl.scan().plan_files())) == 1
+    # Assert that there is just a single Parquet file, which has one merge-on-read delete file
+    files = list(tbl.scan().plan_files())
+    assert len(files) == 1
+    assert len(files[0].delete_files) == 1

     # Will rewrite a data file with a positional delete
     tbl.delete(EqualTo("number", 40))

-    # Yet another wrong status by Spark
     # One positional delete has been added, but an OVERWRITE status is set
-    # Related issue https://github.com/apache/iceberg/issues/9995
-    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ['append', 'overwrite', 'delete', 'overwrite']
+    # https://github.com/apache/iceberg/issues/10122
+    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ['append', 'overwrite', 'overwrite']
     assert tbl.scan().to_arrow().to_pydict() == {'number_partitioned': [10], 'number': [20]}
+
+
+@pytest.mark.integration
+def test_partitioned_table_positional_deletes_sequence_number(spark: SparkSession, session_catalog: RestCatalog) -> None:
+    identifier = 'default.table_partitioned_delete_sequence_number'
+
+    # This test case is a bit more complex. Here we run a MoR delete on a file, make sure that
+    # the manifest gets rewritten (but not the data file, thanks to MoR), and check that the delete
+    # is still there to ensure that the sequence numbers are maintained
+
+    run_spark_commands(
+        spark,
+        [
+            f"DROP TABLE IF EXISTS {identifier}",
+            f"""
+            CREATE TABLE {identifier} (
+                number_partitioned int,
+                number int
+            )
+            USING iceberg
+            PARTITIONED BY (number_partitioned)
+            TBLPROPERTIES(
+                'format-version' = 2,
+                'write.delete.mode'='merge-on-read',
+                'write.update.mode'='merge-on-read',
+                'write.merge.mode'='merge-on-read'
+            )
+            """,
+            f"""
+            INSERT INTO {identifier} VALUES (10, 100), (10, 101), (20, 200), (20, 201), (20, 202)
+            """,
+            # Generate a positional delete
+            f"""
+            DELETE FROM {identifier} WHERE number = 101
+            """,
+        ],
+    )
+
+    tbl = session_catalog.load_table(identifier)
+
+    files = list(tbl.scan().plan_files())
+    assert len(files) == 2
+
+    # Will rewrite a data file with a positional delete
+    tbl.delete(EqualTo("number", 201))
+
+    # One positional delete has been added, but an OVERWRITE status is set
+    # https://github.com/apache/iceberg/issues/10122
+    snapshots = tbl.snapshots()
+    assert len(snapshots) == 4
+
+    # Snapshots produced by Spark
+    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()[0:2]] == ['append', 'overwrite']
+
+    # Snapshots produced by PyIceberg
+    # This is a no-op since nothing has been added or deleted (because the predicate cannot drop a whole file)
+    assert tbl.snapshots()[2].summary == Summary(
+        Operation.DELETE,
+        **{
+            'total-data-files': '2',
+            'total-delete-files': '1',
+            'total-records': '5',
+            'total-files-size': tbl.snapshots()[2].summary['total-files-size'],
+            'total-position-deletes': '1',
+            'total-equality-deletes': '0',
+        },
+    )
+    # Will rewrite one Parquet file
+    assert tbl.snapshots()[3].summary == Summary(
+        Operation.OVERWRITE,
+        **{
+            'added-files-size': '1145',
+            'added-data-files': '1',
+            'added-records': '2',
+            'changed-partition-count': '1',
+            'total-files-size': tbl.snapshots()[3].summary['total-files-size'],
+            'total-delete-files': '0',
+            'total-data-files': '1',
+            'total-position-deletes': '0',
+            'total-records': '2',
+            'total-equality-deletes': '0',
+            'deleted-data-files': '2',
+            'removed-delete-files': '1',
+            'deleted-records': '5',
+            'removed-files-size': '3088',
+            'removed-position-deletes': '1',
+        },
+    )
+
+    assert tbl.scan().to_arrow().to_pydict() == {'number_partitioned': [20, 20, 10], 'number': [200, 202, 100]}
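
To reproduce the no-op behavior from these tests by hand, a before/after check of the snapshot log is enough: a delete that matches nothing should leave it unchanged. A sketch follows, assuming the integration environment is running; the catalog name, REST endpoint, and table identifier are assumptions taken from the test setup, not part of this commit.

    from pyiceberg.catalog import load_catalog
    from pyiceberg.expressions import EqualTo

    # Assumed local REST catalog endpoint for the integration tests
    catalog = load_catalog("local", uri="http://localhost:8181")
    tbl = catalog.load_table("default.table_partitioned_delete")

    before = len(tbl.snapshots())
    tbl.delete(EqualTo("number_partitioned", 22))  # matches no rows
    tbl = catalog.load_table("default.table_partitioned_delete")
    assert len(tbl.snapshots()) == before  # no empty DELETE snapshot was created
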
