Add option to delete datafiles

Fokko · Fokko · commit 8d459202ed62 · 2024-04-02T16:55:39.000+02:00
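Deletes are resolved through the Iceberg metadata: data files whose rows all match the predicate are dropped from the manifests, without rewriting any data files.
This is efficient when the data is partitioned so that the predicate lines up with whole data files; a delete that would require rewriting a data file is rejected.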
diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py
@@ -50,10 +50,12 @@
 import pyiceberg.expressions.visitors as visitors
 from pyiceberg.exceptions import CommitFailedException, ResolveError, ValidationError
 from pyiceberg.expressions import (
+    AlwaysFalse,
     AlwaysTrue,
     And,
     BooleanExpression,
     EqualTo,
+    Or,
     Reference,
 )
 from pyiceberg.io import FileIO, load_file_io
@@ -2710,6 +2712,153 @@ def _commit(self) -> UpdatesAndRequirements:
         )
 
 
+class DeleteFiles(_MergingSnapshotProducer):
+    """Snapshot producer that deletes whole data files through the table metadata.
+
+    A data file is dropped only when the strict metrics evaluator reports that all of
+    its rows match the delete predicate. A file that may match only partially makes
+    the operation fail, because data files are never rewritten here.
+    """
+
+    _predicate: BooleanExpression
+
+    def __init__(
+        self,
+        operation: Operation,
+        transaction: Transaction,
+        io: FileIO,
+        commit_uuid: Optional[uuid.UUID] = None,
+        snapshot_properties: Dict[str, str] = EMPTY_DICT,
+    ):
+        super().__init__(operation, transaction, io, commit_uuid, snapshot_properties)
+        # Start from a predicate that matches nothing; delete() widens it.
+        self._predicate = AlwaysFalse()
+
+    def _build_partition_projection(self, spec_id: int) -> BooleanExpression:
+        """Project the delete predicate onto the partition spec with the given id."""
+        schema = self._transaction.table_metadata.schema()
+        spec = self._transaction.table_metadata.specs()[spec_id]
+        project = visitors.inclusive_projection(schema, spec)
+        return project(self._predicate)
+
+    @cached_property
+    def partition_filters(self) -> KeyDefaultDict[int, BooleanExpression]:
+        """Return the projected partition filters, keyed by partition spec id."""
+        return KeyDefaultDict(self._build_partition_projection)
+
+    def _build_manifest_evaluator(self, spec_id: int) -> Callable[[ManifestFile], bool]:
+        """Build the manifest evaluator for the partition filter of the given spec id."""
+        schema = self._transaction.table_metadata.schema()
+        spec = self._transaction.table_metadata.specs()[spec_id]
+        return visitors.manifest_evaluator(spec, schema, self.partition_filters[spec_id], case_sensitive=True)
+
+    def _build_partition_evaluator(self, spec_id: int) -> Callable[[DataFile], bool]:
+        """Build a callable that evaluates the partition filter of the given spec id on a data file's partition."""
+        schema = self._transaction.table_metadata.schema()
+        spec = self._transaction.table_metadata.specs()[spec_id]
+        partition_type = spec.partition_type(schema)
+        partition_schema = Schema(*partition_type.fields)
+        partition_expr = self.partition_filters[spec_id]
+
+        # Build the evaluator once instead of on every call of the returned callable.
+        evaluator = visitors.expression_evaluator(partition_schema, partition_expr, case_sensitive=True)
+        return lambda data_file: evaluator(data_file.partition)
+
+    def delete(self, predicate: BooleanExpression) -> None:
+        """Add a delete predicate; predicates from repeated calls are combined with Or."""
+        self._predicate = Or(self._predicate, predicate)
+
+    @cached_property
+    def _compute_deletes(self) -> Tuple[List[ManifestFile], List[ManifestEntry]]:
+        """Split the current snapshot into manifests to keep and manifest entries to delete.
+
+        Returns:
+            A tuple of (manifests for the new snapshot, entries marked as DELETED). The manifests are
+            either carried over untouched or rewritten with only their surviving entries.
+
+        Raises:
+            ValueError: If a data file may match the predicate only partially, which would require
+                rewriting the data file.
+        """
+        table_metadata = self._transaction.table_metadata
+        schema = table_metadata.schema()
+
+        def _copy_with_new_status(entry: ManifestEntry, status: ManifestEntryStatus) -> ManifestEntry:
+            return ManifestEntry(
+                status=status,
+                snapshot_id=entry.snapshot_id,
+                data_sequence_number=entry.data_sequence_number,
+                file_sequence_number=entry.file_sequence_number,
+                data_file=entry.data_file,
+            )
+
+        manifest_evaluators: Dict[int, Callable[[ManifestFile], bool]] = KeyDefaultDict(self._build_manifest_evaluator)
+        strict_metrics_evaluator = visitors._StrictMetricsEvaluator(schema, self._predicate, case_sensitive=True).eval
+        inclusive_metrics_evaluator = visitors._InclusiveMetricsEvaluator(schema, self._predicate, case_sensitive=True).eval
+
+        existing_manifests: List[ManifestFile] = []
+        total_deleted_entries: List[ManifestEntry] = []
+        snapshot = table_metadata.current_snapshot()
+        if not snapshot:
+            # No snapshot yet, so there is nothing to delete.
+            return existing_manifests, total_deleted_entries
+
+        for num, manifest_file in enumerate(snapshot.manifests(io=self._io)):
+            if not manifest_evaluators[manifest_file.partition_spec_id](manifest_file):
+                # If the manifest isn't relevant, we can just keep it in the manifest-list
+                existing_manifests.append(manifest_file)
+                continue
+
+            # It is relevant, let's check out the content
+            deleted_entries = []
+            existing_entries = []
+            for entry in manifest_file.fetch_manifest_entry(io=self._io):
+                if strict_metrics_evaluator(entry.data_file) == visitors.ROWS_MUST_MATCH:
+                    deleted_entries.append(_copy_with_new_status(entry, ManifestEntryStatus.DELETED))
+                elif inclusive_metrics_evaluator(entry.data_file) == visitors.ROWS_CANNOT_MATCH:
+                    existing_entries.append(_copy_with_new_status(entry, ManifestEntryStatus.EXISTING))
+                else:
+                    raise ValueError("Deletes do not support rewrites of data files")
+
+            if not deleted_entries:
+                # Nothing is deleted from this manifest, so it is carried over as-is.
+                existing_manifests.append(manifest_file)
+                continue
+
+            total_deleted_entries += deleted_entries
+            if not existing_entries:
+                # Every entry is deleted: the manifest is left out of the new manifest-list.
+                continue
+
+            # Rewrite the manifest with only the entries that survive the delete.
+            # NOTE(review): `num` is the index in the current manifest-list; confirm that this path
+            # cannot collide with the manifests written by the parent producer.
+            output_file_location = _new_manifest_path(location=table_metadata.location, num=num, commit_uuid=self.commit_uuid)
+            with write_manifest(
+                format_version=table_metadata.format_version,
+                spec=table_metadata.specs()[manifest_file.partition_spec_id],
+                schema=schema,
+                output_file=self._io.new_output(output_file_location),
+                snapshot_id=self._snapshot_id,
+            ) as writer:
+                for existing_entry in existing_entries:
+                    writer.add_entry(existing_entry)
+
+            # The rewritten manifest has to be part of the new manifest-list, otherwise the
+            # surviving entries would disappear from the table together with the deleted ones.
+            existing_manifests.append(writer.to_manifest_file())
+
+        return existing_manifests, total_deleted_entries
+
+    def _existing_manifests(self) -> List[ManifestFile]:
+        """Return the manifests that are carried over (untouched or rewritten) into the new snapshot."""
+        return self._compute_deletes[0]
+
+    def _deleted_entries(self) -> List[ManifestEntry]:
+        """Return the manifest entries of the data files dropped by this delete."""
+        return self._compute_deletes[1]
+
+
 class FastAppendFiles(_MergingSnapshotProducer):
     def _existing_manifests(self) -> List[ManifestFile]:
         """To determine if there are any existing manifest files.
@@ -2787,7 +2897,7 @@ class UpdateSnapshot:
     _io: FileIO
     _snapshot_properties: Dict[str, str]
 
-    def __init__(self, transaction: Transaction, io: FileIO, snapshot_properties: Dict[str, str]) -> None:
+    def __init__(self, transaction: Transaction, io: FileIO, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> None:
         self._transaction = transaction
         self._io = io
         self._snapshot_properties = snapshot_properties
@@ -2807,6 +2917,14 @@ def overwrite(self) -> OverwriteFiles:
             snapshot_properties=self._snapshot_properties,
         )
 
+    def delete(self) -> DeleteFiles:
+        """Start a delete that is carried out on the table metadata.
+
+        Returns:
+            A DeleteFiles producer for Operation.DELETE, bound to this builder's transaction, IO and snapshot properties.
+        """
+        return DeleteFiles(Operation.DELETE, self._transaction, self._io, snapshot_properties=self._snapshot_properties)
+
 
 class UpdateSpec(UpdateTableMetadata["UpdateSpec"]):
     _transaction: Transaction
diff --git a/pyiceberg/table/snapshots.py b/pyiceberg/table/snapshots.py
@@ -345,7 +345,7 @@ def get_prop(prop: str) -> int:
 def update_snapshot_summaries(
     summary: Summary, previous_summary: Optional[Mapping[str, str]] = None, truncate_full_table: bool = False
 ) -> Summary:
-    if summary.operation not in {Operation.APPEND, Operation.OVERWRITE}:
+    if summary.operation not in {Operation.APPEND, Operation.OVERWRITE, Operation.DELETE}:
         raise ValueError(f"Operation not implemented: {summary.operation}")
 
     if truncate_full_table and summary.operation == Operation.OVERWRITE and previous_summary is not None:
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -46,7 +46,6 @@
 import boto3
 import pytest
 from moto import mock_aws
-from pyspark.sql import SparkSession
 
 from pyiceberg import schema
 from pyiceberg.catalog import Catalog, load_catalog
@@ -86,6 +85,7 @@
 if TYPE_CHECKING:
     import pyarrow as pa
     from moto.server import ThreadedMotoServer  # type: ignore
+    from pyspark.sql import SparkSession
 
     from pyiceberg.io.pyarrow import PyArrowFileIO
 
@@ -1954,9 +1954,10 @@ def session_catalog() -> Catalog:
 
 
 @pytest.fixture(scope="session")
-def spark() -> SparkSession:
+def spark() -> "SparkSession":
     import importlib.metadata
-    import os
+
+    from pyspark.sql import SparkSession
 
     spark_version = ".".join(importlib.metadata.version("pyspark").split(".")[:2])
     scala_version = "2.12"
diff --git a/tests/integration/test_deletes.py b/tests/integration/test_deletes.py
@@ -0,0 +1,76 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint:disable=redefined-outer-name
+import pytest
+from pyspark.sql import DataFrame, SparkSession
+
+from pyiceberg.catalog.rest import RestCatalog
+from pyiceberg.expressions import EqualTo
+
+
+@pytest.fixture
+def test_deletes_table(spark: SparkSession) -> DataFrame:
+    """Recreate `default.table_partitioned_delete` through Spark: two INSERTs, one per partition value (10 and 11)."""
+    identifier = 'default.table_partitioned_delete'
+
+    spark.sql(f"DROP TABLE IF EXISTS {identifier}")
+    spark.sql(
+        f"""
+        CREATE TABLE {identifier} (
+            number_partitioned  int,
+            number              int
+        )
+        USING iceberg
+        PARTITIONED BY (number_partitioned)
+    """
+    )
+    spark.sql(
+        f"""
+        INSERT INTO {identifier} VALUES (10, 20), (10, 30)
+    """
+    )
+    spark.sql(
+        f"""
+        INSERT INTO {identifier} VALUES (11, 20), (11, 30)
+    """
+    )
+
+    return spark.table(identifier)
+
+
+def test_partition_deletes(test_deletes_table: DataFrame, session_catalog: RestCatalog) -> None:
+    """Deleting by the partition column removes every row of the matching partition."""
+    tbl = session_catalog.load_table('default.table_partitioned_delete')
+
+    with tbl.transaction() as txn:
+        with txn.update_snapshot().delete() as delete:
+            delete.delete(EqualTo("number_partitioned", 10))
+
+    remaining_rows = tbl.scan().to_arrow().to_pydict()
+    assert remaining_rows == {'number_partitioned': [11, 11], 'number': [20, 30]}
+
+
+def test_deletes(test_deletes_table: DataFrame, session_catalog: RestCatalog) -> None:
+    """A predicate that matches only some rows of a data file is rejected, since data files are not rewritten."""
+    tbl = session_catalog.load_table('default.table_partitioned_delete')
+
+    with pytest.raises(ValueError, match="Deletes do not support rewrites of data files"):
+        with tbl.transaction() as txn:
+            with txn.update_snapshot().delete() as delete:
+                delete.delete(EqualTo("number", 30))
+
+    assert tbl.scan().to_arrow().num_rows == 4