
Commit 9c6724e

WIP
1 parent: 47c9de1

2 files changed: +7, -20 lines


pyiceberg/table/__init__.py

Lines changed: 3 additions & 3 deletions
@@ -439,9 +439,6 @@ def overwrite(
         if not isinstance(df, pa.Table):
             raise ValueError(f"Expected PyArrow table, got: {df}")
 
-        if overwrite_filter != AlwaysTrue():
-            raise NotImplementedError("Cannot overwrite a subset of a table")
-
         if len(self._table.spec().fields) > 0:
             raise ValueError("Cannot write to partitioned tables")
 
@@ -451,6 +448,9 @@ def overwrite(
         if table_arrow_schema != df.schema:
             df = df.cast(table_arrow_schema)
 
+        with self.update_snapshot(snapshot_properties=snapshot_properties).delete() as delete_snapshot:
+            delete_snapshot.delete_by_predicate(overwrite_filter)
+
         with self.update_snapshot(snapshot_properties=snapshot_properties).overwrite() as update_snapshot:
             # skip writing data files if the dataframe is empty
             if df.shape[0] > 0:
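In caller terms, this change means overwrite no longer rejects a non-trivial overwrite_filter: rows matching the predicate are first staged for deletion in a delete snapshot, and the new data is then committed in an overwrite snapshot. A minimal usage sketch follows, assuming a configured catalog named "default" and a hypothetical default.cities table; only the overwrite(df, overwrite_filter=...) call itself comes from this commit.

import pyarrow as pa

from pyiceberg.catalog import load_catalog
from pyiceberg.expressions import EqualTo

# Hypothetical catalog and table names, for illustration only.
catalog = load_catalog("default")
tbl = catalog.load_table("default.cities")

# Replacement rows for the subset being overwritten.
df = pa.Table.from_pylist([{"city": "Amsterdam", "population": 921402}])

# Before this commit, a filter other than AlwaysTrue() raised
# NotImplementedError. Now the matching rows are removed via a delete
# snapshot, and df is written in the following overwrite snapshot.
tbl.overwrite(df, overwrite_filter=EqualTo("city", "Amsterdam"))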

tests/integration/test_deletes.py

Lines changed: 4 additions & 17 deletions
@@ -227,33 +227,20 @@ def test_partitioned_table_positional_deletes_sequence_number(spark: SparkSession
     # One positional delete has been added, but an OVERWRITE status is set
     # https://github.com/apache/iceberg/issues/10122
     snapshots = tbl.snapshots()
-    assert len(snapshots) == 4
+    assert len(snapshots) == 3
 
     # Snapshots produced by Spark
     assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()[0:2]] == ['append', 'overwrite']
 
-    # Snapshots produced by PyIceberg
-    # This is a no-op since nothing has been added or deleted (because the predicate cannot drop a whole file)
-    assert tbl.snapshots()[2].summary == Summary(
-        Operation.DELETE,
-        **{
-            'total-data-files': '2',
-            'total-delete-files': '1',
-            'total-records': '5',
-            'total-files-size': tbl.snapshots()[2].summary['total-files-size'],
-            'total-position-deletes': '1',
-            'total-equality-deletes': '0',
-        },
-    )
     # Will rewrite one parquet file
-    assert tbl.snapshots()[3].summary == Summary(
+    assert snapshots[2].summary == Summary(
         Operation.OVERWRITE,
         **{
             'added-files-size': '1145',
             'added-data-files': '1',
             'added-records': '2',
             'changed-partition-count': '1',
-            'total-files-size': tbl.snapshots()[3].summary['total-files-size'],
+            'total-files-size': snapshots[2].summary['total-files-size'],
             'total-delete-files': '0',
             'total-data-files': '1',
             'total-position-deletes': '0',
@@ -262,7 +249,7 @@ def test_partitioned_table_positional_deletes_sequence_number(spark: SparkSession
             'deleted-data-files': '2',
             'removed-delete-files': '1',
             'deleted-records': '5',
-            'removed-files-size': '3088',
+            'removed-files-size': snapshots[2].summary['removed-files-size'],
             'removed-position-deletes': '1',
         },
     )
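The test now expects three snapshots instead of four: the removed block asserted a separate no-op DELETE snapshot from PyIceberg, which this commit no longer commits when the predicate deletes nothing. A short sketch of checking the snapshot history the same way the test does, where tbl is a hypothetical already-loaded table handle:

# tbl is a hypothetical, already-loaded table handle.
operations = [snapshot.summary.operation.value for snapshot in tbl.snapshots()]

# Per the assertions above: two snapshots from Spark ('append', 'overwrite'),
# then a single PyIceberg 'overwrite'; the no-op 'delete' snapshot is gone.
assert operations == ['append', 'overwrite', 'overwrite']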
