     ROWS_MUST_MATCH,
     _InclusiveMetricsEvaluator,
     _StrictMetricsEvaluator,
+    bind,
     expression_evaluator,
     inclusive_projection,
     manifest_evaluator,
 )
 from pyiceberg.io import FileIO, load_file_io
-from pyiceberg.io.pyarrow import _dataframe_to_data_files, project_table
+from pyiceberg.io.pyarrow import _dataframe_to_data_files, expression_to_pyarrow, project_table
 from pyiceberg.manifest import (
     POSITIONAL_DELETE_SCHEMA,
     DataFile,
@@ -310,8 +311,6 @@ def _apply(self, updates: Tuple[TableUpdate, ...], requirements: Tuple[TableRequ
         for new_requirement in requirements:
             if type(new_requirement) not in existing_requirements:
                 self._requirements = self._requirements + requirements
-            else:
-                warnings.warn(f"Dropped duplicate requirement: {new_requirement}")

         self.table_metadata = update_table_metadata(self.table_metadata, updates)

@@ -430,7 +429,10 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT)
                 update_snapshot.append_data_file(data_file)

     def overwrite(
-        self, df: pa.Table, overwrite_filter: BooleanExpression = ALWAYS_TRUE, snapshot_properties: Dict[str, str] = EMPTY_DICT
+        self,
+        df: pa.Table,
+        overwrite_filter: Union[BooleanExpression, str] = ALWAYS_TRUE,
+        snapshot_properties: Dict[str, str] = EMPTY_DICT,
     ) -> None:
         """
         Shorthand for adding a table overwrite with a PyArrow table to the transaction.
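Note on the widened signature above: `overwrite_filter` may now be either a `BooleanExpression` or a string that gets parsed into one. A minimal usage sketch of the transaction-level call (the catalog name, table identifier, and data are made up for illustration):

    import pyarrow as pa
    from pyiceberg.catalog import load_catalog

    catalog = load_catalog("default")        # hypothetical catalog name
    tbl = catalog.load_table("db.cities")    # hypothetical table

    df = pa.table({"city": ["Amsterdam"], "population": [921402]})

    # Replace only the rows matching the filter; all other rows are kept.
    with tbl.transaction() as tx:
        tx.overwrite(df, overwrite_filter="city == 'Amsterdam'")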
@@ -458,8 +460,7 @@ def overwrite(
         if table_arrow_schema != df.schema:
             df = df.cast(table_arrow_schema)

-        with self.update_snapshot(snapshot_properties=snapshot_properties).delete() as delete_snapshot:
-            delete_snapshot.delete_by_predicate(overwrite_filter)
+        self.delete(delete_filter=overwrite_filter, snapshot_properties=snapshot_properties)

         with self.update_snapshot(snapshot_properties=snapshot_properties).overwrite() as update_snapshot:
             # skip writing data files if the dataframe is empty
@@ -470,53 +471,73 @@ def overwrite(
             for data_file in data_files:
                 update_snapshot.append_data_file(data_file)

-    def delete(self, delete_filter: BooleanExpression, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> None:
+    def delete(self, delete_filter: Union[str, BooleanExpression], snapshot_properties: Dict[str, str] = EMPTY_DICT) -> None:
         if (
             self.table_metadata.properties.get(TableProperties.DELETE_MODE, TableProperties.DELETE_MODE_COPY_ON_WRITE)
             == TableProperties.DELETE_MODE_MERGE_ON_READ
         ):
             raise NotImplementedError("Merge on read is not yet supported")

+        if isinstance(delete_filter, str):
+            delete_filter = _parse_row_filter(delete_filter)
+
         with self.update_snapshot(snapshot_properties=snapshot_properties).delete() as delete_snapshot:
             delete_snapshot.delete_by_predicate(delete_filter)

         # Check if there are any files that require an actual rewrite of a data file
         if delete_snapshot.rewrites_needed is True:
-            # When we want to filter out certain rows, we want to invert the expression
-            # delete id = 22 means that we want to look for that value, and then remove
-            # if from the Parquet file
-            delete_row_filter = Not(delete_filter)
-            with self.update_snapshot(snapshot_properties=snapshot_properties).overwrite() as overwrite_snapshot:
-                # Potential optimization is where we check if the files actually contain relevant data.
-                files = self._scan(row_filter=delete_filter).plan_files()
-
-                counter = itertools.count(0)
-
-                # This will load the Parquet file into memory, including:
-                #   - Filter out the rows based on the delete filter
-                #   - Projecting it to the current schema
-                #   - Applying the positional deletes if they are there
-                # When writing
-                #   - Apply the latest partition-spec
-                #   - And sort order when added
-                for original_file in files:
-                    df = project_table(
-                        tasks=[original_file],
-                        table_metadata=self._table.metadata,
-                        io=self._table.io,
-                        row_filter=delete_row_filter,
-                        projected_schema=self.table_metadata.schema(),
-                    )
-                    for data_file in _dataframe_to_data_files(
-                        io=self._table.io,
-                        df=df,
-                        table_metadata=self._table.metadata,
-                        write_uuid=overwrite_snapshot.commit_uuid,
-                        counter=counter,
-                    ):
-                        overwrite_snapshot.append_data_file(data_file)
+            bound_delete_filter = bind(self._table.schema(), delete_filter, case_sensitive=True)
+            preserve_row_filter = expression_to_pyarrow(Not(bound_delete_filter))
+            commit_uuid = uuid.uuid4()
+
+            files = self._scan(row_filter=delete_filter).plan_files()
+
+            counter = itertools.count(0)
+
+            replaced_files: List[Tuple[DataFile, List[DataFile]]] = []
+            # This will load the Parquet file into memory, including:
+            #   - Filter out the rows based on the delete filter
+            #   - Projecting it to the current schema
+            #   - Applying the positional deletes if they are there
+            # When writing
+            #   - Apply the latest partition-spec
+            #   - And sort order when added
+            for original_file in files:
+                df = project_table(
+                    tasks=[original_file],
+                    table_metadata=self._table.metadata,
+                    io=self._table.io,
+                    row_filter=AlwaysTrue(),
+                    projected_schema=self.table_metadata.schema(),
+                )
+                filtered_df = df.filter(preserve_row_filter)
+
+                # Only rewrite if there are records being deleted
+                if len(df) != len(filtered_df):
+                    replaced_files.append((
+                        original_file.file,
+                        list(
+                            _dataframe_to_data_files(
+                                io=self._table.io,
+                                df=filtered_df,
+                                table_metadata=self._table.metadata,
+                                write_uuid=commit_uuid,
+                                counter=counter,
+                            )
+                        ),
+                    ))

-                    overwrite_snapshot.delete_data_file(original_file.file)
+            if len(replaced_files) > 0:
+                with self.update_snapshot(snapshot_properties=snapshot_properties).overwrite(
+                    commit_uuid=commit_uuid
+                ) as overwrite_snapshot:
+                    for original_data_file, replaced_data_files in replaced_files:
+                        overwrite_snapshot.delete_data_file(original_data_file)
+                        for replaced_data_file in replaced_data_files:
+                            overwrite_snapshot.append_data_file(replaced_data_file)
+
+        if not delete_snapshot.files_affected and not delete_snapshot.rewrites_needed:
+            warnings.warn("Delete operation did not match any records")

     def add_files(self, file_paths: List[str]) -> None:
         """
@@ -1405,7 +1426,10 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT)
             tx.append(df=df, snapshot_properties=snapshot_properties)

     def overwrite(
-        self, df: pa.Table, overwrite_filter: BooleanExpression = ALWAYS_TRUE, snapshot_properties: Dict[str, str] = EMPTY_DICT
+        self,
+        df: pa.Table,
+        overwrite_filter: Union[BooleanExpression, str] = ALWAYS_TRUE,
+        snapshot_properties: Dict[str, str] = EMPTY_DICT,
     ) -> None:
         """
         Shorthand for overwriting the table with a PyArrow table.
@@ -1419,7 +1443,9 @@ def overwrite(
         with self.transaction() as tx:
             tx.overwrite(df=df, overwrite_filter=overwrite_filter, snapshot_properties=snapshot_properties)

-    def delete(self, delete_filter: BooleanExpression = ALWAYS_TRUE, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> None:
+    def delete(
+        self, delete_filter: Union[BooleanExpression, str] = ALWAYS_TRUE, snapshot_properties: Dict[str, str] = EMPTY_DICT
+    ) -> None:
         """
         Shorthand for deleting rows from the table.

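On the public `Table` API this lands as `delete`, which accepts the same two filter forms as `overwrite`. A hedged usage sketch (catalog name, table identifier, and values are hypothetical):

    from pyiceberg.catalog import load_catalog
    from pyiceberg.expressions import EqualTo

    tbl = load_catalog("default").load_table("db.cities")   # hypothetical catalog/table

    # String form, parsed into a BooleanExpression internally.
    tbl.delete(delete_filter="city == 'Drachten'")

    # Equivalent expression form.
    tbl.delete(delete_filter=EqualTo("city", "Drachten"))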
@@ -3011,15 +3037,6 @@ def _build_manifest_evaluator(self, spec_id: int) -> Callable[[ManifestFile], bo
         spec = self._transaction.table_metadata.specs()[spec_id]
         return manifest_evaluator(spec, schema, self.partition_filters[spec_id], case_sensitive=True)

-    def _build_partition_evaluator(self, spec_id: int) -> Callable[[DataFile], bool]:
-        schema = self._transaction.table_metadata.schema()
-        spec = self._transaction.table_metadata.specs()[spec_id]
-        partition_type = spec.partition_type(schema)
-        partition_schema = Schema(*partition_type.fields)
-        partition_expr = self.partition_filters[spec_id]
-
-        return lambda data_file: expression_evaluator(partition_schema, partition_expr, case_sensitive=True)(data_file.partition)
-
     def delete_by_predicate(self, predicate: BooleanExpression) -> None:
         self._predicate = Or(self._predicate, predicate)

@@ -3240,8 +3257,9 @@ def fast_append(self) -> FastAppendFiles:
             operation=Operation.APPEND, transaction=self._transaction, io=self._io, snapshot_properties=self._snapshot_properties
         )

-    def overwrite(self) -> OverwriteFiles:
+    def overwrite(self, commit_uuid: Optional[uuid.UUID] = None) -> OverwriteFiles:
         return OverwriteFiles(
+            commit_uuid=commit_uuid,
             operation=Operation.OVERWRITE
             if self._transaction.table_metadata.current_snapshot() is not None
             else Operation.APPEND,