Skip to content

Commit 234d55b

Browse files
committed
WIP
1 parent 284d05a commit 234d55b

File tree

1 file changed

+26
-25
lines changed

1 file changed

+26
-25
lines changed

pyiceberg/table/__init__.py

Lines changed: 26 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1467,7 +1467,8 @@ def _parse_row_filter(expr: Union[str, BooleanExpression]) -> BooleanExpression:
14671467

14681468

14691469
class TableScan(ABC):
1470-
table: Table
1470+
table_metadata: TableMetadata
1471+
io: FileIO
14711472
row_filter: BooleanExpression
14721473
selected_fields: Tuple[str, ...]
14731474
case_sensitive: bool
@@ -1477,15 +1478,17 @@ class TableScan(ABC):
14771478

14781479
def __init__(
14791480
self,
1480-
table: Table,
1481+
table_metadata: TableMetadata,
1482+
io: FileIO,
14811483
row_filter: Union[str, BooleanExpression] = ALWAYS_TRUE,
14821484
selected_fields: Tuple[str, ...] = ("*",),
14831485
case_sensitive: bool = True,
14841486
snapshot_id: Optional[int] = None,
14851487
options: Properties = EMPTY_DICT,
14861488
limit: Optional[int] = None,
14871489
):
1488-
self.table = table
1490+
self.table_metadata = table_metadata
1491+
self.io = io
14891492
self.row_filter = _parse_row_filter(row_filter)
14901493
self.selected_fields = selected_fields
14911494
self.case_sensitive = case_sensitive
@@ -1495,16 +1498,16 @@ def __init__(
14951498

14961499
def snapshot(self) -> Optional[Snapshot]:
14971500
if self.snapshot_id:
1498-
return self.table.snapshot_by_id(self.snapshot_id)
1499-
return self.table.current_snapshot()
1501+
return self.table_metadata.snapshot_by_id(self.snapshot_id)
1502+
return self.table_metadata.current_snapshot()
15001503

15011504
def projection(self) -> Schema:
1502-
current_schema = self.table.schema()
1505+
current_schema = self.table_metadata.schema()
15031506
if self.snapshot_id is not None:
1504-
snapshot = self.table.snapshot_by_id(self.snapshot_id)
1507+
snapshot = self.table_metadata.snapshot_by_id(self.snapshot_id)
15051508
if snapshot is not None:
15061509
if snapshot.schema_id is not None:
1507-
snapshot_schema = self.table.schemas().get(snapshot.schema_id)
1510+
snapshot_schema = self.table_metadata.schemas().get(snapshot.schema_id)
15081511
if snapshot_schema is not None:
15091512
current_schema = snapshot_schema
15101513
else:
@@ -1625,17 +1628,6 @@ def _match_deletes_to_data_file(data_entry: ManifestEntry, positional_delete_ent
16251628

16261629

16271630
class DataScan(TableScan):
1628-
def __init__(
1629-
self,
1630-
table: Table,
1631-
row_filter: Union[str, BooleanExpression] = ALWAYS_TRUE,
1632-
selected_fields: Tuple[str, ...] = ("*",),
1633-
case_sensitive: bool = True,
1634-
snapshot_id: Optional[int] = None,
1635-
options: Properties = EMPTY_DICT,
1636-
limit: Optional[int] = None,
1637-
):
1638-
super().__init__(table, row_filter, selected_fields, case_sensitive, snapshot_id, options, limit)
16391631

16401632
def _build_partition_projection(self, spec_id: int) -> BooleanExpression:
16411633
project = inclusive_projection(self.table.schema(), self.table.specs()[spec_id])
@@ -2912,7 +2904,9 @@ def _commit(self) -> UpdatesAndRequirements:
29122904
)
29132905

29142906

2915-
class DeleteFiles(_MergingSnapshotProducer):
2907+
class MetadataDeleteFiles(_MergingSnapshotProducer):
2908+
"""Will delete manifest entries from the current snapshot based on the predicate"""
2909+
29162910
_predicate: BooleanExpression
29172911

29182912
def __init__(
@@ -2954,7 +2948,7 @@ def delete(self, predicate: BooleanExpression) -> None:
29542948
self._predicate = Or(self._predicate, predicate)
29552949

29562950
@cached_property
2957-
def _compute_deletes(self) -> Tuple[List[ManifestFile], List[ManifestEntry]]:
2951+
def _compute_deletes(self) -> Tuple[List[ManifestFile], List[ManifestEntry], bool]:
29582952
schema = self._transaction.table_metadata.schema()
29592953

29602954
def _copy_with_new_status(entry: ManifestEntry, status: ManifestEntryStatus) -> ManifestEntry:
@@ -2972,6 +2966,7 @@ def _copy_with_new_status(entry: ManifestEntry, status: ManifestEntryStatus) ->
29722966

29732967
existing_manifests = []
29742968
total_deleted_entries = []
2969+
partial_rewrites_needed = False
29752970
if snapshot := self._transaction.table_metadata.current_snapshot():
29762971
for num, manifest_file in enumerate(snapshot.manifests(io=self._io)):
29772972
if not manifest_evaluators[manifest_file.partition_spec_id](manifest_file):
@@ -2987,7 +2982,8 @@ def _copy_with_new_status(entry: ManifestEntry, status: ManifestEntryStatus) ->
29872982
elif inclusive_metrics_evaluator(entry.data_file) == ROWS_CANNOT_MATCH:
29882983
existing_entries.append(_copy_with_new_status(entry, ManifestEntryStatus.EXISTING))
29892984
else:
2990-
raise ValueError("Deletes do not support rewrites of data files")
2985+
# Based on the metadata, it is unsure to say if the file can be deleted
2986+
partial_rewrites_needed = True
29912987

29922988
if len(deleted_entries) > 0:
29932989
total_deleted_entries += deleted_entries
@@ -3006,17 +3002,22 @@ def _copy_with_new_status(entry: ManifestEntry, status: ManifestEntryStatus) ->
30063002
) as writer:
30073003
for existing_entry in existing_entries:
30083004
writer.add_entry(existing_entry)
3005+
existing_manifests.append(writer.to_manifest_file())
30093006
else:
30103007
existing_manifests.append(manifest_file)
30113008

3012-
return existing_manifests, total_deleted_entries
3009+
return existing_manifests, total_deleted_entries, partial_rewrites_needed
30133010

30143011
def _existing_manifests(self) -> List[ManifestFile]:
30153012
return self._compute_deletes[0]
30163013

30173014
def _deleted_entries(self) -> List[ManifestEntry]:
30183015
return self._compute_deletes[1]
30193016

3017+
def rewrites_needed(self) -> bool:
3018+
"""Indicates if data files need to be rewritten"""
3019+
return self._compute_deletes[2]
3020+
30203021

30213022
class FastAppendFiles(_MergingSnapshotProducer):
30223023
def _existing_manifests(self) -> List[ManifestFile]:
@@ -3115,8 +3116,8 @@ def overwrite(self) -> OverwriteFiles:
31153116
snapshot_properties=self._snapshot_properties,
31163117
)
31173118

3118-
def delete(self) -> DeleteFiles:
3119-
return DeleteFiles(
3119+
def delete(self) -> MetadataDeleteFiles:
3120+
return MetadataDeleteFiles(
31203121
operation=Operation.DELETE,
31213122
transaction=self._transaction,
31223123
io=self._io,

0 commit comments

Comments
 (0)