Skip to content

Commit 7dae071

Browse files
committed
Fix summary generation
1 parent c443af2 commit 7dae071

File tree

2 files changed

+27
-2
lines changed

2 files changed

+27
-2
lines changed

pyiceberg/table/__init__.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2888,6 +2888,15 @@ def _summary(self, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> Summary:
28882888
schema=self._transaction.table_metadata.schema(),
28892889
)
28902890

2891+
if len(self._deleted_data_files) > 0:
2892+
specs = self._transaction.table_metadata.specs()
2893+
for data_file in self._deleted_data_files:
2894+
ssc.remove_file(
2895+
data_file=data_file,
2896+
partition_spec=specs.get(data_file.spec_id),
2897+
schema=self._transaction.table_metadata.schema(),
2898+
)
2899+
28912900
previous_snapshot = (
28922901
self._transaction.table_metadata.snapshot_by_id(self._parent_snapshot_id)
28932902
if self._parent_snapshot_id is not None
@@ -3028,6 +3037,7 @@ def _copy_with_new_status(entry: ManifestEntry, status: ManifestEntryStatus) ->
30283037
existing_manifests = []
30293038
total_deleted_entries = []
30303039
partial_rewrites_needed = False
3040+
self._deleted_data_files = set()
30313041
if snapshot := self._transaction.table_metadata.current_snapshot():
30323042
for manifest_file in snapshot.manifests(io=self._io):
30333043
if manifest_file.content == ManifestContent.DATA:
@@ -3041,6 +3051,7 @@ def _copy_with_new_status(entry: ManifestEntry, status: ManifestEntryStatus) ->
30413051
for entry in manifest_file.fetch_manifest_entry(io=self._io, discard_deleted=True):
30423052
if strict_metrics_evaluator(entry.data_file) == ROWS_MUST_MATCH:
30433053
deleted_entries.append(_copy_with_new_status(entry, ManifestEntryStatus.DELETED))
3054+
self._deleted_data_files.add(entry.data_file)
30443055
elif inclusive_metrics_evaluator(entry.data_file) == ROWS_CANNOT_MATCH:
30453056
existing_entries.append(_copy_with_new_status(entry, ManifestEntryStatus.EXISTING))
30463057
else:

tests/integration/test_inspect_table.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,13 +103,14 @@ def test_inspect_snapshots(
103103
assert isinstance(snapshot_id.as_py(), int)
104104

105105
assert df['parent_id'][0].as_py() is None
106-
assert df['parent_id'][1:] == df['snapshot_id'][:2]
106+
assert df['parent_id'][1:].to_pylist() == df['snapshot_id'][:-1].to_pylist()
107107

108-
assert [operation.as_py() for operation in df['operation']] == ['append', 'overwrite', 'append']
108+
assert [operation.as_py() for operation in df['operation']] == ['append', 'delete', 'overwrite', 'append']
109109

110110
for manifest_list in df['manifest_list']:
111111
assert manifest_list.as_py().startswith("s3://")
112112

113+
# Append
113114
assert df['summary'][0].as_py() == [
114115
('added-files-size', '5459'),
115116
('added-data-files', '1'),
@@ -122,6 +123,19 @@ def test_inspect_snapshots(
122123
('total-equality-deletes', '0'),
123124
]
124125

126+
# Delete
127+
assert df['summary'][1].as_py() == [
128+
('removed-files-size', '5459'),
129+
('deleted-data-files', '1'),
130+
('deleted-records', '3'),
131+
('total-data-files', '0'),
132+
('total-delete-files', '0'),
133+
('total-records', '0'),
134+
('total-files-size', '0'),
135+
('total-position-deletes', '0'),
136+
('total-equality-deletes', '0'),
137+
]
138+
125139
lhs = spark.table(f"{identifier}.snapshots").toPandas()
126140
rhs = df.to_pandas()
127141
for column in df.column_names:

0 commit comments

Comments
 (0)