Skip to content

Commit a9a01ee

Browse files
committed
refactor: remove unused expire_snapshots method and clean up transaction context in MaintenanceTable
1 parent 73658e0 commit a9a01ee

File tree

3 files changed

+20
-18
lines changed

3 files changed

+20
-18
lines changed

pyiceberg/table/__init__.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1250,10 +1250,6 @@ def manage_snapshots(self) -> ManageSnapshots:
12501250
"""
12511251
return ManageSnapshots(transaction=Transaction(self, autocommit=True))
12521252

1253-
def expire_snapshots(self) -> ExpireSnapshots:
1254-
"""Shorthand to run expire snapshots by id or by a timestamp."""
1255-
return ExpireSnapshots(transaction=Transaction(self, autocommit=True))
1256-
12571253
def update_statistics(self) -> UpdateStatistics:
12581254
"""
12591255
Shorthand to run statistics management operations like add statistics and remove statistics.

pyiceberg/table/maintenance.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from __future__ import annotations
1818

1919
import logging
20-
from typing import TYPE_CHECKING, Any, List, Optional, Set, Union
20+
from typing import TYPE_CHECKING, List, Optional, Set
2121

2222
from pyiceberg.manifest import DataFile, ManifestFile
2323
from pyiceberg.utils.concurrent import ThreadPoolExecutor # type: ignore[attr-defined]
@@ -104,7 +104,7 @@ def expire_snapshots_older_than(self, timestamp_ms: int) -> None:
104104
snapshots_to_expire.append(snapshot.snapshot_id)
105105

106106
if snapshots_to_expire:
107-
with self.tbl.transaction() as txn:
107+
with self.tbl.transaction():
108108
self.expire_snapshots_by_ids(snapshots_to_expire)
109109

110110
def expire_snapshots_older_than_with_retention(
@@ -122,7 +122,7 @@ def expire_snapshots_older_than_with_retention(
122122
)
123123

124124
if snapshots_to_expire:
125-
with self.tbl.transaction() as txn:
125+
with self.tbl.transaction():
126126
self.expire_snapshots_by_ids(snapshots_to_expire)
127127

128128
def retain_last_n_snapshots(self, n: int) -> None:
@@ -158,7 +158,7 @@ def retain_last_n_snapshots(self, n: int) -> None:
158158
snapshots_to_expire.append(snapshot.snapshot_id)
159159

160160
if snapshots_to_expire:
161-
with self.tbl.transaction() as txn:
161+
with self.tbl.transaction():
162162
self.expire_snapshots_by_ids(snapshots_to_expire)
163163

164164
def _get_snapshots_to_expire_with_retention(
@@ -325,7 +325,7 @@ def deduplicate_data_files(self) -> List[DataFile]:
325325
"""
326326
import os
327327
from collections import defaultdict
328-
328+
329329
removed: List[DataFile] = []
330330

331331
# Get the current snapshot
@@ -349,8 +349,8 @@ def deduplicate_data_files(self) -> List[DataFile]:
349349
has_duplicates = False
350350
files_to_remove = []
351351
files_to_keep = []
352-
353-
for file_name, entries in file_groups.items():
352+
353+
for _file_name, entries in file_groups.items():
354354
if len(entries) > 1:
355355
# Keep the first entry, remove the rest
356356
files_to_keep.append(entries[0].data_file)
@@ -369,11 +369,11 @@ def deduplicate_data_files(self) -> List[DataFile]:
369369
# First, explicitly delete all the duplicate files
370370
for file_to_remove in files_to_remove:
371371
overwrite_snapshot.delete_data_file(file_to_remove)
372-
372+
373373
# Then add back only the files that should be kept
374374
for file_to_keep in files_to_keep:
375375
overwrite_snapshot.append_data_file(file_to_keep)
376-
376+
377377
# Refresh the table to reflect the changes
378378
self.tbl = self.tbl.refresh()
379379

tests/table/test_dedup_data_file_filepaths.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,10 @@
1414
# KIND, either express or implied. See the License for the
1515
# specific language governing permissions and limitations
1616
# under the License.
17+
import os
18+
import uuid
1719
from pathlib import Path
1820
from typing import List, Set
19-
import os
2021

2122
import pyarrow as pa
2223
import pyarrow.parquet as pq
@@ -26,7 +27,6 @@
2627
from pyiceberg.table import Table
2728
from pyiceberg.table.maintenance import MaintenanceTable
2829
from tests.catalog.test_base import InMemoryCatalog
29-
import uuid
3030

3131

3232
@pytest.fixture
@@ -98,7 +98,9 @@ def test_overwrite_removes_only_selected_datafile(prepopulated_table: Table, dup
9898
assert dupe_data_file_path.name in file_names_after, f"Expected {dupe_data_file_path.name} to remain in the table"
9999
assert len(file_names_after) == 1, "Expected only one unique file name to remain after deduplication"
100100
# All removed files should have the same file name
101-
assert all(df.file_path.split("/")[-1] == dupe_data_file_path.name for df in removed_files), "All removed files should be duplicates by name"
101+
assert all(df.file_path.split("/")[-1] == dupe_data_file_path.name for df in removed_files), (
102+
"All removed files should be duplicates by name"
103+
)
102104

103105

104106
def test_get_all_datafiles_current_snapshot(prepopulated_table: Table, dupe_data_file_path: Path) -> None:
@@ -117,13 +119,17 @@ def test_get_all_datafiles_all_snapshots(prepopulated_table: Table, dupe_data_fi
117119
assert dupe_data_file_path.name in file_paths
118120

119121

120-
def test_deduplicate_data_files_removes_duplicates_in_current_snapshot(prepopulated_table: Table, dupe_data_file_path: Path) -> None:
122+
def test_deduplicate_data_files_removes_duplicates_in_current_snapshot(
123+
prepopulated_table: Table, dupe_data_file_path: Path
124+
) -> None:
121125
mt = MaintenanceTable(tbl=prepopulated_table)
122126

123127
all_datafiles: List[DataFile] = mt._get_all_datafiles()
124128
file_names: List[str] = [os.path.basename(df.file_path) for df in all_datafiles]
125129
# There should be more than one reference before deduplication
126-
assert file_names.count(dupe_data_file_path.name) > 1, f"Expected multiple references to {dupe_data_file_path.name} before deduplication"
130+
assert file_names.count(dupe_data_file_path.name) > 1, (
131+
f"Expected multiple references to {dupe_data_file_path.name} before deduplication"
132+
)
127133
removed: List[DataFile] = mt.deduplicate_data_files()
128134

129135
all_datafiles_after: List[DataFile] = mt._get_all_datafiles()

0 commit comments

Comments
 (0)