diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md index 1559fdedab..070a6d08e4 100644 --- a/mkdocs/docs/api.md +++ b/mkdocs/docs/api.md @@ -1296,7 +1296,13 @@ PyIceberg provides table maintenance operations through the `table.maintenance` Expire old snapshots to clean up table metadata and reduce storage costs: ```python -# Basic usage - expire a specific snapshot by ID +# Expire snapshots older than three days +from datetime import datetime, timedelta +table.maintenance.expire_snapshots().older_than( + datetime.now() - timedelta(days=3) +).commit() + +# Expire a specific snapshot by ID table.maintenance.expire_snapshots().by_id(12345).commit() # Context manager usage (recommended for multiple operations) @@ -1304,9 +1310,6 @@ with table.maintenance.expire_snapshots() as expire: expire.by_id(12345) expire.by_id(67890) # Automatically commits when exiting the context - -# Method chaining -table.maintenance.expire_snapshots().by_id(12345).commit() ``` #### Real-world Example diff --git a/pyiceberg/table/update/snapshot.py b/pyiceberg/table/update/snapshot.py index 9a44024a3c..018f6614c1 100644 --- a/pyiceberg/table/update/snapshot.py +++ b/pyiceberg/table/update/snapshot.py @@ -22,6 +22,7 @@ from abc import abstractmethod from collections import defaultdict from concurrent.futures import Future +from datetime import datetime from functools import cached_property from typing import TYPE_CHECKING, Callable, Dict, Generic, List, Optional, Set, Tuple @@ -82,6 +83,7 @@ ) from pyiceberg.utils.bin_packing import ListPacker from pyiceberg.utils.concurrent import ExecutorFactory +from pyiceberg.utils.datetime import datetime_to_millis from pyiceberg.utils.properties import property_as_bool, property_as_int if TYPE_CHECKING: @@ -944,13 +946,11 @@ def _get_protected_snapshot_ids(self) -> Set[int]: Returns: Set of protected snapshot IDs to exclude from expiration. """ - protected_ids: Set[int] = set() - - for ref in self._transaction.table_metadata.refs.values(): - if ref.snapshot_ref_type in [SnapshotRefType.TAG, SnapshotRefType.BRANCH]: - protected_ids.add(ref.snapshot_id) - - return protected_ids + return { + ref.snapshot_id + for ref in self._transaction.table_metadata.refs.values() + if ref.snapshot_ref_type in [SnapshotRefType.TAG, SnapshotRefType.BRANCH] + } def by_id(self, snapshot_id: int) -> ExpireSnapshots: """ @@ -988,18 +988,19 @@ def by_ids(self, snapshot_ids: List[int]) -> "ExpireSnapshots": self.by_id(snapshot_id) return self - def older_than(self, timestamp_ms: int) -> "ExpireSnapshots": + def older_than(self, dt: datetime) -> "ExpireSnapshots": """ Expire all unprotected snapshots with a timestamp older than a given value. Args: - timestamp_ms (int): Only snapshots with timestamp_ms < this value will be expired. + dt (datetime): Only snapshots with datetime < this value will be expired. Returns: This for method chaining. """ protected_ids = self._get_protected_snapshot_ids() + expire_from = datetime_to_millis(dt) for snapshot in self._transaction.table_metadata.snapshots: - if snapshot.timestamp_ms < timestamp_ms and snapshot.snapshot_id not in protected_ids: + if snapshot.timestamp_ms < expire_from and snapshot.snapshot_id not in protected_ids: self._snapshot_ids_to_expire.add(snapshot.snapshot_id) return self diff --git a/tests/table/test_expire_snapshots.py b/tests/table/test_expire_snapshots.py index 273b4f631b..e2b2d47b67 100644 --- a/tests/table/test_expire_snapshots.py +++ b/tests/table/test_expire_snapshots.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import datetime from unittest.mock import MagicMock from uuid import uuid4 @@ -142,7 +143,7 @@ def test_expire_snapshots_by_timestamp_skips_protected(table_v2: Table) -> None: table_v2.catalog = MagicMock() # Attempt to expire all snapshots before a future timestamp (so both are candidates) - future_timestamp = 9999999999999 # Far in the future, after any real snapshot + future_datetime = datetime.datetime.now() + datetime.timedelta(days=1) # Mock the catalog's commit_table to return the current metadata (simulate no change) mock_response = CommitTableResponse( @@ -152,7 +153,7 @@ def test_expire_snapshots_by_timestamp_skips_protected(table_v2: Table) -> None: ) table_v2.catalog.commit_table.return_value = mock_response - table_v2.maintenance.expire_snapshots().older_than(future_timestamp).commit() + table_v2.maintenance.expire_snapshots().older_than(future_datetime).commit() # Update metadata to reflect the commit (as in other tests) table_v2.metadata = mock_response.metadata