Skip to content

Commit 1dde51a

Browse files
authored
Support snapshot management operations like creating tags by adding ManageSnapshots API (#728)
1 parent c579e9f commit 1dde51a

File tree

5 files changed

+311
-0
lines changed

5 files changed

+311
-0
lines changed

dev/provision.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,3 +342,50 @@
342342
(array(), map(), array(struct(1)))
343343
"""
344344
)
345+
346+
# Seed the table used by the snapshot-management integration tests.
# The statements run strictly in order: create the table, then a series of
# single-row writes and one delete, so that the table accumulates several
# history entries (the integration tests require more than three).
snapshot_operations_statements = (
    # (Re)create an empty format-version 2 Iceberg table.
    f"""
    CREATE OR REPLACE TABLE {catalog_name}.default.test_table_snapshot_operations (
        number integer
    )
    USING iceberg
    TBLPROPERTIES (
        'format-version'='2'
    );
    """,
    f"""
    INSERT INTO {catalog_name}.default.test_table_snapshot_operations
    VALUES (1)
    """,
    f"""
    INSERT INTO {catalog_name}.default.test_table_snapshot_operations
    VALUES (2)
    """,
    # Remove the row written by the previous statement.
    f"""
    DELETE FROM {catalog_name}.default.test_table_snapshot_operations
    WHERE number = 2
    """,
    f"""
    INSERT INTO {catalog_name}.default.test_table_snapshot_operations
    VALUES (3)
    """,
    f"""
    INSERT INTO {catalog_name}.default.test_table_snapshot_operations
    VALUES (4)
    """,
)

for statement in snapshot_operations_statements:
    spark.sql(statement)

mkdocs/docs/api.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -913,6 +913,28 @@ tbl.overwrite(df, snapshot_properties={"abc": "def"})
913913
assert tbl.metadata.snapshots[-1].summary["abc"] == "def"
914914
```
915915

916+
## Snapshot Management
917+
918+
Manage snapshots (for example, create tags and branches) through the `Table` API:
919+
920+
```python
921+
# To run a specific operation
922+
table.manage_snapshots().create_tag(snapshot_id, "tag123").commit()
923+
# To run multiple operations
924+
(
    table.manage_snapshots()
    .create_tag(snapshot_id1, "tag123")
    .create_tag(snapshot_id2, "tag456")
    .commit()
)
928+
# Operations are applied on commit.
929+
```
930+
931+
You can also use context managers to make more changes:
932+
933+
```python
934+
with table.manage_snapshots() as ms:
935+
ms.create_branch(snapshot_id1, "Branch_A").create_tag(snapshot_id2, "tag789")
936+
```
937+
916938
## Query the data
917939

918940
To query a table, a table scan is needed. A table scan accepts a filter, columns, optionally a limit and a snapshot ID:

pyiceberg/table/__init__.py

Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@
138138
)
139139
from pyiceberg.utils.concurrent import ExecutorFactory
140140
from pyiceberg.utils.datetime import datetime_to_millis
141+
from pyiceberg.utils.deprecated import deprecated
141142
from pyiceberg.utils.singleton import _convert_to_hashable_type
142143

143144
if TYPE_CHECKING:
@@ -351,6 +352,88 @@ def set_properties(self, properties: Properties = EMPTY_DICT, **kwargs: Any) ->
351352
updates = properties or kwargs
352353
return self._apply((SetPropertiesUpdate(updates=updates),))
353354

355+
@deprecated(
    deprecated_in="0.7.0",
    removed_in="0.8.0",
    help_message="Please use one of the functions in ManageSnapshots instead",
)
def add_snapshot(self, snapshot: Snapshot) -> Transaction:
    """Stage the addition of a snapshot to the table.

    Deprecated in 0.7.0 and scheduled for removal in 0.8.0.

    Args:
        snapshot: The snapshot to wrap in an AddSnapshotUpdate.

    Returns:
        The result of applying the add-snapshot update to this transaction.
    """
    staged_update = AddSnapshotUpdate(snapshot=snapshot)
    # An empty requirements tuple is passed explicitly: no preconditions are asserted.
    return self._apply((staged_update,), ())
369+
370+
@deprecated(
    deprecated_in="0.7.0",
    removed_in="0.8.0",
    help_message="Please use one of the functions in ManageSnapshots instead",
)
def set_ref_snapshot(
    self,
    snapshot_id: int,
    parent_snapshot_id: Optional[int],
    ref_name: str,
    type: str,
    max_ref_age_ms: Optional[int] = None,
    max_snapshot_age_ms: Optional[int] = None,
    min_snapshots_to_keep: Optional[int] = None,
) -> Transaction:
    """Stage an update that points a ref at a snapshot.

    Deprecated in 0.7.0 and scheduled for removal in 0.8.0.

    Args:
        snapshot_id: Id of the snapshot the ref should point to.
        parent_snapshot_id: Snapshot id that the "main" ref is asserted to
            currently point to.
        ref_name: Name of the ref to set.
        type: Ref type forwarded to SetSnapshotRefUpdate.
        max_ref_age_ms: Forwarded to SetSnapshotRefUpdate.
        max_snapshot_age_ms: Forwarded to SetSnapshotRefUpdate.
        min_snapshots_to_keep: Forwarded to SetSnapshotRefUpdate.

    Returns:
        The result of applying the set-snapshot-ref update to this transaction.
    """
    ref_update = SetSnapshotRefUpdate(
        snapshot_id=snapshot_id,
        ref_name=ref_name,
        type=type,
        max_ref_age_ms=max_ref_age_ms,
        max_snapshot_age_ms=max_snapshot_age_ms,
        min_snapshots_to_keep=min_snapshots_to_keep,
    )

    # NOTE(review): the requirement always targets the "main" ref, regardless
    # of `ref_name` -- confirm this is intended for callers that set other refs.
    main_ref_check = AssertRefSnapshotId(snapshot_id=parent_snapshot_id, ref="main")

    return self._apply((ref_update,), (main_ref_check,))
403+
404+
def _set_ref_snapshot(
    self,
    snapshot_id: int,
    ref_name: str,
    type: str,
    max_ref_age_ms: Optional[int] = None,
    max_snapshot_age_ms: Optional[int] = None,
    min_snapshots_to_keep: Optional[int] = None,
) -> UpdatesAndRequirements:
    """Build the update and requirement for pointing a ref at a snapshot.

    Nothing is applied here; the caller decides when to stage the result.

    Args:
        snapshot_id: Id of the snapshot the ref should point to.
        ref_name: Name of the ref to set.
        type: Ref type forwarded to SetSnapshotRefUpdate.
        max_ref_age_ms: Forwarded to SetSnapshotRefUpdate.
        max_snapshot_age_ms: Forwarded to SetSnapshotRefUpdate.
        min_snapshots_to_keep: Forwarded to SetSnapshotRefUpdate.

    Returns:
        A pair of one-element tuples: the set-snapshot-ref update, and a
        requirement on the snapshot id `ref_name` currently points to.
    """
    ref_update = SetSnapshotRefUpdate(
        snapshot_id=snapshot_id,
        ref_name=ref_name,
        type=type,
        max_ref_age_ms=max_ref_age_ms,
        max_snapshot_age_ms=max_snapshot_age_ms,
        min_snapshots_to_keep=min_snapshots_to_keep,
    )

    # Assert against the ref's current snapshot id, or None when the ref is
    # not present in the table metadata yet.
    if ref_name in self.table_metadata.refs:
        expected_snapshot_id = self.table_metadata.refs[ref_name].snapshot_id
    else:
        expected_snapshot_id = None
    ref_check = AssertRefSnapshotId(snapshot_id=expected_snapshot_id, ref=ref_name)

    return (ref_update,), (ref_check,)
436+
354437
def update_schema(self, allow_incompatible_changes: bool = False, case_sensitive: bool = True) -> UpdateSchema:
355438
"""Create a new UpdateSchema to alter the columns of this table.
356439
@@ -1323,6 +1406,21 @@ def history(self) -> List[SnapshotLogEntry]:
13231406
"""Get the snapshot history of this table."""
13241407
return self.metadata.snapshot_log
13251408

1409+
def manage_snapshots(self) -> ManageSnapshots:
    """
    Start a chain of snapshot management operations (create branch, create tag, etc.).

    The returned ManageSnapshots is bound to a new Transaction on this table
    created with autocommit=True. Pending changes are applied on commit.

    Run a single operation:

        table.manage_snapshots().<operation>().commit()

    Run several operations in one commit:

        table.manage_snapshots().<operation-one>().<operation-two>().commit()

    It can also be used as a context manager:

        with table.manage_snapshots() as ms:
            ms.create_tag(snapshot_id1, "Tag_A").create_tag(snapshot_id2, "Tag_B")
    """
    autocommit_transaction = Transaction(self, autocommit=True)
    return ManageSnapshots(transaction=autocommit_transaction)
1423+
13261424
def update_schema(self, allow_incompatible_changes: bool = False, case_sensitive: bool = True) -> UpdateSchema:
13271425
"""Create a new UpdateSchema to alter the columns of this table.
13281426
@@ -1835,6 +1933,84 @@ def __enter__(self) -> U:
18351933
return self # type: ignore
18361934

18371935

1936+
class ManageSnapshots(UpdateTableMetadata["ManageSnapshots"]):
    """
    Run snapshot management operations using APIs.

    APIs include create branch, create tag, etc. Each operation only stages
    its update and requirement on this object; pending changes are applied
    on commit.

    Use table.manage_snapshots().<operation>().commit() to run a specific operation.
    Use table.manage_snapshots().<operation-one>().<operation-two>().commit() to run multiple operations.

    We can also use context managers to make more changes. For example,

    with table.manage_snapshots() as ms:
        ms.create_tag(snapshot_id1, "Tag_A").create_tag(snapshot_id2, "Tag_B")
    """

    # Class-level defaults are empty, immutable tuples. Staging uses `+=`,
    # which rebinds an instance attribute instead of mutating shared state.
    _updates: Tuple[TableUpdate, ...] = ()
    _requirements: Tuple[TableRequirement, ...] = ()

    def _commit(self) -> UpdatesAndRequirements:
        """Return the staged updates and requirements to be committed."""
        return self._updates, self._requirements

    def _stage_ref(
        self,
        snapshot_id: int,
        ref_name: str,
        ref_type: str,
        max_ref_age_ms: Optional[int] = None,
        max_snapshot_age_ms: Optional[int] = None,
        min_snapshots_to_keep: Optional[int] = None,
    ) -> ManageSnapshots:
        """Stage a set-snapshot-ref update and its requirement; shared by create_tag and create_branch."""
        updates, requirements = self._transaction._set_ref_snapshot(
            snapshot_id=snapshot_id,
            ref_name=ref_name,
            type=ref_type,
            max_ref_age_ms=max_ref_age_ms,
            max_snapshot_age_ms=max_snapshot_age_ms,
            min_snapshots_to_keep=min_snapshots_to_keep,
        )
        self._updates += updates
        self._requirements += requirements
        return self

    def create_tag(self, snapshot_id: int, tag_name: str, max_ref_age_ms: Optional[int] = None) -> ManageSnapshots:
        """
        Create a new tag pointing to the given snapshot id.

        Args:
            snapshot_id (int): snapshot id of the existing snapshot to tag
            tag_name (str): name of the tag
            max_ref_age_ms (Optional[int]): max ref age in milliseconds

        Returns:
            This for method chaining
        """
        return self._stage_ref(
            snapshot_id=snapshot_id,
            ref_name=tag_name,
            ref_type="tag",
            max_ref_age_ms=max_ref_age_ms,
        )

    def create_branch(
        self,
        snapshot_id: int,
        branch_name: str,
        max_ref_age_ms: Optional[int] = None,
        max_snapshot_age_ms: Optional[int] = None,
        min_snapshots_to_keep: Optional[int] = None,
    ) -> ManageSnapshots:
        """
        Create a new branch pointing to the given snapshot id.

        Args:
            snapshot_id (int): snapshot id of existing snapshot at which the branch is created.
            branch_name (str): name of the new branch
            max_ref_age_ms (Optional[int]): max ref age in milliseconds
            max_snapshot_age_ms (Optional[int]): max age of snapshots to keep in milliseconds
            min_snapshots_to_keep (Optional[int]): min number of snapshots to keep

        Returns:
            This for method chaining
        """
        return self._stage_ref(
            snapshot_id=snapshot_id,
            ref_name=branch_name,
            ref_type="branch",
            max_ref_age_ms=max_ref_age_ms,
            max_snapshot_age_ms=max_snapshot_age_ms,
            min_snapshots_to_keep=min_snapshots_to_keep,
        )
2012+
2013+
18382014
class UpdateSchema(UpdateTableMetadata["UpdateSchema"]):
18392015
_schema: Schema
18402016
_last_column_id: itertools.count[int]
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
import pytest
18+
19+
from pyiceberg.catalog import Catalog
20+
from pyiceberg.table.refs import SnapshotRef
21+
22+
23+
@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
def test_create_tag(catalog: Catalog) -> None:
    """Tagging an older snapshot registers a "tag" ref pointing at that snapshot."""
    tbl = catalog.load_table("default.test_table_snapshot_operations")

    history = tbl.history()
    assert len(history) > 3
    # Tag the third-most-recent history entry.
    tag_snapshot_id = history[-3].snapshot_id

    tbl.manage_snapshots().create_tag(snapshot_id=tag_snapshot_id, tag_name="tag123").commit()

    expected_ref = SnapshotRef(snapshot_id=tag_snapshot_id, snapshot_ref_type="tag")
    assert tbl.metadata.refs["tag123"] == expected_ref
32+
33+
34+
@pytest.mark.integration
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
def test_create_branch(catalog: Catalog) -> None:
    """Branching from an older snapshot registers a "branch" ref pointing at that snapshot."""
    tbl = catalog.load_table("default.test_table_snapshot_operations")

    history = tbl.history()
    assert len(history) > 2
    # Branch from the second-most-recent history entry.
    branch_snapshot_id = history[-2].snapshot_id

    tbl.manage_snapshots().create_branch(snapshot_id=branch_snapshot_id, branch_name="branch123").commit()

    expected_ref = SnapshotRef(snapshot_id=branch_snapshot_id, snapshot_ref_type="branch")
    assert tbl.metadata.refs["branch123"] == expected_ref

tests/table/test_init.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -689,6 +689,30 @@ def test_update_metadata_add_snapshot(table_v2: Table) -> None:
689689
assert new_metadata.last_updated_ms == new_snapshot.timestamp_ms
690690

691691

692+
def test_update_metadata_set_ref_snapshot(table_v2: Table) -> None:
    """Applying the update built by _set_ref_snapshot moves "main" and records it in the snapshot log."""
    target_snapshot_id = 3051729675574597004

    # Only the update half is needed; the requirement is not exercised here.
    update, _ = table_v2.transaction()._set_ref_snapshot(
        snapshot_id=target_snapshot_id,
        ref_name="main",
        type="branch",
        max_ref_age_ms=123123123,
        max_snapshot_age_ms=12312312312,
        min_snapshots_to_keep=1,
    )

    new_metadata = update_table_metadata(table_v2.metadata, update)

    assert len(new_metadata.snapshot_log) == 3
    assert new_metadata.snapshot_log[2].snapshot_id == target_snapshot_id
    assert new_metadata.current_snapshot_id == target_snapshot_id
    assert new_metadata.last_updated_ms > table_v2.metadata.last_updated_ms

    expected_main_ref = SnapshotRef(
        snapshot_id=target_snapshot_id,
        snapshot_ref_type="branch",
        min_snapshots_to_keep=1,
        max_snapshot_age_ms=12312312312,
        max_ref_age_ms=123123123,
    )
    assert new_metadata.refs["main"] == expected_main_ref
714+
715+
692716
def test_update_metadata_set_snapshot_ref(table_v2: Table) -> None:
693717
update = SetSnapshotRefUpdate(
694718
ref_name="main",

0 commit comments

Comments
 (0)