
Commit 69b9e39

Add Snapshots table metadata (#524)
* Add Snapshots table metadata
* Use Spark for tests
1 parent bbc7e7c commit 69b9e39

3 files changed: +158 −4 lines changed

mkdocs/docs/api.md

Lines changed: 32 additions & 0 deletions
@@ -319,6 +319,38 @@ table.append(df)
 
 <!-- prettier-ignore-end -->
 
+## Inspecting tables
+
+To explore the table metadata, tables can be inspected.
+
+### Snapshots
+
+Inspect the snapshots of the table:
+
+```python
+table.inspect.snapshots()
+```
+
+```
+pyarrow.Table
+committed_at: timestamp[ms] not null
+snapshot_id: int64 not null
+parent_id: int64
+operation: string
+manifest_list: string not null
+summary: map<string, string>
+  child 0, entries: struct<key: string not null, value: string> not null
+      child 0, key: string not null
+      child 1, value: string
+----
+committed_at: [[2024-03-15 15:01:25.682,2024-03-15 15:01:25.730,2024-03-15 15:01:25.772]]
+snapshot_id: [[805611270568163028,3679426539959220963,5588071473139865870]]
+parent_id: [[null,805611270568163028,3679426539959220963]]
+operation: [["append","overwrite","append"]]
+manifest_list: [["s3://warehouse/default/table_metadata_snapshots/metadata/snap-805611270568163028-0-43637daf-ea4b-4ceb-b096-a60c25481eb5.avro","s3://warehouse/default/table_metadata_snapshots/metadata/snap-3679426539959220963-0-8be81019-adf1-4bb6-a127-e15217bd50b3.avro","s3://warehouse/default/table_metadata_snapshots/metadata/snap-5588071473139865870-0-1382dd7e-5fbc-4c51-9776-a832d7d0984e.avro"]]
+summary: [[keys:["added-files-size","added-data-files","added-records","total-data-files","total-delete-files","total-records","total-files-size","total-position-deletes","total-equality-deletes"]values:["5459","1","3","1","0","3","5459","0","0"],keys:["added-files-size","added-data-files","added-records","total-data-files","total-records",...,"total-equality-deletes","total-files-size","deleted-data-files","deleted-records","removed-files-size"]values:["5459","1","3","1","3",...,"0","5459","1","3","5459"],keys:["added-files-size","added-data-files","added-records","total-data-files","total-delete-files","total-records","total-files-size","total-position-deletes","total-equality-deletes"]values:["5459","1","3","2","0","6","10918","0","0"]]]
+```
+
 ### Add Files
 
 Expert Iceberg users may choose to commit existing parquet files to the Iceberg table as data files, without rewriting them.
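For readers trying the documented API, here is a minimal, self-contained sketch of calling it against a catalog. The catalog name `default` and table name `default.my_table` are illustrative, not part of this commit:

```python
import pyarrow.compute as pc

from pyiceberg.catalog import load_catalog

# Hypothetical catalog/table names; any existing Iceberg table works.
catalog = load_catalog("default")
table = catalog.load_table("default.my_table")

# inspect.snapshots() returns a plain pyarrow.Table, so the usual Arrow
# operations apply, e.g. keeping only snapshots created by an append:
snapshots = table.inspect.snapshots()
appends = snapshots.filter(pc.equal(snapshots["operation"], "append"))
print(appends["snapshot_id"].to_pylist())
```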

pyiceberg/table/__init__.py

Lines changed: 51 additions & 0 deletions
@@ -971,6 +971,11 @@ def transaction(self) -> Transaction:
         """
         return Transaction(self)
 
+    @property
+    def inspect(self) -> InspectTable:
+        """Return the InspectTable object to browse the table metadata."""
+        return InspectTable(self)
+
     def refresh(self) -> Table:
         """Refresh the current table metadata."""
         fresh = self.catalog.load_table(self.identifier[1:])
@@ -3046,3 +3051,49 @@ def _new_field_id(self) -> int:
 
     def _is_duplicate_partition(self, transform: Transform[Any, Any], partition_field: PartitionField) -> bool:
         return partition_field.field_id not in self._deletes and partition_field.transform == transform
+
+
+class InspectTable:
+    tbl: Table
+
+    def __init__(self, tbl: Table) -> None:
+        self.tbl = tbl
+
+        try:
+            import pyarrow as pa  # noqa
+        except ModuleNotFoundError as e:
+            raise ModuleNotFoundError("For metadata operations PyArrow needs to be installed") from e
+
+    def snapshots(self) -> "pa.Table":
+        import pyarrow as pa
+
+        snapshots_schema = pa.schema([
+            pa.field('committed_at', pa.timestamp(unit='ms'), nullable=False),
+            pa.field('snapshot_id', pa.int64(), nullable=False),
+            pa.field('parent_id', pa.int64(), nullable=True),
+            pa.field('operation', pa.string(), nullable=True),
+            pa.field('manifest_list', pa.string(), nullable=False),
+            pa.field('summary', pa.map_(pa.string(), pa.string()), nullable=True),
+        ])
+        snapshots = []
+        for snapshot in self.tbl.metadata.snapshots:
+            if summary := snapshot.summary:
+                operation = summary.operation.value
+                additional_properties = snapshot.summary.additional_properties
+            else:
+                operation = None
+                additional_properties = None
+
+            snapshots.append({
+                'committed_at': datetime.datetime.utcfromtimestamp(snapshot.timestamp_ms / 1000.0),
+                'snapshot_id': snapshot.snapshot_id,
+                'parent_id': snapshot.parent_snapshot_id,
+                'operation': str(operation),
+                'manifest_list': snapshot.manifest_list,
+                'summary': additional_properties,
+            })
+
+        return pa.Table.from_pylist(
+            snapshots,
+            schema=snapshots_schema,
+        )
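The `snapshots()` implementation above leans on `pa.Table.from_pylist` with an explicit schema: plain Python dicts populate the `map<string, string>` summary column, and `None` is accepted for nullable fields such as `parent_id`. A minimal sketch of that pattern in isolation (the field names mirror the commit; the row data is made up):

```python
import datetime

import pyarrow as pa

# Explicit schema, as in InspectTable.snapshots(): nullability and the
# map<string, string> type are declared up front rather than inferred.
schema = pa.schema([
    pa.field("committed_at", pa.timestamp(unit="ms"), nullable=False),
    pa.field("parent_id", pa.int64(), nullable=True),
    pa.field("summary", pa.map_(pa.string(), pa.string()), nullable=True),
])

# A dict is accepted for the map column, and None for nullable fields.
rows = [{
    "committed_at": datetime.datetime(2024, 3, 15, 15, 1, 25),
    "parent_id": None,
    "summary": {"added-records": "3"},
}]

print(pa.Table.from_pylist(rows, schema=schema))
```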

tests/integration/test_writes.py

Lines changed: 75 additions & 4 deletions
@@ -15,12 +15,13 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint:disable=redefined-outer-name
+import math
 import os
 import time
 import uuid
 from datetime import date, datetime
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 from urllib.parse import urlparse
 
 import pyarrow as pa
@@ -135,15 +136,19 @@ def arrow_table_with_only_nulls(pa_schema: pa.Schema) -> pa.Table:
     return pa.Table.from_pylist([{}, {}], schema=pa_schema)
 
 
-def _create_table(session_catalog: Catalog, identifier: str, properties: Properties, data: List[pa.Table]) -> Table:
+def _create_table(
+    session_catalog: Catalog, identifier: str, properties: Properties, data: Optional[List[pa.Table]] = None
+) -> Table:
     try:
         session_catalog.drop_table(identifier=identifier)
     except NoSuchTableError:
         pass
 
     tbl = session_catalog.create_table(identifier=identifier, schema=TABLE_SCHEMA, properties=properties)
-    for d in data:
-        tbl.append(d)
+
+    if data:
+        for d in data:
+            tbl.append(d)
 
     return tbl
 
@@ -667,3 +672,69 @@ def test_table_properties_raise_for_none_value(
         session_catalog, identifier, {"format-version": format_version, **property_with_none}, [arrow_table_with_null]
     )
     assert "None type is not a supported value in properties: property_name" in str(exc_info.value)
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_inspect_snapshots(
+    spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
+) -> None:
+    identifier = "default.table_metadata_snapshots"
+    tbl = _create_table(session_catalog, identifier, properties={"format-version": format_version})
+
+    tbl.overwrite(arrow_table_with_null)
+    # should produce a DELETE entry
+    tbl.overwrite(arrow_table_with_null)
+    # Since we don't rewrite, this should produce a new manifest with an ADDED entry
+    tbl.append(arrow_table_with_null)
+
+    df = tbl.inspect.snapshots()
+
+    assert df.column_names == [
+        'committed_at',
+        'snapshot_id',
+        'parent_id',
+        'operation',
+        'manifest_list',
+        'summary',
+    ]
+
+    for committed_at in df['committed_at']:
+        assert isinstance(committed_at.as_py(), datetime)
+
+    for snapshot_id in df['snapshot_id']:
+        assert isinstance(snapshot_id.as_py(), int)
+
+    assert df['parent_id'][0].as_py() is None
+    assert df['parent_id'][1:] == df['snapshot_id'][:2]
+
+    assert [operation.as_py() for operation in df['operation']] == ['append', 'overwrite', 'append']
+
+    for manifest_list in df['manifest_list']:
+        assert manifest_list.as_py().startswith("s3://")
+
+    assert df['summary'][0].as_py() == [
+        ('added-files-size', '5459'),
+        ('added-data-files', '1'),
+        ('added-records', '3'),
+        ('total-data-files', '1'),
+        ('total-delete-files', '0'),
+        ('total-records', '3'),
+        ('total-files-size', '5459'),
+        ('total-position-deletes', '0'),
+        ('total-equality-deletes', '0'),
+    ]
+
+    lhs = spark.table(f"{identifier}.snapshots").toPandas()
+    rhs = df.to_pandas()
+    for column in df.column_names:
+        for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
+            if column == 'summary':
+                # Arrow returns a list of tuples, instead of a dict
+                right = dict(right)
+
+            if isinstance(left, float) and math.isnan(left) and isinstance(right, float) and math.isnan(right):
+                # NaN != NaN in Python
+                continue
+
+            assert left == right, f"Difference in column {column}: {left} != {right}"
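One detail worth noting in the test: the `dict(right)` conversion exists because pyarrow hands map values back to Python as a list of key/value tuples rather than a dict. A minimal standalone sketch of that behavior (not part of the commit):

```python
import pyarrow as pa

# A map<string, string> scalar round-trips to a list of (key, value) tuples.
arr = pa.array([{"added-records": "3"}], type=pa.map_(pa.string(), pa.string()))
assert arr[0].as_py() == [("added-records", "3")]

# Hence the test wraps the Arrow-side value in dict() before comparing it
# against the dict that Spark's toPandas() produces.
assert dict(arr[0].as_py()) == {"added-records": "3"}
```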
