Skip to content

Commit 5a3c346

Browse files
authored
Add V3 read support (#1554)
1 parent 0638493 commit 5a3c346

File tree

8 files changed

+233
-22
lines changed

8 files changed

+233
-22
lines changed

pyiceberg/partitioning.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
Field,
3838
PlainSerializer,
3939
WithJsonSchema,
40+
model_validator,
4041
)
4142
from typing_extensions import Annotated
4243

@@ -111,6 +112,19 @@ def __init__(
111112

112113
super().__init__(**data)
113114

115+
@model_validator(mode="before")
@classmethod
def map_source_ids_onto_source_id(cls, data: Any) -> Any:
    """Map a V3 ``source-ids`` list onto the single ``source-id`` key.

    Only dict input is inspected. An explicit ``source-id`` always wins and
    leaves the input untouched.

    Args:
        data: The raw input handed to the model before field validation.

    Returns:
        ``data`` unchanged, or a shallow copy of it with ``source-id`` set to
        the only element of ``source-ids``.

    Raises:
        ValueError: If ``source-ids`` is an empty list, or holds more than
            one id (multi argument transforms are not yet supported).
    """
    if not isinstance(data, dict) or "source-id" in data:
        return data

    # `.get` instead of indexing: when neither key is present this hook must
    # not fail with a bare KeyError, the regular field validation deals with
    # the missing source id.
    source_ids = data.get("source-ids")
    if not isinstance(source_ids, list):
        return data

    # Checked explicitly, because an empty list is falsy and would otherwise
    # be skipped without ever reaching this error.
    if not source_ids:
        raise ValueError("Empty source-ids is not allowed")
    if len(source_ids) > 1:
        raise ValueError("Multi argument transforms are not yet supported")

    # Return a copy so the caller's dictionary is not modified.
    return {**data, "source-id": source_ids[0]}
127+
114128
def __str__(self) -> str:
115129
"""Return the string representation of the PartitionField class."""
116130
return f"{self.field_id}: {self.name}: {self.transform}({self.source_id})"

pyiceberg/table/metadata.py

Lines changed: 98 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -459,9 +459,8 @@ def to_v2(self) -> TableMetadataV2:
459459
return TableMetadataV2.model_validate(metadata)
460460

461461
format_version: Literal[1] = Field(alias="format-version", default=1)
462-
"""An integer version number for the format. Currently, this can be 1 or 2
463-
based on the spec. Implementations must throw an exception if a table’s
464-
version is higher than the supported version."""
462+
"""An integer version number for the format. Implementations must throw
463+
an exception if a table’s version is higher than the supported version."""
465464

466465
schema_: Schema = Field(alias="schema")
467466
"""The table’s current schema. (Deprecated: use schemas and
@@ -507,16 +506,74 @@ def construct_refs(cls, table_metadata: TableMetadata) -> TableMetadata:
507506
return construct_refs(table_metadata)
508507

509508
format_version: Literal[2] = Field(alias="format-version", default=2)
510-
"""An integer version number for the format. Currently, this can be 1 or 2
511-
based on the spec. Implementations must throw an exception if a table’s
512-
version is higher than the supported version."""
509+
"""An integer version number for the format. Implementations must throw
510+
an exception if a table’s version is higher than the supported version."""
513511

514512
last_sequence_number: int = Field(alias="last-sequence-number", default=INITIAL_SEQUENCE_NUMBER)
515513
"""The table’s highest assigned sequence number, a monotonically
516514
increasing long that tracks the order of snapshots in a table."""
517515

518516

519-
TableMetadata = Annotated[Union[TableMetadataV1, TableMetadataV2], Field(discriminator="format_version")]
517+
class TableMetadataV3(TableMetadataCommonFields, IcebergBaseModel):
    """Table metadata as described by version 3 of the Iceberg spec.

    Version 3 extends data types and existing metadata structures with:

    - New data types: nanosecond timestamp(tz), unknown
    - Default value support for columns
    - Multi-argument transforms for partitioning and sorting
    - Row Lineage tracking
    - Binary deletion vectors

    Serializing through `model_dump_json` raises `NotImplementedError`.

    For more information:
    https://iceberg.apache.org/spec/?column-projection#version-3-extended-types-and-capabilities
    """

    format_version: Literal[3] = Field(alias="format-version", default=3)
    """An integer version number for the format. Implementations must throw
    an exception if a table’s version is higher than the supported version."""

    last_sequence_number: int = Field(alias="last-sequence-number", default=INITIAL_SEQUENCE_NUMBER)
    """The table’s highest assigned sequence number, a monotonically
    increasing long that tracks the order of snapshots in a table."""

    row_lineage: bool = Field(alias="row-lineage", default=False)
    """Indicates that row-lineage is enabled on the table

    For more information:
    https://iceberg.apache.org/spec/?column-projection#row-lineage
    """

    next_row_id: Optional[int] = Field(alias="next-row-id", default=None)
    """A long higher than all assigned row IDs; the next snapshot's `first-row-id`."""

    # Each validator below forwards to the module-level function of the same
    # name; inside a method body that name resolves to the module scope, not
    # to the method itself.

    @model_validator(mode="before")
    def cleanup_snapshot_id(cls, data: Dict[str, Any]) -> Dict[str, Any]:
        """Run the shared `cleanup_snapshot_id` hook on the raw input."""
        return cleanup_snapshot_id(data)

    @model_validator(mode="after")
    def check_schemas(cls, table_metadata: TableMetadata) -> TableMetadata:
        """Run the shared `check_schemas` hook on the validated model."""
        return check_schemas(table_metadata)

    @model_validator(mode="after")
    def check_partition_specs(cls, table_metadata: TableMetadata) -> TableMetadata:
        """Run the shared `check_partition_specs` hook on the validated model."""
        return check_partition_specs(table_metadata)

    @model_validator(mode="after")
    def check_sort_orders(cls, table_metadata: TableMetadata) -> TableMetadata:
        """Run the shared `check_sort_orders` hook on the validated model."""
        return check_sort_orders(table_metadata)

    @model_validator(mode="after")
    def construct_refs(cls, table_metadata: TableMetadata) -> TableMetadata:
        """Run the shared `construct_refs` hook on the validated model."""
        return construct_refs(table_metadata)

    def model_dump_json(
        self, exclude_none: bool = True, exclude: Optional[Any] = None, by_alias: bool = True, **kwargs: Any
    ) -> str:
        """Always raise: writing V3 metadata is not supported yet.

        Raises:
            NotImplementedError: On every call, regardless of the arguments.
        """
        message = "Writing V3 is not yet supported, see: https://github.com/apache/iceberg-python/issues/1551"
        raise NotImplementedError(message)
574+
575+
576+
TableMetadata = Annotated[Union[TableMetadataV1, TableMetadataV2, TableMetadataV3], Field(discriminator="format_version")]
520577

521578

522579
def new_table_metadata(
@@ -553,20 +610,36 @@ def new_table_metadata(
553610
last_partition_id=fresh_partition_spec.last_assigned_field_id,
554611
table_uuid=table_uuid,
555612
)
556-
557-
return TableMetadataV2(
558-
location=location,
559-
schemas=[fresh_schema],
560-
last_column_id=fresh_schema.highest_field_id,
561-
current_schema_id=fresh_schema.schema_id,
562-
partition_specs=[fresh_partition_spec],
563-
default_spec_id=fresh_partition_spec.spec_id,
564-
sort_orders=[fresh_sort_order],
565-
default_sort_order_id=fresh_sort_order.order_id,
566-
properties=properties,
567-
last_partition_id=fresh_partition_spec.last_assigned_field_id,
568-
table_uuid=table_uuid,
569-
)
613+
elif format_version == 2:
614+
return TableMetadataV2(
615+
location=location,
616+
schemas=[fresh_schema],
617+
last_column_id=fresh_schema.highest_field_id,
618+
current_schema_id=fresh_schema.schema_id,
619+
partition_specs=[fresh_partition_spec],
620+
default_spec_id=fresh_partition_spec.spec_id,
621+
sort_orders=[fresh_sort_order],
622+
default_sort_order_id=fresh_sort_order.order_id,
623+
properties=properties,
624+
last_partition_id=fresh_partition_spec.last_assigned_field_id,
625+
table_uuid=table_uuid,
626+
)
627+
elif format_version == 3:
628+
return TableMetadataV3(
629+
location=location,
630+
schemas=[fresh_schema],
631+
last_column_id=fresh_schema.highest_field_id,
632+
current_schema_id=fresh_schema.schema_id,
633+
partition_specs=[fresh_partition_spec],
634+
default_spec_id=fresh_partition_spec.spec_id,
635+
sort_orders=[fresh_sort_order],
636+
default_sort_order_id=fresh_sort_order.order_id,
637+
properties=properties,
638+
last_partition_id=fresh_partition_spec.last_assigned_field_id,
639+
table_uuid=table_uuid,
640+
)
641+
else:
642+
raise ValidationError(f"Unknown format version: {format_version}")
570643

571644

572645
class TableMetadataWrapper(IcebergRootModel[TableMetadata]):
@@ -593,6 +666,8 @@ def parse_obj(data: Dict[str, Any]) -> TableMetadata:
593666
return TableMetadataV1(**data)
594667
elif format_version == 2:
595668
return TableMetadataV2(**data)
669+
elif format_version == 3:
670+
return TableMetadataV3(**data)
596671
else:
597672
raise ValidationError(f"Unknown format version: {format_version}")
598673

@@ -609,6 +684,8 @@ def _construct_without_validation(table_metadata: TableMetadata) -> TableMetadat
609684
return TableMetadataV1.model_construct(**dict(table_metadata))
610685
elif table_metadata.format_version == 2:
611686
return TableMetadataV2.model_construct(**dict(table_metadata))
687+
elif table_metadata.format_version == 3:
688+
return TableMetadataV3.model_construct(**dict(table_metadata))
612689
else:
613690
raise ValidationError(f"Unknown format version: {table_metadata.format_version}")
614691

pyiceberg/table/sorting.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,19 @@ def set_null_order(cls, values: Dict[str, Any]) -> Dict[str, Any]:
102102
values["null-order"] = NullOrder.NULLS_FIRST if values["direction"] == SortDirection.ASC else NullOrder.NULLS_LAST
103103
return values
104104

105+
@model_validator(mode="before")
@classmethod
def map_source_ids_onto_source_id(cls, data: Any) -> Any:
    """Map a V3 ``source-ids`` list onto the single ``source-id`` key.

    Only dict input is inspected. An explicit ``source-id`` always wins and
    leaves the input untouched.

    Args:
        data: The raw input handed to the model before field validation.

    Returns:
        ``data`` unchanged, or a shallow copy of it with ``source-id`` set to
        the only element of ``source-ids``.

    Raises:
        ValueError: If ``source-ids`` is an empty list, or holds more than
            one id (multi argument transforms are not yet supported).
    """
    if not isinstance(data, dict) or "source-id" in data:
        return data

    # `.get` instead of indexing: when neither key is present this hook must
    # not fail with a bare KeyError, validation of the required `source-id`
    # field reports the problem instead.
    source_ids = data.get("source-ids")
    if not isinstance(source_ids, list):
        return data

    # Checked explicitly, because an empty list is falsy and would otherwise
    # be skipped without ever reaching this error.
    if not source_ids:
        raise ValueError("Empty source-ids is not allowed")
    if len(source_ids) > 1:
        raise ValueError("Multi argument transforms are not yet supported")

    # Return a copy so the caller's dictionary is not modified.
    return {**data, "source-id": source_ids[0]}
117+
105118
source_id: int = Field(alias="source-id")
106119
transform: Annotated[ # type: ignore
107120
Transform,

pyiceberg/typedef.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,4 +206,4 @@ def __hash__(self) -> int:
206206
return hash(str(self))
207207

208208

209-
TableVersion: TypeAlias = Literal[1, 2]
209+
TableVersion: TypeAlias = Literal[1, 2, 3]

tests/conftest.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -902,6 +902,72 @@ def generate_snapshot(
902902
"refs": {"test": {"snapshot-id": 3051729675574597004, "type": "tag", "max-ref-age-ms": 10000000}},
903903
}
904904

905+
# Example format-version 3 table metadata. Partition and sort fields use the
# list-valued "source-ids" key instead of the scalar "source-id".
EXAMPLE_TABLE_METADATA_V3 = {
    "format-version": 3,
    "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1",
    "location": "s3://bucket/test/location",
    "last-sequence-number": 34,
    "last-updated-ms": 1602638573590,
    "last-column-id": 3,
    "current-schema-id": 1,
    "schemas": [
        {
            "type": "struct",
            "schema-id": 0,
            "fields": [
                {"id": 1, "name": "x", "required": True, "type": "long"},
            ],
        },
        {
            "type": "struct",
            "schema-id": 1,
            "identifier-field-ids": [1, 2],
            "fields": [
                {"id": 1, "name": "x", "required": True, "type": "long"},
                {"id": 2, "name": "y", "required": True, "type": "long", "doc": "comment"},
                {"id": 3, "name": "z", "required": True, "type": "long"},
                # TODO: Add unknown, timestamp(tz)_ns as required fields:
                #   id 4 "u" of type "unknown", id 5 "ns" of type "timestamp_ns",
                #   id 6 "nstz" of type "timestamptz_ns".
            ],
        },
    ],
    "default-spec-id": 0,
    "partition-specs": [
        {
            "spec-id": 0,
            "fields": [
                {"name": "x", "transform": "identity", "source-ids": [1], "field-id": 1000},
            ],
        },
    ],
    "last-partition-id": 1000,
    "default-sort-order-id": 3,
    "sort-orders": [
        {
            "order-id": 3,
            "fields": [
                {"transform": "identity", "source-ids": [2], "direction": "asc", "null-order": "nulls-first"},
                {"transform": "bucket[4]", "source-ids": [3], "direction": "desc", "null-order": "nulls-last"},
            ],
        },
    ],
    "properties": {"read.split.target.size": "134217728"},
    "current-snapshot-id": 3055729675574597004,
    "snapshots": [
        {
            "snapshot-id": 3051729675574597004,
            "timestamp-ms": 1515100955770,
            "sequence-number": 0,
            "summary": {"operation": "append"},
            "manifest-list": "s3://a/b/1.avro",
        },
        {
            "snapshot-id": 3055729675574597004,
            "parent-snapshot-id": 3051729675574597004,
            "timestamp-ms": 1555100955770,
            "sequence-number": 1,
            "summary": {"operation": "append"},
            "manifest-list": "s3://a/b/2.avro",
            "schema-id": 1,
        },
    ],
    "snapshot-log": [
        {"snapshot-id": 3051729675574597004, "timestamp-ms": 1515100955770},
        {"snapshot-id": 3055729675574597004, "timestamp-ms": 1555100955770},
    ],
    "metadata-log": [
        {"metadata-file": "s3://bucket/.../v1.json", "timestamp-ms": 1515100},
    ],
    "refs": {
        "test": {"snapshot-id": 3051729675574597004, "type": "tag", "max-ref-age-ms": 10000000},
    },
}
970+
905971
TABLE_METADATA_V2_WITH_FIXED_AND_DECIMAL_TYPES = {
906972
"format-version": 2,
907973
"table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1",
@@ -1052,6 +1118,11 @@ def table_metadata_v2_with_statistics() -> Dict[str, Any]:
10521118
return TABLE_METADATA_V2_WITH_STATISTICS
10531119

10541120

1121+
@pytest.fixture
def example_table_metadata_v3() -> Dict[str, Any]:
    """Provide the module-level V3 example metadata dictionary (not a copy)."""
    v3_metadata: Dict[str, Any] = EXAMPLE_TABLE_METADATA_V3
    return v3_metadata
1124+
1125+
10551126
@pytest.fixture(scope="session")
10561127
def metadata_location(tmp_path_factory: pytest.TempPathFactory) -> str:
10571128
from pyiceberg.io.pyarrow import PyArrowFileIO

tests/table/test_metadata.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
TableMetadataUtil,
3434
TableMetadataV1,
3535
TableMetadataV2,
36+
TableMetadataV3,
3637
new_table_metadata,
3738
)
3839
from pyiceberg.table.refs import SnapshotRef, SnapshotRefType
@@ -178,6 +179,15 @@ def test_serialize_v2(example_table_metadata_v2: Dict[str, Any]) -> None:
178179
assert table_metadata == expected
179180

180181

182+
def test_serialize_v3(example_table_metadata_v3: Dict[str, Any]) -> None:
    """Dumping V3 table metadata to JSON is expected to raise for now."""
    # Writing will be part of https://github.com/apache/iceberg-python/issues/1551
    expected_message = "Writing V3 is not yet supported, see: https://github.com/apache/iceberg-python/issues/1551"

    # Reading has to succeed; only the dump call is expected to raise.
    table_metadata = TableMetadataV3(**example_table_metadata_v3)

    with pytest.raises(NotImplementedError) as exc_info:
        table_metadata.model_dump_json()

    assert expected_message in str(exc_info.value)
189+
190+
181191
def test_migrate_v1_schemas(example_table_metadata_v1: Dict[str, Any]) -> None:
182192
table_metadata = TableMetadataV1(**example_table_metadata_v1)
183193

tests/table/test_partitioning.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,3 +151,17 @@ def test_partition_type(table_schema_simple: Schema) -> None:
151151
NestedField(field_id=1000, name="str_truncate", field_type=StringType(), required=False),
152152
NestedField(field_id=1001, name="int_bucket", field_type=IntegerType(), required=True),
153153
)
154+
155+
156+
def test_deserialize_partition_field_v2() -> None:
    """A payload with a scalar `source-id` parses into the matching PartitionField."""
    expected = PartitionField(source_id=1, field_id=1000, transform=TruncateTransform(width=19), name="str_truncate")
    payload = """{"source-id": 1, "field-id": 1000, "transform": "truncate[19]", "name": "str_truncate"}"""

    assert PartitionField.model_validate_json(payload) == expected
161+
162+
163+
def test_deserialize_partition_field_v3() -> None:
    """A payload with a one-element `source-ids` list parses into the matching PartitionField."""
    expected = PartitionField(source_id=1, field_id=1000, transform=TruncateTransform(width=19), name="str_truncate")
    payload = """{"source-ids": [1], "field-id": 1000, "transform": "truncate[19]", "name": "str_truncate"}"""

    assert PartitionField.model_validate_json(payload) == expected

tests/table/test_sorting.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,3 +102,15 @@ def test_unsorting_to_repr() -> None:
102102
def test_sorting_repr(sort_order: SortOrder) -> None:
103103
"""To make sure that the repr converts back to the original object"""
104104
assert sort_order == eval(repr(sort_order))
105+
106+
107+
def test_serialize_sort_field_v2() -> None:
    """A JSON payload with a scalar `source-id` parses into the matching SortField."""
    payload = '{"source-id":19,"transform":"identity","direction":"asc","null-order":"nulls-first"}'
    parsed = SortField.model_validate_json(payload)

    assert parsed == SortField(source_id=19, transform=IdentityTransform(), null_order=NullOrder.NULLS_FIRST)
111+
112+
113+
def test_serialize_sort_field_v3() -> None:
    """A JSON payload with a one-element `source-ids` list parses into the matching SortField."""
    payload = '{"source-ids":[19],"transform":"identity","direction":"asc","null-order":"nulls-first"}'
    parsed = SortField.model_validate_json(payload)

    assert parsed == SortField(source_id=19, transform=IdentityTransform(), null_order=NullOrder.NULLS_FIRST)

0 commit comments

Comments
 (0)