     visit,
     visit_with_partner,
 )
+from pyiceberg.table import TableProperties
 from pyiceberg.table.locations import load_location_provider
 from pyiceberg.table.metadata import TableMetadata
 from pyiceberg.table.name_mapping import NameMapping, apply_name_mapping
 from pyiceberg.table.puffin import PuffinFile
 from pyiceberg.transforms import IdentityTransform, TruncateTransform
-from pyiceberg.typedef import EMPTY_DICT, Properties, Record
+from pyiceberg.typedef import EMPTY_DICT, Properties, Record, TableVersion
 from pyiceberg.types import (
     BinaryType,
     BooleanType,
@@ -1017,22 +1018,36 @@ def _combine_positional_deletes(positional_deletes: List[pa.ChunkedArray], start
 
 
 def pyarrow_to_schema(
-    schema: pa.Schema, name_mapping: Optional[NameMapping] = None, downcast_ns_timestamp_to_us: bool = False
+    schema: pa.Schema,
+    name_mapping: Optional[NameMapping] = None,
+    downcast_ns_timestamp_to_us: bool = False,
+    format_version: TableVersion = TableProperties.DEFAULT_FORMAT_VERSION,
 ) -> Schema:
     has_ids = visit_pyarrow(schema, _HasIds())
     if has_ids:
-        return visit_pyarrow(schema, _ConvertToIceberg(downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us))
+        return visit_pyarrow(
+            schema, _ConvertToIceberg(downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us, format_version=format_version)
+        )
     elif name_mapping is not None:
-        schema_without_ids = _pyarrow_to_schema_without_ids(schema, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us)
+        schema_without_ids = _pyarrow_to_schema_without_ids(
+            schema, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us, format_version=format_version
+        )
         return apply_name_mapping(schema_without_ids, name_mapping)
     else:
         raise ValueError(
             "Parquet file does not have field-ids and the Iceberg table does not have 'schema.name-mapping.default' defined"
         )
 
 
-def _pyarrow_to_schema_without_ids(schema: pa.Schema, downcast_ns_timestamp_to_us: bool = False) -> Schema:
-    return visit_pyarrow(schema, _ConvertToIcebergWithoutIDs(downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us))
+def _pyarrow_to_schema_without_ids(
+    schema: pa.Schema,
+    downcast_ns_timestamp_to_us: bool = False,
+    format_version: TableVersion = TableProperties.DEFAULT_FORMAT_VERSION,
+) -> Schema:
+    return visit_pyarrow(
+        schema,
+        _ConvertToIcebergWithoutIDs(downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us, format_version=format_version),
+    )
 
 
 def _pyarrow_schema_ensure_large_types(schema: pa.Schema) -> pa.Schema:
@@ -1214,9 +1229,12 @@ class _ConvertToIceberg(PyArrowSchemaVisitor[Union[IcebergType, Schema]]):
 
     _field_names: List[str]
 
-    def __init__(self, downcast_ns_timestamp_to_us: bool = False) -> None:
+    def __init__(
+        self, downcast_ns_timestamp_to_us: bool = False, format_version: TableVersion = TableProperties.DEFAULT_FORMAT_VERSION
+    ) -> None:  # noqa: F821
         self._field_names = []
         self._downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us
+        self._format_version = format_version
 
     def _field_id(self, field: pa.Field) -> int:
         if (field_id := _get_field_id(field)) is not None:
@@ -1287,6 +1305,11 @@ def primitive(self, primitive: pa.DataType) -> PrimitiveType:
             elif primitive.unit == "ns":
                 if self._downcast_ns_timestamp_to_us:
                     logger.warning("Iceberg does not yet support 'ns' timestamp precision. Downcasting to 'us'.")
+                elif self._format_version >= 3:
+                    if primitive.tz in UTC_ALIASES:
+                        return TimestamptzNanoType()
+                    else:
+                        return TimestampNanoType()
                 else:
                     raise TypeError(
                         "Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-to-us-on-write' configuration property to automatically downcast 'ns' to 'us' on write.",
@@ -2519,7 +2542,10 @@ def bin_pack_arrow_table(tbl: pa.Table, target_file_size: int) -> Iterator[List[
 
 
 def _check_pyarrow_schema_compatible(
-    requested_schema: Schema, provided_schema: pa.Schema, downcast_ns_timestamp_to_us: bool = False
+    requested_schema: Schema,
+    provided_schema: pa.Schema,
+    downcast_ns_timestamp_to_us: bool = False,
+    format_version: TableVersion = TableProperties.DEFAULT_FORMAT_VERSION,
 ) -> None:
     """
     Check if the `requested_schema` is compatible with `provided_schema`.
@@ -2532,10 +2558,15 @@ def _check_pyarrow_schema_compatible(
     name_mapping = requested_schema.name_mapping
     try:
         provided_schema = pyarrow_to_schema(
-            provided_schema, name_mapping=name_mapping, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us
+            provided_schema,
+            name_mapping=name_mapping,
+            downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us,
+            format_version=format_version,
         )
     except ValueError as e:
-        provided_schema = _pyarrow_to_schema_without_ids(provided_schema, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us)
+        provided_schema = _pyarrow_to_schema_without_ids(
+            provided_schema, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us, format_version=format_version
+        )
         additional_names = set(provided_schema._name_to_id.keys()) - set(requested_schema._name_to_id.keys())
         raise ValueError(
             f"PyArrow table contains more columns: {', '.join(sorted(additional_names))}. Update the schema first (hint, use union_by_name)."
@@ -2561,7 +2592,7 @@ def parquet_file_to_data_file(io: FileIO, table_metadata: TableMetadata, file_pa
     )
 
     schema = table_metadata.schema()
-    _check_pyarrow_schema_compatible(schema, arrow_schema)
+    _check_pyarrow_schema_compatible(schema, arrow_schema, format_version=table_metadata.format_version)
 
     statistics = data_file_statistics_from_parquet_metadata(
         parquet_metadata=parquet_metadata,
@@ -2652,7 +2683,12 @@ def _dataframe_to_data_files(
     )
     name_mapping = table_metadata.schema().name_mapping
     downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False
-    task_schema = pyarrow_to_schema(df.schema, name_mapping=name_mapping, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us)
+    task_schema = pyarrow_to_schema(
+        df.schema,
+        name_mapping=name_mapping,
+        downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us,
+        format_version=table_metadata.format_version,
+    )
 
     if table_metadata.spec().is_unpartitioned():
         yield from write_file(
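
Downstream, `parquet_file_to_data_file` and `_dataframe_to_data_files` now take the format version from the table metadata itself, so writers never pass it explicitly. A hedged usage sketch of the intended effect, assuming an existing format v3 table with a matching `timestamptz_ns` column (catalog and table names are illustrative):

```python
import pyarrow as pa

from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")              # illustrative catalog name
table = catalog.load_table("examples.events")  # assumed to be a format v3 table

# With this change, append() threads table_metadata.format_version through the
# schema conversion, so nanosecond data is accepted on v3 tables without the
# downcast-ns-timestamp-to-us-on-write escape hatch.
df = pa.table({
    "ts_utc": pa.array([1_700_000_000_000_000_123], type=pa.timestamp("ns", tz="UTC")),
})
table.append(df)
```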