 ICEBERG_SCHEMA = b"iceberg.schema"
 # The PARQUET: in front means that it is Parquet specific, in this case the field_id
 PYARROW_PARQUET_FIELD_ID_KEY = b"PARQUET:field_id"
+# ORC stores IDs as string metadata
+ORC_FIELD_ID_KEY = b"iceberg.id"
 PYARROW_FIELD_DOC_KEY = b"doc"
 LIST_ELEMENT_NAME = "element"
 MAP_KEY_NAME = "key"
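(Not part of the diff: a minimal sketch of what the new key refers to. ORC surfaces the Iceberg field ID as string metadata on each `pa.Field`, keyed by `iceberg.id`, which is why `_get_field_id` below has to decode bytes to `int`.)

```python
import pyarrow as pa

# ORC-style metadata: the Iceberg field ID arrives as bytes under b"iceberg.id".
field = pa.field("name", pa.string(), metadata={b"iceberg.id": b"1"})
assert int(field.metadata[b"iceberg.id"].decode()) == 1
```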
@@ -388,14 +390,28 @@ def __init__(self, properties: Properties = EMPTY_DICT):
 
     @staticmethod
     def parse_location(location: str) -> Tuple[str, str, str]:
-        """Return the path without the scheme."""
+        """Return (scheme, netloc, path) for the given location.
+        Uses the environment variables DEFAULT_SCHEME and DEFAULT_NETLOC
+        if scheme/netloc are missing.
+        """
         uri = urlparse(location)
-        if not uri.scheme:
-            return "file", uri.netloc, os.path.abspath(location)
-        elif uri.scheme in ("hdfs", "viewfs"):
-            return uri.scheme, uri.netloc, uri.path
+
+        # Load defaults from the environment
+        default_scheme = os.getenv("DEFAULT_SCHEME", "file")
+        default_netloc = os.getenv("DEFAULT_NETLOC", "")
+
+        # Fill in missing scheme/netloc from the defaults
+        scheme = uri.scheme or default_scheme
+        netloc = uri.netloc or default_netloc
+
+        if scheme in ("hdfs", "viewfs"):
+            return scheme, netloc, uri.path
         else:
-            return uri.scheme, uri.netloc, f"{uri.netloc}{uri.path}"
+            # For non-HDFS URIs, include the netloc in the path if present
+            path = uri.path if uri.scheme else os.path.abspath(location)
+            if netloc and not path.startswith(netloc):
+                path = f"{netloc}{path}"
+            return scheme, netloc, path
 
     def _initialize_fs(self, scheme: str, netloc: Optional[str] = None) -> FileSystem:
         """Initialize FileSystem for different scheme."""
@@ -575,7 +591,7 @@ def _initialize_gcs_fs(self) -> FileSystem:
     def _initialize_local_fs(self) -> FileSystem:
         return PyArrowLocalFileSystem()
 
-    def new_input(self, location: str) -> PyArrowFile:
+    def new_input(self, location: str, fs: Optional[FileSystem] = None) -> PyArrowFile:
         """Get a PyArrowFile instance to read bytes from the file at the given location.
 
         Args:
@@ -585,8 +601,11 @@ def new_input(self, location: str) -> PyArrowFile:
             PyArrowFile: A PyArrowFile instance for the given location.
         """
         scheme, netloc, path = self.parse_location(location)
+        logger.warning(f"Scheme: {scheme}, Netloc: {netloc}, Path: {path}")
+        if not fs:
+            fs = self.fs_by_scheme(scheme, netloc)
         return PyArrowFile(
-            fs=self.fs_by_scheme(scheme, netloc),
+            fs=fs,
             location=location,
             path=path,
             buffer_size=int(self.properties.get(BUFFER_SIZE, ONE_MEGABYTE)),
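The new optional `fs` argument lets a caller resolve a filesystem once and reuse it across many files, instead of paying a `fs_by_scheme` lookup per call. A hypothetical usage sketch (scheme, bucket, and paths are placeholders):

```python
io = PyArrowFileIO()
fs = io.fs_by_scheme("s3", "my-bucket")  # resolve once; assumes S3 credentials are configured
for location in ("s3://my-bucket/a.parquet", "s3://my-bucket/b.parquet"):
    input_file = io.new_input(location, fs=fs)  # reuses the resolved filesystem
```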
@@ -1022,7 +1041,11 @@ def _combine_positional_deletes(positional_deletes: List[pa.ChunkedArray], start
 def pyarrow_to_schema(
     schema: pa.Schema, name_mapping: Optional[NameMapping] = None, downcast_ns_timestamp_to_us: bool = False
 ) -> Schema:
-    has_ids = visit_pyarrow(schema, _HasIds())
+    logger.warning(f"schema {schema}")
+    hids = _HasIds()
+    logger.warning("hasIds")
+    has_ids = visit_pyarrow(schema, hids)
+    logger.warning(f"has_ids is {has_ids}, name_mapping is {name_mapping}")
     if has_ids:
         return visit_pyarrow(schema, _ConvertToIceberg(downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us))
     elif name_mapping is not None:
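For reference, the traced `has_ids` flag is True only when every field in the Arrow schema carries a field ID; in that case the schema converts directly and the name mapping is never consulted. A minimal sketch:

```python
import pyarrow as pa

# Both fields carry IDs, so _HasIds reports True and no name mapping is needed.
arrow_schema = pa.schema([
    pa.field("id", pa.int64(), nullable=False, metadata={b"PARQUET:field_id": b"1"}),
    pa.field("name", pa.string(), metadata={b"PARQUET:field_id": b"2"}),
])
iceberg_schema = pyarrow_to_schema(arrow_schema)
```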
@@ -1179,11 +1202,22 @@ def primitive(self, primitive: pa.DataType) -> T:
 
 
 def _get_field_id(field: pa.Field) -> Optional[int]:
-    return (
-        int(field_id_str.decode())
-        if (field.metadata and (field_id_str := field.metadata.get(PYARROW_PARQUET_FIELD_ID_KEY)))
-        else None
-    )
+    """Return the Iceberg field ID from Parquet or ORC metadata, if available."""
+    if not field.metadata:
+        return None
+
+    # Try the Parquet field ID first
+    field_id_bytes = field.metadata.get(PYARROW_PARQUET_FIELD_ID_KEY)
+    if field_id_bytes:
+        return int(field_id_bytes.decode())
+
+    # Fall back to the ORC field ID
+    field_id_bytes = field.metadata.get(ORC_FIELD_ID_KEY)
+    if field_id_bytes:
+        return int(field_id_bytes.decode())
+
+    return None
+
 
 
 class _HasIds(PyArrowSchemaVisitor[bool]):
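The rewritten helper can be sanity-checked directly; a short sketch of the Parquet-first, ORC-fallback order:

```python
import pyarrow as pa

# Parquet metadata wins when present.
assert _get_field_id(pa.field("a", pa.int32(), metadata={b"PARQUET:field_id": b"7"})) == 7
# Otherwise the ORC key is consulted.
assert _get_field_id(pa.field("a", pa.int32(), metadata={b"iceberg.id": b"7"})) == 7
# No metadata yields None.
assert _get_field_id(pa.field("a", pa.int32())) is None
```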
@@ -1434,6 +1468,7 @@ def _task_to_record_batches(
     name_mapping: Optional[NameMapping] = None,
     partition_spec: Optional[PartitionSpec] = None,
 ) -> Iterator[pa.RecordBatch]:
+    logger.warning(f"file format is {task.file.file_format}")
     if task.file.file_format == FileFormat.PARQUET:
         arrow_format = ds.ParquetFileFormat(pre_buffer=True, buffer_size=(ONE_MEGABYTE * 8))
     elif task.file.file_format == FileFormat.ORC:
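The hunk cuts off before the ORC branch body; presumably it constructs the corresponding `pyarrow.dataset` format, which, unlike `ParquetFileFormat`, exposes no pre-buffering options:

```python
import pyarrow.dataset as ds

# Presumed ORC branch body; OrcFileFormat takes no pre_buffer/buffer_size arguments.
arrow_format = ds.OrcFileFormat()
```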
@@ -1443,6 +1478,7 @@ def _task_to_record_batches(
     with io.new_input(task.file.file_path).open() as fin:
         fragment = arrow_format.make_fragment(fin)
         physical_schema = fragment.physical_schema
+        logger.warning(f"formats: filepath {task.file.file_path}, fragment {fragment}, physical_schema {physical_schema}")
         # In V1 and V2 table formats, we only support Timestamp 'us' in Iceberg Schema
         # Hence it is reasonable to always cast 'ns' timestamp to 'us' on read.
         # When V3 support is introduced, we will update `downcast_ns_timestamp_to_us` flag based on
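The traced `physical_schema` is the file-level schema PyArrow reconstructs from the fragment, before any projection to the Iceberg schema; for ORC files this is where the `iceberg.id` metadata should surface. A standalone sketch (path is a placeholder):

```python
import pyarrow as pa
import pyarrow.dataset as ds

# Mirror the read path above against a local ORC file.
with pa.OSFile("/tmp/data.orc", "rb") as fin:
    fragment = ds.OrcFileFormat().make_fragment(fin)
    print(fragment.physical_schema)  # field metadata should carry b"iceberg.id"
```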