 ICEBERG_SCHEMA = b"iceberg.schema"
 # The PARQUET: in front means that it is Parquet specific, in this case the field_id
 PYARROW_PARQUET_FIELD_ID_KEY = b"PARQUET:field_id"
+# ORC-specific key: stores the Iceberg field ID in the ORC field metadata
+ORC_FIELD_ID_KEY = b"iceberg.id"
 PYARROW_FIELD_DOC_KEY = b"doc"
 LIST_ELEMENT_NAME = "element"
 MAP_KEY_NAME = "key"
@@ -690,16 +692,20 @@ def schema_to_pyarrow(
     schema: Union[Schema, IcebergType],
     metadata: Dict[bytes, bytes] = EMPTY_DICT,
     include_field_ids: bool = True,
+    file_format: FileFormat = FileFormat.PARQUET,
 ) -> pa.schema:
-    return visit(schema, _ConvertToArrowSchema(metadata, include_field_ids))
+    return visit(schema, _ConvertToArrowSchema(metadata, include_field_ids, file_format))
 
 
 class _ConvertToArrowSchema(SchemaVisitorPerPrimitiveType[pa.DataType]):
     _metadata: Dict[bytes, bytes]
 
-    def __init__(self, metadata: Dict[bytes, bytes] = EMPTY_DICT, include_field_ids: bool = True) -> None:
+    def __init__(
+        self, metadata: Dict[bytes, bytes] = EMPTY_DICT, include_field_ids: bool = True, file_format: Optional[FileFormat] = None
+    ) -> None:
         self._metadata = metadata
         self._include_field_ids = include_field_ids
+        self._file_format = file_format
 
     def schema(self, _: Schema, struct_result: pa.StructType) -> pa.schema:
         return pa.schema(list(struct_result), metadata=self._metadata)
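
Assuming the patched signature above is applied, a caller converting a schema for an ORC write passes the format explicitly, while Parquet stays the default. A minimal sketch (the one-column schema is illustrative):

import pyarrow as pa

from pyiceberg.io.pyarrow import schema_to_pyarrow
from pyiceberg.manifest import FileFormat
from pyiceberg.schema import Schema
from pyiceberg.types import LongType, NestedField

schema = Schema(NestedField(field_id=1, name="id", field_type=LongType(), required=True))

# Default (Parquet): the field ID is stored under b"PARQUET:field_id"
parquet_schema = schema_to_pyarrow(schema)
assert parquet_schema.field("id").metadata[b"PARQUET:field_id"] == b"1"

# ORC: the field ID is stored under b"iceberg.id" instead
orc_schema = schema_to_pyarrow(schema, file_format=FileFormat.ORC)
assert orc_schema.field("id").metadata[b"iceberg.id"] == b"1"
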
@@ -712,7 +718,12 @@ def field(self, field: NestedField, field_result: pa.DataType) -> pa.Field:
         if field.doc:
             metadata[PYARROW_FIELD_DOC_KEY] = field.doc
         if self._include_field_ids:
-            metadata[PYARROW_PARQUET_FIELD_ID_KEY] = str(field.field_id)
+            # Add the field ID based on the file format
+            if self._file_format == FileFormat.ORC:
+                metadata[ORC_FIELD_ID_KEY] = str(field.field_id)
+            else:
+                # Default to Parquet for backward compatibility
+                metadata[PYARROW_PARQUET_FIELD_ID_KEY] = str(field.field_id)
 
         return pa.field(
             name=field.name,
@@ -1011,6 +1022,10 @@ def _expression_to_complementary_pyarrow(expr: BooleanExpression) -> pc.Expression:
 def _get_file_format(file_format: FileFormat, **kwargs: Dict[str, Any]) -> ds.FileFormat:
     if file_format == FileFormat.PARQUET:
         return ds.ParquetFileFormat(**kwargs)
+    elif file_format == FileFormat.ORC:
+        # ORC doesn't support the pre_buffer and buffer_size parameters
+        orc_kwargs = {k: v for k, v in kwargs.items() if k not in ["pre_buffer", "buffer_size"]}
+        return ds.OrcFileFormat(**orc_kwargs)
     else:
         raise ValueError(f"Unsupported file format: {file_format}")
 
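
A quick standalone check of this branch, assuming the diff is applied (the private helper is imported directly for illustration):

import pyarrow.dataset as ds

from pyiceberg.io.pyarrow import _get_file_format
from pyiceberg.manifest import FileFormat

# The Parquet-only read-tuning options are silently dropped for ORC
fmt = _get_file_format(FileFormat.ORC, pre_buffer=True, buffer_size=8 * 1024 * 1024)
assert isinstance(fmt, ds.OrcFileFormat)

# The same kwargs still reach the Parquet format unchanged
pq_fmt = _get_file_format(FileFormat.PARQUET, pre_buffer=True, buffer_size=8 * 1024 * 1024)
assert isinstance(pq_fmt, ds.ParquetFileFormat)
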
@@ -1027,6 +1042,15 @@ def _read_deletes(io: FileIO, data_file: DataFile) -> Dict[str, pa.ChunkedArray]:
             file.as_py(): table.filter(pc.field("file_path") == file).column("pos")
             for file in table.column("file_path").chunks[0].dictionary
         }
+    elif data_file.file_format == FileFormat.ORC:
+        with io.new_input(data_file.file_path).open() as fi:
+            delete_fragment = _get_file_format(data_file.file_format).make_fragment(fi)
+            table = ds.Scanner.from_fragment(fragment=delete_fragment).to_table()
+        # For ORC, file_path columns are not dictionary-encoded, so we use unique() directly
+        return {
+            path.as_py(): table.filter(pc.field("file_path") == path).column("pos")
+            for path in table.column("file_path").unique()
+        }
     elif data_file.file_format == FileFormat.PUFFIN:
         with io.new_input(data_file.file_path).open() as fi:
             payload = fi.read()
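
To see why unique() is the right grouping for the ORC branch, here is a minimal sketch with a toy positional-delete table (the paths and positions are made up):

import pyarrow as pa
import pyarrow.compute as pc

# A plain (non-dictionary-encoded) file_path column, as an ORC delete file yields
table = pa.table({
    "file_path": ["s3://bucket/a.orc", "s3://bucket/a.orc", "s3://bucket/c.orc"],
    "pos": [1, 5, 0],
})

deletes = {
    path.as_py(): table.filter(pc.field("file_path") == path).column("pos")
    for path in table.column("file_path").unique()
}
assert sorted(deletes) == ["s3://bucket/a.orc", "s3://bucket/c.orc"]
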
@@ -1228,11 +1252,17 @@ def primitive(self, primitive: pa.DataType) -> T:
 
 
 def _get_field_id(field: pa.Field) -> Optional[int]:
-    return (
-        int(field_id_str.decode())
-        if (field.metadata and (field_id_str := field.metadata.get(PYARROW_PARQUET_FIELD_ID_KEY)))
-        else None
-    )
+    """Return the Iceberg field ID from Parquet or ORC metadata if available."""
+    if field.metadata:
+        # Try the Parquet field ID first
+        if field_id_bytes := field.metadata.get(PYARROW_PARQUET_FIELD_ID_KEY):
+            return int(field_id_bytes.decode())
+
+        # Fall back to the ORC field ID
+        if field_id_bytes := field.metadata.get(ORC_FIELD_ID_KEY):
+            return int(field_id_bytes.decode())
+
+    return None
 
 
 class _HasIds(PyArrowSchemaVisitor[bool]):
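
Assuming the rewritten helper above, both key variants now resolve to the same ID; a quick illustrative check (importing the private helper directly):

import pyarrow as pa

from pyiceberg.io.pyarrow import _get_field_id

parquet_field = pa.field("id", pa.int64(), metadata={b"PARQUET:field_id": b"7"})
orc_field = pa.field("id", pa.int64(), metadata={b"iceberg.id": b"7"})
plain_field = pa.field("id", pa.int64())

assert _get_field_id(parquet_field) == 7
assert _get_field_id(orc_field) == 7
assert _get_field_id(plain_field) is None
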
@@ -1495,7 +1525,7 @@ def _task_to_record_batches(
     format_version: TableVersion = TableProperties.DEFAULT_FORMAT_VERSION,
     downcast_ns_timestamp_to_us: Optional[bool] = None,
 ) -> Iterator[pa.RecordBatch]:
-    arrow_format = ds.ParquetFileFormat(pre_buffer=True, buffer_size=(ONE_MEGABYTE * 8))
+    arrow_format = _get_file_format(task.file.file_format, pre_buffer=True, buffer_size=(ONE_MEGABYTE * 8))
     with io.new_input(task.file.file_path).open() as fin:
         fragment = arrow_format.make_fragment(fin)
         physical_schema = fragment.physical_schema
@@ -1845,6 +1875,8 @@ def _construct_field(self, field: NestedField, arrow_type: pa.DataType) -> pa.Field:
         if field.doc:
             metadata[PYARROW_FIELD_DOC_KEY] = field.doc
         if self._include_field_ids:
+            # The projection visitor doesn't know the file format, so it defaults to the Parquet key;
+            # this path is used for schema conversion during reads, not writes
             metadata[PYARROW_PARQUET_FIELD_ID_KEY] = str(field.field_id)
 
         return pa.field(