Skip to content

Commit 208b8d2

Browse files
author
Tom McCormick
committed
basic read support for orc iceberg table
1 parent 289f9f6 commit 208b8d2

File tree

2 files changed

+76
-5
lines changed

2 files changed

+76
-5
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,8 @@
197197
ICEBERG_SCHEMA = b"iceberg.schema"
198198
# The PARQUET: in front means that it is Parquet specific, in this case the field_id
199199
PYARROW_PARQUET_FIELD_ID_KEY = b"PARQUET:field_id"
200+
# ORC specific: the field ID is stored as string metadata under this key
201+
ORC_FIELD_ID_KEY = b"iceberg.id"
200202
PYARROW_FIELD_DOC_KEY = b"doc"
201203
LIST_ELEMENT_NAME = "element"
202204
MAP_KEY_NAME = "key"
@@ -1180,11 +1182,21 @@ def primitive(self, primitive: pa.DataType) -> T:
11801182

11811183

11821184
def _get_field_id(field: pa.Field) -> Optional[int]:
    """Look up the Iceberg field ID carried in a PyArrow field's metadata.

    The Parquet key (``PYARROW_PARQUET_FIELD_ID_KEY``) is consulted first and
    the ORC key (``ORC_FIELD_ID_KEY``) second; the first key holding a
    non-empty value wins.

    Args:
        field: The PyArrow field whose metadata is inspected.

    Returns:
        The field ID parsed as an integer, or None when the field has no
        metadata or neither key holds a non-empty value.

    Raises:
        ValueError: If the stored value cannot be parsed by ``int``.
    """
    metadata = field.metadata
    if not metadata:
        return None

    # Order matters: Parquet takes precedence over ORC when both keys exist.
    for id_key in (PYARROW_PARQUET_FIELD_ID_KEY, ORC_FIELD_ID_KEY):
        raw_id = metadata.get(id_key)
        if raw_id:
            return int(raw_id.decode())

    return None
11881200

11891201

11901202
class _HasIds(PyArrowSchemaVisitor[bool]):

tests/io/test_pyarrow_visitor.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
pyarrow_to_schema,
4646
schema_to_pyarrow,
4747
visit_pyarrow,
48+
ORC_FIELD_ID_KEY,
4849
)
4950
from pyiceberg.schema import Accessor, Schema, visit
5051
from pyiceberg.table.name_mapping import MappedField, NameMapping
@@ -239,6 +240,64 @@ def test_pyarrow_variable_binary_to_iceberg(pyarrow_type: pa.DataType) -> None:
239240
assert visit(converted_iceberg_type, _ConvertToArrowSchema()) == pa.large_binary()
240241

241242

243+
def test_orc_vs_parquet_metadata_handling() -> None:
    """Check that ORC-keyed and Parquet-keyed field IDs convert to equal Iceberg structs."""
    orc_key = ORC_FIELD_ID_KEY.decode()
    parquet_key = "PARQUET:field_id"

    def build_struct(id_key: str) -> pa.StructType:
        # Same two fields every time; only the metadata key carrying the ID varies.
        foo = pa.field("foo", pa.string(), nullable=True, metadata={id_key: "1", "doc": "foo doc"})
        bar = pa.field("bar", pa.int32(), nullable=False, metadata={id_key: "2"})
        return pa.struct([foo, bar])

    orc_struct = build_struct(orc_key)
    parquet_struct = build_struct(parquet_key)

    # Converting either struct must yield the same Iceberg type
    orc_result = visit_pyarrow(orc_struct, _ConvertToIceberg())
    parquet_result = visit_pyarrow(parquet_struct, _ConvertToIceberg())
    assert orc_result == parquet_result

    # The two formats are distinguished by their metadata keys
    assert orc_key == "iceberg.id"
    assert parquet_key != orc_key

    # PyArrow exposes metadata keys as bytes; each struct carries only its own key
    orc_metadata = orc_struct[0].metadata
    parquet_metadata = parquet_struct[0].metadata
    assert ORC_FIELD_ID_KEY in orc_metadata  # b'iceberg.id'
    assert b"PARQUET:field_id" in parquet_metadata
    assert ORC_FIELD_ID_KEY not in parquet_metadata
    assert b"PARQUET:field_id" not in orc_metadata
276+
277+
278+
def test_pyarrow_struct_to_iceberg_orc() -> None:
    """Convert a PyArrow struct whose field IDs live under the ORC metadata key."""
    id_key = ORC_FIELD_ID_KEY.decode()
    arrow_fields = [
        pa.field("foo", pa.string(), nullable=True, metadata={id_key: "1", "doc": "foo doc"}),
        pa.field("bar", pa.int32(), nullable=False, metadata={id_key: "2"}),
        pa.field("baz", pa.bool_(), nullable=True, metadata={id_key: "3"}),
    ]
    pyarrow_struct = pa.struct(arrow_fields)

    expected = StructType(
        NestedField(id=1, name="foo", field_type=StringType(), required=False, doc="foo doc"),
        NestedField(id=2, name="bar", field_type=IntegerType(), required=True),
        NestedField(id=3, name="baz", field_type=BooleanType(), required=False),
    )

    result = visit_pyarrow(pyarrow_struct, _ConvertToIceberg())
    assert result == expected

    # The IDs must come out of the ORC metadata in field order: foo, bar, baz
    assert [converted.field_id for converted in result.fields] == [1, 2, 3]
298+
299+
300+
242301
def test_pyarrow_struct_to_iceberg() -> None:
243302
pyarrow_struct = pa.struct(
244303
[

0 commit comments

Comments
 (0)