Commit c39a94f

Author: Tom McCormick (committed)

Basic read/write support for ORC

1 parent: ad8263b

File tree: 3 files changed (+165, -2 lines)

pyiceberg/io/pyarrow.py

Lines changed: 62 additions & 2 deletions
@@ -63,6 +63,7 @@
 import pyarrow.dataset as ds
 import pyarrow.lib
 import pyarrow.parquet as pq
+import pyarrow.orc as orc
 from pyarrow import ChunkedArray
 from pyarrow._s3fs import S3RetryStrategy
 from pyarrow.fs import (

@@ -974,6 +975,8 @@ def _expression_to_complementary_pyarrow(expr: BooleanExpression) -> pc.Expressi
 def _get_file_format(file_format: FileFormat, **kwargs: Dict[str, Any]) -> ds.FileFormat:
     if file_format == FileFormat.PARQUET:
         return ds.ParquetFileFormat(**kwargs)
+    elif file_format == FileFormat.ORC:
+        return ds.OrcFileFormat(**kwargs)
     else:
         raise ValueError(f"Unsupported file format: {file_format}")

@@ -1450,7 +1453,13 @@ def _task_to_record_batches(
     name_mapping: Optional[NameMapping] = None,
     partition_spec: Optional[PartitionSpec] = None,
 ) -> Iterator[pa.RecordBatch]:
-    arrow_format = ds.ParquetFileFormat(pre_buffer=True, buffer_size=(ONE_MEGABYTE * 8))
+    if task.file.file_format == FileFormat.PARQUET:
+        arrow_format = ds.ParquetFileFormat(pre_buffer=True, buffer_size=(ONE_MEGABYTE * 8))
+    elif task.file.file_format == FileFormat.ORC:
+        arrow_format = ds.OrcFileFormat()
+        # arrow_format = ds.OrcFileFormat(pre_buffer=True, buffer_size=(ONE_MEGABYTE * 8))
+    else:
+        raise ValueError("Unsupported file format")
     with io.new_input(task.file.file_path).open() as fin:
         fragment = arrow_format.make_fragment(fin)
         physical_schema = fragment.physical_schema

@@ -2512,9 +2521,60 @@ def write_parquet(task: WriteTask) -> DataFile:

         return data_file

+    def write_orc(task: WriteTask) -> DataFile:
+        table_schema = table_metadata.schema()
+        if (sanitized_schema := sanitize_column_names(table_schema)) != table_schema:
+            file_schema = sanitized_schema
+        else:
+            file_schema = table_schema
+
+        downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False
+        batches = [
+            _to_requested_schema(
+                requested_schema=file_schema,
+                file_schema=task.schema,
+                batch=batch,
+                downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us,
+                include_field_ids=True,
+            )
+            for batch in task.record_batches
+        ]
+        arrow_table = pa.Table.from_batches(batches)
+        file_path = location_provider.new_data_location(
+            data_file_name=task.generate_data_file_filename("orc"),
+            partition_key=task.partition_key,
+        )
+        fo = io.new_output(file_path)
+        with fo.create(overwrite=True) as fos:
+            orc.write_table(arrow_table, fos)
+        # You may want to add statistics extraction here if needed
+        data_file = DataFile.from_args(
+            content=DataFileContent.DATA,
+            file_path=file_path,
+            file_format=FileFormat.ORC,
+            partition=task.partition_key.partition if task.partition_key else Record(),
+            file_size_in_bytes=len(fo),
+            sort_order_id=None,
+            spec_id=table_metadata.default_spec_id,
+            equality_ids=None,
+            key_metadata=None,
+            # statistics=... (if you implement ORC stats)
+        )
+        return data_file
+
     executor = ExecutorFactory.get_or_create()
-    data_files = executor.map(write_parquet, tasks)
+    def dispatch(task: WriteTask) -> DataFile:
+        file_format = FileFormat(table_metadata.properties.get(
+            TableProperties.WRITE_FILE_FORMAT,
+            TableProperties.WRITE_FILE_FORMAT_DEFAULT))
+        if file_format == FileFormat.PARQUET:
+            return write_parquet(task)
+        elif file_format == FileFormat.ORC:
+            return write_orc(task)
+        else:
+            raise ValueError(f"Unsupported file format: {file_format}")

+    data_files = executor.map(dispatch, tasks)
     return iter(data_files)
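For context on the read path above: _task_to_record_batches now picks the PyArrow dataset format from the data file's metadata instead of assuming Parquet. Below is a minimal, standalone sketch of the same PyArrow primitives (ds.OrcFileFormat, make_fragment) used outside of Iceberg; the /tmp/example.orc path and the toy table are illustrative assumptions, not part of this commit.

# Sketch only: write a small ORC file, then read it back through the same
# dataset-fragment API that _task_to_record_batches uses for ORC data files.
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.orc as orc

orc.write_table(pa.table({"id": [1, 2, 3]}), "/tmp/example.orc")  # hypothetical path

arrow_format = ds.OrcFileFormat()
with pa.memory_map("/tmp/example.orc", "r") as source:
    fragment = arrow_format.make_fragment(source)
    print(fragment.physical_schema)   # Arrow schema recovered from the ORC footer
    for batch in fragment.to_batches():
        print(batch.num_rows)

Note that ds.OrcFileFormat appears to take no constructor options comparable to Parquet's pre_buffer/buffer_size, which is presumably why the equivalent line is left commented out in the hunk above.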
pyiceberg/table/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -161,6 +161,9 @@ class UpsertResult:


 class TableProperties:
+    WRITE_FILE_FORMAT = "write.format.default"
+    WRITE_FILE_FORMAT_DEFAULT = "parquet"
+
     PARQUET_ROW_GROUP_SIZE_BYTES = "write.parquet.row-group-size-bytes"
     PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024  # 128 MB
tests/io/test_pyarrow.py

Lines changed: 100 additions & 0 deletions
@@ -28,6 +28,7 @@
 import pyarrow
 import pyarrow as pa
 import pyarrow.parquet as pq
+import pyarrow.orc as orc
 import pytest
 from packaging import version
 from pyarrow.fs import AwsDefaultS3RetryStrategy, FileType, LocalFileSystem, S3FileSystem

@@ -2638,3 +2639,102 @@ def test_retry_strategy_not_found() -> None:
     io = PyArrowFileIO(properties={S3_RETRY_STRATEGY_IMPL: "pyiceberg.DoesNotExist"})
     with pytest.warns(UserWarning, match="Could not initialize S3 retry strategy: pyiceberg.DoesNotExist"):
         io.new_input("s3://bucket/path/to/file")
+
+
+def test_write_and_read_orc(tmp_path):
+    # Create a simple Arrow table
+    data = pa.table({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
+    orc_path = tmp_path / 'test.orc'
+    orc.write_table(data, str(orc_path))
+    # Read it back
+    orc_file = orc.ORCFile(str(orc_path))
+    table_read = orc_file.read()
+    assert table_read.equals(data)
+
+
+def test_orc_file_format_integration(tmp_path):
+    # This test mimics a minimal integration with PyIceberg's FileFormat enum and pyarrow.orc
+    from pyiceberg.manifest import FileFormat
+    import pyarrow.dataset as ds
+    data = pa.table({'a': [10, 20], 'b': ['foo', 'bar']})
+    orc_path = tmp_path / 'iceberg.orc'
+    orc.write_table(data, str(orc_path))
+    # Use PyArrow dataset API to read as ORC
+    dataset = ds.dataset(str(orc_path), format=ds.OrcFileFormat())
+    table_read = dataset.to_table()
+    assert table_read.equals(data)
+
+
+def test_iceberg_write_and_read_orc(tmp_path):
+    """
+    Integration test: Write and read ORC via Iceberg API.
+    To run just this test:
+        pytest tests/io/test_pyarrow.py -k test_iceberg_write_and_read_orc
+    """
+    import pyarrow as pa
+    from pyiceberg.schema import Schema, NestedField
+    from pyiceberg.types import IntegerType, StringType
+    from pyiceberg.manifest import FileFormat, DataFileContent
+    from pyiceberg.table.metadata import TableMetadataV2
+    from pyiceberg.partitioning import PartitionSpec
+    from pyiceberg.io.pyarrow import write_file, PyArrowFileIO, ArrowScan
+    from pyiceberg.table import WriteTask, FileScanTask
+    import uuid
+
+    # Define schema and data
+    schema = Schema(
+        NestedField(1, "id", IntegerType(), required=True),
+        NestedField(2, "name", StringType(), required=False),
+    )
+    data = pa.table({"id": pa.array([1, 2, 3], type=pa.int32()), "name": ["a", "b", "c"]})
+
+    # Create table metadata
+    table_metadata = TableMetadataV2(
+        location=str(tmp_path),
+        last_column_id=2,
+        format_version=2,
+        schemas=[schema],
+        partition_specs=[PartitionSpec()],
+        properties={
+            "write.format.default": "orc",
+        }
+    )
+    io = PyArrowFileIO()
+
+    # Write ORC file using Iceberg API
+    write_uuid = uuid.uuid4()
+    tasks = [
+        WriteTask(
+            write_uuid=write_uuid,
+            task_id=0,
+            record_batches=data.to_batches(),
+            schema=schema,
+        )
+    ]
+    data_files = list(write_file(io, table_metadata, iter(tasks)))
+    assert len(data_files) == 1
+    data_file = data_files[0]
+    assert data_file.file_format == FileFormat.ORC
+    assert data_file.content == DataFileContent.DATA
+
+    # Read back using ArrowScan
+    scan = ArrowScan(
+        table_metadata=table_metadata,
+        io=io,
+        projected_schema=schema,
+        row_filter=AlwaysTrue(),
+        case_sensitive=True,
+    )
+    scan_task = FileScanTask(data_file=data_file)
+    table_read = scan.to_table([scan_task])
+
+    # Compare data ignoring schema metadata (like not null constraints)
+    assert table_read.num_rows == data.num_rows
+    assert table_read.num_columns == data.num_columns
+    assert table_read.column_names == data.column_names
+
+    # Compare actual column data values
+    for col_name in data.column_names:
+        original_values = data.column(col_name).to_pylist()
+        read_values = table_read.column(col_name).to_pylist()
+        assert original_values == read_values, f"Column {col_name} values don't match"
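One caveat about the new tests: pyarrow.orc is now imported at module level in test_pyarrow.py, so pyarrow builds that lack ORC support will fail at import time rather than skipping. A possible guard, not part of this commit, would be to resolve the module via pytest:

# Sketch only: skip the ORC tests instead of erroring when pyarrow has no ORC bindings.
import pytest

orc = pytest.importorskip("pyarrow.orc")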
