Skip to content

Commit 7f41565

Browse files
Modified exception when converting Pyarrow (#1498)
* Modified exception objects being thrown when converting PyArrow tables
* Added visit_pyarrow dispatch for pyarrow field
* Removed unnecessary code and modified testing
* Fixed integration test
* Moved UnsupportedPyArrowTypeException to pyarrow.py

---------

Signed-off-by: Christian Molina <[email protected]>
1 parent 5a3c346 commit 7f41565

File tree

3 files changed

+121
-14
lines changed

3 files changed

+121
-14
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,14 @@
189189
T = TypeVar("T")
190190

191191

192+
class UnsupportedPyArrowTypeException(Exception):
    """Raised when a PyArrow type has no corresponding Iceberg type.

    The offending ``pa.Field`` is kept on the exception so callers can
    inspect exactly which column failed to convert; any extra positional
    arguments (e.g. a human-readable message) are forwarded to ``Exception``.
    """

    def __init__(self, field: pa.Field, *args: Any):
        super().__init__(*args)
        # Keep a reference to the field that could not be converted.
        self.field = field
198+
199+
192200
class PyArrowLocalFileSystem(pyarrow.fs.LocalFileSystem):
193201
def open_output_stream(self, path: str, *args: Any, **kwargs: Any) -> pyarrow.NativeFile:
194202
# In LocalFileSystem, parent directories must be first created before opening an output stream
@@ -952,13 +960,7 @@ def _(obj: pa.Schema, visitor: PyArrowSchemaVisitor[T]) -> T:
952960

953961
@visit_pyarrow.register(pa.StructType)
954962
def _(obj: pa.StructType, visitor: PyArrowSchemaVisitor[T]) -> T:
955-
results = []
956-
957-
for field in obj:
958-
visitor.before_field(field)
959-
result = visit_pyarrow(field.type, visitor)
960-
results.append(visitor.field(field, result))
961-
visitor.after_field(field)
963+
results = [visit_pyarrow(field, visitor) for field in obj]
962964

963965
return visitor.struct(obj, results)
964966

@@ -996,6 +998,20 @@ def _(obj: pa.DictionaryType, visitor: PyArrowSchemaVisitor[T]) -> T:
996998
return visit_pyarrow(obj.value_type, visitor)
997999

9981000

1001+
@visit_pyarrow.register(pa.Field)
def _(obj: pa.Field, visitor: PyArrowSchemaVisitor[T]) -> T:
    """Visit a single PyArrow field, wrapping type-conversion failures.

    A ``TypeError`` raised while converting the field's type is re-raised
    as ``UnsupportedPyArrowTypeException`` carrying the offending field, so
    the failing column can be identified by name; the original ``TypeError``
    is preserved as ``__cause__``.
    """
    field_type = obj.type
    visitor.before_field(obj)
    try:
        result = visit_pyarrow(field_type, visitor)
    except TypeError as exc:
        raise UnsupportedPyArrowTypeException(obj, f"Column '{obj.name}' has an unsupported type: {field_type}") from exc
    else:
        visitor.after_field(obj)
        return visitor.field(obj, result)
1013+
1014+
9991015
@visit_pyarrow.register(pa.DataType)
10001016
def _(obj: pa.DataType, visitor: PyArrowSchemaVisitor[T]) -> T:
10011017
if pa.types.is_nested(obj):
@@ -1167,7 +1183,7 @@ def primitive(self, primitive: pa.DataType) -> PrimitiveType:
11671183
logger.warning("Iceberg does not yet support 'ns' timestamp precision. Downcasting to 'us'.")
11681184
else:
11691185
raise TypeError(
1170-
"Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-to-us-on-write' configuration property to automatically downcast 'ns' to 'us' on write."
1186+
"Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-to-us-on-write' configuration property to automatically downcast 'ns' to 'us' on write.",
11711187
)
11721188
else:
11731189
raise TypeError(f"Unsupported precision for timestamp type: {primitive.unit}")

tests/integration/test_add_files.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
from pyiceberg.catalog import Catalog
3131
from pyiceberg.exceptions import NoSuchTableError
3232
from pyiceberg.io import FileIO
33-
from pyiceberg.io.pyarrow import _pyarrow_schema_ensure_large_types
33+
from pyiceberg.io.pyarrow import UnsupportedPyArrowTypeException, _pyarrow_schema_ensure_large_types
3434
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec
3535
from pyiceberg.schema import Schema
3636
from pyiceberg.table import Table
@@ -616,13 +616,18 @@ def test_add_files_with_timestamp_tz_ns_fails(session_catalog: Catalog, format_v
616616

617617
# add the parquet files as data files
618618
with pytest.raises(
619-
TypeError,
620-
match=re.escape(
621-
"Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-to-us-on-write' configuration property to automatically downcast 'ns' to 'us' on write."
622-
),
623-
):
619+
UnsupportedPyArrowTypeException,
620+
match=re.escape("Column 'quux' has an unsupported type: timestamp[ns, tz=UTC]"),
621+
) as exc_info:
624622
tbl.add_files(file_paths=[file_path])
625623

624+
exception_cause = exc_info.value.__cause__
625+
assert isinstance(exception_cause, TypeError)
626+
assert (
627+
"Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-to-us-on-write' configuration property to automatically downcast 'ns' to 'us' on write."
628+
in exception_cause.args[0]
629+
)
630+
626631

627632
@pytest.mark.integration
628633
@pytest.mark.parametrize("format_version", [1, 2])

tests/io/test_pyarrow_visitor.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
)
3434
from pyiceberg.expressions.literals import literal
3535
from pyiceberg.io.pyarrow import (
36+
UnsupportedPyArrowTypeException,
3637
_ConvertToArrowSchema,
3738
_ConvertToIceberg,
3839
_ConvertToIcebergWithoutIDs,
@@ -625,6 +626,91 @@ def test_pyarrow_schema_ensure_large_types(pyarrow_schema_nested_without_ids: pa
625626
assert _pyarrow_schema_ensure_large_types(pyarrow_schema_nested_without_ids) == expected_schema
626627

627628

629+
def test_pyarrow_schema_unsupported_type() -> None:
    """Converting a schema with an unsupported PyArrow type raises
    UnsupportedPyArrowTypeException naming the offending column, keeps the
    offending field on the exception, and chains the original TypeError as
    __cause__."""
    # Case 1: unsupported primitive (decimal256) nested inside a
    # large_list<struct<...>> — the innermost failing field is reported.
    unsupported_field = pa.field("latitude", pa.decimal256(20, 26), nullable=False, metadata={"PARQUET:field_id": "2"})
    schema = pa.schema(
        [
            pa.field("foo", pa.string(), nullable=False, metadata={"PARQUET:field_id": "1"}),
            pa.field(
                "location",
                pa.large_list(
                    pa.field(
                        "item",
                        pa.struct(
                            [
                                unsupported_field,
                                pa.field("longitude", pa.float32(), nullable=False, metadata={"PARQUET:field_id": "3"}),
                            ]
                        ),
                        metadata={"PARQUET:field_id": "4"},
                    )
                ),
                nullable=False,
                metadata={"PARQUET:field_id": "5"},
            ),
        ],
        metadata={"PARQUET:field_id": "6"},
    )
    with pytest.raises(
        UnsupportedPyArrowTypeException, match=re.escape("Column 'latitude' has an unsupported type: decimal256(20, 26)")
    ) as exc_info:
        pyarrow_to_schema(schema)
    # The exception carries the exact pa.Field that failed to convert.
    assert exc_info.value.field == unsupported_field
    # The original TypeError is preserved via exception chaining.
    exception_cause = exc_info.value.__cause__
    assert isinstance(exception_cause, TypeError)
    assert "Unsupported type: decimal256(20, 26)" in exception_cause.args[0]

    # Case 2: unsupported value type buried inside nested maps — the
    # reported column is the top-level map field ('quux'), not the leaf.
    unsupported_field = pa.field(
        "quux",
        pa.map_(
            pa.field("key", pa.string(), nullable=False, metadata={"PARQUET:field_id": "2"}),
            pa.field(
                "value",
                pa.map_(
                    pa.field("key", pa.string(), nullable=False, metadata={"PARQUET:field_id": "5"}),
                    pa.field("value", pa.decimal256(2, 3), metadata={"PARQUET:field_id": "6"}),
                ),
                nullable=False,
                metadata={"PARQUET:field_id": "4"},
            ),
        ),
        nullable=False,
        metadata={"PARQUET:field_id": "3"},
    )
    schema = pa.schema(
        [
            pa.field("foo", pa.string(), nullable=False, metadata={"PARQUET:field_id": "1"}),
            unsupported_field,
        ]
    )
    with pytest.raises(
        UnsupportedPyArrowTypeException,
        match=re.escape("Column 'quux' has an unsupported type: map<string, map<string, decimal256(2, 3)>>"),
    ) as exc_info:
        pyarrow_to_schema(schema)
    assert exc_info.value.field == unsupported_field
    exception_cause = exc_info.value.__cause__
    assert isinstance(exception_cause, TypeError)
    assert "Unsupported type: decimal256(2, 3)" in exception_cause.args[0]

    # Case 3: top-level field with 'ns' timestamp precision — the cause is
    # the TypeError suggesting the downcast-on-write configuration property.
    unsupported_field = pa.field("foo", pa.timestamp(unit="ns"), nullable=False, metadata={"PARQUET:field_id": "1"})
    schema = pa.schema(
        [
            unsupported_field,
            pa.field("bar", pa.int32(), nullable=False, metadata={"PARQUET:field_id": "2"}),
        ]
    )
    with pytest.raises(
        UnsupportedPyArrowTypeException,
        match=re.escape("Column 'foo' has an unsupported type: timestamp[ns]"),
    ) as exc_info:
        pyarrow_to_schema(schema)
    assert exc_info.value.field == unsupported_field
    exception_cause = exc_info.value.__cause__
    assert isinstance(exception_cause, TypeError)
    assert "Iceberg does not yet support 'ns' timestamp precision" in exception_cause.args[0]
712+
713+
628714
def test_pyarrow_schema_round_trip_ensure_large_types_and_then_small_types(pyarrow_schema_nested_without_ids: pa.Schema) -> None:
629715
schema_with_large_types = _pyarrow_schema_ensure_large_types(pyarrow_schema_nested_without_ids)
630716
assert _pyarrow_schema_ensure_small_types(schema_with_large_types) == pyarrow_schema_nested_without_ids

0 commit comments

Comments
 (0)