
Commit bbb1c25

Fokko and dingo4dev authored
Fix UUID support (#2007)
# Rationale for this change

The UUID support is a gift that keeps on giving. PyIceberg's current support is incomplete and problematic, mostly because:

- It is an extension type in Arrow, which means it is not fully supported: apache/arrow#46469, apache/arrow#46468
- It has no native support in Spark, where it is converted into a string. This limits the current tests, which are mostly Spark-based.

I think we have to wait for some upstream Arrow fixes before we can fully support this. In PyIceberg, we're converting the `fixed[16]` to a `UUID`, but Spark errors because the logical type annotation in Parquet is missing:

```
E   py4j.protocol.Py4JJavaError: An error occurred while calling o72.collectToPython.
E   : org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 1 times, most recent failure: Lost task 0.0 in stage 1.0 (TID 1) (localhost executor driver): java.lang.UnsupportedOperationException: Unsupported type: UTF8String
E       at org.apache.iceberg.arrow.vectorized.ArrowVectorAccessor.getUTF8String(ArrowVectorAccessor.java:81)
E       at org.apache.iceberg.spark.data.vectorized.IcebergArrowColumnVector.getUTF8String(IcebergArrowColumnVector.java:143)
E       at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
E       at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
E       at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
E       at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
E       at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
E       at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
E       at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
E       at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
E       at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
E       at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
E       at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
E       at org.apache.spark.scheduler.Task.run(Task.scala:141)
E       at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
E       at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
E       at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
E       at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
E       at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
E       at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
E       at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
E       at java.base/java.lang.Thread.run(Thread.java:829)
E
E   Driver stacktrace:
E       at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
E       at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
E       at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
E       at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
E       at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
E       at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
E       at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
E       at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
E       at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
E       at scala.Option.foreach(Option.scala:407)
E       at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
E       at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
E       at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
E       at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
E       at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
E       at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
E       at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
E       at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
E       at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
E       at org.apache.spark.SparkContext.runJob(SparkContext.scala:2458)
E       at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1049)
E       at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
E       at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
E       at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
E       at org.apache.spark.rdd.RDD.collect(RDD.scala:1048)
E       at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:448)
E       at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:4149)
E       at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4323)
E       at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
E       at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4321)
E       at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
E       at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
E       at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
E       at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
E       at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
E       at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4321)
E       at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:4146)
E       at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
E       at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
E       at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
E       at java.base/java.lang.reflect.Method.invoke(Method.java:566)
E       at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
E       at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
E       at py4j.Gateway.invoke(Gateway.java:282)
E       at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
E       at py4j.commands.CallCommand.execute(CallCommand.java:79)
E       at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
E       at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
E       at java.base/java.lang.Thread.run(Thread.java:829)
E   Caused by: java.lang.UnsupportedOperationException: Unsupported type: UTF8String
E       at org.apache.iceberg.arrow.vectorized.ArrowVectorAccessor.getUTF8String(ArrowVectorAccessor.java:81)
E       at org.apache.iceberg.spark.data.vectorized.IcebergArrowColumnVector.getUTF8String(IcebergArrowColumnVector.java:143)
E       at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
E       at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
E       at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
E       at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
E       at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
E       at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
E       at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
E       at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
E       at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
E       at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
E       at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
E       at org.apache.spark.scheduler.Task.run(Task.scala:141)
E       at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
E       at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
E       at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
E       at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
E       at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
E       at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
E       at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
E   ... 1 more
```

# Are these changes tested?

# Are there any user-facing changes?

Closes #1986
Closes #2002

---------

Co-authored-by: DinGo4DEV <[email protected]>
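For context on the "extension type" point above, a minimal sketch (assumes PyArrow >= 18, where `pa.uuid()` is available); this code is not part of the change itself:

```python
import pyarrow as pa

# pa.uuid() is a canonical *extension* type layered on top of fixed_size_binary(16).
# Readers that understand the extension can surface uuid.UUID values; readers that
# don't just see 16-byte blobs, which is why support upstream is still uneven
# (see the linked apache/arrow issues).
uuid_type = pa.uuid()
print(uuid_type.storage_type)              # fixed_size_binary[16]
print(isinstance(uuid_type, pa.UuidType))  # True
```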
1 parent 2b9f9e2 commit bbb1c25

File tree

8 files changed: +88 −35 lines changed

pyiceberg/avro/writer.py

Lines changed: 6 additions & 2 deletions

@@ -32,6 +32,7 @@
     List,
     Optional,
     Tuple,
+    Union,
 )
 from uuid import UUID

@@ -121,8 +122,11 @@ def write(self, encoder: BinaryEncoder, val: Any) -> None:

 @dataclass(frozen=True)
 class UUIDWriter(Writer):
-    def write(self, encoder: BinaryEncoder, val: UUID) -> None:
-        encoder.write(val.bytes)
+    def write(self, encoder: BinaryEncoder, val: Union[UUID, bytes]) -> None:
+        if isinstance(val, UUID):
+            encoder.write(val.bytes)
+        else:
+            encoder.write(val)


 @dataclass(frozen=True)

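A hypothetical helper, not part of the change, that mirrors the branch added to `UUIDWriter` above: values may now arrive either as `uuid.UUID` objects or already as their 16-byte serialized form, and both must end up as the same raw bytes on the Avro encoder.

```python
from typing import Union
from uuid import UUID


def to_uuid_bytes(val: Union[UUID, bytes]) -> bytes:
    """Illustrative only: normalize a UUID or its 16-byte form to raw bytes,
    the same way UUIDWriter.write() now does before calling encoder.write()."""
    return val.bytes if isinstance(val, UUID) else val


u = UUID("f47ac10b-58cc-4372-a567-0e02b2c3d479")
assert to_uuid_bytes(u) == to_uuid_bytes(u.bytes) == u.bytes
```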
pyiceberg/io/pyarrow.py

Lines changed: 3 additions & 1 deletion

@@ -746,7 +746,7 @@ def visit_string(self, _: StringType) -> pa.DataType:
         return pa.large_string()

     def visit_uuid(self, _: UUIDType) -> pa.DataType:
-        return pa.binary(16)
+        return pa.uuid()

     def visit_unknown(self, _: UnknownType) -> pa.DataType:
         return pa.null()
@@ -1307,6 +1307,8 @@ def primitive(self, primitive: pa.DataType) -> PrimitiveType:
             return FixedType(primitive.byte_width)
         elif pa.types.is_null(primitive):
             return UnknownType()
+        elif isinstance(primitive, pa.UuidType):
+            return UUIDType()

         raise TypeError(f"Unsupported type: {primitive}")


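A rough sketch of what the two-way mapping above implies, assuming `pyiceberg.io.pyarrow.schema_to_pyarrow` and a PyArrow version with `pa.uuid()`; illustrative, not taken from the test suite:

```python
import pyarrow as pa

from pyiceberg.io.pyarrow import schema_to_pyarrow
from pyiceberg.schema import Schema
from pyiceberg.types import NestedField, UUIDType

# With this change, the Iceberg UUID type should surface as the Arrow UUID
# extension type instead of plain fixed_size_binary(16), and an Arrow UuidType
# should convert back to UUIDType rather than raising "Unsupported type".
iceberg_schema = Schema(NestedField(field_id=1, name="uuid", field_type=UUIDType(), required=False))
arrow_schema = schema_to_pyarrow(iceberg_schema)
assert isinstance(arrow_schema.field("uuid").type, pa.UuidType)
```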
pyiceberg/partitioning.py

Lines changed: 11 additions & 2 deletions

@@ -467,8 +467,17 @@ def _(type: IcebergType, value: Optional[time]) -> Optional[int]:


 @_to_partition_representation.register(UUIDType)
-def _(type: IcebergType, value: Optional[uuid.UUID]) -> Optional[str]:
-    return str(value) if value is not None else None
+def _(type: IcebergType, value: Optional[Union[uuid.UUID, int, bytes]]) -> Optional[Union[bytes, int]]:
+    if value is None:
+        return None
+    elif isinstance(value, bytes):
+        return value  # IdentityTransform
+    elif isinstance(value, uuid.UUID):
+        return value.bytes  # IdentityTransform
+    elif isinstance(value, int):
+        return value  # BucketTransform
+    else:
+        raise ValueError(f"Type not recognized: {value}")


 @_to_partition_representation.register(PrimitiveType)

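The dispatch above is easiest to read as a table of accepted inputs. A small sketch of the behavior it defines (`_to_partition_representation` is a private helper, so treat this as illustrative only):

```python
import uuid

from pyiceberg.partitioning import _to_partition_representation
from pyiceberg.types import UUIDType

u = uuid.UUID("f47ac10b-58cc-4372-a567-0e02b2c3d479")

assert _to_partition_representation(UUIDType(), u) == u.bytes        # IdentityTransform, UUID object
assert _to_partition_representation(UUIDType(), u.bytes) == u.bytes  # IdentityTransform, already bytes
assert _to_partition_representation(UUIDType(), 7) == 7              # BucketTransform output (an int)
assert _to_partition_representation(UUIDType(), None) is None
```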
tests/conftest.py

Lines changed: 5 additions & 2 deletions

@@ -2827,7 +2827,7 @@ def pyarrow_schema_with_promoted_types() -> "pa.Schema":
             pa.field("list", pa.list_(pa.int32()), nullable=False),  # can support upcasting integer to long
             pa.field("map", pa.map_(pa.string(), pa.int32()), nullable=False),  # can support upcasting integer to long
             pa.field("double", pa.float32(), nullable=True),  # can support upcasting float to double
-            pa.field("uuid", pa.binary(length=16), nullable=True),  # can support upcasting float to double
+            pa.field("uuid", pa.binary(length=16), nullable=True),  # can support upcasting fixed to uuid
         )
     )

@@ -2843,7 +2843,10 @@ def pyarrow_table_with_promoted_types(pyarrow_schema_with_promoted_types: "pa.Sc
             "list": [[1, 1], [2, 2]],
             "map": [{"a": 1}, {"b": 2}],
             "double": [1.1, 9.2],
-            "uuid": [b"qZx\xefNS@\x89\x9b\xf9:\xd0\xee\x9b\xf5E", b"\x97]\x87T^JDJ\x96\x97\xf4v\xe4\x03\x0c\xde"],
+            "uuid": [
+                uuid.UUID("00000000-0000-0000-0000-000000000000").bytes,
+                uuid.UUID("11111111-1111-1111-1111-111111111111").bytes,
+            ],
         },
         schema=pyarrow_schema_with_promoted_types,
     )

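A quick check of why the fixture swaps the opaque byte literals for `uuid.UUID(...).bytes`: the values stay 16-byte fixed-size binaries, they are just readable now. Illustrative only:

```python
import uuid

# .bytes yields the big-endian 16-byte representation of the UUID,
# i.e. exactly the kind of value pa.binary(length=16) expects.
val = uuid.UUID("00000000-0000-0000-0000-000000000000").bytes
assert val == b"\x00" * 16
assert len(uuid.UUID("11111111-1111-1111-1111-111111111111").bytes) == 16
```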
tests/integration/test_add_files.py

Lines changed: 2 additions & 2 deletions

@@ -737,7 +737,7 @@ def test_add_files_with_valid_upcast(
         with pq.ParquetWriter(fos, schema=pyarrow_schema_with_promoted_types) as writer:
             writer.write_table(pyarrow_table_with_promoted_types)

-    tbl.add_files(file_paths=[file_path])
+    tbl.add_files(file_paths=[file_path], check_duplicate_files=False)
     # table's long field should cast to long on read
     written_arrow_table = tbl.scan().to_arrow()
     assert written_arrow_table == pyarrow_table_with_promoted_types.cast(
@@ -747,7 +747,7 @@ def test_add_files_with_valid_upcast(
                 pa.field("list", pa.list_(pa.int64()), nullable=False),
                 pa.field("map", pa.map_(pa.string(), pa.int64()), nullable=False),
                 pa.field("double", pa.float64(), nullable=True),
-                pa.field("uuid", pa.binary(length=16), nullable=True),  # can UUID is read as fixed length binary of length 16
+                pa.field("uuid", pa.uuid(), nullable=True),
            )
        )
    )

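For reference, a hedged usage sketch of the call being changed: `add_files` registers already-written Parquet files with an Iceberg table without rewriting them, and `check_duplicate_files=False` skips PyIceberg's duplicate-file validation for the given paths. The catalog, table, and file path below are made up for illustration:

```python
from pyiceberg.catalog import load_catalog

# Illustrative only: names and the Parquet path are hypothetical.
catalog = load_catalog("default")
tbl = catalog.load_table("default.test_add_files_with_valid_upcast")
tbl.add_files(
    file_paths=["s3://warehouse/db/tbl/data/file-0.parquet"],
    check_duplicate_files=False,  # default is True, which validates the paths before committing
)
```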
tests/integration/test_partitioning_key.py

Lines changed: 0 additions & 20 deletions

@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint:disable=redefined-outer-name
-import uuid
 from datetime import date, datetime, timedelta, timezone
 from decimal import Decimal
 from typing import Any, List
@@ -308,25 +307,6 @@
             (CAST('2023-01-01' AS DATE), 'Associated string value for date 2023-01-01')
             """,
         ),
-        (
-            [PartitionField(source_id=14, field_id=1001, transform=IdentityTransform(), name="uuid_field")],
-            [uuid.UUID("f47ac10b-58cc-4372-a567-0e02b2c3d479")],
-            Record("f47ac10b-58cc-4372-a567-0e02b2c3d479"),
-            "uuid_field=f47ac10b-58cc-4372-a567-0e02b2c3d479",
-            f"""CREATE TABLE {identifier} (
-                uuid_field string,
-                string_field string
-            )
-            USING iceberg
-            PARTITIONED BY (
-                identity(uuid_field)
-            )
-            """,
-            f"""INSERT INTO {identifier}
-            VALUES
-            ('f47ac10b-58cc-4372-a567-0e02b2c3d479', 'Associated string value for UUID f47ac10b-58cc-4372-a567-0e02b2c3d479')
-            """,
-        ),
         (
             [PartitionField(source_id=11, field_id=1001, transform=IdentityTransform(), name="binary_field")],
             [b"example"],

tests/integration/test_reads.py

Lines changed: 4 additions & 4 deletions

@@ -589,15 +589,15 @@ def test_partitioned_tables(catalog: Catalog) -> None:
 def test_unpartitioned_uuid_table(catalog: Catalog) -> None:
     unpartitioned_uuid = catalog.load_table("default.test_uuid_and_fixed_unpartitioned")
     arrow_table_eq = unpartitioned_uuid.scan(row_filter="uuid_col == '102cb62f-e6f8-4eb0-9973-d9b012ff0967'").to_arrow()
-    assert arrow_table_eq["uuid_col"].to_pylist() == [uuid.UUID("102cb62f-e6f8-4eb0-9973-d9b012ff0967").bytes]
+    assert arrow_table_eq["uuid_col"].to_pylist() == [uuid.UUID("102cb62f-e6f8-4eb0-9973-d9b012ff0967")]

     arrow_table_neq = unpartitioned_uuid.scan(
         row_filter="uuid_col != '102cb62f-e6f8-4eb0-9973-d9b012ff0967' and uuid_col != '639cccce-c9d2-494a-a78c-278ab234f024'"
     ).to_arrow()
     assert arrow_table_neq["uuid_col"].to_pylist() == [
-        uuid.UUID("ec33e4b2-a834-4cc3-8c4a-a1d3bfc2f226").bytes,
-        uuid.UUID("c1b0d8e0-0b0e-4b1e-9b0a-0e0b0d0c0a0b").bytes,
-        uuid.UUID("923dae77-83d6-47cd-b4b0-d383e64ee57e").bytes,
+        uuid.UUID("ec33e4b2-a834-4cc3-8c4a-a1d3bfc2f226"),
+        uuid.UUID("c1b0d8e0-0b0e-4b1e-9b0a-0e0b0d0c0a0b"),
+        uuid.UUID("923dae77-83d6-47cd-b4b0-d383e64ee57e"),
     ]


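The assertions above change because scanned UUID columns now carry the Arrow UUID extension type, so `to_pylist()` yields `uuid.UUID` objects rather than raw 16-byte values. A standalone illustration, assuming a PyArrow version whose UUID extension scalar converts to `uuid.UUID`:

```python
import uuid

import pyarrow as pa

# Wrap fixed-size binary storage in the UUID extension type and convert to Python.
storage = pa.array([uuid.UUID("102cb62f-e6f8-4eb0-9973-d9b012ff0967").bytes], type=pa.binary(16))
uuid_arr = pa.ExtensionArray.from_storage(pa.uuid(), storage)
print(uuid_arr.to_pylist())  # [UUID('102cb62f-e6f8-4eb0-9973-d9b012ff0967')] on recent PyArrow
```
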
tests/integration/test_writes/test_writes.py

Lines changed: 57 additions & 2 deletions

@@ -19,6 +19,7 @@
 import os
 import random
 import time
+import uuid
 from datetime import date, datetime, timedelta
 from decimal import Decimal
 from pathlib import Path
@@ -49,7 +50,7 @@
 from pyiceberg.table import TableProperties
 from pyiceberg.table.refs import MAIN_BRANCH
 from pyiceberg.table.sorting import SortDirection, SortField, SortOrder
-from pyiceberg.transforms import DayTransform, HourTransform, IdentityTransform
+from pyiceberg.transforms import DayTransform, HourTransform, IdentityTransform, Transform
 from pyiceberg.types import (
     DateType,
     DecimalType,
@@ -59,6 +60,7 @@
     LongType,
     NestedField,
     StringType,
+    UUIDType,
 )
 from utils import _create_table

@@ -1286,7 +1288,7 @@ def test_table_write_schema_with_valid_upcast(
                 pa.field("list", pa.list_(pa.int64()), nullable=False),
                 pa.field("map", pa.map_(pa.string(), pa.int64()), nullable=False),
                 pa.field("double", pa.float64(), nullable=True),  # can support upcasting float to double
-                pa.field("uuid", pa.binary(length=16), nullable=True),  # can UUID is read as fixed length binary of length 16
+                pa.field("uuid", pa.uuid(), nullable=True),
            )
        )
    )
@@ -1858,6 +1860,59 @@ def test_read_write_decimals(session_catalog: Catalog) -> None:
     assert tbl.scan().to_arrow() == arrow_table


+@pytest.mark.integration
+@pytest.mark.parametrize(
+    "transform",
+    [
+        IdentityTransform(),
+        # Bucket is disabled because of an issue in Iceberg Java:
+        # https://github.com/apache/iceberg/pull/13324
+        # BucketTransform(32)
+    ],
+)
+def test_uuid_partitioning(session_catalog: Catalog, spark: SparkSession, transform: Transform) -> None:  # type: ignore
+    identifier = f"default.test_uuid_partitioning_{str(transform).replace('[32]', '')}"
+
+    schema = Schema(NestedField(field_id=1, name="uuid", field_type=UUIDType(), required=True))
+
+    try:
+        session_catalog.drop_table(identifier=identifier)
+    except NoSuchTableError:
+        pass
+
+    partition_spec = PartitionSpec(PartitionField(source_id=1, field_id=1000, transform=transform, name="uuid_identity"))
+
+    import pyarrow as pa
+
+    arr_table = pa.Table.from_pydict(
+        {
+            "uuid": [
+                uuid.UUID("00000000-0000-0000-0000-000000000000").bytes,
+                uuid.UUID("11111111-1111-1111-1111-111111111111").bytes,
+            ],
+        },
+        schema=pa.schema(
+            [
+                # Uuid not yet supported, so we have to stick with `binary(16)`
+                # https://github.com/apache/arrow/issues/46468
+                pa.field("uuid", pa.binary(16), nullable=False),
+            ]
+        ),
+    )
+
+    tbl = session_catalog.create_table(
+        identifier=identifier,
+        schema=schema,
+        partition_spec=partition_spec,
+    )
+
+    tbl.append(arr_table)
+
+    lhs = [r[0] for r in spark.table(identifier).collect()]
+    rhs = [str(u.as_py()) for u in tbl.scan().to_arrow()["uuid"].combine_chunks()]
+    assert lhs == rhs
+
+
 @pytest.mark.integration
 def test_avro_compression_codecs(session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
     identifier = "default.test_avro_compression_codecs"

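A note on the final comparison in `test_uuid_partitioning`: as the rationale says, Spark has no native UUID type and surfaces the column as strings, so the Arrow-side `uuid.UUID` values are stringified before comparing. A trivial sketch of the equivalence the test relies on:

```python
import uuid

# str() of a uuid.UUID produces the canonical hyphenated form, which is what the
# Spark side of the comparison is expected to return for the same rows.
u = uuid.UUID("11111111-1111-1111-1111-111111111111")
assert str(u) == "11111111-1111-1111-1111-111111111111"
```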