@@ -504,7 +504,7 @@ def field(self, field: NestedField, field_result: pa.DataType) -> pa.Field:

    def list(self, list_type: ListType, element_result: pa.DataType) -> pa.DataType:
        element_field = self.field(list_type.element_field, element_result)
-        return pa.list_(value_type=element_field)
+        return pa.large_list(value_type=element_field)

    def map(self, map_type: MapType, key_result: pa.DataType, value_result: pa.DataType) -> pa.DataType:
        key_field = self.field(map_type.key_field, key_result)
@@ -548,7 +548,7 @@ def visit_timestamptz(self, _: TimestamptzType) -> pa.DataType:
        return pa.timestamp(unit="us", tz="UTC")

    def visit_string(self, _: StringType) -> pa.DataType:
-        return pa.string()
+        return pa.large_string()

    def visit_uuid(self, _: UUIDType) -> pa.DataType:
        return pa.binary(16)
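
The switch to the large Arrow variants in the two hunks above comes down to offset width: the plain types index their value buffers with 32-bit offsets, the large types with 64-bit offsets, so a single array can address far more variable-length data. A standalone sketch (not part of the diff) that makes the difference visible:

import pyarrow as pa

# Plain list arrays carry int32 offsets; large_list arrays carry int64 offsets.
small = pa.array([[1, 2], [3]], type=pa.list_(pa.int64()))
large = pa.array([[1, 2], [3]], type=pa.large_list(pa.int64()))
print(small.offsets.type)  # int32
print(large.offsets.type)  # int64

# The same distinction applies to string/large_string and binary/large_binary,
# whose variable-length data buffers are likewise indexed by int32 vs int64 offsets.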
@@ -680,6 +680,10 @@ def _pyarrow_to_schema_without_ids(schema: pa.Schema) -> Schema:
    return visit_pyarrow(schema, _ConvertToIcebergWithoutIDs())


+def _pyarrow_schema_ensure_large_types(schema: pa.Schema) -> pa.Schema:
+    return visit_pyarrow(schema, _ConvertToLargeTypes())
+
+
@singledispatch
def visit_pyarrow(obj: Union[pa.DataType, pa.Schema], visitor: PyArrowSchemaVisitor[T]) -> T:
    """Apply a pyarrow schema visitor to any point within a schema.
@@ -952,6 +956,30 @@ def after_map_value(self, element: pa.Field) -> None:
        self._field_names.pop()


+class _ConvertToLargeTypes(PyArrowSchemaVisitor[Union[pa.DataType, pa.Schema]]):
+    def schema(self, schema: pa.Schema, struct_result: pa.StructType) -> pa.Schema:
+        return pa.schema(struct_result)
+
+    def struct(self, struct: pa.StructType, field_results: List[pa.Field]) -> pa.StructType:
+        return pa.struct(field_results)
+
+    def field(self, field: pa.Field, field_result: pa.DataType) -> pa.Field:
+        return field.with_type(field_result)
+
+    def list(self, list_type: pa.ListType, element_result: pa.DataType) -> pa.DataType:
+        return pa.large_list(element_result)
+
+    def map(self, map_type: pa.MapType, key_result: pa.DataType, value_result: pa.DataType) -> pa.DataType:
+        return pa.map_(key_result, value_result)
+
+    def primitive(self, primitive: pa.DataType) -> pa.DataType:
+        if primitive == pa.string():
+            return pa.large_string()
+        elif primitive == pa.binary():
+            return pa.large_binary()
+        return primitive
+
+
class _ConvertToIcebergWithoutIDs(_ConvertToIceberg):
    """
    Converts PyArrowSchema to Iceberg Schema with all -1 ids.
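
Roughly, driving this visitor through the new `_pyarrow_schema_ensure_large_types` helper maps every small variable-length type in a schema to its large counterpart and leaves everything else alone. A sketch of the effect with plain PyArrow types (the field names here are invented, and nested element types get the same treatment):

import pyarrow as pa

physical = pa.schema([
    pa.field("name", pa.string()),
    pa.field("payload", pa.binary()),
    pa.field("tags", pa.list_(pa.string())),
    pa.field("id", pa.int64()),
])

# After the conversion the schema looks roughly like this:
enlarged = pa.schema([
    pa.field("name", pa.large_string()),
    pa.field("payload", pa.large_binary()),
    pa.field("tags", pa.large_list(pa.large_string())),
    pa.field("id", pa.int64()),  # non-string/binary/list primitives are unchanged
])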
@@ -998,7 +1026,9 @@ def _task_to_table(

    fragment_scanner = ds.Scanner.from_fragment(
        fragment=fragment,
-        schema=physical_schema,
+        # We always use large types in memory as it uses larger offsets
+        # that can chunk more row values into the buffers
+        schema=_pyarrow_schema_ensure_large_types(physical_schema),
        # This will push down the query to Arrow.
        # But in case there are positional deletes, we have to apply them first
        filter=pyarrow_filter if not positional_deletes else None,
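
The hunk above relies on the Arrow scanner honouring the requested schema, so reading a file whose physical columns are plain string yields large_string columns in memory. A minimal, self-contained sketch of that behaviour (the file path and column name are invented for the example):

import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

# Write a tiny Parquet file whose Arrow type is plain string.
pq.write_table(pa.table({"s": ["a", "bb", "ccc"]}), "/tmp/large_types_demo.parquet")

dataset = ds.dataset("/tmp/large_types_demo.parquet", format="parquet")
fragment = next(dataset.get_fragments())

# Asking the scanner for a large_string schema is what
# _pyarrow_schema_ensure_large_types(physical_schema) does above.
scanner = ds.Scanner.from_fragment(
    fragment=fragment,
    schema=pa.schema([pa.field("s", pa.large_string())]),
)
print(scanner.to_table().schema)  # s: large_string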
@@ -1167,8 +1197,14 @@ def __init__(self, file_schema: Schema):

    def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
        file_field = self.file_schema.find_field(field.field_id)
-        if field.field_type.is_primitive and field.field_type != file_field.field_type:
-            return values.cast(schema_to_pyarrow(promote(file_field.field_type, field.field_type), include_field_ids=False))
+        if field.field_type.is_primitive:
+            if field.field_type != file_field.field_type:
+                return values.cast(schema_to_pyarrow(promote(file_field.field_type, field.field_type), include_field_ids=False))
+            elif (target_type := schema_to_pyarrow(field.field_type, include_field_ids=False)) != values.type:
+                # If file_field and field_type (e.g. String) are the same,
+                # but the pyarrow type of the array differs from the expected type
+                # (e.g. string vs large_string), cast the array to the larger type.
+                return values.cast(target_type)
        return values

    def _construct_field(self, field: NestedField, arrow_type: pa.DataType) -> pa.Field:
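
The new `elif` branch handles the case where the Iceberg types match but the in-memory Arrow types differ only in offset width; the fix is a plain Arrow cast, e.g.:

import pyarrow as pa

values = pa.array(["a", "bb", "ccc"], type=pa.string())
casted = values.cast(pa.large_string())
print(casted.type)  # large_string; same values, only the offset width changes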
@@ -1207,13 +1243,13 @@ def field(self, field: NestedField, _: Optional[pa.Array], field_array: Optional
        return field_array

    def list(self, list_type: ListType, list_array: Optional[pa.Array], value_array: Optional[pa.Array]) -> Optional[pa.Array]:
-        if isinstance(list_array, pa.ListArray) and value_array is not None:
+        if isinstance(list_array, (pa.ListArray, pa.LargeListArray, pa.FixedSizeListArray)) and value_array is not None:
            if isinstance(value_array, pa.StructArray):
                # This can be removed once this has been fixed:
                # https://github.com/apache/arrow/issues/38809
-                list_array = pa.ListArray.from_arrays(list_array.offsets, value_array)
+                list_array = pa.LargeListArray.from_arrays(list_array.offsets, value_array)

-            arrow_field = pa.list_(self._construct_field(list_type.element_field, value_array.type))
+            arrow_field = pa.large_list(self._construct_field(list_type.element_field, value_array.type))
            return list_array.cast(arrow_field)
        else:
            return None
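
The widened isinstance checks here (and in the partner lookup further down) account for the projected data arriving as any of Arrow's list flavours. A small sketch:

import pyarrow as pa

plain = pa.array([[1, 2], [3]], type=pa.list_(pa.int64()))
large = pa.array([[1, 2], [3]], type=pa.large_list(pa.int64()))
fixed = pa.array([[1, 2], [3, 4]], type=pa.list_(pa.int64(), 2))

for arr in (plain, large, fixed):
    print(type(arr).__name__, isinstance(arr, (pa.ListArray, pa.LargeListArray, pa.FixedSizeListArray)))
# ListArray True, LargeListArray True, FixedSizeListArray True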
@@ -1263,7 +1299,7 @@ def field_partner(self, partner_struct: Optional[pa.Array], field_id: int, _: st
        return None

    def list_element_partner(self, partner_list: Optional[pa.Array]) -> Optional[pa.Array]:
-        return partner_list.values if isinstance(partner_list, pa.ListArray) else None
+        return partner_list.values if isinstance(partner_list, (pa.ListArray, pa.LargeListArray, pa.FixedSizeListArray)) else None

    def map_key_partner(self, partner_map: Optional[pa.Array]) -> Optional[pa.Array]:
        return partner_map.keys if isinstance(partner_map, pa.MapArray) else None
@@ -1800,10 +1836,10 @@ def write_parquet(task: WriteTask) -> DataFile:
        # otherwise use the original schema
        if (sanitized_schema := sanitize_column_names(table_schema)) != table_schema:
            file_schema = sanitized_schema
-            arrow_table = to_requested_schema(requested_schema=file_schema, file_schema=table_schema, table=arrow_table)
        else:
            file_schema = table_schema

+        arrow_table = to_requested_schema(requested_schema=file_schema, file_schema=table_schema, table=arrow_table)
        file_path = f'{table_metadata.location}/data/{task.generate_data_file_path("parquet")}'
        fo = io.new_output(file_path)
        with fo.create(overwrite=True) as fos: