    HDFS_KERB_TICKET,
    HDFS_PORT,
    HDFS_USER,
+    PYARROW_USE_LARGE_TYPES_ON_READ,
    S3_ACCESS_KEY_ID,
    S3_CONNECT_TIMEOUT,
    S3_ENDPOINT,
@@ -158,7 +159,7 @@
from pyiceberg.utils.config import Config
from pyiceberg.utils.datetime import millis_to_datetime
from pyiceberg.utils.deprecated import deprecated
-from pyiceberg.utils.properties import get_first_property_value, property_as_int
+from pyiceberg.utils.properties import get_first_property_value, property_as_bool, property_as_int
from pyiceberg.utils.singleton import Singleton
from pyiceberg.utils.truncate import truncate_upper_bound_binary_string, truncate_upper_bound_text_string

@@ -835,6 +836,10 @@ def _pyarrow_schema_ensure_large_types(schema: pa.Schema) -> pa.Schema:
    return visit_pyarrow(schema, _ConvertToLargeTypes())


+def _pyarrow_schema_ensure_small_types(schema: pa.Schema) -> pa.Schema:
+    return visit_pyarrow(schema, _ConvertToSmallTypes())
+
+
@singledispatch
def visit_pyarrow(obj: Union[pa.DataType, pa.Schema], visitor: PyArrowSchemaVisitor[T]) -> T:
    """Apply a pyarrow schema visitor to any point within a schema.
@@ -876,7 +881,6 @@ def _(obj: Union[pa.ListType, pa.LargeListType, pa.FixedSizeListType], visitor:
    visitor.before_list_element(obj.value_field)
    result = visit_pyarrow(obj.value_type, visitor)
    visitor.after_list_element(obj.value_field)
-
    return visitor.list(obj, result)


@@ -1145,6 +1149,30 @@ def primitive(self, primitive: pa.DataType) -> pa.DataType:
        return primitive


+class _ConvertToSmallTypes(PyArrowSchemaVisitor[Union[pa.DataType, pa.Schema]]):
+    def schema(self, schema: pa.Schema, struct_result: pa.StructType) -> pa.Schema:
+        return pa.schema(struct_result)
+
+    def struct(self, struct: pa.StructType, field_results: List[pa.Field]) -> pa.StructType:
+        return pa.struct(field_results)
+
+    def field(self, field: pa.Field, field_result: pa.DataType) -> pa.Field:
+        return field.with_type(field_result)
+
+    def list(self, list_type: pa.ListType, element_result: pa.DataType) -> pa.DataType:
+        return pa.list_(element_result)
+
+    def map(self, map_type: pa.MapType, key_result: pa.DataType, value_result: pa.DataType) -> pa.DataType:
+        return pa.map_(key_result, value_result)
+
+    def primitive(self, primitive: pa.DataType) -> pa.DataType:
+        if primitive == pa.large_string():
+            return pa.string()
+        elif primitive == pa.large_binary():
+            return pa.binary()
+        return primitive
+
+
class _ConvertToIcebergWithoutIDs(_ConvertToIceberg):
    """
    Converts PyArrowSchema to Iceberg Schema with all -1 ids.
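Note: the _ConvertToSmallTypes visitor added above is the mirror image of _ConvertToLargeTypes: it walks a PyArrow schema and rewrites large_string/large_binary primitives (and, through its list and map handlers, any large list layouts) to their non-large counterparts. A standalone sketch of the same downcast in plain PyArrow, with made-up field names purely for illustration:

import pyarrow as pa

# A schema as a reader might see it when large types are in use (illustrative only).
large = pa.schema([
    ("name", pa.large_string()),
    ("payload", pa.large_binary()),
])

# The equivalent schema after the small-types rewrite.
small = pa.schema([
    ("name", pa.string()),
    ("payload", pa.binary()),
])

# Casting between the two layouts is what the read path relies on when
# large types are disabled; 64-bit offsets are rebuilt as 32-bit offsets.
tbl = pa.table({"name": ["a"], "payload": [b"x"]}, schema=large)
assert tbl.cast(small).schema == small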
@@ -1169,6 +1197,7 @@ def _task_to_record_batches(
    positional_deletes: Optional[List[ChunkedArray]],
    case_sensitive: bool,
    name_mapping: Optional[NameMapping] = None,
+    use_large_types: bool = True,
) -> Iterator[pa.RecordBatch]:
    _, _, path = PyArrowFileIO.parse_location(task.file.file_path)
    arrow_format = ds.ParquetFileFormat(pre_buffer=True, buffer_size=(ONE_MEGABYTE * 8))
@@ -1197,7 +1226,9 @@ def _task_to_record_batches(
            # https://github.com/apache/arrow/issues/41884
            # https://github.com/apache/arrow/issues/43183
            # Would be good to remove this later on
-            schema=_pyarrow_schema_ensure_large_types(physical_schema),
+            schema=_pyarrow_schema_ensure_large_types(physical_schema)
+            if use_large_types
+            else (_pyarrow_schema_ensure_small_types(physical_schema)),
            # This will push down the query to Arrow.
            # But in case there are positional deletes, we have to apply them first
            filter=pyarrow_filter if not positional_deletes else None,
@@ -1219,7 +1250,9 @@ def _task_to_record_batches(
                    arrow_table = pa.Table.from_batches([batch])
                    arrow_table = arrow_table.filter(pyarrow_filter)
                    batch = arrow_table.to_batches()[0]
-            yield _to_requested_schema(projected_schema, file_project_schema, batch, downcast_ns_timestamp_to_us=True)
+            yield _to_requested_schema(
+                projected_schema, file_project_schema, batch, downcast_ns_timestamp_to_us=True, use_large_types=use_large_types
+            )
            current_index += len(batch)


@@ -1232,10 +1265,19 @@ def _task_to_table(
    positional_deletes: Optional[List[ChunkedArray]],
    case_sensitive: bool,
    name_mapping: Optional[NameMapping] = None,
+    use_large_types: bool = True,
) -> Optional[pa.Table]:
    batches = list(
        _task_to_record_batches(
-            fs, task, bound_row_filter, projected_schema, projected_field_ids, positional_deletes, case_sensitive, name_mapping
+            fs,
+            task,
+            bound_row_filter,
+            projected_schema,
+            projected_field_ids,
+            positional_deletes,
+            case_sensitive,
+            name_mapping,
+            use_large_types,
        )
    )

@@ -1303,6 +1345,8 @@ def project_table(
            # When FsSpec is not installed
            raise ValueError(f"Expected PyArrowFileIO or FsspecFileIO, got: {io}") from e

+    use_large_types = property_as_bool(io.properties, PYARROW_USE_LARGE_TYPES_ON_READ, True)
+
    bound_row_filter = bind(table_metadata.schema(), row_filter, case_sensitive=case_sensitive)

    projected_field_ids = {
@@ -1322,6 +1366,7 @@ def project_table(
            deletes_per_file.get(task.file.file_path),
            case_sensitive,
            table_metadata.name_mapping(),
+            use_large_types,
        )
        for task in tasks
    ]
@@ -1394,6 +1439,8 @@ def project_batches(
            # When FsSpec is not installed
            raise ValueError(f"Expected PyArrowFileIO or FsspecFileIO, got: {io}") from e

+    use_large_types = property_as_bool(io.properties, PYARROW_USE_LARGE_TYPES_ON_READ, True)
+
    bound_row_filter = bind(table_metadata.schema(), row_filter, case_sensitive=case_sensitive)

    projected_field_ids = {
@@ -1414,6 +1461,7 @@ def project_batches(
            deletes_per_file.get(task.file.file_path),
            case_sensitive,
            table_metadata.name_mapping(),
+            use_large_types,
        )
        for batch in batches:
            if limit is not None:
@@ -1447,12 +1495,13 @@ def _to_requested_schema(
    batch: pa.RecordBatch,
    downcast_ns_timestamp_to_us: bool = False,
    include_field_ids: bool = False,
+    use_large_types: bool = True,
) -> pa.RecordBatch:
    # We could re-use some of these visitors
    struct_array = visit_with_partner(
        requested_schema,
        batch,
-        ArrowProjectionVisitor(file_schema, downcast_ns_timestamp_to_us, include_field_ids),
+        ArrowProjectionVisitor(file_schema, downcast_ns_timestamp_to_us, include_field_ids, use_large_types),
        ArrowAccessor(file_schema),
    )
    return pa.RecordBatch.from_struct_array(struct_array)
@@ -1462,20 +1511,31 @@ class ArrowProjectionVisitor(SchemaWithPartnerVisitor[pa.Array, Optional[pa.Arra
    _file_schema: Schema
    _include_field_ids: bool
    _downcast_ns_timestamp_to_us: bool
+    _use_large_types: bool

-    def __init__(self, file_schema: Schema, downcast_ns_timestamp_to_us: bool = False, include_field_ids: bool = False) -> None:
+    def __init__(
+        self,
+        file_schema: Schema,
+        downcast_ns_timestamp_to_us: bool = False,
+        include_field_ids: bool = False,
+        use_large_types: bool = True,
+    ) -> None:
        self._file_schema = file_schema
        self._include_field_ids = include_field_ids
        self._downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us
+        self._use_large_types = use_large_types

    def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
        file_field = self._file_schema.find_field(field.field_id)

        if field.field_type.is_primitive:
            if field.field_type != file_field.field_type:
-                return values.cast(
-                    schema_to_pyarrow(promote(file_field.field_type, field.field_type), include_field_ids=self._include_field_ids)
+                target_schema = schema_to_pyarrow(
+                    promote(file_field.field_type, field.field_type), include_field_ids=self._include_field_ids
                )
+                if not self._use_large_types:
+                    target_schema = _pyarrow_schema_ensure_small_types(target_schema)
+                return values.cast(target_schema)
            elif (target_type := schema_to_pyarrow(field.field_type, include_field_ids=self._include_field_ids)) != values.type:
                if field.field_type == TimestampType():
                    # Downcasting of nanoseconds to microseconds
@@ -1547,12 +1607,13 @@ def field(self, field: NestedField, _: Optional[pa.Array], field_array: Optional

    def list(self, list_type: ListType, list_array: Optional[pa.Array], value_array: Optional[pa.Array]) -> Optional[pa.Array]:
        if isinstance(list_array, (pa.ListArray, pa.LargeListArray, pa.FixedSizeListArray)) and value_array is not None:
+            list_initializer = pa.large_list if isinstance(list_array, pa.LargeListArray) else pa.list_
            if isinstance(value_array, pa.StructArray):
                # This can be removed once this has been fixed:
                # https://github.com/apache/arrow/issues/38809
                list_array = pa.LargeListArray.from_arrays(list_array.offsets, value_array)
            value_array = self._cast_if_needed(list_type.element_field, value_array)
-            arrow_field = pa.large_list(self._construct_field(list_type.element_field, value_array.type))
+            arrow_field = list_initializer(self._construct_field(list_type.element_field, value_array.type))
            return list_array.cast(arrow_field)
        else:
            return None
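Taken together, the diff introduces a read-side FileIO property: project_table and project_batches look it up with property_as_bool(io.properties, PYARROW_USE_LARGE_TYPES_ON_READ, True), so large Arrow types stay the default and existing readers are unaffected. A minimal usage sketch, assuming the PYARROW_USE_LARGE_TYPES_ON_READ constant is exported from pyiceberg.io (it is imported alongside the other io constants at the top of this diff) and that the catalog URI and table identifier below are placeholders:

from pyiceberg.catalog import load_catalog
from pyiceberg.io import PYARROW_USE_LARGE_TYPES_ON_READ  # assumed import location

# Placeholder catalog configuration; only the last property matters for this example.
catalog = load_catalog(
    "default",
    **{
        "uri": "http://localhost:8181",
        PYARROW_USE_LARGE_TYPES_ON_READ: "False",  # ask the PyArrow read path for non-large types
    },
)

table = catalog.load_table("examples.events")  # placeholder identifier
arrow_table = table.scan().to_arrow()
# With the property set to "False", string and binary columns come back as
# pa.string()/pa.binary() instead of pa.large_string()/pa.large_binary().

Because the property is read from io.properties, it can be supplied wherever the FileIO configuration is assembled, as long as it reaches the FileIO used for the scan.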