@@ -946,13 +946,9 @@ def _task_to_table(
946
946
projected_field_ids : Set [int ],
947
947
positional_deletes : Optional [List [ChunkedArray ]],
948
948
case_sensitive : bool ,
949
- row_counts : List [int ],
950
949
limit : Optional [int ] = None ,
951
950
name_mapping : Optional [NameMapping ] = None ,
952
951
) -> Optional [pa .Table ]:
953
- if limit and sum (row_counts ) >= limit :
954
- return None
955
-
956
952
_ , _ , path = PyArrowFileIO .parse_location (task .file .file_path )
957
953
arrow_format = ds .ParquetFileFormat (pre_buffer = True , buffer_size = (ONE_MEGABYTE * 8 ))
958
954
with fs .open_input_file (path ) as fin :
@@ -1015,11 +1011,6 @@ def _task_to_table(
1015
1011
if len (arrow_table ) < 1 :
1016
1012
return None
1017
1013
1018
- if limit is not None and sum (row_counts ) >= limit :
1019
- return None
1020
-
1021
- row_counts .append (len (arrow_table ))
1022
-
1023
1014
return to_requested_schema (projected_schema , file_project_schema , arrow_table )
1024
1015
1025
1016
@@ -1085,7 +1076,6 @@ def project_table(
1085
1076
id for id in projected_schema .field_ids if not isinstance (projected_schema .find_type (id ), (MapType , ListType ))
1086
1077
}.union (extract_field_ids (bound_row_filter ))
1087
1078
1088
- row_counts : List [int ] = []
1089
1079
deletes_per_file = _read_all_delete_files (fs , tasks )
1090
1080
executor = ExecutorFactory .get_or_create ()
1091
1081
futures = [
@@ -1098,21 +1088,21 @@ def project_table(
1098
1088
projected_field_ids ,
1099
1089
deletes_per_file .get (task .file .file_path ),
1100
1090
case_sensitive ,
1101
- row_counts ,
1102
1091
limit ,
1103
1092
table .name_mapping (),
1104
1093
)
1105
1094
for task in tasks
1106
1095
]
1107
-
1096
+ total_row_count = 0
1108
1097
# for consistent ordering, we need to maintain future order
1109
1098
futures_index = {f : i for i , f in enumerate (futures )}
1110
1099
completed_futures : SortedList [Future [pa .Table ]] = SortedList (iterable = [], key = lambda f : futures_index [f ])
1111
1100
for future in concurrent .futures .as_completed (futures ):
1112
1101
completed_futures .add (future )
1113
-
1102
+ if table_result := future .result ():
1103
+ total_row_count += len (table_result )
1114
1104
# stop early if limit is satisfied
1115
- if limit is not None and sum ( row_counts ) >= limit :
1105
+ if limit is not None and total_row_count >= limit :
1116
1106
break
1117
1107
1118
1108
# by now, we've either completed all tasks or satisfied the limit
0 commit comments