@@ -115,11 +115,6 @@ pub struct PagePruningAccessPlanFilter {
115115 /// single column predicates (e.g. `col = 5`) extracted from the overall
116116 /// predicate. Must all be true for a row to be included in the result.
117117 predicates : Vec < PruningPredicate > ,
118- /// For each row group, tracks which pages are fully matched (all rows satisfy predicates)
119- /// Key: row group index, Value: Vec of booleans (one per page)
120- fully_matched_pages : HashMap < usize , Vec < bool > > ,
121- /// For each row group, stores the row counts per page
122- page_row_counts : HashMap < usize , Vec < usize > > ,
123118}
124119
125120impl PagePruningAccessPlanFilter {
@@ -154,11 +149,7 @@ impl PagePruningAccessPlanFilter {
154149 Some ( pp)
155150 } )
156151 . collect :: < Vec < _ > > ( ) ;
157- Self {
158- predicates,
159- fully_matched_pages : HashMap :: new ( ) ,
160- page_row_counts : HashMap :: new ( ) ,
161- }
152+ Self { predicates }
162153 }
163154
164155 /// Returns an updated [`ParquetAccessPlan`] by applying predicates to the
@@ -337,7 +328,7 @@ impl PagePruningAccessPlanFilter {
337328 rg_metadata : & [ RowGroupMetaData ] ,
338329 file_metrics : & ParquetFileMetrics ,
339330 ) -> ParquetAccessPlan {
340- if self . fully_matched_pages . is_empty ( ) {
331+ if page_match_infos . is_empty ( ) {
341332 return access_plan;
342333 }
343334
@@ -388,15 +379,16 @@ impl PagePruningAccessPlanFilter {
388379 }
389380
390381 // Build a RowSelection for this row group that includes only the fully matched pages
391- let page_row_counts = self . page_row_counts . get ( & rg_idx) . unwrap ( ) ;
392- let row_selection = build_row_selection_for_pages (
393- & page_indices,
394- & page_row_counts,
395- limit - rows_selected,
396- ) ;
397-
398- new_access_plan. scan_selection ( rg_idx, row_selection) ;
399- rows_selected += std:: cmp:: min ( row_count, limit - rows_selected) ;
382+ if let Some ( page_match_info) = page_match_infos. get ( & rg_idx) {
383+ let row_selection = build_row_selection_for_pages (
384+ & page_indices,
385+ & page_match_info. page_row_counts ,
386+ limit - rows_selected,
387+ ) ;
388+ new_access_plan. scan ( rg_idx) ;
389+ new_access_plan. scan_selection ( rg_idx, row_selection) ;
390+ rows_selected += std:: cmp:: min ( row_count, limit - rows_selected) ;
391+ }
400392 }
401393
402394 let original_row_groups = access_plan. row_group_indexes ( ) . len ( ) ;
@@ -406,7 +398,7 @@ impl PagePruningAccessPlanFilter {
406398 if pruned_row_groups > 0 {
407399 file_metrics. limit_pruned_row_groups . add ( pruned_row_groups) ;
408400 }
409-
401+ file_metrics . limit_pruning_matched_rows . add ( rows_selected ) ;
410402 return new_access_plan;
411403 }
412404
@@ -490,7 +482,10 @@ fn build_row_selection_for_pages(
490482 let mut page_idx_iter = page_indices. iter ( ) . peekable ( ) ;
491483
492484 for ( page_num, & page_row_count) in page_row_counts. iter ( ) . enumerate ( ) {
493- if let Some ( & & next_page_idx) = page_idx_iter. peek ( ) {
485+ if rows_selected >= limit {
486+ // Once we've reached the limit, skip all remaining pages
487+ current_row += page_row_count;
488+ } else if let Some ( & & next_page_idx) = page_idx_iter. peek ( ) {
494489 if page_num == next_page_idx {
495490 // This page should be selected
496491 page_idx_iter. next ( ) ;
@@ -510,10 +505,6 @@ fn build_row_selection_for_pages(
510505 if rows_to_select < page_row_count {
511506 current_row = page_row_count - rows_to_select;
512507 }
513-
514- if rows_selected >= limit {
515- break ;
516- }
517508 } else {
518509 // This page should be skipped
519510 current_row += page_row_count;
@@ -876,7 +867,7 @@ mod tests {
876867 build_row_selection_for_pages ( & page_indices, & page_row_counts, limit) ;
877868 let result = row_selection_to_vec ( & selection) ;
878869
879- assert_eq ! ( result, vec![ ( true , 200 ) , ( false , 50 ) ] ) ;
870+ assert_eq ! ( result, vec![ ( true , 200 ) , ( false , 250 ) ] ) ;
880871 }
881872
882873 #[ test]
@@ -890,6 +881,6 @@ mod tests {
890881 build_row_selection_for_pages ( & page_indices, & page_row_counts, limit) ;
891882 let result = row_selection_to_vec ( & selection) ;
892883
893- assert_eq ! ( result, vec![ ( false , 50 ) , ( true , 150 ) , ( false , 30 ) , ] ) ;
884+ assert_eq ! ( result, vec![ ( false , 50 ) , ( true , 150 ) , ( false , 210 ) , ] ) ;
894885 }
895886}
0 commit comments