@@ -6,6 +6,7 @@ use std::cmp::Ordering;
66use std:: sync:: { Arc , OnceLock } ;
77
88use arrow_buffer:: { BooleanBuffer , BooleanBufferBuilder } ;
9+ use itertools:: Itertools ;
910use vortex_error:: vortex_panic;
1011
1112/// If the mask selects more than this fraction of rows, iterate over slices instead of indices.
@@ -31,6 +32,7 @@ struct Inner {
3132 // Pre-computed values.
3233 len : usize ,
3334 true_count : usize ,
35+ // i.e., the fraction of values that are true
3436 selectivity : f64 ,
3537}
3638
@@ -51,6 +53,7 @@ impl Inner {
5153 // TODO(ngates): for dense indices, we can do better by collecting into u64s.
5254 buf. append_n ( self . len , false ) ;
5355 indices. iter ( ) . for_each ( |idx| buf. set_bit ( * idx, true ) ) ;
56+ debug_assert_eq ! ( buf. len( ) , self . len) ;
5457 return BooleanBuffer :: from ( buf) ;
5558 }
5659
@@ -85,12 +88,16 @@ impl Inner {
8588 if let Some ( buffer) = self . buffer . get ( ) {
8689 let mut indices = Vec :: with_capacity ( self . true_count ) ;
8790 indices. extend ( buffer. set_indices ( ) ) ;
91+ debug_assert ! ( indices. is_sorted( ) ) ;
92+ assert_eq ! ( indices. len( ) , self . true_count) ;
8893 return indices;
8994 }
9095
9196 if let Some ( slices) = self . slices . get ( ) {
9297 let mut indices = Vec :: with_capacity ( self . true_count ) ;
9398 indices. extend ( slices. iter ( ) . flat_map ( |( start, end) | * start..* end) ) ;
99+ debug_assert ! ( indices. is_sorted( ) ) ;
100+ assert_eq ! ( indices. len( ) , self . true_count) ;
94101 return indices;
95102 }
96103
@@ -99,6 +106,7 @@ impl Inner {
99106 }
100107
101108 /// Constructs a slices vector from one of the other representations.
109+ #[ allow( clippy:: cast_possible_truncation) ]
102110 fn slices ( & self ) -> & [ ( usize , usize ) ] {
103111 self . slices . get_or_init ( || {
104112 if self . true_count == self . len {
@@ -110,7 +118,10 @@ impl Inner {
110118 }
111119
112120 if let Some ( indices) = self . indices . get ( ) {
113- let mut slices = Vec :: with_capacity ( self . true_count ) ; // Upper bound
121+ // Expected number of contiguous slices assuming a uniform distribution of true values.
122+ let expected_num_slices =
123+ ( self . selectivity * ( self . len - self . true_count + 1 ) as f64 ) . round ( ) as usize ;
124+ let mut slices = Vec :: with_capacity ( expected_num_slices) ;
114125 let mut iter = indices. iter ( ) . copied ( ) ;
115126
116127 // Handle empty input
@@ -200,7 +211,11 @@ impl Mask {
200211 /// Create a new [`Mask`] from a [`Vec<usize>`].
201212 pub fn from_indices ( len : usize , vec : Vec < usize > ) -> Self {
202213 let true_count = vec. len ( ) ;
203- assert ! ( vec. iter( ) . all( |& idx| idx < len) ) ;
214+ assert ! ( vec. is_sorted( ) , "Mask indices must be sorted" ) ;
215+ assert ! (
216+ vec. last( ) . is_none_or( |& idx| idx < len) ,
217+ "Mask indices must be in bounds (len={len})"
218+ ) ;
204219 Self ( Arc :: new ( Inner {
205220 buffer : Default :: default ( ) ,
206221 indices : OnceLock :: from ( vec) ,
@@ -214,7 +229,14 @@ impl Mask {
214229 /// Create a new [`Mask`] from a [`Vec<(usize, usize)>`] where each range
215230 /// represents a contiguous range of true values.
216231 pub fn from_slices ( len : usize , vec : Vec < ( usize , usize ) > ) -> Self {
217- assert ! ( vec. iter( ) . all( |& ( b, e) | b < e && e <= len) ) ;
232+ Self :: check_slices ( len, & vec) ;
233+ Self :: from_slices_unchecked ( len, vec)
234+ }
235+
236+ fn from_slices_unchecked ( len : usize , vec : Vec < ( usize , usize ) > ) -> Self {
237+ #[ cfg( debug_assertions) ]
238+ Self :: check_slices ( len, & vec) ;
239+
218240 let true_count = vec. iter ( ) . map ( |( b, e) | e - b) . sum ( ) ;
219241 Self ( Arc :: new ( Inner {
220242 buffer : Default :: default ( ) ,
@@ -226,6 +248,25 @@ impl Mask {
226248 } ) )
227249 }
228250
251+ #[ inline( always) ]
252+ fn check_slices ( len : usize , vec : & [ ( usize , usize ) ] ) {
253+ assert ! ( vec. iter( ) . all( |& ( b, e) | b < e && e <= len) ) ;
254+ for ( first, second) in vec. iter ( ) . tuple_windows ( ) {
255+ assert ! (
256+ first. 0 < second. 0 ,
257+ "Slices must be sorted, got {:?} and {:?}" ,
258+ first,
259+ second
260+ ) ;
261+ assert ! (
262+ first. 1 <= second. 0 ,
263+ "Slices must be non-overlapping, got {:?} and {:?}" ,
264+ first,
265+ second
266+ ) ;
267+ }
268+ }
269+
229270 /// Create a new [`Mask`] from the intersection of two indices slices.
230271 pub fn from_intersection_indices (
231272 len : usize ,
@@ -254,7 +295,7 @@ impl Mask {
254295 }
255296
256297 #[ inline]
257- // There is good definition of is_empty, does it mean len == 0 or true_count == 0?
298+ // There is no good definition of is_empty, does it mean len == 0 or true_count == 0?
258299 #[ allow( clippy:: len_without_is_empty) ]
259300 pub fn len ( & self ) -> usize {
260301 self . 0 . len
@@ -328,7 +369,8 @@ impl Mask {
328369 let indices = indices
329370 . iter ( )
330371 . copied ( )
331- . filter ( |& idx| offset <= idx && idx < end)
372+ . skip_while ( |idx| * idx < offset)
373+ . take_while ( |idx| * idx < end)
332374 . map ( |idx| idx - offset)
333375 . collect ( ) ;
334376 return Self :: from_indices ( length, indices) ;
@@ -338,10 +380,11 @@ impl Mask {
338380 let slices = slices
339381 . iter ( )
340382 . copied ( )
341- . filter ( |( s, e) | * s < end && * e > offset)
383+ . skip_while ( |( _, e) | * e <= offset)
384+ . take_while ( |( s, _) | * s < end)
342385 . map ( |( s, e) | ( s. max ( offset) , e. min ( end) ) )
343386 . collect ( ) ;
344- return Self :: from_slices ( length, slices) ;
387+ return Self :: from_slices_unchecked ( length, slices) ;
345388 }
346389
347390 vortex_panic ! ( "No mask representation found" )
0 commit comments