@@ -75,61 +75,166 @@ impl<const NB: usize, T: Copy> Filter<BitView<'_, NB>> for &[T] {
7575
7676 // First we loop 64 elements at a time (usize::BITS)
7777 for mut word in mask. iter_words ( ) {
78- match word {
79- 0usize => {
80- // No bits set => skip usize::BITS slice.
81- unsafe {
82- read_ptr = read_ptr. add ( usize:: BITS as usize ) ;
83- }
78+ if word == 0usize {
79+ // No bits set => skip usize::BITS slice.
80+ unsafe {
81+ read_ptr = read_ptr. add ( usize:: BITS as usize ) ;
8482 }
85- usize:: MAX => {
86- // All slice => copy usize::BITS slice.
87- unsafe {
88- ptr:: copy_nonoverlapping ( read_ptr, write_ptr, usize:: BITS as usize ) ;
89- read_ptr = read_ptr. add ( usize:: BITS as usize ) ;
90- write_ptr = write_ptr. add ( usize:: BITS as usize ) ;
91- }
83+ continue ;
84+ }
85+
86+ if word == usize:: MAX {
87+ // All slice => copy usize::BITS slice.
88+ unsafe {
89+ ptr:: copy_nonoverlapping ( read_ptr, write_ptr, usize:: BITS as usize ) ;
90+ read_ptr = read_ptr. add ( usize:: BITS as usize ) ;
91+ write_ptr = write_ptr. add ( usize:: BITS as usize ) ;
9292 }
93- _ => {
94- // Iterate the bits in a word, attempting to copy contiguous runs of values.
95- let mut read_pos = 0 ;
96- let mut write_pos = 0 ;
93+ continue ;
94+ }
95+
96+ // We switch strategies based on the density of the word.
97+ let popcount = word. count_ones ( ) as usize ;
98+
99+ if popcount <= 16 {
100+ // Sparse word: iterate only set bits
101+ // This minimizes work when few bits are set
102+ unsafe {
103+ let mut bitpos = 0 ;
97104 while word != 0 {
98- let tz = word. trailing_zeros ( ) ;
99- if tz > 0 {
100- // shift off the trailing zeros since they are unselected.
101- // this advances the read head, but not the write head.
102- read_pos += tz;
103- word >>= tz;
105+ bitpos += word. trailing_zeros ( ) as usize ;
106+ * write_ptr = * read_ptr. add ( bitpos) ;
107+ write_ptr = write_ptr. add ( 1 ) ;
108+ word &= word - 1 ; // Clear lowest set bit (branchless)
109+ }
110+ read_ptr = read_ptr. add ( usize:: BITS as usize ) ;
111+ }
112+ continue ;
113+ }
114+
115+ if popcount <= 48 {
116+ // Medium density (~25-75%): process byte-by-byte
117+ // This is optimal for the 50% case
118+ unsafe {
119+ // Process 8 bytes (64 bits) in chunks of 8 bits
120+ for chunk in 0 ..8 {
121+ let byte = ( ( word >> ( chunk * 8 ) ) & 0xFF ) as u8 ;
122+ if byte == 0 {
104123 continue ;
105124 }
106125
107- // copy the next several values to our out pointer.
108- let extent = word. trailing_ones ( ) ;
109- unsafe {
110- ptr:: copy_nonoverlapping (
111- read_ptr. add ( read_pos as usize ) ,
112- write_ptr. add ( write_pos as usize ) ,
113- extent as usize ,
114- ) ;
126+ let base_offset = chunk * 8 ;
127+
128+ if byte == 0xFF {
129+ // All 8 bits set, use fast copy
130+ ptr:: copy_nonoverlapping ( read_ptr. add ( base_offset) , write_ptr, 8 ) ;
131+ write_ptr = write_ptr. add ( 8 ) ;
132+ continue ;
115133 }
116- // Advance the reader and writer by the number of values
117- // we just copied.
118- read_pos += extent;
119- write_pos += extent;
120134
121- // shift off the low bits of the word so we can copy the next run.
122- word >>= extent;
135+ // Unrolled bit checks - compiler optimizes to conditional moves
136+ // This eliminates branches and trailing_zeros/ones overhead
137+ if byte & 0x01 != 0 {
138+ * write_ptr = * read_ptr. add ( base_offset + 0 ) ;
139+ write_ptr = write_ptr. add ( 1 ) ;
140+ }
141+ if byte & 0x02 != 0 {
142+ * write_ptr = * read_ptr. add ( base_offset + 1 ) ;
143+ write_ptr = write_ptr. add ( 1 ) ;
144+ }
145+ if byte & 0x04 != 0 {
146+ * write_ptr = * read_ptr. add ( base_offset + 2 ) ;
147+ write_ptr = write_ptr. add ( 1 ) ;
148+ }
149+ if byte & 0x08 != 0 {
150+ * write_ptr = * read_ptr. add ( base_offset + 3 ) ;
151+ write_ptr = write_ptr. add ( 1 ) ;
152+ }
153+ if byte & 0x10 != 0 {
154+ * write_ptr = * read_ptr. add ( base_offset + 4 ) ;
155+ write_ptr = write_ptr. add ( 1 ) ;
156+ }
157+ if byte & 0x20 != 0 {
158+ * write_ptr = * read_ptr. add ( base_offset + 5 ) ;
159+ write_ptr = write_ptr. add ( 1 ) ;
160+ }
161+ if byte & 0x40 != 0 {
162+ * write_ptr = * read_ptr. add ( base_offset + 6 ) ;
163+ write_ptr = write_ptr. add ( 1 ) ;
164+ }
165+ if byte & 0x80 != 0 {
166+ * write_ptr = * read_ptr. add ( base_offset + 7 ) ;
167+ write_ptr = write_ptr. add ( 1 ) ;
168+ }
123169 }
170+ read_ptr = read_ptr. add ( usize:: BITS as usize ) ;
171+ continue ;
172+ }
173+ }
124174
125- unsafe {
126- read_ptr = read_ptr. add ( usize:: BITS as usize ) ;
127- write_ptr = write_ptr. add ( write_pos as usize ) ;
128- } ;
175+ // Dense word (>75% bits set): use run-based copying
176+ // Optimized for long runs of 1s
177+ let mut read_pos = 0 ;
178+ let mut write_pos = 0 ;
179+ unsafe {
180+ while word != 0 {
181+ let tz = word. trailing_zeros ( ) ;
182+ read_pos += tz;
183+ word >>= tz;
184+
185+ if word == 0 {
186+ break ;
187+ }
188+
189+ let extent = word. trailing_ones ( ) ;
190+
191+ // Use optimized copy for the run
192+ copy_small (
193+ read_ptr. add ( read_pos as usize ) ,
194+ write_ptr. add ( write_pos as usize ) ,
195+ extent as usize ,
196+ ) ;
197+
198+ read_pos += extent;
199+ write_pos += extent;
200+ word >>= extent;
129201 }
202+
203+ read_ptr = read_ptr. add ( usize:: BITS as usize ) ;
130204 }
131205 }
132206
133207 write. freeze ( )
134208 }
135209}
210+
/// Copies `count` consecutive `T` values from `src` into `dst`.
///
/// Runs of one to four elements are moved one value at a time, in ascending
/// order; longer runs are delegated to [`ptr::copy_nonoverlapping`]. A `count`
/// of zero touches neither pointer.
///
/// # Safety
///
/// When `count` is non-zero the caller must ensure that:
/// - `src` is valid for reads of `count` properly aligned, initialized `T` values;
/// - `dst` is valid for writes of `count` properly aligned `T` values;
/// - for `count > 4`, the source and destination regions do not overlap.
#[inline(always)]
unsafe fn copy_small<T: Copy>(src: *const T, dst: *mut T, count: usize) {
    match count {
        // Nothing to move.
        0 => {}
        // Short run: element-wise copy, lowest index first.
        1..=4 => {
            for offset in 0..count {
                // SAFETY: `offset < count`, so both element pointers stay inside
                // the ranges the caller guarantees to be readable and writable.
                unsafe { dst.add(offset).write(src.add(offset).read()) };
            }
        }
        // Longer run: hand the whole range to the bulk copy.
        _ => {
            // SAFETY: forwarded directly from this function's own contract.
            unsafe { ptr::copy_nonoverlapping(src, dst, count) };
        }
    }
}
0 commit comments