Skip to content

Commit fcc6846

Browse files
committed
Vector pipeline
Signed-off-by: Nicholas Gates <[email protected]>
1 parent a2b63ac commit fcc6846

File tree

1 file changed

+146
-41
lines changed

1 file changed

+146
-41
lines changed

vortex-compute/src/filter/slice.rs

Lines changed: 146 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -75,61 +75,166 @@ impl<const NB: usize, T: Copy> Filter<BitView<'_, NB>> for &[T] {
7575

7676
// First we loop 64 elements at a time (usize::BITS)
7777
for mut word in mask.iter_words() {
78-
match word {
79-
0usize => {
80-
// No bits set => skip usize::BITS slice.
81-
unsafe {
82-
read_ptr = read_ptr.add(usize::BITS as usize);
83-
}
78+
if word == 0usize {
79+
// No bits set => skip usize::BITS slice.
80+
unsafe {
81+
read_ptr = read_ptr.add(usize::BITS as usize);
8482
}
85-
usize::MAX => {
86-
// All slice => copy usize::BITS slice.
87-
unsafe {
88-
ptr::copy_nonoverlapping(read_ptr, write_ptr, usize::BITS as usize);
89-
read_ptr = read_ptr.add(usize::BITS as usize);
90-
write_ptr = write_ptr.add(usize::BITS as usize);
91-
}
83+
continue;
84+
}
85+
86+
if word == usize::MAX {
87+
// All slice => copy usize::BITS slice.
88+
unsafe {
89+
ptr::copy_nonoverlapping(read_ptr, write_ptr, usize::BITS as usize);
90+
read_ptr = read_ptr.add(usize::BITS as usize);
91+
write_ptr = write_ptr.add(usize::BITS as usize);
9292
}
93-
_ => {
94-
// Iterate the bits in a word, attempting to copy contiguous runs of values.
95-
let mut read_pos = 0;
96-
let mut write_pos = 0;
93+
continue;
94+
}
95+
96+
// We switch strategies based on the density of the word.
97+
let popcount = word.count_ones() as usize;
98+
99+
if popcount <= 16 {
100+
// Sparse word: iterate only set bits
101+
// This minimizes work when few bits are set
102+
unsafe {
103+
let mut bitpos = 0;
97104
while word != 0 {
98-
let tz = word.trailing_zeros();
99-
if tz > 0 {
100-
// shift off the trailing zeros since they are unselected.
101-
// this advances the read head, but not the write head.
102-
read_pos += tz;
103-
word >>= tz;
105+
bitpos += word.trailing_zeros() as usize;
106+
*write_ptr = *read_ptr.add(bitpos);
107+
write_ptr = write_ptr.add(1);
108+
word &= word - 1; // Clear lowest set bit (branchless)
109+
}
110+
read_ptr = read_ptr.add(usize::BITS as usize);
111+
}
112+
continue;
113+
}
114+
115+
if popcount <= 48 {
116+
// Medium density (~25-75%): process byte-by-byte
117+
// This is optimal for the 50% case
118+
unsafe {
119+
// Process 8 bytes (64 bits) in chunks of 8 bits
120+
for chunk in 0..8 {
121+
let byte = ((word >> (chunk * 8)) & 0xFF) as u8;
122+
if byte == 0 {
104123
continue;
105124
}
106125

107-
// copy the next several values to our out pointer.
108-
let extent = word.trailing_ones();
109-
unsafe {
110-
ptr::copy_nonoverlapping(
111-
read_ptr.add(read_pos as usize),
112-
write_ptr.add(write_pos as usize),
113-
extent as usize,
114-
);
126+
let base_offset = chunk * 8;
127+
128+
if byte == 0xFF {
129+
// All 8 bits set, use fast copy
130+
ptr::copy_nonoverlapping(read_ptr.add(base_offset), write_ptr, 8);
131+
write_ptr = write_ptr.add(8);
132+
continue;
115133
}
116-
// Advance the reader and writer by the number of values
117-
// we just copied.
118-
read_pos += extent;
119-
write_pos += extent;
120134

121-
// shift off the low bits of the word so we can copy the next run.
122-
word >>= extent;
135+
// Unrolled bit checks - compiler optimizes to conditional moves
136+
// This eliminates branches and trailing_zeros/ones overhead
137+
if byte & 0x01 != 0 {
138+
*write_ptr = *read_ptr.add(base_offset + 0);
139+
write_ptr = write_ptr.add(1);
140+
}
141+
if byte & 0x02 != 0 {
142+
*write_ptr = *read_ptr.add(base_offset + 1);
143+
write_ptr = write_ptr.add(1);
144+
}
145+
if byte & 0x04 != 0 {
146+
*write_ptr = *read_ptr.add(base_offset + 2);
147+
write_ptr = write_ptr.add(1);
148+
}
149+
if byte & 0x08 != 0 {
150+
*write_ptr = *read_ptr.add(base_offset + 3);
151+
write_ptr = write_ptr.add(1);
152+
}
153+
if byte & 0x10 != 0 {
154+
*write_ptr = *read_ptr.add(base_offset + 4);
155+
write_ptr = write_ptr.add(1);
156+
}
157+
if byte & 0x20 != 0 {
158+
*write_ptr = *read_ptr.add(base_offset + 5);
159+
write_ptr = write_ptr.add(1);
160+
}
161+
if byte & 0x40 != 0 {
162+
*write_ptr = *read_ptr.add(base_offset + 6);
163+
write_ptr = write_ptr.add(1);
164+
}
165+
if byte & 0x80 != 0 {
166+
*write_ptr = *read_ptr.add(base_offset + 7);
167+
write_ptr = write_ptr.add(1);
168+
}
123169
}
170+
read_ptr = read_ptr.add(usize::BITS as usize);
171+
continue;
172+
}
173+
}
124174

125-
unsafe {
126-
read_ptr = read_ptr.add(usize::BITS as usize);
127-
write_ptr = write_ptr.add(write_pos as usize);
128-
};
175+
// Dense word (>75% bits set): use run-based copying
176+
// Optimized for long runs of 1s
177+
let mut read_pos = 0;
178+
let mut write_pos = 0;
179+
unsafe {
180+
while word != 0 {
181+
let tz = word.trailing_zeros();
182+
read_pos += tz;
183+
word >>= tz;
184+
185+
if word == 0 {
186+
break;
187+
}
188+
189+
let extent = word.trailing_ones();
190+
191+
// Use optimized copy for the run
192+
copy_small(
193+
read_ptr.add(read_pos as usize),
194+
write_ptr.add(write_pos as usize),
195+
extent as usize,
196+
);
197+
198+
read_pos += extent;
199+
write_pos += extent;
200+
word >>= extent;
129201
}
202+
203+
read_ptr = read_ptr.add(usize::BITS as usize);
130204
}
131205
}
132206

133207
write.freeze()
134208
}
135209
}
210+
211+
/// Optimized small copy for short runs of `Copy` elements.
///
/// Runs of up to 4 elements are copied with direct per-element pointer
/// reads/writes; longer runs fall back to [`ptr::copy_nonoverlapping`],
/// which lowers to `memcpy`.
///
/// # Safety
///
/// The caller must guarantee that:
/// - `src` is valid for reads of `count` elements of `T`,
/// - `dst` is valid for writes of `count` elements of `T`,
/// - both pointers are properly aligned for `T`,
/// - the source and destination regions do not overlap.
#[inline(always)]
unsafe fn copy_small<T: Copy>(src: *const T, dst: *mut T, count: usize) {
    if count <= 4 {
        // Tiny run: copy element-by-element. With `count` bounded by 4 the
        // loop is fully unrolled by the optimizer, avoiding any dispatch on
        // the length inside a generic memcpy path.
        // SAFETY: caller guarantees `count` elements are readable from `src`
        // and writable to `dst`, with proper alignment and no overlap.
        unsafe {
            for i in 0..count {
                ptr::write(dst.add(i), ptr::read(src.add(i)));
            }
        }
    } else {
        // SAFETY: caller guarantees validity, alignment, and non-overlap
        // for `count` elements — exactly the contract of
        // `copy_nonoverlapping`.
        unsafe { ptr::copy_nonoverlapping(src, dst, count) };
    }
}

0 commit comments

Comments
 (0)