Skip to content

Commit 230a7f2

Browse files
committed
Add append_filtered_vals_from_key, use it in append_unique_vals_2
1 parent b8107be commit 230a7f2

File tree

1 file changed

+94
-91
lines changed

1 file changed

+94
-91
lines changed

src/intrinsics/dedup.rs

Lines changed: 94 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -82,12 +82,42 @@ pub unsafe fn append_filtered_vals(vals: S, mask: S, v: &mut [u32], write_idx: &
8282
let mask = _mm256_movemask_ps(transmute(mask)) as usize;
8383
let numberofnewvalues = L - mask.count_ones() as usize;
8484
let key = transmute(UNIQSHUF[mask]);
85-
let val = _mm256_permutevar8x32_epi32(transmute(vals), key);
86-
_mm256_storeu_si256(v.as_mut_ptr().add(*write_idx) as *mut __m256i, val);
85+
append_filtered_vals_from_key(vals, key, v, write_idx);
8786
*write_idx += numberofnewvalues;
8887
}
8988
}
9089

90+
#[cfg(target_feature = "avx2")]
91+
#[inline(always)]
92+
pub unsafe fn append_filtered_vals_2(
93+
vals: S,
94+
vals2: S,
95+
mask: S,
96+
v: &mut [u32],
97+
v2: &mut [u32],
98+
write_idx: &mut usize,
99+
) {
100+
unsafe {
101+
use core::arch::x86_64::*;
102+
let mask = _mm256_movemask_ps(transmute(mask)) as usize;
103+
let numberofnewvalues = L - mask.count_ones() as usize;
104+
let key = UNIQSHUF[mask];
105+
append_filtered_vals_from_key(vals, key, v, write_idx);
106+
append_filtered_vals_from_key(vals2, key, v2, write_idx);
107+
*write_idx += numberofnewvalues;
108+
}
109+
}
110+
111+
#[cfg(target_feature = "avx2")]
112+
#[inline(always)]
113+
unsafe fn append_filtered_vals_from_key(vals: S, key: S, v: &mut [u32], write_idx: &mut usize) {
114+
unsafe {
115+
use core::arch::x86_64::*;
116+
let val = _mm256_permutevar8x32_epi32(transmute(vals), transmute(key));
117+
_mm256_storeu_si256(v.as_mut_ptr().add(*write_idx) as *mut __m256i, val);
118+
}
119+
}
120+
91121
/// Dedup adjacent `new` values (starting with the last element of `old`).
92122
/// If an element is different from the preceding element, append the corresponding element of `vals` to `v[write_idx]`.
93123
///
@@ -151,42 +181,82 @@ pub unsafe fn append_unique_vals_2(
151181
let recon = _mm256_blend_epi32(old, new, 0b01111111);
152182
let movebyone_mask = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7); // rotate shuffle
153183
let vec_tmp = _mm256_permutevar8x32_epi32(recon, movebyone_mask);
184+
let mut mask = transmute(_mm256_cmpeq_epi32(vec_tmp, new));
154185

155-
let mask = _mm256_movemask_ps(transmute(_mm256_cmpeq_epi32(vec_tmp, new))) as usize;
156-
let numberofnewvalues = L - mask.count_ones() as usize;
157-
let key = transmute(UNIQSHUF[mask]);
158-
let val = _mm256_permutevar8x32_epi32(vals, key);
159-
_mm256_storeu_si256(v.as_mut_ptr().add(*write_idx) as *mut __m256i, val);
160-
let val2 = _mm256_permutevar8x32_epi32(vals2, key);
161-
_mm256_storeu_si256(v2.as_mut_ptr().add(*write_idx) as *mut __m256i, val2);
162-
*write_idx += numberofnewvalues;
186+
append_filtered_vals_2(vals, vals2, mask, v, v2, write_idx);
163187
}
164188
}
165189

190+
#[cfg(target_feature = "neon")]
191+
const POW1: wide::u32x4 = wide::u32x4::new([1, 2, 4, 8]);
192+
#[cfg(target_feature = "neon")]
193+
const POW2: wide::u32x4 = wide::u32x4::new([16, 32, 64, 128]);
194+
#[cfg(target_feature = "neon")]
195+
const NEW_OLD_MASK: S = S::new([!0, !0, !0, !0, !0, !0, !0, 0]);
196+
#[cfg(target_feature = "neon")]
197+
const I1: core::arch::aarch64::uint8x16_t =
198+
unsafe { transmute([0x1F1E1D1C, 0x03020100, 0x07060504, 0x0B0A0908]) };
199+
#[cfg(target_feature = "neon")]
200+
const I2: core::arch::aarch64::uint8x16_t =
201+
unsafe { transmute([0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918]) };
202+
166203
/// Append the values of `x` selected by `mask` to `v`.
167204
#[cfg(target_feature = "neon")]
168205
#[inline(always)]
169206
pub unsafe fn append_filtered_vals(vals: S, mask: S, v: &mut [u32], write_idx: &mut usize) {
170207
unsafe {
171-
use core::arch::aarch64::{vaddvq_u32, vqtbl2q_u8, vst1_u32_x4};
208+
use core::arch::aarch64::vaddvq_u32;
172209
use wide::u32x4;
173210

174-
const POW1: u32x4 = u32x4::new([1, 2, 4, 8]);
175-
const POW2: u32x4 = u32x4::new([16, 32, 64, 128]);
211+
let (m1, m2): (u32x4, u32x4) = transmute(mask);
212+
let m1 = vaddvq_u32(transmute(m1 & POW1));
213+
let m2 = vaddvq_u32(transmute(m2 & POW2));
214+
let mask = (m1 | m2) as usize;
215+
let numberofnewvalues = L - mask.count_ones() as usize;
216+
let key = UNIQSHUF_NEON[mask];
217+
append_filtered_vals_from_key(vals, key, v, write_idx);
218+
*write_idx += numberofnewvalues;
219+
}
220+
}
221+
222+
#[cfg(target_feature = "neon")]
223+
#[inline(always)]
224+
pub unsafe fn append_filtered_vals_2(
225+
vals: S,
226+
vals2: S,
227+
mask: S,
228+
v: &mut [u32],
229+
v2: &mut [u32],
230+
write_idx: &mut usize,
231+
) {
232+
unsafe {
233+
use core::arch::aarch64::vaddvq_u32;
234+
use wide::u32x4;
176235

177236
let (m1, m2): (u32x4, u32x4) = transmute(mask);
178237
let m1 = vaddvq_u32(transmute(m1 & POW1));
179238
let m2 = vaddvq_u32(transmute(m2 & POW2));
180239
let mask = (m1 | m2) as usize;
181240
let numberofnewvalues = L - mask.count_ones() as usize;
182-
let idx = UNIQSHUF_NEON[mask];
183-
let (i1, i2) = transmute(idx);
241+
let key = UNIQSHUF_NEON[mask];
242+
append_filtered_vals_from_key(vals, key, v, write_idx);
243+
append_filtered_vals_from_key(vals2, key, v2, write_idx);
244+
*write_idx += numberofnewvalues;
245+
}
246+
}
247+
248+
#[cfg(target_feature = "neon")]
249+
#[inline(always)]
250+
unsafe fn append_filtered_vals_from_key(vals: S, key: S, v: &mut [u32], write_idx: &mut usize) {
251+
unsafe {
252+
use core::arch::aarch64::{vqtbl2q_u8, vst1_u32_x4};
253+
254+
let (i1, i2) = transmute(key);
184255
let t = transmute(vals);
185256
let r1 = vqtbl2q_u8(t, i1);
186257
let r2 = vqtbl2q_u8(t, i2);
187258
let val: S = transmute((r1, r2));
188259
vst1_u32_x4(v.as_mut_ptr().add(*write_idx), transmute(val));
189-
*write_idx += numberofnewvalues;
190260
}
191261
}
192262

@@ -208,32 +278,11 @@ pub unsafe fn append_unique_vals<const SKIP_MAX: bool>(
208278
unsafe {
209279
use core::arch::aarch64::vqtbl2q_u8;
210280

211-
const NEW_OLD_MASK: S = S::new([
212-
u32::MAX,
213-
u32::MAX,
214-
u32::MAX,
215-
u32::MAX,
216-
u32::MAX,
217-
u32::MAX,
218-
u32::MAX,
219-
0,
220-
]);
221281
let recon = NEW_OLD_MASK.blend(new, old);
222-
let (i1, i2) = transmute([
223-
0x1F1E1D1Cu32,
224-
0x03020100,
225-
0x07060504,
226-
0x0B0A0908,
227-
0x0F0E0D0C,
228-
0x13121110,
229-
0x17161514,
230-
0x1B1A1918,
231-
]);
232282
let t = transmute(recon);
233-
let r1 = vqtbl2q_u8(t, i1);
234-
let r2 = vqtbl2q_u8(t, i2);
283+
let r1 = vqtbl2q_u8(t, I1);
284+
let r2 = vqtbl2q_u8(t, I2);
235285
let prec: S = transmute((r1, r2));
236-
237286
let mut dup = prec.cmp_eq(new);
238287
if SKIP_MAX {
239288
dup |= new.cmp_eq(SIMD_SKIPPED);
@@ -260,61 +309,15 @@ pub unsafe fn append_unique_vals_2(
260309
write_idx: &mut usize,
261310
) {
262311
unsafe {
263-
use core::arch::aarch64::{vaddvq_u32, vqtbl2q_u8, vst1_u32_x4};
264-
use wide::u32x4;
312+
use core::arch::aarch64::vqtbl2q_u8;
265313

266-
let new_old_mask = S::new([
267-
u32::MAX,
268-
u32::MAX,
269-
u32::MAX,
270-
u32::MAX,
271-
u32::MAX,
272-
u32::MAX,
273-
u32::MAX,
274-
0,
275-
]);
276-
let recon = new_old_mask.blend(new, old);
277-
278-
// let rotate_idx = S::new([7, 0, 1, 2, 3, 4, 5, 6]);
279-
// let idx = rotate_idx * S::splat(0x04_04_04_04) + S::splat(0x03_02_01_00);
280-
// let (i1, i2) = transmute(idx);
281-
let (i1, i2) = transmute([
282-
0x1F1E1D1Cu32,
283-
0x03020100,
284-
0x07060504,
285-
0x0B0A0908,
286-
0x0F0E0D0C,
287-
0x13121110,
288-
0x17161514,
289-
0x1B1A1918,
290-
]);
314+
let recon = NEW_OLD_MASK.blend(new, old);
291315
let t = transmute(recon);
292-
let r1 = vqtbl2q_u8(t, i1);
293-
let r2 = vqtbl2q_u8(t, i2);
316+
let r1 = vqtbl2q_u8(t, I1);
317+
let r2 = vqtbl2q_u8(t, I2);
294318
let prec: S = transmute((r1, r2));
295-
296319
let dup = prec.cmp_eq(new);
297-
let (d1, d2): (u32x4, u32x4) = transmute(dup);
298-
let pow1 = u32x4::new([1, 2, 4, 8]);
299-
let pow2 = u32x4::new([16, 32, 64, 128]);
300-
let m1 = vaddvq_u32(transmute(d1 & pow1));
301-
let m2 = vaddvq_u32(transmute(d2 & pow2));
302-
let mask = (m1 | m2) as usize;
303-
304-
let numberofnewvalues = L - mask.count_ones() as usize;
305-
let idx = UNIQSHUF_NEON[mask];
306-
let (i1, i2) = transmute(idx);
307-
let t = transmute(vals);
308-
let r1 = vqtbl2q_u8(t, i1);
309-
let r2 = vqtbl2q_u8(t, i2);
310-
let val: S = transmute((r1, r2));
311-
vst1_u32_x4(v.as_mut_ptr().add(*write_idx), transmute(val));
312-
let t = transmute(vals2);
313-
let r1 = vqtbl2q_u8(t, i1);
314-
let r2 = vqtbl2q_u8(t, i2);
315-
let val2: S = transmute((r1, r2));
316-
vst1_u32_x4(v2.as_mut_ptr().add(*write_idx), transmute(val2));
317-
*write_idx += numberofnewvalues;
320+
append_filtered_vals_2(vals, vals2, dup, v, v2, write_idx);
318321
}
319322
}
320323

@@ -584,7 +587,7 @@ const UNIQSHUF: [S; 256] = unsafe {transmute([
584587
#[cfg(target_feature = "neon")]
585588
#[allow(clippy::erasing_op, clippy::identity_op)]
586589
#[rustfmt::skip]
587-
const UNIQSHUF_NEON: [wide::u8x32; 256] = unsafe {
590+
const UNIQSHUF_NEON: [wide::u32x8; 256] = unsafe {
588591
const M: u32 = 0x04_04_04_04;
589592
const O: u32 = 0x03_02_01_00;
590593
transmute([

0 commit comments

Comments
 (0)