@@ -82,12 +82,42 @@ pub unsafe fn append_filtered_vals(vals: S, mask: S, v: &mut [u32], write_idx: &
8282 let mask = _mm256_movemask_ps ( transmute ( mask) ) as usize ;
8383 let numberofnewvalues = L - mask. count_ones ( ) as usize ;
8484 let key = transmute ( UNIQSHUF [ mask] ) ;
85- let val = _mm256_permutevar8x32_epi32 ( transmute ( vals) , key) ;
86- _mm256_storeu_si256 ( v. as_mut_ptr ( ) . add ( * write_idx) as * mut __m256i , val) ;
85+ append_filtered_vals_from_key ( vals, key, v, write_idx) ;
8786 * write_idx += numberofnewvalues;
8887 }
8988}
9089
90+ #[ cfg( target_feature = "avx2" ) ]
91+ #[ inline( always) ]
92+ pub unsafe fn append_filtered_vals_2 (
93+ vals : S ,
94+ vals2 : S ,
95+ mask : S ,
96+ v : & mut [ u32 ] ,
97+ v2 : & mut [ u32 ] ,
98+ write_idx : & mut usize ,
99+ ) {
100+ unsafe {
101+ use core:: arch:: x86_64:: * ;
102+ let mask = _mm256_movemask_ps ( transmute ( mask) ) as usize ;
103+ let numberofnewvalues = L - mask. count_ones ( ) as usize ;
104+ let key = UNIQSHUF [ mask] ;
105+ append_filtered_vals_from_key ( vals, key, v, write_idx) ;
106+ append_filtered_vals_from_key ( vals2, key, v2, write_idx) ;
107+ * write_idx += numberofnewvalues;
108+ }
109+ }
110+
111+ #[ cfg( target_feature = "avx2" ) ]
112+ #[ inline( always) ]
113+ unsafe fn append_filtered_vals_from_key ( vals : S , key : S , v : & mut [ u32 ] , write_idx : & mut usize ) {
114+ unsafe {
115+ use core:: arch:: x86_64:: * ;
116+ let val = _mm256_permutevar8x32_epi32 ( transmute ( vals) , transmute ( key) ) ;
117+ _mm256_storeu_si256 ( v. as_mut_ptr ( ) . add ( * write_idx) as * mut __m256i , val) ;
118+ }
119+ }
120+
91121/// Dedup adjacent `new` values (starting with the last element of `old`).
92122/// If an element is different from the preceding element, append the corresponding element of `vals` to `v[write_idx]`.
93123///
@@ -151,42 +181,82 @@ pub unsafe fn append_unique_vals_2(
151181 let recon = _mm256_blend_epi32 ( old, new, 0b01111111 ) ;
152182 let movebyone_mask = _mm256_set_epi32 ( 6 , 5 , 4 , 3 , 2 , 1 , 0 , 7 ) ; // rotate shuffle
153183 let vec_tmp = _mm256_permutevar8x32_epi32 ( recon, movebyone_mask) ;
184+ let mut mask = transmute ( _mm256_cmpeq_epi32 ( vec_tmp, new) ) ;
154185
155- let mask = _mm256_movemask_ps ( transmute ( _mm256_cmpeq_epi32 ( vec_tmp, new) ) ) as usize ;
156- let numberofnewvalues = L - mask. count_ones ( ) as usize ;
157- let key = transmute ( UNIQSHUF [ mask] ) ;
158- let val = _mm256_permutevar8x32_epi32 ( vals, key) ;
159- _mm256_storeu_si256 ( v. as_mut_ptr ( ) . add ( * write_idx) as * mut __m256i , val) ;
160- let val2 = _mm256_permutevar8x32_epi32 ( vals2, key) ;
161- _mm256_storeu_si256 ( v2. as_mut_ptr ( ) . add ( * write_idx) as * mut __m256i , val2) ;
162- * write_idx += numberofnewvalues;
186+ append_filtered_vals_2 ( vals, vals2, mask, v, v2, write_idx) ;
163187 }
164188}
165189
190+ #[ cfg( target_feature = "neon" ) ]
191+ const POW1 : wide:: u32x4 = wide:: u32x4:: new ( [ 1 , 2 , 4 , 8 ] ) ;
192+ #[ cfg( target_feature = "neon" ) ]
193+ const POW2 : wide:: u32x4 = wide:: u32x4:: new ( [ 16 , 32 , 64 , 128 ] ) ;
194+ #[ cfg( target_feature = "neon" ) ]
195+ const NEW_OLD_MASK : S = S :: new ( [ !0 , !0 , !0 , !0 , !0 , !0 , !0 , 0 ] ) ;
196+ #[ cfg( target_feature = "neon" ) ]
197+ const I1 : core:: arch:: aarch64:: uint8x16_t =
198+ unsafe { transmute ( [ 0x1F1E1D1C , 0x03020100 , 0x07060504 , 0x0B0A0908 ] ) } ;
199+ #[ cfg( target_feature = "neon" ) ]
200+ const I2 : core:: arch:: aarch64:: uint8x16_t =
201+ unsafe { transmute ( [ 0x0F0E0D0C , 0x13121110 , 0x17161514 , 0x1B1A1918 ] ) } ;
202+
166203/// Append the values of `x` selected by `mask` to `v`.
167204#[ cfg( target_feature = "neon" ) ]
168205#[ inline( always) ]
169206pub unsafe fn append_filtered_vals ( vals : S , mask : S , v : & mut [ u32 ] , write_idx : & mut usize ) {
170207 unsafe {
171- use core:: arch:: aarch64:: { vaddvq_u32, vqtbl2q_u8 , vst1_u32_x4 } ;
208+ use core:: arch:: aarch64:: vaddvq_u32;
172209 use wide:: u32x4;
173210
174- const POW1 : u32x4 = u32x4:: new ( [ 1 , 2 , 4 , 8 ] ) ;
175- const POW2 : u32x4 = u32x4:: new ( [ 16 , 32 , 64 , 128 ] ) ;
211+ let ( m1, m2) : ( u32x4 , u32x4 ) = transmute ( mask) ;
212+ let m1 = vaddvq_u32 ( transmute ( m1 & POW1 ) ) ;
213+ let m2 = vaddvq_u32 ( transmute ( m2 & POW2 ) ) ;
214+ let mask = ( m1 | m2) as usize ;
215+ let numberofnewvalues = L - mask. count_ones ( ) as usize ;
216+ let key = UNIQSHUF_NEON [ mask] ;
217+ append_filtered_vals_from_key ( vals, key, v, write_idx) ;
218+ * write_idx += numberofnewvalues;
219+ }
220+ }
221+
222+ #[ cfg( target_feature = "neon" ) ]
223+ #[ inline( always) ]
224+ pub unsafe fn append_filtered_vals_2 (
225+ vals : S ,
226+ vals2 : S ,
227+ mask : S ,
228+ v : & mut [ u32 ] ,
229+ v2 : & mut [ u32 ] ,
230+ write_idx : & mut usize ,
231+ ) {
232+ unsafe {
233+ use core:: arch:: aarch64:: vaddvq_u32;
234+ use wide:: u32x4;
176235
177236 let ( m1, m2) : ( u32x4 , u32x4 ) = transmute ( mask) ;
178237 let m1 = vaddvq_u32 ( transmute ( m1 & POW1 ) ) ;
179238 let m2 = vaddvq_u32 ( transmute ( m2 & POW2 ) ) ;
180239 let mask = ( m1 | m2) as usize ;
181240 let numberofnewvalues = L - mask. count_ones ( ) as usize ;
182- let idx = UNIQSHUF_NEON [ mask] ;
183- let ( i1, i2) = transmute ( idx) ;
241+ let key = UNIQSHUF_NEON [ mask] ;
242+ append_filtered_vals_from_key ( vals, key, v, write_idx) ;
243+ append_filtered_vals_from_key ( vals2, key, v2, write_idx) ;
244+ * write_idx += numberofnewvalues;
245+ }
246+ }
247+
248+ #[ cfg( target_feature = "neon" ) ]
249+ #[ inline( always) ]
250+ unsafe fn append_filtered_vals_from_key ( vals : S , key : S , v : & mut [ u32 ] , write_idx : & mut usize ) {
251+ unsafe {
252+ use core:: arch:: aarch64:: { vqtbl2q_u8, vst1_u32_x4} ;
253+
254+ let ( i1, i2) = transmute ( key) ;
184255 let t = transmute ( vals) ;
185256 let r1 = vqtbl2q_u8 ( t, i1) ;
186257 let r2 = vqtbl2q_u8 ( t, i2) ;
187258 let val: S = transmute ( ( r1, r2) ) ;
188259 vst1_u32_x4 ( v. as_mut_ptr ( ) . add ( * write_idx) , transmute ( val) ) ;
189- * write_idx += numberofnewvalues;
190260 }
191261}
192262
@@ -208,32 +278,11 @@ pub unsafe fn append_unique_vals<const SKIP_MAX: bool>(
208278 unsafe {
209279 use core:: arch:: aarch64:: vqtbl2q_u8;
210280
211- const NEW_OLD_MASK : S = S :: new ( [
212- u32:: MAX ,
213- u32:: MAX ,
214- u32:: MAX ,
215- u32:: MAX ,
216- u32:: MAX ,
217- u32:: MAX ,
218- u32:: MAX ,
219- 0 ,
220- ] ) ;
221281 let recon = NEW_OLD_MASK . blend ( new, old) ;
222- let ( i1, i2) = transmute ( [
223- 0x1F1E1D1Cu32 ,
224- 0x03020100 ,
225- 0x07060504 ,
226- 0x0B0A0908 ,
227- 0x0F0E0D0C ,
228- 0x13121110 ,
229- 0x17161514 ,
230- 0x1B1A1918 ,
231- ] ) ;
232282 let t = transmute ( recon) ;
233- let r1 = vqtbl2q_u8 ( t, i1 ) ;
234- let r2 = vqtbl2q_u8 ( t, i2 ) ;
283+ let r1 = vqtbl2q_u8 ( t, I1 ) ;
284+ let r2 = vqtbl2q_u8 ( t, I2 ) ;
235285 let prec: S = transmute ( ( r1, r2) ) ;
236-
237286 let mut dup = prec. cmp_eq ( new) ;
238287 if SKIP_MAX {
239288 dup |= new. cmp_eq ( SIMD_SKIPPED ) ;
@@ -260,61 +309,15 @@ pub unsafe fn append_unique_vals_2(
260309 write_idx : & mut usize ,
261310) {
262311 unsafe {
263- use core:: arch:: aarch64:: { vaddvq_u32, vqtbl2q_u8, vst1_u32_x4} ;
264- use wide:: u32x4;
312+ use core:: arch:: aarch64:: vqtbl2q_u8;
265313
266- let new_old_mask = S :: new ( [
267- u32:: MAX ,
268- u32:: MAX ,
269- u32:: MAX ,
270- u32:: MAX ,
271- u32:: MAX ,
272- u32:: MAX ,
273- u32:: MAX ,
274- 0 ,
275- ] ) ;
276- let recon = new_old_mask. blend ( new, old) ;
277-
278- // let rotate_idx = S::new([7, 0, 1, 2, 3, 4, 5, 6]);
279- // let idx = rotate_idx * S::splat(0x04_04_04_04) + S::splat(0x03_02_01_00);
280- // let (i1, i2) = transmute(idx);
281- let ( i1, i2) = transmute ( [
282- 0x1F1E1D1Cu32 ,
283- 0x03020100 ,
284- 0x07060504 ,
285- 0x0B0A0908 ,
286- 0x0F0E0D0C ,
287- 0x13121110 ,
288- 0x17161514 ,
289- 0x1B1A1918 ,
290- ] ) ;
314+ let recon = NEW_OLD_MASK . blend ( new, old) ;
291315 let t = transmute ( recon) ;
292- let r1 = vqtbl2q_u8 ( t, i1 ) ;
293- let r2 = vqtbl2q_u8 ( t, i2 ) ;
316+ let r1 = vqtbl2q_u8 ( t, I1 ) ;
317+ let r2 = vqtbl2q_u8 ( t, I2 ) ;
294318 let prec: S = transmute ( ( r1, r2) ) ;
295-
296319 let dup = prec. cmp_eq ( new) ;
297- let ( d1, d2) : ( u32x4 , u32x4 ) = transmute ( dup) ;
298- let pow1 = u32x4:: new ( [ 1 , 2 , 4 , 8 ] ) ;
299- let pow2 = u32x4:: new ( [ 16 , 32 , 64 , 128 ] ) ;
300- let m1 = vaddvq_u32 ( transmute ( d1 & pow1) ) ;
301- let m2 = vaddvq_u32 ( transmute ( d2 & pow2) ) ;
302- let mask = ( m1 | m2) as usize ;
303-
304- let numberofnewvalues = L - mask. count_ones ( ) as usize ;
305- let idx = UNIQSHUF_NEON [ mask] ;
306- let ( i1, i2) = transmute ( idx) ;
307- let t = transmute ( vals) ;
308- let r1 = vqtbl2q_u8 ( t, i1) ;
309- let r2 = vqtbl2q_u8 ( t, i2) ;
310- let val: S = transmute ( ( r1, r2) ) ;
311- vst1_u32_x4 ( v. as_mut_ptr ( ) . add ( * write_idx) , transmute ( val) ) ;
312- let t = transmute ( vals2) ;
313- let r1 = vqtbl2q_u8 ( t, i1) ;
314- let r2 = vqtbl2q_u8 ( t, i2) ;
315- let val2: S = transmute ( ( r1, r2) ) ;
316- vst1_u32_x4 ( v2. as_mut_ptr ( ) . add ( * write_idx) , transmute ( val2) ) ;
317- * write_idx += numberofnewvalues;
320+ append_filtered_vals_2 ( vals, vals2, dup, v, v2, write_idx) ;
318321 }
319322}
320323
@@ -584,7 +587,7 @@ const UNIQSHUF: [S; 256] = unsafe {transmute([
584587#[ cfg( target_feature = "neon" ) ]
585588#[ allow( clippy:: erasing_op, clippy:: identity_op) ]
586589#[ rustfmt:: skip]
587- const UNIQSHUF_NEON : [ wide:: u8x32 ; 256 ] = unsafe {
590+ const UNIQSHUF_NEON : [ wide:: u32x8 ; 256 ] = unsafe {
588591const M : u32 = 0x04_04_04_04 ;
589592const O : u32 = 0x03_02_01_00 ;
590593transmute ( [
0 commit comments