@@ -276,17 +276,24 @@ unsafe fn inner(s: &str, lut: &[u64; 1000]) -> u64 {
276276 0 , 0 , 100 , 1 ,
277277 ) ;
278278 let v = _mm256_madd_epi16 ( v, mul) ;
279- let com_l = _mm256_i32gather_epi32 :: < 8 > ( lut. as_ptr ( ) . cast ( ) , v) ;
280- let com_h = _mm256_i32gather_epi32 :: < 8 > ( lut. as_ptr ( ) . cast :: < i32 > ( ) . offset ( 1 ) , v) ;
281-
282- let h1 = _mm256_unpacklo_epi32 ( com_l, com_h) ;
283- let h2 = _mm256_unpackhi_epi32 ( com_l, com_h) ;
284- let v = _mm256_add_epi64 ( h1, h2) ;
285- let vs = _mm256_shuffle_epi32 :: < { ( 1 << 6 ) | ( 0 << 4 ) | ( 3 << 2 ) | 2 } > ( v) ;
286- let v = _mm256_add_epi64 ( v, vs) ;
287-
288- _mm256_extract_epi64 :: < 0 > ( v) as u64 + _mm256_extract_epi64 :: < 2 > ( v) as u64
289- // let mut p = [0u64; 256 / 64];
279+ lut[ _mm256_extract_epi32 :: < 0 > ( v) as u32 as usize ]
280+ + lut[ _mm256_extract_epi32 :: < 1 > ( v) as u32 as usize ]
281+ + lut[ _mm256_extract_epi32 :: < 4 > ( v) as u32 as usize ]
282+ + lut[ _mm256_extract_epi32 :: < 5 > ( v) as u32 as usize ]
283+ + lut[ _mm256_extract_epi32 :: < 7 > ( v) as u32 as usize ]
284+ // let com_l = _mm256_i32gather_epi32::<8>(lut.as_ptr().cast(), v);
285+ // let com_h = _mm256_i32gather_epi32::<8>(lut.as_ptr().cast::<i32>().offset(1), v);
286+
287+ // let h1 = _mm256_unpacklo_epi32(com_l, com_h);
288+ // let h2 = _mm256_unpackhi_epi32(com_l, com_h);
289+
290+ // let sum = 0;
291+ // let v = _mm256_add_epi64(h1, h2);
292+ // let vs = _mm256_shuffle_epi32::<{ (1 << 6) | (0 << 4) | (3 << 2) | 2 }>(v);
293+ // let v = _mm256_add_epi64(v, vs);
294+
295+ // _mm256_extract_epi64::<0>(v) as u64 + _mm256_extract_epi64::<2>(v) as u64
296+ // let mut p = [0u32; 256 / 32];
290297 // p.as_mut_ptr().cast::<__m256i>().write(v);
291298 // println!("{:?}", &p);
292299
0 commit comments