@@ -234,106 +234,99 @@ static LUT_P1: [u64; 10usize.pow(3)] = const {
// Lookup table that `part2` hands to `inner`: 1000 `u64` entries, one per possible
// three-digit code value (000..=999).
// The contents are embedded at compile time from `$OUT_DIR/day21.bin` and reinterpreted
// as `[u64; 1000]`. `transmute` only type-checks when both sides have the same size, so
// the file must be exactly 8000 bytes; the values are read in the target's native byte
// order.
// NOTE(review): the file is presumably produced by the crate's build script — confirm.
234234static LUT_P2 : [ u64 ; 10usize . pow ( 3 ) ] =
235235 unsafe { transmute ( * include_bytes ! ( concat!( env!( "OUT_DIR" ) , "/day21.bin" ) ) ) } ;
236236
237+ use std:: arch:: x86_64:: * ;
238+
239+ // 029An980An179An4 56An379A
240+
241+ // 029An980An179An4 56An379A
242+
243+ // 029A n980 An17 9An4 56An 379A
/// Sums the lookup-table entries selected by the five three-digit codes in `s`.
///
/// `s` is expected to look like `"029A\n980A\n179A\n456A\n379A"`: five codes of the
/// form `DDDA`, separated by a single `\n`. The digit triples therefore sit at byte
/// offsets `0..3`, `5..8`, `10..13`, `15..18` and `20..23`. Each triple is converted to
/// its decimal value (0..=999) and used as an index into `lut`; the five entries are
/// added and returned.
///
/// Only the first 24 bytes of `s` influence the result. Inputs shorter than 32 bytes
/// are zero-padded before the vector load, and a padding byte behaves like the digit
/// `0` after the saturating subtraction below.
///
/// # Safety
///
/// * The CPU must support every feature named in the `#[target_feature]` attribute.
/// * Every digit position listed above that lies inside `s` must hold an ASCII digit.
///   A larger byte there can produce an index above 999, and the table lookups are
///   unchecked.
#[target_feature(enable = "avx2,bmi1,bmi2,cmpxchg16b,lzcnt,movbe,popcnt")]
unsafe fn inner(s: &str, lut: &[u64; 1000]) -> u64 {
    // The vector load always reads 32 bytes, but a five-code input is only 24-25 bytes
    // long. Load from a zero-padded copy so nothing past the end of `s` is touched.
    let mut padded = [0u8; 32];
    let len = s.len().min(padded.len());
    padded[..len].copy_from_slice(&s.as_bytes()[..len]);

    // As 32-bit words: "029A" "\n980" "A\n17" "9A\n4" "56A\n" "379A" <tail> <tail>
    // SAFETY: `padded` is a live, fully initialised 32-byte array and the read is unaligned.
    let v: __m256i = padded.as_ptr().cast::<__m256i>().read_unaligned();

    // ASCII -> digit value. The subtraction saturates, so '\n' and padding become 0
    // ('A' becomes 17, but it is never multiplied by a non-zero weight).
    let v = _mm256_subs_epu8(v, _mm256_set1_epi8(b'0' as i8));

    // `_mm256_shuffle_epi8` cannot move bytes across the two 128-bit halves, so first
    // arrange the words: low half = input bytes 0..8 plus two don't-care words,
    // high half = input bytes 8..24.
    let v = _mm256_permutevar8x32_epi32(v, _mm256_setr_epi32(0, 1, 6, 6, 2, 3, 4, 5));

    // Place every digit triple in the upper three bytes of a 32-bit slot. Index 15 is a
    // filler: whatever byte it selects is multiplied by 0 in the next step.
    //   low half : _029 _980 ____ ____
    //   high half: _179 _456 ____ _379
    let idx = _mm256_setr_epi8(
        15, 0, 1, 2, 15, 5, 6, 7, //
        15, 15, 15, 15, 15, 15, 15, 15, //
        15, 2, 3, 4, 15, 7, 8, 9, //
        15, 15, 15, 15, 15, 12, 13, 14,
    );
    let v = _mm256_shuffle_epi8(v, idx);

    // Multiply adjacent bytes by (0, 1) / (10, 1) and add each pair into 16 bits:
    //   0, 29, 9, 80, 0, 0, 0, 0 | 1, 79, 4, 56, 0, 0, 3, 79
    let mul: __m256i = _mm256_setr_epi8(
        0, 1, 10, 1, 0, 1, 10, 1, //
        0, 0, 0, 0, 0, 0, 0, 0, //
        0, 1, 10, 1, 0, 1, 10, 1, //
        0, 0, 0, 0, 0, 1, 10, 1,
    );
    let v = _mm256_maddubs_epi16(v, mul);

    // Multiply adjacent 16-bit values by (100, 1) and add each pair into 32 bits:
    //   29, 980, 0, 0 | 179, 456, 0, 379
    let mul: __m256i = _mm256_setr_epi16(
        100, 1, 100, 1, //
        0, 0, 0, 0, //
        100, 1, 100, 1, //
        0, 0, 100, 1,
    );
    let v = _mm256_madd_epi16(v, mul);

    // The five code values live in 32-bit lanes 0, 1, 4, 5 and 7.
    // SAFETY: with ASCII digits in every digit position (see `# Safety`) each lane is
    // at most 999, which is inside the 1000-entry table.
    *lut.get_unchecked(_mm256_extract_epi32::<0>(v) as u32 as usize)
        + *lut.get_unchecked(_mm256_extract_epi32::<1>(v) as u32 as usize)
        + *lut.get_unchecked(_mm256_extract_epi32::<4>(v) as u32 as usize)
        + *lut.get_unchecked(_mm256_extract_epi32::<5>(v) as u32 as usize)
        + *lut.get_unchecked(_mm256_extract_epi32::<7>(v) as u32 as usize)
}
321+
237322#[ aoc( day21, part1) ]
238323pub fn part1 ( s : & str ) -> u64 {
239- static LCPI0_0 : [ u8 ; 32 ] = [
240- 3 , 8 , 9 , 10 , 3 , 13 , 14 , 15 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 7 , 10 , 11 , 12 , 7 , 15 , 0 , 1 , 0 , 0 , 0 , 0 ,
241- 7 , 4 , 5 , 6 ,
242- ] ;
243- static LCPI0_1 : [ u8 ; 32 ] = [
244- 48 , 48 , 48 , 48 , 48 , 48 , 48 , 48 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 48 , 48 , 48 , 48 , 48 , 48 , 48 , 48 , 0 ,
245- 0 , 0 , 0 , 48 , 48 , 48 , 48 ,
246- ] ;
247- static LCPI0_2 : [ u8 ; 32 ] = [
248- 0 , 1 , 10 , 1 , 0 , 1 , 10 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 10 , 1 , 0 , 1 , 10 , 1 , 0 , 0 , 0 , 0 , 0 ,
249- 1 , 10 , 1 ,
250- ] ;
251- static LCPI0_3 : [ u16 ; 16 ] = [ 100 , 1 , 100 , 1 , 0 , 0 , 0 , 0 , 100 , 1 , 100 , 1 , 0 , 0 , 100 , 1 ] ;
252-
253- let r: u64 ;
254- unsafe {
255- std:: arch:: asm!(
256- "vpermq {ymm:y}, ymmword ptr [{s}], 99" ,
257- "vpshufb {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_0}]" ,
258- "vpsubusb {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_1}]" ,
259- "vpmaddubsw {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_2}]" ,
260- "vpmaddwd {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_3}]" ,
261- "vmovd {c:e}, {ymm:x}" ,
262- "vpextrd {a:e}, {ymm:x}, 1" ,
263- "mov {a:r}, qword ptr [{lut} + 8*{a:r}]" ,
264- "add {a:r}, qword ptr [{lut} + 8*{c:r}]" ,
265- "vextracti128 {ymm:x}, {ymm:y}, 1" ,
266- "vmovd {c:e}, {ymm:x}" ,
267- "add {a:r}, qword ptr [{lut} + 8*{c:r}]" ,
268- "vpextrd {c:e}, {ymm:x}, 1" ,
269- "add {a:r}, qword ptr [{lut} + 8*{c:r}]" ,
270- "vpextrd {c:e}, {ymm:x}, 3" ,
271- "add {a:r}, qword ptr [{lut} + 8*{c:r}]" ,
272-
273- LCPI0_0 = sym LCPI0_0 ,
274- LCPI0_1 = sym LCPI0_1 ,
275- LCPI0_2 = sym LCPI0_2 ,
276- LCPI0_3 = sym LCPI0_3 ,
277- s = in( reg) s. as_ptr( ) ,
278- lut = in( reg) LUT_P1 . as_ptr( ) ,
279- ymm = out( ymm_reg) _,
280- c = out( reg) _,
281- a = out( reg) r,
282- options( nostack)
283- ) ;
284- }
285- r
324+ unsafe { inner ( s, & LUT_P1 ) }
286325}
287326
288327#[ aoc( day21, part2) ]
289328pub fn part2 ( s : & str ) -> u64 {
290- static LCPI0_0 : [ u8 ; 32 ] = [
291- 3 , 8 , 9 , 10 , 3 , 13 , 14 , 15 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 7 , 10 , 11 , 12 , 7 , 15 , 0 , 1 , 0 , 0 , 0 , 0 ,
292- 7 , 4 , 5 , 6 ,
293- ] ;
294- static LCPI0_1 : [ u8 ; 32 ] = [
295- 48 , 48 , 48 , 48 , 48 , 48 , 48 , 48 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 48 , 48 , 48 , 48 , 48 , 48 , 48 , 48 , 0 ,
296- 0 , 0 , 0 , 48 , 48 , 48 , 48 ,
297- ] ;
298- static LCPI0_2 : [ u8 ; 32 ] = [
299- 0 , 1 , 10 , 1 , 0 , 1 , 10 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 10 , 1 , 0 , 1 , 10 , 1 , 0 , 0 , 0 , 0 , 0 ,
300- 1 , 10 , 1 ,
301- ] ;
302- static LCPI0_3 : [ u16 ; 16 ] = [ 100 , 1 , 100 , 1 , 0 , 0 , 0 , 0 , 100 , 1 , 100 , 1 , 0 , 0 , 100 , 1 ] ;
303-
304- let r: u64 ;
305- unsafe {
306- std:: arch:: asm!(
307- "vpermq {ymm}, ymmword ptr [{s}], 99" ,
308- "vpshufb {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_0}]" ,
309- "vpsubusb {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_1}]" ,
310- "vpmaddubsw {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_2}]" ,
311- "vpmaddwd {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_3}]" ,
312- "vmovd {t:e}, {ymm:x}" ,
313- "vpextrd {r:e}, {ymm:x}, 1" ,
314- "mov {r:r}, qword ptr [{lut} + 8*{r:r}]" ,
315- "add {r:r}, qword ptr [{lut} + 8*{t:r}]" ,
316- "vextracti128 {ymm:x}, {ymm}, 1" ,
317- "vmovd {t:e}, {ymm:x}" ,
318- "add {r:r}, qword ptr [{lut} + 8*{t:r}]" ,
319- "vpextrd {t:e}, {ymm:x}, 1" ,
320- "add {r:r}, qword ptr [{lut} + 8*{t:r}]" ,
321- "vpextrd {t:e}, {ymm:x}, 3" ,
322- "add {r:r}, qword ptr [{lut} + 8*{t:r}]" ,
323-
324- LCPI0_0 = sym LCPI0_0 ,
325- LCPI0_1 = sym LCPI0_1 ,
326- LCPI0_2 = sym LCPI0_2 ,
327- LCPI0_3 = sym LCPI0_3 ,
328- s = in( reg) s. as_ptr( ) ,
329- lut = in( reg) LUT_P2 . as_ptr( ) ,
330- r = out( reg) r,
331- ymm = out( ymm_reg) _,
332- t = out( reg) _,
333- options( nostack)
334- ) ;
335- }
336- r
329+ unsafe { inner ( s, & LUT_P2 ) }
337330}
338331
339332#[ cfg( test) ]
0 commit comments