@@ -234,99 +234,106 @@ static LUT_P1: [u64; 10usize.pow(3)] = const {
234234static LUT_P2 : [ u64 ; 10usize . pow ( 3 ) ] =
235235 unsafe { transmute ( * include_bytes ! ( concat!( env!( "OUT_DIR" ) , "/day21.bin" ) ) ) } ;
236236
237- use std:: arch:: x86_64:: * ;
238-
239- // 029An980An179An4 56An379A
240-
241- // 029An980An179An4 56An379A
242-
243- // 029A n980 An17 9An4 56An 379A
/// Sums the `lut` entries addressed by the five three-digit codes in `s`.
///
/// The digits are taken from fixed byte offsets (0..3, 5..8, 10..13, 15..18 and
/// 20..23), i.e. an input shaped like `"029A\n980A\n179A\n456A\n379A\n"`. Each
/// digit triple is turned into its decimal value with two SIMD multiply-add
/// steps, and the five values are used as indices into `lut`.
///
/// # Safety
///
/// * A full 32 bytes are loaded from `s.as_ptr()` regardless of `s.len()`, so
///   those 32 bytes must be readable.
/// * Every digit position must hold an ASCII digit; other bytes can produce an
///   index of 1000 or more, and `lut` is indexed without bounds checks.
/// * The CPU must support every feature named in the `target_feature` attribute.
#[target_feature(enable = "avx2,bmi1,bmi2,cmpxchg16b,lzcnt,movbe,popcnt")]
unsafe fn inner(s: &str, lut: &[u64; 1000]) -> u64 {
    // Input bytes 0..24 (`n` = newline): 029An980An179An456An379A
    let v: __m256i = s.as_ptr().cast::<__m256i>().read_unaligned();
    // Saturating subtract: ASCII digits become 0..=9, bytes below b'0' become 0.
    let v = _mm256_subs_epu8(v, _mm256_set1_epi8(b'0' as i8));

    // Move input bytes 8..24 into the upper 128-bit lane so the per-lane byte
    // shuffle below can reach them. Dwords 2 and 3 receive input bytes 24..28;
    // whatever they hold is later multiplied by zero.
    // Dwords: 029A n980 ???? ???? | An17 9An4 56An 379A
    let v = _mm256_permutevar8x32_epi32(v, _mm256_setr_epi32(0, 1, 6, 6, 2, 3, 4, 5));

    // Place each digit triple in bytes 1..4 of a dword. Index 15 is a filler
    // byte; its value is irrelevant because the matching multiplier is 0.
    // _029 _980 ____ ____ | _179 _456 ____ _379
    let idx = _mm256_setr_epi8(
        15, 0, 1, 2, 15, 5, 6, 7, //
        15, 15, 15, 15, 15, 15, 15, 15, //
        15, 2, 3, 4, 15, 7, 8, 9, //
        15, 15, 15, 15, 15, 12, 13, 14,
    );
    let v = _mm256_shuffle_epi8(v, idx);

    // Pairwise byte multiply-add gives 16-bit (hundreds digit, tens*10 + ones):
    // 0, 29, 9, 80, 0, 0, 0, 0, 1, 79, 4, 56, 0, 0, 3, 79
    let mul: __m256i = _mm256_setr_epi8(
        0, 1, 10, 1, 0, 1, 10, 1, //
        0, 0, 0, 0, 0, 0, 0, 0, //
        0, 1, 10, 1, 0, 1, 10, 1, //
        0, 0, 0, 0, 0, 1, 10, 1, //
    );
    let v = _mm256_maddubs_epi16(v, mul);

    // Pairwise word multiply-add gives the final 32-bit values:
    // 29, 980, 0, 0, 179, 456, 0, 379
    let mul: __m256i = _mm256_setr_epi16(
        100, 1, 100, 1, //
        0, 0, 0, 0, //
        100, 1, 100, 1, //
        0, 0, 100, 1,
    );
    let v = _mm256_madd_epi16(v, mul);

    // The five codes sit in dwords 0, 1, 4, 5 and 7.
    *lut.get_unchecked(_mm256_extract_epi32::<0>(v) as u32 as usize)
        + *lut.get_unchecked(_mm256_extract_epi32::<1>(v) as u32 as usize)
        + *lut.get_unchecked(_mm256_extract_epi32::<4>(v) as u32 as usize)
        + *lut.get_unchecked(_mm256_extract_epi32::<5>(v) as u32 as usize)
        + *lut.get_unchecked(_mm256_extract_epi32::<7>(v) as u32 as usize)
}
321-
322237#[ aoc( day21, part1) ]
323238pub fn part1 ( s : & str ) -> u64 {
324- unsafe { inner ( s, & LUT_P1 ) }
239+ static LCPI0_0 : [ u8 ; 32 ] = [
240+ 3 , 8 , 9 , 10 , 3 , 13 , 14 , 15 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 7 , 10 , 11 , 12 , 7 , 15 , 0 , 1 , 0 , 0 , 0 , 0 ,
241+ 7 , 4 , 5 , 6 ,
242+ ] ;
243+ static LCPI0_1 : [ u8 ; 32 ] = [
244+ 48 , 48 , 48 , 48 , 48 , 48 , 48 , 48 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 48 , 48 , 48 , 48 , 48 , 48 , 48 , 48 , 0 ,
245+ 0 , 0 , 0 , 48 , 48 , 48 , 48 ,
246+ ] ;
247+ static LCPI0_2 : [ u8 ; 32 ] = [
248+ 0 , 1 , 10 , 1 , 0 , 1 , 10 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 10 , 1 , 0 , 1 , 10 , 1 , 0 , 0 , 0 , 0 , 0 ,
249+ 1 , 10 , 1 ,
250+ ] ;
251+ static LCPI0_3 : [ u16 ; 16 ] = [ 100 , 1 , 100 , 1 , 0 , 0 , 0 , 0 , 100 , 1 , 100 , 1 , 0 , 0 , 100 , 1 ] ;
252+
253+ let r: u64 ;
254+ unsafe {
255+ std:: arch:: asm!(
256+ "vpermq {ymm:y}, ymmword ptr [{s}], 99" ,
257+ "vpshufb {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_0}]" ,
258+ "vpsubusb {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_1}]" ,
259+ "vpmaddubsw {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_2}]" ,
260+ "vpmaddwd {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_3}]" ,
261+ "vmovd {c:e}, {ymm:x}" ,
262+ "vpextrd {a:e}, {ymm:x}, 1" ,
263+ "mov {a:r}, qword ptr [{lut} + 8*{a:r}]" ,
264+ "add {a:r}, qword ptr [{lut} + 8*{c:r}]" ,
265+ "vextracti128 {ymm:x}, {ymm:y}, 1" ,
266+ "vmovd {c:e}, {ymm:x}" ,
267+ "add {a:r}, qword ptr [{lut} + 8*{c:r}]" ,
268+ "vpextrd {c:e}, {ymm:x}, 1" ,
269+ "add {a:r}, qword ptr [{lut} + 8*{c:r}]" ,
270+ "vpextrd {c:e}, {ymm:x}, 3" ,
271+ "add {a:r}, qword ptr [{lut} + 8*{c:r}]" ,
272+
273+ LCPI0_0 = sym LCPI0_0 ,
274+ LCPI0_1 = sym LCPI0_1 ,
275+ LCPI0_2 = sym LCPI0_2 ,
276+ LCPI0_3 = sym LCPI0_3 ,
277+ s = in( reg) s. as_ptr( ) ,
278+ lut = in( reg) LUT_P1 . as_ptr( ) ,
279+ ymm = out( ymm_reg) _,
280+ c = out( reg) _,
281+ a = out( reg) r,
282+ options( nostack)
283+ ) ;
284+ }
285+ r
325286}
326287
327288#[ aoc( day21, part2) ]
328289pub fn part2 ( s : & str ) -> u64 {
329- unsafe { inner ( s, & LUT_P2 ) }
290+ static LCPI0_0 : [ u8 ; 32 ] = [
291+ 3 , 8 , 9 , 10 , 3 , 13 , 14 , 15 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 7 , 10 , 11 , 12 , 7 , 15 , 0 , 1 , 0 , 0 , 0 , 0 ,
292+ 7 , 4 , 5 , 6 ,
293+ ] ;
294+ static LCPI0_1 : [ u8 ; 32 ] = [
295+ 48 , 48 , 48 , 48 , 48 , 48 , 48 , 48 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 48 , 48 , 48 , 48 , 48 , 48 , 48 , 48 , 0 ,
296+ 0 , 0 , 0 , 48 , 48 , 48 , 48 ,
297+ ] ;
298+ static LCPI0_2 : [ u8 ; 32 ] = [
299+ 0 , 1 , 10 , 1 , 0 , 1 , 10 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 10 , 1 , 0 , 1 , 10 , 1 , 0 , 0 , 0 , 0 , 0 ,
300+ 1 , 10 , 1 ,
301+ ] ;
302+ static LCPI0_3 : [ u16 ; 16 ] = [ 100 , 1 , 100 , 1 , 0 , 0 , 0 , 0 , 100 , 1 , 100 , 1 , 0 , 0 , 100 , 1 ] ;
303+
304+ let r: u64 ;
305+ unsafe {
306+ std:: arch:: asm!(
307+ "vpermq {ymm}, ymmword ptr [{s}], 99" ,
308+ "vpshufb {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_0}]" ,
309+ "vpsubusb {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_1}]" ,
310+ "vpmaddubsw {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_2}]" ,
311+ "vpmaddwd {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_3}]" ,
312+ "vmovd {t:e}, {ymm:x}" ,
313+ "vpextrd {r:e}, {ymm:x}, 1" ,
314+ "mov {r:r}, qword ptr [{lut} + 8*{r:r}]" ,
315+ "add {r:r}, qword ptr [{lut} + 8*{t:r}]" ,
316+ "vextracti128 {ymm:x}, {ymm}, 1" ,
317+ "vmovd {t:e}, {ymm:x}" ,
318+ "add {r:r}, qword ptr [{lut} + 8*{t:r}]" ,
319+ "vpextrd {t:e}, {ymm:x}, 1" ,
320+ "add {r:r}, qword ptr [{lut} + 8*{t:r}]" ,
321+ "vpextrd {t:e}, {ymm:x}, 3" ,
322+ "add {r:r}, qword ptr [{lut} + 8*{t:r}]" ,
323+
324+ LCPI0_0 = sym LCPI0_0 ,
325+ LCPI0_1 = sym LCPI0_1 ,
326+ LCPI0_2 = sym LCPI0_2 ,
327+ LCPI0_3 = sym LCPI0_3 ,
328+ s = in( reg) s. as_ptr( ) ,
329+ lut = in( reg) LUT_P2 . as_ptr( ) ,
330+ r = out( reg) r,
331+ ymm = out( ymm_reg) _,
332+ t = out( reg) _,
333+ options( nostack)
334+ ) ;
335+ }
336+ r
330337}
331338
332339#[ cfg( test) ]
0 commit comments