@@ -231,9 +231,41 @@ static LUT_P1: [u64; 10usize.pow(3)] = const {
231231 lut
232232} ;
233233
234+ static LUT_P1_BIG : [ u64 ; 10usize . pow ( 6 ) ] = const {
235+ let mut lut = [ 0u64 ; 10usize . pow ( 6 ) ] ;
236+
237+ let mut a = 0 ;
238+ while a < 1000 {
239+ let mut b = 0 ;
240+ while b < 1000 {
241+ lut[ a * 1000 + b] = LUT_P1 [ a] + LUT_P1 [ b] ;
242+ b += 1 ;
243+ }
244+ a += 1 ;
245+ }
246+
247+ lut
248+ } ;
249+
234250static LUT_P2 : [ u64 ; 10usize . pow ( 3 ) ] =
235251 unsafe { transmute ( * include_bytes ! ( concat!( env!( "OUT_DIR" ) , "/day21.bin" ) ) ) } ;
236252
253+ static LUT_P2_BIG : [ u64 ; 10usize . pow ( 6 ) ] = const {
254+ let mut lut = [ 0u64 ; 10usize . pow ( 6 ) ] ;
255+
256+ let mut a = 0 ;
257+ while a < 1000 {
258+ let mut b = 0 ;
259+ while b < 1000 {
260+ lut[ a * 1000 + b] = LUT_P2 [ a] + LUT_P2 [ b] ;
261+ b += 1 ;
262+ }
263+ a += 1 ;
264+ }
265+
266+ lut
267+ } ;
268+
237269#[ aoc( day21, part1) ]
238270pub fn part1 ( s : & str ) -> u64 {
239271 static LCPI0_0 : [ u8 ; 32 ] = [
@@ -249,6 +281,7 @@ pub fn part1(s: &str) -> u64 {
249281 1 , 10 , 1 ,
250282 ] ;
251283 static LCPI0_3 : [ u16 ; 16 ] = [ 100 , 1 , 100 , 1 , 0 , 0 , 0 , 0 , 100 , 1 , 100 , 1 , 0 , 0 , 100 , 1 ] ;
284+ static LCPI0_4 : [ u16 ; 16 ] = [ 1000 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 1000 , 1 , 0 , 1 , 0 , 0 , 0 , 0 ] ;
252285
253286 let r: u64 ;
254287 unsafe {
@@ -258,30 +291,30 @@ pub fn part1(s: &str) -> u64 {
258291 "vpsubusb {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_1}]" ,
259292 "vpmaddubsw {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_2}]" ,
260293 "vpmaddwd {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_3}]" ,
294+ "vpackusdw {ymm:y}, {ymm:y}, {ymm:y}" ,
295+ "vpmaddwd {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_4}]" ,
261296 "vmovd {c:e}, {ymm:x}" ,
262- "vpextrd {a:e}, {ymm:x}, 1" ,
263- "mov {a:r}, qword ptr [{lut} + 8*{a:r}]" ,
264- "add {a:r}, qword ptr [{lut} + 8*{c:r}]" ,
265297 "vextracti128 {ymm:x}, {ymm:y}, 1" ,
266- "vmovd {c:e}, {ymm:x}" ,
298+ "vmovd {a:e}, {ymm:x}" ,
299+ "mov {a:r}, qword ptr [{lut} + 8*{a:r}]" ,
267300 "add {a:r}, qword ptr [{lut} + 8*{c:r}]" ,
268301 "vpextrd {c:e}, {ymm:x}, 1" ,
269302 "add {a:r}, qword ptr [{lut} + 8*{c:r}]" ,
270- "vpextrd {c:e}, {ymm:x}, 3" ,
271- "add {a:r}, qword ptr [{lut} + 8*{c:r}]" ,
272303
273304 LCPI0_0 = sym LCPI0_0 ,
274305 LCPI0_1 = sym LCPI0_1 ,
275306 LCPI0_2 = sym LCPI0_2 ,
276307 LCPI0_3 = sym LCPI0_3 ,
308+ LCPI0_4 = sym LCPI0_4 ,
277309 s = in( reg) s. as_ptr( ) ,
278- lut = in( reg) LUT_P1 . as_ptr( ) ,
310+ lut = in( reg) LUT_P1_BIG . as_ptr( ) ,
279311 ymm = out( ymm_reg) _,
280312 c = out( reg) _,
281313 a = out( reg) r,
282314 options( nostack)
283315 ) ;
284316 }
317+
285318 r
286319}
287320
@@ -301,38 +334,40 @@ pub fn part2(s: &str) -> u64 {
301334 ] ;
302335 static LCPI0_3 : [ u16 ; 16 ] = [ 100 , 1 , 100 , 1 , 0 , 0 , 0 , 0 , 100 , 1 , 100 , 1 , 0 , 0 , 100 , 1 ] ;
303336
337+ static LCPI0_4 : [ u16 ; 16 ] = [ 1000 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 1000 , 1 , 0 , 1 , 0 , 0 , 0 , 0 ] ;
338+
304339 let r: u64 ;
305340 unsafe {
306341 std:: arch:: asm!(
307- "vpermq {ymm}, ymmword ptr [{s}], 99" ,
308- "vpshufb {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_0}]" ,
309- "vpsubusb {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_1}]" ,
310- "vpmaddubsw {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_2}]" ,
311- "vpmaddwd {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_3}]" ,
312- "vmovd {t:e}, {ymm:x}" ,
313- "vpextrd {r:e}, {ymm:x}, 1" ,
314- "mov {r:r}, qword ptr [{lut} + 8*{r:r}]" ,
315- "add {r:r}, qword ptr [{lut} + 8*{t:r}]" ,
316- "vextracti128 {ymm:x}, {ymm}, 1" ,
317- "vmovd {t:e}, {ymm:x}" ,
318- "add {r:r}, qword ptr [{lut} + 8*{t:r}]" ,
319- "vpextrd {t:e}, {ymm:x}, 1" ,
320- "add {r:r}, qword ptr [{lut} + 8*{t:r}]" ,
321- "vpextrd {t:e}, {ymm:x}, 3" ,
322- "add {r:r}, qword ptr [{lut} + 8*{t:r}]" ,
342+ "vpermq {ymm:y}, ymmword ptr [{s}], 99" ,
343+ "vpshufb {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_0}]" ,
344+ "vpsubusb {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_1}]" ,
345+ "vpmaddubsw {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_2}]" ,
346+ "vpmaddwd {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_3}]" ,
347+ "vpackusdw {ymm:y}, {ymm:y}, {ymm:y}" ,
348+ "vpmaddwd {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_4}]" ,
349+ "vmovd {c:e}, {ymm:x}" ,
350+ "vextracti128 {ymm:x}, {ymm:y}, 1" ,
351+ "vmovd {a:e}, {ymm:x}" ,
352+ "mov {a:r}, qword ptr [{lut} + 8*{a:r}]" ,
353+ "add {a:r}, qword ptr [{lut} + 8*{c:r}]" ,
354+ "vpextrd {c:e}, {ymm:x}, 1" ,
355+ "add {a:r}, qword ptr [{lut} + 8*{c:r}]" ,
323356
324357 LCPI0_0 = sym LCPI0_0 ,
325358 LCPI0_1 = sym LCPI0_1 ,
326359 LCPI0_2 = sym LCPI0_2 ,
327360 LCPI0_3 = sym LCPI0_3 ,
361+ LCPI0_4 = sym LCPI0_4 ,
328362 s = in( reg) s. as_ptr( ) ,
329- lut = in( reg) LUT_P2 . as_ptr( ) ,
330- r = out( reg) r,
363+ lut = in( reg) LUT_P2_BIG . as_ptr( ) ,
331364 ymm = out( ymm_reg) _,
332- t = out( reg) _,
365+ c = out( reg) _,
366+ a = out( reg) r,
333367 options( nostack)
334368 ) ;
335369 }
370+
336371 r
337372}
338373
0 commit comments