Skip to content

Commit 8d226e0

Browse files
committed
Do two at once
1 parent c4621c1 commit 8d226e0

File tree

1 file changed

+61
-26
lines changed

1 file changed

+61
-26
lines changed

src/day21.rs

Lines changed: 61 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -231,9 +231,41 @@ static LUT_P1: [u64; 10usize.pow(3)] = const {
231231
lut
232232
};
233233

234+
// Pairwise-combined part-1 lookup table, built at compile time.
//
// For every pair (a, b) with 0 <= a, b < 1000, the entry at index
// `a * 1000 + b` holds `LUT_P1[a] + LUT_P1[b]`, so a single load from this
// table replaces two loads from `LUT_P1` plus an add.
// The table has 10^6 `u64` entries (8 MB of static data).
static LUT_P1_BIG: [u64; 10usize.pow(6)] = const {
235+
let mut lut = [0u64; 10usize.pow(6)];
236+
237+
// `while` loops are used because `for` is not available in const contexts.
let mut a = 0;
238+
while a < 1000 {
239+
let mut b = 0;
240+
while b < 1000 {
241+
// Index layout: `a` is the high three decimal digits, `b` the low three.
lut[a * 1000 + b] = LUT_P1[a] + LUT_P1[b];
242+
b += 1;
243+
}
244+
a += 1;
245+
}
246+
247+
lut
248+
};
249+
234250
// Part-2 lookup table with 1000 `u64` entries, embedded at compile time from
// `day21.bin` in the build's `OUT_DIR` (presumably written by the build
// script — confirm against build.rs).
//
// The raw bytes are reinterpreted with `transmute`, which only compiles when
// the file is exactly 1000 * 8 = 8000 bytes; entries are read in the target's
// native byte order.
// SAFETY: every bit pattern is a valid `u64`, so reinterpreting the byte array
// cannot produce an invalid value.
static LUT_P2: [u64; 10usize.pow(3)] =
235251
unsafe { transmute(*include_bytes!(concat!(env!("OUT_DIR"), "/day21.bin"))) };
236252

253+
// Pairwise-combined part-2 lookup table, built at compile time.
//
// For every pair (a, b) with 0 <= a, b < 1000, the entry at index
// `a * 1000 + b` holds `LUT_P2[a] + LUT_P2[b]`, so a single load from this
// table replaces two loads from `LUT_P2` plus an add.
// The table has 10^6 `u64` entries (8 MB of static data).
static LUT_P2_BIG: [u64; 10usize.pow(6)] = const {
254+
let mut lut = [0u64; 10usize.pow(6)];
255+
256+
// `while` loops are used because `for` is not available in const contexts.
let mut a = 0;
257+
while a < 1000 {
258+
let mut b = 0;
259+
while b < 1000 {
260+
// Index layout: `a` is the high three decimal digits, `b` the low three.
lut[a * 1000 + b] = LUT_P2[a] + LUT_P2[b];
261+
b += 1;
262+
}
263+
a += 1;
264+
}
265+
266+
lut
267+
};
268+
237269
#[aoc(day21, part1)]
238270
pub fn part1(s: &str) -> u64 {
239271
static LCPI0_0: [u8; 32] = [
@@ -249,6 +281,7 @@ pub fn part1(s: &str) -> u64 {
249281
1, 10, 1,
250282
];
251283
static LCPI0_3: [u16; 16] = [100, 1, 100, 1, 0, 0, 0, 0, 100, 1, 100, 1, 0, 0, 100, 1];
284+
static LCPI0_4: [u16; 16] = [1000, 1, 0, 0, 0, 0, 0, 0, 1000, 1, 0, 1, 0, 0, 0, 0];
252285

253286
let r: u64;
254287
unsafe {
@@ -258,30 +291,30 @@ pub fn part1(s: &str) -> u64 {
258291
"vpsubusb {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_1}]",
259292
"vpmaddubsw {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_2}]",
260293
"vpmaddwd {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_3}]",
294+
"vpackusdw {ymm:y}, {ymm:y}, {ymm:y}",
295+
"vpmaddwd {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_4}]",
261296
"vmovd {c:e}, {ymm:x}",
262-
"vpextrd {a:e}, {ymm:x}, 1",
263-
"mov {a:r}, qword ptr [{lut} + 8*{a:r}]",
264-
"add {a:r}, qword ptr [{lut} + 8*{c:r}]",
265297
"vextracti128 {ymm:x}, {ymm:y}, 1",
266-
"vmovd {c:e}, {ymm:x}",
298+
"vmovd {a:e}, {ymm:x}",
299+
"mov {a:r}, qword ptr [{lut} + 8*{a:r}]",
267300
"add {a:r}, qword ptr [{lut} + 8*{c:r}]",
268301
"vpextrd {c:e}, {ymm:x}, 1",
269302
"add {a:r}, qword ptr [{lut} + 8*{c:r}]",
270-
"vpextrd {c:e}, {ymm:x}, 3",
271-
"add {a:r}, qword ptr [{lut} + 8*{c:r}]",
272303

273304
LCPI0_0 = sym LCPI0_0,
274305
LCPI0_1 = sym LCPI0_1,
275306
LCPI0_2 = sym LCPI0_2,
276307
LCPI0_3 = sym LCPI0_3,
308+
LCPI0_4 = sym LCPI0_4,
277309
s = in(reg) s.as_ptr(),
278-
lut = in(reg) LUT_P1.as_ptr(),
310+
lut = in(reg) LUT_P1_BIG.as_ptr(),
279311
ymm = out(ymm_reg) _,
280312
c = out(reg) _,
281313
a = out(reg) r,
282314
options(nostack)
283315
);
284316
}
317+
285318
r
286319
}
287320

@@ -301,38 +334,40 @@ pub fn part2(s: &str) -> u64 {
301334
];
302335
static LCPI0_3: [u16; 16] = [100, 1, 100, 1, 0, 0, 0, 0, 100, 1, 100, 1, 0, 0, 100, 1];
303336

337+
static LCPI0_4: [u16; 16] = [1000, 1, 0, 0, 0, 0, 0, 0, 1000, 1, 0, 1, 0, 0, 0, 0];
338+
304339
let r: u64;
305340
unsafe {
306341
std::arch::asm!(
307-
"vpermq {ymm}, ymmword ptr [{s}], 99",
308-
"vpshufb {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_0}]",
309-
"vpsubusb {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_1}]",
310-
"vpmaddubsw {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_2}]",
311-
"vpmaddwd {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_3}]",
312-
"vmovd {t:e}, {ymm:x}",
313-
"vpextrd {r:e}, {ymm:x}, 1",
314-
"mov {r:r}, qword ptr [{lut} + 8*{r:r}]",
315-
"add {r:r}, qword ptr [{lut} + 8*{t:r}]",
316-
"vextracti128 {ymm:x}, {ymm}, 1",
317-
"vmovd {t:e}, {ymm:x}",
318-
"add {r:r}, qword ptr [{lut} + 8*{t:r}]",
319-
"vpextrd {t:e}, {ymm:x}, 1",
320-
"add {r:r}, qword ptr [{lut} + 8*{t:r}]",
321-
"vpextrd {t:e}, {ymm:x}, 3",
322-
"add {r:r}, qword ptr [{lut} + 8*{t:r}]",
342+
"vpermq {ymm:y}, ymmword ptr [{s}], 99",
343+
"vpshufb {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_0}]",
344+
"vpsubusb {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_1}]",
345+
"vpmaddubsw {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_2}]",
346+
"vpmaddwd {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_3}]",
347+
"vpackusdw {ymm:y}, {ymm:y}, {ymm:y}",
348+
"vpmaddwd {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_4}]",
349+
"vmovd {c:e}, {ymm:x}",
350+
"vextracti128 {ymm:x}, {ymm:y}, 1",
351+
"vmovd {a:e}, {ymm:x}",
352+
"mov {a:r}, qword ptr [{lut} + 8*{a:r}]",
353+
"add {a:r}, qword ptr [{lut} + 8*{c:r}]",
354+
"vpextrd {c:e}, {ymm:x}, 1",
355+
"add {a:r}, qword ptr [{lut} + 8*{c:r}]",
323356

324357
LCPI0_0 = sym LCPI0_0,
325358
LCPI0_1 = sym LCPI0_1,
326359
LCPI0_2 = sym LCPI0_2,
327360
LCPI0_3 = sym LCPI0_3,
361+
LCPI0_4 = sym LCPI0_4,
328362
s = in(reg) s.as_ptr(),
329-
lut = in(reg) LUT_P2.as_ptr(),
330-
r = out(reg) r,
363+
lut = in(reg) LUT_P2_BIG.as_ptr(),
331364
ymm = out(ymm_reg) _,
332-
t = out(reg) _,
365+
c = out(reg) _,
366+
a = out(reg) r,
333367
options(nostack)
334368
);
335369
}
370+
336371
r
337372
}
338373

0 commit comments

Comments
 (0)