Skip to content

Commit 983f86a

Browse files
committed
Use inline asm
1 parent 7c6a966 commit 983f86a

File tree

1 file changed

+67
-13
lines changed

1 file changed

+67
-13
lines changed

src/day25.rs

Lines changed: 67 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use std::{arch::x86_64::*, mem::MaybeUninit};
1+
use std::arch::x86_64::*;
22

33
use aoc_runner_derive::aoc;
44

@@ -49,12 +49,39 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
4949
);
5050

5151
if is_key {
52-
for i in 0..holes_i {
53-
let o = *holes.get_unchecked(i);
54-
let collisions = _mm256_cmpeq_epi8(d, o);
55-
let collisions = _mm256_movemask_epi8(collisions);
56-
sum += (collisions == 0) as u64;
57-
}
52+
std::arch::asm!(
53+
"test {i}, {i}",
54+
"je 2f", // Jump on empty
55+
"cmp {i}, 1",
56+
"je 3f", // Exactly one element: jump to the single-element case
57+
"shl {i}, 5",
58+
"4:",
59+
"add {i}, -64",
60+
"vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 32]",
61+
"vpmovmskb {t}, {vt}",
62+
"cmp {t}, 1",
63+
"adc {sum}, 0",
64+
"vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i}]",
65+
"vpmovmskb {t}, {vt}",
66+
"cmp {t}, 1",
67+
"adc {sum}, 0",
68+
"cmp {i}, 32",
69+
"jg 4b", // Loop
70+
"jne 2f", // Offset is zero (not 32): no odd element left over, done
71+
"3:",
72+
"vpcmpeqb {vt}, {d}, ymmword ptr [{os}]",
73+
"vpmovmskb {t}, {vt}",
74+
"cmp {t}, 1",
75+
"adc {sum}, 0",
76+
"2:",
77+
os = in(reg) holes,
78+
d = in(ymm_reg) d,
79+
i = inout(reg) holes_i => _,
80+
sum = inout(reg) sum,
81+
t = out(reg) _,
82+
vt = out(ymm_reg) _,
83+
options(nostack),
84+
);
5885
let d = _mm256_and_si256(d, _mm256_set1_epi8(KEPT_BITS));
5986
let d = _mm256_or_si256(
6087
d,
@@ -69,12 +96,39 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
6996
*keys.get_unchecked_mut(keys_i) = d;
7097
keys_i += 1;
7198
} else {
72-
for i in 0..keys_i {
73-
let o = *keys.get_unchecked(i);
74-
let collisions = _mm256_cmpeq_epi8(d, o);
75-
let collisions = _mm256_movemask_epi8(collisions);
76-
sum += (collisions == 0) as u64;
77-
}
99+
std::arch::asm!(
100+
"test {i}, {i}",
101+
"je 2f", // Jump on empty
102+
"cmp {i}, 1",
103+
"je 3f", // Exactly one element: jump to the single-element case
104+
"shl {i}, 5",
105+
"4:",
106+
"add {i}, -64",
107+
"vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 32]",
108+
"vpmovmskb {t}, {vt}",
109+
"cmp {t}, 1",
110+
"adc {sum}, 0",
111+
"vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i}]",
112+
"vpmovmskb {t}, {vt}",
113+
"cmp {t}, 1",
114+
"adc {sum}, 0",
115+
"cmp {i}, 32",
116+
"jg 4b", // Loop
117+
"jne 2f", // Offset is zero (not 32): no odd element left over, done
118+
"3:",
119+
"vpcmpeqb {vt}, {d}, ymmword ptr [{os}]",
120+
"vpmovmskb {t}, {vt}",
121+
"cmp {t}, 1",
122+
"adc {sum}, 0",
123+
"2:",
124+
os = in(reg) keys,
125+
d = in(ymm_reg) d,
126+
i = inout(reg) keys_i => _,
127+
sum = inout(reg) sum,
128+
t = out(reg) _,
129+
vt = out(ymm_reg) _,
130+
options(nostack),
131+
);
78132
let d = _mm256_and_si256(d, _mm256_set1_epi8(KEPT_BITS));
79133
let d = _mm256_or_si256(
80134
d,

0 commit comments

Comments
 (0)