Skip to content

Commit 08c30df

Browse files
committed
Use new simd system
1 parent df9c8a9 commit 08c30df

File tree

1 file changed: +58 -182 lines changed

src/day25.rs

Lines changed: 58 additions & 182 deletions
Original file line number | Diff line number | Diff line change
@@ -8,207 +8,83 @@ pub fn part1(s: &str) -> u64 {
88
unsafe { part1_inner(s) }
99
}
1010

11-
pub fn part2(_s: &str) -> u64 {
12-
// To be sure you know...
13-
42
14-
}
15-
1611
const DS: usize = 7 * 6 + 1;
1712

18-
const ALL_BITS: i8 = 0b111;
19-
const KEPT_BITS: i8 = 0b011;
20-
2113
#[inline(always)]
2214
unsafe fn part1_inner(s: &[u8]) -> u64 {
2315
let mut sum = 0;
2416

25-
static mut KEYS: [__m256i; 512] = unsafe { std::mem::transmute([0u8; 512 * 32]) };
26-
static mut HOLES: [__m256i; 512] = unsafe { std::mem::transmute([0u8; 512 * 32]) };
27-
28-
let keys = &mut *(&raw mut KEYS);
29-
let holes = &mut *(&raw mut HOLES);
30-
31-
let mut keys_i = 0;
32-
let mut holes_i = 0;
17+
let mut keys = heapless::Vec::<u64, 512>::new();
18+
let mut holes = heapless::Vec::<u64, 512>::new();
3319

3420
let mut i = 0;
3521

3622
std::hint::assert_unchecked(s.len() > 0);
3723
while i < s.len() {
3824
let is_key = *s.get_unchecked(i) == b'.';
3925

40-
let d = s
26+
let d = (s
4127
.as_ptr()
4228
.offset(i as isize + 6)
43-
.cast::<__m256i>()
44-
.read_unaligned();
45-
let d = _mm256_and_si256(
46-
d,
47-
_mm256_setr_epi8(
48-
ALL_BITS, ALL_BITS, ALL_BITS, ALL_BITS, ALL_BITS, 0, //
49-
ALL_BITS, ALL_BITS, ALL_BITS, ALL_BITS, ALL_BITS, 0, //
50-
ALL_BITS, ALL_BITS, ALL_BITS, ALL_BITS, ALL_BITS, 0, //
51-
ALL_BITS, ALL_BITS, ALL_BITS, ALL_BITS, ALL_BITS, 0, //
52-
ALL_BITS, ALL_BITS, ALL_BITS, ALL_BITS, ALL_BITS, 0, 0, 0,
53-
),
54-
);
55-
56-
if is_key {
57-
std::arch::asm!(
58-
"test {max_i}, {max_i}",
59-
"je 2f", // Jump on empty
60-
"cmp {max_i}, 1",
61-
"je 3f", // Jump to one case
62-
"mov {i}, {max_i}",
63-
"shl {i}, 5",
64-
"jmp 5f",
29+
.cast::<u64>()
30+
.read_unaligned()
31+
& 0x0101010101)
32+
+ (s.as_ptr()
33+
.offset(i as isize + 6 + 6)
34+
.cast::<u64>()
35+
.read_unaligned()
36+
& 0x0101010101)
37+
+ (s.as_ptr()
38+
.offset(i as isize + 6 + 12)
39+
.cast::<u64>()
40+
.read_unaligned()
41+
& 0x0101010101)
42+
+ (s.as_ptr()
43+
.offset(i as isize + 6 + 18)
44+
.cast::<u64>()
45+
.read_unaligned()
46+
& 0x0101010101)
47+
+ (s.as_ptr()
48+
.offset(i as isize + 6 + 24)
49+
.cast::<u64>()
50+
.read_unaligned()
51+
& 0x0101010101);
52+
53+
let other = if is_key { &holes } else { &keys };
54+
let mut j = other.len();
55+
while j >= 4 {
56+
j -= 4;
57+
let o = other
58+
.as_ptr()
59+
.offset(j as isize)
60+
.cast::<__m256i>()
61+
.read_unaligned();
62+
let s = _mm256_add_epi64(o, _mm256_set1_epi64x(d as i64));
63+
64+
let s = _mm256_movemask_epi8(s) as u32;
65+
66+
sum += ((s & 0xFF_00_00_00) == 0) as u64;
67+
sum += ((s & 0x00_FF_00_00) == 0) as u64;
68+
sum += ((s & 0x00_00_FF_00) == 0) as u64;
69+
sum += ((s & 0x00_00_00_FF) == 0) as u64;
70+
}
71+
if j > 0 {
72+
let o = other.as_ptr().cast::<__m256i>().read_unaligned();
73+
let s = _mm256_add_epi64(o, _mm256_set1_epi64x(d as i64));
6574

66-
"4:",
67-
"add {i}, -32 * 4",
68-
"vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 96]",
69-
"vpmovmskb {t}, {vt}",
70-
"cmp {t}, 1",
71-
"adc {sum}, 0",
72-
"vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 64]",
73-
"vpmovmskb {t}, {vt}",
74-
"cmp {t}, 1",
75-
"adc {sum}, 0",
76-
"vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 32]",
77-
"vpmovmskb {t}, {vt}",
78-
"cmp {t}, 1",
79-
"adc {sum}, 0",
80-
"vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i}]",
81-
"vpmovmskb {t}, {vt}",
82-
"cmp {t}, 1",
83-
"adc {sum}, 0",
84-
"5:",
85-
"cmp {i}, 96",
86-
"jg 4b", // Loop
87-
"cmp {i}, 32",
88-
"jl 2f", // Is zero
89-
"je 3f", // Is one
90-
// Is 2 or 3
75+
let s = _mm256_movemask_epi8(s) as u32;
9176

92-
"4:",
93-
"add {i}, -32 * 2",
94-
"vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 32]",
95-
"vpmovmskb {t}, {vt}",
96-
"cmp {t}, 1",
97-
"adc {sum}, 0",
98-
"vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i}]",
99-
"vpmovmskb {t}, {vt}",
100-
"cmp {t}, 1",
101-
"adc {sum}, 0",
102-
"cmp {i}, 32",
103-
"jg 4b", // Loop
104-
"jne 2f", // Is zero
77+
let s = !(!s << (3 - j) * 8);
78+
sum += ((s & 0x00_FF_00_00) == 0) as u64;
79+
sum += ((s & 0x00_00_FF_00) == 0) as u64;
80+
sum += ((s & 0x00_00_00_FF) == 0) as u64;
81+
}
10582

106-
"3:",
107-
"vpcmpeqb {vt}, {d}, ymmword ptr [{os}]",
108-
"vpmovmskb {t}, {vt}",
109-
"cmp {t}, 1",
110-
"adc {sum}, 0",
111-
"2:",
112-
os = in(reg) holes,
113-
max_i = in(reg) holes_i,
114-
d = in(ymm_reg) d,
115-
i = out(reg) _,
116-
sum = inout(reg) sum,
117-
t = out(reg) _,
118-
vt = out(ymm_reg) _,
119-
options(nostack),
120-
);
121-
let d = _mm256_and_si256(d, _mm256_set1_epi8(KEPT_BITS));
122-
let d = _mm256_or_si256(
123-
d,
124-
_mm256_setr_epi8(
125-
0, 0, 0, 0, 0, -1, //
126-
0, 0, 0, 0, 0, -1, //
127-
0, 0, 0, 0, 0, -1, //
128-
0, 0, 0, 0, 0, -1, //
129-
0, 0, 0, 0, 0, -1, -1, -1,
130-
),
131-
);
132-
*keys.get_unchecked_mut(keys_i) = d;
133-
keys_i += 1;
83+
let d = d + 0x7A7A7A7A7A;
84+
if is_key {
85+
keys.push_unchecked(d);
13486
} else {
135-
std::arch::asm!(
136-
"test {max_i}, {max_i}",
137-
"je 2f", // Jump on empty
138-
"cmp {max_i}, 1",
139-
"je 3f", // Jump to one case
140-
"mov {i}, {max_i}",
141-
"shl {i}, 5",
142-
"jmp 5f",
143-
144-
"4:",
145-
"add {i}, -32 * 4",
146-
"vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 96]",
147-
"vpmovmskb {t}, {vt}",
148-
"cmp {t}, 1",
149-
"adc {sum}, 0",
150-
"vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 64]",
151-
"vpmovmskb {t}, {vt}",
152-
"cmp {t}, 1",
153-
"adc {sum}, 0",
154-
"vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 32]",
155-
"vpmovmskb {t}, {vt}",
156-
"cmp {t}, 1",
157-
"adc {sum}, 0",
158-
"vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i}]",
159-
"vpmovmskb {t}, {vt}",
160-
"cmp {t}, 1",
161-
"adc {sum}, 0",
162-
"5:",
163-
"cmp {i}, 96",
164-
"jg 4b", // Loop
165-
"cmp {i}, 32",
166-
"jl 2f", // Is zero
167-
"je 3f", // Is one
168-
// Is 2 or 3
169-
170-
"4:",
171-
"add {i}, -32 * 2",
172-
"vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 32]",
173-
"vpmovmskb {t}, {vt}",
174-
"cmp {t}, 1",
175-
"adc {sum}, 0",
176-
"vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i}]",
177-
"vpmovmskb {t}, {vt}",
178-
"cmp {t}, 1",
179-
"adc {sum}, 0",
180-
"cmp {i}, 32",
181-
"jg 4b", // Loop
182-
"jne 2f", // Is zero
183-
184-
"3:",
185-
"vpcmpeqb {vt}, {d}, ymmword ptr [{os}]",
186-
"vpmovmskb {t}, {vt}",
187-
"cmp {t}, 1",
188-
"adc {sum}, 0",
189-
"2:",
190-
os = in(reg) keys,
191-
max_i = in(reg) keys_i,
192-
d = in(ymm_reg) d,
193-
i = out(reg) _,
194-
sum = inout(reg) sum,
195-
t = out(reg) _,
196-
vt = out(ymm_reg) _,
197-
options(nostack),
198-
);
199-
let d = _mm256_and_si256(d, _mm256_set1_epi8(KEPT_BITS));
200-
let d = _mm256_or_si256(
201-
d,
202-
_mm256_setr_epi8(
203-
0, 0, 0, 0, 0, -1, //
204-
0, 0, 0, 0, 0, -1, //
205-
0, 0, 0, 0, 0, -1, //
206-
0, 0, 0, 0, 0, -1, //
207-
0, 0, 0, 0, 0, -1, -1, -1,
208-
),
209-
);
210-
*holes.get_unchecked_mut(holes_i) = d;
211-
holes_i += 1;
87+
holes.push_unchecked(d);
21288
}
21389

21490
i += DS;

0 commit comments

Comments (0)