Skip to content

Commit a6629b4

Browse files
committed
Try to vectorize
1 parent a5ee2d2 commit a6629b4

File tree

1 file changed

+161
-39
lines changed

1 file changed

+161
-39
lines changed

src/day22.rs

Lines changed: 161 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use core::str;
2-
use std::mem::transmute;
2+
use std::{arch::x86_64::*, mem::transmute};
33

44
use aoc_runner_derive::aoc;
55

@@ -46,70 +46,192 @@ pub fn part1(s: &str) -> u64 {
4646

4747
const SEQUENCES: usize = 18 * 18 * 18 * 18;
4848

49+
/// Computes `a % 10` independently for each of the eight 32-bit lanes of `a`,
/// treating every lane as an unsigned integer.
///
/// # Safety
/// Uses `_mm256_*` integer intrinsics; the caller must only invoke this on a
/// CPU where those instructions are available.
#[inline(always)]
50+
unsafe fn vmod10(a: __m256i) -> __m256i {
51+
// Division by a constant done as a multiply-high, the same lowering LLVM
// emits for `x / 10`: 3435973837 (0xCCCCCCCD) is 2^35 / 10 rounded up, so
// `(x * 3435973837) >> 35` equals `x / 10`.
52+
// `_mm256_mul_epu32` only multiplies the even 32-bit lanes (0, 2, 4, 6),
// producing full 64-bit products.
let prod02 = _mm256_mul_epu32(a, _mm256_set1_epi32(3435973837u32 as i32));
53+
// Shuffle 0xf5 copies the odd lanes (1, 3, ...) into the even positions so
// the same multiply covers lanes 1, 3, 5, 7.
let prod13 = _mm256_mul_epu32(
54+
_mm256_shuffle_epi32::<0xf5>(a),
55+
_mm256_set1_epi32(3435973837u32 as i32),
56+
);
57+
// Re-interleave the two product vectors and keep only the upper 32 bits of
// each 64-bit product, restoring the original lane order.
let d = _mm256_unpackhi_epi64(
58+
_mm256_unpacklo_epi32(prod02, prod13),
59+
_mm256_unpackhi_epi32(prod02, prod13),
60+
);
61+
62+
// The upper half is already `>> 32`; three more bits completes `>> 35`,
// leaving the quotient `a / 10`.
let d = _mm256_srli_epi32::<3>(d);
63+
// remainder = a - (a / 10) * 10
let c = _mm256_mullo_epi32(d, _mm256_set1_epi32(10));
64+
_mm256_sub_epi32(a, c)
65+
}
66+
67+
/// Computes `a % 104976` independently for each of the eight 32-bit lanes of
/// `a`, treating every lane as an unsigned integer. 104976 is 18^4, the value
/// of `SEQUENCES`.
///
/// # Safety
/// Uses `_mm256_*` integer intrinsics; the caller must only invoke this on a
/// CPU where those instructions are available.
#[inline(always)]
68+
unsafe fn vmod104976(a: __m256i) -> __m256i {
69+
// Division by a constant done as a multiply-high, the same lowering LLVM
// emits for `x / 104976`: 2681326939 is 2^48 / 104976 rounded up, so
// `(x * 2681326939) >> 48` equals `x / 104976`.
70+
// `_mm256_mul_epu32` only multiplies the even 32-bit lanes (0, 2, 4, 6),
// producing full 64-bit products.
let prod02 = _mm256_mul_epu32(a, _mm256_set1_epi32(2681326939u32 as i32));
71+
// Shuffle 0xf5 copies the odd lanes (1, 3, ...) into the even positions so
// the same multiply covers lanes 1, 3, 5, 7.
let prod13 = _mm256_mul_epu32(
72+
_mm256_shuffle_epi32::<0xf5>(a),
73+
_mm256_set1_epi32(2681326939u32 as i32),
74+
);
75+
// Re-interleave the two product vectors and keep only the upper 32 bits of
// each 64-bit product, restoring the original lane order.
let d = _mm256_unpackhi_epi64(
76+
_mm256_unpacklo_epi32(prod02, prod13),
77+
_mm256_unpackhi_epi32(prod02, prod13),
78+
);
79+
80+
// The upper half is already `>> 32`; sixteen more bits completes `>> 48`,
// leaving the quotient `a / 104976`.
let d = _mm256_srli_epi32::<16>(d);
81+
// remainder = a - (a / 104976) * 104976
let c = _mm256_mullo_epi32(d, _mm256_set1_epi32(104976));
82+
_mm256_sub_epi32(a, c)
83+
}
84+
85+
// Eight "already counted" tables, one per SIMD lane, each indexed by the
// base-18 key built from the last four price differences (`SEQUENCES` = 18^4
// = 104976 entries). `part2` zero-fills all of them on entry and then writes
// the tag `monky + lane` into `DONE[lane][key]` the first time a key is seen
// for the number currently held in that lane, so each key contributes at most
// one price per input number.
// NOTE(review): this is a `static mut` reached via `unsafe { &mut DONE }` in
// `part2`; nothing here synchronizes access, so it presumably relies on
// `part2` never running concurrently — confirm.
static mut DONE: [[u16; 104976]; 8] = [[0u16; SEQUENCES]; 8];
86+
4987
#[aoc(day22, part2)]
50-
pub fn part2(s: &str) -> i16 {
88+
pub fn part2(s: &str) -> i32 {
5189
let s = s.as_bytes();
5290

5391
let mut sequences = [0; SEQUENCES];
54-
let mut done = [0u16; SEQUENCES];
92+
let done = unsafe { &mut DONE };
93+
94+
for j in 0..8 {
95+
done[j].fill(0);
96+
}
5597

5698
let mut i = 0;
5799
let mut monky = 1;
58100
unsafe {
59101
while i < s.len() {
60-
#[cfg(not(test))]
61-
let mut sn = (*s.get_unchecked(i + 0) as u32) * 100000
62-
+ (*s.get_unchecked(i + 1) as u32) * 10000
63-
+ (*s.get_unchecked(i + 2) as u32) * 1000
64-
+ (*s.get_unchecked(i + 3) as u32) * 100
65-
+ (*s.get_unchecked(i + 4) as u32) * 10
66-
+ (*s.get_unchecked(i + 5) as u32) * 1
67-
- (b'0' as u32 * 111_111);
68-
#[cfg(not(test))]
69-
{
70-
i += 6;
71-
}
102+
let mut sns: __m256i = _mm256_setzero_si256();
103+
let mut sns_len = 0;
104+
while i < s.len() && sns_len < 8 {
105+
#[cfg(not(test))]
106+
let mut sn = (*s.get_unchecked(i + 0) as u32) * 100000
107+
+ (*s.get_unchecked(i + 1) as u32) * 10000
108+
+ (*s.get_unchecked(i + 2) as u32) * 1000
109+
+ (*s.get_unchecked(i + 3) as u32) * 100
110+
+ (*s.get_unchecked(i + 4) as u32) * 10
111+
+ (*s.get_unchecked(i + 5) as u32) * 1
112+
- (b'0' as u32 * 111_111);
113+
#[cfg(not(test))]
114+
{
115+
i += 6;
116+
}
72117

73-
#[cfg(test)]
74-
let mut sn = 0;
75-
while *s.get_unchecked(i) != b'\n' {
76-
sn *= 10;
77-
sn += (s.get_unchecked(i) - b'0') as u32;
118+
#[cfg(test)]
119+
let mut sn = 0;
120+
while *s.get_unchecked(i) != b'\n' {
121+
sn *= 10;
122+
sn += (s.get_unchecked(i) - b'0') as u32;
123+
i += 1;
124+
}
78125
i += 1;
126+
sns = _mm256_permutevar8x32_epi32(sns, _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6));
127+
sns = _mm256_blend_epi32::<1>(sns, _mm256_set1_epi32(sn as i32));
128+
sns_len += 1;
79129
}
80-
i += 1;
81130

82-
let mut diffs = 0;
83-
let mut prev = sn % 10;
131+
let mut diffs = _mm256_setzero_si256();
132+
let mut prev = vmod10(sns);
84133

85134
for _ in 0..3 {
86-
sn = ((sn as u64 * 64) % MAX as u64) as u32 ^ sn;
87-
sn = (sn / 32) ^ sn;
88-
sn = ((sn as u64 * 2048) % MAX as u64) as u32 ^ sn;
89-
let price = sn % 10;
90-
let diff = price + 9 - prev;
91-
diffs = diffs * 18 + diff;
135+
let i = _mm256_slli_epi32::<6>(sns);
136+
let i = _mm256_and_si256(i, _mm256_set1_epi32(16777152));
137+
sns = _mm256_xor_si256(i, sns);
138+
139+
let i = _mm256_srli_epi32::<5>(sns);
140+
sns = _mm256_xor_si256(i, sns);
141+
let i = _mm256_slli_epi32::<11>(sns);
142+
let i = _mm256_and_si256(i, _mm256_set1_epi32(16777152));
143+
sns = _mm256_xor_si256(i, sns);
144+
145+
let price = vmod10(sns);
146+
let diff = _mm256_sub_epi32(_mm256_add_epi32(price, _mm256_set1_epi32(9)), prev);
147+
diffs = _mm256_add_epi32(_mm256_mullo_epi32(diffs, _mm256_set1_epi32(18)), diff);
92148

93149
prev = price;
94150
}
95151

96152
for _ in 4..2000 {
97-
sn = ((sn as u64 * 64) % MAX as u64) as u32 ^ sn;
98-
sn = (sn / 32) ^ sn;
99-
sn = ((sn as u64 * 2048) % MAX as u64) as u32 ^ sn;
100-
let price = sn % 10;
101-
let diff = price + 9 - prev;
102-
diffs = (diffs * 18 + diff) % SEQUENCES as u32;
153+
let i = _mm256_slli_epi32::<6>(sns);
154+
let i = _mm256_and_si256(i, _mm256_set1_epi32(16777152));
155+
sns = _mm256_xor_si256(i, sns);
156+
let i = _mm256_srli_epi32::<5>(sns);
157+
sns = _mm256_xor_si256(i, sns);
158+
let i = _mm256_slli_epi32::<11>(sns);
159+
let i = _mm256_and_si256(i, _mm256_set1_epi32(16777152));
160+
sns = _mm256_xor_si256(i, sns);
161+
162+
let price = vmod10(sns);
163+
let diff = _mm256_sub_epi32(_mm256_add_epi32(price, _mm256_set1_epi32(9)), prev);
164+
diffs = _mm256_add_epi32(_mm256_mullo_epi32(diffs, _mm256_set1_epi32(18)), diff);
165+
diffs = vmod104976(diffs);
166+
167+
let diff_i = _mm256_extract_epi32::<0>(diffs) as usize;
168+
std::hint::assert_unchecked(diff_i < SEQUENCES);
169+
if done[0][diff_i] != monky + 0 {
170+
let price = _mm256_extract_epi32::<0>(price);
171+
sequences[diff_i] += price;
172+
173+
done[0][diff_i] = monky + 0;
174+
}
175+
let diff_i = _mm256_extract_epi32::<1>(diffs) as usize;
176+
std::hint::assert_unchecked(diff_i < SEQUENCES);
177+
if done[1][diff_i] != monky + 1 {
178+
let price = _mm256_extract_epi32::<1>(price);
179+
sequences[diff_i] += price;
103180

104-
if done[diffs as usize] != monky {
105-
sequences[diffs as usize] += price as i16;
181+
done[1][diff_i] = monky + 1;
182+
}
183+
let diff_i = _mm256_extract_epi32::<2>(diffs) as usize;
184+
std::hint::assert_unchecked(diff_i < SEQUENCES);
185+
if done[2][diff_i] != monky + 2 {
186+
let price = _mm256_extract_epi32::<2>(price);
187+
sequences[diff_i] += price;
188+
189+
done[2][diff_i] = monky + 2;
190+
}
191+
let diff_i = _mm256_extract_epi32::<3>(diffs) as usize;
192+
std::hint::assert_unchecked(diff_i < SEQUENCES);
193+
if done[3][diff_i] != monky + 3 {
194+
let price = _mm256_extract_epi32::<3>(price);
195+
sequences[diff_i] += price;
196+
197+
done[3][diff_i] = monky + 3;
198+
}
199+
let diff_i = _mm256_extract_epi32::<4>(diffs) as usize;
200+
std::hint::assert_unchecked(diff_i < SEQUENCES);
201+
if done[4][diff_i] != monky + 4 {
202+
let price = _mm256_extract_epi32::<4>(price);
203+
sequences[diff_i] += price;
204+
205+
done[4][diff_i] = monky + 4;
206+
}
207+
let diff_i = _mm256_extract_epi32::<5>(diffs) as usize;
208+
std::hint::assert_unchecked(diff_i < SEQUENCES);
209+
if done[5][diff_i] != monky + 5 {
210+
let price = _mm256_extract_epi32::<5>(price);
211+
sequences[diff_i] += price;
212+
213+
done[5][diff_i] = monky + 5;
214+
}
215+
let diff_i = _mm256_extract_epi32::<6>(diffs) as usize;
216+
std::hint::assert_unchecked(diff_i < SEQUENCES);
217+
if done[6][diff_i] != monky + 6 {
218+
let price = _mm256_extract_epi32::<6>(price);
219+
sequences[diff_i] += price;
220+
221+
done[6][diff_i] = monky + 6;
222+
}
223+
let diff_i = _mm256_extract_epi32::<7>(diffs) as usize;
224+
std::hint::assert_unchecked(diff_i < SEQUENCES);
225+
if done[7][diff_i] != monky + 7 {
226+
let price = _mm256_extract_epi32::<7>(price);
227+
sequences[diff_i] += price;
106228

107-
done[diffs as usize] = monky;
229+
done[7][diff_i] = monky + 7;
108230
}
109231

110232
prev = price;
111233
}
112-
monky += 1;
234+
monky += 8;
113235
}
114236

115237
sequences.into_iter().max().unwrap_unchecked()

0 commit comments

Comments
 (0)