|
1 | 1 | use core::str; |
2 | | -use std::mem::transmute; |
| 2 | +use std::{arch::x86_64::*, mem::transmute}; |
3 | 3 |
|
4 | 4 | use aoc_runner_derive::aoc; |
5 | 5 |
|
@@ -46,70 +46,192 @@ pub fn part1(s: &str) -> u64 { |
46 | 46 |
|
47 | 47 | const SEQUENCES: usize = 18 * 18 * 18 * 18; |
48 | 48 |
|
/// Computes `x % 10` independently for each of the eight unsigned 32-bit lanes of `a`.
///
/// AVX2 has no integer division instruction, so the quotient is obtained with the
/// multiply-by-reciprocal trick: for every `u32` value,
/// `x / 10 == (x * 0xCCCC_CCCD) >> 35`. The remainder is then `x - 10 * (x / 10)`.
///
/// # Safety
///
/// The executing CPU must support AVX2.
#[inline(always)]
unsafe fn vmod10(a: __m256i) -> __m256i {
    // ceil(2^35 / 10); the `as i32` cast only reinterprets the bit pattern.
    const RECIPROCAL: u32 = 0xCCCC_CCCD;
    let reciprocal = _mm256_set1_epi32(RECIPROCAL as i32);

    // `_mm256_mul_epu32` multiplies only the even 32-bit lanes, producing full
    // 64-bit products. The 0xf5 shuffle copies each odd lane into the even slot
    // next to it so a second multiply covers lanes 1, 3, 5 and 7.
    let even_products = _mm256_mul_epu32(a, reciprocal);
    let odd_products = _mm256_mul_epu32(_mm256_shuffle_epi32::<0xf5>(a), reciprocal);

    // Within each 128-bit half the unpacks interleave to [lo0, lo1, hi0, hi1] and
    // [lo2, lo3, hi2, hi3]; taking the upper 64 bits of both leaves
    // [hi0, hi1, hi2, hi3], i.e. `(x * RECIPROCAL) >> 32` in original lane order.
    let high_halves = _mm256_unpackhi_epi64(
        _mm256_unpacklo_epi32(even_products, odd_products),
        _mm256_unpackhi_epi32(even_products, odd_products),
    );

    // The remaining shift by 3 completes the total shift of 35 and yields x / 10.
    let quotient = _mm256_srli_epi32::<3>(high_halves);
    let multiple = _mm256_mullo_epi32(quotient, _mm256_set1_epi32(10));
    _mm256_sub_epi32(a, multiple)
}
| 66 | + |
/// Computes `x % 104976` independently for each of the eight unsigned 32-bit lanes of `a`.
///
/// 104976 is `18 * 18 * 18 * 18`. AVX2 has no integer division instruction, so the
/// quotient is obtained with the multiply-by-reciprocal trick: for every `u32`
/// value, `x / 104976 == (x * 2_681_326_939) >> 48`. The remainder is then
/// `x - 104976 * (x / 104976)`.
///
/// # Safety
///
/// The executing CPU must support AVX2.
#[inline(always)]
unsafe fn vmod104976(a: __m256i) -> __m256i {
    // ceil(2^48 / 104976); the `as i32` cast only reinterprets the bit pattern.
    const RECIPROCAL: u32 = 2_681_326_939;
    const DIVISOR: i32 = 104_976;
    let reciprocal = _mm256_set1_epi32(RECIPROCAL as i32);

    // `_mm256_mul_epu32` multiplies only the even 32-bit lanes, producing full
    // 64-bit products. The 0xf5 shuffle copies each odd lane into the even slot
    // next to it so a second multiply covers lanes 1, 3, 5 and 7.
    let even_products = _mm256_mul_epu32(a, reciprocal);
    let odd_products = _mm256_mul_epu32(_mm256_shuffle_epi32::<0xf5>(a), reciprocal);

    // Within each 128-bit half the unpacks interleave to [lo0, lo1, hi0, hi1] and
    // [lo2, lo3, hi2, hi3]; taking the upper 64 bits of both leaves
    // [hi0, hi1, hi2, hi3], i.e. `(x * RECIPROCAL) >> 32` in original lane order.
    let high_halves = _mm256_unpackhi_epi64(
        _mm256_unpacklo_epi32(even_products, odd_products),
        _mm256_unpackhi_epi32(even_products, odd_products),
    );

    // The remaining shift by 16 completes the total shift of 48 and yields x / 104976.
    let quotient = _mm256_srli_epi32::<16>(high_halves);
    let multiple = _mm256_mullo_epi32(quotient, _mm256_set1_epi32(DIVISOR));
    _mm256_sub_epi32(a, multiple)
}
| 84 | + |
| 85 | +static mut DONE: [[u16; 104976]; 8] = [[0u16; SEQUENCES]; 8]; |
| 86 | + |
49 | 87 | #[aoc(day22, part2)] |
50 | | -pub fn part2(s: &str) -> i16 { |
| 88 | +pub fn part2(s: &str) -> i32 { |
51 | 89 | let s = s.as_bytes(); |
52 | 90 |
|
53 | 91 | let mut sequences = [0; SEQUENCES]; |
54 | | - let mut done = [0u16; SEQUENCES]; |
| 92 | + let done = unsafe { &mut DONE }; |
| 93 | + |
| 94 | + for j in 0..8 { |
| 95 | + done[j].fill(0); |
| 96 | + } |
55 | 97 |
|
56 | 98 | let mut i = 0; |
57 | 99 | let mut monky = 1; |
58 | 100 | unsafe { |
59 | 101 | while i < s.len() { |
60 | | - #[cfg(not(test))] |
61 | | - let mut sn = (*s.get_unchecked(i + 0) as u32) * 100000 |
62 | | - + (*s.get_unchecked(i + 1) as u32) * 10000 |
63 | | - + (*s.get_unchecked(i + 2) as u32) * 1000 |
64 | | - + (*s.get_unchecked(i + 3) as u32) * 100 |
65 | | - + (*s.get_unchecked(i + 4) as u32) * 10 |
66 | | - + (*s.get_unchecked(i + 5) as u32) * 1 |
67 | | - - (b'0' as u32 * 111_111); |
68 | | - #[cfg(not(test))] |
69 | | - { |
70 | | - i += 6; |
71 | | - } |
| 102 | + let mut sns: __m256i = _mm256_setzero_si256(); |
| 103 | + let mut sns_len = 0; |
| 104 | + while i < s.len() && sns_len < 8 { |
| 105 | + #[cfg(not(test))] |
| 106 | + let mut sn = (*s.get_unchecked(i + 0) as u32) * 100000 |
| 107 | + + (*s.get_unchecked(i + 1) as u32) * 10000 |
| 108 | + + (*s.get_unchecked(i + 2) as u32) * 1000 |
| 109 | + + (*s.get_unchecked(i + 3) as u32) * 100 |
| 110 | + + (*s.get_unchecked(i + 4) as u32) * 10 |
| 111 | + + (*s.get_unchecked(i + 5) as u32) * 1 |
| 112 | + - (b'0' as u32 * 111_111); |
| 113 | + #[cfg(not(test))] |
| 114 | + { |
| 115 | + i += 6; |
| 116 | + } |
72 | 117 |
|
73 | | - #[cfg(test)] |
74 | | - let mut sn = 0; |
75 | | - while *s.get_unchecked(i) != b'\n' { |
76 | | - sn *= 10; |
77 | | - sn += (s.get_unchecked(i) - b'0') as u32; |
| 118 | + #[cfg(test)] |
| 119 | + let mut sn = 0; |
| 120 | + while *s.get_unchecked(i) != b'\n' { |
| 121 | + sn *= 10; |
| 122 | + sn += (s.get_unchecked(i) - b'0') as u32; |
| 123 | + i += 1; |
| 124 | + } |
78 | 125 | i += 1; |
| 126 | + sns = _mm256_permutevar8x32_epi32(sns, _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6)); |
| 127 | + sns = _mm256_blend_epi32::<1>(sns, _mm256_set1_epi32(sn as i32)); |
| 128 | + sns_len += 1; |
79 | 129 | } |
80 | | - i += 1; |
81 | 130 |
|
82 | | - let mut diffs = 0; |
83 | | - let mut prev = sn % 10; |
| 131 | + let mut diffs = _mm256_setzero_si256(); |
| 132 | + let mut prev = vmod10(sns); |
84 | 133 |
|
85 | 134 | for _ in 0..3 { |
86 | | - sn = ((sn as u64 * 64) % MAX as u64) as u32 ^ sn; |
87 | | - sn = (sn / 32) ^ sn; |
88 | | - sn = ((sn as u64 * 2048) % MAX as u64) as u32 ^ sn; |
89 | | - let price = sn % 10; |
90 | | - let diff = price + 9 - prev; |
91 | | - diffs = diffs * 18 + diff; |
| 135 | + let i = _mm256_slli_epi32::<6>(sns); |
| 136 | + let i = _mm256_and_si256(i, _mm256_set1_epi32(16777152)); |
| 137 | + sns = _mm256_xor_si256(i, sns); |
| 138 | + |
| 139 | + let i = _mm256_srli_epi32::<5>(sns); |
| 140 | + sns = _mm256_xor_si256(i, sns); |
| 141 | + let i = _mm256_slli_epi32::<11>(sns); |
| 142 | + let i = _mm256_and_si256(i, _mm256_set1_epi32(16777152)); |
| 143 | + sns = _mm256_xor_si256(i, sns); |
| 144 | + |
| 145 | + let price = vmod10(sns); |
| 146 | + let diff = _mm256_sub_epi32(_mm256_add_epi32(price, _mm256_set1_epi32(9)), prev); |
| 147 | + diffs = _mm256_add_epi32(_mm256_mullo_epi32(diffs, _mm256_set1_epi32(18)), diff); |
92 | 148 |
|
93 | 149 | prev = price; |
94 | 150 | } |
95 | 151 |
|
96 | 152 | for _ in 4..2000 { |
97 | | - sn = ((sn as u64 * 64) % MAX as u64) as u32 ^ sn; |
98 | | - sn = (sn / 32) ^ sn; |
99 | | - sn = ((sn as u64 * 2048) % MAX as u64) as u32 ^ sn; |
100 | | - let price = sn % 10; |
101 | | - let diff = price + 9 - prev; |
102 | | - diffs = (diffs * 18 + diff) % SEQUENCES as u32; |
| 153 | + let i = _mm256_slli_epi32::<6>(sns); |
| 154 | + let i = _mm256_and_si256(i, _mm256_set1_epi32(16777152)); |
| 155 | + sns = _mm256_xor_si256(i, sns); |
| 156 | + let i = _mm256_srli_epi32::<5>(sns); |
| 157 | + sns = _mm256_xor_si256(i, sns); |
| 158 | + let i = _mm256_slli_epi32::<11>(sns); |
| 159 | + let i = _mm256_and_si256(i, _mm256_set1_epi32(16777152)); |
| 160 | + sns = _mm256_xor_si256(i, sns); |
| 161 | + |
| 162 | + let price = vmod10(sns); |
| 163 | + let diff = _mm256_sub_epi32(_mm256_add_epi32(price, _mm256_set1_epi32(9)), prev); |
| 164 | + diffs = _mm256_add_epi32(_mm256_mullo_epi32(diffs, _mm256_set1_epi32(18)), diff); |
| 165 | + diffs = vmod104976(diffs); |
| 166 | + |
| 167 | + let diff_i = _mm256_extract_epi32::<0>(diffs) as usize; |
| 168 | + std::hint::assert_unchecked(diff_i < SEQUENCES); |
| 169 | + if done[0][diff_i] != monky + 0 { |
| 170 | + let price = _mm256_extract_epi32::<0>(price); |
| 171 | + sequences[diff_i] += price; |
| 172 | + |
| 173 | + done[0][diff_i] = monky + 0; |
| 174 | + } |
| 175 | + let diff_i = _mm256_extract_epi32::<1>(diffs) as usize; |
| 176 | + std::hint::assert_unchecked(diff_i < SEQUENCES); |
| 177 | + if done[1][diff_i] != monky + 1 { |
| 178 | + let price = _mm256_extract_epi32::<1>(price); |
| 179 | + sequences[diff_i] += price; |
103 | 180 |
|
104 | | - if done[diffs as usize] != monky { |
105 | | - sequences[diffs as usize] += price as i16; |
| 181 | + done[1][diff_i] = monky + 1; |
| 182 | + } |
| 183 | + let diff_i = _mm256_extract_epi32::<2>(diffs) as usize; |
| 184 | + std::hint::assert_unchecked(diff_i < SEQUENCES); |
| 185 | + if done[2][diff_i] != monky + 2 { |
| 186 | + let price = _mm256_extract_epi32::<2>(price); |
| 187 | + sequences[diff_i] += price; |
| 188 | + |
| 189 | + done[2][diff_i] = monky + 2; |
| 190 | + } |
| 191 | + let diff_i = _mm256_extract_epi32::<3>(diffs) as usize; |
| 192 | + std::hint::assert_unchecked(diff_i < SEQUENCES); |
| 193 | + if done[3][diff_i] != monky + 3 { |
| 194 | + let price = _mm256_extract_epi32::<3>(price); |
| 195 | + sequences[diff_i] += price; |
| 196 | + |
| 197 | + done[3][diff_i] = monky + 3; |
| 198 | + } |
| 199 | + let diff_i = _mm256_extract_epi32::<4>(diffs) as usize; |
| 200 | + std::hint::assert_unchecked(diff_i < SEQUENCES); |
| 201 | + if done[4][diff_i] != monky + 4 { |
| 202 | + let price = _mm256_extract_epi32::<4>(price); |
| 203 | + sequences[diff_i] += price; |
| 204 | + |
| 205 | + done[4][diff_i] = monky + 4; |
| 206 | + } |
| 207 | + let diff_i = _mm256_extract_epi32::<5>(diffs) as usize; |
| 208 | + std::hint::assert_unchecked(diff_i < SEQUENCES); |
| 209 | + if done[5][diff_i] != monky + 5 { |
| 210 | + let price = _mm256_extract_epi32::<5>(price); |
| 211 | + sequences[diff_i] += price; |
| 212 | + |
| 213 | + done[5][diff_i] = monky + 5; |
| 214 | + } |
| 215 | + let diff_i = _mm256_extract_epi32::<6>(diffs) as usize; |
| 216 | + std::hint::assert_unchecked(diff_i < SEQUENCES); |
| 217 | + if done[6][diff_i] != monky + 6 { |
| 218 | + let price = _mm256_extract_epi32::<6>(price); |
| 219 | + sequences[diff_i] += price; |
| 220 | + |
| 221 | + done[6][diff_i] = monky + 6; |
| 222 | + } |
| 223 | + let diff_i = _mm256_extract_epi32::<7>(diffs) as usize; |
| 224 | + std::hint::assert_unchecked(diff_i < SEQUENCES); |
| 225 | + if done[7][diff_i] != monky + 7 { |
| 226 | + let price = _mm256_extract_epi32::<7>(price); |
| 227 | + sequences[diff_i] += price; |
106 | 228 |
|
107 | | - done[diffs as usize] = monky; |
| 229 | + done[7][diff_i] = monky + 7; |
108 | 230 | } |
109 | 231 |
|
110 | 232 | prev = price; |
111 | 233 | } |
112 | | - monky += 1; |
| 234 | + monky += 8; |
113 | 235 | } |
114 | 236 |
|
115 | 237 | sequences.into_iter().max().unwrap_unchecked() |
|
0 commit comments