Skip to content

Commit 142abbb

Browse files
committed
No inline asm
1 parent fbea5fc commit 142abbb

File tree

1 file changed

+87
-94
lines changed

1 file changed

+87
-94
lines changed

src/day21.rs

Lines changed: 87 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -234,106 +234,99 @@ static LUT_P1: [u64; 10usize.pow(3)] = const {
234234
static LUT_P2: [u64; 10usize.pow(3)] =
235235
unsafe { transmute(*include_bytes!(concat!(env!("OUT_DIR"), "/day21.bin"))) };
236236

237+
use std::arch::x86_64::*;
238+
239+
// 029An980An179An4 56An379A
240+
241+
// 029An980An179An4 56An379A
242+
243+
// 029A n980 An17 9An4 56An 379A
244+
/// Decodes five 3-digit numbers from the 32 bytes at `s` using AVX2 intrinsics and returns the
/// sum of the five `lut` entries they select.
///
/// The digit offsets consumed below (0..3, 5..8, 10..13, 15..18, 20..23) correspond to five
/// 5-byte records, e.g. `029A\n980A\n179A\n456A\n379A`.
///
/// # Safety
///
/// * The running CPU must support every feature named in the `#[target_feature]` attribute.
/// * 32 bytes starting at `s.as_ptr()` must be readable: the load below is a full 256-bit read
///   and never consults `s.len()`.
/// * The 15 digit positions are expected to hold ASCII `0`..=`9`. A larger byte can yield an
///   index of 1000 or more, and the `get_unchecked` lookups would then read past `lut`.
#[target_feature(enable = "avx2,bmi1,bmi2,cmpxchg16b,lzcnt,movbe,popcnt")]
245+
unsafe fn inner(s: &str, lut: &[u64; 1000]) -> u64 {
246+
// Sample input used in the layout diagrams below (`n` presumably marks the newline between
// records):
// 029An980An179An456An379A
247+
let v: __m256i = s.as_ptr().cast::<__m256i>().read_unaligned();
248+
// Saturating subtract of b'0': digits become 0..=9, bytes below b'0' clamp to 0.
let v = _mm256_subs_epu8(v, _mm256_set1_epi8(b'0' as i8));
249+
250+
// Reorder the 32-bit lanes to source dwords 0,1,6,6 | 2,3,4,5 so that every 3-digit group sits
// entirely inside one 128-bit half (the byte shuffle below works per half). The two copies of
// dword 6, drawn as `0000 0000`, are filler: they only ever meet zero multipliers.
// 029A n980 0000 0000 An17 9An4 56An 379A
251+
let v = _mm256_permutevar8x32_epi32(v, _mm256_setr_epi32(0, 1, 6, 6, 2, 3, 4, 5));
252+
253+
// Move each group into a 4-byte slot laid out as [filler, hundreds, tens, ones]. Index 15 is
// the don't-care filler (`_`); every position that uses it has multiplier 0 in `mul` below.
// _029_980_________ _179_456____ _379
254+
let idx = _mm256_setr_epi8(
255+
15, 0, 1, 2, 15, 5, 6, 7, //
256+
15, 15, 15, 15, 15, 15, 15, 15, //
257+
15, 2, 3, 4, 15, 7, 8, 9, //
258+
15, 15, 15, 15, 15, 12, 13, 14,
259+
);
260+
let v = _mm256_shuffle_epi8(v, idx);
261+
262+
// Combine byte pairs into 16-bit values: (filler*0 + hundreds*1), (tens*10 + ones*1).
// 0, 29, 9, 80, 0, 0, 0, 0, 1, 79, 4, 56, 0, 0, 3, 79
263+
let mul: __m256i = _mm256_setr_epi8(
264+
0, 1, 10, 1, 0, 1, 10, 1, //
265+
0, 0, 0, 0, 0, 0, 0, 0, //
266+
0, 1, 10, 1, 0, 1, 10, 1, //
267+
0, 0, 0, 0, 0, 1, 10, 1, //
268+
);
269+
let v = _mm256_maddubs_epi16(v, mul);
270+
271+
// Combine 16-bit pairs into 32-bit values: hundreds*100 + (tens*10 + ones).
// 29, 980, 0, 0, 179, 456, 0, 379
272+
let mul: __m256i = _mm256_setr_epi16(
273+
100, 1, 100, 1, //
274+
0, 0, 0, 0, //
275+
100, 1, 100, 1, //
276+
0, 0, 100, 1,
277+
);
278+
let v = _mm256_madd_epi16(v, mul);
279+
// The five decoded numbers live in 32-bit lanes 0, 1, 4, 5 and 7; lanes 2, 3 and 6 are the
// zeroed filler. Each number indexes `lut` without a bounds check (see `# Safety`).
*lut.get_unchecked(_mm256_extract_epi32::<0>(v) as u32 as usize)
280+
+ *lut.get_unchecked(_mm256_extract_epi32::<1>(v) as u32 as usize)
281+
+ *lut.get_unchecked(_mm256_extract_epi32::<4>(v) as u32 as usize)
282+
+ *lut.get_unchecked(_mm256_extract_epi32::<5>(v) as u32 as usize)
283+
+ *lut.get_unchecked(_mm256_extract_epi32::<7>(v) as u32 as usize)
284+
// Everything from here to the closing brace is disabled experimentation kept as comments:
// a gather-based lookup, a debug dump of the vector, and a scalar loop over the five records.
// let com_l = _mm256_i32gather_epi32::<8>(lut.as_ptr().cast(), v);
285+
// let com_h = _mm256_i32gather_epi32::<8>(lut.as_ptr().cast::<i32>().offset(1), v);
286+
287+
// let h1 = _mm256_unpacklo_epi32(com_l, com_h);
288+
// let h2 = _mm256_unpackhi_epi32(com_l, com_h);
289+
290+
// let sum = 0;
291+
// let v = _mm256_add_epi64(h1, h2);
292+
// let vs = _mm256_shuffle_epi32::<{ (1 << 6) | (0 << 4) | (3 << 2) | 2 }>(v);
293+
// let v = _mm256_add_epi64(v, vs);
294+
295+
// _mm256_extract_epi64::<0>(v) as u64 + _mm256_extract_epi64::<2>(v) as u64
296+
// let mut p = [0u32; 256 / 32];
297+
// p.as_mut_ptr().cast::<__m256i>().write(v);
298+
// println!("{:?}", &p);
299+
300+
// panic!()
301+
302+
// let v = _mm256_shuffle_epi8(v, )
303+
304+
// let s = s.as_bytes();
305+
// let mut sum = 0;
306+
// unsafe {
307+
// let mut i = 0;
308+
// for _ in 0..5 {
309+
// let idx = (*s.get_unchecked(i + 0) as usize) * 100
310+
// + (*s.get_unchecked(i + 1) as usize) * 10
311+
// + (*s.get_unchecked(i + 2) as usize) * 1
312+
// - (b'0' as usize * 111);
313+
314+
// sum += lut[idx];
315+
// i += 5;
316+
// }
317+
// }
318+
319+
// sum
320+
}
321+
237322
/// Part 1 entry point (registered through the `#[aoc(day21, part1)]` attribute).
///
/// This span is a rendered diff: lines following an `N-` gutter are the old inline-asm body that
/// the commit deletes, and the line following `324+` is the replacement, which forwards `s` and
/// `LUT_P1` to `inner`.
#[aoc(day21, part1)]
238323
pub fn part1(s: &str) -> u64 {
239-
// Byte-shuffle mask consumed by `vpshufb` in the asm below.
static LCPI0_0: [u8; 32] = [
240-
3, 8, 9, 10, 3, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 7, 10, 11, 12, 7, 15, 0, 1, 0, 0, 0, 0,
241-
7, 4, 5, 6,
242-
];
243-
// Subtrahend for `vpsubusb`: 48 is b'0', applied only to the positions that are kept.
static LCPI0_1: [u8; 32] = [
244-
48, 48, 48, 48, 48, 48, 48, 48, 0, 0, 0, 0, 0, 0, 0, 0, 48, 48, 48, 48, 48, 48, 48, 48, 0,
245-
0, 0, 0, 48, 48, 48, 48,
246-
];
247-
// Byte weights for `vpmaddubsw` (tens * 10 + ones; filler bytes weighted 0).
static LCPI0_2: [u8; 32] = [
248-
0, 1, 10, 1, 0, 1, 10, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 10, 1, 0, 1, 10, 1, 0, 0, 0, 0, 0,
249-
1, 10, 1,
250-
];
251-
// 16-bit weights for `vpmaddwd` (hundreds * 100 + remainder).
static LCPI0_3: [u16; 16] = [100, 1, 100, 1, 0, 0, 0, 0, 100, 1, 100, 1, 0, 0, 100, 1];
252-
253-
let r: u64;
254-
unsafe {
255-
std::arch::asm!(
256-
"vpermq {ymm:y}, ymmword ptr [{s}], 99",
257-
"vpshufb {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_0}]",
258-
"vpsubusb {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_1}]",
259-
"vpmaddubsw {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_2}]",
260-
"vpmaddwd {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_3}]",
261-
"vmovd {c:e}, {ymm:x}",
262-
"vpextrd {a:e}, {ymm:x}, 1",
263-
"mov {a:r}, qword ptr [{lut} + 8*{a:r}]",
264-
"add {a:r}, qword ptr [{lut} + 8*{c:r}]",
265-
"vextracti128 {ymm:x}, {ymm:y}, 1",
266-
"vmovd {c:e}, {ymm:x}",
267-
"add {a:r}, qword ptr [{lut} + 8*{c:r}]",
268-
"vpextrd {c:e}, {ymm:x}, 1",
269-
"add {a:r}, qword ptr [{lut} + 8*{c:r}]",
270-
"vpextrd {c:e}, {ymm:x}, 3",
271-
"add {a:r}, qword ptr [{lut} + 8*{c:r}]",
272-
273-
LCPI0_0 = sym LCPI0_0,
274-
LCPI0_1 = sym LCPI0_1,
275-
LCPI0_2 = sym LCPI0_2,
276-
LCPI0_3 = sym LCPI0_3,
277-
s = in(reg) s.as_ptr(),
278-
lut = in(reg) LUT_P1.as_ptr(),
279-
ymm = out(ymm_reg) _,
280-
c = out(reg) _,
281-
a = out(reg) r,
282-
options(nostack)
283-
);
284-
}
285-
r
324+
// NOTE(review): nothing here checks the preconditions of the unsafe call (the CPU features in
// `inner`'s `#[target_feature]`, and 32 readable bytes at `s`); presumably the puzzle input and
// build target guarantee them - confirm.
unsafe { inner(s, &LUT_P1) }
286325
}
287326

288327
/// Part 2 entry point (registered through the `#[aoc(day21, part2)]` attribute).
///
/// This span is a rendered diff: lines following an `N-` gutter are the old inline-asm body that
/// the commit deletes, and the line following `329+` is the replacement, which forwards `s` and
/// `LUT_P2` to `inner`.
#[aoc(day21, part2)]
289328
pub fn part2(s: &str) -> u64 {
290-
// Byte-shuffle mask consumed by `vpshufb` in the asm below.
static LCPI0_0: [u8; 32] = [
291-
3, 8, 9, 10, 3, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 7, 10, 11, 12, 7, 15, 0, 1, 0, 0, 0, 0,
292-
7, 4, 5, 6,
293-
];
294-
// Subtrahend for `vpsubusb`: 48 is b'0', applied only to the positions that are kept.
static LCPI0_1: [u8; 32] = [
295-
48, 48, 48, 48, 48, 48, 48, 48, 0, 0, 0, 0, 0, 0, 0, 0, 48, 48, 48, 48, 48, 48, 48, 48, 0,
296-
0, 0, 0, 48, 48, 48, 48,
297-
];
298-
// Byte weights for `vpmaddubsw` (tens * 10 + ones; filler bytes weighted 0).
static LCPI0_2: [u8; 32] = [
299-
0, 1, 10, 1, 0, 1, 10, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 10, 1, 0, 1, 10, 1, 0, 0, 0, 0, 0,
300-
1, 10, 1,
301-
];
302-
// 16-bit weights for `vpmaddwd` (hundreds * 100 + remainder).
static LCPI0_3: [u16; 16] = [100, 1, 100, 1, 0, 0, 0, 0, 100, 1, 100, 1, 0, 0, 100, 1];
303-
304-
let r: u64;
305-
unsafe {
306-
std::arch::asm!(
307-
"vpermq {ymm}, ymmword ptr [{s}], 99",
308-
"vpshufb {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_0}]",
309-
"vpsubusb {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_1}]",
310-
"vpmaddubsw {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_2}]",
311-
"vpmaddwd {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_3}]",
312-
"vmovd {t:e}, {ymm:x}",
313-
"vpextrd {r:e}, {ymm:x}, 1",
314-
"mov {r:r}, qword ptr [{lut} + 8*{r:r}]",
315-
"add {r:r}, qword ptr [{lut} + 8*{t:r}]",
316-
"vextracti128 {ymm:x}, {ymm}, 1",
317-
"vmovd {t:e}, {ymm:x}",
318-
"add {r:r}, qword ptr [{lut} + 8*{t:r}]",
319-
"vpextrd {t:e}, {ymm:x}, 1",
320-
"add {r:r}, qword ptr [{lut} + 8*{t:r}]",
321-
"vpextrd {t:e}, {ymm:x}, 3",
322-
"add {r:r}, qword ptr [{lut} + 8*{t:r}]",
323-
324-
LCPI0_0 = sym LCPI0_0,
325-
LCPI0_1 = sym LCPI0_1,
326-
LCPI0_2 = sym LCPI0_2,
327-
LCPI0_3 = sym LCPI0_3,
328-
s = in(reg) s.as_ptr(),
329-
lut = in(reg) LUT_P2.as_ptr(),
330-
r = out(reg) r,
331-
ymm = out(ymm_reg) _,
332-
t = out(reg) _,
333-
options(nostack)
334-
);
335-
}
336-
r
329+
// NOTE(review): nothing here checks the preconditions of the unsafe call (the CPU features in
// `inner`'s `#[target_feature]`, and 32 readable bytes at `s`); presumably the puzzle input and
// build target guarantee them - confirm.
unsafe { inner(s, &LUT_P2) }
337330
}
338331

339332
#[cfg(test)]

0 commit comments

Comments
 (0)