Skip to content

Commit c4621c1

Browse files
committed
Revert "No inline asm"
This reverts commit 142abbb.
1 parent 142abbb commit c4621c1

File tree

1 file changed

+94
-87
lines changed

1 file changed

+94
-87
lines changed

src/day21.rs

Lines changed: 94 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -234,99 +234,106 @@ static LUT_P1: [u64; 10usize.pow(3)] = const {
234234
static LUT_P2: [u64; 10usize.pow(3)] =
235235
unsafe { transmute(*include_bytes!(concat!(env!("OUT_DIR"), "/day21.bin"))) };
236236

237-
use std::arch::x86_64::*;
238-
239-
// 029An980An179An4 56An379A
240-
241-
// 029An980An179An4 56An379A
242-
243-
// 029A n980 An17 9An4 56An 379A
244-
/// AVX2 kernel shared by `part1` and `part2` before this commit (shown here on the removed,
/// `-` side of the diff).
///
/// Loads 32 bytes from the start of `s`, turns the byte triples at offsets 0..3, 5..8, 10..13,
/// 15..18 and 20..23 into five 3-digit decimal numbers, and returns the sum of the five `lut`
/// entries selected by those numbers.
///
/// # Safety
///
/// * The CPU must support every feature named in the `target_feature` attribute.
/// * The load is a fixed 32-byte `read_unaligned` from `s.as_ptr()`; `s.len()` is never checked,
///   so the caller must guarantee that 32 bytes are readable there.
/// * `lut` is indexed with `get_unchecked`. NOTE(review): this presumably relies on the input
///   being five `ddd?` codes (e.g. "029A") so that every index is below 1000 — confirm against
///   the puzzle input format.
#[target_feature(enable = "avx2,bmi1,bmi2,cmpxchg16b,lzcnt,movbe,popcnt")]
245-
unsafe fn inner(s: &str, lut: &[u64; 1000]) -> u64 {
246-
// 029An980An179An456An379A
// Unaligned 32-byte load straight from the start of the string.
247-
let v: __m256i = s.as_ptr().cast::<__m256i>().read_unaligned();
// Saturating subtract of ASCII '0': digit bytes become 0..=9, anything below '0' clamps to 0.
248-
let v = _mm256_subs_epu8(v, _mm256_set1_epi8(b'0' as i8));
249-
250-
// 029A n980 0000 0000 An17 9An4 56An 379A
// Move input dwords 2..=5 into the upper 128-bit lane so that each code's digits sit in a
// single lane; the byte shuffle below cannot cross lanes. Dwords 2 and 3 of the result are
// copies of input dword 6 and are multiplied by zero later.
251-
let v = _mm256_permutevar8x32_epi32(v, _mm256_setr_epi32(0, 1, 6, 6, 2, 3, 4, 5));
252-
253-
// _029_980_________ _179_456____ _379
// Per-lane byte selection. Index 15 is used as a filler slot; its value is irrelevant because
// the matching multiplier in `mul` below is 0.
254-
let idx = _mm256_setr_epi8(
255-
15, 0, 1, 2, 15, 5, 6, 7, //
256-
15, 15, 15, 15, 15, 15, 15, 15, //
257-
15, 2, 3, 4, 15, 7, 8, 9, //
258-
15, 15, 15, 15, 15, 12, 13, 14,
259-
);
260-
let v = _mm256_shuffle_epi8(v, idx);
261-
262-
// 0, 29, 9, 80, 0, 0, 0, 0, 1, 79, 4, 56, 0, 0, 3, 79
// Byte pairs -> 16-bit words: (filler*0 + hundreds*1) and (tens*10 + ones*1).
263-
let mul: __m256i = _mm256_setr_epi8(
264-
0, 1, 10, 1, 0, 1, 10, 1, //
265-
0, 0, 0, 0, 0, 0, 0, 0, //
266-
0, 1, 10, 1, 0, 1, 10, 1, //
267-
0, 0, 0, 0, 0, 1, 10, 1, //
268-
);
269-
let v = _mm256_maddubs_epi16(v, mul);
270-
271-
// 29, 980, 0, 0, 179, 456, 0, 379
// Word pairs -> 32-bit dwords: hundreds*100 + (tens*10 + ones).
272-
let mul: __m256i = _mm256_setr_epi16(
273-
100, 1, 100, 1, //
274-
0, 0, 0, 0, //
275-
100, 1, 100, 1, //
276-
0, 0, 100, 1,
277-
);
278-
let v = _mm256_madd_epi16(v, mul);
// The five populated dwords are 0, 1, 4, 5 and 7; each one is a table index.
279-
*lut.get_unchecked(_mm256_extract_epi32::<0>(v) as u32 as usize)
280-
+ *lut.get_unchecked(_mm256_extract_epi32::<1>(v) as u32 as usize)
281-
+ *lut.get_unchecked(_mm256_extract_epi32::<4>(v) as u32 as usize)
282-
+ *lut.get_unchecked(_mm256_extract_epi32::<5>(v) as u32 as usize)
283-
+ *lut.get_unchecked(_mm256_extract_epi32::<7>(v) as u32 as usize)
// Everything below is commented-out code: an abandoned gather-based variant followed by a
// scalar loop over the same five codes.
284-
// let com_l = _mm256_i32gather_epi32::<8>(lut.as_ptr().cast(), v);
285-
// let com_h = _mm256_i32gather_epi32::<8>(lut.as_ptr().cast::<i32>().offset(1), v);
286-
287-
// let h1 = _mm256_unpacklo_epi32(com_l, com_h);
288-
// let h2 = _mm256_unpackhi_epi32(com_l, com_h);
289-
290-
// let sum = 0;
291-
// let v = _mm256_add_epi64(h1, h2);
292-
// let vs = _mm256_shuffle_epi32::<{ (1 << 6) | (0 << 4) | (3 << 2) | 2 }>(v);
293-
// let v = _mm256_add_epi64(v, vs);
294-
295-
// _mm256_extract_epi64::<0>(v) as u64 + _mm256_extract_epi64::<2>(v) as u64
296-
// let mut p = [0u32; 256 / 32];
297-
// p.as_mut_ptr().cast::<__m256i>().write(v);
298-
// println!("{:?}", &p);
299-
300-
// panic!()
301-
302-
// let v = _mm256_shuffle_epi8(v, )
303-
304-
// let s = s.as_bytes();
305-
// let mut sum = 0;
306-
// unsafe {
307-
// let mut i = 0;
308-
// for _ in 0..5 {
309-
// let idx = (*s.get_unchecked(i + 0) as usize) * 100
310-
// + (*s.get_unchecked(i + 1) as usize) * 10
311-
// + (*s.get_unchecked(i + 2) as usize) * 1
312-
// - (b'0' as usize * 111);
313-
314-
// sum += lut[idx];
315-
// i += 5;
316-
// }
317-
// }
318-
319-
// sum
320-
}
321-
322237
/// Part 1 entry point: returns the sum of five `LUT_P1` entries, indexed by the five 3-digit
/// numbers found at byte offsets 0, 5, 10, 15 and 20 of `s`.
///
/// In this diff the single `-` line (the call to `inner`) is replaced by the `+` lines: the same
/// computation written as one hand-scheduled x86-64 `asm!` block.
///
/// NOTE(review): the asm loads 32 bytes from `s.as_ptr()` without consulting `s.len()`, and the
/// computed indices are used against `LUT_P1` without a bounds check. This presumably relies on
/// the input being five `ddd?` codes with 32 readable bytes behind the pointer — confirm.
/// NOTE(review): the instructions need AVX2, but no `target_feature`/`cfg` guard is visible on
/// this function — confirm the build enables it.
#[aoc(day21, part1)]
323238
pub fn part1(s: &str) -> u64 {
324-
unsafe { inner(s, &LUT_P1) }
// `vpshufb` byte-selection mask (per 128-bit lane). It picks the three digit bytes of each
// code after the `vpermq` reordering below; the repeated 3 / 7 / 0 entries are filler slots
// whose multiplier in LCPI0_2 is 0.
239+
static LCPI0_0: [u8; 32] = [
240+
3, 8, 9, 10, 3, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 7, 10, 11, 12, 7, 15, 0, 1, 0, 0, 0, 0,
241+
7, 4, 5, 6,
242+
];
// Subtrahend for `vpsubusb`: 48 is ASCII '0', applied only to the groups that feed a result.
243+
static LCPI0_1: [u8; 32] = [
244+
48, 48, 48, 48, 48, 48, 48, 48, 0, 0, 0, 0, 0, 0, 0, 0, 48, 48, 48, 48, 48, 48, 48, 48, 0,
245+
0, 0, 0, 48, 48, 48, 48,
246+
];
// Byte weights for `vpmaddubsw`: (filler*0 + hundreds*1) and (tens*10 + ones*1) per word pair.
247+
static LCPI0_2: [u8; 32] = [
248+
0, 1, 10, 1, 0, 1, 10, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 10, 1, 0, 1, 10, 1, 0, 0, 0, 0, 0,
249+
1, 10, 1,
250+
];
// Word weights for `vpmaddwd`: hundreds*100 + (tens*10 + ones) per 32-bit element.
251+
static LCPI0_3: [u16; 16] = [100, 1, 100, 1, 0, 0, 0, 0, 100, 1, 100, 1, 0, 0, 100, 1];
252+
253+
let r: u64;
254+
unsafe {
// Instruction walk-through:
//   vpermq ..., 99      load 32 bytes from `s` with the 64-bit quarters reordered to
//                       (3, 0, 2, 1), so the digits of each code sit in one 128-bit lane
//   vpshufb/vpsubusb    select the digit bytes, then saturating-subtract '0'
//   vpmaddubsw/vpmaddwd fold each digit triple into a 32-bit number
//   vmovd/vpextrd       pull out elements 0 and 1 of the low lane, then (after
//                       vextracti128) elements 0, 1 and 3 of the high lane
//   mov/add             accumulate the five 8-byte table entries in `a`
// Writing the 32-bit `{..:e}` register name zero-extends into the 64-bit register that is
// then used as the scaled index.
255+
std::arch::asm!(
256+
"vpermq {ymm:y}, ymmword ptr [{s}], 99",
257+
"vpshufb {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_0}]",
258+
"vpsubusb {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_1}]",
259+
"vpmaddubsw {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_2}]",
260+
"vpmaddwd {ymm:y}, {ymm:y}, ymmword ptr [rip + {LCPI0_3}]",
261+
"vmovd {c:e}, {ymm:x}",
262+
"vpextrd {a:e}, {ymm:x}, 1",
263+
"mov {a:r}, qword ptr [{lut} + 8*{a:r}]",
264+
"add {a:r}, qword ptr [{lut} + 8*{c:r}]",
265+
"vextracti128 {ymm:x}, {ymm:y}, 1",
266+
"vmovd {c:e}, {ymm:x}",
267+
"add {a:r}, qword ptr [{lut} + 8*{c:r}]",
268+
"vpextrd {c:e}, {ymm:x}, 1",
269+
"add {a:r}, qword ptr [{lut} + 8*{c:r}]",
270+
"vpextrd {c:e}, {ymm:x}, 3",
271+
"add {a:r}, qword ptr [{lut} + 8*{c:r}]",
272+
273+
LCPI0_0 = sym LCPI0_0,
274+
LCPI0_1 = sym LCPI0_1,
275+
LCPI0_2 = sym LCPI0_2,
276+
LCPI0_3 = sym LCPI0_3,
277+
s = in(reg) s.as_ptr(),
278+
lut = in(reg) LUT_P1.as_ptr(),
// `ymm` and `c` are scratch registers (outputs discarded); `a` carries the running sum out.
279+
ymm = out(ymm_reg) _,
280+
c = out(reg) _,
281+
a = out(reg) r,
// `nostack`: the block does not touch the stack. It does read memory, so `nomem` is
// deliberately not specified.
282+
options(nostack)
283+
);
284+
}
285+
r
325286
}
326287

327288
/// Part 2 entry point: returns the sum of five `LUT_P2` entries, indexed by the five 3-digit
/// numbers found at byte offsets 0, 5, 10, 15 and 20 of `s`.
///
/// In this diff the single `-` line (the call to `inner`) is replaced by the `+` lines: an
/// x86-64 `asm!` block running the same instruction sequence as `part1`, but against `LUT_P2`,
/// with operands named `r`/`t` and an unmodified `{ymm}` placeholder where `part1` writes
/// `{ymm:y}`.
///
/// NOTE(review): the asm loads 32 bytes from `s.as_ptr()` without consulting `s.len()`, and the
/// computed indices are used against `LUT_P2` without a bounds check. This presumably relies on
/// the input being five `ddd?` codes with 32 readable bytes behind the pointer — confirm.
/// NOTE(review): the instructions need AVX2, but no `target_feature`/`cfg` guard is visible on
/// this function — confirm the build enables it.
#[aoc(day21, part2)]
328289
pub fn part2(s: &str) -> u64 {
329-
unsafe { inner(s, &LUT_P2) }
// `vpshufb` byte-selection mask (per 128-bit lane). It picks the three digit bytes of each
// code after the `vpermq` reordering below; the repeated 3 / 7 / 0 entries are filler slots
// whose multiplier in LCPI0_2 is 0.
290+
static LCPI0_0: [u8; 32] = [
291+
3, 8, 9, 10, 3, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 7, 10, 11, 12, 7, 15, 0, 1, 0, 0, 0, 0,
292+
7, 4, 5, 6,
293+
];
// Subtrahend for `vpsubusb`: 48 is ASCII '0', applied only to the groups that feed a result.
294+
static LCPI0_1: [u8; 32] = [
295+
48, 48, 48, 48, 48, 48, 48, 48, 0, 0, 0, 0, 0, 0, 0, 0, 48, 48, 48, 48, 48, 48, 48, 48, 0,
296+
0, 0, 0, 48, 48, 48, 48,
297+
];
// Byte weights for `vpmaddubsw`: (filler*0 + hundreds*1) and (tens*10 + ones*1) per word pair.
298+
static LCPI0_2: [u8; 32] = [
299+
0, 1, 10, 1, 0, 1, 10, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 10, 1, 0, 1, 10, 1, 0, 0, 0, 0, 0,
300+
1, 10, 1,
301+
];
// Word weights for `vpmaddwd`: hundreds*100 + (tens*10 + ones) per 32-bit element.
302+
static LCPI0_3: [u16; 16] = [100, 1, 100, 1, 0, 0, 0, 0, 100, 1, 100, 1, 0, 0, 100, 1];
303+
304+
let r: u64;
305+
unsafe {
// Instruction walk-through:
//   vpermq ..., 99      load 32 bytes from `s` with the 64-bit quarters reordered to
//                       (3, 0, 2, 1), so the digits of each code sit in one 128-bit lane
//   vpshufb/vpsubusb    select the digit bytes, then saturating-subtract '0'
//   vpmaddubsw/vpmaddwd fold each digit triple into a 32-bit number
//   vmovd/vpextrd       pull out elements 0 and 1 of the low lane, then (after
//                       vextracti128) elements 0, 1 and 3 of the high lane
//   mov/add             accumulate the five 8-byte table entries in `r`
// Writing the 32-bit `{..:e}` register name zero-extends into the 64-bit register that is
// then used as the scaled index.
306+
std::arch::asm!(
307+
"vpermq {ymm}, ymmword ptr [{s}], 99",
308+
"vpshufb {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_0}]",
309+
"vpsubusb {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_1}]",
310+
"vpmaddubsw {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_2}]",
311+
"vpmaddwd {ymm}, {ymm}, ymmword ptr [rip + {LCPI0_3}]",
312+
"vmovd {t:e}, {ymm:x}",
313+
"vpextrd {r:e}, {ymm:x}, 1",
314+
"mov {r:r}, qword ptr [{lut} + 8*{r:r}]",
315+
"add {r:r}, qword ptr [{lut} + 8*{t:r}]",
316+
"vextracti128 {ymm:x}, {ymm}, 1",
317+
"vmovd {t:e}, {ymm:x}",
318+
"add {r:r}, qword ptr [{lut} + 8*{t:r}]",
319+
"vpextrd {t:e}, {ymm:x}, 1",
320+
"add {r:r}, qword ptr [{lut} + 8*{t:r}]",
321+
"vpextrd {t:e}, {ymm:x}, 3",
322+
"add {r:r}, qword ptr [{lut} + 8*{t:r}]",
323+
324+
LCPI0_0 = sym LCPI0_0,
325+
LCPI0_1 = sym LCPI0_1,
326+
LCPI0_2 = sym LCPI0_2,
327+
LCPI0_3 = sym LCPI0_3,
328+
s = in(reg) s.as_ptr(),
329+
lut = in(reg) LUT_P2.as_ptr(),
// `r` carries the running sum out; `ymm` and `t` are scratch registers (outputs discarded).
330+
r = out(reg) r,
331+
ymm = out(ymm_reg) _,
332+
t = out(reg) _,
// `nostack`: the block does not touch the stack. It does read memory, so `nomem` is
// deliberately not specified.
333+
options(nostack)
334+
);
335+
}
336+
r
330337
}
331338

332339
#[cfg(test)]

0 commit comments

Comments
 (0)