1- use std:: { arch:: x86_64:: * , mem :: MaybeUninit } ;
1+ use std:: arch:: x86_64:: * ;
22
33use aoc_runner_derive:: aoc;
44
@@ -49,12 +49,39 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
4949 ) ;
5050
5151 if is_key {
52- for i in 0 ..holes_i {
53- let o = * holes. get_unchecked ( i) ;
54- let collisions = _mm256_cmpeq_epi8 ( d, o) ;
55- let collisions = _mm256_movemask_epi8 ( collisions) ;
56- sum += ( collisions == 0 ) as u64 ;
57- }
52+ std:: arch:: asm!( // For each of the `holes_i` 32-byte entries starting at `holes`, add 1 to `sum` when no byte of the entry equals the same-position byte of `d`
53+ "test {i}, {i}" , // Sets ZF when the entry count is zero
54+ "je 2f" , // Count is zero: nothing to compare, skip to the end
55+ "cmp {i}, 1" , // Is there exactly one entry?
56+ "je 3f" , // Exactly one entry: go straight to the single-entry tail
57+ "shl {i}, 5" , // Turn the entry count into a byte length (32 bytes per entry)
58+ "4:" , // Main loop: two entries per iteration, walking from the last entry towards the first
59+ "add {i}, -64" , // Step back by two entries (64 bytes)
60+ "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 32]" , // Byte-wise equality of d with the higher-addressed entry of the pair
61+ "vpmovmskb {t}, {vt}" , // One mask bit per byte; a zero mask means no byte matched
62+ "cmp {t}, 1" , // Unsigned borrow (CF) is set only when the mask is zero
63+ "adc {sum}, 0" , // sum += CF, i.e. count the entry when nothing matched
64+ "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i}]" , // Same test for the lower-addressed entry of the pair
65+ "vpmovmskb {t}, {vt}" ,
66+ "cmp {t}, 1" ,
67+ "adc {sum}, 0" ,
68+ "cmp {i}, 32" , // Compare the remaining byte length against one entry
69+ "jg 4b" , // More than one entry left: run the pair loop again
70+ "jne 2f" , // Not exactly one entry left; the offset is a multiple of 32 here, so none remain: done
71+ "3:" , // Tail: exactly one entry left, located at offset 0
72+ "vpcmpeqb {vt}, {d}, ymmword ptr [{os}]" ,
73+ "vpmovmskb {t}, {vt}" ,
74+ "cmp {t}, 1" ,
75+ "adc {sum}, 0" ,
76+ "2:" , // End of the sequence
77+ os = in( reg) holes, // Base address the entries are loaded from (assumes `holes` is a pointer-like value; its declaration is not visible here)
78+ d = in( ymm_reg) d, // Vector compared against every entry
79+ i = inout( reg) holes_i => _, // Entry count on entry; reused as a byte offset and discarded afterwards
80+ sum = inout( reg) sum, // Running count, updated in place
81+ t = out( reg) _, // Scratch register for the byte mask
82+ vt = out( ymm_reg) _, // Scratch vector for the comparison result
83+ options( nostack) , // The sequence does not push to the stack
84+ ) ;
5885 let d = _mm256_and_si256 ( d, _mm256_set1_epi8 ( KEPT_BITS ) ) ;
5986 let d = _mm256_or_si256 (
6087 d,
@@ -69,12 +96,39 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
6996 * keys. get_unchecked_mut ( keys_i) = d;
7097 keys_i += 1 ;
7198 } else {
72- for i in 0 ..keys_i {
73- let o = * keys. get_unchecked ( i) ;
74- let collisions = _mm256_cmpeq_epi8 ( d, o) ;
75- let collisions = _mm256_movemask_epi8 ( collisions) ;
76- sum += ( collisions == 0 ) as u64 ;
77- }
99+ std:: arch:: asm!( // For each of the `keys_i` 32-byte entries starting at `keys`, add 1 to `sum` when no byte of the entry equals the same-position byte of `d`
100+ "test {i}, {i}" , // Sets ZF when the entry count is zero
101+ "je 2f" , // Count is zero: nothing to compare, skip to the end
102+ "cmp {i}, 1" , // Is there exactly one entry?
103+ "je 3f" , // Exactly one entry: go straight to the single-entry tail
104+ "shl {i}, 5" , // Turn the entry count into a byte length (32 bytes per entry)
105+ "4:" , // Main loop: two entries per iteration, walking from the last entry towards the first
106+ "add {i}, -64" , // Step back by two entries (64 bytes)
107+ "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 32]" , // Byte-wise equality of d with the higher-addressed entry of the pair
108+ "vpmovmskb {t}, {vt}" , // One mask bit per byte; a zero mask means no byte matched
109+ "cmp {t}, 1" , // Unsigned borrow (CF) is set only when the mask is zero
110+ "adc {sum}, 0" , // sum += CF, i.e. count the entry when nothing matched
111+ "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i}]" , // Same test for the lower-addressed entry of the pair
112+ "vpmovmskb {t}, {vt}" ,
113+ "cmp {t}, 1" ,
114+ "adc {sum}, 0" ,
115+ "cmp {i}, 32" , // Compare the remaining byte length against one entry
116+ "jg 4b" , // More than one entry left: run the pair loop again
117+ "jne 2f" , // Not exactly one entry left; the offset is a multiple of 32 here, so none remain: done
118+ "3:" , // Tail: exactly one entry left, located at offset 0
119+ "vpcmpeqb {vt}, {d}, ymmword ptr [{os}]" ,
120+ "vpmovmskb {t}, {vt}" ,
121+ "cmp {t}, 1" ,
122+ "adc {sum}, 0" ,
123+ "2:" , // End of the sequence
124+ os = in( reg) keys, // Base address the entries are loaded from (assumes `keys` is a pointer-like value; its declaration is not visible here)
125+ d = in( ymm_reg) d, // Vector compared against every entry
126+ i = inout( reg) keys_i => _, // Entry count on entry; reused as a byte offset and discarded afterwards
127+ sum = inout( reg) sum, // Running count, updated in place
128+ t = out( reg) _, // Scratch register for the byte mask
129+ vt = out( ymm_reg) _, // Scratch vector for the comparison result
130+ options( nostack) , // The sequence does not push to the stack
131+ ) ;
78132 let d = _mm256_and_si256 ( d, _mm256_set1_epi8 ( KEPT_BITS ) ) ;
79133 let d = _mm256_or_si256 (
80134 d,
0 commit comments