@@ -8,207 +8,83 @@ pub fn part1(s: &str) -> u64 {
88 unsafe { part1_inner ( s) }
99}
1010
11- pub fn part2 ( _s : & str ) -> u64 {
12- // To be sure you know...
13- 42
14- }
15-
1611const DS : usize = 7 * 6 + 1 ;
1712
18- const ALL_BITS : i8 = 0b111 ;
19- const KEPT_BITS : i8 = 0b011 ;
20-
2113#[ inline( always) ]
2214unsafe fn part1_inner ( s : & [ u8 ] ) -> u64 {
2315 let mut sum = 0 ;
2416
25- static mut KEYS : [ __m256i ; 512 ] = unsafe { std:: mem:: transmute ( [ 0u8 ; 512 * 32 ] ) } ;
26- static mut HOLES : [ __m256i ; 512 ] = unsafe { std:: mem:: transmute ( [ 0u8 ; 512 * 32 ] ) } ;
27-
28- let keys = & mut * ( & raw mut KEYS ) ;
29- let holes = & mut * ( & raw mut HOLES ) ;
30-
31- let mut keys_i = 0 ;
32- let mut holes_i = 0 ;
17+ let mut keys = heapless:: Vec :: < u64 , 512 > :: new ( ) ;
18+ let mut holes = heapless:: Vec :: < u64 , 512 > :: new ( ) ;
3319
3420 let mut i = 0 ;
3521
3622 std:: hint:: assert_unchecked ( s. len ( ) > 0 ) ;
3723 while i < s. len ( ) {
3824 let is_key = * s. get_unchecked ( i) == b'.' ;
3925
40- let d = s
26+ let d = ( s
4127 . as_ptr ( )
4228 . offset ( i as isize + 6 )
43- . cast :: < __m256i > ( )
44- . read_unaligned ( ) ;
45- let d = _mm256_and_si256 (
46- d,
47- _mm256_setr_epi8 (
48- ALL_BITS , ALL_BITS , ALL_BITS , ALL_BITS , ALL_BITS , 0 , //
49- ALL_BITS , ALL_BITS , ALL_BITS , ALL_BITS , ALL_BITS , 0 , //
50- ALL_BITS , ALL_BITS , ALL_BITS , ALL_BITS , ALL_BITS , 0 , //
51- ALL_BITS , ALL_BITS , ALL_BITS , ALL_BITS , ALL_BITS , 0 , //
52- ALL_BITS , ALL_BITS , ALL_BITS , ALL_BITS , ALL_BITS , 0 , 0 , 0 ,
53- ) ,
54- ) ;
55-
56- if is_key {
57- std:: arch:: asm!(
58- "test {max_i}, {max_i}" ,
59- "je 2f" , // Jump on empty
60- "cmp {max_i}, 1" ,
61- "je 3f" , // Jump to one case
62- "mov {i}, {max_i}" ,
63- "shl {i}, 5" ,
64- "jmp 5f" ,
29+ . cast :: < u64 > ( )
30+ . read_unaligned ( )
31+ & 0x0101010101 )
32+ + ( s. as_ptr ( )
33+ . offset ( i as isize + 6 + 6 )
34+ . cast :: < u64 > ( )
35+ . read_unaligned ( )
36+ & 0x0101010101 )
37+ + ( s. as_ptr ( )
38+ . offset ( i as isize + 6 + 12 )
39+ . cast :: < u64 > ( )
40+ . read_unaligned ( )
41+ & 0x0101010101 )
42+ + ( s. as_ptr ( )
43+ . offset ( i as isize + 6 + 18 )
44+ . cast :: < u64 > ( )
45+ . read_unaligned ( )
46+ & 0x0101010101 )
47+ + ( s. as_ptr ( )
48+ . offset ( i as isize + 6 + 24 )
49+ . cast :: < u64 > ( )
50+ . read_unaligned ( )
51+ & 0x0101010101 ) ;
52+
53+ let other = if is_key { & holes } else { & keys } ;
54+ let mut j = other. len ( ) ;
55+ while j >= 4 {
56+ j -= 4 ;
57+ let o = other
58+ . as_ptr ( )
59+ . offset ( j as isize )
60+ . cast :: < __m256i > ( )
61+ . read_unaligned ( ) ;
62+ let s = _mm256_add_epi64 ( o, _mm256_set1_epi64x ( d as i64 ) ) ;
63+
64+ let s = _mm256_movemask_epi8 ( s) as u32 ;
65+
66+ sum += ( ( s & 0xFF_00_00_00 ) == 0 ) as u64 ;
67+ sum += ( ( s & 0x00_FF_00_00 ) == 0 ) as u64 ;
68+ sum += ( ( s & 0x00_00_FF_00 ) == 0 ) as u64 ;
69+ sum += ( ( s & 0x00_00_00_FF ) == 0 ) as u64 ;
70+ }
71+ if j > 0 {
72+ let o = other. as_ptr ( ) . cast :: < __m256i > ( ) . read_unaligned ( ) ;
73+ let s = _mm256_add_epi64 ( o, _mm256_set1_epi64x ( d as i64 ) ) ;
6574
66- "4:" ,
67- "add {i}, -32 * 4" ,
68- "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 96]" ,
69- "vpmovmskb {t}, {vt}" ,
70- "cmp {t}, 1" ,
71- "adc {sum}, 0" ,
72- "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 64]" ,
73- "vpmovmskb {t}, {vt}" ,
74- "cmp {t}, 1" ,
75- "adc {sum}, 0" ,
76- "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 32]" ,
77- "vpmovmskb {t}, {vt}" ,
78- "cmp {t}, 1" ,
79- "adc {sum}, 0" ,
80- "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i}]" ,
81- "vpmovmskb {t}, {vt}" ,
82- "cmp {t}, 1" ,
83- "adc {sum}, 0" ,
84- "5:" ,
85- "cmp {i}, 96" ,
86- "jg 4b" , // Loop
87- "cmp {i}, 32" ,
88- "jl 2f" , // Is zero
89- "je 3f" , // Is one
90- // Is 2 or 3
75+ let s = _mm256_movemask_epi8 ( s) as u32 ;
9176
92- "4:" ,
93- "add {i}, -32 * 2" ,
94- "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 32]" ,
95- "vpmovmskb {t}, {vt}" ,
96- "cmp {t}, 1" ,
97- "adc {sum}, 0" ,
98- "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i}]" ,
99- "vpmovmskb {t}, {vt}" ,
100- "cmp {t}, 1" ,
101- "adc {sum}, 0" ,
102- "cmp {i}, 32" ,
103- "jg 4b" , // Loop
104- "jne 2f" , // Is zero
77+ let s = !( !s << ( 3 - j) * 8 ) ;
78+ sum += ( ( s & 0x00_FF_00_00 ) == 0 ) as u64 ;
79+ sum += ( ( s & 0x00_00_FF_00 ) == 0 ) as u64 ;
80+ sum += ( ( s & 0x00_00_00_FF ) == 0 ) as u64 ;
81+ }
10582
106- "3:" ,
107- "vpcmpeqb {vt}, {d}, ymmword ptr [{os}]" ,
108- "vpmovmskb {t}, {vt}" ,
109- "cmp {t}, 1" ,
110- "adc {sum}, 0" ,
111- "2:" ,
112- os = in( reg) holes,
113- max_i = in( reg) holes_i,
114- d = in( ymm_reg) d,
115- i = out( reg) _,
116- sum = inout( reg) sum,
117- t = out( reg) _,
118- vt = out( ymm_reg) _,
119- options( nostack) ,
120- ) ;
121- let d = _mm256_and_si256 ( d, _mm256_set1_epi8 ( KEPT_BITS ) ) ;
122- let d = _mm256_or_si256 (
123- d,
124- _mm256_setr_epi8 (
125- 0 , 0 , 0 , 0 , 0 , -1 , //
126- 0 , 0 , 0 , 0 , 0 , -1 , //
127- 0 , 0 , 0 , 0 , 0 , -1 , //
128- 0 , 0 , 0 , 0 , 0 , -1 , //
129- 0 , 0 , 0 , 0 , 0 , -1 , -1 , -1 ,
130- ) ,
131- ) ;
132- * keys. get_unchecked_mut ( keys_i) = d;
133- keys_i += 1 ;
83+ let d = d + 0x7A7A7A7A7A ;
84+ if is_key {
85+ keys. push_unchecked ( d) ;
13486 } else {
135- std:: arch:: asm!(
136- "test {max_i}, {max_i}" ,
137- "je 2f" , // Jump on empty
138- "cmp {max_i}, 1" ,
139- "je 3f" , // Jump to one case
140- "mov {i}, {max_i}" ,
141- "shl {i}, 5" ,
142- "jmp 5f" ,
143-
144- "4:" ,
145- "add {i}, -32 * 4" ,
146- "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 96]" ,
147- "vpmovmskb {t}, {vt}" ,
148- "cmp {t}, 1" ,
149- "adc {sum}, 0" ,
150- "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 64]" ,
151- "vpmovmskb {t}, {vt}" ,
152- "cmp {t}, 1" ,
153- "adc {sum}, 0" ,
154- "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 32]" ,
155- "vpmovmskb {t}, {vt}" ,
156- "cmp {t}, 1" ,
157- "adc {sum}, 0" ,
158- "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i}]" ,
159- "vpmovmskb {t}, {vt}" ,
160- "cmp {t}, 1" ,
161- "adc {sum}, 0" ,
162- "5:" ,
163- "cmp {i}, 96" ,
164- "jg 4b" , // Loop
165- "cmp {i}, 32" ,
166- "jl 2f" , // Is zero
167- "je 3f" , // Is one
168- // Is 2 or 3
169-
170- "4:" ,
171- "add {i}, -32 * 2" ,
172- "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 32]" ,
173- "vpmovmskb {t}, {vt}" ,
174- "cmp {t}, 1" ,
175- "adc {sum}, 0" ,
176- "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i}]" ,
177- "vpmovmskb {t}, {vt}" ,
178- "cmp {t}, 1" ,
179- "adc {sum}, 0" ,
180- "cmp {i}, 32" ,
181- "jg 4b" , // Loop
182- "jne 2f" , // Is zero
183-
184- "3:" ,
185- "vpcmpeqb {vt}, {d}, ymmword ptr [{os}]" ,
186- "vpmovmskb {t}, {vt}" ,
187- "cmp {t}, 1" ,
188- "adc {sum}, 0" ,
189- "2:" ,
190- os = in( reg) keys,
191- max_i = in( reg) keys_i,
192- d = in( ymm_reg) d,
193- i = out( reg) _,
194- sum = inout( reg) sum,
195- t = out( reg) _,
196- vt = out( ymm_reg) _,
197- options( nostack) ,
198- ) ;
199- let d = _mm256_and_si256 ( d, _mm256_set1_epi8 ( KEPT_BITS ) ) ;
200- let d = _mm256_or_si256 (
201- d,
202- _mm256_setr_epi8 (
203- 0 , 0 , 0 , 0 , 0 , -1 , //
204- 0 , 0 , 0 , 0 , 0 , -1 , //
205- 0 , 0 , 0 , 0 , 0 , -1 , //
206- 0 , 0 , 0 , 0 , 0 , -1 , //
207- 0 , 0 , 0 , 0 , 0 , -1 , -1 , -1 ,
208- ) ,
209- ) ;
210- * holes. get_unchecked_mut ( holes_i) = d;
211- holes_i += 1 ;
87+ holes. push_unchecked ( d) ;
21288 }
21389
21490 i += DS ;
0 commit comments