@@ -66,7 +66,7 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
6666 "je 2f" , // Jump on empty
6767 "mov {i}, {max_i}" ,
6868 "cmp {i}, 16" ,
69- "jb 6f " , // Jump to < 16 case
69+ "jb 7f " , // Jump to < 16 case
7070
7171 "4:" ,
7272 "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} - 32]" ,
@@ -96,26 +96,47 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
9696 "add {sum},{t}" ,
9797 "cmp {i}, 16" ,
9898 "jae 4b" , // Loop
99- "6:" ,
99+ "7:" ,
100+ // i < 16
100101 "cmp {i}, 4" ,
101102 "jb 3f" , // Is < 4
102- // Is >= 4 and < 16
103+ "cmp {i}, 8" ,
104+ "jb 5f" , // Is < 8
105+ "cmp {i}, 12" ,
106+ "jb 6f" , // Is < 12
107+
108+ // i < 16
109+ "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} - 32]" ,
110+ "add {i}, -4" ,
111+ "vpand {vt}, {vt}, {msb}" ,
112+ "vpcmpeqq {vt}, {vt}, {zero}" ,
113+ "vpmovmskb {t}, {vt}" ,
114+ "popcnt {t}, {t}" ,
115+ "add {sum},{t}" ,
116+
117+ "6:" ,
118+ // i < 12
119+ "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} - 32]" ,
120+ "add {i}, -4" ,
121+ "vpand {vt}, {vt}, {msb}" ,
122+ "vpcmpeqq {vt}, {vt}, {zero}" ,
123+ "vpmovmskb {t}, {vt}" ,
124+ "popcnt {t}, {t}" ,
125+ "add {sum},{t}" ,
103126
104127 "5:" ,
128+ // i < 8
105129 "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} - 32]" ,
106130 "add {i}, -4" ,
107131 "vpand {vt}, {vt}, {msb}" ,
108132 "vpcmpeqq {vt}, {vt}, {zero}" ,
109133 "vpmovmskb {t}, {vt}" ,
110134 "popcnt {t}, {t}" ,
111135 "add {sum},{t}" ,
112- "cmp {i}, 4" ,
113- "jae 5b" , // Loop
136+
114137 "3:" ,
115138 "test {i}, {i}" ,
116139 "je 2f" , // Is zero
117-
118- // Is > 0 and < 4
119140 "vpaddq {vt}, {d}, ymmword ptr [{os}]" ,
120141 "vpand {vt}, {vt}, {msb}" ,
121142 "vpcmpeqq {vt}, {vt}, {zero}" ,
@@ -140,12 +161,11 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
140161 keys_i += 1 ;
141162 } else {
142163 std:: arch:: asm!(
143-
144164 "test {max_i}, {max_i}" ,
145165 "je 2f" , // Jump on empty
146166 "mov {i}, {max_i}" ,
147167 "cmp {i}, 16" ,
148- "jb 6f " , // Jump to < 16 case
168+ "jb 7f " , // Jump to < 16 case
149169
150170 "4:" ,
151171 "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} - 32]" ,
@@ -175,26 +195,47 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
175195 "add {sum},{t}" ,
176196 "cmp {i}, 16" ,
177197 "jae 4b" , // Loop
178- "6:" ,
198+ "7:" ,
199+ // i < 16
179200 "cmp {i}, 4" ,
180201 "jb 3f" , // Is < 4
181- // Is >= 4 and < 16
202+ "cmp {i}, 8" ,
203+ "jb 5f" , // Is < 8
204+ "cmp {i}, 12" ,
205+ "jb 6f" , // Is < 12
206+
207+ // i < 16
208+ "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} - 32]" ,
209+ "add {i}, -4" ,
210+ "vpand {vt}, {vt}, {msb}" ,
211+ "vpcmpeqq {vt}, {vt}, {zero}" ,
212+ "vpmovmskb {t}, {vt}" ,
213+ "popcnt {t}, {t}" ,
214+ "add {sum},{t}" ,
215+
216+ "6:" ,
217+ // i < 12
218+ "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} - 32]" ,
219+ "add {i}, -4" ,
220+ "vpand {vt}, {vt}, {msb}" ,
221+ "vpcmpeqq {vt}, {vt}, {zero}" ,
222+ "vpmovmskb {t}, {vt}" ,
223+ "popcnt {t}, {t}" ,
224+ "add {sum},{t}" ,
182225
183226 "5:" ,
227+ // i < 8
184228 "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} - 32]" ,
185229 "add {i}, -4" ,
186230 "vpand {vt}, {vt}, {msb}" ,
187231 "vpcmpeqq {vt}, {vt}, {zero}" ,
188232 "vpmovmskb {t}, {vt}" ,
189233 "popcnt {t}, {t}" ,
190234 "add {sum},{t}" ,
191- "cmp {i}, 4" ,
192- "jae 5b" , // Loop
235+
193236 "3:" ,
194237 "test {i}, {i}" ,
195238 "je 2f" , // Is zero
196-
197- // Is > 0 and < 4
198239 "vpaddq {vt}, {d}, ymmword ptr [{os}]" ,
199240 "vpand {vt}, {vt}, {msb}" ,
200241 "vpcmpeqq {vt}, {vt}, {zero}" ,
0 commit comments