@@ -66,7 +66,7 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
6666 "je 2f" , // Jump on empty
6767 "mov {i}, {max_i}" ,
6868 "cmp {i}, 16" ,
69- "jb 7f " , // Jump to < 16 case
69+ "jb 6f " , // Jump to < 16 case
7070
7171 "4:" ,
7272 "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} - 32]" ,
@@ -96,47 +96,26 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
9696 "add {sum},{t}" ,
9797 "cmp {i}, 16" ,
9898 "jae 4b" , // Loop
99- "7:" ,
100- // i < 16
99+ "6:" ,
101100 "cmp {i}, 4" ,
102101 "jb 3f" , // Is < 4
103- "cmp {i}, 8" ,
104- "jb 5f" , // Is < 8
105- "cmp {i}, 12" ,
106- "jb 6f" , // Is < 8
107-
108- // i < 16
109- "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} - 32]" ,
110- "add {i}, -4" ,
111- "vpand {vt}, {vt}, {msb}" ,
112- "vpcmpeqq {vt}, {vt}, {zero}" ,
113- "vpmovmskb {t}, {vt}" ,
114- "popcnt {t}, {t}" ,
115- "add {sum},{t}" ,
116-
117- "6:" ,
118- // i < 12
119- "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} - 32]" ,
120- "add {i}, -4" ,
121- "vpand {vt}, {vt}, {msb}" ,
122- "vpcmpeqq {vt}, {vt}, {zero}" ,
123- "vpmovmskb {t}, {vt}" ,
124- "popcnt {t}, {t}" ,
125- "add {sum},{t}" ,
102+ // Is >= 4 and < 16
126103
127104 "5:" ,
128- // i < 8
129105 "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} - 32]" ,
130106 "add {i}, -4" ,
131107 "vpand {vt}, {vt}, {msb}" ,
132108 "vpcmpeqq {vt}, {vt}, {zero}" ,
133109 "vpmovmskb {t}, {vt}" ,
134110 "popcnt {t}, {t}" ,
135111 "add {sum},{t}" ,
136-
112+ "cmp {i}, 4" ,
113+ "jae 5b" , // Loop
137114 "3:" ,
138115 "test {i}, {i}" ,
139116 "je 2f" , // Is zero
117+
118+ // Is > 0 and < 4
140119 "vpaddq {vt}, {d}, ymmword ptr [{os}]" ,
141120 "vpand {vt}, {vt}, {msb}" ,
142121 "vpcmpeqq {vt}, {vt}, {zero}" ,
@@ -161,11 +140,12 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
161140 keys_i += 1 ;
162141 } else {
163142 std:: arch:: asm!(
143+
164144 "test {max_i}, {max_i}" ,
165145 "je 2f" , // Jump on empty
166146 "mov {i}, {max_i}" ,
167147 "cmp {i}, 16" ,
168- "jb 7f " , // Jump to < 16 case
148+ "jb 6f " , // Jump to < 16 case
169149
170150 "4:" ,
171151 "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} - 32]" ,
@@ -195,47 +175,26 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
195175 "add {sum},{t}" ,
196176 "cmp {i}, 16" ,
197177 "jae 4b" , // Loop
198- "7:" ,
199- // i < 16
178+ "6:" ,
200179 "cmp {i}, 4" ,
201180 "jb 3f" , // Is < 4
202- "cmp {i}, 8" ,
203- "jb 5f" , // Is < 8
204- "cmp {i}, 12" ,
205- "jb 6f" , // Is < 8
206-
207- // i < 16
208- "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} - 32]" ,
209- "add {i}, -4" ,
210- "vpand {vt}, {vt}, {msb}" ,
211- "vpcmpeqq {vt}, {vt}, {zero}" ,
212- "vpmovmskb {t}, {vt}" ,
213- "popcnt {t}, {t}" ,
214- "add {sum},{t}" ,
215-
216- "6:" ,
217- // i < 12
218- "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} - 32]" ,
219- "add {i}, -4" ,
220- "vpand {vt}, {vt}, {msb}" ,
221- "vpcmpeqq {vt}, {vt}, {zero}" ,
222- "vpmovmskb {t}, {vt}" ,
223- "popcnt {t}, {t}" ,
224- "add {sum},{t}" ,
181+ // Is >= 4 and < 16
225182
226183 "5:" ,
227- // i < 8
228184 "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} - 32]" ,
229185 "add {i}, -4" ,
230186 "vpand {vt}, {vt}, {msb}" ,
231187 "vpcmpeqq {vt}, {vt}, {zero}" ,
232188 "vpmovmskb {t}, {vt}" ,
233189 "popcnt {t}, {t}" ,
234190 "add {sum},{t}" ,
235-
191+ "cmp {i}, 4" ,
192+ "jae 5b" , // Loop
236193 "3:" ,
237194 "test {i}, {i}" ,
238195 "je 2f" , // Is zero
196+
197+ // Is > 0 and < 4
239198 "vpaddq {vt}, {d}, ymmword ptr [{os}]" ,
240199 "vpand {vt}, {vt}, {msb}" ,
241200 "vpcmpeqq {vt}, {vt}, {zero}" ,
0 commit comments