@@ -72,27 +72,27 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
7272 "jb 6f" , // Jump to < 16 case
7373
7474 "4:" ,
75+ "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} - 32]" ,
7576 "add {i}, -16" ,
76- "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} + 32*3]" ,
7777 "vpand {vt}, {vt}, {msb}" ,
7878 "vpcmpeqq {vt}, {vt}, {zero}" ,
7979 "vpmovmskb {t}, {vt}" ,
80- "popcnt {t}, {t}" ,
81- "add {sum},{t}" ,
8280 "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} + 32*2]" ,
81+ "popcnt {t}, {t}" ,
8382 "vpand {vt}, {vt}, {msb}" ,
83+ "add {sum},{t}" ,
8484 "vpcmpeqq {vt}, {vt}, {zero}" ,
8585 "vpmovmskb {t}, {vt}" ,
86- "popcnt {t}, {t}" ,
87- "add {sum},{t}" ,
8886 "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} + 32*1]" ,
87+ "popcnt {t}, {t}" ,
8988 "vpand {vt}, {vt}, {msb}" ,
89+ "add {sum},{t}" ,
9090 "vpcmpeqq {vt}, {vt}, {zero}" ,
9191 "vpmovmskb {t}, {vt}" ,
92- "popcnt {t}, {t}" ,
93- "add {sum},{t}" ,
9492 "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} + 32*0]" ,
93+ "popcnt {t}, {t}" ,
9594 "vpand {vt}, {vt}, {msb}" ,
95+ "add {sum},{t}" ,
9696 "vpcmpeqq {vt}, {vt}, {zero}" ,
9797 "vpmovmskb {t}, {vt}" ,
9898 "popcnt {t}, {t}" ,
@@ -105,8 +105,8 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
105105 // Is >= 4 and < 16
106106
107107 "5:" ,
108+ "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} - 32]" ,
108109 "add {i}, -4" ,
109- "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i}]" ,
110110 "vpand {vt}, {vt}, {msb}" ,
111111 "vpcmpeqq {vt}, {vt}, {zero}" ,
112112 "vpmovmskb {t}, {vt}" ,
@@ -143,34 +143,35 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
143143 keys_i += 1 ;
144144 } else {
145145 std:: arch:: asm!(
146+
146147 "test {max_i}, {max_i}" ,
147148 "je 2f" , // Jump on empty
148149 "mov {i}, {max_i}" ,
149150 "cmp {i}, 16" ,
150151 "jb 6f" , // Jump to < 16 case
151152
152153 "4:" ,
154+ "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} - 32]" ,
153155 "add {i}, -16" ,
154- "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} + 32*3]" ,
155156 "vpand {vt}, {vt}, {msb}" ,
156157 "vpcmpeqq {vt}, {vt}, {zero}" ,
157158 "vpmovmskb {t}, {vt}" ,
158- "popcnt {t}, {t}" ,
159- "add {sum},{t}" ,
160159 "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} + 32*2]" ,
160+ "popcnt {t}, {t}" ,
161161 "vpand {vt}, {vt}, {msb}" ,
162+ "add {sum},{t}" ,
162163 "vpcmpeqq {vt}, {vt}, {zero}" ,
163164 "vpmovmskb {t}, {vt}" ,
164- "popcnt {t}, {t}" ,
165- "add {sum},{t}" ,
166165 "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} + 32*1]" ,
166+ "popcnt {t}, {t}" ,
167167 "vpand {vt}, {vt}, {msb}" ,
168+ "add {sum},{t}" ,
168169 "vpcmpeqq {vt}, {vt}, {zero}" ,
169170 "vpmovmskb {t}, {vt}" ,
170- "popcnt {t}, {t}" ,
171- "add {sum},{t}" ,
172171 "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} + 32*0]" ,
172+ "popcnt {t}, {t}" ,
173173 "vpand {vt}, {vt}, {msb}" ,
174+ "add {sum},{t}" ,
174175 "vpcmpeqq {vt}, {vt}, {zero}" ,
175176 "vpmovmskb {t}, {vt}" ,
176177 "popcnt {t}, {t}" ,
@@ -183,8 +184,8 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
183184 // Is >= 4 and < 16
184185
185186 "5:" ,
187+ "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} - 32]" ,
186188 "add {i}, -4" ,
187- "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i}]" ,
188189 "vpand {vt}, {vt}, {msb}" ,
189190 "vpcmpeqq {vt}, {vt}, {zero}" ,
190191 "vpmovmskb {t}, {vt}" ,
0 commit comments