@@ -78,28 +78,24 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
7878 "vpcmpeqq {vt}, {vt}, {zero}" ,
7979 "vpmovmskb {t}, {vt}" ,
8080 "popcnt {t}, {t}" ,
81- "shr {t}, 3" ,
8281 "add {sum},{t}" ,
8382 "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} + 32*2]" ,
8483 "vpand {vt}, {vt}, {msb}" ,
8584 "vpcmpeqq {vt}, {vt}, {zero}" ,
8685 "vpmovmskb {t}, {vt}" ,
8786 "popcnt {t}, {t}" ,
88- "shr {t}, 3" ,
8987 "add {sum},{t}" ,
9088 "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} + 32*1]" ,
9189 "vpand {vt}, {vt}, {msb}" ,
9290 "vpcmpeqq {vt}, {vt}, {zero}" ,
9391 "vpmovmskb {t}, {vt}" ,
9492 "popcnt {t}, {t}" ,
95- "shr {t}, 3" ,
9693 "add {sum},{t}" ,
9794 "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} + 32*0]" ,
9895 "vpand {vt}, {vt}, {msb}" ,
9996 "vpcmpeqq {vt}, {vt}, {zero}" ,
10097 "vpmovmskb {t}, {vt}" ,
10198 "popcnt {t}, {t}" ,
102- "shr {t}, 3" ,
10399 "add {sum},{t}" ,
104100 "cmp {i}, 16" ,
105101 "jae 4b" , // Loop
@@ -115,7 +111,6 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
115111 "vpcmpeqq {vt}, {vt}, {zero}" ,
116112 "vpmovmskb {t}, {vt}" ,
117113 "popcnt {t}, {t}" ,
118- "shr {t}, 3" ,
119114 "add {sum},{t}" ,
120115 "cmp {i}, 4" ,
121116 "jae 5b" , // Loop
@@ -130,7 +125,6 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
130125 "vpmovmskb {t}, {vt}" ,
131126 "and {t:e}, dword ptr [{lut} + 4*{i}]" ,
132127 "popcnt {t}, {t}" ,
133- "shr {t}, 3" ,
134128 "add {sum},{t}" ,
135129 "2:" ,
136130 d = in( ymm_reg) _mm256_set1_epi64x( d as i64 ) ,
@@ -162,28 +156,24 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
162156 "vpcmpeqq {vt}, {vt}, {zero}" ,
163157 "vpmovmskb {t}, {vt}" ,
164158 "popcnt {t}, {t}" ,
165- "shr {t}, 3" ,
166159 "add {sum},{t}" ,
167160 "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} + 32*2]" ,
168161 "vpand {vt}, {vt}, {msb}" ,
169162 "vpcmpeqq {vt}, {vt}, {zero}" ,
170163 "vpmovmskb {t}, {vt}" ,
171164 "popcnt {t}, {t}" ,
172- "shr {t}, 3" ,
173165 "add {sum},{t}" ,
174166 "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} + 32*1]" ,
175167 "vpand {vt}, {vt}, {msb}" ,
176168 "vpcmpeqq {vt}, {vt}, {zero}" ,
177169 "vpmovmskb {t}, {vt}" ,
178170 "popcnt {t}, {t}" ,
179- "shr {t}, 3" ,
180171 "add {sum},{t}" ,
181172 "vpaddq {vt}, {d}, ymmword ptr [{os} + 8*{i} + 32*0]" ,
182173 "vpand {vt}, {vt}, {msb}" ,
183174 "vpcmpeqq {vt}, {vt}, {zero}" ,
184175 "vpmovmskb {t}, {vt}" ,
185176 "popcnt {t}, {t}" ,
186- "shr {t}, 3" ,
187177 "add {sum},{t}" ,
188178 "cmp {i}, 16" ,
189179 "jae 4b" , // Loop
@@ -199,7 +189,6 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
199189 "vpcmpeqq {vt}, {vt}, {zero}" ,
200190 "vpmovmskb {t}, {vt}" ,
201191 "popcnt {t}, {t}" ,
202- "shr {t}, 3" ,
203192 "add {sum},{t}" ,
204193 "cmp {i}, 4" ,
205194 "jae 5b" , // Loop
@@ -214,7 +203,6 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
214203 "vpmovmskb {t}, {vt}" ,
215204 "and {t:e}, dword ptr [{lut} + 4*{i}]" ,
216205 "popcnt {t}, {t}" ,
217- "shr {t}, 3" ,
218206 "add {sum},{t}" ,
219207 "2:" ,
220208 d = in( ymm_reg) _mm256_set1_epi64x( d as i64 ) ,
@@ -236,7 +224,7 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
236224 i += DS ;
237225 }
238226
239- sum
227+ sum / 8
240228}
241229
242230#[ cfg( test) ]
0 commit comments