@@ -56,8 +56,36 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
5656 "je 3f" , // Jump to one case
5757 "mov {i}, {max_i}" ,
5858 "shl {i}, 5" ,
59+ "jmp 5f" ,
60+
61+ "4:" ,
62+ "add {i}, -32 * 4" ,
63+ "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 96]" ,
64+ "vpmovmskb {t}, {vt}" ,
65+ "cmp {t}, 1" ,
66+ "adc {sum}, 0" ,
67+ "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 64]" ,
68+ "vpmovmskb {t}, {vt}" ,
69+ "cmp {t}, 1" ,
70+ "adc {sum}, 0" ,
71+ "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 32]" ,
72+ "vpmovmskb {t}, {vt}" ,
73+ "cmp {t}, 1" ,
74+ "adc {sum}, 0" ,
75+ "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i}]" ,
76+ "vpmovmskb {t}, {vt}" ,
77+ "cmp {t}, 1" ,
78+ "adc {sum}, 0" ,
79+ "5:" ,
80+ "cmp {i}, 96" ,
81+ "jg 4b" , // Loop
82+ "cmp {i}, 32" ,
83+ "jl 2f" , // Is zero
84+ "je 3f" , // Is one
85+ // Is 2 or 3
86+
5987 "4:" ,
60- "add {i}, -64 " ,
88+ "add {i}, -32 * 2 " ,
6189 "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 32]" ,
6290 "vpmovmskb {t}, {vt}" ,
6391 "cmp {t}, 1" ,
@@ -69,6 +97,7 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
6997 "cmp {i}, 32" ,
7098 "jg 4b" , // Loop
7199 "jne 2f" , // Is zero
100+
72101 "3:" ,
73102 "vpcmpeqb {vt}, {d}, ymmword ptr [{os}]" ,
74103 "vpmovmskb {t}, {vt}" ,
@@ -105,8 +134,36 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
105134 "je 3f" , // Jump to one case
106135 "mov {i}, {max_i}" ,
107136 "shl {i}, 5" ,
137+ "jmp 5f" ,
138+
139+ "4:" ,
140+ "add {i}, -32 * 4" ,
141+ "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 96]" ,
142+ "vpmovmskb {t}, {vt}" ,
143+ "cmp {t}, 1" ,
144+ "adc {sum}, 0" ,
145+ "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 64]" ,
146+ "vpmovmskb {t}, {vt}" ,
147+ "cmp {t}, 1" ,
148+ "adc {sum}, 0" ,
149+ "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 32]" ,
150+ "vpmovmskb {t}, {vt}" ,
151+ "cmp {t}, 1" ,
152+ "adc {sum}, 0" ,
153+ "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i}]" ,
154+ "vpmovmskb {t}, {vt}" ,
155+ "cmp {t}, 1" ,
156+ "adc {sum}, 0" ,
157+ "5:" ,
158+ "cmp {i}, 96" ,
159+ "jg 4b" , // Loop
160+ "cmp {i}, 32" ,
161+ "jl 2f" , // Is zero
162+ "je 3f" , // Is one
163+ // Is 2 or 3
164+
108165 "4:" ,
109- "add {i}, -64 " ,
166+ "add {i}, -32 * 2 " ,
110167 "vpcmpeqb {vt}, {d}, ymmword ptr [{os} + {i} + 32]" ,
111168 "vpmovmskb {t}, {vt}" ,
112169 "cmp {t}, 1" ,
@@ -118,6 +175,7 @@ unsafe fn part1_inner(s: &[u8]) -> u64 {
118175 "cmp {i}, 32" ,
119176 "jg 4b" , // Loop
120177 "jne 2f" , // Is zero
178+
121179 "3:" ,
122180 "vpcmpeqb {vt}, {d}, ymmword ptr [{os}]" ,
123181 "vpmovmskb {t}, {vt}" ,
0 commit comments