@@ -161,6 +161,67 @@ private static nint CountSimd<T>(ref T r0, nint length, T value)
161161
162162 var partials = Vector < T > . Zero ;
163163
164+ // Unrolled vectorized loop, with 8 unrolled iterations. We only run this when the
165+ // current type T is at least 2 bytes in size, otherwise the average chunk length
166+ // would always be too small to be able to trigger the unrolled loop, and the overall
167+ // performance would just be slightly worse due to the additional conditional branches.
168+ if ( typeof ( T ) != typeof ( sbyte ) )
169+ {
170+ while ( chunkLength >= Vector < T > . Count * 8 )
171+ {
172+ ref T ri0 = ref Unsafe . Add ( ref r0 , offset + ( Vector < T > . Count * 0 ) ) ;
173+ var vi0 = Unsafe . As < T , Vector < T > > ( ref ri0 ) ;
174+ var ve0 = Vector . Equals ( vi0 , vc ) ;
175+
176+ partials -= ve0 ;
177+
178+ ref T ri1 = ref Unsafe . Add ( ref r0 , offset + ( Vector < T > . Count * 1 ) ) ;
179+ var vi1 = Unsafe . As < T , Vector < T > > ( ref ri1 ) ;
180+ var ve1 = Vector . Equals ( vi1 , vc ) ;
181+
182+ partials -= ve1 ;
183+
184+ ref T ri2 = ref Unsafe . Add ( ref r0 , offset + ( Vector < T > . Count * 2 ) ) ;
185+ var vi2 = Unsafe . As < T , Vector < T > > ( ref ri2 ) ;
186+ var ve2 = Vector . Equals ( vi2 , vc ) ;
187+
188+ partials -= ve2 ;
189+
190+ ref T ri3 = ref Unsafe . Add ( ref r0 , offset + ( Vector < T > . Count * 3 ) ) ;
191+ var vi3 = Unsafe . As < T , Vector < T > > ( ref ri3 ) ;
192+ var ve3 = Vector . Equals ( vi3 , vc ) ;
193+
194+ partials -= ve3 ;
195+
196+ ref T ri4 = ref Unsafe . Add ( ref r0 , offset + ( Vector < T > . Count * 4 ) ) ;
197+ var vi4 = Unsafe . As < T , Vector < T > > ( ref ri4 ) ;
198+ var ve4 = Vector . Equals ( vi4 , vc ) ;
199+
200+ partials -= ve4 ;
201+
202+ ref T ri5 = ref Unsafe . Add ( ref r0 , offset + ( Vector < T > . Count * 5 ) ) ;
203+ var vi5 = Unsafe . As < T , Vector < T > > ( ref ri5 ) ;
204+ var ve5 = Vector . Equals ( vi5 , vc ) ;
205+
206+ partials -= ve5 ;
207+
208+ ref T ri6 = ref Unsafe . Add ( ref r0 , offset + ( Vector < T > . Count * 6 ) ) ;
209+ var vi6 = Unsafe . As < T , Vector < T > > ( ref ri6 ) ;
210+ var ve6 = Vector . Equals ( vi6 , vc ) ;
211+
212+ partials -= ve6 ;
213+
214+ ref T ri7 = ref Unsafe . Add ( ref r0 , offset + ( Vector < T > . Count * 7 ) ) ;
215+ var vi7 = Unsafe . As < T , Vector < T > > ( ref ri7 ) ;
216+ var ve7 = Vector . Equals ( vi7 , vc ) ;
217+
218+ partials -= ve7 ;
219+
220+ chunkLength -= Vector < T > . Count * 8 ;
221+ offset += Vector < T > . Count * 8 ;
222+ }
223+ }
224+
164225 while ( chunkLength >= Vector < T > . Count )
165226 {
166227 ref T ri = ref Unsafe . Add ( ref r0 , offset ) ;
0 commit comments