@@ -4,6 +4,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
+ using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;

#pragma warning disable IDE0007 // Use implicit type
@@ -95,7 +96,7 @@ private static unsafe uint CalculateSse(uint adler, ReadOnlySpan<byte> buffer)
Vector128<sbyte> tap1 = Sse2.LoadVector128((sbyte*)tapPtr);
Vector128<sbyte> tap2 = Sse2.LoadVector128((sbyte*)(tapPtr + 0x10));
Vector128<byte> zero = Vector128<byte>.Zero;
- var ones = Vector128.Create((short)1);
+ Vector128<short> ones = Vector128.Create((short)1);

while (blocks > 0)
{
@@ -179,13 +180,13 @@ public static unsafe uint CalculateAvx2(uint adler, ReadOnlySpan<byte> buffer)
byte* localBufferPtr = bufferPtr;

Vector256<byte> zero = Vector256<byte>.Zero;
- var dot3v = Vector256.Create((short)1);
- var dot2v = Vector256.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ Vector256<short> dot3v = Vector256.Create((short)1);
+ Vector256<sbyte> dot2v = Vector256.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);

// Process n blocks of data. At most NMAX data bytes can be
// processed before s2 must be reduced modulo BASE.
- var vs1 = Vector256.CreateScalar(s1);
- var vs2 = Vector256.CreateScalar(s2);
+ Vector256<uint> vs1 = Vector256.CreateScalar(s1);
+ Vector256<uint> vs2 = Vector256.CreateScalar(s2);

while (length >= 32)
{
@@ -243,6 +244,100 @@ public static unsafe uint CalculateAvx2(uint adler, ReadOnlySpan<byte> buffer)
}
}

+ // Based on: https://github.com/chromium/chromium/blob/master/third_party/zlib/adler32_simd.c
+ [MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)]
+ private static unsafe uint CalculateArm(uint adler, ReadOnlySpan<byte> buffer)
+ {
+ // Split Adler-32 into component sums.
+ uint s1 = adler & 0xFFFF;
+ uint s2 = (adler >> 16) & 0xFFFF;
+ uint length = (uint)buffer.Length;
+
+ // Process the data in blocks.
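+ // BlockSize is 32 bytes: each inner iteration consumes two 16-byte NEON vectors.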
+ long blocks = length / BlockSize;
+ length -= (uint)(blocks * BlockSize);
+ fixed (byte* bufferPtr = &MemoryMarshal.GetReference(buffer))
+ {
+ byte* localBufferPtr = bufferPtr;
+
+ while (blocks != 0)
+ {
+ uint n = NMAX / BlockSize;
+ if (n > blocks)
+ {
+ n = (uint)blocks;
+ }
+
+ blocks -= n;
+
+ // Process n blocks of data. At most NMAX data bytes can be
+ // processed before s2 must be reduced modulo BASE.
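+ // Seeding vs2 with s1 * n is deliberate: the shift left by 5 after the inner
+ // loop multiplies it by 32, so the incoming s1 is counted once per processed byte.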
+ Vector128<uint> vs1 = Vector128<uint>.Zero;
+ Vector128<uint> vs2 = vs1.WithElement(3, s1 * n);
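+ // Each vColumnSum accumulates eight byte positions (columns) across the n
+ // blocks; their positional weights [32..1] are applied after the loop.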
+ Vector128<ushort> vColumnSum1 = Vector128<ushort>.Zero;
+ Vector128<ushort> vColumnSum2 = Vector128<ushort>.Zero;
+ Vector128<ushort> vColumnSum3 = Vector128<ushort>.Zero;
+ Vector128<ushort> vColumnSum4 = Vector128<ushort>.Zero;
+
+ do
+ {
+ // Load 32 input bytes.
+ Vector128<ushort> bytes1 = AdvSimd.LoadVector128(localBufferPtr).AsUInt16();
+ Vector128<ushort> bytes2 = AdvSimd.LoadVector128(localBufferPtr + 0x10).AsUInt16();
+
+ // Add previous block byte sum to v_s2.
+ vs2 = AdvSimd.Add(vs2, vs1);
+
+ // Horizontally add the bytes for s1.
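+ // bytes1 is pairwise-widened (u8 -> u16), bytes2 is widened and added on top,
+ // then the eight u16 partial sums are pairwise-widened into the four u32 lanes of vs1.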
+ vs1 = AdvSimd.AddPairwiseWideningAndAdd(
+ vs1.AsUInt32(),
+ AdvSimd.AddPairwiseWideningAndAdd(AdvSimd.AddPairwiseWidening(bytes1.AsByte()).AsUInt16(), bytes2.AsByte()));
+
+ // Vertically add the bytes for s2.
+ vColumnSum1 = AdvSimd.AddWideningLower(vColumnSum1, bytes1.GetLower().AsByte());
+ vColumnSum2 = AdvSimd.AddWideningLower(vColumnSum2, bytes1.GetUpper().AsByte());
+ vColumnSum3 = AdvSimd.AddWideningLower(vColumnSum3, bytes2.GetLower().AsByte());
+ vColumnSum4 = AdvSimd.AddWideningLower(vColumnSum4, bytes2.GetUpper().AsByte());
+
+ localBufferPtr += BlockSize;
+ }
+ while (--n > 0);
+
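+ // Shift vs2 left by 5 (multiply by 32): each block's running byte sum enters
+ // s2 once for every one of the 32 bytes that follow it.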
+ vs2 = AdvSimd.ShiftLeftLogical(vs2, 5);
+
+ // Multiply-add bytes by [32, 31, 30, ...] for s2.
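+ // Each group of four u16 column sums is widening-multiplied by its weights
+ // and accumulated into the four u32 lanes of vs2.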
+ vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum1.GetLower(), Vector64.Create((ushort)32, 31, 30, 29));
+ vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum1.GetUpper(), Vector64.Create((ushort)28, 27, 26, 25));
+ vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum2.GetLower(), Vector64.Create((ushort)24, 23, 22, 21));
+ vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum2.GetUpper(), Vector64.Create((ushort)20, 19, 18, 17));
+ vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum3.GetLower(), Vector64.Create((ushort)16, 15, 14, 13));
+ vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum3.GetUpper(), Vector64.Create((ushort)12, 11, 10, 9));
+ vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum4.GetLower(), Vector64.Create((ushort)8, 7, 6, 5));
+ vs2 = AdvSimd.MultiplyWideningLowerAndAdd(vs2, vColumnSum4.GetUpper(), Vector64.Create((ushort)4, 3, 2, 1));
+
+ // Horizontally reduce the four uint lanes of vs1 and vs2, then accumulate into s1 and s2.
+ Vector64<uint> sum1 = AdvSimd.AddPairwise(vs1.GetLower(), vs1.GetUpper());
+ Vector64<uint> sum2 = AdvSimd.AddPairwise(vs2.GetLower(), vs2.GetUpper());
+ Vector64<uint> s1s2 = AdvSimd.AddPairwise(sum1, sum2);
+
+ // Store the results.
+ s1 += AdvSimd.Extract(s1s2, 0);
+ s2 += AdvSimd.Extract(s1s2, 1);
+
+ // Reduce.
+ s1 %= BASE;
+ s2 %= BASE;
+ }
+
+ if (length > 0)
+ {
+ HandleLeftOver(localBufferPtr, length, ref s1, ref s2);
+ }
+
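+ // Recombine the component sums into the final 32-bit checksum.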
+ return s1 | (s2 << 16);
+ }
+ }
+
private static unsafe void HandleLeftOver(byte* localBufferPtr, uint length, ref uint s1, ref uint s2)
{
if (length >= 16)
@@ -286,7 +381,6 @@ private static unsafe uint CalculateScalar(uint adler, ReadOnlySpan<byte> buffer)
{
uint s1 = adler & 0xFFFF;
uint s2 = (adler >> 16) & 0xFFFF;
- uint k;

fixed (byte* bufferPtr = buffer)
{
@@ -295,7 +389,7 @@ private static unsafe uint CalculateScalar(uint adler, ReadOnlySpan<byte> buffer)

while (length > 0)
{
- k = length < NMAX ? length : NMAX;
+ uint k = length < NMAX ? length : NMAX;
length -= k;

while (k >= 16)