4
4
using System . Runtime . CompilerServices ;
5
5
using System . Runtime . InteropServices ;
6
6
using System . Runtime . Intrinsics ;
7
+ using System . Runtime . Intrinsics . Arm ;
7
8
using System . Runtime . Intrinsics . X86 ;
8
9
9
10
#pragma warning disable IDE0007 // Use implicit type
@@ -70,6 +71,11 @@ public static uint Calculate(uint adler, ReadOnlySpan<byte> buffer)
70
71
return CalculateSse ( adler , buffer ) ;
71
72
}
72
73
74
+ if ( AdvSimd . IsSupported )
75
+ {
76
+ return CalculateArm ( adler , buffer ) ;
77
+ }
78
+
73
79
return CalculateScalar ( adler , buffer ) ;
74
80
}
75
81
@@ -95,7 +101,7 @@ private static unsafe uint CalculateSse(uint adler, ReadOnlySpan<byte> buffer)
95
101
Vector128 < sbyte > tap1 = Sse2 . LoadVector128 ( ( sbyte * ) tapPtr ) ;
96
102
Vector128 < sbyte > tap2 = Sse2 . LoadVector128 ( ( sbyte * ) ( tapPtr + 0x10 ) ) ;
97
103
Vector128 < byte > zero = Vector128 < byte > . Zero ;
98
- var ones = Vector128 . Create ( ( short ) 1 ) ;
104
+ Vector128 < short > ones = Vector128 . Create ( ( short ) 1 ) ;
99
105
100
106
while ( blocks > 0 )
101
107
{
@@ -179,13 +185,13 @@ public static unsafe uint CalculateAvx2(uint adler, ReadOnlySpan<byte> buffer)
179
185
byte * localBufferPtr = bufferPtr ;
180
186
181
187
Vector256 < byte > zero = Vector256 < byte > . Zero ;
182
- var dot3v = Vector256 . Create ( ( short ) 1 ) ;
183
- var dot2v = Vector256 . Create ( 32 , 31 , 30 , 29 , 28 , 27 , 26 , 25 , 24 , 23 , 22 , 21 , 20 , 19 , 18 , 17 , 16 , 15 , 14 , 13 , 12 , 11 , 10 , 9 , 8 , 7 , 6 , 5 , 4 , 3 , 2 , 1 ) ;
188
+ Vector256 < short > dot3v = Vector256 . Create ( ( short ) 1 ) ;
189
+ Vector256 < sbyte > dot2v = Vector256 . Create ( 32 , 31 , 30 , 29 , 28 , 27 , 26 , 25 , 24 , 23 , 22 , 21 , 20 , 19 , 18 , 17 , 16 , 15 , 14 , 13 , 12 , 11 , 10 , 9 , 8 , 7 , 6 , 5 , 4 , 3 , 2 , 1 ) ;
184
190
185
191
// Process n blocks of data. At most NMAX data bytes can be
186
192
// processed before s2 must be reduced modulo BASE.
187
- var vs1 = Vector256 . CreateScalar ( s1 ) ;
188
- var vs2 = Vector256 . CreateScalar ( s2 ) ;
193
+ Vector256 < uint > vs1 = Vector256 . CreateScalar ( s1 ) ;
194
+ Vector256 < uint > vs2 = Vector256 . CreateScalar ( s2 ) ;
189
195
190
196
while ( length >= 32 )
191
197
{
@@ -243,6 +249,100 @@ public static unsafe uint CalculateAvx2(uint adler, ReadOnlySpan<byte> buffer)
243
249
}
244
250
}
245
251
252
// Based on: https://github.com/chromium/chromium/blob/master/third_party/zlib/adler32_simd.c

/// <summary>
/// Computes the Adler-32 checksum of <paramref name="buffer"/> using ARM AdvSimd (NEON)
/// intrinsics, folding 32 input bytes per inner-loop iteration.
/// </summary>
/// <param name="adler">The running Adler-32 state to continue from (s1 in the low 16 bits, s2 in the high 16 bits).</param>
/// <param name="buffer">The input bytes to checksum.</param>
/// <returns>The updated Adler-32 value, packed as <c>s1 | (s2 &lt;&lt; 16)</c>.</returns>
[ MethodImpl ( InliningOptions . HotPath | InliningOptions . ShortMethod ) ]
private static unsafe uint CalculateArm ( uint adler , ReadOnlySpan < byte > buffer )
{
    // Split Adler-32 into component sums.
    uint s1 = adler & 0xFFFF ;
    uint s2 = ( adler >> 16 ) & 0xFFFF ;
    uint length = ( uint ) buffer . Length ;

    // Process the data in blocks; the tail shorter than one block is handled
    // by HandleLeftOver after the vector loop.
    // NOTE(review): BlockSize appears to be 32 here — each inner iteration loads
    // two 16-byte vectors and the s2 weights run 32..1 — confirm against the constant.
    long blocks = length / BlockSize ;
    length -= ( uint ) ( blocks * BlockSize ) ;
    fixed ( byte * bufferPtr = & MemoryMarshal . GetReference ( buffer ) )
    {
        byte * localBufferPtr = bufferPtr ;

        while ( blocks != 0 )
        {
            // Cap the run so no more than NMAX bytes are summed before the
            // modulo-BASE reduction (prevents 32-bit overflow of s2).
            uint n = NMAX / BlockSize ;
            if ( n > blocks )
            {
                n = ( uint ) blocks ;
            }

            blocks -= n ;

            // Process n blocks of data. At most nMax data bytes can be
            // processed before s2 must be reduced modulo Base.
            // vs2 is seeded with s1 * n: the scalar s1 contributes to s2 once
            // per processed byte, i.e. n * BlockSize times, and the * BlockSize
            // factor is applied later by the shift-left-5 below.
            Vector128 < uint > vs1 = Vector128 < uint > . Zero ;
            Vector128 < uint > vs2 = vs1 . WithElement ( 3 , s1 * n ) ;

            // Per-column (byte position within the 32-byte block) sums, used
            // afterwards to weight each position by its distance from the end.
            Vector128 < ushort > vColumnSum1 = Vector128 < ushort > . Zero ;
            Vector128 < ushort > vColumnSum2 = Vector128 < ushort > . Zero ;
            Vector128 < ushort > vColumnSum3 = Vector128 < ushort > . Zero ;
            Vector128 < ushort > vColumnSum4 = Vector128 < ushort > . Zero ;

            do
            {
                // Load 32 input bytes.
                Vector128 < ushort > bytes1 = AdvSimd . LoadVector128 ( localBufferPtr ) . AsUInt16 ( ) ;
                Vector128 < ushort > bytes2 = AdvSimd . LoadVector128 ( localBufferPtr + 0x10 ) . AsUInt16 ( ) ;

                // Add previous block byte sum to v_s2.
                vs2 = AdvSimd . Add ( vs2 , vs1 ) ;

                // Horizontally add the bytes for s1.
                vs1 = AdvSimd . AddPairwiseWideningAndAdd (
                    vs1 . AsUInt32 ( ) ,
                    AdvSimd . AddPairwiseWideningAndAdd ( AdvSimd . AddPairwiseWidening ( bytes1 . AsByte ( ) ) . AsUInt16 ( ) , bytes2 . AsByte ( ) ) ) ;

                // Vertically add the bytes for s2.
                vColumnSum1 = AdvSimd . AddWideningLower ( vColumnSum1 , bytes1 . GetLower ( ) . AsByte ( ) ) ;
                vColumnSum2 = AdvSimd . AddWideningLower ( vColumnSum2 , bytes1 . GetUpper ( ) . AsByte ( ) ) ;
                vColumnSum3 = AdvSimd . AddWideningLower ( vColumnSum3 , bytes2 . GetLower ( ) . AsByte ( ) ) ;
                vColumnSum4 = AdvSimd . AddWideningLower ( vColumnSum4 , bytes2 . GetUpper ( ) . AsByte ( ) ) ;

                localBufferPtr += BlockSize ;
            }
            while ( -- n > 0 ) ;

            // Each accumulated vs1 contribution to vs2 represents a whole block,
            // so scale by 32 (<< 5) before adding the per-column weighted sums.
            vs2 = AdvSimd . ShiftLeftLogical ( vs2 , 5 ) ;

            // Multiply-add bytes by [ 32, 31, 30, ... ] for s2.
            vs2 = AdvSimd . MultiplyWideningLowerAndAdd ( vs2 , vColumnSum1 . GetLower ( ) , Vector64 . Create ( ( ushort ) 32 , 31 , 30 , 29 ) ) ;
            vs2 = AdvSimd . MultiplyWideningLowerAndAdd ( vs2 , vColumnSum1 . GetUpper ( ) , Vector64 . Create ( ( ushort ) 28 , 27 , 26 , 25 ) ) ;
            vs2 = AdvSimd . MultiplyWideningLowerAndAdd ( vs2 , vColumnSum2 . GetLower ( ) , Vector64 . Create ( ( ushort ) 24 , 23 , 22 , 21 ) ) ;
            vs2 = AdvSimd . MultiplyWideningLowerAndAdd ( vs2 , vColumnSum2 . GetUpper ( ) , Vector64 . Create ( ( ushort ) 20 , 19 , 18 , 17 ) ) ;
            vs2 = AdvSimd . MultiplyWideningLowerAndAdd ( vs2 , vColumnSum3 . GetLower ( ) , Vector64 . Create ( ( ushort ) 16 , 15 , 14 , 13 ) ) ;
            vs2 = AdvSimd . MultiplyWideningLowerAndAdd ( vs2 , vColumnSum3 . GetUpper ( ) , Vector64 . Create ( ( ushort ) 12 , 11 , 10 , 9 ) ) ;
            vs2 = AdvSimd . MultiplyWideningLowerAndAdd ( vs2 , vColumnSum4 . GetLower ( ) , Vector64 . Create ( ( ushort ) 8 , 7 , 6 , 5 ) ) ;
            vs2 = AdvSimd . MultiplyWideningLowerAndAdd ( vs2 , vColumnSum4 . GetUpper ( ) , Vector64 . Create ( ( ushort ) 4 , 3 , 2 , 1 ) ) ;

            // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
            Vector64 < uint > sum1 = AdvSimd . AddPairwise ( vs1 . GetLower ( ) , vs1 . GetUpper ( ) ) ;
            Vector64 < uint > sum2 = AdvSimd . AddPairwise ( vs2 . GetLower ( ) , vs2 . GetUpper ( ) ) ;
            Vector64 < uint > s1s2 = AdvSimd . AddPairwise ( sum1 , sum2 ) ;

            // Store the results.
            s1 += AdvSimd . Extract ( s1s2 , 0 ) ;
            s2 += AdvSimd . Extract ( s1s2 , 1 ) ;

            // Reduce.
            s1 %= BASE ;
            s2 %= BASE ;
        }

        // Scalar fallback for the remaining < BlockSize bytes.
        if ( length > 0 )
        {
            HandleLeftOver ( localBufferPtr , length , ref s1 , ref s2 ) ;
        }

        return s1 | ( s2 << 16 ) ;
    }
}
345
+
246
346
private static unsafe void HandleLeftOver ( byte * localBufferPtr , uint length , ref uint s1 , ref uint s2 )
247
347
{
248
348
if ( length >= 16 )
@@ -286,7 +386,6 @@ private static unsafe uint CalculateScalar(uint adler, ReadOnlySpan<byte> buffer
286
386
{
287
387
uint s1 = adler & 0xFFFF ;
288
388
uint s2 = ( adler >> 16 ) & 0xFFFF ;
289
- uint k ;
290
389
291
390
fixed ( byte * bufferPtr = buffer )
292
391
{
@@ -295,7 +394,7 @@ private static unsafe uint CalculateScalar(uint adler, ReadOnlySpan<byte> buffer
295
394
296
395
while ( length > 0 )
297
396
{
298
- k = length < NMAX ? length : NMAX ;
397
+ uint k = length < NMAX ? length : NMAX ;
299
398
length -= k ;
300
399
301
400
while ( k >= 16 )
0 commit comments