SixLabors
diff --git a/‎src/ImageSharp/Compression/Zlib/Adler32.cs
Lines changed: 101 additions & 7 deletions b/‎src/ImageSharp/Compression/Zlib/Adler32.cs
Lines changed: 101 additions & 7 deletions
diff --git a/‎src/ImageSharp/Compression/Zlib/Crc32.cs
Lines changed: 95 additions & 0 deletions b/‎src/ImageSharp/Compression/Zlib/Crc32.cs
Lines changed: 95 additions & 0 deletions
diff --git a/‎src/ImageSharp/Formats/Png/Filters/AverageFilter.cs
Lines changed: 33 additions & 1 deletion b/‎src/ImageSharp/Formats/Png/Filters/AverageFilter.cs
Lines changed: 33 additions & 1 deletion
@@ -4,6 +4,7 @@
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
 using System.Runtime.Intrinsics.X86;
 
 #pragma warning disable IDE0007 // Use implicit type
@@ -95,7 +96,7 @@ private static unsafe uint CalculateSse(uint adler, ReadOnlySpan<byte> buffer)
                 Vector128<sbyte> tap1 = Sse2.LoadVector128((sbyte*)tapPtr);
                 Vector128<sbyte> tap2 = Sse2.LoadVector128((sbyte*)(tapPtr + 0x10));
                 Vector128<byte> zero = Vector128<byte>.Zero;
-                var ones = Vector128.Create((short)1);
+                Vector128<short> ones = Vector128.Create((short)1);
 
                 while (blocks > 0)
                 {
@@ -179,13 +180,13 @@ public static unsafe uint CalculateAvx2(uint adler, ReadOnlySpan<byte> buffer)
             byte* localBufferPtr = bufferPtr;
 
             Vector256<byte> zero = Vector256<byte>.Zero;
-            var dot3v = Vector256.Create((short)1);
-            var dot2v = Vector256.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+            Vector256<short> dot3v = Vector256.Create((short)1);
+            Vector256<sbyte> dot2v = Vector256.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
 
             // Process n blocks of data. At most NMAX data bytes can be
             // processed before s2 must be reduced modulo BASE.
-            var vs1 = Vector256.CreateScalar(s1);
-            var vs2 = Vector256.CreateScalar(s2);
+            Vector256<uint> vs1 = Vector256.CreateScalar(s1);
+            Vector256<uint> vs2 = Vector256.CreateScalar(s2);
 
             while (length >= 32)
             {
@@ -243,6 +244,100 @@ public static unsafe uint CalculateAvx2(uint adler, ReadOnlySpan<byte> buffer)
         }
     }
 
+    /// <summary>
+    /// Calculates the Adler-32 checksum of <paramref name="buffer"/> with Arm AdvSimd intrinsics,
+    /// continuing from the running checksum supplied in <paramref name="adler"/>.
+    /// </summary>
+    /// <remarks>
+    /// Based on: https://github.com/chromium/chromium/blob/master/third_party/zlib/adler32_simd.c
+    /// </remarks>
+    /// <param name="adler">The running checksum; the low 16 bits carry s1 and the high 16 bits carry s2.</param>
+    /// <param name="buffer">The bytes to fold into the checksum.</param>
+    /// <returns>The updated checksum packed as <c>s1 | (s2 &lt;&lt; 16)</c>.</returns>
+    [MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)]
+    private static unsafe uint CalculateArm(uint adler, ReadOnlySpan<byte> buffer)
+    {
+        // Unpack the two 16-bit running sums.
+        uint sumA = adler & 0xFFFF;
+        uint sumB = (adler >> 16) & 0xFFFF;
+
+        // Whole blocks go through the vector loop; whatever is left goes to HandleLeftOver.
+        uint tailLength = (uint)buffer.Length;
+        long pendingBlocks = tailLength / BlockSize;
+        tailLength -= (uint)(pendingBlocks * BlockSize);
+
+        fixed (byte* start = &MemoryMarshal.GetReference(buffer))
+        {
+            byte* cursor = start;
+
+            while (pendingBlocks != 0)
+            {
+                // Cap the chunk: at most NMAX bytes may be accumulated
+                // before the sums have to be reduced modulo BASE.
+                uint chunkBlocks = NMAX / BlockSize;
+                if (chunkBlocks > pendingBlocks)
+                {
+                    chunkBlocks = (uint)pendingBlocks;
+                }
+
+                pendingBlocks -= chunkBlocks;
+
+                // byteSums accumulates the plain byte sums (s1 contribution).
+                // prefixSums accumulates, per block, the byte sums of all previous blocks;
+                // lane 3 is seeded with the incoming s1 once per block in this chunk.
+                // The shift by 5 after the loop scales it by 32
+                // (assumes BlockSize is 32, matching the 32 bytes loaded per iteration - confirm).
+                Vector128<uint> byteSums = Vector128<uint>.Zero;
+                Vector128<uint> prefixSums = byteSums.WithElement(3, sumA * chunkBlocks);
+
+                // Per-column 16-bit totals for the 32 byte positions of a block.
+                Vector128<ushort> columns0To7 = Vector128<ushort>.Zero;
+                Vector128<ushort> columns8To15 = Vector128<ushort>.Zero;
+                Vector128<ushort> columns16To23 = Vector128<ushort>.Zero;
+                Vector128<ushort> columns24To31 = Vector128<ushort>.Zero;
+
+                uint blocksLeft = chunkBlocks;
+                do
+                {
+                    // Fetch the next 32 input bytes as two 16-byte vectors.
+                    Vector128<byte> firstHalf = AdvSimd.LoadVector128(cursor);
+                    Vector128<byte> secondHalf = AdvSimd.LoadVector128(cursor + 0x10);
+
+                    // Fold the byte sums of all previous blocks into the s2 accumulator.
+                    prefixSums = AdvSimd.Add(prefixSums, byteSums);
+
+                    // Pairwise-widen both halves down to 32-bit lanes and add them to the s1 accumulator.
+                    Vector128<ushort> pairTotals = AdvSimd.AddPairwiseWidening(firstHalf);
+                    pairTotals = AdvSimd.AddPairwiseWideningAndAdd(pairTotals, secondHalf);
+                    byteSums = AdvSimd.AddPairwiseWideningAndAdd(byteSums, pairTotals);
+
+                    // Keep a running total for each byte position; the weights are applied after the loop.
+                    columns0To7 = AdvSimd.AddWideningLower(columns0To7, firstHalf.GetLower());
+                    columns8To15 = AdvSimd.AddWideningLower(columns8To15, firstHalf.GetUpper());
+                    columns16To23 = AdvSimd.AddWideningLower(columns16To23, secondHalf.GetLower());
+                    columns24To31 = AdvSimd.AddWideningLower(columns24To31, secondHalf.GetUpper());
+
+                    cursor += BlockSize;
+                }
+                while (--blocksLeft > 0);
+
+                // Scale the accumulated prefix sums by 32 (1 << 5).
+                prefixSums = AdvSimd.ShiftLeftLogical(prefixSums, 5);
+
+                // Weight every column total by its position factor [ 32, 31, 30, ... 1 ] and add it to s2.
+                prefixSums = AdvSimd.MultiplyWideningLowerAndAdd(prefixSums, columns0To7.GetLower(), Vector64.Create((ushort)32, 31, 30, 29));
+                prefixSums = AdvSimd.MultiplyWideningLowerAndAdd(prefixSums, columns0To7.GetUpper(), Vector64.Create((ushort)28, 27, 26, 25));
+                prefixSums = AdvSimd.MultiplyWideningLowerAndAdd(prefixSums, columns8To15.GetLower(), Vector64.Create((ushort)24, 23, 22, 21));
+                prefixSums = AdvSimd.MultiplyWideningLowerAndAdd(prefixSums, columns8To15.GetUpper(), Vector64.Create((ushort)20, 19, 18, 17));
+                prefixSums = AdvSimd.MultiplyWideningLowerAndAdd(prefixSums, columns16To23.GetLower(), Vector64.Create((ushort)16, 15, 14, 13));
+                prefixSums = AdvSimd.MultiplyWideningLowerAndAdd(prefixSums, columns16To23.GetUpper(), Vector64.Create((ushort)12, 11, 10, 9));
+                prefixSums = AdvSimd.MultiplyWideningLowerAndAdd(prefixSums, columns24To31.GetLower(), Vector64.Create((ushort)8, 7, 6, 5));
+                prefixSums = AdvSimd.MultiplyWideningLowerAndAdd(prefixSums, columns24To31.GetUpper(), Vector64.Create((ushort)4, 3, 2, 1));
+
+                // Collapse each accumulator to one value: lane 0 ends up holding the s1 total, lane 1 the s2 total.
+                Vector64<uint> foldedA = AdvSimd.AddPairwise(byteSums.GetLower(), byteSums.GetUpper());
+                Vector64<uint> foldedB = AdvSimd.AddPairwise(prefixSums.GetLower(), prefixSums.GetUpper());
+                Vector64<uint> totals = AdvSimd.AddPairwise(foldedA, foldedB);
+
+                // Accumulate into the scalar sums and reduce.
+                sumA = (sumA + AdvSimd.Extract(totals, 0)) % BASE;
+                sumB = (sumB + AdvSimd.Extract(totals, 1)) % BASE;
+            }
+
+            // Bytes that did not fill a whole block.
+            if (tailLength > 0)
+            {
+                HandleLeftOver(cursor, tailLength, ref sumA, ref sumB);
+            }
+
+            return sumA | (sumB << 16);
+        }
+    }
+
     private static unsafe void HandleLeftOver(byte* localBufferPtr, uint length, ref uint s1, ref uint s2)
     {
         if (length >= 16)
@@ -286,7 +381,6 @@ private static unsafe uint CalculateScalar(uint adler, ReadOnlySpan<byte> buffer
     {
         uint s1 = adler & 0xFFFF;
         uint s2 = (adler >> 16) & 0xFFFF;
-        uint k;
 
         fixed (byte* bufferPtr = buffer)
         {
@@ -295,7 +389,7 @@ private static unsafe uint CalculateScalar(uint adler, ReadOnlySpan<byte> buffer
 
             while (length > 0)
             {
-                k = length < NMAX ? length : NMAX;
+                uint k = length < NMAX ? length : NMAX;
                 length -= k;
 
                 while (k >= 16)
 
@@ -5,6 +5,7 @@
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
+using ArmCrc32 = System.Runtime.Intrinsics.Arm.Crc32;
 
 namespace SixLabors.ImageSharp.Compression.Zlib;
 
@@ -187,6 +188,100 @@ private static unsafe uint CalculateSse(uint crc, ReadOnlySpan<byte> buffer)
         }
     }
 
+    /// <summary>
+    /// Updates <paramref name="crc"/> with the bytes of <paramref name="buffer"/> using the
+    /// Arm CRC32 intrinsics, consuming 32-bit words wherever possible.
+    /// </summary>
+    /// <param name="crc">The running CRC value to continue from.</param>
+    /// <param name="buffer">The bytes to fold into the CRC.</param>
+    /// <returns>The updated CRC value.</returns>
+    [MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)]
+    private static unsafe uint CalculateArm(uint crc, ReadOnlySpan<byte> buffer)
+    {
+        const int wordSize = sizeof(uint);
+        const int wordsPerUnroll = 8;
+        const int unrollSize = wordsPerUnroll * wordSize;
+
+        fixed (byte* start = buffer)
+        {
+            byte* cursor = start;
+            int remaining = buffer.Length;
+
+            // Head: single bytes until the pointer sits on a 4-byte boundary.
+            for (; remaining > 0 && ((ulong)cursor & 3) != 0; remaining--)
+            {
+                crc = ArmCrc32.ComputeCrc32(crc, *cursor++);
+            }
+
+            uint* words = (uint*)cursor;
+
+            // Main loop: eight words per iteration, written out by hand.
+            for (; remaining >= unrollSize; remaining -= unrollSize, words += wordsPerUnroll)
+            {
+                crc = ArmCrc32.ComputeCrc32(crc, words[0]);
+                crc = ArmCrc32.ComputeCrc32(crc, words[1]);
+                crc = ArmCrc32.ComputeCrc32(crc, words[2]);
+                crc = ArmCrc32.ComputeCrc32(crc, words[3]);
+                crc = ArmCrc32.ComputeCrc32(crc, words[4]);
+                crc = ArmCrc32.ComputeCrc32(crc, words[5]);
+                crc = ArmCrc32.ComputeCrc32(crc, words[6]);
+                crc = ArmCrc32.ComputeCrc32(crc, words[7]);
+            }
+
+            // Any whole words that did not fill an unrolled iteration.
+            for (; remaining >= wordSize; remaining -= wordSize, words++)
+            {
+                crc = ArmCrc32.ComputeCrc32(crc, *words);
+            }
+
+            // Tail: fewer than four bytes remain.
+            for (cursor = (byte*)words; remaining > 0; remaining--)
+            {
+                crc = ArmCrc32.ComputeCrc32(crc, *cursor++);
+            }
+
+            return crc;
+        }
+    }
+
+    /// <summary>
+    /// Updates <paramref name="crc"/> with the bytes of <paramref name="buffer"/> using the
+    /// Arm64 CRC32 intrinsics, consuming 64-bit words wherever possible.
+    /// </summary>
+    /// <param name="crc">The running CRC value to continue from.</param>
+    /// <param name="buffer">The bytes to fold into the CRC.</param>
+    /// <returns>The updated CRC value.</returns>
+    [MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)]
+    private static unsafe uint CalculateArm64(uint crc, ReadOnlySpan<byte> buffer)
+    {
+        const int wordSize = sizeof(ulong);
+        const int wordsPerUnroll = 8;
+        const int unrollSize = wordsPerUnroll * wordSize;
+
+        fixed (byte* start = buffer)
+        {
+            byte* cursor = start;
+            int remaining = buffer.Length;
+
+            // Head: single bytes until the pointer sits on an 8-byte boundary.
+            for (; remaining > 0 && ((ulong)cursor & 7) != 0; remaining--)
+            {
+                crc = ArmCrc32.ComputeCrc32(crc, *cursor++);
+            }
+
+            ulong* words = (ulong*)cursor;
+
+            // Main loop: eight 64-bit words per iteration, written out by hand.
+            for (; remaining >= unrollSize; remaining -= unrollSize, words += wordsPerUnroll)
+            {
+                crc = ArmCrc32.Arm64.ComputeCrc32(crc, words[0]);
+                crc = ArmCrc32.Arm64.ComputeCrc32(crc, words[1]);
+                crc = ArmCrc32.Arm64.ComputeCrc32(crc, words[2]);
+                crc = ArmCrc32.Arm64.ComputeCrc32(crc, words[3]);
+                crc = ArmCrc32.Arm64.ComputeCrc32(crc, words[4]);
+                crc = ArmCrc32.Arm64.ComputeCrc32(crc, words[5]);
+                crc = ArmCrc32.Arm64.ComputeCrc32(crc, words[6]);
+                crc = ArmCrc32.Arm64.ComputeCrc32(crc, words[7]);
+            }
+
+            // Any whole words that did not fill an unrolled iteration.
+            for (; remaining >= wordSize; remaining -= wordSize, words++)
+            {
+                crc = ArmCrc32.Arm64.ComputeCrc32(crc, *words);
+            }
+
+            // Tail: fewer than eight bytes remain, fed one at a time.
+            for (cursor = (byte*)words; remaining > 0; remaining--)
+            {
+                crc = ArmCrc32.ComputeCrc32(crc, *cursor++);
+            }
+
+            return crc;
+        }
+    }
+
     [MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)]
     private static uint CalculateScalar(uint crc, ReadOnlySpan<byte> buffer)
     {
 
@@ -4,6 +4,7 @@
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
 using System.Runtime.Intrinsics.X86;
 
 namespace SixLabors.ImageSharp.Formats.Png.Filters;
@@ -35,6 +36,10 @@ public static void Decode(Span<byte> scanline, Span<byte> previousScanline, int
         {
             DecodeSse2(scanline, previousScanline);
         }
+        else if (AdvSimd.IsSupported && bytesPerPixel is 4)
+        {
+            DecodeArm(scanline, previousScanline);
+        }
         else
         {
             DecodeScalar(scanline, previousScanline, bytesPerPixel);
@@ -48,7 +53,7 @@ private static void DecodeSse2(Span<byte> scanline, Span<byte> previousScanline)
         ref byte prevBaseRef = ref MemoryMarshal.GetReference(previousScanline);
 
         Vector128<byte> d = Vector128<byte>.Zero;
-        var ones = Vector128.Create((byte)1);
+        Vector128<byte> ones = Vector128.Create((byte)1);
 
         int rb = scanline.Length;
         nint offset = 1;
@@ -75,6 +80,33 @@ private static void DecodeSse2(Span<byte> scanline, Span<byte> previousScanline)
         }
     }
 
+    /// <summary>
+    /// Reverses the Average filter in place on <paramref name="scanline"/> using Arm AdvSimd
+    /// intrinsics, working on four bytes at a time.
+    /// </summary>
+    /// <param name="scanline">The filtered row; it is overwritten with the reconstructed bytes.</param>
+    /// <param name="previousScanline">The row above, read at the same offsets as <paramref name="scanline"/>.</param>
+    public static void DecodeArm(Span<byte> scanline, Span<byte> previousScanline)
+    {
+        const int batchSize = 4;
+
+        ref byte currentRow = ref MemoryMarshal.GetReference(scanline);
+        ref byte priorRow = ref MemoryMarshal.GetReference(previousScanline);
+
+        // Reconstructed bytes of the batch to the left; zero for the first batch.
+        Vector64<byte> left = Vector64<byte>.Zero;
+
+        // Processing starts at index 1; index 0 is never touched
+        // (presumably the PNG filter-type byte - confirm against the caller).
+        for (int position = 1, remaining = scanline.Length; remaining >= batchSize; position += batchSize, remaining -= batchSize)
+        {
+            ref int currentBatch = ref Unsafe.As<byte, int>(ref Unsafe.Add(ref currentRow, position));
+            int aboveBatch = Unsafe.As<byte, int>(ref Unsafe.Add(ref priorRow, position));
+
+            Vector64<byte> above = Vector64.CreateScalar(aboveBatch).AsByte();
+            Vector64<byte> filtered = Vector64.CreateScalar(currentBatch).AsByte();
+
+            // Per-byte halving add of the left and above bytes, then the byte-wise add that undoes the filter.
+            Vector64<byte> predictor = AdvSimd.FusedAddHalving(left, above);
+            left = AdvSimd.Add(filtered, predictor);
+
+            currentBatch = left.AsInt32().ToScalar();
+        }
+    }
+
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     private static void DecodeScalar(Span<byte> scanline, Span<byte> previousScanline, int bytesPerPixel)
     {