SixLabors
diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
Lines changed: 0 additions & 2 deletions
diff --git a/ci-pack.ps1 b/ci-pack.ps1
Lines changed: 1 addition & 1 deletion
diff --git a/src/ImageSharp/Common/Constants.cs b/src/ImageSharp/Common/Constants.cs
Lines changed: 1 addition & 1 deletion
diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs
Lines changed: 20 additions & 0 deletions
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
Lines changed: 30 additions & 10 deletions
diff --git a/src/ImageSharp/Formats/DecoderOptions.cs b/src/ImageSharp/Formats/DecoderOptions.cs
Lines changed: 13 additions & 1 deletion
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs
Lines changed: 1 addition & 1 deletion
diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
Lines changed: 116 additions & 0 deletions
@@ -114,15 +114,13 @@ jobs:
         if: ${{ matrix.options.sdk-preview != true }}
         uses: actions/setup-dotnet@v3
         with:
-          include-prerelease: true
           dotnet-version: |
             6.0.x
 
       - name: DotNet Setup Preview
         if: ${{ matrix.options.sdk-preview == true }}
         uses: actions/setup-dotnet@v3
         with:
-          include-prerelease: true
           dotnet-version: |
             7.0.x
 
 
@@ -3,4 +3,4 @@ dotnet clean -c Release
 $repositoryUrl = "https://github.com/$env:GITHUB_REPOSITORY"
 
 # Building for packing and publishing.
-dotnet pack -c Release --output "$PSScriptRoot/artifacts" /p:RepositoryUrl=$repositoryUrl
+dotnet pack -c Release -p:PackageOutputPath="$PSScriptRoot/artifacts" -p:RepositoryUrl=$repositoryUrl
@@ -1,4 +1,4 @@
-// Copyright (c) Six Labors.
+// Copyright (c) Six Labors.
 // Licensed under the Six Labors Split License.
 
 namespace SixLabors.ImageSharp;
 
@@ -5,6 +5,7 @@
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
 using System.Runtime.Intrinsics.X86;
 
 namespace SixLabors.ImageSharp;
@@ -808,6 +809,25 @@ public static int ReduceSum(Vector256<int> accumulator)
         return Sse2.ConvertToInt32(vsum);
     }
 
+    /// <summary>
+    /// Horizontally adds all lanes of the vector into a single scalar.
+    /// </summary>
+    /// <param name="accumulator">The vector whose lanes are summed.</param>
+    /// <returns>The lane total, cast to <see cref="int"/>.</returns>
+    [MethodImpl(InliningOptions.ShortMethod)]
+    public static int ReduceSumArm(Vector128<uint> accumulator)
+    {
+        if (!AdvSimd.Arm64.IsSupported)
+        {
+            // Widen adjacent pairs to 64-bit lanes, then add the two halves reinterpreted as 32-bit lanes.
+            Vector128<ulong> pairSums = AdvSimd.AddPairwiseWidening(accumulator);
+            Vector64<uint> halves = AdvSimd.Add(pairSums.GetLower().AsUInt32(), pairSums.GetUpper().AsUInt32());
+            return (int)AdvSimd.Extract(halves, 0);
+        }
+
+        return (int)AdvSimd.Extract(AdvSimd.Arm64.AddAcross(accumulator), 0);
+    }
+
     /// <summary>
     /// Reduces even elements of the vector into one sum.
     /// </summary>
 
@@ -532,7 +532,8 @@ private static void Shuffle4Slice3(
         }
 
         /// <summary>
-        /// Performs a multiplication and an addition of the <see cref="Vector256{T}"/>.
+        /// Performs a multiplication and an addition of the <see cref="Vector256{Single}"/>.
+        /// TODO: Fix. The arguments are in a different order to the FMA intrinsic.
         /// </summary>
         /// <remarks>ret = (vm0 * vm1) + va</remarks>
         /// <param name="va">The vector to add to the intermediate result.</param>
@@ -549,22 +550,21 @@ public static Vector256<float> MultiplyAdd(
             {
                 return Fma.MultiplyAdd(vm1, vm0, va);
             }
-            else
-            {
-                return Avx.Add(Avx.Multiply(vm0, vm1), va);
-            }
+
+            return Avx.Add(Avx.Multiply(vm0, vm1), va);
         }
 
         /// <summary>
-        /// Performs a multiplication and a substraction of the <see cref="Vector256{T}"/>.
+        /// Performs a multiplication and a subtraction of the <see cref="Vector256{Single}"/>.
+        /// TODO: Fix. The arguments are in a different order to the FMA intrinsic.
         /// </summary>
         /// <remarks>ret = (vm0 * vm1) - vs</remarks>
-        /// <param name="vs">The vector to substract from the intermediate result.</param>
+        /// <param name="vs">The vector to subtract from the intermediate result.</param>
         /// <param name="vm0">The first vector to multiply.</param>
         /// <param name="vm1">The second vector to multiply.</param>
         /// <returns>The <see cref="Vector256{T}"/>.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static Vector256<float> MultiplySubstract(
+        public static Vector256<float> MultiplySubtract(
             in Vector256<float> vs,
             in Vector256<float> vm0,
             in Vector256<float> vm1)
@@ -573,10 +573,30 @@ public static Vector256<float> MultiplySubstract(
             {
                 return Fma.MultiplySubtract(vm1, vm0, vs);
             }
-            else
+
+            return Avx.Subtract(Avx.Multiply(vm0, vm1), vs);
+        }
+
+        /// <summary>
+        /// Performs a multiplication and a negated addition of the <see cref="Vector256{Single}"/>.
+        /// </summary>
+        /// <remarks>ret = c - (a * b)</remarks>
+        /// <param name="a">The first vector to multiply.</param>
+        /// <param name="b">The second vector to multiply.</param>
+        /// <param name="c">The vector to add to the negated intermediate result.</param>
+        /// <returns>The <see cref="Vector256{T}"/>.</returns>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public static Vector256<float> MultiplyAddNegated(
+            in Vector256<float> a,
+            in Vector256<float> b,
+            in Vector256<float> c)
+        {
+            if (Fma.IsSupported)
             {
-                return Avx.Subtract(Avx.Multiply(vm0, vm1), vs);
+                return Fma.MultiplyAddNegated(a, b, c);
             }
+
+            return Avx.Subtract(c, Avx.Multiply(a, b));
         }
 
         /// <summary>
 
@@ -15,15 +15,25 @@ public sealed class DecoderOptions
 
     private uint maxFrames = int.MaxValue;
 
+    // Used by the FileProvider in the unit tests to set the configuration on the fly.
+#pragma warning disable IDE0032 // Use auto property
+    private Configuration configuration = Configuration.Default;
+#pragma warning restore IDE0032 // Use auto property
+
     /// <summary>
     /// Gets the shared default general decoder options instance.
+    /// Used internally to reduce allocations for default decoding operations.
     /// </summary>
     internal static DecoderOptions Default { get; } = LazyOptions.Value;
 
     /// <summary>
     /// Gets a custom configuration instance to be used by the image processing pipeline.
     /// </summary>
-    public Configuration Configuration { get; internal set; } = Configuration.Default;
+#pragma warning disable IDE0032 // Use auto property
+#pragma warning disable RCS1085 // Use auto-implemented property.
+    public Configuration Configuration { get => this.configuration; init => this.configuration = value; }
+#pragma warning restore RCS1085 // Use auto-implemented property.
+#pragma warning restore IDE0032 // Use auto property
 
     /// <summary>
     /// Gets the target size to decode the image into. Scaling should use an operation equivalent to <see cref="ResizeMode.Max"/>.
@@ -44,4 +54,6 @@ public sealed class DecoderOptions
     /// Gets the maximum number of image frames to decode, inclusive.
     /// </summary>
     public uint MaxFrames { get => this.maxFrames; init => this.maxFrames = Math.Clamp(value, 1, int.MaxValue); }
+
+    /// <summary>
+    /// Replaces the configuration held by this instance after construction.
+    /// </summary>
+    /// <param name="configuration">The configuration to assign to the backing field.</param>
+    internal void SetConfiguration(Configuration configuration)
+    {
+        this.configuration = configuration;
+    }
 }
@@ -99,7 +99,7 @@ static void IDCT8x8_1D_Avx(ref Block8x8F block)
 
             var mm256_F_1_4142 = Vector256.Create(1.414213562f);
             Vector256<float> tmp13 = Avx.Add(tmp1, tmp3);
-            Vector256<float> tmp12 = SimdUtils.HwIntrinsics.MultiplySubstract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142);
+            Vector256<float> tmp12 = SimdUtils.HwIntrinsics.MultiplySubtract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142);
 
             tmp0 = Avx.Add(tmp10, tmp13);
             tmp3 = Avx.Subtract(tmp10, tmp13);
 
@@ -5,6 +5,7 @@
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
 using System.Runtime.Intrinsics.X86;
 
 // ReSharper disable InconsistentNaming
@@ -26,6 +27,11 @@ public static int Vp8_Sse16x16(Span<byte> a, Span<byte> b)
             return Vp8_Sse16xN_Sse2(a, b, 8);
         }
 
+        if (AdvSimd.IsSupported)
+        {
+            return Vp8_Sse16x16_Neon(a, b);
+        }
+
         return Vp8_SseNxN(a, b, 16, 16);
     }
 
@@ -43,6 +49,11 @@ public static int Vp8_Sse16x8(Span<byte> a, Span<byte> b)
             return Vp8_Sse16xN_Sse2(a, b, 4);
         }
 
+        if (AdvSimd.IsSupported)
+        {
+            return Vp8_Sse16x8_Neon(a, b);
+        }
+
         return Vp8_SseNxN(a, b, 16, 8);
     }
 
@@ -119,6 +130,11 @@ public static int Vp8_Sse4x4(Span<byte> a, Span<byte> b)
             return Numerics.ReduceSum(sum);
         }
 
+        if (AdvSimd.IsSupported)
+        {
+            return Vp8_Sse4x4_Neon(a, b);
+        }
+
         return Vp8_SseNxN(a, b, 4, 4);
     }
 
@@ -199,6 +215,106 @@ private static int Vp8_Sse16xN_Avx2(Span<byte> a, Span<byte> b, int numPairs)
         return Numerics.ReduceSum(sum);
     }
 
+    // Sums the squared byte differences of 16 rows of 16 bytes; consecutive rows are WebpConstants.Bps bytes apart.
+    [MethodImpl(InliningOptions.ShortMethod)]
+    private static unsafe int Vp8_Sse16x16_Neon(Span<byte> a, Span<byte> b)
+    {
+        // Per-lane running totals, collapsed to a scalar after the last row.
+        Vector128<uint> acc = Vector128<uint>.Zero;
+        fixed (byte* aBase = &MemoryMarshal.GetReference(a), bBase = &MemoryMarshal.GetReference(b))
+        {
+            for (int row = 0; row < 16; row++)
+            {
+                int offset = row * WebpConstants.Bps;
+                acc = AccumulateSSE16Neon(aBase + offset, bBase + offset, acc);
+            }
+        }
+
+#if NET7_0_OR_GREATER
+        return (int)Vector128.Sum(acc);
+#else
+        return Numerics.ReduceSumArm(acc);
+#endif
+    }
+
+    // Sums the squared byte differences of 8 rows of 16 bytes; consecutive rows are WebpConstants.Bps bytes apart.
+    [MethodImpl(InliningOptions.ShortMethod)]
+    private static unsafe int Vp8_Sse16x8_Neon(Span<byte> a, Span<byte> b)
+    {
+        // Per-lane running totals, collapsed to a scalar after the last row.
+        Vector128<uint> acc = Vector128<uint>.Zero;
+        fixed (byte* aBase = &MemoryMarshal.GetReference(a), bBase = &MemoryMarshal.GetReference(b))
+        {
+            for (int row = 0; row < 8; row++)
+            {
+                int offset = row * WebpConstants.Bps;
+                acc = AccumulateSSE16Neon(aBase + offset, bBase + offset, acc);
+            }
+        }
+
+#if NET7_0_OR_GREATER
+        return (int)Vector128.Sum(acc);
+#else
+        return Numerics.ReduceSumArm(acc);
+#endif
+    }
+
+    // Sums the squared byte differences of two 4x4 blocks, each gathered into one vector by Load4x4Neon.
+    [MethodImpl(InliningOptions.ShortMethod)]
+    private static int Vp8_Sse4x4_Neon(Span<byte> a, Span<byte> b)
+    {
+        // |a - b| for all 16 bytes, one byte per lane.
+        Vector128<byte> delta = AdvSimd.AbsoluteDifference(Load4x4Neon(a).AsByte(), Load4x4Neon(b).AsByte());
+        Vector64<byte> deltaLo = delta.GetLower();
+        Vector64<byte> deltaHi = delta.GetUpper();
+
+        // Square each half into 16-bit lanes, then add neighbouring lanes into 32-bit lanes.
+        Vector128<uint> squaresLo = AdvSimd.AddPairwiseWidening(AdvSimd.MultiplyWideningLower(deltaLo, deltaLo));
+        Vector128<uint> squaresHi = AdvSimd.AddPairwiseWidening(AdvSimd.MultiplyWideningLower(deltaHi, deltaHi));
+        Vector128<uint> total = AdvSimd.Add(squaresLo, squaresHi);
+
+#if NET7_0_OR_GREATER
+        return (int)Vector128.Sum(total);
+#else
+        return Numerics.ReduceSumArm(total);
+#endif
+    }
+
+    // Gathers four 4-byte rows, spaced WebpConstants.Bps bytes apart, into one Vector128<uint> (one row per lane).
+    [MethodImpl(InliningOptions.ShortMethod)]
+    private static unsafe Vector128<uint> Load4x4Neon(Span<byte> src)
+    {
+        fixed (byte* basePtr = &MemoryMarshal.GetReference(src))
+        {
+            int stride = WebpConstants.Bps;
+            uint* row0 = (uint*)basePtr;
+            uint* row1 = (uint*)(basePtr + stride);
+            uint* row2 = (uint*)(basePtr + (stride * 2));
+            uint* row3 = (uint*)(basePtr + (stride * 3));
+
+            // Lane indices are kept as literals, exactly as in the unrolled original.
+            Vector128<uint> rows = AdvSimd.LoadAndInsertScalar(Vector128<uint>.Zero, 0, row0);
+            rows = AdvSimd.LoadAndInsertScalar(rows, 1, row1);
+            rows = AdvSimd.LoadAndInsertScalar(rows, 2, row2);
+            rows = AdvSimd.LoadAndInsertScalar(rows, 3, row3);
+            return rows;
+        }
+    }
+
+    // Adds the squared byte differences of one 16-byte row of a and b onto the running per-lane totals in sum.
+    [MethodImpl(InliningOptions.ShortMethod)]
+    private static unsafe Vector128<uint> AccumulateSSE16Neon(byte* a, byte* b, Vector128<uint> sum)
+    {
+        // |a - b| per byte for the 16 bytes at each pointer.
+        Vector128<byte> delta = AdvSimd.AbsoluteDifference(AdvSimd.LoadVector128(a), AdvSimd.LoadVector128(b));
+        Vector64<byte> deltaLo = delta.GetLower();
+        Vector64<byte> deltaHi = delta.GetUpper();
+
+        // Square into 16-bit lanes, then add neighbouring lanes into 32-bit lanes.
+        Vector128<uint> squaresLo = AdvSimd.AddPairwiseWidening(AdvSimd.MultiplyWideningLower(deltaLo, deltaLo));
+        Vector128<uint> squaresHi = AdvSimd.AddPairwiseWidening(AdvSimd.MultiplyWideningLower(deltaHi, deltaHi));
+        Vector128<uint> rowTotal = AdvSimd.Add(squaresLo, squaresHi);
+        return AdvSimd.Add(sum, rowTotal);
+    }
+
     [MethodImpl(InliningOptions.ShortMethod)]
     private static Vector128<int> SubtractAndAccumulate(Vector128<byte> a, Vector128<byte> b)
     {
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-// Copyright (c) Six Labors.`
	`1`	`+// Copyright (c) Six Labors.`
`2`	`2`	`// Licensed under the Six Labors Split License.`
`3`	`3`
`4`	`4`	`namespace SixLabors.ImageSharp;`