SixLabors
diff --git a/‎src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
Lines changed: 114 additions & 122 deletions b/‎src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
Lines changed: 114 additions & 122 deletions
diff --git a/‎src/ImageSharp/Common/Helpers/Vector128Utilities.cs
Lines changed: 77 additions & 27 deletions b/‎src/ImageSharp/Common/Helpers/Vector128Utilities.cs
Lines changed: 77 additions & 27 deletions
diff --git a/‎src/ImageSharp/Common/Helpers/Vector256Utilities.cs
Lines changed: 78 additions & 20 deletions b/‎src/ImageSharp/Common/Helpers/Vector256Utilities.cs
Lines changed: 78 additions & 20 deletions
@@ -4,8 +4,10 @@
 using System.Diagnostics;
 using System.Diagnostics.CodeAnalysis;
 using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.Wasm;
 using System.Runtime.Intrinsics.X86;
 
 namespace SixLabors.ImageSharp.Common.Helpers;
@@ -18,30 +20,36 @@ namespace SixLabors.ImageSharp.Common.Helpers;
 /// </list>
 /// Should only be used if the intrinsics are available.
 /// </summary>
-internal static class Vector128Utilities
+#pragma warning disable SA1649 // File name should match first type name
+internal static class Vector128_
+#pragma warning restore SA1649 // File name should match first type name
 {
     /// <summary>
     /// Gets a value indicating whether shuffle operations are supported.
     /// </summary>
-    public static bool SupportsShuffleFloat
+    public static bool SupportsShuffleNativeByte
     {
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        get => Sse.IsSupported;
-    }
+        get
+        {
+            if (Vector128.IsHardwareAccelerated)
+            {
+                if (RuntimeInformation.ProcessArchitecture is Architecture.X86 or Architecture.X64)
+                {
+                    return Ssse3.IsSupported;
+                }
 
-    /// <summary>
-    /// Gets a value indicating whether shuffle operations are supported.
-    /// </summary>
-    public static bool SupportsShuffleByte
-    {
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        get => Ssse3.IsSupported || AdvSimd.Arm64.IsSupported;
+                return true;
+            }
+
+            return false;
+        }
     }
 
     /// <summary>
     /// Gets a value indicating whether right align operations are supported.
     /// </summary>
-    public static bool SupportsRightAlign
+    public static bool SupportsAlignRight
     {
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         get => Ssse3.IsSupported || AdvSimd.IsSupported;
@@ -63,15 +71,21 @@ public static bool SupportsShiftByte
     /// <param name="control">The shuffle control byte.</param>
     /// <returns>The <see cref="Vector128{Single}"/>.</returns>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static Vector128<float> Shuffle(Vector128<float> vector, [ConstantExpected] byte control)
+    public static Vector128<float> ShuffleNative(Vector128<float> vector, [ConstantExpected] byte control)
     {
         if (Sse.IsSupported)
         {
             return Sse.Shuffle(vector, vector, control);
         }
 
-        ThrowUnreachableException();
-        return default;
+        // Each 2-bit field of control picks one source element, lowest bits first. Don't use InverseMMShuffle here as we want to avoid the cast.
+        Vector128<int> indices = Vector128.Create(
+            control & 0x3,
+            (control >> 2) & 0x3,
+            (control >> 4) & 0x3,
+            (control >> 6) & 0x3);
+
+        return Vector128.Shuffle(vector, indices);
     }
 
     /// <summary>
@@ -86,20 +100,18 @@ public static Vector128<float> Shuffle(Vector128<float> vector, [ConstantExpecte
     /// A new vector containing the values from <paramref name="vector" /> selected by the given <paramref name="indices" />.
     /// </returns>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static Vector128<byte> Shuffle(Vector128<byte> vector, Vector128<byte> indices)
+    public static Vector128<byte> ShuffleNative(Vector128<byte> vector, Vector128<byte> indices)
     {
+        // On x86/x64 we call the SSSE3 shuffle intrinsic directly to avoid additional instructions (3 vs 1).
         if (Ssse3.IsSupported)
         {
             return Ssse3.Shuffle(vector, indices);
         }
 
-        if (AdvSimd.Arm64.IsSupported)
-        {
-            return AdvSimd.Arm64.VectorTableLookup(vector, indices);
-        }
-
-        ThrowUnreachableException();
-        return default;
+        // For ARM and WASM, the cross-platform shuffle below gives optimal codegen.
+        // We don't throw on x86/x64 without SSSE3, so callers must check
+        // SupportsShuffleNativeByte before using this method.
+        return Vector128.Shuffle(vector, indices);
     }
 
     /// <summary>
@@ -193,6 +205,11 @@ public static Vector128<int> ConvertToInt32RoundToEven(Vector128<float> vector)
             return AdvSimd.ConvertToInt32RoundToEven(vector);
         }
 
+        if (PackedSimd.IsSupported)
+        {
+            return PackedSimd.ConvertToInt32Saturate(PackedSimd.RoundToNearest(vector));
+        }
+
         Vector128<float> sign = vector & Vector128.Create(-0F);
         Vector128<float> val_2p23_f32 = sign | Vector128.Create(8388608F);
 
@@ -218,6 +235,11 @@ public static Vector128<float> RoundToNearestInteger(Vector128<float> vector)
             return AdvSimd.RoundToNearest(vector);
         }
 
+        if (PackedSimd.IsSupported)
+        {
+            return PackedSimd.RoundToNearest(vector);
+        }
+
         Vector128<float> sign = vector & Vector128.Create(-0F);
         Vector128<float> val_2p23_f32 = sign | Vector128.Create(8388608F);
 
@@ -270,8 +292,16 @@ public static Vector128<byte> PackUnsignedSaturate(Vector128<short> left, Vector
             return AdvSimd.ExtractNarrowingSaturateUnsignedUpper(AdvSimd.ExtractNarrowingSaturateUnsignedLower(left), right);
         }
 
-        ThrowUnreachableException();
-        return default;
+        if (PackedSimd.IsSupported)
+        {
+            return PackedSimd.ConvertNarrowingSaturateUnsigned(left, right);
+        }
+
+        Vector128<short> min = Vector128.Create((short)byte.MinValue);
+        Vector128<short> max = Vector128.Create((short)byte.MaxValue);
+        Vector128<ushort> lefClamped = Clamp(left, min, max).AsUInt16();
+        Vector128<ushort> rightClamped = Clamp(right, min, max).AsUInt16();
+        return Vector128.Narrow(lefClamped, rightClamped);
     }
 
     /// <summary>
@@ -293,10 +323,30 @@ public static Vector128<short> PackSignedSaturate(Vector128<int> left, Vector128
             return AdvSimd.ExtractNarrowingSaturateUpper(AdvSimd.ExtractNarrowingSaturateLower(left), right);
         }
 
-        ThrowUnreachableException();
-        return default;
+        if (PackedSimd.IsSupported)
+        {
+            return PackedSimd.ConvertNarrowingSaturateSigned(left, right);
+        }
+
+        Vector128<int> min = Vector128.Create((int)short.MinValue);
+        Vector128<int> max = Vector128.Create((int)short.MaxValue);
+        Vector128<int> lefClamped = Clamp(left, min, max);
+        Vector128<int> rightClamped = Clamp(right, min, max);
+        return Vector128.Narrow(lefClamped, rightClamped);
     }
 
+    /// <summary>Clamps every element of a vector to the inclusive range given by two bound vectors.</summary>
+    /// <typeparam name="T">The element type of the vectors.</typeparam>
+    /// <param name="value">The vector whose elements are clamped.</param>
+    /// <param name="min">The per-element lower bound, applied first.</param>
+    /// <param name="max">The per-element upper bound, applied last.</param>
+    /// <returns>A <see cref="Vector128{T}"/> holding the clamped elements.</returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<T> Clamp<T>(Vector128<T> value, Vector128<T> min, Vector128<T> max)
+    {
+        return Vector128.Min(Vector128.Max(value, min), max);
+    }
+
     [DoesNotReturn]
     private static void ThrowUnreachableException() => throw new UnreachableException();
 }
@@ -17,21 +17,23 @@ namespace SixLabors.ImageSharp.Common.Helpers;
 /// </list>
 /// Should only be used if the intrinsics are available.
 /// </summary>
-internal static class Vector256Utilities
+#pragma warning disable SA1649 // File name should match first type name
+internal static class Vector256_
+#pragma warning restore SA1649 // File name should match first type name
 {
     /// <summary>
     /// Gets a value indicating whether shuffle byte operations are supported.
     /// </summary>
-    public static bool SupportsShuffleFloat
+    public static bool SupportsShuffleNativeFloat
     {
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        get => Avx.IsSupported || Sse.IsSupported;
+        get => Avx.IsSupported;
     }
 
     /// <summary>
     /// Gets a value indicating whether shuffle byte operations are supported.
     /// </summary>
-    public static bool SupportsShuffleByte
+    public static bool SupportsShuffleNativeByte
     {
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         get => Avx2.IsSupported;
@@ -44,20 +46,13 @@ public static bool SupportsShuffleByte
     /// <param name="control">The shuffle control byte.</param>
     /// <returns>The <see cref="Vector256{Single}"/>.</returns>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static Vector256<float> Shuffle(Vector256<float> vector, [ConstantExpected] byte control)
+    public static Vector256<float> ShuffleNative(Vector256<float> vector, [ConstantExpected] byte control)
     {
         if (Avx.IsSupported)
         {
             return Avx.Shuffle(vector, vector, control);
         }
 
-        if (Sse.IsSupported)
-        {
-            Vector128<float> lower = vector.GetLower();
-            Vector128<float> upper = vector.GetUpper();
-            return Vector256.Create(Sse.Shuffle(lower, lower, control), Sse.Shuffle(upper, upper, control));
-        }
-
         ThrowUnreachableException();
         return default;
     }
@@ -71,7 +66,7 @@ public static Vector256<float> Shuffle(Vector256<float> vector, [ConstantExpecte
     /// </param>
     /// <returns>The <see cref="Vector256{Single}"/>.</returns>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static Vector256<byte> Shuffle(Vector256<byte> vector, Vector256<byte> indices)
+    public static Vector256<byte> ShuffleNative(Vector256<byte> vector, Vector256<byte> indices)
     {
         if (Avx2.IsSupported)
         {
@@ -96,13 +91,6 @@ public static Vector256<int> ConvertToInt32RoundToEven(Vector256<float> vector)
             return Avx.ConvertToVector256Int32(vector);
         }
 
-        if (Sse2.IsSupported)
-        {
-            Vector128<int> lower = Sse2.ConvertToVector128Int32(vector.GetLower());
-            Vector128<int> upper = Sse2.ConvertToVector128Int32(vector.GetUpper());
-            return Vector256.Create(lower, upper);
-        }
-
         Vector256<float> sign = vector & Vector256.Create(-0F);
         Vector256<float> val_2p23_f32 = sign | Vector256.Create(8388608F);
 
@@ -152,6 +140,76 @@ public static Vector256<float> MultiplyAdd(
         return va + (vm0 * vm1);
     }
 
+    /// <summary>
+    /// Multiplies two <see cref="Vector256{Single}"/> values and subtracts a third from the product.
+    /// </summary>
+    /// <remarks>ret = (vm0 * vm1) - vs; uses the <see cref="Fma"/> intrinsic when it is supported.</remarks>
+    /// <param name="vs">The vector subtracted from the product.</param>
+    /// <param name="vm0">The first factor.</param>
+    /// <param name="vm1">The second factor.</param>
+    /// <returns>The <see cref="Vector256{T}"/> holding the result.</returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<float> MultiplySubtract(
+        Vector256<float> vs,
+        Vector256<float> vm0,
+        Vector256<float> vm1)
+    {
+        if (!Fma.IsSupported)
+        {
+            return (vm0 * vm1) - vs;
+        }
+
+        return Fma.MultiplySubtract(vm1, vm0, vs);
+    }
+
+    /// <summary>
+    /// Packs signed 32-bit integers to signed 16-bit integers with saturation, within each 128-bit lane.
+    /// </summary>
+    /// <param name="left">The left hand source vector.</param>
+    /// <param name="right">The right hand source vector.</param>
+    /// <returns>The <see cref="Vector256{Int16}"/>, ordered left.lower, right.lower, left.upper, right.upper.</returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<short> PackSignedSaturate(Vector256<int> left, Vector256<int> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.PackSignedSaturate(left, right);
+        }
+
+        // Narrow each 128-bit half separately so the element order matches the per-lane AVX2 pack above.
+        Vector256<int> leftClamped = Clamp(left, Vector256.Create((int)short.MinValue), Vector256.Create((int)short.MaxValue));
+        Vector256<int> rightClamped = Clamp(right, Vector256.Create((int)short.MinValue), Vector256.Create((int)short.MaxValue));
+        Vector128<short> lower = Vector128.Narrow(leftClamped.GetLower(), rightClamped.GetLower());
+        return Vector256.Create(lower, Vector128.Narrow(leftClamped.GetUpper(), rightClamped.GetUpper()));
+    }
+
+    /// <summary>Clamps every element of a vector to the inclusive range given by two bound vectors.</summary>
+    /// <typeparam name="T">The element type of the vectors.</typeparam>
+    /// <param name="value">The vector whose elements are clamped.</param>
+    /// <param name="min">The per-element lower bound, applied first.</param>
+    /// <param name="max">The per-element upper bound, applied last.</param>
+    /// <returns>A <see cref="Vector256{T}"/> holding the clamped elements.</returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<T> Clamp<T>(Vector256<T> value, Vector256<T> min, Vector256<T> max)
+    {
+        return Vector256.Min(Vector256.Max(value, min), max);
+    }
+
+    /// <summary>Widens a <see cref="Vector128{Int16}"/> to a <see cref="Vector256{Int32}"/>, preserving the sign of each element.</summary>
+    /// <param name="value">The vector to widen.</param>
+    /// <returns>The widened <see cref="Vector256{Int32}"/>.</returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<int> Widen(Vector128<short> value)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.ConvertToVector256Int32(value);
+        }
+
+        // Place the input in the lower half of a 256-bit vector, then widen only that half.
+        return Vector256.WidenLower(value.ToVector256());
+    }
+
     [DoesNotReturn]
     private static void ThrowUnreachableException() => throw new UnreachableException();
 }