@@ -474,21 +474,8 @@ private static void ClampImpl<T>(Span<T> span, T min, T max)
474
474
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void Premultiply(ref Vector4 source)
{
    // Broadcast the alpha (W) lane across all four components, scale the
    // whole vector by it, then restore the original alpha so W is unchanged.
    Vector4 alpha = PermuteW(source);
    Vector4 scaled = source * alpha;
    source = WithW(scaled, alpha);
}
493
480
494
481
/// <summary>
@@ -498,7 +485,7 @@ public static void UnPremultiply(ref Vector4 source)
498
485
[ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
499
486
public static void Premultiply ( Span < Vector4 > vectors )
500
487
{
501
- if ( Avx2 . IsSupported && vectors . Length >= 2 )
488
+ if ( Avx . IsSupported && vectors . Length >= 2 )
502
489
{
503
490
// Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
504
491
ref Vector256 < float > vectorsBase = ref Unsafe . As < Vector4 , Vector256 < float > > ( ref MemoryMarshal . GetReference ( vectors ) ) ;
@@ -507,7 +494,7 @@ public static void Premultiply(Span<Vector4> vectors)
507
494
while ( Unsafe . IsAddressLessThan ( ref vectorsBase , ref vectorsLast ) )
508
495
{
509
496
Vector256 < float > source = vectorsBase ;
510
- Vector256 < float > multiply = Avx . Shuffle ( source , source , ShuffleAlphaControl ) ;
497
+ Vector256 < float > multiply = Avx . Permute ( source , ShuffleAlphaControl ) ;
511
498
vectorsBase = Avx . Blend ( Avx . Multiply ( source , multiply ) , source , BlendAlphaControl ) ;
512
499
vectorsBase = ref Unsafe . Add ( ref vectorsBase , 1 ) ;
513
500
}
@@ -532,24 +519,47 @@ public static void Premultiply(Span<Vector4> vectors)
532
519
}
533
520
}
534
521
522
/// <summary>
/// Reverses the result of premultiplying a vector via <see cref="Premultiply(ref Vector4)"/>.
/// </summary>
/// <param name="source">The <see cref="Vector4"/> to unpremultiply.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void UnPremultiply(ref Vector4 source)
{
    // Broadcast W into all lanes, then defer to the alpha-aware overload,
    // which also guards against dividing by a zero alpha.
    Vector4 alpha = PermuteW(source);
    UnPremultiply(ref source, alpha);
}
532
+
533
/// <summary>
/// Reverses the result of premultiplying a vector using a previously broadcast alpha value.
/// </summary>
/// <param name="source">The <see cref="Vector4"/> to unpremultiply.</param>
/// <param name="alpha">A vector with the alpha value broadcast to all four components.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void UnPremultiply(ref Vector4 source, Vector4 alpha)
{
    if (alpha == Vector4.Zero)
    {
        // Nothing to reverse: dividing would be division by zero, and a
        // premultiplied value with zero alpha is already all-zero anyway.
        return;
    }

    source = WithW(source / alpha, alpha);
}
543
+
535
544
/// <summary>
536
545
/// Bulk variant of <see cref="UnPremultiply(ref Vector4)"/>
537
546
/// </summary>
538
547
/// <param name="vectors">The span of vectors</param>
539
548
[ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
540
549
public static void UnPremultiply ( Span < Vector4 > vectors )
541
550
{
542
- if ( Avx2 . IsSupported && vectors . Length >= 2 )
551
+ if ( Avx . IsSupported && vectors . Length >= 2 )
543
552
{
544
553
// Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
545
554
ref Vector256 < float > vectorsBase = ref Unsafe . As < Vector4 , Vector256 < float > > ( ref MemoryMarshal . GetReference ( vectors ) ) ;
546
555
ref Vector256 < float > vectorsLast = ref Unsafe . Add ( ref vectorsBase , ( IntPtr ) ( ( uint ) vectors . Length / 2u ) ) ;
556
+ Vector256 < float > epsilon = Vector256 . Create ( Constants . Epsilon ) ;
547
557
548
558
while ( Unsafe . IsAddressLessThan ( ref vectorsBase , ref vectorsLast ) )
549
559
{
550
560
Vector256 < float > source = vectorsBase ;
551
- Vector256 < float > multiply = Avx . Shuffle ( source , source , ShuffleAlphaControl ) ;
552
- vectorsBase = Avx . Blend ( Avx . Divide ( source , multiply ) , source , BlendAlphaControl ) ;
561
+ Vector256 < float > alpha = Avx . Permute ( source , ShuffleAlphaControl ) ;
562
+ vectorsBase = UnPremultiply ( source , alpha ) ;
553
563
vectorsBase = ref Unsafe . Add ( ref vectorsBase , 1 ) ;
554
564
}
555
565
@@ -573,6 +583,61 @@ public static void UnPremultiply(Span<Vector4> vectors)
573
583
}
574
584
}
575
585
586
/// <summary>
/// Reverses the result of premultiplying two packed <see cref="Vector4"/> values,
/// using alpha values previously broadcast across each group of four lanes.
/// </summary>
/// <param name="source">The packed source vectors to unpremultiply.</param>
/// <param name="alpha">The alpha values, broadcast to every lane of each <see cref="Vector4"/> half.</param>
/// <returns>The unpremultiplied <see cref="Vector256{T}"/> of <see cref="float"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<float> UnPremultiply(Vector256<float> source, Vector256<float> alpha)
{
    // Check if alpha is zero to avoid division by zero.
    Vector256<float> zeroMask = Avx.CompareEqual(alpha, Vector256<float>.Zero);

    // Divide source by alpha where alpha is nonzero, otherwise keep the source lane unchanged.
    Vector256<float> result = Avx.BlendVariable(Avx.Divide(source, alpha), source, zeroMask);

    // Blend the result with the alpha vector to ensure that the alpha component is unchanged.
    return Avx.Blend(result, alpha, BlendAlphaControl);
}
598
+
599
/// <summary>
/// Permutes the given vector, returning a new instance with all four components set to <see cref="Vector4.W"/>.
/// </summary>
/// <param name="value">The vector.</param>
/// <returns>The <see cref="Vector4"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector4 PermuteW(Vector4 value)
{
    if (Sse.IsSupported)
    {
        // Control 0b11111111 selects lane 3 (W) for every output lane.
        return Sse.Shuffle(value.AsVector128(), value.AsVector128(), 0b11111111).AsVector4();
    }

    return new(value.W);
}
614
+
615
/// <summary>
/// Sets the W component of the given vector <paramref name="value"/> to the given value from <paramref name="w"/>.
/// </summary>
/// <param name="value">The vector to set.</param>
/// <param name="w">The vector containing the W value.</param>
/// <returns>The <see cref="Vector4"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector4 WithW(Vector4 value, Vector4 w)
{
    if (Sse41.IsSupported)
    {
        // insertps: control 0b11_11_0000 copies lane 3 of w into lane 3 of
        // value with no zeroing, yielding <value[0], value[1], value[2], w[3]>.
        return Sse41.Insert(value.AsVector128(), w.AsVector128(), 0b11_11_0000).AsVector4();
    }

    if (Sse.IsSupported)
    {
        // Create tmp as <w[3], w[0], value[2], value[0]>
        // Then return <value[0], value[1], tmp[2], tmp[0]> (which is <value[0], value[1], value[2], w[3]>)
        Vector128<float> tmp = Sse.Shuffle(w.AsVector128(), value.AsVector128(), 0b00_10_00_11);
        return Sse.Shuffle(value.AsVector128(), tmp, 0b00_10_01_00).AsVector4();
    }

    // Scalar fallback when no SSE support is available.
    value.W = w.W;
    return value;
}
640
+
576
641
/// <summary>
577
642
/// Calculates the cube pow of all the XYZ channels of the input vectors.
578
643
/// </summary>
@@ -586,7 +651,7 @@ public static unsafe void CubePowOnXYZ(Span<Vector4> vectors)
586
651
while ( Unsafe . IsAddressLessThan ( ref baseRef , ref endRef ) )
587
652
{
588
653
Vector4 v = baseRef ;
589
- float a = v . W ;
654
+ Vector4 a = PermuteW ( v ) ;
590
655
591
656
// Fast path for the default gamma exposure, which is 3. In this case we can skip
592
657
// calling Math.Pow 3 times (one per component), as the method is an internal call and
@@ -595,7 +660,7 @@ public static unsafe void CubePowOnXYZ(Span<Vector4> vectors)
595
660
// back to the target index in the temporary span. The whole iteration will get completely
596
661
// inlined and traslated into vectorized instructions, with much better performance.
597
662
v = v * v * v ;
598
- v . W = a ;
663
+ v = WithW ( v , a ) ;
599
664
600
665
baseRef = v ;
601
666
baseRef = ref Unsafe . Add ( ref baseRef , 1 ) ;
0 commit comments