SixLabors
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
Lines changed: 0 additions & 23 deletions
diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
Lines changed: 22 additions & 0 deletions
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
Lines changed: 2 additions & 2 deletions
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs
Lines changed: 0 additions & 142 deletions
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Vector256.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Vector256.cs
Lines changed: 142 additions & 0 deletions
@@ -619,29 +619,6 @@ public static Vector256<float> MultiplyAdd(
             return va + (vm0 * vm1);
         }
 
-        /// <summary>
-        /// Performs a multiplication and a subtraction of the <see cref="Vector256{Single}"/>.
-        /// TODO: Fix. The arguments are in a different order to the FMA intrinsic.
-        /// </summary>
-        /// <remarks>ret = (vm0 * vm1) - vs</remarks>
-        /// <param name="vs">The vector to subtract from the intermediate result.</param>
-        /// <param name="vm0">The first vector to multiply.</param>
-        /// <param name="vm1">The second vector to multiply.</param>
-        /// <returns>The <see cref="Vector256{T}"/>.</returns>
-        [MethodImpl(InliningOptions.ShortMethod)]
-        public static Vector256<float> MultiplySubtract(
-            Vector256<float> vs,
-            Vector256<float> vm0,
-            Vector256<float> vm1)
-        {
-            if (Fma.IsSupported)
-            {
-                return Fma.MultiplySubtract(vm1, vm0, vs);
-            }
-
-            return Avx.Subtract(Avx.Multiply(vm0, vm1), vs);
-        }
-
         /// <summary>
         /// Performs a multiplication and a negated addition of the <see cref="Vector256{Single}"/>.
         /// </summary>
 
@@ -140,6 +140,28 @@ public static Vector256<float> MultiplyAdd(
         return va + (vm0 * vm1);
     }
 
+    /// <summary>
+    /// Multiplies two <see cref="Vector256{Single}"/> values and subtracts a third from the product, using the fused FMA instruction when it is supported.
+    /// </summary>
+    /// <remarks>ret = (vm0 * vm1) - vs</remarks>
+    /// <param name="vs">The vector subtracted from the product of <paramref name="vm0"/> and <paramref name="vm1"/>.</param>
+    /// <param name="vm0">The first vector to multiply.</param>
+    /// <param name="vm1">The second vector to multiply.</param>
+    /// <returns>The <see cref="Vector256{T}"/> holding <c>(vm0 * vm1) - vs</c>.</returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<float> MultiplySubtract(
+        Vector256<float> vs,
+        Vector256<float> vm0,
+        Vector256<float> vm1)
+    {
+        if (Fma.IsSupported)
+        {
+            return Fma.MultiplySubtract(vm1, vm0, vs); // The intrinsic takes the subtrahend last, unlike this helper.
+        }
+
+        return (vm0 * vm1) - vs;
+    }
+
     /// <summary>
     /// Packs signed 32-bit integers to signed 16-bit integers and saturates.
     /// </summary>
 
@@ -211,10 +211,10 @@ public nint GetLastNonZeroIndex()
     }
 
     /// <summary>
-    /// Transpose the block inplace.
+    /// Transpose the block in place.
     /// </summary>
     [MethodImpl(InliningOptions.ShortMethod)]
-    public void TransposeInplace()
+    public void TransposeInPlace()
     {
         ref short elemRef = ref Unsafe.As<Block8x8, short>(ref this);
 
 
@@ -0,0 +1,142 @@
+// Copyright (c) Six Labors.
+// Licensed under the Six Labors Split License.
+
+using System.Runtime.Intrinsics;
+using SixLabors.ImageSharp.Common.Helpers;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
+
+internal static partial class FloatingPointDCT
+{
+    /// <summary>
+    /// Applies the floating point FDCT to the block in place using <see cref="Vector256{T}"/> operations.
+    /// </summary>
+    /// <param name="block">Input block; overwritten with the result.</param>
+    private static void FDCT8x8_Vector256(ref Block8x8F block)
+    {
+        DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to execute this operation.");
+
+        // First pass - process columns
+        FDCT8x8_1D_Vector256(ref block);
+
+        // Second pass - process rows (transpose, then run the same 1D pass again)
+        block.TransposeInPlace();
+        FDCT8x8_1D_Vector256(ref block);
+
+        // Applies a 1D floating point FDCT in place across the block's V256_0..V256_7 vectors
+        static void FDCT8x8_1D_Vector256(ref Block8x8F block)
+        {
+            Vector256<float> tmp0 = block.V256_0 + block.V256_7;
+            Vector256<float> tmp7 = block.V256_0 - block.V256_7;
+            Vector256<float> tmp1 = block.V256_1 + block.V256_6;
+            Vector256<float> tmp6 = block.V256_1 - block.V256_6;
+            Vector256<float> tmp2 = block.V256_2 + block.V256_5;
+            Vector256<float> tmp5 = block.V256_2 - block.V256_5;
+            Vector256<float> tmp3 = block.V256_3 + block.V256_4;
+            Vector256<float> tmp4 = block.V256_3 - block.V256_4;
+
+            // Even part - produces outputs 0, 2, 4 and 6
+            Vector256<float> tmp10 = tmp0 + tmp3;
+            Vector256<float> tmp13 = tmp0 - tmp3;
+            Vector256<float> tmp11 = tmp1 + tmp2;
+            Vector256<float> tmp12 = tmp1 - tmp2;
+
+            block.V256_0 = tmp10 + tmp11;
+            block.V256_4 = tmp10 - tmp11;
+
+            Vector256<float> mm256_F_0_7071 = Vector256.Create(0.707106781f);
+            Vector256<float> z1 = (tmp12 + tmp13) * mm256_F_0_7071;
+            block.V256_2 = tmp13 + z1;
+            block.V256_6 = tmp13 - z1;
+
+            // Odd part - produces outputs 1, 3, 5 and 7 (tmp10..tmp12 are reused as scratch)
+            tmp10 = tmp4 + tmp5;
+            tmp11 = tmp5 + tmp6;
+            tmp12 = tmp6 + tmp7;
+
+            Vector256<float> z5 = (tmp10 - tmp12) * Vector256.Create(0.382683433f);    // mm256_F_0_3826
+            Vector256<float> z2 = Vector256_.MultiplyAdd(z5, Vector256.Create(0.541196100f), tmp10);    // mm256_F_0_5411
+            Vector256<float> z4 = Vector256_.MultiplyAdd(z5, Vector256.Create(1.306562965f), tmp12);    // mm256_F_1_3065
+            Vector256<float> z3 = tmp11 * mm256_F_0_7071;
+
+            Vector256<float> z11 = tmp7 + z3;
+            Vector256<float> z13 = tmp7 - z3;
+
+            block.V256_5 = z13 + z2;
+            block.V256_3 = z13 - z2;
+            block.V256_1 = z11 + z4;
+            block.V256_7 = z11 - z4;
+        }
+    }
+
+    /// <summary>
+    /// Applies the floating point IDCT to the block in place using <see cref="Vector256{T}"/> operations.
+    /// </summary>
+    /// <param name="transposedBlock">Transposed input block; overwritten with the result.</param>
+    private static void IDCT8x8_Vector256(ref Block8x8F transposedBlock)
+    {
+        DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to execute this operation.");
+
+        // First pass - process columns
+        IDCT8x8_1D_Vector256(ref transposedBlock);
+
+        // Second pass - process rows (transpose, then run the same 1D pass again)
+        transposedBlock.TransposeInPlace();
+        IDCT8x8_1D_Vector256(ref transposedBlock);
+
+        // Applies a 1D floating point IDCT in place across the block's V256_0..V256_7 vectors
+        static void IDCT8x8_1D_Vector256(ref Block8x8F block)
+        {
+            // Even part - reads inputs 0, 2, 4 and 6
+            Vector256<float> tmp0 = block.V256_0;
+            Vector256<float> tmp1 = block.V256_2;
+            Vector256<float> tmp2 = block.V256_4;
+            Vector256<float> tmp3 = block.V256_6;
+
+            Vector256<float> z5 = tmp0;
+            Vector256<float> tmp10 = z5 + tmp2;
+            Vector256<float> tmp11 = z5 - tmp2;
+
+            Vector256<float> mm256_F_1_4142 = Vector256.Create(1.414213562f);
+            Vector256<float> tmp13 = tmp1 + tmp3;
+            Vector256<float> tmp12 = Vector256_.MultiplySubtract(tmp13, tmp1 - tmp3, mm256_F_1_4142);
+
+            tmp0 = tmp10 + tmp13;
+            tmp3 = tmp10 - tmp13;
+            tmp1 = tmp11 + tmp12;
+            tmp2 = tmp11 - tmp12;
+
+            // Odd part - reads inputs 1, 3, 5 and 7
+            Vector256<float> tmp4 = block.V256_1;
+            Vector256<float> tmp5 = block.V256_3;
+            Vector256<float> tmp6 = block.V256_5;
+            Vector256<float> tmp7 = block.V256_7;
+
+            Vector256<float> z13 = tmp6 + tmp5;
+            Vector256<float> z10 = tmp6 - tmp5;
+            Vector256<float> z11 = tmp4 + tmp7;
+            Vector256<float> z12 = tmp4 - tmp7;
+
+            tmp7 = z11 + z13;
+            tmp11 = (z11 - z13) * mm256_F_1_4142;
+
+            z5 = (z10 + z12) * Vector256.Create(1.847759065f);   // mm256_F_1_8477
+
+            tmp10 = Vector256_.MultiplyAdd(z5, z12, Vector256.Create(-1.082392200f));   // mm256_F_n1_0823
+            tmp12 = Vector256_.MultiplyAdd(z5, z10, Vector256.Create(-2.613125930f));   // mm256_F_n2_6131
+
+            tmp6 = tmp12 - tmp7;
+            tmp5 = tmp11 - tmp6;
+            tmp4 = tmp10 - tmp5;
+
+            block.V256_0 = tmp0 + tmp7;
+            block.V256_7 = tmp0 - tmp7;
+            block.V256_1 = tmp1 + tmp6;
+            block.V256_6 = tmp1 - tmp6;
+            block.V256_2 = tmp2 + tmp5;
+            block.V256_5 = tmp2 - tmp5;
+            block.V256_3 = tmp3 + tmp4;
+            block.V256_4 = tmp3 - tmp4;
+        }
+    }
+}
Original file line number	Diff line number	Diff line change
`@@ -211,10 +211,10 @@ public nint GetLastNonZeroIndex()`
`211`	`211`	`}`
`212`	`212`
`213`	`213`	`/// <summary>`
`214`		`- /// Transpose the block inplace.`
	`214`	`+ /// Transpose the block in place.`
`215`	`215`	`/// </summary>`
`216`	`216`	`[MethodImpl(InliningOptions.ShortMethod)]`
`217`		`- public void TransposeInplace()`
	`217`	`+ public void TransposeInPlace()`
`218`	`218`	`{`
`219`	`219`	`ref short elemRef = ref Unsafe.As<Block8x8, short>(ref this);`
`220`	`220`