Revert to FMA, codegen improvements

Sergio0694 · Sergio0694 · commit 941e173b8d49 · 2021-01-19T22:58:43.000+01:00
diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
@@ -71,7 +71,7 @@ public Vector4 Convolve(Span<Vector4> rowSpan)
         public Vector4 ConvolveCore(ref Vector4 rowStartRef)
         {
 #if SUPPORTS_RUNTIME_INTRINSICS
-            if (Avx2.IsSupported)
+            if (Fma.IsSupported)
             {
                 float* bufferStart = this.bufferPtr;
                 float* bufferEnd = bufferStart + (this.Length & ~1);
@@ -80,11 +80,20 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
 
                 while (bufferStart < bufferEnd)
                 {
-                    Vector256<float> rowItem256 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
-                    Vector256<float> bufferItem256 = Avx2.PermuteVar8x32(Vector256.Create(*(double*)bufferStart).AsSingle(), mask);
-                    Vector256<float> multiply256 = Avx.Multiply(rowItem256, bufferItem256);
-
-                    result256 = Avx.Add(multiply256, result256);
+                    // It is important to use a single expression here so that the JIT will correctly use vfmadd231ps
+                    // for the FMA operation, executing it directly on the target register and reading the first
+                    // parameter directly from memory. This skips initializing a SIMD register, and an extra copy.
+                    // The code below should compile to the following assembly on .NET 5 x64:
+                    //
+                    // vmovsd xmm2, [rax]               ; load *(double*)bufferStart into xmm2 as [ab, _]
+                    // vpermps ymm2, ymm1, ymm2         ; permute as a float YMM register to [a, a, a, a, b, b, b, b]
+                    // vfmadd231ps ymm0, ymm2, [r8]     ; result256 = (pixels * factors) + result256
+                    //
+                    // For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212.
+                    result256 = Fma.MultiplyAdd(
+                        Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
+                        Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
+                        result256);
 
                     bufferStart += 2;
                     rowStartRef = ref Unsafe.Add(ref rowStartRef, 2);
@@ -94,11 +103,10 @@ public Vector4 ConvolveCore(ref Vector4 rowStartRef)
 
                 if ((this.Length & 1) != 0)
                 {
-                    Vector128<float> rowItem128 = Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef);
-                    var bufferItem128 = Vector128.Create(*bufferStart);
-                    Vector128<float> multiply128 = Sse.Multiply(rowItem128, bufferItem128);
-
-                    result128 = Sse.Add(multiply128, result128);
+                    result128 = Fma.MultiplyAdd(
+                        Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef),
+                        Vector128.Create(*bufferStart),
+                        result128);
                 }
 
                 return *(Vector4*)&result128;