Clean up and prep for Vector512 multiply

JimBobSquarePants · JimBobSquarePants · commit 29a56350ce6b · 2025-05-07T10:20:48.000+10:00
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@@ -1012,9 +1012,9 @@ internal static void NormalizedFloatToByteSaturate(
                     Unsafe.Add(ref destinationBase, i) = b;
                 }
             }
-            else if (Sse2.IsSupported || AdvSimd.IsSupported)
+            else if (Vector128.IsHardwareAccelerated)
             {
-                // Sse, AdvSimd
+                // Sse, AdvSimd, etc.
                 DebugVerifySpanInput(source, destination, Vector128<byte>.Count);
 
                 nuint n = destination.Vector128Count<byte>();
diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
@@ -6,6 +6,7 @@
 using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.Wasm;
 using System.Runtime.Intrinsics.X86;
 
 namespace SixLabors.ImageSharp.Common.Helpers;
@@ -270,8 +271,16 @@ public static Vector128<byte> PackUnsignedSaturate(Vector128<short> left, Vector
             return AdvSimd.ExtractNarrowingSaturateUnsignedUpper(AdvSimd.ExtractNarrowingSaturateUnsignedLower(left), right);
         }
 
-        ThrowUnreachableException();
-        return default;
+        if (PackedSimd.IsSupported)
+        {
+            return PackedSimd.ConvertNarrowingSaturateUnsigned(left, right);
+        }
+
+        Vector128<short> min = Vector128.Create((short)byte.MinValue);
+        Vector128<short> max = Vector128.Create((short)byte.MaxValue);
+        Vector128<ushort> lefClamped = Clamp(left, min, max).AsUInt16();
+        Vector128<ushort> rightClamped = Clamp(right, min, max).AsUInt16();
+        return Vector128.Narrow(lefClamped, rightClamped);
     }
 
     /// <summary>
@@ -293,10 +302,30 @@ public static Vector128<short> PackSignedSaturate(Vector128<int> left, Vector128
             return AdvSimd.ExtractNarrowingSaturateUpper(AdvSimd.ExtractNarrowingSaturateLower(left), right);
         }
 
-        ThrowUnreachableException();
-        return default;
+        if (PackedSimd.IsSupported)
+        {
+            return PackedSimd.ConvertNarrowingSaturateSigned(left, right);
+        }
+
+        Vector128<int> min = Vector128.Create((int)short.MinValue);
+        Vector128<int> max = Vector128.Create((int)short.MaxValue);
+        Vector128<int> lefClamped = Clamp(left, min, max);
+        Vector128<int> rightClamped = Clamp(right, min, max);
+        return Vector128.Narrow(lefClamped, rightClamped);
     }
 
+    /// <summary>
+    /// Restricts a vector between a minimum and a maximum value.
+    /// </summary>
+    /// <typeparam name="T">The type of the elements in the vector.</typeparam>
+    /// <param name="value">The vector to restrict.</param>
+    /// <param name="min">The minimum value.</param>
+    /// <param name="max">The maximum value.</param>
+    /// <returns>The restricted <see cref="Vector128{T}"/>.</returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<T> Clamp<T>(Vector128<T> value, Vector128<T> min, Vector128<T> max)
+        => Vector128.Min(Vector128.Max(value, min), max);
+
     [DoesNotReturn]
     private static void ThrowUnreachableException() => throw new UnreachableException();
 }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
@@ -64,6 +64,7 @@ private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref
 
         ref Vector128<short> destBase = ref Unsafe.As<Block8x8, Vector128<short>>(ref dest);
 
+        // TODO: We can use the v128 utilities for this.
         for (nuint i = 0; i < 16; i += 2)
         {
             Vector128<int> left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Round.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Round.cs
@@ -5,36 +5,36 @@
 using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics;
 
-// <auto-generated />
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
 
 internal partial struct Block8x8F
 {
     /// <summary>
     /// Level shift by +maximum/2, clip to [0, maximum]
     /// </summary>
+    /// <param name="maximum">The maximum value to normalize to.</param>
     public void NormalizeColorsInPlace(float maximum)
     {
-        var CMin4 = new Vector4(0F);
-        var CMax4 = new Vector4(maximum);
-        var COff4 = new Vector4(MathF.Ceiling(maximum * 0.5F));
-
-        this.V0L = Numerics.Clamp(this.V0L + COff4, CMin4, CMax4);
-        this.V0R = Numerics.Clamp(this.V0R + COff4, CMin4, CMax4);
-        this.V1L = Numerics.Clamp(this.V1L + COff4, CMin4, CMax4);
-        this.V1R = Numerics.Clamp(this.V1R + COff4, CMin4, CMax4);
-        this.V2L = Numerics.Clamp(this.V2L + COff4, CMin4, CMax4);
-        this.V2R = Numerics.Clamp(this.V2R + COff4, CMin4, CMax4);
-        this.V3L = Numerics.Clamp(this.V3L + COff4, CMin4, CMax4);
-        this.V3R = Numerics.Clamp(this.V3R + COff4, CMin4, CMax4);
-        this.V4L = Numerics.Clamp(this.V4L + COff4, CMin4, CMax4);
-        this.V4R = Numerics.Clamp(this.V4R + COff4, CMin4, CMax4);
-        this.V5L = Numerics.Clamp(this.V5L + COff4, CMin4, CMax4);
-        this.V5R = Numerics.Clamp(this.V5R + COff4, CMin4, CMax4);
-        this.V6L = Numerics.Clamp(this.V6L + COff4, CMin4, CMax4);
-        this.V6R = Numerics.Clamp(this.V6R + COff4, CMin4, CMax4);
-        this.V7L = Numerics.Clamp(this.V7L + COff4, CMin4, CMax4);
-        this.V7R = Numerics.Clamp(this.V7R + COff4, CMin4, CMax4);
+        Vector4 min = Vector4.Zero;
+        Vector4 max = new(maximum);
+        Vector4 off = new(MathF.Ceiling(maximum * 0.5F));
+
+        this.V0L = Vector4.Clamp(this.V0L + off, min, max);
+        this.V0R = Vector4.Clamp(this.V0R + off, min, max);
+        this.V1L = Vector4.Clamp(this.V1L + off, min, max);
+        this.V1R = Vector4.Clamp(this.V1R + off, min, max);
+        this.V2L = Vector4.Clamp(this.V2L + off, min, max);
+        this.V2R = Vector4.Clamp(this.V2R + off, min, max);
+        this.V3L = Vector4.Clamp(this.V3L + off, min, max);
+        this.V3R = Vector4.Clamp(this.V3R + off, min, max);
+        this.V4L = Vector4.Clamp(this.V4L + off, min, max);
+        this.V4R = Vector4.Clamp(this.V4R + off, min, max);
+        this.V5L = Vector4.Clamp(this.V5L + off, min, max);
+        this.V5R = Vector4.Clamp(this.V5R + off, min, max);
+        this.V6L = Vector4.Clamp(this.V6L + off, min, max);
+        this.V6R = Vector4.Clamp(this.V6R + off, min, max);
+        this.V7L = Vector4.Clamp(this.V7L + off, min, max);
+        this.V7R = Vector4.Clamp(this.V7R + off, min, max);
     }
 
     /// <summary>
@@ -44,7 +44,7 @@ public void NormalizeColorsInPlace(float maximum)
     [MethodImpl(InliningOptions.ShortMethod)]
     public void NormalizeColorsAndRoundInPlaceVector256(float maximum)
     {
-        Vector256<float> off =  Vector256.Create(MathF.Ceiling(maximum * 0.5F));
+        Vector256<float> off = Vector256.Create(MathF.Ceiling(maximum * 0.5F));
         Vector256<float> max = Vector256.Create(maximum);
 
         ref Vector256<float> row0 = ref Unsafe.As<Vector4, Vector256<float>>(ref this.V0L);
@@ -103,6 +103,7 @@ public void NormalizeColorsAndRoundInPlaceVector128(float maximum)
     /// <summary>
     /// Fill the block from 'source' doing short -> float conversion.
     /// </summary>
+    /// <param name="source">The source block.</param>
     public void LoadFromInt16Scalar(ref Block8x8 source)
     {
         ref short selfRef = ref Unsafe.As<Block8x8, short>(ref source);
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -159,17 +159,18 @@ public float[] ToArray()
     [MethodImpl(InliningOptions.ShortMethod)]
     public void MultiplyInPlace(float value)
     {
-        if (Avx.IsSupported)
+        // TODO: Vector512
+        if (Vector256.IsHardwareAccelerated)
         {
             Vector256<float> valueVec = Vector256.Create(value);
-            this.V0 = Avx.Multiply(this.V0, valueVec);
-            this.V1 = Avx.Multiply(this.V1, valueVec);
-            this.V2 = Avx.Multiply(this.V2, valueVec);
-            this.V3 = Avx.Multiply(this.V3, valueVec);
-            this.V4 = Avx.Multiply(this.V4, valueVec);
-            this.V5 = Avx.Multiply(this.V5, valueVec);
-            this.V6 = Avx.Multiply(this.V6, valueVec);
-            this.V7 = Avx.Multiply(this.V7, valueVec);
+            this.V0 *= valueVec;
+            this.V1 *= valueVec;
+            this.V2 *= valueVec;
+            this.V3 *= valueVec;
+            this.V4 *= valueVec;
+            this.V5 *= valueVec;
+            this.V6 *= valueVec;
+            this.V7 *= valueVec;
         }
         else
         {
@@ -200,16 +201,17 @@ public void MultiplyInPlace(float value)
     [MethodImpl(InliningOptions.ShortMethod)]
     public unsafe void MultiplyInPlace(ref Block8x8F other)
     {
-        if (Avx.IsSupported)
+        // TODO: Vector512
+        if (Vector256.IsHardwareAccelerated)
         {
-            this.V0 = Avx.Multiply(this.V0, other.V0);
-            this.V1 = Avx.Multiply(this.V1, other.V1);
-            this.V2 = Avx.Multiply(this.V2, other.V2);
-            this.V3 = Avx.Multiply(this.V3, other.V3);
-            this.V4 = Avx.Multiply(this.V4, other.V4);
-            this.V5 = Avx.Multiply(this.V5, other.V5);
-            this.V6 = Avx.Multiply(this.V6, other.V6);
-            this.V7 = Avx.Multiply(this.V7, other.V7);
+            this.V0 *= other.V0;
+            this.V1 *= other.V1;
+            this.V2 *= other.V2;
+            this.V3 *= other.V3;
+            this.V4 *= other.V4;
+            this.V5 *= other.V5;
+            this.V6 *= other.V6;
+            this.V7 *= other.V7;
         }
         else
         {
@@ -239,17 +241,18 @@ public unsafe void MultiplyInPlace(ref Block8x8F other)
     [MethodImpl(InliningOptions.ShortMethod)]
     public void AddInPlace(float value)
     {
-        if (Avx.IsSupported)
+        // TODO: Vector512
+        if (Vector256.IsHardwareAccelerated)
         {
             Vector256<float> valueVec = Vector256.Create(value);
-            this.V0 = Avx.Add(this.V0, valueVec);
-            this.V1 = Avx.Add(this.V1, valueVec);
-            this.V2 = Avx.Add(this.V2, valueVec);
-            this.V3 = Avx.Add(this.V3, valueVec);
-            this.V4 = Avx.Add(this.V4, valueVec);
-            this.V5 = Avx.Add(this.V5, valueVec);
-            this.V6 = Avx.Add(this.V6, valueVec);
-            this.V7 = Avx.Add(this.V7, valueVec);
+            this.V0 += valueVec;
+            this.V1 += valueVec;
+            this.V2 += valueVec;
+            this.V3 += valueVec;
+            this.V4 += valueVec;
+            this.V5 += valueVec;
+            this.V6 += valueVec;
+            this.V7 += valueVec;
         }
         else
         {
@@ -509,26 +512,26 @@ public override string ToString()
     }
 
     /// <summary>
-    /// Transpose the block inplace.
+    /// Transpose the block in-place.
     /// </summary>
     [MethodImpl(InliningOptions.ShortMethod)]
-    public void TransposeInplace()
+    public void TransposeInPlace()
     {
         if (Avx.IsSupported)
         {
             this.TransposeInplace_Avx();
         }
         else
         {
-            this.TransposeInplace_Scalar();
+            this.TransposeInPlace_Scalar();
         }
     }
 
     /// <summary>
-    /// Scalar inplace transpose implementation for <see cref="TransposeInplace"/>
+    /// Scalar in-place transpose implementation for <see cref="TransposeInPlace"/>
     /// </summary>
     [MethodImpl(InliningOptions.ShortMethod)]
-    private void TransposeInplace_Scalar()
+    private void TransposeInPlace_Scalar()
     {
         ref float elemRef = ref Unsafe.As<Block8x8F, float>(ref this);
 
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs
@@ -20,7 +20,7 @@ private static void FDCT8x8_Avx(ref Block8x8F block)
         FDCT8x8_1D_Avx(ref block);
 
         // Second pass - process rows
-        block.TransposeInplace();
+        block.TransposeInPlace();
         FDCT8x8_1D_Avx(ref block);
 
         // Applies 1D floating point FDCT inplace
@@ -81,7 +81,7 @@ private static void IDCT8x8_Avx(ref Block8x8F transposedBlock)
         IDCT8x8_1D_Avx(ref transposedBlock);
 
         // Second pass - process rows
-        transposedBlock.TransposeInplace();
+        transposedBlock.TransposeInPlace();
         IDCT8x8_1D_Avx(ref transposedBlock);
 
         // Applies 1D floating point FDCT inplace
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs
@@ -77,7 +77,7 @@ public static void AdjustToIDCT(ref Block8x8F quantTable)
 
         // Spectral macroblocks are transposed before quantization
         // so we must transpose quantization table
-        quantTable.TransposeInplace();
+        quantTable.TransposeInPlace();
     }
 
     /// <summary>
@@ -97,7 +97,7 @@ public static void AdjustToFDCT(ref Block8x8F quantTable)
         // Spectral macroblocks are not transposed before quantization
         // Transpose is done after quantization at zig-zag stage
         // so we must transpose quantization table
-        quantTable.TransposeInplace();
+        quantTable.TransposeInPlace();
     }
 
     /// <summary>
@@ -155,7 +155,7 @@ private static void IDCT_Vector4(ref Block8x8F transposedBlock)
         IDCT8x4_Vector4(ref transposedBlock.V0R);
 
         // Second pass - process rows
-        transposedBlock.TransposeInplace();
+        transposedBlock.TransposeInPlace();
         IDCT8x4_Vector4(ref transposedBlock.V0L);
         IDCT8x4_Vector4(ref transposedBlock.V0R);
 
@@ -225,7 +225,7 @@ private static void FDCT_Vector4(ref Block8x8F block)
         FDCT8x4_Vector4(ref block.V0R);
 
         // Second pass - process rows
-        block.TransposeInplace();
+        block.TransposeInPlace();
         FDCT8x4_Vector4(ref block.V0L);
         FDCT8x4_Vector4(ref block.V0R);
 
diff --git a/src/ImageSharp/Formats/Jpeg/Components/ScaledFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/ScaledFloatingPointDCT.cs
@@ -48,7 +48,7 @@ public static void AdjustToIDCT(ref Block8x8F quantTable)
 
         // Spectral macroblocks are transposed before quantization
         // so we must transpose quantization table
-        quantTable.TransposeInplace();
+        quantTable.TransposeInPlace();
     }
 
     /// <summary>
diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs
@@ -14,7 +14,7 @@ public class Block8x8F_Transpose
     [Benchmark]
     public float TransposeInplace()
     {
-        this.source.TransposeInplace();
+        this.source.TransposeInPlace();
         return this.source[0];
     }
 
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
@@ -130,7 +130,7 @@ static void RunTest()
 
             Block8x8F block8x8 = Block8x8F.Load(Create8x8FloatData());
 
-            block8x8.TransposeInplace();
+            block8x8.TransposeInPlace();
 
             float[] actual = new float[64];
             block8x8.ScaledCopyTo(actual);
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs

Original file line number	Diff line number	Diff line change
`@@ -1012,9 +1012,9 @@ internal static void NormalizedFloatToByteSaturate(`
`1012`	`1012`	`Unsafe.Add(ref destinationBase, i) = b;`
`1013`	`1013`	`}`
`1014`	`1014`	`}`
`1015`		`- else if (Sse2.IsSupported \|\| AdvSimd.IsSupported)`
	`1015`	`+ else if (Vector128.IsHardwareAccelerated)`
`1016`	`1016`	`{`
`1017`		`- // Sse, AdvSimd`
	`1017`	`+ // Sse, AdvSimd, etc.`
`1018`	`1018`	`DebugVerifySpanInput(source, destination, Vector128<byte>.Count);`
`1019`	`1019`
`1020`	`1020`	`nuint n = destination.Vector128Count<byte>();`
Original file line number	Diff line number	Diff line change
`@@ -64,6 +64,7 @@ private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref`
`64`	`64`
`65`	`65`	`ref Vector128<short> destBase = ref Unsafe.As<Block8x8, Vector128<short>>(ref dest);`
`66`	`66`
	`67`	`+ // TODO: We can use the v128 utilities for this.`
`67`	`68`	`for (nuint i = 0; i < 16; i += 2)`
`68`	`69`	`{`
`69`	`70`	`Vector128<int> left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));`
Original file line number	Diff line number	Diff line change
`@@ -48,7 +48,7 @@ public static void AdjustToIDCT(ref Block8x8F quantTable)`
`48`	`48`
`49`	`49`	`// Spectral macroblocks are transposed before quantization`
`50`	`50`	`// so we must transpose quantization table`
`51`		`- quantTable.TransposeInplace();`
	`51`	`+ quantTable.TransposeInPlace();`
`52`	`52`	`}`
`53`	`53`
`54`	`54`	`/// <summary>`
Original file line number	Diff line number	Diff line change
`@@ -14,7 +14,7 @@ public class Block8x8F_Transpose`
`14`	`14`	`[Benchmark]`
`15`	`15`	`public float TransposeInplace()`
`16`	`16`	`{`
`17`		`- this.source.TransposeInplace();`
	`17`	`+ this.source.TransposeInPlace();`
`18`	`18`	`return this.source[0];`
`19`	`19`	`}`
`20`	`20`