Skip to content

Commit 6238f00

Browse files
Modernize additional V256 code from review
1 parent 8a23d42 commit 6238f00

File tree

8 files changed

+175
-176
lines changed

8 files changed

+175
-176
lines changed

src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -619,29 +619,6 @@ public static Vector256<float> MultiplyAdd(
619619
return va + (vm0 * vm1);
620620
}
621621

622-
/// <summary>
623-
/// Multiplies two <see cref="Vector256{Single}"/> values and subtracts a third from the product.
624-
/// TODO: Fix. The arguments are in a different order to the FMA intrinsic: <paramref name="vs"/> is first here but is passed last to <c>Fma.MultiplySubtract</c>.
625-
/// </summary>
626-
/// <remarks>ret = (vm0 * vm1) - vs</remarks>
627-
/// <param name="vs">The vector to subtract from the product of <paramref name="vm0"/> and <paramref name="vm1"/>.</param>
628-
/// <param name="vm0">The first vector to multiply.</param>
629-
/// <param name="vm1">The second vector to multiply.</param>
630-
/// <returns>A <see cref="Vector256{Single}"/> containing <c>(vm0 * vm1) - vs</c>.</returns>
631-
[MethodImpl(InliningOptions.ShortMethod)]
632-
public static Vector256<float> MultiplySubtract(
633-
Vector256<float> vs,
634-
Vector256<float> vm0,
635-
Vector256<float> vm1)
636-
{
637-
if (Fma.IsSupported)
638-
{
639-
return Fma.MultiplySubtract(vm1, vm0, vs);
640-
}
641-
642-
return Avx.Subtract(Avx.Multiply(vm0, vm1), vs);
643-
}
644-
645622
/// <summary>
646623
/// Performs a multiplication and a negated addition of the <see cref="Vector256{Single}"/>.
647624
/// </summary>

src/ImageSharp/Common/Helpers/Vector256Utilities.cs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,28 @@ public static Vector256<float> MultiplyAdd(
140140
return va + (vm0 * vm1);
141141
}
142142

143+
/// <summary>
144+
/// Multiplies two <see cref="Vector256{Single}"/> values and subtracts a third from the product.
145+
/// </summary>
146+
/// <remarks>ret = (vm0 * vm1) - vs. Note that <paramref name="vs"/> is the first argument here but is passed last to <c>Fma.MultiplySubtract</c>.</remarks>
147+
/// <param name="vs">The vector to subtract from the product of <paramref name="vm0"/> and <paramref name="vm1"/>.</param>
148+
/// <param name="vm0">The first vector to multiply.</param>
149+
/// <param name="vm1">The second vector to multiply.</param>
150+
/// <returns>A <see cref="Vector256{Single}"/> containing <c>(vm0 * vm1) - vs</c>.</returns>
151+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
152+
public static Vector256<float> MultiplySubtract(
153+
Vector256<float> vs,
154+
Vector256<float> vm0,
155+
Vector256<float> vm1)
156+
{
157+
if (Fma.IsSupported)
158+
{
159+
return Fma.MultiplySubtract(vm1, vm0, vs);
160+
}
161+
162+
return (vm0 * vm1) - vs;
163+
}
164+
143165
/// <summary>
144166
/// Packs signed 32-bit integers to signed 16-bit integers and saturates.
145167
/// </summary>

src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -211,10 +211,10 @@ public nint GetLastNonZeroIndex()
211211
}
212212

213213
/// <summary>
214-
/// Transpose the block inplace.
214+
/// Transpose the block in place.
215215
/// </summary>
216216
[MethodImpl(InliningOptions.ShortMethod)]
217-
public void TransposeInplace()
217+
public void TransposeInPlace()
218218
{
219219
ref short elemRef = ref Unsafe.As<Block8x8, short>(ref this);
220220

src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs

Lines changed: 0 additions & 142 deletions
This file was deleted.
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
// Copyright (c) Six Labors.
2+
// Licensed under the Six Labors Split License.
3+
4+
using System.Runtime.Intrinsics;
5+
using SixLabors.ImageSharp.Common.Helpers;
6+
7+
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
8+
9+
internal static partial class FloatingPointDCT
10+
{
11+
/// <summary>
12+
/// Applies the floating point forward DCT (FDCT) to an 8x8 block in place using <see cref="Vector256{T}"/> operations.
13+
/// </summary>
14+
/// <param name="block">The block to transform; it is overwritten with the result.</param>
15+
private static void FDCT8x8_Vector256(ref Block8x8F block)
16+
{
17+
DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to execute this operation.");
18+
19+
// First pass - process columns
20+
FDCT8x8_1D_Vector256(ref block);
21+
22+
// Second pass - process rows (transpose, then reuse the same 1D routine)
23+
block.TransposeInPlace();
24+
FDCT8x8_1D_Vector256(ref block);
25+
26+
// Applies the 1D floating point FDCT in place over the block's eight vectors V256_0..V256_7
27+
static void FDCT8x8_1D_Vector256(ref Block8x8F block)
28+
{
29+
Vector256<float> tmp0 = block.V256_0 + block.V256_7;
30+
Vector256<float> tmp7 = block.V256_0 - block.V256_7;
31+
Vector256<float> tmp1 = block.V256_1 + block.V256_6;
32+
Vector256<float> tmp6 = block.V256_1 - block.V256_6;
33+
Vector256<float> tmp2 = block.V256_2 + block.V256_5;
34+
Vector256<float> tmp5 = block.V256_2 - block.V256_5;
35+
Vector256<float> tmp3 = block.V256_3 + block.V256_4;
36+
Vector256<float> tmp4 = block.V256_3 - block.V256_4;
37+
38+
// Even part: built from the sums tmp0..tmp3, writes V256_0, V256_4, V256_2 and V256_6
39+
Vector256<float> tmp10 = tmp0 + tmp3;
40+
Vector256<float> tmp13 = tmp0 - tmp3;
41+
Vector256<float> tmp11 = tmp1 + tmp2;
42+
Vector256<float> tmp12 = tmp1 - tmp2;
43+
44+
block.V256_0 = tmp10 + tmp11;
45+
block.V256_4 = tmp10 - tmp11;
46+
47+
Vector256<float> mm256_F_0_7071 = Vector256.Create(0.707106781f);
48+
Vector256<float> z1 = (tmp12 + tmp13) * mm256_F_0_7071;
49+
block.V256_2 = tmp13 + z1;
50+
block.V256_6 = tmp13 - z1;
51+
52+
// Odd part: built from the differences tmp4..tmp7 (tmp10..tmp12 are reused as scratch), writes V256_5, V256_3, V256_1 and V256_7
53+
tmp10 = tmp4 + tmp5;
54+
tmp11 = tmp5 + tmp6;
55+
tmp12 = tmp6 + tmp7;
56+
57+
Vector256<float> z5 = (tmp10 - tmp12) * Vector256.Create(0.382683433f); // mm256_F_0_3826
58+
Vector256<float> z2 = Vector256_.MultiplyAdd(z5, Vector256.Create(0.541196100f), tmp10); // mm256_F_0_5411
59+
Vector256<float> z4 = Vector256_.MultiplyAdd(z5, Vector256.Create(1.306562965f), tmp12); // mm256_F_1_3065
60+
Vector256<float> z3 = tmp11 * mm256_F_0_7071;
61+
62+
Vector256<float> z11 = tmp7 + z3;
63+
Vector256<float> z13 = tmp7 - z3;
64+
65+
block.V256_5 = z13 + z2;
66+
block.V256_3 = z13 - z2;
67+
block.V256_1 = z11 + z4;
68+
block.V256_7 = z11 - z4;
69+
}
70+
}
71+
72+
/// <summary>
73+
/// Applies the floating point inverse DCT (IDCT) to an 8x8 block in place using <see cref="Vector256{T}"/> operations.
74+
/// </summary>
75+
/// <param name="transposedBlock">The transposed input block; it is overwritten with the result.</param>
76+
private static void IDCT8x8_Vector256(ref Block8x8F transposedBlock)
77+
{
78+
DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to execute this operation.");
79+
80+
// First pass - process columns
81+
IDCT8x8_1D_Vector256(ref transposedBlock);
82+
83+
// Second pass - process rows (transpose, then reuse the same 1D routine)
84+
transposedBlock.TransposeInPlace();
85+
IDCT8x8_1D_Vector256(ref transposedBlock);
86+
87+
// Applies the 1D floating point IDCT in place over the block's eight vectors V256_0..V256_7
88+
static void IDCT8x8_1D_Vector256(ref Block8x8F block)
89+
{
90+
// Even part: reads V256_0, V256_2, V256_4 and V256_6
91+
Vector256<float> tmp0 = block.V256_0;
92+
Vector256<float> tmp1 = block.V256_2;
93+
Vector256<float> tmp2 = block.V256_4;
94+
Vector256<float> tmp3 = block.V256_6;
95+
96+
Vector256<float> z5 = tmp0;
97+
Vector256<float> tmp10 = z5 + tmp2;
98+
Vector256<float> tmp11 = z5 - tmp2;
99+
100+
Vector256<float> mm256_F_1_4142 = Vector256.Create(1.414213562f);
101+
Vector256<float> tmp13 = tmp1 + tmp3;
102+
Vector256<float> tmp12 = Vector256_.MultiplySubtract(tmp13, tmp1 - tmp3, mm256_F_1_4142);
103+
104+
tmp0 = tmp10 + tmp13;
105+
tmp3 = tmp10 - tmp13;
106+
tmp1 = tmp11 + tmp12;
107+
tmp2 = tmp11 - tmp12;
108+
109+
// Odd part: reads V256_1, V256_3, V256_5 and V256_7
110+
Vector256<float> tmp4 = block.V256_1;
111+
Vector256<float> tmp5 = block.V256_3;
112+
Vector256<float> tmp6 = block.V256_5;
113+
Vector256<float> tmp7 = block.V256_7;
114+
115+
Vector256<float> z13 = tmp6 + tmp5;
116+
Vector256<float> z10 = tmp6 - tmp5;
117+
Vector256<float> z11 = tmp4 + tmp7;
118+
Vector256<float> z12 = tmp4 - tmp7;
119+
120+
tmp7 = z11 + z13;
121+
tmp11 = (z11 - z13) * mm256_F_1_4142;
122+
123+
z5 = (z10 + z12) * Vector256.Create(1.847759065f); // mm256_F_1_8477
124+
125+
tmp10 = Vector256_.MultiplyAdd(z5, z12, Vector256.Create(-1.082392200f)); // mm256_F_n1_0823
126+
tmp12 = Vector256_.MultiplyAdd(z5, z10, Vector256.Create(-2.613125930f)); // mm256_F_n2_6131
127+
128+
tmp6 = tmp12 - tmp7;
129+
tmp5 = tmp11 - tmp6;
130+
tmp4 = tmp10 - tmp5;
131+
132+
block.V256_0 = tmp0 + tmp7;
133+
block.V256_7 = tmp0 - tmp7;
134+
block.V256_1 = tmp1 + tmp6;
135+
block.V256_6 = tmp1 - tmp6;
136+
block.V256_2 = tmp2 + tmp5;
137+
block.V256_5 = tmp2 - tmp5;
138+
block.V256_3 = tmp3 + tmp4;
139+
block.V256_4 = tmp3 - tmp4;
140+
}
141+
}
142+
}

0 commit comments

Comments
 (0)