Skip to content

Commit ae0a51c

Browse files
Merge pull request #2359 from SixLabors/js/avx2-porter-duff
Enable Avx2 optimizations on Porter-Duff operations.
2 parents 3dd1d9d + 9752566 commit ae0a51c

File tree

14 files changed

+10042
-698
lines changed

14 files changed

+10042
-698
lines changed

src/ImageSharp/Common/Constants.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) Six Labors.
1+
// Copyright (c) Six Labors.
22
// Licensed under the Six Labors Split License.
33

44
namespace SixLabors.ImageSharp;

src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs

Lines changed: 30 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -532,7 +532,8 @@ private static void Shuffle4Slice3(
532532
}
533533

534534
/// <summary>
535-
/// Performs a multiplication and an addition of the <see cref="Vector256{T}"/>.
535+
/// Performs a multiplication and an addition of the <see cref="Vector256{Single}"/>.
536+
/// TODO: Fix. The arguments are in a different order to the FMA intrinsic.
536537
/// </summary>
537538
/// <remarks>ret = (vm0 * vm1) + va</remarks>
538539
/// <param name="va">The vector to add to the intermediate result.</param>
@@ -549,22 +550,21 @@ public static Vector256<float> MultiplyAdd(
549550
{
550551
return Fma.MultiplyAdd(vm1, vm0, va);
551552
}
552-
else
553-
{
554-
return Avx.Add(Avx.Multiply(vm0, vm1), va);
555-
}
553+
554+
return Avx.Add(Avx.Multiply(vm0, vm1), va);
556555
}
557556

558557
/// <summary>
559-
/// Performs a multiplication and a substraction of the <see cref="Vector256{T}"/>.
558+
/// Performs a multiplication and a subtraction of the <see cref="Vector256{Single}"/>.
559+
/// TODO: Fix. The arguments are in a different order to the FMA intrinsic.
560560
/// </summary>
561561
/// <remarks>ret = (vm0 * vm1) - vs</remarks>
562-
/// <param name="vs">The vector to substract from the intermediate result.</param>
562+
/// <param name="vs">The vector to subtract from the intermediate result.</param>
563563
/// <param name="vm0">The first vector to multiply.</param>
564564
/// <param name="vm1">The second vector to multiply.</param>
565565
/// <returns>The <see cref="Vector256{T}"/>.</returns>
566566
[MethodImpl(InliningOptions.ShortMethod)]
567-
public static Vector256<float> MultiplySubstract(
567+
public static Vector256<float> MultiplySubtract(
568568
in Vector256<float> vs,
569569
in Vector256<float> vm0,
570570
in Vector256<float> vm1)
@@ -573,10 +573,30 @@ public static Vector256<float> MultiplySubstract(
573573
{
574574
return Fma.MultiplySubtract(vm1, vm0, vs);
575575
}
576-
else
576+
577+
return Avx.Subtract(Avx.Multiply(vm0, vm1), vs);
578+
}
579+
580+
/// <summary>
581+
/// Performs a multiplication of the <see cref="Vector256{Single}"/> and adds the negated product to a third vector.
582+
/// </summary>
583+
/// <remarks>ret = c - (a * b)</remarks>
584+
/// <param name="a">The first vector to multiply.</param>
585+
/// <param name="b">The second vector to multiply.</param>
586+
/// <param name="c">The vector to add to the negated intermediate result.</param>
587+
/// <returns>The <see cref="Vector256{Single}"/>.</returns>
588+
[MethodImpl(InliningOptions.ShortMethod)]
589+
public static Vector256<float> MultiplyAddNegated(
590+
in Vector256<float> a,
591+
in Vector256<float> b,
592+
in Vector256<float> c)
593+
{
594+
if (Fma.IsSupported)
577595
{
578-
return Avx.Subtract(Avx.Multiply(vm0, vm1), vs);
596+
return Fma.MultiplyAddNegated(a, b, c);
579597
}
598+
599+
return Avx.Subtract(c, Avx.Multiply(a, b));
580600
}
581601

582602
/// <summary>

src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ static void IDCT8x8_1D_Avx(ref Block8x8F block)
9999

100100
var mm256_F_1_4142 = Vector256.Create(1.414213562f);
101101
Vector256<float> tmp13 = Avx.Add(tmp1, tmp3);
102-
Vector256<float> tmp12 = SimdUtils.HwIntrinsics.MultiplySubstract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142);
102+
Vector256<float> tmp12 = SimdUtils.HwIntrinsics.MultiplySubtract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142);
103103

104104
tmp0 = Avx.Add(tmp10, tmp13);
105105
tmp3 = Avx.Subtract(tmp10, tmp13);

src/ImageSharp/PixelFormats/PixelBlenders/DefaultPixelBlenders.Generated.cs

Lines changed: 7676 additions & 436 deletions
Large diffs are not rendered by default.

src/ImageSharp/PixelFormats/PixelBlenders/DefaultPixelBlenders.Generated.tt

Lines changed: 75 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@
1313

1414
// <auto-generated />
1515
using System.Numerics;
16+
using System.Runtime.CompilerServices;
17+
using System.Runtime.InteropServices;
18+
using System.Runtime.Intrinsics;
19+
using System.Runtime.Intrinsics.X86;
1620

1721
namespace SixLabors.ImageSharp.PixelFormats.PixelBlenders;
1822

@@ -86,18 +90,85 @@ var blenders = new []{
8690
protected override void BlendFunction(Span<Vector4> destination, ReadOnlySpan<Vector4> background, ReadOnlySpan<Vector4> source, float amount)
8791
{
8892
amount = Numerics.Clamp(amount, 0, 1);
89-
for (int i = 0; i < destination.Length; i++)
93+
94+
if (Avx2.IsSupported && destination.Length >= 2)
9095
{
91-
destination[i] = PorterDuffFunctions.<#=blender_composer#>(background[i], source[i], amount);
96+
// Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
97+
ref Vector256<float> destinationBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
98+
ref Vector256<float> destinationLast = ref Unsafe.Add(ref destinationBase, (IntPtr)((uint)destination.Length / 2u));
99+
100+
ref Vector256<float> backgroundBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(background));
101+
ref Vector256<float> sourceBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(source));
102+
Vector256<float> opacity = Vector256.Create(amount);
103+
104+
while (Unsafe.IsAddressLessThan(ref destinationBase, ref destinationLast))
105+
{
106+
destinationBase = PorterDuffFunctions.<#=blender_composer#>(backgroundBase, sourceBase, opacity);
107+
destinationBase = ref Unsafe.Add(ref destinationBase, 1);
108+
backgroundBase = ref Unsafe.Add(ref backgroundBase, 1);
109+
sourceBase = ref Unsafe.Add(ref sourceBase, 1);
110+
}
111+
112+
if (Numerics.Modulo2(destination.Length) != 0)
113+
{
114+
// Vector4 values are processed in pairs, so an odd length leaves exactly 1 trailing element.
115+
int i = destination.Length - 1;
116+
destination[i] = PorterDuffFunctions.<#=blender_composer#>(background[i], source[i], amount);
117+
}
118+
}
119+
else
120+
{
121+
for (int i = 0; i < destination.Length; i++)
122+
{
123+
destination[i] = PorterDuffFunctions.<#=blender_composer#>(background[i], source[i], amount);
124+
}
92125
}
93126
}
94127

95128
/// <inheritdoc />
96129
protected override void BlendFunction(Span<Vector4> destination, ReadOnlySpan<Vector4> background, ReadOnlySpan<Vector4> source, ReadOnlySpan<float> amount)
97130
{
98-
for (int i = 0; i < destination.Length; i++)
131+
if (Avx2.IsSupported && destination.Length >= 2)
132+
{
133+
// Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
134+
ref Vector256<float> destinationBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
135+
ref Vector256<float> destinationLast = ref Unsafe.Add(ref destinationBase, (IntPtr)((uint)destination.Length / 2u));
136+
137+
ref Vector256<float> backgroundBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(background));
138+
ref Vector256<float> sourceBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(source));
139+
ref float amountBase = ref MemoryMarshal.GetReference(amount);
140+
141+
Vector256<float> vOne = Vector256.Create(1F);
142+
143+
while (Unsafe.IsAddressLessThan(ref destinationBase, ref destinationLast))
144+
{
145+
// We need to create a Vector256<float> containing the current and next amount values
146+
// taking up each half of the Vector256<float> and then clamp them.
147+
Vector256<float> opacity = Vector256.Create(
148+
Vector128.Create(amountBase),
149+
Vector128.Create(Unsafe.Add(ref amountBase, 1)));
150+
opacity = Avx.Min(Avx.Max(Vector256<float>.Zero, opacity), vOne);
151+
152+
destinationBase = PorterDuffFunctions.<#=blender_composer#>(backgroundBase, sourceBase, opacity);
153+
destinationBase = ref Unsafe.Add(ref destinationBase, 1);
154+
backgroundBase = ref Unsafe.Add(ref backgroundBase, 1);
155+
sourceBase = ref Unsafe.Add(ref sourceBase, 1);
156+
amountBase = ref Unsafe.Add(ref amountBase, 2);
157+
}
158+
159+
if (Numerics.Modulo2(destination.Length) != 0)
160+
{
161+
// Vector4 values are processed in pairs, so an odd length leaves exactly 1 trailing element.
162+
int i = destination.Length - 1;
163+
destination[i] = PorterDuffFunctions.<#=blender_composer#>(background[i], source[i], Numerics.Clamp(amount[i], 0, 1F));
164+
}
165+
}
166+
else
99167
{
100-
destination[i] = PorterDuffFunctions.<#=blender_composer#>(background[i], source[i], Numerics.Clamp(amount[i], 0, 1));
168+
for (int i = 0; i < destination.Length; i++)
169+
{
170+
destination[i] = PorterDuffFunctions.<#=blender_composer#>(background[i], source[i], Numerics.Clamp(amount[i], 0, 1F));
171+
}
101172
}
102173
}
103174
}

0 commit comments

Comments
 (0)