Skip to content

Commit 30bdc29

Browse files
Migrate from Sse to general Vector128 for ZigZag
1 parent 5125a04 commit 30bdc29

File tree

4 files changed

+90
-72
lines changed

4 files changed

+90
-72
lines changed

src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33

44
using System.Runtime.CompilerServices;
55
using System.Runtime.Intrinsics;
6-
using System.Runtime.Intrinsics.X86;
76
using SixLabors.ImageSharp.Common.Helpers;
87

98
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
@@ -45,22 +44,20 @@ public void NormalizeColorsAndRoundInPlaceVector128(float maximum)
4544
private static Vector128<float> NormalizeAndRoundVector128(Vector128<float> value, Vector128<float> off, Vector128<float> max)
4645
=> Vector128_.RoundToNearestInteger(Vector128_.Clamp(value + off, Vector128<float>.Zero, max));
4746

48-
private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
47+
private static void MultiplyIntoInt16Vector128(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
4948
{
50-
DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!");
49+
DebugGuard.IsTrue(Vector128.IsHardwareAccelerated, "Vector128 support is required to run this operation!");
5150

5251
ref Vector128<float> aBase = ref Unsafe.As<Block8x8F, Vector128<float>>(ref a);
5352
ref Vector128<float> bBase = ref Unsafe.As<Block8x8F, Vector128<float>>(ref b);
54-
5553
ref Vector128<short> destBase = ref Unsafe.As<Block8x8, Vector128<short>>(ref dest);
5654

57-
// TODO: We can use the v128 utilities for this.
5855
for (nuint i = 0; i < 16; i += 2)
5956
{
60-
Vector128<int> left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
61-
Vector128<int> right = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
57+
Vector128<int> left = Vector128_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 0) * Unsafe.Add(ref bBase, i + 0));
58+
Vector128<int> right = Vector128_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 1) * Unsafe.Add(ref bBase, i + 1));
6259

63-
Unsafe.Add(ref destBase, i / 2) = Sse2.PackSignedSaturate(left, right);
60+
Unsafe.Add(ref destBase, i / 2) = Vector128_.PackSignedSaturate(left, right);
6461
}
6562
}
6663
}

src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F
121121
}
122122
}
123123

124-
private void TransposeInplace_Avx()
124+
private void TransposeInPlace_Avx()
125125
{
126126
// https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
127127
Vector256<float> r0 = Avx.InsertVector128(

src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -282,10 +282,10 @@ public static void Quantize(ref Block8x8F block, ref Block8x8 dest, ref Block8x8
282282
MultiplyIntoInt16_Avx2(ref block, ref qt, ref dest);
283283
ZigZag.ApplyTransposingZigZagOrderingAvx2(ref dest);
284284
}
285-
else if (Ssse3.IsSupported)
285+
else if (Vector128.IsHardwareAccelerated)
286286
{
287-
MultiplyIntoInt16_Sse2(ref block, ref qt, ref dest);
288-
ZigZag.ApplyTransposingZigZagOrderingSsse3(ref dest);
287+
MultiplyIntoInt16Vector128(ref block, ref qt, ref dest);
288+
ZigZag.ApplyTransposingZigZagOrderingVector128(ref dest);
289289
}
290290
else
291291
{
@@ -387,7 +387,7 @@ public void RoundInPlace()
387387
[MethodImpl(InliningOptions.ShortMethod)]
388388
public void LoadFrom(ref Block8x8 source)
389389
{
390-
if (SimdUtils.HasVector8)
390+
if (Avx2.IsSupported)
391391
{
392392
this.LoadFromInt16ExtendedAvx2(ref source);
393393
return;
@@ -483,6 +483,7 @@ public void LoadFromInt16Scalar(ref Block8x8 source)
483483
/// <param name="value">Value to compare to.</param>
484484
public bool EqualsToScalar(int value)
485485
{
486+
// TODO: Can we provide a Vector128 implementation for this?
486487
if (Avx2.IsSupported)
487488
{
488489
const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111);
@@ -585,10 +586,11 @@ public void TransposeInPlace()
585586
{
586587
if (Avx.IsSupported)
587588
{
588-
this.TransposeInplace_Avx();
589+
this.TransposeInPlace_Avx();
589590
}
590591
else
591592
{
593+
// TODO: Can we provide a Vector128 implementation for this?
592594
this.TransposeInPlace_Scalar();
593595
}
594596
}

src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs

Lines changed: 77 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
// Copyright (c) Six Labors.
22
// Licensed under the Six Labors Split License.
33

4+
using System.Diagnostics;
5+
using System.Diagnostics.CodeAnalysis;
6+
using System.Runtime.CompilerServices;
47
using System.Runtime.InteropServices;
58
using System.Runtime.Intrinsics;
69
using System.Runtime.Intrinsics.X86;
@@ -17,11 +20,11 @@ internal static partial class ZigZag
1720
#pragma warning restore SA1309
1821

1922
/// <summary>
20-
/// Gets shuffle vectors for <see cref="ApplyTransposingZigZagOrderingSsse3"/>
23+
/// Gets shuffle vectors for <see cref="ApplyTransposingZigZagOrderingVector128"/>
2124
/// zig zag implementation.
2225
/// </summary>
23-
private static ReadOnlySpan<byte> SseShuffleMasks => new byte[]
24-
{
26+
private static ReadOnlySpan<byte> SseShuffleMasks =>
27+
[
2528
#pragma warning disable SA1515
2629
/* row0 - A0 B0 A1 A2 B1 C0 D0 C1 */
2730
// A
@@ -83,14 +86,14 @@ internal static partial class ZigZag
8386
// H
8487
_, _, _, _, _, _, _, _, 10, 11, 12, 13, _, _, 14, 15,
8588
#pragma warning restore SA1515
86-
};
89+
];
8790

8891
/// <summary>
8992
/// Gets shuffle vectors for <see cref="ApplyTransposingZigZagOrderingAvx2"/>
9093
/// zig zag implementation.
9194
/// </summary>
92-
private static ReadOnlySpan<byte> AvxShuffleMasks => new byte[]
93-
{
95+
private static ReadOnlySpan<byte> AvxShuffleMasks =>
96+
[
9497
#pragma warning disable SA1515
9598
/* 01 */
9699
// [cr] crln_01_AB_CD
@@ -138,15 +141,15 @@ internal static partial class ZigZag
138141
// (in) GH
139142
_, _, _, _, _, _, _, _, 0, 1, 10, 11, 12, 13, 2, 3, _, _, _, _, _, _, 0, 1, 6, 7, 8, 9, 2, 3, 10, 11,
140143
#pragma warning restore SA1515
141-
};
144+
];
142145

143146
/// <summary>
144-
/// Applies zig zag ordering for given 8x8 matrix using SSE cpu intrinsics.
147+
/// Applies zig-zag ordering to the given 8x8 matrix using <see cref="Vector128{T}"/> CPU intrinsics.
145148
/// </summary>
146149
/// <param name="block">Input matrix.</param>
147-
public static unsafe void ApplyTransposingZigZagOrderingSsse3(ref Block8x8 block)
150+
public static unsafe void ApplyTransposingZigZagOrderingVector128(ref Block8x8 block)
148151
{
149-
DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!");
152+
DebugGuard.IsTrue(Vector128.IsHardwareAccelerated, "Vector128 support is required to run this operation!");
150153

151154
fixed (byte* shuffleVectorsPtr = &MemoryMarshal.GetReference(SseShuffleMasks))
152155
{
@@ -160,68 +163,68 @@ public static unsafe void ApplyTransposingZigZagOrderingSsse3(ref Block8x8 block
160163
Vector128<byte> rowH = block.V7.AsByte();
161164

162165
// row0 - A0 B0 A1 A2 B1 C0 D0 C1
163-
Vector128<short> row0_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 0))).AsInt16();
164-
Vector128<short> row0_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 1))).AsInt16();
165-
Vector128<short> row0_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 2))).AsInt16();
166-
Vector128<short> row0 = Sse2.Or(Sse2.Or(row0_A, row0_B), row0_C);
167-
row0 = Sse2.Insert(row0.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 0), 6).AsInt16();
166+
Vector128<short> row0_A = ZShuffle(rowA, Vector128.Load(shuffleVectorsPtr + (16 * 0))).AsInt16();
167+
Vector128<short> row0_B = ZShuffle(rowB, Vector128.Load(shuffleVectorsPtr + (16 * 1))).AsInt16();
168+
Vector128<short> row0_C = ZShuffle(rowC, Vector128.Load(shuffleVectorsPtr + (16 * 2))).AsInt16();
169+
Vector128<short> row0 = row0_A | row0_B | row0_C;
170+
row0 = row0.AsUInt16().WithElement(6, rowD.AsUInt16().GetElement(0)).AsInt16();
168171

169172
// row1 - B2 A3 A4 B3 C2 D1 E0 F0
170-
Vector128<short> row1_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 3))).AsInt16();
171-
Vector128<short> row1_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 4))).AsInt16();
172-
Vector128<short> row1 = Sse2.Or(row1_A, row1_B);
173-
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 2), 4).AsInt16();
174-
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 1), 5).AsInt16();
175-
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 0), 6).AsInt16();
176-
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 0), 7).AsInt16();
173+
Vector128<short> row1_A = ZShuffle(rowA, Vector128.Load(shuffleVectorsPtr + (16 * 3))).AsInt16();
174+
Vector128<short> row1_B = ZShuffle(rowB, Vector128.Load(shuffleVectorsPtr + (16 * 4))).AsInt16();
175+
Vector128<short> row1 = row1_A | row1_B;
176+
row1 = row1.AsUInt16().WithElement(4, rowC.AsUInt16().GetElement(2)).AsInt16();
177+
row1 = row1.AsUInt16().WithElement(5, rowD.AsUInt16().GetElement(1)).AsInt16();
178+
row1 = row1.AsUInt16().WithElement(6, rowE.AsUInt16().GetElement(0)).AsInt16();
179+
row1 = row1.AsUInt16().WithElement(7, rowF.AsUInt16().GetElement(0)).AsInt16();
177180

178181
// row2 - E1 D2 C3 B4 A5 A6 B5 C4
179-
Vector128<short> row2_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 5))).AsInt16();
180-
Vector128<short> row2_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 6))).AsInt16();
181-
Vector128<short> row2_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 7))).AsInt16();
182-
Vector128<short> row2 = Sse2.Or(Sse2.Or(row2_A, row2_B), row2_C);
183-
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 2), 1).AsInt16();
184-
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 1), 0).AsInt16();
182+
Vector128<short> row2_A = ZShuffle(rowA, Vector128.Load(shuffleVectorsPtr + (16 * 5))).AsInt16();
183+
Vector128<short> row2_B = ZShuffle(rowB, Vector128.Load(shuffleVectorsPtr + (16 * 6))).AsInt16();
184+
Vector128<short> row2_C = ZShuffle(rowC, Vector128.Load(shuffleVectorsPtr + (16 * 7))).AsInt16();
185+
Vector128<short> row2 = row2_A | row2_B | row2_C;
186+
row2 = row2.AsUInt16().WithElement(1, rowD.AsUInt16().GetElement(2)).AsInt16();
187+
row2 = row2.AsUInt16().WithElement(0, rowE.AsUInt16().GetElement(1)).AsInt16();
185188

186189
// row3 - D3 E2 F1 G0 H0 G1 F2 E3
187-
Vector128<short> row3_E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 8))).AsInt16();
188-
Vector128<short> row3_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 9))).AsInt16();
189-
Vector128<short> row3_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 10))).AsInt16();
190-
Vector128<short> row3 = Sse2.Or(Sse2.Or(row3_E, row3_F), row3_G);
191-
row3 = Sse2.Insert(row3.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 3), 0).AsInt16();
192-
row3 = Sse2.Insert(row3.AsUInt16(), Sse2.Extract(rowH.AsUInt16(), 0), 4).AsInt16();
190+
Vector128<short> row3_E = ZShuffle(rowE, Vector128.Load(shuffleVectorsPtr + (16 * 8))).AsInt16();
191+
Vector128<short> row3_F = ZShuffle(rowF, Vector128.Load(shuffleVectorsPtr + (16 * 9))).AsInt16();
192+
Vector128<short> row3_G = ZShuffle(rowG, Vector128.Load(shuffleVectorsPtr + (16 * 10))).AsInt16();
193+
Vector128<short> row3 = row3_E | row3_F | row3_G;
194+
row3 = row3.AsUInt16().WithElement(0, rowD.AsUInt16().GetElement(3)).AsInt16();
195+
row3 = row3.AsUInt16().WithElement(4, rowH.AsUInt16().GetElement(0)).AsInt16();
193196

194197
// row4 - D4 C5 B6 A7 B7 C6 D5 E4
195-
Vector128<short> row4_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 11))).AsInt16();
196-
Vector128<short> row4_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 12))).AsInt16();
197-
Vector128<short> row4_D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 13))).AsInt16();
198-
Vector128<short> row4 = Sse2.Or(Sse2.Or(row4_B, row4_C), row4_D);
199-
row4 = Sse2.Insert(row4.AsUInt16(), Sse2.Extract(rowA.AsUInt16(), 7), 3).AsInt16();
200-
row4 = Sse2.Insert(row4.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 4), 7).AsInt16();
198+
Vector128<short> row4_B = ZShuffle(rowB, Vector128.Load(shuffleVectorsPtr + (16 * 11))).AsInt16();
199+
Vector128<short> row4_C = ZShuffle(rowC, Vector128.Load(shuffleVectorsPtr + (16 * 12))).AsInt16();
200+
Vector128<short> row4_D = ZShuffle(rowD, Vector128.Load(shuffleVectorsPtr + (16 * 13))).AsInt16();
201+
Vector128<short> row4 = row4_B | row4_C | row4_D;
202+
row4 = row4.AsUInt16().WithElement(3, rowA.AsUInt16().GetElement(7)).AsInt16();
203+
row4 = row4.AsUInt16().WithElement(7, rowE.AsUInt16().GetElement(4)).AsInt16();
201204

202205
// row5 - F3 G2 H1 H2 G3 F4 E5 D6
203-
Vector128<short> row5_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 14))).AsInt16();
204-
Vector128<short> row5_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 15))).AsInt16();
205-
Vector128<short> row5_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 16))).AsInt16();
206-
Vector128<short> row5 = Sse2.Or(Sse2.Or(row5_F, row5_G), row5_H);
207-
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 6), 7).AsInt16();
208-
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 5), 6).AsInt16();
206+
Vector128<short> row5_F = ZShuffle(rowF, Vector128.Load(shuffleVectorsPtr + (16 * 14))).AsInt16();
207+
Vector128<short> row5_G = ZShuffle(rowG, Vector128.Load(shuffleVectorsPtr + (16 * 15))).AsInt16();
208+
Vector128<short> row5_H = ZShuffle(rowH, Vector128.Load(shuffleVectorsPtr + (16 * 16))).AsInt16();
209+
Vector128<short> row5 = row5_F | row5_G | row5_H;
210+
row5 = row5.AsUInt16().WithElement(7, rowD.AsUInt16().GetElement(6)).AsInt16();
211+
row5 = row5.AsUInt16().WithElement(6, rowE.AsUInt16().GetElement(5)).AsInt16();
209212

210213
// row6 - C7 D7 E6 F5 G4 H3 H4 G5
211-
Vector128<short> row6_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 17))).AsInt16();
212-
Vector128<short> row6_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 18))).AsInt16();
213-
Vector128<short> row6 = Sse2.Or(row6_G, row6_H);
214-
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 7), 0).AsInt16();
215-
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 7), 1).AsInt16();
216-
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 6), 2).AsInt16();
217-
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 5), 3).AsInt16();
214+
Vector128<short> row6_G = ZShuffle(rowG, Vector128.Load(shuffleVectorsPtr + (16 * 17))).AsInt16();
215+
Vector128<short> row6_H = ZShuffle(rowH, Vector128.Load(shuffleVectorsPtr + (16 * 18))).AsInt16();
216+
Vector128<short> row6 = row6_G | row6_H;
217+
row6 = row6.AsUInt16().WithElement(0, rowC.AsUInt16().GetElement(7)).AsInt16();
218+
row6 = row6.AsUInt16().WithElement(1, rowD.AsUInt16().GetElement(7)).AsInt16();
219+
row6 = row6.AsUInt16().WithElement(2, rowE.AsUInt16().GetElement(6)).AsInt16();
220+
row6 = row6.AsUInt16().WithElement(3, rowF.AsUInt16().GetElement(5)).AsInt16();
218221

219222
// row7 - F6 E7 F7 G6 H5 H6 G7 H7
220-
Vector128<short> row7_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 19))).AsInt16();
221-
Vector128<short> row7_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 20))).AsInt16();
222-
Vector128<short> row7_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 21))).AsInt16();
223-
Vector128<short> row7 = Sse2.Or(Sse2.Or(row7_F, row7_G), row7_H);
224-
row7 = Sse2.Insert(row7.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 7), 1).AsInt16();
223+
Vector128<short> row7_F = ZShuffle(rowF, Vector128.Load(shuffleVectorsPtr + (16 * 19))).AsInt16();
224+
Vector128<short> row7_G = ZShuffle(rowG, Vector128.Load(shuffleVectorsPtr + (16 * 20))).AsInt16();
225+
Vector128<short> row7_H = ZShuffle(rowH, Vector128.Load(shuffleVectorsPtr + (16 * 21))).AsInt16();
226+
Vector128<short> row7 = row7_F | row7_G | row7_H;
227+
row7 = row7.AsUInt16().WithElement(1, rowE.AsUInt16().GetElement(7)).AsInt16();
225228

226229
block.V0 = row0;
227230
block.V1 = row1;
@@ -300,4 +303,20 @@ public static unsafe void ApplyTransposingZigZagOrderingAvx2(ref Block8x8 block)
300303
block.V67 = row67.AsInt16();
301304
}
302305
}
306+
307+
/// <summary>
/// Shuffles the bytes of <paramref name="source"/> using the per-byte indices in <paramref name="mask"/>,
/// picking <see cref="Ssse3.Shuffle(Vector128{byte}, Vector128{byte})"/> when SSSE3 is supported and
/// falling back to <see cref="Vector128.Shuffle(Vector128{byte}, Vector128{byte})"/> otherwise.
/// </summary>
/// <param name="source">The vector whose bytes are rearranged.</param>
/// <param name="mask">The vector of byte indices that selects which source byte lands in each lane.</param>
/// <returns>The shuffled vector.</returns>
// NOTE(review): the two paths are only interchangeable for mask bytes that both treat the same way.
// Ssse3.Shuffle zeroes a lane only when the mask byte's high bit is set (otherwise it uses the low 4 bits),
// while Vector128.Shuffle zeroes a lane for any out-of-range index. Presumably the shuffle tables only
// contain 0..15 or a high-bit sentinel - confirm against the `_` constant used in the mask tables.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
308+
private static Vector128<byte> ZShuffle(Vector128<byte> source, Vector128<byte> mask)
309+
{
310+
// On x64 we call the SSSE3 shuffle intrinsic directly to avoid the additional instructions
// of the portable path (3 instructions vs 1).
311+
if (Ssse3.IsSupported)
312+
{
313+
return Ssse3.Shuffle(source, mask);
314+
}
315+
316+
// On ARM and WASM the portable Vector128.Shuffle is expected to produce optimal codegen.
317+
return Vector128.Shuffle(source, mask);
318+
}
319+
320+
/// <summary>
/// Throws an <see cref="UnreachableException"/>. Annotated with <see cref="DoesNotReturnAttribute"/>
/// so flow analysis treats any call to it as terminating.
/// </summary>
[DoesNotReturn]
321+
private static void ThrowUnreachableException() => throw new UnreachableException();
303322
}

0 commit comments

Comments
 (0)