1
1
// Copyright (c) Six Labors.
2
2
// Licensed under the Six Labors Split License.
3
3
4
+ using System . Diagnostics ;
5
+ using System . Diagnostics . CodeAnalysis ;
6
+ using System . Runtime . CompilerServices ;
4
7
using System . Runtime . InteropServices ;
5
8
using System . Runtime . Intrinsics ;
6
9
using System . Runtime . Intrinsics . X86 ;
@@ -17,11 +20,11 @@ internal static partial class ZigZag
17
20
#pragma warning restore SA1309
18
21
19
22
/// <summary>
20
- /// Gets shuffle vectors for <see cref="ApplyTransposingZigZagOrderingSsse3 "/>
23
+ /// Gets shuffle vectors for <see cref="ApplyTransposingZigZagOrderingVector128"/>
21
24
/// zig zag implementation.
22
25
/// </summary>
23
- private static ReadOnlySpan < byte > SseShuffleMasks => new byte [ ]
24
- {
26
+ private static ReadOnlySpan < byte > SseShuffleMasks =>
27
+ [
25
28
#pragma warning disable SA1515
26
29
/* row0 - A0 B0 A1 A2 B1 C0 D0 C1 */
27
30
// A
@@ -83,14 +86,14 @@ internal static partial class ZigZag
83
86
// H
84
87
_ , _ , _ , _ , _ , _ , _ , _ , 10 , 11 , 12 , 13 , _ , _ , 14 , 15 ,
85
88
#pragma warning restore SA1515
86
- } ;
89
+ ] ;
87
90
88
91
/// <summary>
89
92
/// Gets shuffle vectors for <see cref="ApplyTransposingZigZagOrderingAvx2"/>
90
93
/// zig zag implementation.
91
94
/// </summary>
92
- private static ReadOnlySpan < byte > AvxShuffleMasks => new byte [ ]
93
- {
95
+ private static ReadOnlySpan < byte > AvxShuffleMasks =>
96
+ [
94
97
#pragma warning disable SA1515
95
98
/* 01 */
96
99
// [cr] crln_01_AB_CD
@@ -138,15 +141,15 @@ internal static partial class ZigZag
138
141
// (in) GH
139
142
_ , _ , _ , _ , _ , _ , _ , _ , 0 , 1 , 10 , 11 , 12 , 13 , 2 , 3 , _ , _ , _ , _ , _ , _ , 0 , 1 , 6 , 7 , 8 , 9 , 2 , 3 , 10 , 11 ,
140
143
#pragma warning restore SA1515
141
- } ;
144
+ ] ;
142
145
143
146
/// <summary>
144
- /// Applies zig zag ordering for given 8x8 matrix using SSE cpu intrinsics.
147
+ /// Applies zig zag ordering for given 8x8 matrix using <see cref="Vector128{T}"/> cpu intrinsics.
145
148
/// </summary>
146
149
/// <param name="block">Input matrix.</param>
147
- public static unsafe void ApplyTransposingZigZagOrderingSsse3 ( ref Block8x8 block )
150
+ public static unsafe void ApplyTransposingZigZagOrderingVector128 ( ref Block8x8 block )
148
151
{
149
- DebugGuard . IsTrue ( Ssse3 . IsSupported , "Ssse3 support is required to run this operation!" ) ;
152
+ DebugGuard . IsTrue ( Vector128 . IsHardwareAccelerated , "Vector128 support is required to run this operation!" ) ;
150
153
151
154
fixed ( byte * shuffleVectorsPtr = & MemoryMarshal . GetReference ( SseShuffleMasks ) )
152
155
{
@@ -160,68 +163,68 @@ public static unsafe void ApplyTransposingZigZagOrderingSsse3(ref Block8x8 block
160
163
Vector128 < byte > rowH = block . V7 . AsByte ( ) ;
161
164
162
165
// row0 - A0 B0 A1 A2 B1 C0 D0 C1
163
- Vector128 < short > row0_A = Ssse3 . Shuffle ( rowA , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 0 ) ) ) . AsInt16 ( ) ;
164
- Vector128 < short > row0_B = Ssse3 . Shuffle ( rowB , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 1 ) ) ) . AsInt16 ( ) ;
165
- Vector128 < short > row0_C = Ssse3 . Shuffle ( rowC , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 2 ) ) ) . AsInt16 ( ) ;
166
- Vector128 < short > row0 = Sse2 . Or ( Sse2 . Or ( row0_A , row0_B ) , row0_C ) ;
167
- row0 = Sse2 . Insert ( row0 . AsUInt16 ( ) , Sse2 . Extract ( rowD . AsUInt16 ( ) , 0 ) , 6 ) . AsInt16 ( ) ;
166
+ Vector128 < short > row0_A = ZShuffle ( rowA , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 0 ) ) ) . AsInt16 ( ) ;
167
+ Vector128 < short > row0_B = ZShuffle ( rowB , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 1 ) ) ) . AsInt16 ( ) ;
168
+ Vector128 < short > row0_C = ZShuffle ( rowC , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 2 ) ) ) . AsInt16 ( ) ;
169
+ Vector128 < short > row0 = row0_A | row0_B | row0_C ;
170
+ row0 = row0 . AsUInt16 ( ) . WithElement ( 6 , rowD . AsUInt16 ( ) . GetElement ( 0 ) ) . AsInt16 ( ) ;
168
171
169
172
// row1 - B2 A3 A4 B3 C2 D1 E0 F0
170
- Vector128 < short > row1_A = Ssse3 . Shuffle ( rowA , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 3 ) ) ) . AsInt16 ( ) ;
171
- Vector128 < short > row1_B = Ssse3 . Shuffle ( rowB , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 4 ) ) ) . AsInt16 ( ) ;
172
- Vector128 < short > row1 = Sse2 . Or ( row1_A , row1_B ) ;
173
- row1 = Sse2 . Insert ( row1 . AsUInt16 ( ) , Sse2 . Extract ( rowC . AsUInt16 ( ) , 2 ) , 4 ) . AsInt16 ( ) ;
174
- row1 = Sse2 . Insert ( row1 . AsUInt16 ( ) , Sse2 . Extract ( rowD . AsUInt16 ( ) , 1 ) , 5 ) . AsInt16 ( ) ;
175
- row1 = Sse2 . Insert ( row1 . AsUInt16 ( ) , Sse2 . Extract ( rowE . AsUInt16 ( ) , 0 ) , 6 ) . AsInt16 ( ) ;
176
- row1 = Sse2 . Insert ( row1 . AsUInt16 ( ) , Sse2 . Extract ( rowF . AsUInt16 ( ) , 0 ) , 7 ) . AsInt16 ( ) ;
173
+ Vector128 < short > row1_A = ZShuffle ( rowA , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 3 ) ) ) . AsInt16 ( ) ;
174
+ Vector128 < short > row1_B = ZShuffle ( rowB , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 4 ) ) ) . AsInt16 ( ) ;
175
+ Vector128 < short > row1 = row1_A | row1_B ;
176
+ row1 = row1 . AsUInt16 ( ) . WithElement ( 4 , rowC . AsUInt16 ( ) . GetElement ( 2 ) ) . AsInt16 ( ) ;
177
+ row1 = row1 . AsUInt16 ( ) . WithElement ( 5 , rowD . AsUInt16 ( ) . GetElement ( 1 ) ) . AsInt16 ( ) ;
178
+ row1 = row1 . AsUInt16 ( ) . WithElement ( 6 , rowE . AsUInt16 ( ) . GetElement ( 0 ) ) . AsInt16 ( ) ;
179
+ row1 = row1 . AsUInt16 ( ) . WithElement ( 7 , rowF . AsUInt16 ( ) . GetElement ( 0 ) ) . AsInt16 ( ) ;
177
180
178
181
// row2 - E1 D2 C3 B4 A5 A6 B5 C4
179
- Vector128 < short > row2_A = Ssse3 . Shuffle ( rowA , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 5 ) ) ) . AsInt16 ( ) ;
180
- Vector128 < short > row2_B = Ssse3 . Shuffle ( rowB , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 6 ) ) ) . AsInt16 ( ) ;
181
- Vector128 < short > row2_C = Ssse3 . Shuffle ( rowC , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 7 ) ) ) . AsInt16 ( ) ;
182
- Vector128 < short > row2 = Sse2 . Or ( Sse2 . Or ( row2_A , row2_B ) , row2_C ) ;
183
- row2 = Sse2 . Insert ( row2 . AsUInt16 ( ) , Sse2 . Extract ( rowD . AsUInt16 ( ) , 2 ) , 1 ) . AsInt16 ( ) ;
184
- row2 = Sse2 . Insert ( row2 . AsUInt16 ( ) , Sse2 . Extract ( rowE . AsUInt16 ( ) , 1 ) , 0 ) . AsInt16 ( ) ;
182
+ Vector128 < short > row2_A = ZShuffle ( rowA , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 5 ) ) ) . AsInt16 ( ) ;
183
+ Vector128 < short > row2_B = ZShuffle ( rowB , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 6 ) ) ) . AsInt16 ( ) ;
184
+ Vector128 < short > row2_C = ZShuffle ( rowC , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 7 ) ) ) . AsInt16 ( ) ;
185
+ Vector128 < short > row2 = row2_A | row2_B | row2_C ;
186
+ row2 = row2 . AsUInt16 ( ) . WithElement ( 1 , rowD . AsUInt16 ( ) . GetElement ( 2 ) ) . AsInt16 ( ) ;
187
+ row2 = row2 . AsUInt16 ( ) . WithElement ( 0 , rowE . AsUInt16 ( ) . GetElement ( 1 ) ) . AsInt16 ( ) ;
185
188
186
189
// row3 - D3 E2 F1 G0 H0 G1 F2 E3
187
- Vector128 < short > row3_E = Ssse3 . Shuffle ( rowE , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 8 ) ) ) . AsInt16 ( ) ;
188
- Vector128 < short > row3_F = Ssse3 . Shuffle ( rowF , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 9 ) ) ) . AsInt16 ( ) ;
189
- Vector128 < short > row3_G = Ssse3 . Shuffle ( rowG , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 10 ) ) ) . AsInt16 ( ) ;
190
- Vector128 < short > row3 = Sse2 . Or ( Sse2 . Or ( row3_E , row3_F ) , row3_G ) ;
191
- row3 = Sse2 . Insert ( row3 . AsUInt16 ( ) , Sse2 . Extract ( rowD . AsUInt16 ( ) , 3 ) , 0 ) . AsInt16 ( ) ;
192
- row3 = Sse2 . Insert ( row3 . AsUInt16 ( ) , Sse2 . Extract ( rowH . AsUInt16 ( ) , 0 ) , 4 ) . AsInt16 ( ) ;
190
+ Vector128 < short > row3_E = ZShuffle ( rowE , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 8 ) ) ) . AsInt16 ( ) ;
191
+ Vector128 < short > row3_F = ZShuffle ( rowF , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 9 ) ) ) . AsInt16 ( ) ;
192
+ Vector128 < short > row3_G = ZShuffle ( rowG , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 10 ) ) ) . AsInt16 ( ) ;
193
+ Vector128 < short > row3 = row3_E | row3_F | row3_G ;
194
+ row3 = row3 . AsUInt16 ( ) . WithElement ( 0 , rowD . AsUInt16 ( ) . GetElement ( 3 ) ) . AsInt16 ( ) ;
195
+ row3 = row3 . AsUInt16 ( ) . WithElement ( 4 , rowH . AsUInt16 ( ) . GetElement ( 0 ) ) . AsInt16 ( ) ;
193
196
194
197
// row4 - D4 C5 B6 A7 B7 C6 D5 E4
195
- Vector128 < short > row4_B = Ssse3 . Shuffle ( rowB , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 11 ) ) ) . AsInt16 ( ) ;
196
- Vector128 < short > row4_C = Ssse3 . Shuffle ( rowC , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 12 ) ) ) . AsInt16 ( ) ;
197
- Vector128 < short > row4_D = Ssse3 . Shuffle ( rowD , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 13 ) ) ) . AsInt16 ( ) ;
198
- Vector128 < short > row4 = Sse2 . Or ( Sse2 . Or ( row4_B , row4_C ) , row4_D ) ;
199
- row4 = Sse2 . Insert ( row4 . AsUInt16 ( ) , Sse2 . Extract ( rowA . AsUInt16 ( ) , 7 ) , 3 ) . AsInt16 ( ) ;
200
- row4 = Sse2 . Insert ( row4 . AsUInt16 ( ) , Sse2 . Extract ( rowE . AsUInt16 ( ) , 4 ) , 7 ) . AsInt16 ( ) ;
198
+ Vector128 < short > row4_B = ZShuffle ( rowB , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 11 ) ) ) . AsInt16 ( ) ;
199
+ Vector128 < short > row4_C = ZShuffle ( rowC , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 12 ) ) ) . AsInt16 ( ) ;
200
+ Vector128 < short > row4_D = ZShuffle ( rowD , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 13 ) ) ) . AsInt16 ( ) ;
201
+ Vector128 < short > row4 = row4_B | row4_C | row4_D ;
202
+ row4 = row4 . AsUInt16 ( ) . WithElement ( 3 , rowA . AsUInt16 ( ) . GetElement ( 7 ) ) . AsInt16 ( ) ;
203
+ row4 = row4 . AsUInt16 ( ) . WithElement ( 7 , rowE . AsUInt16 ( ) . GetElement ( 4 ) ) . AsInt16 ( ) ;
201
204
202
205
// row5 - F3 G2 H1 H2 G3 F4 E5 D6
203
- Vector128 < short > row5_F = Ssse3 . Shuffle ( rowF , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 14 ) ) ) . AsInt16 ( ) ;
204
- Vector128 < short > row5_G = Ssse3 . Shuffle ( rowG , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 15 ) ) ) . AsInt16 ( ) ;
205
- Vector128 < short > row5_H = Ssse3 . Shuffle ( rowH , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 16 ) ) ) . AsInt16 ( ) ;
206
- Vector128 < short > row5 = Sse2 . Or ( Sse2 . Or ( row5_F , row5_G ) , row5_H ) ;
207
- row5 = Sse2 . Insert ( row5 . AsUInt16 ( ) , Sse2 . Extract ( rowD . AsUInt16 ( ) , 6 ) , 7 ) . AsInt16 ( ) ;
208
- row5 = Sse2 . Insert ( row5 . AsUInt16 ( ) , Sse2 . Extract ( rowE . AsUInt16 ( ) , 5 ) , 6 ) . AsInt16 ( ) ;
206
+ Vector128 < short > row5_F = ZShuffle ( rowF , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 14 ) ) ) . AsInt16 ( ) ;
207
+ Vector128 < short > row5_G = ZShuffle ( rowG , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 15 ) ) ) . AsInt16 ( ) ;
208
+ Vector128 < short > row5_H = ZShuffle ( rowH , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 16 ) ) ) . AsInt16 ( ) ;
209
+ Vector128 < short > row5 = row5_F | row5_G | row5_H ;
210
+ row5 = row5 . AsUInt16 ( ) . WithElement ( 7 , rowD . AsUInt16 ( ) . GetElement ( 6 ) ) . AsInt16 ( ) ;
211
+ row5 = row5 . AsUInt16 ( ) . WithElement ( 6 , rowE . AsUInt16 ( ) . GetElement ( 5 ) ) . AsInt16 ( ) ;
209
212
210
213
// row6 - C7 D7 E6 F5 G4 H3 H4 G5
211
- Vector128 < short > row6_G = Ssse3 . Shuffle ( rowG , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 17 ) ) ) . AsInt16 ( ) ;
212
- Vector128 < short > row6_H = Ssse3 . Shuffle ( rowH , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 18 ) ) ) . AsInt16 ( ) ;
213
- Vector128 < short > row6 = Sse2 . Or ( row6_G , row6_H ) ;
214
- row6 = Sse2 . Insert ( row6 . AsUInt16 ( ) , Sse2 . Extract ( rowC . AsUInt16 ( ) , 7 ) , 0 ) . AsInt16 ( ) ;
215
- row6 = Sse2 . Insert ( row6 . AsUInt16 ( ) , Sse2 . Extract ( rowD . AsUInt16 ( ) , 7 ) , 1 ) . AsInt16 ( ) ;
216
- row6 = Sse2 . Insert ( row6 . AsUInt16 ( ) , Sse2 . Extract ( rowE . AsUInt16 ( ) , 6 ) , 2 ) . AsInt16 ( ) ;
217
- row6 = Sse2 . Insert ( row6 . AsUInt16 ( ) , Sse2 . Extract ( rowF . AsUInt16 ( ) , 5 ) , 3 ) . AsInt16 ( ) ;
214
+ Vector128 < short > row6_G = ZShuffle ( rowG , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 17 ) ) ) . AsInt16 ( ) ;
215
+ Vector128 < short > row6_H = ZShuffle ( rowH , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 18 ) ) ) . AsInt16 ( ) ;
216
+ Vector128 < short > row6 = row6_G | row6_H ;
217
+ row6 = row6 . AsUInt16 ( ) . WithElement ( 0 , rowC . AsUInt16 ( ) . GetElement ( 7 ) ) . AsInt16 ( ) ;
218
+ row6 = row6 . AsUInt16 ( ) . WithElement ( 1 , rowD . AsUInt16 ( ) . GetElement ( 7 ) ) . AsInt16 ( ) ;
219
+ row6 = row6 . AsUInt16 ( ) . WithElement ( 2 , rowE . AsUInt16 ( ) . GetElement ( 6 ) ) . AsInt16 ( ) ;
220
+ row6 = row6 . AsUInt16 ( ) . WithElement ( 3 , rowF . AsUInt16 ( ) . GetElement ( 5 ) ) . AsInt16 ( ) ;
218
221
219
222
// row7 - F6 E7 F7 G6 H5 H6 G7 H7
220
- Vector128 < short > row7_F = Ssse3 . Shuffle ( rowF , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 19 ) ) ) . AsInt16 ( ) ;
221
- Vector128 < short > row7_G = Ssse3 . Shuffle ( rowG , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 20 ) ) ) . AsInt16 ( ) ;
222
- Vector128 < short > row7_H = Ssse3 . Shuffle ( rowH , Sse2 . LoadVector128 ( shuffleVectorsPtr + ( 16 * 21 ) ) ) . AsInt16 ( ) ;
223
- Vector128 < short > row7 = Sse2 . Or ( Sse2 . Or ( row7_F , row7_G ) , row7_H ) ;
224
- row7 = Sse2 . Insert ( row7 . AsUInt16 ( ) , Sse2 . Extract ( rowE . AsUInt16 ( ) , 7 ) , 1 ) . AsInt16 ( ) ;
223
+ Vector128 < short > row7_F = ZShuffle ( rowF , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 19 ) ) ) . AsInt16 ( ) ;
224
+ Vector128 < short > row7_G = ZShuffle ( rowG , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 20 ) ) ) . AsInt16 ( ) ;
225
+ Vector128 < short > row7_H = ZShuffle ( rowH , Vector128 . Load ( shuffleVectorsPtr + ( 16 * 21 ) ) ) . AsInt16 ( ) ;
226
+ Vector128 < short > row7 = row7_F | row7_G | row7_H ;
227
+ row7 = row7 . AsUInt16 ( ) . WithElement ( 1 , rowE . AsUInt16 ( ) . GetElement ( 7 ) ) . AsInt16 ( ) ;
225
228
226
229
block . V0 = row0 ;
227
230
block . V1 = row1 ;
@@ -300,4 +303,20 @@ public static unsafe void ApplyTransposingZigZagOrderingAvx2(ref Block8x8 block)
300
303
block . V67 = row67 . AsInt16 ( ) ;
301
304
}
302
305
}
306
+
307
/// <summary>
/// Rearranges the bytes of <paramref name="source"/> using the per-byte selectors in <paramref name="mask"/>.
/// </summary>
/// <param name="source">The vector whose bytes are picked from.</param>
/// <param name="mask">The vector of byte selectors that drives the shuffle.</param>
/// <returns>The shuffled vector.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<byte> ZShuffle(Vector128<byte> source, Vector128<byte> mask)

    // When SSSE3 is available its shuffle intrinsic is preferred; per the original author's
    // note it avoids additional instructions (3 vs 1) compared with the cross-platform helper.
    // Otherwise (e.g. ARM and WASM) the cross-platform Vector128.Shuffle is used.
    => Ssse3.IsSupported
        ? Ssse3.Shuffle(source, mask)
        : Vector128.Shuffle(source, mask);
319
+
320
/// <summary>
/// Throws an <see cref="UnreachableException"/>; this method never returns normally.
/// </summary>
/// <exception cref="UnreachableException">Always thrown.</exception>
[DoesNotReturn]
private static void ThrowUnreachableException()
{
    throw new UnreachableException();
}
303
322
}
0 commit comments