@@ -13,33 +13,38 @@ internal static partial class SimdUtils
13
13
{
14
14
public static class HwIntrinsics
15
15
{
16
- public static ReadOnlySpan < byte > PermuteMaskDeinterleave8x32 => new byte [ ] { 0 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 7 , 0 , 0 , 0 } ;
16
// Builds a 256-bit constant from 32 one-byte literals and reinterprets it as Vector256<int>.
// Every 4-byte group has the form (k, 0, 0, 0), so read as little-endian 32-bit lanes the
// vector holds the indices 0, 4, 1, 5, 2, 6, 3, 7.
// Loaded as the `mask` local in the Vector256 path of NormalizedFloatToByteSaturate.
// NOTE(review): presumably consumed as a cross-lane 32-bit permute control - confirm at the call site.
+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ] // too much IL for JIT to inline, so give a hint
17
+ public static Vector256 < int > PermuteMaskDeinterleave8x32 ( ) => Vector256 . Create ( 0 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 7 , 0 , 0 , 0 ) . AsInt32 ( ) ;
17
18
18
- public static ReadOnlySpan < byte > PermuteMaskEvenOdd8x32 => new byte [ ] { 0 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 7 , 0 , 0 , 0 } ;
19
// Builds a 256-bit constant from 32 one-byte literals and reinterprets it as Vector256<uint>.
// Every 4-byte group has the form (k, 0, 0, 0), so read as little-endian 32-bit lanes the
// vector holds the indices 0, 2, 4, 6, 1, 3, 5, 7 (even indices first, then odd ones).
// Loaded as `control1` in both visible PackFromRgbPlanesAvx2Reduce code paths.
+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
20
+ public static Vector256 < uint > PermuteMaskEvenOdd8x32 ( ) => Vector256 . Create ( 0 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 7 , 0 , 0 , 0 ) . AsUInt32 ( ) ;
19
21
20
- public static ReadOnlySpan < byte > PermuteMaskSwitchInnerDWords8x32 => new byte [ ] { 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 7 , 0 , 0 , 0 } ;
22
// Builds a 256-bit constant from 32 one-byte literals and reinterprets it as Vector256<uint>.
// Every 4-byte group has the form (k, 0, 0, 0), so read as little-endian 32-bit lanes the
// vector holds the indices 0, 1, 4, 5, 2, 3, 6, 7, i.e. the identity order with the two
// middle pairs (2, 3) and (4, 5) exchanged.
// NOTE(review): no caller is visible in this excerpt - verify usage before changing.
+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
23
+ public static Vector256 < uint > PermuteMaskSwitchInnerDWords8x32 ( ) => Vector256 . Create ( 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 7 , 0 , 0 , 0 ) . AsUInt32 ( ) ;
21
24
22
- private static ReadOnlySpan < byte > MoveFirst24BytesToSeparateLanes => new byte [ ] { 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 7 , 0 , 0 , 0 } ;
25
// Builds a 256-bit constant from 32 one-byte literals and reinterprets it as Vector256<uint>.
// Every 4-byte group has the form (k, 0, 0, 0), so read as little-endian 32-bit lanes the
// vector holds the indices 0, 1, 2, 6, 3, 4, 5, 7: source dwords 0-2 lead the lower half
// and source dwords 3-5 lead the upper half.
// Loaded as `extractToLanesMask` in UnpackToRgbPlanesAvx2Reduce.
+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
26
+ private static Vector256 < uint > MoveFirst24BytesToSeparateLanes ( ) => Vector256 . Create ( 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 7 , 0 , 0 , 0 ) . AsUInt32 ( ) ;
23
27
24
- internal static ReadOnlySpan < byte > ExtractRgb => new byte [ ] { 0 , 3 , 6 , 9 , 1 , 4 , 7 , 10 , 2 , 5 , 8 , 11 , 0xFF , 0xFF , 0xFF , 0xFF , 0 , 3 , 6 , 9 , 1 , 4 , 7 , 10 , 2 , 5 , 8 , 11 , 0xFF , 0xFF , 0xFF , 0xFF } ;
28
// Builds a 32-byte constant whose two 16-byte halves are identical:
// indices 0, 3, 6, 9, then 1, 4, 7, 10, then 2, 5, 8, 11, followed by four 0xFF bytes.
// The index groups pick every third byte starting at offsets 0, 1 and 2 respectively.
// Loaded as `extractRgbMask` in UnpackToRgbPlanesAvx2Reduce.
// NOTE(review): the 0xFF entries have the high bit set, which presumably zeroes those
// output bytes in a byte shuffle - confirm against the instruction used by the caller.
+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
29
+ internal static Vector256 < byte > ExtractRgb ( ) => Vector256 . Create ( 0 , 3 , 6 , 9 , 1 , 4 , 7 , 10 , 2 , 5 , 8 , 11 , 0xFF , 0xFF , 0xFF , 0xFF , 0 , 3 , 6 , 9 , 1 , 4 , 7 , 10 , 2 , 5 , 8 , 11 , 0xFF , 0xFF , 0xFF , 0xFF ) ;
25
30
26
- private static ReadOnlySpan < byte > ShuffleMaskPad4Nx16 => new byte [ ] { 0 , 1 , 2 , 0x80 , 3 , 4 , 5 , 0x80 , 6 , 7 , 8 , 0x80 , 9 , 10 , 11 , 0x80 } ;
31
// Builds a 16-byte constant: source indices 0-11 in groups of three, each group followed
// by a 0x80 byte (0, 1, 2, 0x80, 3, 4, 5, 0x80, ...), i.e. 12 indices spread over 16 slots.
// Loaded as `vmask` in the Ssse3 paths of Shuffle3 and Pad3Shuffle4.
// NOTE(review): 0x80 has the high bit set, which presumably yields a zero byte in the
// byte shuffle - confirm against the instruction used by the callers.
+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
32
+ private static Vector128 < byte > ShuffleMaskPad4Nx16 ( ) => Vector128 . Create ( 0 , 1 , 2 , 0x80 , 3 , 4 , 5 , 0x80 , 6 , 7 , 8 , 0x80 , 9 , 10 , 11 , 0x80 ) ;
27
33
28
- private static ReadOnlySpan < byte > ShuffleMaskSlice4Nx16 => new byte [ ] { 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 0x80 , 0x80 , 0x80 , 0x80 } ;
34
// Builds a 16-byte constant: the first three indices of every 4-byte group
// (0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14) packed together, followed by four 0x80 bytes.
// Loaded as `vmasko` in the Ssse3 paths of Shuffle3 and Shuffle4Slice3, where a second
// mask is derived from it via Ssse3.AlignRight(vmasko, vmasko, 12).
// NOTE(review): 0x80 has the high bit set, which presumably yields a zero byte in the
// byte shuffle - confirm against the instruction used by the callers.
+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
35
+ private static Vector128 < byte > ShuffleMaskSlice4Nx16 ( ) => Vector128 . Create ( 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 0x80 , 0x80 , 0x80 , 0x80 ) ;
29
36
30
- private static ReadOnlySpan < byte > ShuffleMaskShiftAlpha =>
31
- new byte [ ]
32
- {
33
- 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 3 , 7 , 11 , 15 ,
34
- 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 3 , 7 , 11 , 15
35
- } ;
37
+ #pragma warning disable SA1003 , SA1116 , SA1117 // Parameters should be on same line or separate lines
38
// Builds a 32-byte constant whose two 16-byte halves are identical: the first three
// indices of every 4-byte group (0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14) followed by the
// fourth index of each group (3, 7, 11, 15).
// The (byte) cast on the first literal makes the call resolve to the byte overload of
// Vector256.Create, matching the Vector256<byte> return type without an As* reinterpretation.
// Loaded as `shuffleAlpha` in PackFromRgbPlanesAvx2Reduce.
+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
39
+ private static Vector256 < byte > ShuffleMaskShiftAlpha ( ) => Vector256 . Create ( ( byte )
40
+ 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 3 , 7 , 11 , 15 ,
41
+ 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 3 , 7 , 11 , 15 ) ;
36
42
37
- public static ReadOnlySpan < byte > PermuteMaskShiftAlpha8x32 =>
38
- new byte [ ]
39
- {
40
- 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 4 , 0 , 0 , 0 ,
41
- 5 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 7 , 0 , 0 , 0
42
- } ;
43
// Builds a 256-bit constant from 32 one-byte literals and reinterprets it as Vector256<uint>.
// Every 4-byte group has the form (k, 0, 0, 0), so read as little-endian 32-bit lanes the
// vector holds the indices 0, 1, 2, 4, 5, 6, 3, 7.
// Loaded as `control2` in PackFromRgbPlanesAvx2Reduce.
+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
44
+ public static Vector256 < uint > PermuteMaskShiftAlpha8x32 ( ) => Vector256 . Create (
45
+ 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 4 , 0 , 0 , 0 ,
46
+ 5 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 7 , 0 , 0 , 0 ) . AsUInt32 ( ) ;
47
+ #pragma warning restore SA1003 , SA1116 , SA1117 // Parameters should be on same line or separate lines
43
48
44
49
/// <summary>
45
50
/// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
@@ -189,7 +194,7 @@ public static void Shuffle4Slice3Reduce(
189
194
{
190
195
if ( Ssse3 . IsSupported )
191
196
{
192
- int remainder = source . Length % ( Vector128 < byte > . Count * 4 ) ;
197
+ int remainder = source . Length & ( Vector128 < byte > . Count * 4 - 1 ) ; // bit-hack for modulo
193
198
194
199
int sourceCount = source . Length - remainder ;
195
200
int destCount = ( int ) ( ( uint ) sourceCount * 3 / 4 ) ;
@@ -221,7 +226,7 @@ private static void Shuffle4(
221
226
ref Vector256 < float > destBase =
222
227
ref Unsafe . As < float , Vector256 < float > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
223
228
224
- nint n = ( nint ) ( uint ) ( dest . Length / Vector256 < float > . Count ) ;
229
+ nint n = ( nint ) ( ( uint ) dest . Length / ( uint ) Vector256 < float > . Count ) ;
225
230
nint m = Numerics . Modulo4 ( n ) ;
226
231
nint u = n - m ;
227
232
@@ -253,7 +258,7 @@ private static void Shuffle4(
253
258
ref Vector128 < float > destBase =
254
259
ref Unsafe . As < float , Vector128 < float > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
255
260
256
- nint n = ( nint ) ( uint ) dest . Length / Vector128 < float > . Count ;
261
+ nint n = ( nint ) ( ( uint ) dest . Length / ( uint ) Vector128 < float > . Count ) ;
257
262
nint m = Numerics . Modulo4 ( n ) ;
258
263
nint u = n - m ;
259
264
@@ -306,7 +311,7 @@ private static void Shuffle4(
306
311
ref Vector256 < byte > destBase =
307
312
ref Unsafe . As < byte , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
308
313
309
- nint n = ( nint ) ( uint ) dest . Length / Vector256 < byte > . Count ;
314
+ nint n = ( nint ) ( ( uint ) dest . Length / ( uint ) Vector256 < byte > . Count ) ;
310
315
nint m = Numerics . Modulo4 ( n ) ;
311
316
nint u = n - m ;
312
317
@@ -342,7 +347,7 @@ private static void Shuffle4(
342
347
ref Vector128 < byte > destBase =
343
348
ref Unsafe . As < byte , Vector128 < byte > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
344
349
345
- nint n = ( nint ) ( uint ) dest . Length / Vector128 < byte > . Count ;
350
+ nint n = ( nint ) ( ( uint ) dest . Length / ( uint ) Vector128 < byte > . Count ) ;
346
351
nint m = Numerics . Modulo4 ( n ) ;
347
352
nint u = n - m ;
348
353
@@ -375,10 +380,8 @@ private static void Shuffle3(
375
380
{
376
381
if ( Ssse3 . IsSupported )
377
382
{
378
- ref byte vmaskBase = ref MemoryMarshal . GetReference ( ShuffleMaskPad4Nx16 ) ;
379
- Vector128 < byte > vmask = Unsafe . As < byte , Vector128 < byte > > ( ref vmaskBase ) ;
380
- ref byte vmaskoBase = ref MemoryMarshal . GetReference ( ShuffleMaskSlice4Nx16 ) ;
381
- Vector128 < byte > vmasko = Unsafe . As < byte , Vector128 < byte > > ( ref vmaskoBase ) ;
383
+ Vector128 < byte > vmask = ShuffleMaskPad4Nx16 ( ) ;
384
+ Vector128 < byte > vmasko = ShuffleMaskSlice4Nx16 ( ) ;
382
385
Vector128 < byte > vmaske = Ssse3 . AlignRight ( vmasko , vmasko , 12 ) ;
383
386
384
387
Span < byte > bytes = stackalloc byte [ Vector128 < byte > . Count ] ;
@@ -440,8 +443,7 @@ private static void Pad3Shuffle4(
440
443
{
441
444
if ( Ssse3 . IsSupported )
442
445
{
443
- ref byte vmaskBase = ref MemoryMarshal . GetReference ( ShuffleMaskPad4Nx16 ) ;
444
- Vector128 < byte > vmask = Unsafe . As < byte , Vector128 < byte > > ( ref vmaskBase ) ;
446
+ Vector128 < byte > vmask = ShuffleMaskPad4Nx16 ( ) ;
445
447
Vector128 < byte > vfill = Vector128 . Create ( 0xff000000ff000000ul ) . AsByte ( ) ;
446
448
447
449
Span < byte > bytes = stackalloc byte [ Vector128 < byte > . Count ] ;
@@ -484,8 +486,7 @@ private static void Shuffle4Slice3(
484
486
{
485
487
if ( Ssse3 . IsSupported )
486
488
{
487
- ref byte vmaskoBase = ref MemoryMarshal . GetReference ( ShuffleMaskSlice4Nx16 ) ;
488
- Vector128 < byte > vmasko = Unsafe . As < byte , Vector128 < byte > > ( ref vmaskoBase ) ;
489
+ Vector128 < byte > vmasko = ShuffleMaskSlice4Nx16 ( ) ;
489
490
Vector128 < byte > vmaske = Ssse3 . AlignRight ( vmasko , vmasko , 12 ) ;
490
491
491
492
Span < byte > bytes = stackalloc byte [ Vector128 < byte > . Count ] ;
@@ -542,9 +543,9 @@ private static void Shuffle4Slice3(
542
543
/// <returns>The <see cref="Vector256{T}"/>.</returns>
543
544
[ MethodImpl ( InliningOptions . AlwaysInline ) ]
544
545
public static Vector256 < float > MultiplyAdd (
545
- in Vector256 < float > va ,
546
- in Vector256 < float > vm0 ,
547
- in Vector256 < float > vm1 )
546
+ Vector256 < float > va ,
547
+ Vector256 < float > vm0 ,
548
+ Vector256 < float > vm1 )
548
549
{
549
550
if ( Fma . IsSupported )
550
551
{
@@ -565,9 +566,9 @@ public static Vector256<float> MultiplyAdd(
565
566
/// <returns>The <see cref="Vector256{T}"/>.</returns>
566
567
[ MethodImpl ( InliningOptions . ShortMethod ) ]
567
568
public static Vector256 < float > MultiplySubtract (
568
- in Vector256 < float > vs ,
569
- in Vector256 < float > vm0 ,
570
- in Vector256 < float > vm1 )
569
+ Vector256 < float > vs ,
570
+ Vector256 < float > vm0 ,
571
+ Vector256 < float > vm1 )
571
572
{
572
573
if ( Fma . IsSupported )
573
574
{
@@ -587,9 +588,9 @@ public static Vector256<float> MultiplySubtract(
587
588
/// <returns>The <see cref="Vector256{T}"/>.</returns>
588
589
[ MethodImpl ( InliningOptions . ShortMethod ) ]
589
590
public static Vector256 < float > MultiplyAddNegated (
590
- in Vector256 < float > a ,
591
- in Vector256 < float > b ,
592
- in Vector256 < float > c )
591
+ Vector256 < float > a ,
592
+ Vector256 < float > b ,
593
+ Vector256 < float > c )
593
594
{
594
595
if ( Fma . IsSupported )
595
596
{
@@ -655,7 +656,7 @@ internal static unsafe void ByteToNormalizedFloat(
655
656
ref Vector256 < float > destBase =
656
657
ref Unsafe . As < float , Vector256 < float > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
657
658
658
- var scale = Vector256 . Create ( 1 / ( float ) byte . MaxValue ) ;
659
+ Vector256 < float > scale = Vector256 . Create ( 1 / ( float ) byte . MaxValue ) ;
659
660
660
661
for ( nuint i = 0 ; i < n ; i ++ )
661
662
{
@@ -688,7 +689,7 @@ internal static unsafe void ByteToNormalizedFloat(
688
689
ref Vector128 < float > destBase =
689
690
ref Unsafe . As < float , Vector128 < float > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
690
691
691
- var scale = Vector128 . Create ( 1 / ( float ) byte . MaxValue ) ;
692
+ Vector128 < float > scale = Vector128 . Create ( 1 / ( float ) byte . MaxValue ) ;
692
693
Vector128 < byte > zero = Vector128 < byte > . Zero ;
693
694
694
695
for ( nuint i = 0 ; i < n ; i ++ )
@@ -790,9 +791,8 @@ internal static void NormalizedFloatToByteSaturate(
790
791
ref Vector256 < byte > destBase =
791
792
ref Unsafe . As < byte , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
792
793
793
- var scale = Vector256 . Create ( ( float ) byte . MaxValue ) ;
794
- ref byte maskBase = ref MemoryMarshal . GetReference ( PermuteMaskDeinterleave8x32 ) ;
795
- Vector256 < int > mask = Unsafe . As < byte , Vector256 < int > > ( ref maskBase ) ;
794
+ Vector256 < float > scale = Vector256 . Create ( ( float ) byte . MaxValue ) ;
795
+ Vector256 < int > mask = PermuteMaskDeinterleave8x32 ( ) ;
796
796
797
797
for ( nuint i = 0 ; i < n ; i ++ )
798
798
{
@@ -829,7 +829,7 @@ internal static void NormalizedFloatToByteSaturate(
829
829
ref Vector128 < byte > destBase =
830
830
ref Unsafe . As < byte , Vector128 < byte > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
831
831
832
- var scale = Vector128 . Create ( ( float ) byte . MaxValue ) ;
832
+ Vector128 < float > scale = Vector128 . Create ( ( float ) byte . MaxValue ) ;
833
833
834
834
for ( nuint i = 0 ; i < n ; i ++ )
835
835
{
@@ -866,14 +866,12 @@ internal static void PackFromRgbPlanesAvx2Reduce(
866
866
867
867
nuint count = ( uint ) redChannel . Length / ( uint ) Vector256 < byte > . Count ;
868
868
869
- ref byte control1Bytes = ref MemoryMarshal . GetReference ( PermuteMaskEvenOdd8x32 ) ;
870
- Vector256 < uint > control1 = Unsafe . As < byte , Vector256 < uint > > ( ref control1Bytes ) ;
869
+ Vector256 < uint > control1 = PermuteMaskEvenOdd8x32 ( ) ;
871
870
872
- ref byte control2Bytes = ref MemoryMarshal . GetReference ( PermuteMaskShiftAlpha8x32 ) ;
873
- Vector256 < uint > control2 = Unsafe . As < byte , Vector256 < uint > > ( ref control2Bytes ) ;
874
- var a = Vector256 . Create ( ( byte ) 255 ) ;
871
+ Vector256 < uint > control2 = PermuteMaskShiftAlpha8x32 ( ) ;
872
+ Vector256 < byte > a = Vector256 . Create ( ( byte ) 255 ) ;
875
873
876
- Vector256 < byte > shuffleAlpha = Unsafe . As < byte , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( ShuffleMaskShiftAlpha ) ) ;
874
+ Vector256 < byte > shuffleAlpha = ShuffleMaskShiftAlpha ( ) ;
877
875
878
876
for ( nuint i = 0 ; i < count ; i ++ )
879
877
{
@@ -937,9 +935,8 @@ internal static void PackFromRgbPlanesAvx2Reduce(
937
935
ref Vector256 < byte > dBase = ref Unsafe . As < Rgba32 , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( destination ) ) ;
938
936
939
937
nuint count = ( uint ) redChannel . Length / ( uint ) Vector256 < byte > . Count ;
940
- ref byte control1Bytes = ref MemoryMarshal . GetReference ( PermuteMaskEvenOdd8x32 ) ;
941
- Vector256 < uint > control1 = Unsafe . As < byte , Vector256 < uint > > ( ref control1Bytes ) ;
942
- var a = Vector256 . Create ( ( byte ) 255 ) ;
938
+ Vector256 < uint > control1 = PermuteMaskEvenOdd8x32 ( ) ;
939
+ Vector256 < byte > a = Vector256 . Create ( ( byte ) 255 ) ;
943
940
944
941
for ( nuint i = 0 ; i < count ; i ++ )
945
942
{
@@ -988,8 +985,8 @@ internal static void UnpackToRgbPlanesAvx2Reduce(
988
985
ref Vector256 < float > destGRef = ref Unsafe . As < float , Vector256 < float > > ( ref MemoryMarshal . GetReference ( greenChannel ) ) ;
989
986
ref Vector256 < float > destBRef = ref Unsafe . As < float , Vector256 < float > > ( ref MemoryMarshal . GetReference ( blueChannel ) ) ;
990
987
991
- Vector256 < uint > extractToLanesMask = Unsafe . As < byte , Vector256 < uint > > ( ref MemoryMarshal . GetReference ( MoveFirst24BytesToSeparateLanes ) ) ;
992
- Vector256 < byte > extractRgbMask = Unsafe . As < byte , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( ExtractRgb ) ) ;
988
+ Vector256 < uint > extractToLanesMask = MoveFirst24BytesToSeparateLanes ( ) ;
989
+ Vector256 < byte > extractRgbMask = ExtractRgb ( ) ;
993
990
Vector256 < byte > rgb , rg , bx ;
994
991
Vector256 < float > r , g , b ;
995
992
0 commit comments