@@ -14,33 +14,38 @@ internal static partial class SimdUtils
14
14
{
15
15
public static class HwIntrinsics
16
16
{
17
- public static ReadOnlySpan < byte > PermuteMaskDeinterleave8x32 => new byte [ ] { 0 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 7 , 0 , 0 , 0 } ;
17
/// <summary>
/// Builds the 8x32 deinterleave mask as a vector constant instead of reading it from a byte span.
/// The 32 byte-sized constants are reinterpreted as eight 32-bit elements; reading each group of
/// four bytes as a little-endian element gives the indices 0, 4, 1, 5, 2, 6, 3, 7.
/// </summary>
/// <returns>The mask as a 256-bit vector of 32-bit signed integers.</returns>
+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ] // too much IL for JIT to inline, so give a hint
18
+ public static Vector256 < int > PermuteMaskDeinterleave8x32 ( ) => Vector256 . Create ( 0 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 7 , 0 , 0 , 0 ) . AsInt32 ( ) ;
18
19
19
- public static ReadOnlySpan < byte > PermuteMaskEvenOdd8x32 => new byte [ ] { 0 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 7 , 0 , 0 , 0 } ;
20
/// <summary>
/// Builds the 8x32 even/odd mask as a vector constant.
/// The 32 byte-sized constants are reinterpreted as eight 32-bit elements; reading each group of
/// four bytes as a little-endian element gives the indices 0, 2, 4, 6, 1, 3, 5, 7
/// (even positions first, then odd positions).
/// </summary>
/// <returns>The mask as a 256-bit vector of 32-bit unsigned integers.</returns>
+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
21
+ public static Vector256 < uint > PermuteMaskEvenOdd8x32 ( ) => Vector256 . Create ( 0 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 7 , 0 , 0 , 0 ) . AsUInt32 ( ) ;
20
22
21
- public static ReadOnlySpan < byte > PermuteMaskSwitchInnerDWords8x32 => new byte [ ] { 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 7 , 0 , 0 , 0 } ;
23
/// <summary>
/// Builds the 8x32 "switch inner dwords" mask as a vector constant.
/// The 32 byte-sized constants are reinterpreted as eight 32-bit elements; reading each group of
/// four bytes as a little-endian element gives the indices 0, 1, 4, 5, 2, 3, 6, 7
/// (the two middle pairs of elements trade places).
/// </summary>
/// <returns>The mask as a 256-bit vector of 32-bit unsigned integers.</returns>
+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
24
+ public static Vector256 < uint > PermuteMaskSwitchInnerDWords8x32 ( ) => Vector256 . Create ( 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 7 , 0 , 0 , 0 ) . AsUInt32 ( ) ;
22
25
23
- private static ReadOnlySpan < byte > MoveFirst24BytesToSeparateLanes => new byte [ ] { 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 7 , 0 , 0 , 0 } ;
26
/// <summary>
/// Builds the element-index mask 0, 1, 2, 6, 3, 4, 5, 7 as a vector constant (32 byte-sized
/// constants reinterpreted as eight 32-bit elements, each group of four bytes read little-endian).
/// With this order, source elements 0-2 land in the first four slots and elements 3-5 in the last four.
/// </summary>
/// <returns>The mask as a 256-bit vector of 32-bit unsigned integers.</returns>
+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
27
+ private static Vector256 < uint > MoveFirst24BytesToSeparateLanes ( ) => Vector256 . Create ( 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 7 , 0 , 0 , 0 ) . AsUInt32 ( ) ;
24
28
25
- internal static ReadOnlySpan < byte > ExtractRgb => new byte [ ] { 0 , 3 , 6 , 9 , 1 , 4 , 7 , 10 , 2 , 5 , 8 , 11 , 0xFF , 0xFF , 0xFF , 0xFF , 0 , 3 , 6 , 9 , 1 , 4 , 7 , 10 , 2 , 5 , 8 , 11 , 0xFF , 0xFF , 0xFF , 0xFF } ;
29
/// <summary>
/// Builds a 32-byte mask that repeats the same 16-byte pattern twice:
/// 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11 followed by four 0xFF bytes.
/// The first twelve entries pick every third byte starting at offsets 0, 1 and 2 respectively.
/// </summary>
/// <returns>The mask as a 256-bit vector of bytes.</returns>
// NOTE(review): presumably a byte-shuffle control where 0xFF (high bit set) zeroes the output byte;
// confirm against the instruction that consumes this mask.
+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
30
+ internal static Vector256 < byte > ExtractRgb ( ) => Vector256 . Create ( 0 , 3 , 6 , 9 , 1 , 4 , 7 , 10 , 2 , 5 , 8 , 11 , 0xFF , 0xFF , 0xFF , 0xFF , 0 , 3 , 6 , 9 , 1 , 4 , 7 , 10 , 2 , 5 , 8 , 11 , 0xFF , 0xFF , 0xFF , 0xFF ) ;
26
31
27
- private static ReadOnlySpan < byte > ShuffleMaskPad4Nx16 => new byte [ ] { 0 , 1 , 2 , 0x80 , 3 , 4 , 5 , 0x80 , 6 , 7 , 8 , 0x80 , 9 , 10 , 11 , 0x80 } ;
32
/// <summary>
/// Builds a 16-byte mask that lists source byte indices 0-11 in groups of three,
/// with a 0x80 entry after each group: 0, 1, 2, 0x80, 3, 4, 5, 0x80, 6, 7, 8, 0x80, 9, 10, 11, 0x80.
/// </summary>
/// <returns>The mask as a 128-bit vector of bytes.</returns>
// NOTE(review): presumably a byte-shuffle control where 0x80 (high bit set) yields a zero pad byte,
// expanding 3-byte groups to 4 bytes; confirm against the consuming instruction.
+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
33
+ private static Vector128 < byte > ShuffleMaskPad4Nx16 ( ) => Vector128 . Create ( 0 , 1 , 2 , 0x80 , 3 , 4 , 5 , 0x80 , 6 , 7 , 8 , 0x80 , 9 , 10 , 11 , 0x80 ) ;
28
34
29
- private static ReadOnlySpan < byte > ShuffleMaskSlice4Nx16 => new byte [ ] { 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 0x80 , 0x80 , 0x80 , 0x80 } ;
35
/// <summary>
/// Builds a 16-byte mask that lists the first three byte indices of every 4-byte group
/// (0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14), skipping indices 3, 7, 11 and 15,
/// followed by four 0x80 entries.
/// </summary>
/// <returns>The mask as a 128-bit vector of bytes.</returns>
// NOTE(review): presumably a byte-shuffle control where 0x80 (high bit set) zeroes the trailing
// four output bytes; confirm against the consuming instruction.
+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
36
+ private static Vector128 < byte > ShuffleMaskSlice4Nx16 ( ) => Vector128 . Create ( 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 0x80 , 0x80 , 0x80 , 0x80 ) ;
30
37
31
- private static ReadOnlySpan < byte > ShuffleMaskShiftAlpha =>
32
- new byte [ ]
33
- {
34
- 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 3 , 7 , 11 , 15 ,
35
- 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 3 , 7 , 11 , 15
36
- } ;
38
+ #pragma warning disable SA1003 , SA1116 , SA1117 // Parameters should be on same line or separate lines
39
/// <summary>
/// Builds a 32-byte mask that repeats the same 16-byte pattern twice:
/// 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15.
/// Within each 16-byte half, the first three indices of every 4-byte group come first and
/// every fourth index (3, 7, 11, 15) is moved to the end.
/// </summary>
/// <returns>The mask as a 256-bit vector of bytes.</returns>
// NOTE(review): the (byte) cast on the first argument presumably steers overload resolution to the
// byte-typed Vector256.Create, since none of the literals here exceeds 127; confirm.
+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
40
+ private static Vector256 < byte > ShuffleMaskShiftAlpha ( ) => Vector256 . Create ( ( byte )
41
+ 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 3 , 7 , 11 , 15 ,
42
+ 0 , 1 , 2 , 4 , 5 , 6 , 8 , 9 , 10 , 12 , 13 , 14 , 3 , 7 , 11 , 15 ) ;
37
43
38
- public static ReadOnlySpan < byte > PermuteMaskShiftAlpha8x32 =>
39
- new byte [ ]
40
- {
41
- 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 4 , 0 , 0 , 0 ,
42
- 5 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 7 , 0 , 0 , 0
43
- } ;
44
/// <summary>
/// Builds the 8x32 "shift alpha" mask as a vector constant.
/// The 32 byte-sized constants are reinterpreted as eight 32-bit elements; reading each group of
/// four bytes as a little-endian element gives the indices 0, 1, 2, 4, 5, 6, 3, 7
/// (element 3 is moved behind elements 4-6).
/// </summary>
/// <returns>The mask as a 256-bit vector of 32-bit unsigned integers.</returns>
+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
45
+ public static Vector256 < uint > PermuteMaskShiftAlpha8x32 ( ) => Vector256 . Create (
46
+ 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 4 , 0 , 0 , 0 ,
47
+ 5 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 7 , 0 , 0 , 0 ) . AsUInt32 ( ) ;
48
+ #pragma warning restore SA1003 , SA1116 , SA1117 // Parameters should be on same line or separate lines
44
49
45
50
/// <summary>
46
51
/// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
@@ -190,7 +195,7 @@ public static void Shuffle4Slice3Reduce(
190
195
{
191
196
if ( Ssse3 . IsSupported )
192
197
{
193
- int remainder = source . Length % ( Vector128 < byte > . Count * 4 ) ;
198
+ int remainder = source . Length & ( ( Vector128 < byte > . Count * 4 ) - 1 ) ; // bit-hack for modulo
194
199
195
200
int sourceCount = source . Length - remainder ;
196
201
int destCount = ( int ) ( ( uint ) sourceCount * 3 / 4 ) ;
@@ -254,7 +259,7 @@ private static void Shuffle4(
254
259
ref Vector128 < float > destBase =
255
260
ref Unsafe . As < float , Vector128 < float > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
256
261
257
- nint n = ( nint ) ( uint ) dest . Length / Vector128 < float > . Count ;
262
+ nint n = ( nint ) ( ( uint ) dest . Length / ( uint ) Vector128 < float > . Count ) ;
258
263
nint m = Numerics . Modulo4 ( n ) ;
259
264
nint u = n - m ;
260
265
@@ -307,7 +312,7 @@ private static void Shuffle4(
307
312
ref Vector256 < byte > destBase =
308
313
ref Unsafe . As < byte , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
309
314
310
- nint n = ( nint ) ( uint ) dest . Length / Vector256 < byte > . Count ;
315
+ nint n = ( nint ) ( ( uint ) dest . Length / ( uint ) Vector256 < byte > . Count ) ;
311
316
nint m = Numerics . Modulo4 ( n ) ;
312
317
nint u = n - m ;
313
318
@@ -343,7 +348,7 @@ private static void Shuffle4(
343
348
ref Vector128 < byte > destBase =
344
349
ref Unsafe . As < byte , Vector128 < byte > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
345
350
346
- nint n = ( nint ) ( uint ) dest . Length / Vector128 < byte > . Count ;
351
+ nint n = ( nint ) ( ( uint ) dest . Length / ( uint ) Vector128 < byte > . Count ) ;
347
352
nint m = Numerics . Modulo4 ( n ) ;
348
353
nint u = n - m ;
349
354
@@ -376,10 +381,8 @@ private static void Shuffle3(
376
381
{
377
382
if ( Ssse3 . IsSupported )
378
383
{
379
- ref byte vmaskBase = ref MemoryMarshal . GetReference ( ShuffleMaskPad4Nx16 ) ;
380
- Vector128 < byte > vmask = Unsafe . As < byte , Vector128 < byte > > ( ref vmaskBase ) ;
381
- ref byte vmaskoBase = ref MemoryMarshal . GetReference ( ShuffleMaskSlice4Nx16 ) ;
382
- Vector128 < byte > vmasko = Unsafe . As < byte , Vector128 < byte > > ( ref vmaskoBase ) ;
384
+ Vector128 < byte > vmask = ShuffleMaskPad4Nx16 ( ) ;
385
+ Vector128 < byte > vmasko = ShuffleMaskSlice4Nx16 ( ) ;
383
386
Vector128 < byte > vmaske = Ssse3 . AlignRight ( vmasko , vmasko , 12 ) ;
384
387
385
388
Span < byte > bytes = stackalloc byte [ Vector128 < byte > . Count ] ;
@@ -441,8 +444,7 @@ private static void Pad3Shuffle4(
441
444
{
442
445
if ( Ssse3 . IsSupported )
443
446
{
444
- ref byte vmaskBase = ref MemoryMarshal . GetReference ( ShuffleMaskPad4Nx16 ) ;
445
- Vector128 < byte > vmask = Unsafe . As < byte , Vector128 < byte > > ( ref vmaskBase ) ;
447
+ Vector128 < byte > vmask = ShuffleMaskPad4Nx16 ( ) ;
446
448
Vector128 < byte > vfill = Vector128 . Create ( 0xff000000ff000000ul ) . AsByte ( ) ;
447
449
448
450
Span < byte > bytes = stackalloc byte [ Vector128 < byte > . Count ] ;
@@ -485,8 +487,7 @@ private static void Shuffle4Slice3(
485
487
{
486
488
if ( Ssse3 . IsSupported )
487
489
{
488
- ref byte vmaskoBase = ref MemoryMarshal . GetReference ( ShuffleMaskSlice4Nx16 ) ;
489
- Vector128 < byte > vmasko = Unsafe . As < byte , Vector128 < byte > > ( ref vmaskoBase ) ;
490
+ Vector128 < byte > vmasko = ShuffleMaskSlice4Nx16 ( ) ;
490
491
Vector128 < byte > vmaske = Ssse3 . AlignRight ( vmasko , vmasko , 12 ) ;
491
492
492
493
Span < byte > bytes = stackalloc byte [ Vector128 < byte > . Count ] ;
@@ -543,9 +544,9 @@ private static void Shuffle4Slice3(
543
544
/// <returns>The <see cref="Vector256{T}"/>.</returns>
544
545
[ MethodImpl ( InliningOptions . AlwaysInline ) ]
545
546
public static Vector256 < float > MultiplyAdd (
546
- in Vector256 < float > va ,
547
- in Vector256 < float > vm0 ,
548
- in Vector256 < float > vm1 )
547
+ Vector256 < float > va ,
548
+ Vector256 < float > vm0 ,
549
+ Vector256 < float > vm1 )
549
550
{
550
551
if ( Fma . IsSupported )
551
552
{
@@ -594,9 +595,9 @@ public static Vector128<float> MultiplyAdd(
594
595
/// <returns>The <see cref="Vector256{T}"/>.</returns>
595
596
[ MethodImpl ( InliningOptions . ShortMethod ) ]
596
597
public static Vector256 < float > MultiplySubtract (
597
- in Vector256 < float > vs ,
598
- in Vector256 < float > vm0 ,
599
- in Vector256 < float > vm1 )
598
+ Vector256 < float > vs ,
599
+ Vector256 < float > vm0 ,
600
+ Vector256 < float > vm1 )
600
601
{
601
602
if ( Fma . IsSupported )
602
603
{
@@ -616,9 +617,9 @@ public static Vector256<float> MultiplySubtract(
616
617
/// <returns>The <see cref="Vector256{T}"/>.</returns>
617
618
[ MethodImpl ( InliningOptions . ShortMethod ) ]
618
619
public static Vector256 < float > MultiplyAddNegated (
619
- in Vector256 < float > a ,
620
- in Vector256 < float > b ,
621
- in Vector256 < float > c )
620
+ Vector256 < float > a ,
621
+ Vector256 < float > b ,
622
+ Vector256 < float > c )
622
623
{
623
624
if ( Fma . IsSupported )
624
625
{
@@ -684,7 +685,7 @@ internal static unsafe void ByteToNormalizedFloat(
684
685
ref Vector256 < float > destBase =
685
686
ref Unsafe . As < float , Vector256 < float > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
686
687
687
- var scale = Vector256 . Create ( 1 / ( float ) byte . MaxValue ) ;
688
+ Vector256 < float > scale = Vector256 . Create ( 1 / ( float ) byte . MaxValue ) ;
688
689
689
690
for ( nuint i = 0 ; i < n ; i ++ )
690
691
{
@@ -717,7 +718,7 @@ internal static unsafe void ByteToNormalizedFloat(
717
718
ref Vector128 < float > destBase =
718
719
ref Unsafe . As < float , Vector128 < float > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
719
720
720
- var scale = Vector128 . Create ( 1 / ( float ) byte . MaxValue ) ;
721
+ Vector128 < float > scale = Vector128 . Create ( 1 / ( float ) byte . MaxValue ) ;
721
722
Vector128 < byte > zero = Vector128 < byte > . Zero ;
722
723
723
724
for ( nuint i = 0 ; i < n ; i ++ )
@@ -819,9 +820,8 @@ internal static void NormalizedFloatToByteSaturate(
819
820
ref Vector256 < byte > destBase =
820
821
ref Unsafe . As < byte , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
821
822
822
- var scale = Vector256 . Create ( ( float ) byte . MaxValue ) ;
823
- ref byte maskBase = ref MemoryMarshal . GetReference ( PermuteMaskDeinterleave8x32 ) ;
824
- Vector256 < int > mask = Unsafe . As < byte , Vector256 < int > > ( ref maskBase ) ;
823
+ Vector256 < float > scale = Vector256 . Create ( ( float ) byte . MaxValue ) ;
824
+ Vector256 < int > mask = PermuteMaskDeinterleave8x32 ( ) ;
825
825
826
826
for ( nuint i = 0 ; i < n ; i ++ )
827
827
{
@@ -858,7 +858,7 @@ internal static void NormalizedFloatToByteSaturate(
858
858
ref Vector128 < byte > destBase =
859
859
ref Unsafe . As < byte , Vector128 < byte > > ( ref MemoryMarshal . GetReference ( dest ) ) ;
860
860
861
- var scale = Vector128 . Create ( ( float ) byte . MaxValue ) ;
861
+ Vector128 < float > scale = Vector128 . Create ( ( float ) byte . MaxValue ) ;
862
862
863
863
for ( nuint i = 0 ; i < n ; i ++ )
864
864
{
@@ -895,14 +895,12 @@ internal static void PackFromRgbPlanesAvx2Reduce(
895
895
896
896
nuint count = redChannel . Vector256Count < byte > ( ) ;
897
897
898
- ref byte control1Bytes = ref MemoryMarshal . GetReference ( PermuteMaskEvenOdd8x32 ) ;
899
- Vector256 < uint > control1 = Unsafe . As < byte , Vector256 < uint > > ( ref control1Bytes ) ;
898
+ Vector256 < uint > control1 = PermuteMaskEvenOdd8x32 ( ) ;
900
899
901
- ref byte control2Bytes = ref MemoryMarshal . GetReference ( PermuteMaskShiftAlpha8x32 ) ;
902
- Vector256 < uint > control2 = Unsafe . As < byte , Vector256 < uint > > ( ref control2Bytes ) ;
903
- var a = Vector256 . Create ( ( byte ) 255 ) ;
900
+ Vector256 < uint > control2 = PermuteMaskShiftAlpha8x32 ( ) ;
901
+ Vector256 < byte > a = Vector256 . Create ( ( byte ) 255 ) ;
904
902
905
- Vector256 < byte > shuffleAlpha = Unsafe . As < byte , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( ShuffleMaskShiftAlpha ) ) ;
903
+ Vector256 < byte > shuffleAlpha = ShuffleMaskShiftAlpha ( ) ;
906
904
907
905
for ( nuint i = 0 ; i < count ; i ++ )
908
906
{
@@ -966,9 +964,8 @@ internal static void PackFromRgbPlanesAvx2Reduce(
966
964
ref Vector256 < byte > dBase = ref Unsafe . As < Rgba32 , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( destination ) ) ;
967
965
968
966
nuint count = redChannel . Vector256Count < byte > ( ) ;
969
- ref byte control1Bytes = ref MemoryMarshal . GetReference ( PermuteMaskEvenOdd8x32 ) ;
970
- Vector256 < uint > control1 = Unsafe . As < byte , Vector256 < uint > > ( ref control1Bytes ) ;
971
- var a = Vector256 . Create ( ( byte ) 255 ) ;
967
+ Vector256 < uint > control1 = PermuteMaskEvenOdd8x32 ( ) ;
968
+ Vector256 < byte > a = Vector256 . Create ( ( byte ) 255 ) ;
972
969
973
970
for ( nuint i = 0 ; i < count ; i ++ )
974
971
{
@@ -1017,8 +1014,8 @@ internal static void UnpackToRgbPlanesAvx2Reduce(
1017
1014
ref Vector256 < float > destGRef = ref Unsafe . As < float , Vector256 < float > > ( ref MemoryMarshal . GetReference ( greenChannel ) ) ;
1018
1015
ref Vector256 < float > destBRef = ref Unsafe . As < float , Vector256 < float > > ( ref MemoryMarshal . GetReference ( blueChannel ) ) ;
1019
1016
1020
- Vector256 < uint > extractToLanesMask = Unsafe . As < byte , Vector256 < uint > > ( ref MemoryMarshal . GetReference ( MoveFirst24BytesToSeparateLanes ) ) ;
1021
- Vector256 < byte > extractRgbMask = Unsafe . As < byte , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( ExtractRgb ) ) ;
1017
+ Vector256 < uint > extractToLanesMask = MoveFirst24BytesToSeparateLanes ( ) ;
1018
+ Vector256 < byte > extractRgbMask = ExtractRgb ( ) ;
1022
1019
Vector256 < byte > rgb , rg , bx ;
1023
1020
Vector256 < float > r , g , b ;
1024
1021
0 commit comments