@@ -15,97 +15,43 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
15
15
{
16
16
internal static class RgbToYCbCrConverterVectorized
17
17
{
18
- private static ReadOnlySpan < byte > ExtractionMasks => new byte [ ]
19
- {
20
- 0x0 , 0xFF , 0xFF , 0xFF , 0x1 , 0xFF , 0xFF , 0xFF , 0x2 , 0xFF , 0xFF , 0xFF , 0x3 , 0xFF , 0xFF , 0xFF , 0x10 , 0xFF , 0xFF , 0xFF , 0x11 , 0xFF , 0xFF , 0xFF , 0x12 , 0xFF , 0xFF , 0xFF , 0x13 , 0xFF , 0xFF , 0xFF ,
21
- 0x4 , 0xFF , 0xFF , 0xFF , 0x5 , 0xFF , 0xFF , 0xFF , 0x6 , 0xFF , 0xFF , 0xFF , 0x7 , 0xFF , 0xFF , 0xFF , 0x14 , 0xFF , 0xFF , 0xFF , 0x15 , 0xFF , 0xFF , 0xFF , 0x16 , 0xFF , 0xFF , 0xFF , 0x17 , 0xFF , 0xFF , 0xFF ,
22
- 0x8 , 0xFF , 0xFF , 0xFF , 0x9 , 0xFF , 0xFF , 0xFF , 0xA , 0xFF , 0xFF , 0xFF , 0xB , 0xFF , 0xFF , 0xFF , 0x18 , 0xFF , 0xFF , 0xFF , 0x19 , 0xFF , 0xFF , 0xFF , 0x1A , 0xFF , 0xFF , 0xFF , 0x1B , 0xFF , 0xFF , 0xFF ,
23
- 0xC , 0xFF , 0xFF , 0xFF , 0xD , 0xFF , 0xFF , 0xFF , 0xE , 0xFF , 0xFF , 0xFF , 0xF , 0xFF , 0xFF , 0xFF , 0x1C , 0xFF , 0xFF , 0xFF , 0x1D , 0xFF , 0xFF , 0xFF , 0x1E , 0xFF , 0xFF , 0xFF , 0x1F , 0xFF , 0xFF , 0xFF ,
24
- } ;
25
-
26
18
public static bool IsSupported
27
19
{
28
20
get
29
21
{
30
22
#if SUPPORTS_RUNTIME_INTRINSICS
31
- return Avx2 . IsSupported && Fma . IsSupported ;
23
+ return Avx2 . IsSupported ;
32
24
#else
33
25
return false ;
34
26
#endif
35
27
}
36
28
}
37
29
38
- public static void Convert ( ReadOnlySpan < Rgb24 > rgbSpan , ref Block8x8F yBlock , ref Block8x8F cbBlock , ref Block8x8F crBlock )
39
- {
40
- Debug . Assert ( IsSupported , "AVX2 and FMA are required to run this converter" ) ;
41
-
42
30
#if SUPPORTS_RUNTIME_INTRINSICS
43
- SeparateRgb ( rgbSpan ) ;
44
- ConvertInternal ( rgbSpan , ref yBlock , ref cbBlock , ref crBlock ) ;
45
- #endif
46
- }
47
-
48
- #if SUPPORTS_RUNTIME_INTRINSICS
49
- /// <summary>
50
- /// Rearranges the provided <paramref name="rgbSpan"/> in-place
51
- /// from { r00, g00, b00, ..., r63, g63, b63 }
52
- /// to { r00, ... r31, g00, ..., g31, b00, ..., b31,
53
- /// r32, ... r63, g32, ..., g63, b32, ..., b63 }
54
- /// </summary>
55
- /// <remarks>
56
- /// SSE is used for this operation as it is significantly faster than AVX in this specific case.
57
- /// Solving this problem with AVX requires too many instructions that cross the 128-bit lanes of YMM registers.
58
- /// </remarks>
59
- [ MethodImpl ( InliningOptions . ShortMethod ) ]
60
- private static void SeparateRgb ( ReadOnlySpan < Rgb24 > rgbSpan )
31
+ private static ReadOnlySpan < byte > MoveFirst24BytesToSeparateLanes => new byte [ ]
61
32
{
62
- var selectRed0 = Vector128 . Create ( 0x00 , 0x03 , 0x06 , 0x09 , 0x0C , 0x0F , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF ) ;
63
- var selectRed1 = Vector128 . Create ( 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0x02 , 0x05 , 0x08 , 0x0B , 0x0E , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF ) ;
64
- var selectRed2 = Vector128 . Create ( 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0x01 , 0x04 , 0x07 , 0x0A , 0x0D ) ;
65
-
66
- var selectGreen0 = Vector128 . Create ( 0x01 , 0x04 , 0x07 , 0x0A , 0x0D , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF ) ;
67
- var selectGreen1 = Vector128 . Create ( 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0x00 , 0x03 , 0x06 , 0x09 , 0x0C , 0x0F , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF ) ;
68
- var selectGreen2 = Vector128 . Create ( 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0x02 , 0x05 , 0x08 , 0x0B , 0x0E ) ;
33
+ 0 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 2 , 0 , 0 , 0 , 6 , 0 , 0 , 0 ,
34
+ 3 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 5 , 0 , 0 , 0 , 7 , 0 , 0 , 0
35
+ } ;
69
36
70
- var selectBlue0 = Vector128 . Create ( 0x02 , 0x05 , 0x08 , 0x0B , 0x0E , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF ) ;
71
- var selectBlue1 = Vector128 . Create ( 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0x01 , 0x04 , 0x07 , 0x0A , 0x0D , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF ) ;
72
- var selectBlue2 = Vector128 . Create ( 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0x00 , 0x03 , 0x06 , 0x09 , 0x0C , 0x0F ) ;
37
+ private static ReadOnlySpan < byte > MoveLast24BytesToSeparateLanes => new byte [ ]
38
+ {
39
+ 2 , 0 , 0 , 0 , 3 , 0 , 0 , 0 , 4 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
40
+ 5 , 0 , 0 , 0 , 6 , 0 , 0 , 0 , 7 , 0 , 0 , 0 , 1 , 0 , 0 , 0
41
+ } ;
73
42
74
- for ( int i = 0 ; i < 2 ; i ++ )
75
- {
76
- ref Vector128 < byte > inRef = ref Unsafe . Add ( ref Unsafe . As < Rgb24 , Vector128 < byte > > ( ref MemoryMarshal . GetReference ( rgbSpan ) ) , i * 6 ) ;
77
-
78
- Vector128 < byte > in0 = inRef ;
79
- Vector128 < byte > in1 = Unsafe . Add ( ref inRef , 1 ) ;
80
- Vector128 < byte > in2 = Unsafe . Add ( ref inRef , 2 ) ;
81
-
82
- Vector128 < byte > r0 = Sse2 . Or ( Sse2 . Or ( Ssse3 . Shuffle ( in0 , selectRed0 ) , Ssse3 . Shuffle ( in1 , selectRed1 ) ) , Ssse3 . Shuffle ( in2 , selectRed2 ) ) ;
83
- Vector128 < byte > g0 = Sse2 . Or ( Sse2 . Or ( Ssse3 . Shuffle ( in0 , selectGreen0 ) , Ssse3 . Shuffle ( in1 , selectGreen1 ) ) , Ssse3 . Shuffle ( in2 , selectGreen2 ) ) ;
84
- Vector128 < byte > b0 = Sse2 . Or ( Sse2 . Or ( Ssse3 . Shuffle ( in0 , selectBlue0 ) , Ssse3 . Shuffle ( in1 , selectBlue1 ) ) , Ssse3 . Shuffle ( in2 , selectBlue2 ) ) ;
85
-
86
- in0 = Unsafe . Add ( ref inRef , 3 ) ;
87
- in1 = Unsafe . Add ( ref inRef , 4 ) ;
88
- in2 = Unsafe . Add ( ref inRef , 5 ) ;
89
-
90
- Vector128 < byte > r1 = Sse2 . Or ( Sse2 . Or ( Ssse3 . Shuffle ( in0 , selectRed0 ) , Ssse3 . Shuffle ( in1 , selectRed1 ) ) , Ssse3 . Shuffle ( in2 , selectRed2 ) ) ;
91
- Vector128 < byte > g1 = Sse2 . Or ( Sse2 . Or ( Ssse3 . Shuffle ( in0 , selectGreen0 ) , Ssse3 . Shuffle ( in1 , selectGreen1 ) ) , Ssse3 . Shuffle ( in2 , selectGreen2 ) ) ;
92
- Vector128 < byte > b1 = Sse2 . Or ( Sse2 . Or ( Ssse3 . Shuffle ( in0 , selectBlue0 ) , Ssse3 . Shuffle ( in1 , selectBlue1 ) ) , Ssse3 . Shuffle ( in2 , selectBlue2 ) ) ;
93
-
94
- inRef = r0 ;
95
- Unsafe . Add ( ref inRef , 1 ) = r1;
96
- Unsafe . Add ( ref inRef , 2 ) = g0;
97
- Unsafe . Add ( ref inRef , 3 ) = g1;
98
- Unsafe . Add ( ref inRef , 4 ) = b0;
99
- Unsafe . Add ( ref inRef , 5 ) = b1;
100
- }
101
- }
43
+ private static ReadOnlySpan < byte > ExtractRgb => new byte [ ]
44
+ {
45
+ 0 , 3 , 6 , 9 , 1 , 4 , 7 , 10 , 2 , 5 , 8 , 11 , 0xFF , 0xFF , 0xFF , 0xFF ,
46
+ 0 , 3 , 6 , 9 , 1 , 4 , 7 , 10 , 2 , 5 , 8 , 11 , 0xFF , 0xFF , 0xFF , 0xFF
47
+ } ;
48
+ #endif
102
49
103
- /// <summary>
104
- /// Converts the previously separated (see <see cref="SeparateRgb"/>) RGB values to YCbCr using AVX2 and FMA.
105
- /// </summary>
106
- [ MethodImpl ( InliningOptions . ShortMethod ) ]
107
- private static void ConvertInternal ( ReadOnlySpan < Rgb24 > rgbSpan , ref Block8x8F yBlock , ref Block8x8F cbBlock , ref Block8x8F crBlock )
50
+ public static void Convert ( ReadOnlySpan < Rgb24 > rgbSpan , ref Block8x8F yBlock , ref Block8x8F cbBlock , ref Block8x8F crBlock )
108
51
{
52
+ Debug . Assert ( IsSupported , "AVX2 is required to run this converter" ) ;
53
+
54
+ #if SUPPORTS_RUNTIME_INTRINSICS
109
55
var f0299 = Vector256 . Create ( 0.299f ) ;
110
56
var f0587 = Vector256 . Create ( 0.587f ) ;
111
57
var f0114 = Vector256 . Create ( 0.114f ) ;
@@ -115,68 +61,60 @@ private static void ConvertInternal(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F y
115
61
var fn0418688 = Vector256 . Create ( - 0.418688f ) ;
116
62
var fn0081312F = Vector256 . Create ( - 0.081312F ) ;
117
63
var f05 = Vector256 . Create ( 0.5f ) ;
64
+ var zero = Vector256 . Create ( 0 ) . AsByte ( ) ;
118
65
119
66
ref Vector256 < byte > inRef = ref Unsafe . As < Rgb24 , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( rgbSpan ) ) ;
120
-
121
- for ( int i = 0 ; i < 2 ; i ++ )
67
+ ref Vector256 < float > destYRef = ref Unsafe . As < Block8x8F , Vector256 < float > > ( ref yBlock ) ;
68
+ ref Vector256 < float > destCbRef = ref Unsafe . As < Block8x8F , Vector256 < float > > ( ref cbBlock ) ;
69
+ ref Vector256 < float > destCrRef = ref Unsafe . As < Block8x8F , Vector256 < float > > ( ref crBlock ) ;
70
+
71
+ var extractToLanesMask = Unsafe . As < byte , Vector256 < uint > > ( ref MemoryMarshal . GetReference ( MoveFirst24BytesToSeparateLanes ) ) ;
72
+ var extractRgbMask = Unsafe . As < byte , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( ExtractRgb ) ) ;
73
+ Vector256 < byte > rgb , rg , bx ;
74
+ Vector256 < float > r , g , b ;
75
+ for ( int i = 0 ; i < 7 ; i ++ )
122
76
{
123
- ref Vector256 < float > destYRef = ref Unsafe . Add ( ref Unsafe . As < Block8x8F , Vector256 < float > > ( ref yBlock ) , i * 4 ) ;
124
- ref Vector256 < float > destCbRef = ref Unsafe . Add ( ref Unsafe . As < Block8x8F , Vector256 < float > > ( ref cbBlock ) , i * 4 ) ;
125
- ref Vector256 < float > destCrRef = ref Unsafe . Add ( ref Unsafe . As < Block8x8F , Vector256 < float > > ( ref crBlock ) , i * 4 ) ;
126
-
127
- Vector256 < byte > red = Unsafe . Add ( ref inRef , i * 3 ) ;
128
- Vector256 < byte > green = Unsafe . Add ( ref inRef , ( i * 3 ) + 1 ) ;
129
- Vector256 < byte > blue = Unsafe . Add ( ref inRef , ( i * 3 ) + 2 ) ;
77
+ rgb = Avx2 . PermuteVar8x32 ( Unsafe . AddByteOffset ( ref inRef , ( IntPtr ) ( 24 * i ) ) . AsUInt32 ( ) , extractToLanesMask ) . AsByte ( ) ;
130
78
131
- for ( int j = 0 ; j < 2 ; j ++ )
132
- {
133
- // 1st part of unrolled loop
134
- Vector256 < byte > mask = Unsafe . Add ( ref Unsafe . As < byte , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( ExtractionMasks ) ) , j * 2 ) ;
79
+ rgb = Avx2 . Shuffle ( rgb , extractRgbMask ) ;
135
80
136
- Vector256 < float > r = Avx . ConvertToVector256Single ( Avx2 . Shuffle ( red , mask ) . AsInt32 ( ) ) ;
137
- Vector256 < float > g = Avx . ConvertToVector256Single ( Avx2 . Shuffle ( green , mask ) . AsInt32 ( ) ) ;
138
- Vector256 < float > b = Avx . ConvertToVector256Single ( Avx2 . Shuffle ( blue , mask ) . AsInt32 ( ) ) ;
81
+ rg = Avx2 . UnpackLow ( rgb , zero ) ;
82
+ bx = Avx2 . UnpackHigh ( rgb , zero ) ;
139
83
140
- // (0.299F * r) + (0.587F * g) + (0.114F * b);
141
- Vector256 < float > yy0 = Fma . MultiplyAdd ( f0299 , r , Fma . MultiplyAdd ( f0587 , g , Avx . Multiply ( f0114 , b ) ) ) ;
84
+ r = Avx . ConvertToVector256Single ( Avx2 . UnpackLow ( rg , zero ) . AsInt32 ( ) ) ;
85
+ g = Avx . ConvertToVector256Single ( Avx2 . UnpackHigh ( rg , zero ) . AsInt32 ( ) ) ;
86
+ b = Avx . ConvertToVector256Single ( Avx2 . UnpackLow ( bx , zero ) . AsInt32 ( ) ) ;
142
87
143
- // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
144
- Vector256 < float > cb0 = Avx . Add ( f128 , Fma . MultiplyAdd ( fn0168736 , r , Fma . MultiplyAdd ( fn0331264 , g , Avx . Multiply ( f05 , b ) ) ) ) ;
88
+ // (0.299F * r) + (0.587F * g) + (0.114F * b);
89
+ Unsafe . Add ( ref destYRef , i ) = SimdUtils . HwIntrinsics . MultiplyAdd ( SimdUtils . HwIntrinsics . MultiplyAdd ( Avx . Multiply ( f0114 , b ) , f0587 , g ) , f0299 , r ) ;
145
90
146
- // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
147
- Vector256 < float > cr0 = Avx . Add ( f128 , Fma . MultiplyAdd ( f05 , r , Fma . MultiplyAdd ( fn0418688 , g , Avx . Multiply ( fn0081312F , b ) ) ) ) ;
91
+ // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
92
+ Unsafe . Add ( ref destCbRef , i ) = Avx. Add ( f128 , SimdUtils . HwIntrinsics . MultiplyAdd ( SimdUtils . HwIntrinsics . MultiplyAdd ( Avx . Multiply ( f05 , b ) , fn0331264 , g ) , fn0168736 , r ) ) ;
148
93
149
- // 2nd part of unrolled loop
150
- mask = Unsafe . Add ( ref Unsafe . As < byte , Vector256 < byte > > ( ref MemoryMarshal . GetReference ( ExtractionMasks ) ) , ( j * 2 ) + 1 ) ;
151
-
152
- r = Avx . ConvertToVector256Single ( Avx2 . Shuffle ( red , mask ) . AsInt32 ( ) ) ;
153
- g = Avx . ConvertToVector256Single ( Avx2 . Shuffle ( green , mask ) . AsInt32 ( ) ) ;
154
- b = Avx . ConvertToVector256Single ( Avx2 . Shuffle ( blue , mask ) . AsInt32 ( ) ) ;
94
+ // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
95
+ Unsafe . Add ( ref destCrRef , i ) = Avx. Add ( f128 , SimdUtils . HwIntrinsics . MultiplyAdd ( SimdUtils . HwIntrinsics . MultiplyAdd ( Avx . Multiply ( fn0081312F , b ) , fn0418688 , g ) , f05 , r ) ) ;
96
+ }
155
97
156
- // (0.299F * r) + (0.587F * g) + (0.114F * b);
157
- Vector256 < float > yy1 = Fma . MultiplyAdd ( f0299 , r , Fma . MultiplyAdd ( f0587 , g , Avx . Multiply ( f0114 , b ) ) ) ;
98
+ extractToLanesMask = Unsafe . As < byte , Vector256 < uint > > ( ref MemoryMarshal . GetReference ( MoveLast24BytesToSeparateLanes ) ) ;
99
+ rgb = Avx2 . PermuteVar8x32 ( Unsafe . AddByteOffset ( ref inRef , ( IntPtr ) 160 ) . AsUInt32 ( ) , extractToLanesMask ) . AsByte ( ) ;
100
+ rgb = Avx2 . Shuffle ( rgb , extractRgbMask ) ;
158
101
159
- // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
160
- Vector256 < float > cb1 = Avx . Add ( f128 , Fma . MultiplyAdd ( fn0168736 , r , Fma . MultiplyAdd ( fn0331264 , g , Avx . Multiply ( f05 , b ) ) ) ) ;
102
+ rg = Avx2 . UnpackLow ( rgb , zero ) ;
103
+ bx = Avx2 . UnpackHigh ( rgb , zero ) ;
161
104
162
- // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
163
- Vector256 < float > cr1 = Avx . Add ( f128 , Fma . MultiplyAdd ( f05 , r , Fma . MultiplyAdd ( fn0418688 , g , Avx . Multiply ( fn0081312F , b ) ) ) ) ;
105
+ r = Avx . ConvertToVector256Single ( Avx2 . UnpackLow ( rg , zero ) . AsInt32 ( ) ) ;
106
+ g = Avx . ConvertToVector256Single ( Avx2 . UnpackHigh ( rg , zero ) . AsInt32 ( ) ) ;
107
+ b = Avx . ConvertToVector256Single ( Avx2 . UnpackLow ( bx , zero ) . AsInt32 ( ) ) ;
164
108
165
- // store results from 1st and 2nd part
166
- Vector256 < float > tmpY = Avx . Permute2x128 ( yy0 , yy1 , 0b0010_0001 ) ;
167
- Unsafe . Add ( ref destYRef , j ) = Avx. Blend ( yy0 , tmpY , 0b1111_0000 ) ;
168
- Unsafe . Add ( ref destYRef , j + 2 ) = Avx. Blend ( yy1 , tmpY , 0b0000_1111 ) ;
109
+ // (0.299F * r) + (0.587F * g) + (0.114F * b);
110
+ Unsafe . Add ( ref destYRef , 7 ) = SimdUtils. HwIntrinsics . MultiplyAdd ( SimdUtils . HwIntrinsics . MultiplyAdd ( Avx . Multiply ( f0114 , b ) , f0587 , g ) , f0299 , r ) ;
169
111
170
- Vector256 < float > tmpCb = Avx . Permute2x128 ( cb0 , cb1 , 0b0010_0001 ) ;
171
- Unsafe . Add ( ref destCbRef , j ) = Avx. Blend ( cb0 , tmpCb , 0b1111_0000 ) ;
172
- Unsafe . Add ( ref destCbRef , j + 2 ) = Avx. Blend ( cb1 , tmpCb , 0b0000_1111 ) ;
112
+ // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
113
+ Unsafe . Add ( ref destCbRef , 7 ) = Avx. Add ( f128 , SimdUtils . HwIntrinsics . MultiplyAdd ( SimdUtils . HwIntrinsics . MultiplyAdd ( Avx . Multiply ( f05 , b ) , fn0331264 , g ) , fn0168736 , r ) ) ;
173
114
174
- Vector256 < float > tmpCr = Avx . Permute2x128 ( cr0 , cr1 , 0b0010_0001 ) ;
175
- Unsafe . Add ( ref destCrRef , j ) = Avx. Blend ( cr0 , tmpCr , 0b1111_0000 ) ;
176
- Unsafe . Add ( ref destCrRef , j + 2 ) = Avx. Blend ( cr1 , tmpCr , 0b0000_1111 ) ;
177
- }
178
- }
179
- }
115
+ // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
116
+ Unsafe . Add ( ref destCrRef , 7 ) = Avx. Add ( f128 , SimdUtils . HwIntrinsics . MultiplyAdd ( SimdUtils . HwIntrinsics . MultiplyAdd ( Avx . Multiply ( fn0081312F , b ) , fn0418688 , g ) , f05 , r ) ) ;
180
117
#endif
118
+ }
181
119
}
182
120
}
0 commit comments