Skip to content

Commit 91b18b1

Browse files
committed
Improve algorithm
1 parent 47b0d9f commit 91b18b1

File tree

2 files changed

+61
-127
lines changed

2 files changed

+61
-127
lines changed

src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs

Lines changed: 59 additions & 121 deletions
Original file line numberDiff line numberDiff line change
@@ -15,97 +15,43 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
1515
{
1616
internal static class RgbToYCbCrConverterVectorized
1717
{
18-
private static ReadOnlySpan<byte> ExtractionMasks => new byte[]
19-
{
20-
0x0, 0xFF, 0xFF, 0xFF, 0x1, 0xFF, 0xFF, 0xFF, 0x2, 0xFF, 0xFF, 0xFF, 0x3, 0xFF, 0xFF, 0xFF, 0x10, 0xFF, 0xFF, 0xFF, 0x11, 0xFF, 0xFF, 0xFF, 0x12, 0xFF, 0xFF, 0xFF, 0x13, 0xFF, 0xFF, 0xFF,
21-
0x4, 0xFF, 0xFF, 0xFF, 0x5, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF, 0x14, 0xFF, 0xFF, 0xFF, 0x15, 0xFF, 0xFF, 0xFF, 0x16, 0xFF, 0xFF, 0xFF, 0x17, 0xFF, 0xFF, 0xFF,
22-
0x8, 0xFF, 0xFF, 0xFF, 0x9, 0xFF, 0xFF, 0xFF, 0xA, 0xFF, 0xFF, 0xFF, 0xB, 0xFF, 0xFF, 0xFF, 0x18, 0xFF, 0xFF, 0xFF, 0x19, 0xFF, 0xFF, 0xFF, 0x1A, 0xFF, 0xFF, 0xFF, 0x1B, 0xFF, 0xFF, 0xFF,
23-
0xC, 0xFF, 0xFF, 0xFF, 0xD, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF, 0x1C, 0xFF, 0xFF, 0xFF, 0x1D, 0xFF, 0xFF, 0xFF, 0x1E, 0xFF, 0xFF, 0xFF, 0x1F, 0xFF, 0xFF, 0xFF,
24-
};
25-
2618
public static bool IsSupported
2719
{
2820
get
2921
{
3022
#if SUPPORTS_RUNTIME_INTRINSICS
31-
return Avx2.IsSupported && Fma.IsSupported;
23+
return Avx2.IsSupported;
3224
#else
3325
return false;
3426
#endif
3527
}
3628
}
3729

38-
public static void Convert(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
39-
{
40-
Debug.Assert(IsSupported, "AVX2 and FMA are required to run this converter");
41-
4230
#if SUPPORTS_RUNTIME_INTRINSICS
43-
SeparateRgb(rgbSpan);
44-
ConvertInternal(rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
45-
#endif
46-
}
47-
48-
#if SUPPORTS_RUNTIME_INTRINSICS
49-
/// <summary>
50-
/// Rearranges the provided <paramref name="rgbSpan"/> in-place
51-
/// from { r00, g00, b00, ..., r63, g63, b63 }
52-
/// to { r00, ... r31, g00, ..., g31, b00, ..., b31,
53-
/// r32, ... r63, g32, ..., g63, b32, ..., b63 }
54-
/// </summary>
55-
/// <remarks>
56-
/// SSE is used for this operation as it is significantly faster than AVX in this specific case.
57-
/// Solving this problem with AVX requires too many instructions that cross the 128-bit lanes of YMM registers.
58-
/// </remarks>
59-
[MethodImpl(InliningOptions.ShortMethod)]
60-
private static void SeparateRgb(ReadOnlySpan<Rgb24> rgbSpan)
31+
private static ReadOnlySpan<byte> MoveFirst24BytesToSeparateLanes => new byte[]
6132
{
62-
var selectRed0 = Vector128.Create(0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
63-
var selectRed1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x02, 0x05, 0x08, 0x0B, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
64-
var selectRed2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x04, 0x07, 0x0A, 0x0D);
65-
66-
var selectGreen0 = Vector128.Create(0x01, 0x04, 0x07, 0x0A, 0x0D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
67-
var selectGreen1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
68-
var selectGreen2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x02, 0x05, 0x08, 0x0B, 0x0E);
33+
0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0,
34+
3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0
35+
};
6936

70-
var selectBlue0 = Vector128.Create(0x02, 0x05, 0x08, 0x0B, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
71-
var selectBlue1 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x04, 0x07, 0x0A, 0x0D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
72-
var selectBlue2 = Vector128.Create(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F);
37+
private static ReadOnlySpan<byte> MoveLast24BytesToSeparateLanes => new byte[]
38+
{
39+
2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0,
40+
5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 1, 0, 0, 0
41+
};
7342

74-
for (int i = 0; i < 2; i++)
75-
{
76-
ref Vector128<byte> inRef = ref Unsafe.Add(ref Unsafe.As<Rgb24, Vector128<byte>>(ref MemoryMarshal.GetReference(rgbSpan)), i * 6);
77-
78-
Vector128<byte> in0 = inRef;
79-
Vector128<byte> in1 = Unsafe.Add(ref inRef, 1);
80-
Vector128<byte> in2 = Unsafe.Add(ref inRef, 2);
81-
82-
Vector128<byte> r0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectRed0), Ssse3.Shuffle(in1, selectRed1)), Ssse3.Shuffle(in2, selectRed2));
83-
Vector128<byte> g0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectGreen0), Ssse3.Shuffle(in1, selectGreen1)), Ssse3.Shuffle(in2, selectGreen2));
84-
Vector128<byte> b0 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectBlue0), Ssse3.Shuffle(in1, selectBlue1)), Ssse3.Shuffle(in2, selectBlue2));
85-
86-
in0 = Unsafe.Add(ref inRef, 3);
87-
in1 = Unsafe.Add(ref inRef, 4);
88-
in2 = Unsafe.Add(ref inRef, 5);
89-
90-
Vector128<byte> r1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectRed0), Ssse3.Shuffle(in1, selectRed1)), Ssse3.Shuffle(in2, selectRed2));
91-
Vector128<byte> g1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectGreen0), Ssse3.Shuffle(in1, selectGreen1)), Ssse3.Shuffle(in2, selectGreen2));
92-
Vector128<byte> b1 = Sse2.Or(Sse2.Or(Ssse3.Shuffle(in0, selectBlue0), Ssse3.Shuffle(in1, selectBlue1)), Ssse3.Shuffle(in2, selectBlue2));
93-
94-
inRef = r0;
95-
Unsafe.Add(ref inRef, 1) = r1;
96-
Unsafe.Add(ref inRef, 2) = g0;
97-
Unsafe.Add(ref inRef, 3) = g1;
98-
Unsafe.Add(ref inRef, 4) = b0;
99-
Unsafe.Add(ref inRef, 5) = b1;
100-
}
101-
}
43+
private static ReadOnlySpan<byte> ExtractRgb => new byte[]
44+
{
45+
0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF,
46+
0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF
47+
};
48+
#endif
10249

103-
/// <summary>
104-
/// Converts the previously separated (see <see cref="SeparateRgb"/>) RGB values to YCbCr using AVX2 and FMA.
105-
/// </summary>
106-
[MethodImpl(InliningOptions.ShortMethod)]
107-
private static void ConvertInternal(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
50+
public static void Convert(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
10851
{
52+
Debug.Assert(IsSupported, "AVX2 is required to run this converter");
53+
54+
#if SUPPORTS_RUNTIME_INTRINSICS
10955
var f0299 = Vector256.Create(0.299f);
11056
var f0587 = Vector256.Create(0.587f);
11157
var f0114 = Vector256.Create(0.114f);
@@ -115,68 +61,60 @@ private static void ConvertInternal(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F y
11561
var fn0418688 = Vector256.Create(-0.418688f);
11662
var fn0081312F = Vector256.Create(-0.081312F);
11763
var f05 = Vector256.Create(0.5f);
64+
var zero = Vector256.Create(0).AsByte();
11865

11966
ref Vector256<byte> inRef = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
120-
121-
for (int i = 0; i < 2; i++)
67+
ref Vector256<float> destYRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref yBlock);
68+
ref Vector256<float> destCbRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref cbBlock);
69+
ref Vector256<float> destCrRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref crBlock);
70+
71+
var extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes));
72+
var extractRgbMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractRgb));
73+
Vector256<byte> rgb, rg, bx;
74+
Vector256<float> r, g, b;
75+
for (int i = 0; i < 7; i++)
12276
{
123-
ref Vector256<float> destYRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref yBlock), i * 4);
124-
ref Vector256<float> destCbRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref cbBlock), i * 4);
125-
ref Vector256<float> destCrRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref crBlock), i * 4);
126-
127-
Vector256<byte> red = Unsafe.Add(ref inRef, i * 3);
128-
Vector256<byte> green = Unsafe.Add(ref inRef, (i * 3) + 1);
129-
Vector256<byte> blue = Unsafe.Add(ref inRef, (i * 3) + 2);
77+
rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)(24 * i)).AsUInt32(), extractToLanesMask).AsByte();
13078

131-
for (int j = 0; j < 2; j++)
132-
{
133-
// 1st part of unrolled loop
134-
Vector256<byte> mask = Unsafe.Add(ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractionMasks)), j * 2);
79+
rgb = Avx2.Shuffle(rgb, extractRgbMask);
13580

136-
Vector256<float> r = Avx.ConvertToVector256Single(Avx2.Shuffle(red, mask).AsInt32());
137-
Vector256<float> g = Avx.ConvertToVector256Single(Avx2.Shuffle(green, mask).AsInt32());
138-
Vector256<float> b = Avx.ConvertToVector256Single(Avx2.Shuffle(blue, mask).AsInt32());
81+
rg = Avx2.UnpackLow(rgb, zero);
82+
bx = Avx2.UnpackHigh(rgb, zero);
13983

140-
// (0.299F * r) + (0.587F * g) + (0.114F * b);
141-
Vector256<float> yy0 = Fma.MultiplyAdd(f0299, r, Fma.MultiplyAdd(f0587, g, Avx.Multiply(f0114, b)));
84+
r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
85+
g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
86+
b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());
14287

143-
// 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
144-
Vector256<float> cb0 = Avx.Add(f128, Fma.MultiplyAdd(fn0168736, r, Fma.MultiplyAdd(fn0331264, g, Avx.Multiply(f05, b))));
88+
// (0.299F * r) + (0.587F * g) + (0.114F * b);
89+
Unsafe.Add(ref destYRef, i) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
14590

146-
// 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
147-
Vector256<float> cr0 = Avx.Add(f128, Fma.MultiplyAdd(f05, r, Fma.MultiplyAdd(fn0418688, g, Avx.Multiply(fn0081312F, b))));
91+
// 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
92+
Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
14893

149-
// 2nd part of unrolled loop
150-
mask = Unsafe.Add(ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractionMasks)), (j * 2) + 1);
151-
152-
r = Avx.ConvertToVector256Single(Avx2.Shuffle(red, mask).AsInt32());
153-
g = Avx.ConvertToVector256Single(Avx2.Shuffle(green, mask).AsInt32());
154-
b = Avx.ConvertToVector256Single(Avx2.Shuffle(blue, mask).AsInt32());
94+
// 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
95+
Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
96+
}
15597

156-
// (0.299F * r) + (0.587F * g) + (0.114F * b);
157-
Vector256<float> yy1 = Fma.MultiplyAdd(f0299, r, Fma.MultiplyAdd(f0587, g, Avx.Multiply(f0114, b)));
98+
extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveLast24BytesToSeparateLanes));
99+
rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)160).AsUInt32(), extractToLanesMask).AsByte();
100+
rgb = Avx2.Shuffle(rgb, extractRgbMask);
158101

159-
// 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
160-
Vector256<float> cb1 = Avx.Add(f128, Fma.MultiplyAdd(fn0168736, r, Fma.MultiplyAdd(fn0331264, g, Avx.Multiply(f05, b))));
102+
rg = Avx2.UnpackLow(rgb, zero);
103+
bx = Avx2.UnpackHigh(rgb, zero);
161104

162-
// 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
163-
Vector256<float> cr1 = Avx.Add(f128, Fma.MultiplyAdd(f05, r, Fma.MultiplyAdd(fn0418688, g, Avx.Multiply(fn0081312F, b))));
105+
r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
106+
g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
107+
b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());
164108

165-
// store results from 1st and 2nd part
166-
Vector256<float> tmpY = Avx.Permute2x128(yy0, yy1, 0b0010_0001);
167-
Unsafe.Add(ref destYRef, j) = Avx.Blend(yy0, tmpY, 0b1111_0000);
168-
Unsafe.Add(ref destYRef, j + 2) = Avx.Blend(yy1, tmpY, 0b0000_1111);
109+
// (0.299F * r) + (0.587F * g) + (0.114F * b);
110+
Unsafe.Add(ref destYRef, 7) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
169111

170-
Vector256<float> tmpCb = Avx.Permute2x128(cb0, cb1, 0b0010_0001);
171-
Unsafe.Add(ref destCbRef, j) = Avx.Blend(cb0, tmpCb, 0b1111_0000);
172-
Unsafe.Add(ref destCbRef, j + 2) = Avx.Blend(cb1, tmpCb, 0b0000_1111);
112+
// 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
113+
Unsafe.Add(ref destCbRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
173114

174-
Vector256<float> tmpCr = Avx.Permute2x128(cr0, cr1, 0b0010_0001);
175-
Unsafe.Add(ref destCrRef, j) = Avx.Blend(cr0, tmpCr, 0b1111_0000);
176-
Unsafe.Add(ref destCrRef, j + 2) = Avx.Blend(cr1, tmpCr, 0b0000_1111);
177-
}
178-
}
179-
}
115+
// 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
116+
Unsafe.Add(ref destCrRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
180117
#endif
118+
}
181119
}
182120
}

tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,17 +48,13 @@ public void TestVectorizedConverter()
4848

4949
Rgb24[] data = CreateTestData();
5050

51-
// RgbToYCbCrConverterVectorized uses `data` as working memory so we need a copy for verification below
52-
Rgb24[] dataCopy = new Rgb24[data.Length];
53-
data.CopyTo(dataCopy, 0);
54-
5551
Block8x8F y = default;
5652
Block8x8F cb = default;
5753
Block8x8F cr = default;
5854

5955
RgbToYCbCrConverterVectorized.Convert(data.AsSpan(), ref y, ref cb, ref cr);
6056

61-
Verify(dataCopy, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(0.0001F));
57+
Verify(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(0.0001F));
6258
}
6359

6460
private static void Verify(ReadOnlySpan<Rgb24> data, ref Block8x8F yResult, ref Block8x8F cbResult, ref Block8x8F crResult, ApproximateColorSpaceComparer comparer)
@@ -73,7 +69,7 @@ private static void Verify(ReadOnlySpan<Rgb24> data, ref Block8x8F yResult, ref
7369
float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
7470
float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b));
7571

76-
Assert.Equal(new YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i]), comparer);
72+
Assert.True(comparer.Equals(new YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i])), $"Pos {i}, Expected {y} == {yResult[i]}, {cb} == {cbResult[i]}, {cr} == {crResult[i]}");
7773
}
7874
}
7975

0 commit comments

Comments
 (0)