Skip to content

Commit eab04e4

Browse files
Merge pull request #1508 from tkp1n/feature/vectorize-rgb2ycbcr-conversion
Vectorize Jpeg Encoder Color Conversion
2 parents e2961dc + 91b18b1 commit eab04e4

File tree

5 files changed

+307
-23
lines changed

5 files changed

+307
-23
lines changed

src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrTables.cs renamed to src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
11
// Copyright (c) Six Labors.
22
// Licensed under the Apache License, Version 2.0.
33

4+
using System;
45
using System.Runtime.CompilerServices;
6+
using SixLabors.ImageSharp.PixelFormats;
57

68
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
79
{
810
/// <summary>
911
/// Provides 8-bit lookup tables for converting from Rgb to YCbCr colorspace.
1012
/// Methods to build the tables are based on the libjpeg implementation.
11-
/// TODO: Replace this logic with SIMD conversion (similar to the one in the decoder)!
1213
/// </summary>
13-
internal unsafe struct RgbToYCbCrTables
14+
internal unsafe struct RgbToYCbCrConverterLut
1415
{
1516
/// <summary>
1617
/// The red luminance table
@@ -63,10 +64,10 @@ internal unsafe struct RgbToYCbCrTables
6364
/// <summary>
6465
/// Initializes the YCbCr tables
6566
/// </summary>
66-
/// <returns>The initialized <see cref="RgbToYCbCrTables"/></returns>
67-
public static RgbToYCbCrTables Create()
67+
/// <returns>The initialized <see cref="RgbToYCbCrConverterLut"/></returns>
68+
public static RgbToYCbCrConverterLut Create()
6869
{
69-
RgbToYCbCrTables tables = default;
70+
RgbToYCbCrConverterLut tables = default;
7071

7172
for (int i = 0; i <= 255; i++)
7273
{
@@ -92,11 +93,10 @@ public static RgbToYCbCrTables Create()
9293
}
9394

9495
/// <summary>
95-
/// TODO: Replace this logic with SIMD conversion (similar to the one in the decoder)!
9696
/// Optimized method that assigns the correct y, cb, and cr values to the DCT blocks from the given r, g, b values.
9797
/// </summary>
9898
[MethodImpl(MethodImplOptions.AggressiveInlining)]
99-
public void ConvertPixelInto(
99+
private void ConvertPixelInto(
100100
int r,
101101
int g,
102102
int b,
@@ -111,10 +111,29 @@ public void ConvertPixelInto(
111111
// float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
112112
cbResult[i] = (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits;
113113

114-
// float cr = MathF.Round(y + (1.772F * cb), MidpointRounding.AwayFromZero);
114+
// float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b));
115115
crResult[i] = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits;
116116
}
117117

118+
public void Convert(Span<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
119+
{
120+
ref Rgb24 rgbStart = ref rgbSpan[0];
121+
122+
for (int i = 0; i < 64; i++)
123+
{
124+
ref Rgb24 c = ref Unsafe.Add(ref rgbStart, i);
125+
126+
this.ConvertPixelInto(
127+
c.R,
128+
c.G,
129+
c.B,
130+
ref yBlock,
131+
ref cbBlock,
132+
ref crBlock,
133+
i);
134+
}
135+
}
136+
118137
[MethodImpl(MethodImplOptions.AggressiveInlining)]
119138
private static int Fix(float x)
120139
=> (int)((x * (1L << ScaleBits)) + 0.5F);
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
// Copyright (c) Six Labors.
2+
// Licensed under the Apache License, Version 2.0.
3+
4+
using System;
5+
using System.Diagnostics;
6+
#if SUPPORTS_RUNTIME_INTRINSICS
7+
using System.Runtime.CompilerServices;
8+
using System.Runtime.InteropServices;
9+
using System.Runtime.Intrinsics;
10+
using System.Runtime.Intrinsics.X86;
11+
#endif
12+
using SixLabors.ImageSharp.PixelFormats;
13+
14+
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
15+
{
16+
/// <summary>
/// Converts a block of 64 <see cref="Rgb24"/> pixels to YCbCr using AVX2 intrinsics.
/// </summary>
internal static class RgbToYCbCrConverterVectorized
{
    /// <summary>
    /// Gets a value indicating whether this converter can be used.
    /// This is <see langword="true"/> only when the library was compiled with
    /// <c>SUPPORTS_RUNTIME_INTRINSICS</c> and the current CPU reports AVX2 support.
    /// </summary>
    public static bool IsSupported
    {
        get
        {
#if SUPPORTS_RUNTIME_INTRINSICS
            return Avx2.IsSupported;
#else
            return false;
#endif
        }
    }

#if SUPPORTS_RUNTIME_INTRINSICS
    // Permutation indices (eight little-endian 32-bit values: 0, 1, 2, 6 | 3, 4, 5, 7).
    // Moves bytes 0..11 of a 32-byte load into the lower 128-bit lane and
    // bytes 12..23 into the upper lane, i.e. four RGB pixels per lane.
    private static ReadOnlySpan<byte> MoveFirst24BytesToSeparateLanes => new byte[]
    {
        0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0,
        3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0
    };

    // Permutation indices (2, 3, 4, 0 | 5, 6, 7, 1).
    // Moves bytes 8..19 of a 32-byte load into the lower 128-bit lane and
    // bytes 20..31 into the upper lane, i.e. the LAST 24 bytes of the load.
    private static ReadOnlySpan<byte> MoveLast24BytesToSeparateLanes => new byte[]
    {
        2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0,
        5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 1, 0, 0, 0
    };

    // Per-lane byte shuffle that turns four interleaved pixels (RGBRGBRGBRGB)
    // into planar form (RRRR GGGG BBBB); the 0xFF entries zero the remaining four bytes.
    private static ReadOnlySpan<byte> ExtractRgb => new byte[]
    {
        0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF,
        0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF
    };
#endif

    /// <summary>
    /// Converts 64 RGB pixels to YCbCr, eight pixels per step, and stores the results
    /// as eight consecutive <c>Vector256&lt;float&gt;</c> values in each destination block.
    /// When compiled without <c>SUPPORTS_RUNTIME_INTRINSICS</c> this method writes nothing.
    /// </summary>
    /// <param name="rgbSpan">The source pixels. Exactly the first 64 pixels (192 bytes) are read; the span must not be shorter.</param>
    /// <param name="yBlock">The destination block for the luminance values.</param>
    /// <param name="cbBlock">The destination block for the blue chroma values.</param>
    /// <param name="crBlock">The destination block for the red chroma values.</param>
    public static void Convert(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
    {
        Debug.Assert(IsSupported, "AVX2 is required to run this converter");

        // The 32-byte loads below are unchecked and together cover bytes 0..191 of the input.
        // Catch short inputs in debug builds without adding cost to the release hot path.
        Debug.Assert(rgbSpan.Length >= 64, "rgbSpan must contain at least 64 pixels");

#if SUPPORTS_RUNTIME_INTRINSICS
        var f0299 = Vector256.Create(0.299f);
        var f0587 = Vector256.Create(0.587f);
        var f0114 = Vector256.Create(0.114f);
        var fn0168736 = Vector256.Create(-0.168736f);
        var fn0331264 = Vector256.Create(-0.331264f);
        var f128 = Vector256.Create(128f);
        var fn0418688 = Vector256.Create(-0.418688f);
        var fn0081312F = Vector256.Create(-0.081312F);
        var f05 = Vector256.Create(0.5f);
        var zero = Vector256.Create(0).AsByte();

        // NOTE(review): the destination blocks are reinterpreted as eight Vector256<float>
        // values each, which assumes Block8x8F is 64 contiguous floats - confirm against its definition.
        ref Vector256<byte> inRef = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
        ref Vector256<float> destYRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref yBlock);
        ref Vector256<float> destCbRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref cbBlock);
        ref Vector256<float> destCrRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref crBlock);

        var extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes));
        var extractRgbMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractRgb));
        Vector256<byte> rgb, rg, bx;
        Vector256<float> r, g, b;

        // Steps 0..6: load 32 bytes at byte offset 24 * i and use the first 24 of them (8 pixels).
        // The highest load ends at byte 176, so it stays inside the 192-byte input.
        for (int i = 0; i < 7; i++)
        {
            rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)(24 * i)).AsUInt32(), extractToLanesMask).AsByte();

            // Each lane becomes RRRR GGGG BBBB 0000.
            rgb = Avx2.Shuffle(rgb, extractRgbMask);

            // Interleaving with zero widens the bytes: first to 16 bit (rg / bx) ...
            rg = Avx2.UnpackLow(rgb, zero);
            bx = Avx2.UnpackHigh(rgb, zero);

            // ... then to 32 bit, ready for the int -> float conversion.
            r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
            g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
            b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());

            // NOTE(review): MultiplyAdd(a, x, y) is presumed to compute a + (x * y), matching the formulas below - confirm in SimdUtils.
            // (0.299F * r) + (0.587F * g) + (0.114F * b);
            Unsafe.Add(ref destYRef, i) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);

            // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
            Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));

            // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
            Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
        }

        // Step 7: a 32-byte load at offset 168 would run past byte 191, so load the final
        // 32 bytes (offset 160) instead and pick their LAST 24 bytes (pixels 56..63).
        extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveLast24BytesToSeparateLanes));
        rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)160).AsUInt32(), extractToLanesMask).AsByte();
        rgb = Avx2.Shuffle(rgb, extractRgbMask);

        rg = Avx2.UnpackLow(rgb, zero);
        bx = Avx2.UnpackHigh(rgb, zero);

        r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
        g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
        b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());

        // (0.299F * r) + (0.587F * g) + (0.114F * b);
        Unsafe.Add(ref destYRef, 7) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);

        // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
        Unsafe.Add(ref destCbRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));

        // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
        Unsafe.Add(ref destCrRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
#endif
    }
}
120+
}

src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
// Licensed under the Apache License, Version 2.0.
33

44
using System;
5-
using System.Runtime.CompilerServices;
65
using SixLabors.ImageSharp.Advanced;
76
using SixLabors.ImageSharp.PixelFormats;
87

@@ -33,7 +32,7 @@ internal ref struct YCbCrForwardConverter<TPixel>
3332
/// <summary>
3433
/// The color conversion tables
3534
/// </summary>
36-
private RgbToYCbCrTables colorTables;
35+
private RgbToYCbCrConverterLut colorTables;
3736

3837
/// <summary>
3938
/// Temporal 8x8 block to hold TPixel data
@@ -48,7 +47,12 @@ internal ref struct YCbCrForwardConverter<TPixel>
4847
public static YCbCrForwardConverter<TPixel> Create()
4948
{
5049
var result = default(YCbCrForwardConverter<TPixel>);
51-
result.colorTables = RgbToYCbCrTables.Create();
50+
if (!RgbToYCbCrConverterVectorized.IsSupported)
51+
{
52+
// Only build the lookup tables when the vectorized converter is unavailable; they are not needed otherwise.
53+
result.colorTables = RgbToYCbCrConverterLut.Create();
54+
}
55+
5256
return result;
5357
}
5458

@@ -65,20 +69,14 @@ public void Convert(ImageFrame<TPixel> frame, int x, int y, in RowOctet<TPixel>
6569
ref Block8x8F yBlock = ref this.Y;
6670
ref Block8x8F cbBlock = ref this.Cb;
6771
ref Block8x8F crBlock = ref this.Cr;
68-
ref Rgb24 rgbStart = ref rgbSpan[0];
6972

70-
for (int i = 0; i < 64; i++)
73+
if (RgbToYCbCrConverterVectorized.IsSupported)
7174
{
72-
ref Rgb24 c = ref Unsafe.Add(ref rgbStart, i);
73-
74-
this.colorTables.ConvertPixelInto(
75-
c.R,
76-
c.G,
77-
c.B,
78-
ref yBlock,
79-
ref cbBlock,
80-
ref crBlock,
81-
i);
75+
RgbToYCbCrConverterVectorized.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
76+
}
77+
else
78+
{
79+
this.colorTables.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
8280
}
8381
}
8482
}
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
// Copyright (c) Six Labors.
2+
// Licensed under the Apache License, Version 2.0.
3+
4+
using System;
5+
using BenchmarkDotNet.Attributes;
6+
using SixLabors.ImageSharp.Formats.Jpeg.Components;
7+
using SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder;
8+
using SixLabors.ImageSharp.PixelFormats;
9+
10+
namespace SixLabors.ImageSharp.Benchmarks.Format.Jpeg.Components.Encoder
11+
{
12+
/// <summary>
/// Compares the lookup-table and the vectorized RGB to YCbCr converters
/// on a single block of 64 pseudo-random pixels.
/// </summary>
public class YCbCrForwardConverterBenchmark
{
    private RgbToYCbCrConverterLut converter;
    private Rgb24[] data;

    /// <summary>
    /// Builds the lookup tables and the input pixels once for all benchmark runs.
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        this.converter = RgbToYCbCrConverterLut.Create();
        this.data = CreatePixels(64);
    }

    /// <summary>
    /// Baseline: converts the block with the lookup-table converter.
    /// </summary>
    [Benchmark(Baseline = true)]
    public void ConvertLut()
    {
        Block8x8F luma = default;
        Block8x8F chromaBlue = default;
        Block8x8F chromaRed = default;

        Span<Rgb24> pixels = this.data;
        this.converter.Convert(pixels, ref luma, ref chromaBlue, ref chromaRed);
    }

    /// <summary>
    /// Converts the block with the vectorized converter; does nothing when it is not supported.
    /// </summary>
    [Benchmark]
    public void ConvertVectorized()
    {
        Block8x8F luma = default;
        Block8x8F chromaBlue = default;
        Block8x8F chromaRed = default;

        if (!RgbToYCbCrConverterVectorized.IsSupported)
        {
            return;
        }

        Span<Rgb24> pixels = this.data;
        RgbToYCbCrConverterVectorized.Convert(pixels, ref luma, ref chromaBlue, ref chromaRed);
    }

    // Fills an array with pixels drawn three bytes at a time from a fixed-seed generator,
    // so every run benchmarks the same input.
    private static Rgb24[] CreatePixels(int count)
    {
        var random = new Random(42);
        var pixels = new Rgb24[count];
        var channels = new byte[3];

        int index = 0;
        while (index < pixels.Length)
        {
            random.NextBytes(channels);
            pixels[index] = new Rgb24(channels[0], channels[1], channels[2]);
            index++;
        }

        return pixels;
    }
}
56+
}

0 commit comments

Comments
 (0)