// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.

| 4 | +using System; |
| 5 | +using System.Diagnostics; |
| 6 | +#if SUPPORTS_RUNTIME_INTRINSICS |
| 7 | +using System.Runtime.CompilerServices; |
| 8 | +using System.Runtime.InteropServices; |
| 9 | +using System.Runtime.Intrinsics; |
| 10 | +using System.Runtime.Intrinsics.X86; |
| 11 | +#endif |
| 12 | +using SixLabors.ImageSharp.PixelFormats; |
| 13 | + |
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
{
    /// <summary>
    /// Converts a run of <see cref="Rgb24"/> pixels into separate Y, Cb and Cr
    /// <see cref="Block8x8F"/> blocks using AVX2 hardware intrinsics.
    /// </summary>
    internal static class RgbToYCbCrConverterVectorized
    {
        /// <summary>
        /// Gets a value indicating whether <see cref="Convert"/> can be used:
        /// <see langword="true"/> only when the assembly was compiled with
        /// <c>SUPPORTS_RUNTIME_INTRINSICS</c> and the CPU reports AVX2 support.
        /// </summary>
        public static bool IsSupported
        {
            get
            {
#if SUPPORTS_RUNTIME_INTRINSICS
                return Avx2.IsSupported;
#else
                return false;
#endif
            }
        }

#if SUPPORTS_RUNTIME_INTRINSICS
        // Control vector for Avx2.PermuteVar8x32, stored as eight little-endian 32-bit indices
        // (0, 1, 2, 6 | 3, 4, 5, 7). Applied to a 32-byte load it places bytes 0..11 in the lower
        // 128-bit lane and bytes 12..23 in the upper lane; the fourth element of each lane
        // receives leftover data that the ExtractRgb shuffle never selects.
        private static ReadOnlySpan<byte> MoveFirst24BytesToSeparateLanes => new byte[]
        {
            0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0,
            3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0
        };

        // Same idea as above, but for the case where the wanted 24 bytes are the LAST 24 bytes of
        // the 32-byte load (indices 2, 3, 4, 0 | 5, 6, 7, 1): bytes 8..19 go to the lower lane and
        // bytes 20..31 go to the upper lane.
        private static ReadOnlySpan<byte> MoveLast24BytesToSeparateLanes => new byte[]
        {
            2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0,
            5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 1, 0, 0, 0
        };

        // Control vector for Avx2.Shuffle (per 128-bit lane): gathers every third byte so that the
        // 12 interleaved bytes of a lane become [c0 c0 c0 c0 | c1 c1 c1 c1 | c2 c2 c2 c2 | 0 0 0 0],
        // i.e. the three channels de-interleaved. 0xFF selects zero for the unused last four bytes.
        private static ReadOnlySpan<byte> ExtractRgb => new byte[]
        {
            0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF,
            0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF
        };
#endif

        /// <summary>
        /// Converts pixels from <paramref name="rgbSpan"/> to YCbCr and writes eight
        /// <c>Vector256&lt;float&gt;</c> rows into each destination block.
        /// </summary>
        /// <param name="rgbSpan">
        /// Source pixels. The first 192 bytes of the span are read through unchecked
        /// <see cref="Unsafe"/> loads, so it must hold at least 64 pixels
        /// (assuming the 3-byte R, G, B layout that the shuffle masks are written for).
        /// </param>
        /// <param name="yBlock">Destination block receiving the luminance (Y) values.</param>
        /// <param name="cbBlock">Destination block receiving the Cb values.</param>
        /// <param name="crBlock">Destination block receiving the Cr values.</param>
        /// <remarks>
        /// Callers must check <see cref="IsSupported"/> first. When the assembly is built without
        /// <c>SUPPORTS_RUNTIME_INTRINSICS</c> this method performs no conversion.
        /// </remarks>
        public static void Convert(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
        {
            Debug.Assert(IsSupported, "AVX2 is required to run this converter");

            // The loads below are unchecked; catch short inputs in debug builds.
            Debug.Assert(rgbSpan.Length >= 64, "At least 64 pixels are required: 192 bytes are read from rgbSpan");

#if SUPPORTS_RUNTIME_INTRINSICS
            // Conversion coefficients, broadcast once outside the loop.
            var f0299 = Vector256.Create(0.299f);
            var f0587 = Vector256.Create(0.587f);
            var f0114 = Vector256.Create(0.114f);
            var fn0168736 = Vector256.Create(-0.168736f);
            var fn0331264 = Vector256.Create(-0.331264f);
            var f128 = Vector256.Create(128f);
            var fn0418688 = Vector256.Create(-0.418688f);
            var fn0081312F = Vector256.Create(-0.081312F);
            var f05 = Vector256.Create(0.5f);
            Vector256<byte> zero = Vector256<byte>.Zero;

            // Reinterpret the source pixels and the destination blocks as 256-bit vectors.
            ref Vector256<byte> inRef = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
            ref Vector256<float> destYRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref yBlock);
            ref Vector256<float> destCbRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref cbBlock);
            ref Vector256<float> destCrRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref crBlock);

            var extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes));
            var extractRgbMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractRgb));
            Vector256<byte> rgb, rg, bx;
            Vector256<float> r, g, b;

            // Rows 0..6: each iteration loads 32 bytes starting at byte 24 * i and uses the first
            // 24 of them (8 pixels). The last such load ends at byte 176, inside the 192-byte input.
            for (int i = 0; i < 7; i++)
            {
                // Split the 24 payload bytes into 12 bytes per 128-bit lane.
                rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)(24 * i)).AsUInt32(), extractToLanesMask).AsByte();

                // De-interleave the channels within each lane.
                rgb = Avx2.Shuffle(rgb, extractRgbMask);

                // Interleaving with zero twice widens every byte to a 32-bit integer.
                rg = Avx2.UnpackLow(rgb, zero);
                bx = Avx2.UnpackHigh(rgb, zero);

                r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
                g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
                b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());

                // (0.299F * r) + (0.587F * g) + (0.114F * b);
                Unsafe.Add(ref destYRef, i) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);

                // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
                Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));

                // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
                Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
            }

            // Row 7 is handled separately: its pixels start at byte 168, but a 32-byte load from
            // there would end at byte 200. Loading from byte 160 instead ends exactly at byte 192
            // (presumably to avoid reading past the 64th pixel); the wanted data is then the LAST
            // 24 bytes of the vector, hence the different permute mask.
            extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveLast24BytesToSeparateLanes));
            rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)160).AsUInt32(), extractToLanesMask).AsByte();
            rgb = Avx2.Shuffle(rgb, extractRgbMask);

            rg = Avx2.UnpackLow(rgb, zero);
            bx = Avx2.UnpackHigh(rgb, zero);

            r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
            g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
            b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());

            // (0.299F * r) + (0.587F * g) + (0.114F * b);
            Unsafe.Add(ref destYRef, 7) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);

            // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
            Unsafe.Add(ref destCbRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));

            // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
            Unsafe.Add(ref destCrRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
#endif
        }
    }
}
0 commit comments