Skip to content

Commit b3a1229

Browse files
committed
Optimize the DYUV LUT algorithm and move it to the regular file decoder
1 parent ea8355f commit b3a1229

File tree

5 files changed

+95
-121
lines changed

5 files changed

+95
-121
lines changed

src/CDI/Video/VideoDecoders.cpp

Lines changed: 90 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -282,21 +282,21 @@ static constexpr void matrixRGB(Pixel* pixel, const int Y, const uint8_t U, cons
282282
/** \brief Decode a DYUV line to ARGB.
283283
* \tparam WIDTH The number of source pixels to decode.
284284
* \param dst Where the ARGB data will be written to.
285-
* \param data The source DYUV data.
285+
* \param dyuv The source DYUV data.
286286
* \param initialDYUV The initial value to be used by the DYUV decoder.
287-
* \return The number of raw bytes read from \p data.
287+
* \return The number of raw bytes read from \p dyuv.
288288
*/
289289
template<uint16_t WIDTH>
290-
uint16_t decodeDYUVLine(Pixel* dst, const uint8_t* data, uint32_t initialDYUV) noexcept
290+
uint16_t decodeDYUVLine(Pixel* dst, const uint8_t* dyuv, uint32_t initialDYUV) noexcept
291291
{
292292
uint8_t py = bits<16, 23>(initialDYUV);
293293
uint8_t pu = bits<8, 15>(initialDYUV);
294294
uint8_t pv = initialDYUV;
295295

296296
for(uint16_t index = 0; index < WIDTH; index += 2)
297297
{
298-
const uint8_t high = data[index];
299-
const uint8_t low = data[index + 1];
298+
const uint8_t high = dyuv[index];
299+
const uint8_t low = dyuv[index + 1];
300300

301301
// Green book V.4.4.2
302302
uint8_t u2 = bits<4, 7>(high);
@@ -338,6 +338,91 @@ template uint16_t decodeDYUVLine<384>(Pixel* dst, const uint8_t* data, uint32_t
338338
template uint16_t decodeDYUVLine<720>(Pixel* dst, const uint8_t* data, uint32_t initialDYUV) noexcept;
339339
template uint16_t decodeDYUVLine<768>(Pixel* dst, const uint8_t* data, uint32_t initialDYUV) noexcept;
340340

341+
#define DYUV_PIXEL_INDEX(Y, U, V) (as<uint32_t>(Y) << 16 | as<uint32_t>(U) << 8 | V)
342+
343+
static std::vector<Pixel> generateDYUVPixelLUT() noexcept
344+
{
345+
std::vector<Pixel> array{};
346+
array.resize(256 * 256 * 256);
347+
348+
for(int y = 0; y < 256; ++y)
349+
for(int u = 0; u < 256; ++u)
350+
for(int v = 0; v < 256; ++v)
351+
{
352+
Pixel pixel{0};
353+
pixel.r = limu8(y + matrixVToR[v]);
354+
pixel.g = limu8(y - (matrixUToG[u] + matrixVToG[v]));
355+
pixel.b = limu8(y + matrixUToB[u]);
356+
array[DYUV_PIXEL_INDEX(y, u, v)] = pixel;
357+
}
358+
return array;
359+
}
360+
361+
/** \brief LUT to convert the YUV values to RGB. Use #DYUV_PIXEL_INDEX macro to index this array. */
362+
static const std::vector<Pixel> dyuvPixelLUT = generateDYUVPixelLUT();
363+
364+
/** \brief Decode a DYUV line to ARGB using a LUT.
365+
* \tparam WIDTH The number of source pixels to decode.
366+
* \param dst Where the ARGB data will be written to.
367+
* \param dyuv The source DYUV data.
368+
* \param initialDYUV The initial value to be used by the DYUV decoder.
369+
* \return The number of raw bytes read from \p dyuv.
370+
*
371+
* This is not a SIMD decoder because each pixel depends on the decoded value of the previous one.
372+
* However, this is another approach that uses a pixel LUT to remove as many calculations as possible.
373+
*/
374+
template<uint16_t WIDTH>
375+
uint16_t decodeDYUVLineLUT(Pixel* dst, const uint8_t* dyuv, uint32_t initialDYUV) noexcept
376+
{
377+
uint8_t py = bits<16, 23>(initialDYUV);
378+
uint8_t pu = bits<8, 15>(initialDYUV);
379+
uint8_t pv = initialDYUV;
380+
381+
for(uint16_t index = 0; index < WIDTH; index += 2)
382+
{
383+
const uint8_t high = dyuv[index];
384+
const uint8_t low = dyuv[index + 1];
385+
386+
// Green book V.4.4.2
387+
uint8_t u2 = bits<4, 7>(high);
388+
uint8_t y1 = bits<0, 3>(high);
389+
uint8_t v2 = bits<4, 7>(low);
390+
uint8_t y2 = bits<0, 3>(low);
391+
392+
y1 = py + dequantizer[y1];
393+
u2 = pu + dequantizer[u2];
394+
v2 = pv + dequantizer[v2];
395+
y2 = y1 + dequantizer[y2];
396+
const uint8_t u1 = (as<uint16_t>(pu) + as<uint16_t>(u2)) >> 1;
397+
const uint8_t v1 = (as<uint16_t>(pv) + as<uint16_t>(v2)) >> 1;
398+
399+
// Store previous.
400+
py = y2;
401+
pu = u2;
402+
pv = v2;
403+
404+
Pixel* pixel1 = dst++;
405+
*pixel1 = dyuvPixelLUT[DYUV_PIXEL_INDEX(y1, u1, v1)]; // Matrix RGB.
406+
if constexpr(WIDTH == 360 || WIDTH == 384)
407+
{
408+
memcpy(dst++, pixel1, sizeof(Pixel));
409+
}
410+
411+
Pixel* pixel2 = dst++;
412+
*pixel2 = dyuvPixelLUT[DYUV_PIXEL_INDEX(y2, u2, v2)]; // Matrix RGB.
413+
if constexpr(WIDTH == 360 || WIDTH == 384)
414+
{
415+
memcpy(dst++, pixel2, sizeof(Pixel));
416+
}
417+
}
418+
419+
return WIDTH;
420+
}
421+
template uint16_t decodeDYUVLineLUT<360>(Pixel* dst, const uint8_t* dyuv, uint32_t initialDYUV) noexcept;
422+
template uint16_t decodeDYUVLineLUT<384>(Pixel* dst, const uint8_t* dyuv, uint32_t initialDYUV) noexcept;
423+
template uint16_t decodeDYUVLineLUT<720>(Pixel* dst, const uint8_t* dyuv, uint32_t initialDYUV) noexcept;
424+
template uint16_t decodeDYUVLineLUT<768>(Pixel* dst, const uint8_t* dyuv, uint32_t initialDYUV) noexcept;
425+
341426
/** \brief Decode a CLUT line to ARGB.
342427
* \tparam WIDTH The number of source pixels to decode.
343428
* \param dst Where the ARGB data will be written to.

src/CDI/Video/VideoDecoders.hpp

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,9 @@ uint16_t decodeRunLengthLine(Pixel* dst, const uint8_t* data, uint16_t width, co
2020
template<uint16_t WIDTH>
2121
uint16_t decodeRGB555Line(Pixel* dst, const uint8_t* dataA, const uint8_t* dataB) noexcept;
2222
template<uint16_t WIDTH>
23-
uint16_t decodeDYUVLine(Pixel* dst, const uint8_t* data, uint32_t initialDYUV) noexcept;
23+
uint16_t decodeDYUVLine(Pixel* dst, const uint8_t* dyuv, uint32_t initialDYUV) noexcept;
24+
template<uint16_t WIDTH>
25+
uint16_t decodeDYUVLineLUT(Pixel* dst, const uint8_t* dyuv, uint32_t initialDYUV) noexcept;
2426
template<uint16_t WIDTH>
2527
uint16_t decodeCLUTLine(Pixel* dst, const uint8_t* data, const uint32_t* CLUTTable, ImageCodingMethod icm) noexcept;
2628

src/CDI/Video/VideoDecodersSIMD.cpp

Lines changed: 0 additions & 110 deletions
Original file line number | Diff line number | Diff line change
@@ -9,44 +9,6 @@
99
#include <execution>
1010
#include <iterator>
1111

12-
#define MAKE_RED_INDEX(Y, V) (as<uint16_t>(Y) << 8 | V)
13-
#define MAKE_GREEN_INDEX(Y, U, V) (as<uint32_t>(Y) << 16 | as<uint32_t>(U) << 8 | V)
14-
#define MAKE_BLUE_INDEX(Y, U) (as<uint16_t>(Y) << 8 | U)
15-
16-
static constexpr std::array<uint8_t, 0x1'0000> generateRedLUT() noexcept
17-
{
18-
std::array<uint8_t, 0x1'0000> array{};
19-
for(int y = 0; y < 256; ++y)
20-
for(int v = 0; v < 256; ++v)
21-
array[MAKE_RED_INDEX(y, v)] = limu8(y + Video::matrixVToR[v]);
22-
return array;
23-
}
24-
25-
static std::vector<uint8_t> generateGreenLUT() noexcept
26-
{
27-
std::vector<uint8_t> array{};
28-
array.resize(256 * 256 * 256);
29-
30-
for(int y = 0; y < 256; ++y)
31-
for(int u = 0; u < 256; ++u)
32-
for(int v = 0; v < 256; ++v)
33-
array[MAKE_GREEN_INDEX(y, u, v)] = limu8(y - (Video::matrixUToG[u] + Video::matrixVToG[v]));
34-
return array;
35-
}
36-
37-
static constexpr std::array<uint8_t, 0x1'0000> generateBlueLUT() noexcept
38-
{
39-
std::array<uint8_t, 0x1'0000> array{};
40-
for(int y = 0; y < 256; ++y)
41-
for(int u = 0; u < 256; ++u)
42-
array[MAKE_BLUE_INDEX(y, u)] = limu8(y + Video::matrixUToB[u]);
43-
return array;
44-
}
45-
46-
static constexpr std::array<uint8_t, 0x1'0000> redLUT = generateRedLUT();
47-
static const std::vector<uint8_t> greenLUT = generateGreenLUT();
48-
static constexpr std::array<uint8_t, 0x1'0000> blueLUT = generateBlueLUT();
49-
5012
namespace Video
5113
{
5214

@@ -251,76 +213,4 @@ template uint16_t decodeDYUVLineSIMD<384>(Pixel* dst, const uint8_t* dyuv, uint3
251213
template uint16_t decodeDYUVLineSIMD<720>(Pixel* dst, const uint8_t* dyuv, uint32_t initialDYUV) noexcept;
252214
template uint16_t decodeDYUVLineSIMD<768>(Pixel* dst, const uint8_t* dyuv, uint32_t initialDYUV) noexcept;
253215

254-
/** \brief Matrixes the YUV values to RGB.
255-
* The LUT for green is not constexpr because it is a 16MB array which we can't reasonably generate at compile-time.
256-
*/
257-
static constexpr void matrixRGB(Pixel* pixel, const int Y, const uint8_t U, const uint8_t V) noexcept
258-
{
259-
pixel->r = redLUT[MAKE_RED_INDEX(Y, V)];
260-
pixel->g = greenLUT[MAKE_GREEN_INDEX(Y, U, V)];
261-
pixel->b = blueLUT[MAKE_BLUE_INDEX(Y, U)];
262-
}
263-
264-
/** \brief Decode a DYUV line to ARGB using a LUT.
265-
* \tparam WIDTH The number of source pixels to decode.
266-
* \param dst Where the ARGB data will be written to.
267-
* \param dyuv The source DYUV data.
268-
* \param initialDYUV The initial value to be used by the DYUV decoder.
269-
* \return The number of raw bytes read from \p dyuv.
270-
*
271-
* This is not a SIMD decoder as it is impossible because each pixel depends on the decoded value of the previous one.
272-
* However this is another approach that heavily uses LUTs to remove as much calculations as possible.
273-
*/
274-
template<uint16_t WIDTH>
275-
uint16_t decodeDYUVLineLUT(Pixel* dst, const uint8_t* dyuv, uint32_t initialDYUV) noexcept
276-
{
277-
uint8_t py = bits<16, 23>(initialDYUV);
278-
uint8_t pu = bits<8, 15>(initialDYUV);
279-
uint8_t pv = initialDYUV;
280-
281-
for(uint16_t index = 0; index < WIDTH; index += 2)
282-
{
283-
const uint8_t high = dyuv[index];
284-
const uint8_t low = dyuv[index + 1];
285-
286-
// Green book V.4.4.2
287-
uint8_t u2 = bits<4, 7>(high);
288-
uint8_t y1 = bits<0, 3>(high);
289-
uint8_t v2 = bits<4, 7>(low);
290-
uint8_t y2 = bits<0, 3>(low);
291-
292-
y1 = py + dequantizer[y1];
293-
u2 = pu + dequantizer[u2];
294-
v2 = pv + dequantizer[v2];
295-
y2 = y1 + dequantizer[y2];
296-
const uint8_t u1 = (as<uint16_t>(pu) + as<uint16_t>(u2)) >> 1;
297-
const uint8_t v1 = (as<uint16_t>(pv) + as<uint16_t>(v2)) >> 1;
298-
299-
// Store previous.
300-
py = y2;
301-
pu = u2;
302-
pv = v2;
303-
304-
Pixel* pixel1 = dst++;
305-
matrixRGB(pixel1, y1, u1, v1);
306-
if constexpr(WIDTH == 360 || WIDTH == 384)
307-
{
308-
memcpy(dst++, pixel1, sizeof(Pixel));
309-
}
310-
311-
Pixel* pixel2 = dst++;
312-
matrixRGB(pixel2, y2, u2, v2);
313-
if constexpr(WIDTH == 360 || WIDTH == 384)
314-
{
315-
memcpy(dst++, pixel2, sizeof(Pixel));
316-
}
317-
}
318-
319-
return WIDTH;
320-
}
321-
template uint16_t decodeDYUVLineLUT<360>(Pixel* dst, const uint8_t* dyuv, uint32_t initialDYUV) noexcept;
322-
template uint16_t decodeDYUVLineLUT<384>(Pixel* dst, const uint8_t* dyuv, uint32_t initialDYUV) noexcept;
323-
template uint16_t decodeDYUVLineLUT<720>(Pixel* dst, const uint8_t* dyuv, uint32_t initialDYUV) noexcept;
324-
template uint16_t decodeDYUVLineLUT<768>(Pixel* dst, const uint8_t* dyuv, uint32_t initialDYUV) noexcept;
325-
326216
} // namespace Video

src/CDI/Video/VideoDecodersSIMD.hpp

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -20,9 +20,6 @@ uint16_t decodeRGB555LineSIMD(Pixel* dst, const uint8_t* dataA, const uint8_t* d
2020
template<uint16_t WIDTH>
2121
uint16_t decodeDYUVLineSIMD(Pixel* dst, const uint8_t* dyuv, uint32_t initialDYUV) noexcept;
2222

23-
template<uint16_t WIDTH>
24-
uint16_t decodeDYUVLineLUT(Pixel* dst, const uint8_t* dyuv, uint32_t initialDYUV) noexcept;
25-
2623
} // namespace Video
2724

2825
#endif // CDI_COMMON_VIDEOSIMD_HPP

tests/testVideoDecoders.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -463,10 +463,10 @@ TEST_CASE("DYUV", "[Video]")
463463
Video::decodeDYUVLine<384>(DST.data(), SRC.data(), 0x0010'8080);
464464
REQUIRE(std::equal(DST.cbegin(), DST.cend(), EXPECTED.cbegin()));
465465

466-
#if LIBCEDIMU_ENABLE_RENDERERSIMD
467466
Video::decodeDYUVLineLUT<384>(DST.data(), SRC.data(), 0x0010'8080);
468467
REQUIRE(std::equal(DST.cbegin(), DST.cend(), EXPECTED.cbegin()));
469468

469+
#if LIBCEDIMU_ENABLE_RENDERERSIMD
470470
Video::decodeDYUVLineSIMD<384>(DST.data(), SRC.data(), 0x0010'8080);
471471
REQUIRE(std::equal(DST.cbegin(), DST.cend(), EXPECTED.cbegin()));
472472
#endif
@@ -500,10 +500,10 @@ TEST_CASE("DYUV", "[Video]")
500500
// Video::decodeDYUVLine<768>(DST.data(), DYUV_SRC.data(), 0x0010'8080);
501501
// REQUIRE(std::equal(DST.cbegin(), DST.cend(), DYUV_EXPECTED.cbegin()));
502502

503-
// #if LIBCEDIMU_ENABLE_RENDERERSIMD
504503
// Video::decodeDYUVLineLUT<768>(DST.data(), DYUV_SRC.data(), 0x0010'8080);
505504
// REQUIRE(std::equal(DST.cbegin(), DST.cend(), DYUV_EXPECTED.cbegin()));
506505

506+
// #if LIBCEDIMU_ENABLE_RENDERERSIMD
507507
// Video::decodeDYUVLineSIMD<768>(DST.data(), DYUV_SRC.data(), 0x0010'8080);
508508
// REQUIRE(std::equal(DST.cbegin(), DST.cend(), DYUV_EXPECTED.cbegin()));
509509
// #endif

0 commit comments

Comments
 (0)