Skip to content

Commit c3e5d7b

Browse files
authored
Merge pull request #244 from Libvisual/alpha-blend-fixes
Core (LV::Video) Fix alpha blending of 32-bit videos (#230)
2 parents 3096732 + 0e95069 commit c3e5d7b

File tree

8 files changed

+187
-49
lines changed

8 files changed

+187
-49
lines changed

libvisual/cmake/LVBuildTest.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ FUNCTION(LV_BUILD_TEST TEST_NAME)
3737

3838
TARGET_LINK_LIBRARIES(${TEST_NAME}
3939
PRIVATE
40+
test_common
4041
Libvisual::Libvisual
4142
Threads::Threads
4243
${PARSED_ARGS_LINK_LIBS}

libvisual/libvisual/private/lv_video_blit.cpp

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -61,30 +61,36 @@ namespace LV {
6161
}
6262
}
6363

64-
void VideoBlit::blit_overlay_alphasrc (Video* dest, Video* src)
64+
void VideoBlit::blit_overlay_alphasrc (Video* dst, Video* src)
6565
{
66-
auto destbuf = static_cast<uint8_t*> (dest->get_pixels ());
67-
auto srcbuf = static_cast<uint8_t const*> (src->get_pixels ());
68-
6966
if (visual_cpu_has_mmx ()) {
70-
blit_overlay_alphasrc_mmx (dest, src);
67+
blit_overlay_alphasrc_mmx (dst, src);
7168
return;
7269
}
7370

71+
auto dst_pixel_row_ptr = static_cast<uint8_t*> (dst->get_pixels ());
72+
auto src_pixel_row_ptr = static_cast<uint8_t const*> (src->get_pixels ());
73+
7474
for (int y = 0; y < src->m_impl->height; y++) {
75+
auto dst_pixel = dst_pixel_row_ptr;
76+
auto src_pixel = src_pixel_row_ptr;
77+
7578
for (int x = 0; x < src->m_impl->width; x++) {
76-
uint8_t alpha = srcbuf[3];
79+
uint8_t const src_alpha = src_pixel[3];
7780

78-
destbuf[0] = (alpha * (srcbuf[0] - destbuf[0]) >> 8) + destbuf[0];
79-
destbuf[1] = (alpha * (srcbuf[1] - destbuf[1]) >> 8) + destbuf[1];
80-
destbuf[2] = (alpha * (srcbuf[2] - destbuf[2]) >> 8) + destbuf[2];
81+
// NOTE: This is effectively
82+
// "(src_alpha / 255) * src_pixel[i] + (1 - src_alpha / 255) * dst_pixel[i]"
83+
// but computed with a single multiplication and a single division by 256 (a right shift) rather than by 255, for speed.
84+
dst_pixel[0] = (src_alpha * (src_pixel[0] - dst_pixel[0]) >> 8) + dst_pixel[0];
85+
dst_pixel[1] = (src_alpha * (src_pixel[1] - dst_pixel[1]) >> 8) + dst_pixel[1];
86+
dst_pixel[2] = (src_alpha * (src_pixel[2] - dst_pixel[2]) >> 8) + dst_pixel[2];
8187

82-
destbuf += dest->m_impl->bpp;
83-
srcbuf += src->m_impl->bpp;
88+
src_pixel += 4;
89+
dst_pixel += 4;
8490
}
8591

86-
destbuf += dest->m_impl->pitch - (dest->m_impl->width * dest->m_impl->bpp);
87-
srcbuf += src->m_impl->pitch - (src->m_impl->width * src->m_impl->bpp);
92+
dst_pixel_row_ptr += dst->m_impl->pitch;
93+
src_pixel_row_ptr += src->m_impl->pitch;
8894
}
8995
}
9096

libvisual/libvisual/private/lv_video_blit_simd.cpp

Lines changed: 46 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -26,49 +26,60 @@
2626
#include "lv_video_blit.hpp"
2727
#include "lv_video_private.hpp"
2828
#include "lv_common.h"
29+
#include <x86intrin.h>
2930

3031
namespace LV {
3132

32-
void VideoBlit::blit_overlay_alphasrc_mmx (Video* dest, Video* src)
33+
void VideoBlit::blit_overlay_alphasrc_mmx (Video* dst, Video* src)
3334
{
3435
#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
35-
auto destbuf = static_cast<uint8_t*> (dest->get_pixels ());
36-
auto srcbuf = static_cast<uint8_t const*> (src->get_pixels ());
37-
38-
for (int i = 0; i < src->m_impl->height; i++) {
39-
for (int j = 0; j < src->m_impl->width; j++) {
40-
__asm __volatile
41-
("\n\t movd %[spix], %%mm0"
42-
"\n\t movd %[dpix], %%mm1"
43-
"\n\t movq %%mm0, %%mm2"
44-
"\n\t movq %%mm0, %%mm3"
45-
"\n\t psrlq $24, %%mm2" /* The alpha */
46-
"\n\t movq %%mm0, %%mm4"
47-
"\n\t psrld $24, %%mm3"
48-
"\n\t psrld $24, %%mm4"
49-
"\n\t psllq $32, %%mm2"
50-
"\n\t psllq $16, %%mm3"
51-
"\n\t por %%mm4, %%mm2"
52-
"\n\t punpcklbw %%mm6, %%mm0" /* interleaving dest */
53-
"\n\t por %%mm3, %%mm2"
54-
"\n\t punpcklbw %%mm6, %%mm1" /* interleaving source */
55-
"\n\t psubsw %%mm1, %%mm0" /* (src - dest) part */
56-
"\n\t pmullw %%mm2, %%mm0" /* alpha * (src - dest) */
57-
"\n\t psrlw $8, %%mm0" /* / 256 */
58-
"\n\t paddb %%mm1, %%mm0" /* + dest */
59-
"\n\t packuswb %%mm0, %%mm0"
60-
"\n\t movd %%mm0, %[dest]"
61-
: [dest] "=m" (*destbuf)
62-
: [dpix] "m" (*destbuf)
63-
, [spix] "m" (*srcbuf));
64-
65-
destbuf += 4;
66-
srcbuf += 4;
36+
auto dst_pixel_row_ptr = static_cast<uint8_t*> (dst->get_pixels ());
37+
auto src_pixel_row_ptr = static_cast<uint8_t const*> (src->get_pixels ());
38+
39+
for (int y = 0; y < src->m_impl->height; y++) {
40+
auto dst_pixel = reinterpret_cast<uint32_t*> (dst_pixel_row_ptr);
41+
auto src_pixel = reinterpret_cast<uint32_t const*> (src_pixel_row_ptr);
42+
43+
for (int x = 0; x < src->m_impl->width; x++) {
44+
// We work with 32-bit pixel values packed as 4 x 16-bit ints in MMX registers.
45+
// See the pure C++ implementation in blit_overlay_alphasrc() for the calculation involved.
46+
47+
// Load source alpha as a 16-bit int.
48+
uint16_t const src_alpha = reinterpret_cast<uint8_t const*> (src_pixel)[3];
49+
50+
// Load source and target pixel values into MMX registers, each channel zero-extended into 16 bits.
51+
auto src = _mm_cvtsi32_si64 (*src_pixel);
52+
auto dst = _mm_cvtsi32_si64 (*dst_pixel);
53+
src = _mm_unpacklo_pi8 (src, _mm_setzero_si64 ());
54+
dst = _mm_unpacklo_pi8 (dst, _mm_setzero_si64 ());
55+
56+
// Load src_alpha and (255 - src_alpha) and broadcast them into a1 and a2.
57+
auto a1 = _mm_set1_pi16 (src_alpha);
58+
auto a2 = _mm_set1_pi16 (static_cast<uint16_t> (255) - src_alpha);
59+
60+
// Interpolate between source and target.
61+
auto result = _mm_add_pi16 (_mm_mullo_pi16 (src, a1), _mm_mullo_pi16 (dst, a2));
62+
result = _mm_srli_pi16 (result, 8);
63+
64+
// Unpack result but keep the target pixel alpha.
65+
// Is there a nicer way to do this?
66+
uint32_t int_result = _mm_cvtsi64_si32 (_mm_packs_pu16 (result, result));
67+
int_result = (int_result & 0x00'ff'ff'ff) | (*dst_pixel & 0xff'00'00'00);
68+
69+
*dst_pixel = int_result;
70+
71+
dst_pixel++;
72+
src_pixel++;
6773
}
6874

69-
destbuf += dest->m_impl->pitch - (dest->m_impl->width * dest->m_impl->bpp);
70-
srcbuf += src->m_impl->pitch - (src->m_impl->width * src->m_impl->bpp);
75+
dst_pixel_row_ptr += dst->m_impl->pitch;
76+
src_pixel_row_ptr += src->m_impl->pitch;
7177
}
78+
79+
// FIXME: Some sources said this is not needed for x64 as MMX registers are no longer
80+
// overlaid on FP ones.
81+
_mm_empty ();
82+
7283
#endif /* VISUAL_ARCH_X86 || VISUAL_ARCH_X86_64 */
7384
}
7485

libvisual/tests/CMakeLists.txt

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
11
INCLUDE(LVBuildTest)
22

3-
INCLUDE_DIRECTORIES(
3+
ADD_LIBRARY(test_common STATIC
4+
random.cpp
5+
)
6+
7+
TARGET_INCLUDE_DIRECTORIES(test_common PUBLIC
48
${CMAKE_CURRENT_SOURCE_DIR}
59
)
610

11+
TARGET_LINK_LIBRARIES(test_common PUBLIC libvisual)
12+
713
ADD_SUBDIRECTORY(audio_test)
814
ADD_SUBDIRECTORY(mem_test)
915
ADD_SUBDIRECTORY(video_test)

libvisual/tests/random.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#include "random.hpp"
2+
#include <random>
3+
4+
namespace LV::Tests
5+
{
6+
LV::VideoPtr create_random_video (int width, int height, VisVideoDepth depth)
7+
{
8+
std::random_device device {};
9+
std::uniform_int_distribution<uint8_t> distrib {0, 255};
10+
11+
auto video {LV::Video::create (width, height, depth)};
12+
13+
auto bytes_per_pixel = video->get_bpp ();
14+
auto pitch = video->get_pitch ();
15+
16+
auto content_bytes_per_row = bytes_per_pixel * video->get_width ();
17+
18+
auto pixel_row_ptr = static_cast<uint8_t *>(video->get_pixels ());
19+
20+
for (int y = 0; y < video->get_height (); y++) {
21+
auto pixel = pixel_row_ptr;
22+
for (int c = 0; c < content_bytes_per_row; c++) {
23+
*pixel = distrib (device);
24+
pixel++;
25+
}
26+
27+
pixel_row_ptr += pitch;
28+
}
29+
30+
return video;
31+
}
32+
} // LV::Tests namespace

libvisual/tests/random.hpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#ifndef _LV_TESTS_VIDEO_RANDOM_HPP
2+
#define _LV_TESTS_VIDEO_RANDOM_HPP
3+
4+
#include <libvisual/libvisual.h>
5+
6+
namespace LV::Tests
7+
{
8+
LV::VideoPtr create_random_video (int width, int height, VisVideoDepth depth);
9+
10+
} // LV::Tests namespace
11+
12+
#endif // defined(_LV_TESTS_VIDEO_RANDOM_HPP)

libvisual/tests/video_test/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@ LV_BUILD_TEST(video_check_test
22
SOURCES video_check_test.cpp
33
)
44

5+
LV_BUILD_TEST(video_blit_test
6+
SOURCES video_blit_test.cpp
7+
)
8+
59
IF(HAVE_SDL)
610
LV_BUILD_TEST(video_scale_test
711
SOURCES video_scale_test.cpp
libvisual/tests/video_test/video_blit_test.cpp

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
#include "test.h"
2+
#include "random.hpp"
3+
#include <libvisual/libvisual.h>
4+
#include <cassert>
5+
6+
namespace
7+
{
8+
LV::VideoPtr clone_video (LV::VideoPtr const& source)
9+
{
10+
auto clone {LV::Video::create (source->get_width (), source->get_height (), source->get_depth ())};
11+
12+
assert (clone->get_pitch () == source->get_pitch ());
13+
std::size_t buffer_size = static_cast<std::size_t> (clone->get_pitch () * clone->get_width ());
14+
15+
visual_mem_copy (clone->get_pixels (), source->get_pixels (), buffer_size);
16+
17+
return clone;
18+
}
19+
20+
void test_blit_overlay_alphasrc ()
21+
{
22+
// Check that blit_overlay_alphasrc results are within +/- 1 of exact computation for each colour channel. The
23+
// errors largely arise from the use of 256 instead of 255 as divisor for performance reasons.
24+
25+
int const test_width = 31;
26+
int const test_height = 31;
27+
28+
auto source = LV::Tests::create_random_video (test_width, test_height, VISUAL_VIDEO_DEPTH_32BIT);
29+
source->set_compose_type (VISUAL_VIDEO_COMPOSE_TYPE_SRC);
30+
31+
auto target = LV::Tests::create_random_video (test_width, test_height, VISUAL_VIDEO_DEPTH_32BIT);
32+
33+
auto actual {clone_video (target)};
34+
actual->blit (source, 0, 0, true);
35+
36+
for (int y = 0; y < test_height; y++) {
37+
auto source_pixel = static_cast<uint8_t const*> (source->get_pixel_ptr (0, y));
38+
auto target_pixel = static_cast<uint8_t const*> (target->get_pixel_ptr (0, y));
39+
auto actual_pixel = static_cast<uint8_t const*> (actual->get_pixel_ptr (0, y));
40+
41+
for (int x = 0; x < test_width; x++) {
42+
LV_TEST_ASSERT (actual_pixel[3] == target_pixel[3]);
43+
44+
float source_alpha = static_cast<float> (source_pixel[3]) / 255.0f;
45+
uint8_t b = source_alpha * source_pixel[0] + (1.0f - source_alpha) * target_pixel[0];
46+
uint8_t g = source_alpha * source_pixel[1] + (1.0f - source_alpha) * target_pixel[1];
47+
uint8_t r = source_alpha * source_pixel[2] + (1.0f - source_alpha) * target_pixel[2];
48+
49+
LV_TEST_ASSERT (std::abs (static_cast<int16_t> (actual_pixel[0]) - static_cast<int16_t> (b)) <= 1);
50+
LV_TEST_ASSERT (std::abs (static_cast<int16_t> (actual_pixel[1]) - static_cast<int16_t> (g)) <= 1);
51+
LV_TEST_ASSERT (std::abs (static_cast<int16_t> (actual_pixel[2]) - static_cast<int16_t> (r)) <= 1);
52+
53+
source_pixel += 4;
54+
target_pixel += 4;
55+
actual_pixel += 4;
56+
}
57+
}
58+
}
59+
} // anonymous namespace
60+
61+
int main(int argc, char *argv[])
62+
{
63+
LV::System::init (argc, argv);
64+
test_blit_overlay_alphasrc ();
65+
LV::System::destroy ();
66+
}

0 commit comments

Comments
 (0)