Skip to content

Commit 3a7d770

Browse files
committed
Core (LV::Video): Rewrite MMX alpha blending of 32-bit videos using intrinsics (#230).
1 parent e5f954a commit 3a7d770

File tree

1 file changed

+46
-35
lines changed

1 file changed

+46
-35
lines changed

libvisual/libvisual/private/lv_video_blit_simd.cpp

Lines changed: 46 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -26,49 +26,60 @@
2626
#include "lv_video_blit.hpp"
2727
#include "lv_video_private.hpp"
2828
#include "lv_common.h"
29+
#include <x86intrin.h>
2930

3031
namespace LV {
3132

32-
void VideoBlit::blit_overlay_alphasrc_mmx (Video* dest, Video* src)
33+
void VideoBlit::blit_overlay_alphasrc_mmx (Video* dst, Video* src)
3334
{
3435
#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
35-
auto destbuf = static_cast<uint8_t*> (dest->get_pixels ());
36-
auto srcbuf = static_cast<uint8_t const*> (src->get_pixels ());
37-
38-
for (int i = 0; i < src->m_impl->height; i++) {
39-
for (int j = 0; j < src->m_impl->width; j++) {
40-
__asm __volatile
41-
("\n\t movd %[spix], %%mm0"
42-
"\n\t movd %[dpix], %%mm1"
43-
"\n\t movq %%mm0, %%mm2"
44-
"\n\t movq %%mm0, %%mm3"
45-
"\n\t psrlq $24, %%mm2" /* The alpha */
46-
"\n\t movq %%mm0, %%mm4"
47-
"\n\t psrld $24, %%mm3"
48-
"\n\t psrld $24, %%mm4"
49-
"\n\t psllq $32, %%mm2"
50-
"\n\t psllq $16, %%mm3"
51-
"\n\t por %%mm4, %%mm2"
52-
"\n\t punpcklbw %%mm6, %%mm0" /* interleaving source */
53-
"\n\t por %%mm3, %%mm2"
54-
"\n\t punpcklbw %%mm6, %%mm1" /* interleaving dest */
55-
"\n\t psubsw %%mm1, %%mm0" /* (src - dest) part */
56-
"\n\t pmullw %%mm2, %%mm0" /* alpha * (src - dest) */
57-
"\n\t psrlw $8, %%mm0" /* / 256 */
58-
"\n\t paddb %%mm1, %%mm0" /* + dest */
59-
"\n\t packuswb %%mm0, %%mm0"
60-
"\n\t movd %%mm0, %[dest]"
61-
: [dest] "=m" (*destbuf)
62-
: [dpix] "m" (*destbuf)
63-
, [spix] "m" (*srcbuf));
64-
65-
destbuf += 4;
66-
srcbuf += 4;
36+
auto dst_pixel_row_ptr = static_cast<uint8_t*> (dst->get_pixels ());
37+
auto src_pixel_row_ptr = static_cast<uint8_t const*> (src->get_pixels ());
38+
39+
for (int y = 0; y < src->m_impl->height; y++) {
40+
auto dst_pixel = reinterpret_cast<uint32_t*> (dst_pixel_row_ptr);
41+
auto src_pixel = reinterpret_cast<uint32_t const*> (src_pixel_row_ptr);
42+
43+
for (int x = 0; x < src->m_impl->width; x++) {
44+
// We work with 32-bit pixel values packed as 4 x 16-bit ints in MMX registers.
45+
// See the pure C implementation in blit_overlay_alphasrc() for the calculation involved.
46+
47+
// Load source alpha as a 16-bit int.
48+
uint16_t const src_alpha = reinterpret_cast<uint8_t const*> (src_pixel)[3];
49+
50+
// Load source and target pixel values into MMX registers, each channel zero-extended into 16 bits.
51+
auto src = _mm_cvtsi32_si64 (*src_pixel);
52+
auto dst = _mm_cvtsi32_si64 (*dst_pixel);
53+
src = _mm_unpacklo_pi8 (src, _mm_setzero_si64 ());
54+
dst = _mm_unpacklo_pi8 (dst, _mm_setzero_si64 ());
55+
56+
// Load src_alpha and (255 - src_alpha) and broadcast them into a1 and a2.
57+
auto a1 = _mm_set1_pi16 (src_alpha);
58+
auto a2 = _mm_set1_pi16 (static_cast<uint16_t> (255) - src_alpha);
59+
60+
// Interpolate between source and target.
61+
auto result = _mm_add_pi16 (_mm_mullo_pi16 (src, a1), _mm_mullo_pi16 (dst, a2));
62+
result = _mm_srli_pi16 (result, 8);
63+
64+
// Unpack result but keep the target pixel alpha.
65+
// Is there a nicer way to do this?
66+
uint32_t int_result = _mm_cvtsi64_si32 (_mm_packs_pu16 (result, result));
67+
int_result = (int_result & 0x00'ff'ff'ff) | (*dst_pixel & 0xff'00'00'00);
68+
69+
*dst_pixel = int_result;
70+
71+
dst_pixel++;
72+
src_pixel++;
6773
}
6874

69-
destbuf += dest->m_impl->pitch - (dest->m_impl->width * dest->m_impl->bpp);
70-
srcbuf += src->m_impl->pitch - (src->m_impl->width * src->m_impl->bpp);
75+
dst_pixel_row_ptr += dst->m_impl->pitch;
76+
src_pixel_row_ptr += src->m_impl->pitch;
7177
}
78+
79+
// FIXME: Some sources said this is not needed for x64 as MMX registers are no longer
80+
// overlaid on FP ones.
81+
_mm_empty ();
82+
7283
#endif /* !VISUAL_ARCH_X86 */
7384
}
7485

0 commit comments

Comments
 (0)