|
26 | 26 | #include "lv_video_blit.hpp" |
27 | 27 | #include "lv_video_private.hpp" |
28 | 28 | #include "lv_common.h" |
| 29 | +#include <x86intrin.h> |
29 | 30 |
|
30 | 31 | namespace LV { |
31 | 32 |
|
32 | | - void VideoBlit::blit_overlay_alphasrc_mmx (Video* dest, Video* src) |
| 33 | + void VideoBlit::blit_overlay_alphasrc_mmx (Video* dst, Video* src) |
33 | 34 | { |
34 | 35 | #if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64) |
35 | | - auto destbuf = static_cast<uint8_t*> (dest->get_pixels ()); |
36 | | - auto srcbuf = static_cast<uint8_t const*> (src->get_pixels ()); |
37 | | - |
38 | | - for (int i = 0; i < src->m_impl->height; i++) { |
39 | | - for (int j = 0; j < src->m_impl->width; j++) { |
40 | | - __asm __volatile |
41 | | - ("\n\t movd %[spix], %%mm0" |
42 | | - "\n\t movd %[dpix], %%mm1" |
43 | | - "\n\t movq %%mm0, %%mm2" |
44 | | - "\n\t movq %%mm0, %%mm3" |
45 | | - "\n\t psrlq $24, %%mm2" /* The alpha */ |
46 | | - "\n\t movq %%mm0, %%mm4" |
47 | | - "\n\t psrld $24, %%mm3" |
48 | | - "\n\t psrld $24, %%mm4" |
49 | | - "\n\t psllq $32, %%mm2" |
50 | | - "\n\t psllq $16, %%mm3" |
51 | | - "\n\t por %%mm4, %%mm2" |
52 | | - "\n\t punpcklbw %%mm6, %%mm0" /* interleaving dest */ |
53 | | - "\n\t por %%mm3, %%mm2" |
54 | | - "\n\t punpcklbw %%mm6, %%mm1" /* interleaving source */ |
55 | | - "\n\t psubsw %%mm1, %%mm0" /* (src - dest) part */ |
56 | | - "\n\t pmullw %%mm2, %%mm0" /* alpha * (src - dest) */ |
57 | | - "\n\t psrlw $8, %%mm0" /* / 256 */ |
58 | | - "\n\t paddb %%mm1, %%mm0" /* + dest */ |
59 | | - "\n\t packuswb %%mm0, %%mm0" |
60 | | - "\n\t movd %%mm0, %[dest]" |
61 | | - : [dest] "=m" (*destbuf) |
62 | | - : [dpix] "m" (*destbuf) |
63 | | - , [spix] "m" (*srcbuf)); |
64 | | - |
65 | | - destbuf += 4; |
66 | | - srcbuf += 4; |
| 36 | + auto dst_pixel_row_ptr = static_cast<uint8_t*> (dst->get_pixels ()); |
| 37 | + auto src_pixel_row_ptr = static_cast<uint8_t const*> (src->get_pixels ()); |
| 38 | +
| 39 | + for (int y = 0; y < src->m_impl->height; y++) { |
| 40 | + auto dst_pixel = reinterpret_cast<uint32_t*> (dst_pixel_row_ptr); |
| 41 | + auto src_pixel = reinterpret_cast<uint32_t const*> (src_pixel_row_ptr); |
| 42 | +
| 43 | + for (int x = 0; x < src->m_impl->width; x++) { |
| 44 | + // We work with 32-bit pixel values packed as 4 x 16-bit ints in MMX registers. |
| 45 | + // See the pure C implementation in blit_overlay_alphasrc() for the calculation involved. |
| 46 | +
| 47 | + // Load source alpha as a 16-bit int. |
| 48 | + uint16_t const src_alpha = reinterpret_cast<uint8_t const*> (src_pixel)[3]; |
| 49 | +
| 50 | + // Load source and target pixel values into MMX registers, each channel zero-extended into 16 bits. NOTE(review): the locals `src` and `dst` below shadow the function parameters of the same names inside this loop body. |
| 51 | + auto src = _mm_cvtsi32_si64 (*src_pixel); |
| 52 | + auto dst = _mm_cvtsi32_si64 (*dst_pixel); |
| 53 | + src = _mm_unpacklo_pi8 (src, _mm_setzero_si64 ()); |
| 54 | + dst = _mm_unpacklo_pi8 (dst, _mm_setzero_si64 ()); |
| 55 | +
| 56 | + // Load src_alpha and (255 - src_alpha) and broadcast them into a1 and a2. |
| 57 | + auto a1 = _mm_set1_pi16 (src_alpha); |
| 58 | + auto a2 = _mm_set1_pi16 (static_cast<uint16_t> (255) - src_alpha); |
| 59 | +
| 60 | + // Interpolate between source and target: per 16-bit channel, result = (src * a1 + dst * a2) >> 8. |
| 61 | + auto result = _mm_add_pi16 (_mm_mullo_pi16 (src, a1), _mm_mullo_pi16 (dst, a2)); |
| 62 | + result = _mm_srli_pi16 (result, 8); |
| 63 | +
| 64 | + // Pack result back into 8-bit channels but keep the target pixel alpha. |
| 65 | + // Open question: is there a nicer way to preserve the target alpha than masking afterwards? |
| 66 | + uint32_t int_result = _mm_cvtsi64_si32 (_mm_packs_pu16 (result, result)); |
| 67 | + int_result = (int_result & 0x00'ff'ff'ff) | (*dst_pixel & 0xff'00'00'00); |
| 68 | +
| 69 | + *dst_pixel = int_result; |
| 70 | +
| 71 | + dst_pixel++; |
| 72 | + src_pixel++; |
67 | 73 | } |
68 | 74 |
|
69 | | - destbuf += dest->m_impl->pitch - (dest->m_impl->width * dest->m_impl->bpp); |
70 | | - srcbuf += src->m_impl->pitch - (src->m_impl->width * src->m_impl->bpp); |
| 75 | + dst_pixel_row_ptr += dst->m_impl->pitch; |
| 76 | + src_pixel_row_ptr += src->m_impl->pitch; |
71 | 77 | } |
| 78 | +
| 79 | + // FIXME: Some sources said this is not needed for x64 as MMX registers are no longer |
| 80 | + // overlaid on FP ones. |
| 81 | + _mm_empty (); |
| 82 | +
72 | 83 | #endif /* VISUAL_ARCH_X86 || VISUAL_ARCH_X86_64 */ |
73 | 84 | } |
74 | 85 |
|
|
0 commit comments