Core (LV::Video): Rewrite MMX alpha blending of 32-bit videos using intrinsics (#230).

kaixiong · kaixiong · commit 3a7d77026acc · 2024-12-26T05:20:55.000+08:00
diff --git a/libvisual/libvisual/private/lv_video_blit_simd.cpp b/libvisual/libvisual/private/lv_video_blit_simd.cpp
@@ -26,49 +26,60 @@
 #include "lv_video_blit.hpp"
 #include "lv_video_private.hpp"
 #include "lv_common.h"
+#include <x86intrin.h>
 
 namespace LV {
 
-  void VideoBlit::blit_overlay_alphasrc_mmx (Video* dest, Video* src)
+  void VideoBlit::blit_overlay_alphasrc_mmx (Video* dst, Video* src)
   {
 #if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
-      auto destbuf = static_cast<uint8_t*> (dest->get_pixels ());
-      auto srcbuf = static_cast<uint8_t const*> (src->get_pixels ());
-
-      for (int i = 0; i < src->m_impl->height; i++) {
-          for (int j = 0; j < src->m_impl->width; j++) {
-              __asm __volatile
-                  ("\n\t movd %[spix], %%mm0"
-                   "\n\t movd %[dpix], %%mm1"
-                   "\n\t movq %%mm0, %%mm2"
-                   "\n\t movq %%mm0, %%mm3"
-                   "\n\t psrlq $24, %%mm2"  /* The alpha */
-                   "\n\t movq %%mm0, %%mm4"
-                   "\n\t psrld $24, %%mm3"
-                   "\n\t psrld $24, %%mm4"
-                   "\n\t psllq $32, %%mm2"
-                   "\n\t psllq $16, %%mm3"
-                   "\n\t por %%mm4, %%mm2"
-                   "\n\t punpcklbw %%mm6, %%mm0"    /* interleaving dest */
-                   "\n\t por %%mm3, %%mm2"
-                   "\n\t punpcklbw %%mm6, %%mm1"    /* interleaving source */
-                   "\n\t psubsw %%mm1, %%mm0"   /* (src - dest) part */
-                   "\n\t pmullw %%mm2, %%mm0"   /* alpha * (src - dest) */
-                   "\n\t psrlw $8, %%mm0"       /* / 256 */
-                   "\n\t paddb %%mm1, %%mm0"    /* + dest */
-                   "\n\t packuswb %%mm0, %%mm0"
-                   "\n\t movd %%mm0, %[dest]"
-                   : [dest] "=m" (*destbuf)
-                   : [dpix] "m" (*destbuf)
-                   , [spix] "m" (*srcbuf));
-
-              destbuf += 4;
-              srcbuf += 4;
+      auto dst_pixel_row_ptr = static_cast<uint8_t*> (dst->get_pixels ());
+      auto src_pixel_row_ptr = static_cast<uint8_t const*> (src->get_pixels ());
+
+      for (int y = 0; y < src->m_impl->height; y++) {
+          auto dst_pixel = reinterpret_cast<uint32_t*> (dst_pixel_row_ptr);
+          auto src_pixel = reinterpret_cast<uint32_t const*> (src_pixel_row_ptr);
+
+          for (int x = 0; x < src->m_impl->width; x++) {
+              // We work with 32-bit pixel values packed as 4 x 16-bit ints in MMX registers.
+              // See the pure C implementation in blit_overlay_alphasrc() for the calculation involved.
+
+              // Load source alpha as a 16-bit int.
+              uint16_t const src_alpha = reinterpret_cast<uint8_t const*> (src_pixel)[3];
+
+              // Load source and target pixel values into MMX registers, each channel zero-extended into 16 bits.
+              auto src = _mm_cvtsi32_si64 (*src_pixel);
+              auto dst = _mm_cvtsi32_si64 (*dst_pixel);
+              src = _mm_unpacklo_pi8 (src, _mm_setzero_si64 ());
+              dst = _mm_unpacklo_pi8 (dst, _mm_setzero_si64 ());
+
+              // Load src_alpha and (255 - src_alpha) and broadcast them into a1 and a2.
+              auto a1 = _mm_set1_pi16 (src_alpha);
+              auto a2 = _mm_set1_pi16 (static_cast<uint16_t> (255) - src_alpha);
+
+              // Interpolate between source and target.
+              auto result = _mm_add_pi16 (_mm_mullo_pi16 (src, a1), _mm_mullo_pi16 (dst, a2));
+              result = _mm_srli_pi16 (result, 8);
+
+              // Unpack result but keep the target pixel alpha.
+              // Is there a nicer way to do this?
+              uint32_t int_result = _mm_cvtsi64_si32 (_mm_packs_pu16 (result, result));
+              int_result = (int_result & 0x00'ff'ff'ff) | (*dst_pixel & 0xff'00'00'00);
+
+              *dst_pixel = int_result;
+
+              dst_pixel++;
+              src_pixel++;
           }
 
-          destbuf += dest->m_impl->pitch - (dest->m_impl->width * dest->m_impl->bpp);
-          srcbuf += src->m_impl->pitch - (src->m_impl->width * src->m_impl->bpp);
+          dst_pixel_row_ptr += dst->m_impl->pitch;
+          src_pixel_row_ptr += src->m_impl->pitch;
       }
+
+      // FIXME: Some sources say this is not needed on x64, as MMX registers are no longer
+      // overlaid on the FP ones.
+      _mm_empty ();
+
 #endif /* !VISUAL_ARCH_X86 */
   }