@@ -1529,3 +1529,117 @@ blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
}
#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
          !defined(SDL_DISABLE_IMMINTRIN_H) */

#define PREMUL_ALPHA_CODE \
    /* extract the alpha */ \
    mm_alpha_in = _mm256_and_si256(mm_src, mm256_amask); \
    \
    /* redistribute the alphas across the R, G, B channels */ \
    alphaA = _mm256_shuffle_epi8(mm_src, shuffle_maskA); \
    alphaB = _mm256_shuffle_epi8(mm_src, shuffle_maskB); \
    \
    /* prep the pixels for 16-bit math */ \
    mm_srcA = _mm256_unpacklo_epi8(mm_src, mm_zero); \
    mm_srcB = _mm256_unpackhi_epi8(mm_src, mm_zero); \
    \
    mm_srcA = _mm256_add_epi16(mm_srcA, mm256_ones); \
    mm_srcB = _mm256_add_epi16(mm_srcB, mm256_ones); \
    \
    /* multiply the pixels by the alphas */ \
    mm_srcA = _mm256_mullo_epi16(mm_srcA, alphaA); \
    mm_srcB = _mm256_mullo_epi16(mm_srcB, alphaB); \
    \
    /* shift the pixels back down to 8-bit */ \
    mm_srcA = _mm256_srli_epi16(mm_srcA, 8); \
    mm_srcB = _mm256_srli_epi16(mm_srcB, 8); \
    \
    /* pack the pixels back together */ \
    mm_dst = _mm256_packus_epi16(mm_srcA, mm_srcB); \
    /* add the original alpha back in */ \
    mm_dst = _mm256_or_si256(mm_dst, mm_alpha_in);
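
/* For reference (not used anywhere): per 8-bit colour channel c with alpha a,
 * the macro above computes roughly
 *
 *     c_premul = ((c + 1) * a) >> 8;
 *
 * a divisionless approximation of c * a / 255 (255 * 255 still maps to 255),
 * while the pixel's original alpha byte is restored from mm_alpha_in. */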

#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
    !defined(SDL_DISABLE_IMMINTRIN_H)
void
premul_surf_color_by_alpha_avx2(SDL_Surface *src, SDL_Surface *dst)
{
    int i, height = src->h;
    const int width = src->w;
    const int n_iters_8 = width / 8;
    const int pxl_excess = width % 8;
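    /* pitch is in bytes; dividing by 4 gives the row stride in 32-bit pixels,
     * so the *_skip values are the padding to jump over at the end of a row */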
    const int src_skip = src->pitch / 4 - width;
    const int dst_skip = dst->pitch / 4 - width;
    const int src_exc_skip = pxl_excess + src_skip;
    const int dst_exc_skip = pxl_excess + dst_skip;

    Uint32 *srcp = (Uint32 *)src->pixels;
    Uint32 *dstp = (Uint32 *)dst->pixels;

    __m256i mm_src, mm_dst, alphaA, alphaB, mm_alpha_in;
    __m256i mm_srcA, mm_srcB;

    const __m256i mm256_amask = _mm256_set1_epi32(src->format->Amask);
    const __m256i mm_zero = _mm256_setzero_si256();
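    /* lane mask for the trailing width % 8 pixels: -1 (all bits set) selects
     * a lane for _mm256_maskload/maskstore_epi32, 0 leaves it untouched;
     * _mm256_set_epi32 lists lanes from highest to lowest */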
    const __m256i partial_mask =
        _mm256_set_epi32(0, pxl_excess > 6 ? -1 : 0, pxl_excess > 5 ? -1 : 0,
                         pxl_excess > 4 ? -1 : 0, pxl_excess > 3 ? -1 : 0,
                         pxl_excess > 2 ? -1 : 0, pxl_excess > 1 ? -1 : 0,
                         pxl_excess > 0 ? -1 : 0);
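    /* added to every 16-bit channel before the multiply so that
     * ((c + 1) * a) >> 8 maps c = 255, a = 255 back to exactly 255 */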
    const __m256i mm256_ones = _mm256_set1_epi16(0x0001);

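    /* byte offset (0-3) of the alpha channel inside each 32-bit pixel,
     * derived from the surface's Amask */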
    char _a_off = ((src->format->Amask >> 8) == 0)    ? 0
                  : ((src->format->Amask >> 16) == 0) ? 1
                  : ((src->format->Amask >> 24) == 0) ? 2
                                                      : 3;

    /* masks for shuffling the alpha to the RGB channels for multiplication */
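    /* a shuffle index of -1 produces a zero byte, so the alpha lane of
     * alphaA/alphaB stays 0 and the true alpha is OR-ed back in at the end */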
    const __m256i shuffle_maskA = _mm256_set_epi8(
        -1, -1, -1, 20 + _a_off, -1, 20 + _a_off, -1, 20 + _a_off, -1, -1, -1,
        16 + _a_off, -1, 16 + _a_off, -1, 16 + _a_off, -1, -1, -1, 4 + _a_off,
        -1, 4 + _a_off, -1, 4 + _a_off, -1, -1, -1, _a_off, -1, _a_off, -1,
        _a_off);

    const __m256i shuffle_maskB = _mm256_set_epi8(
        -1, -1, -1, 28 + _a_off, -1, 28 + _a_off, -1, 28 + _a_off, -1, -1, -1,
        24 + _a_off, -1, 24 + _a_off, -1, 24 + _a_off, -1, -1, -1, 12 + _a_off,
        -1, 12 + _a_off, -1, 12 + _a_off, -1, -1, -1, 8 + _a_off, -1,
        8 + _a_off, -1, 8 + _a_off);

    while (height--) {
        /* 8 pixels at a time */
        for (i = 0; i < n_iters_8; i++) {
            mm_src = _mm256_loadu_si256((__m256i *)srcp);

            PREMUL_ALPHA_CODE;

            _mm256_storeu_si256((__m256i *)dstp, mm_dst);

            dstp += 8;
            srcp += 8;
        }

        /* up to 7 pixels at a time */
        if (pxl_excess) {
            mm_src = _mm256_maskload_epi32((int *)srcp, partial_mask);

            PREMUL_ALPHA_CODE

            _mm256_maskstore_epi32((int *)dstp, partial_mask, mm_dst);

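            /* the *_exc_skip offsets already include the per-row padding,
             * so the plain skip adjustment below is bypassed via continue */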
            srcp += src_exc_skip;
            dstp += dst_exc_skip;
            continue;
        }

        srcp += src_skip;
        dstp += dst_skip;
    }
}
#else
void
premul_surf_color_by_alpha_avx2(SDL_Surface *src, SDL_Surface *dst)
{
    BAD_AVX2_FUNCTION_CALL;
}
#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
          !defined(SDL_DISABLE_IMMINTRIN_H) */
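
For context, a minimal caller sketch (not part of this patch): the dispatcher name premultiply_surface_alpha is hypothetical, and both surfaces are assumed to be same-sized 32-bit surfaces with a valid Amask. It guards the AVX2 routine behind a runtime CPU check and falls back to the same ((c + 1) * a) >> 8 per-channel math in scalar code.

#include <SDL.h>

/* Hypothetical dispatcher (illustrative only): use the AVX2 routine when the
 * CPU supports it, otherwise run the equivalent scalar loop. */
static void
premultiply_surface_alpha(SDL_Surface *src, SDL_Surface *dst)
{
    if (SDL_HasAVX2()) {
        premul_surf_color_by_alpha_avx2(src, dst);
        return;
    }

    for (int y = 0; y < src->h; y++) {
        Uint32 *srow = (Uint32 *)((Uint8 *)src->pixels + y * src->pitch);
        Uint32 *drow = (Uint32 *)((Uint8 *)dst->pixels + y * dst->pitch);
        for (int x = 0; x < src->w; x++) {
            Uint32 px = srow[x];
            Uint32 alpha = (px & src->format->Amask) >> src->format->Ashift;
            Uint32 out = px & src->format->Amask; /* keep the original alpha */

            for (int shift = 0; shift < 32; shift += 8) {
                Uint32 mask = 0xFFu << shift;
                if (mask == src->format->Amask)
                    continue;
                Uint32 c = (px & mask) >> shift;
                /* same approximation as the SIMD path: (c + 1) * a >> 8 */
                out |= (((c + 1) * alpha) >> 8) << shift;
            }
            drow[x] = out;
        }
    }
}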