Add varying pixel format support

MyreMylar · MyreMylar · commit ad0a14d2c37a · 2023-02-11T11:32:14.000Z
This was added before but got lost somehow in the switchover midweek.
diff --git a/src_c/alphablit.c b/src_c/alphablit.c
@@ -581,6 +581,7 @@ SoftBlitPyGame(SDL_Surface *src, SDL_Rect *srcrect, SDL_Surface *dst,
                         src->format->Rmask == dst->format->Rmask &&
                         src->format->Gmask == dst->format->Gmask &&
                         src->format->Bmask == dst->format->Bmask &&
+                        src->format->Amask == 0xFF000000 &&
                         info.src_blend != SDL_BLENDMODE_NONE &&
                         pg_HasSSE_NEON() && (src != dst)) {
                         blit_blend_premultiplied_sse2(&info);
diff --git a/src_c/simd_blitters_avx2.c b/src_c/simd_blitters_avx2.c
@@ -928,10 +928,18 @@ blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
     int dstpxskip = info->d_pxskip >> 2;
 
     int pre_8_width = width % 8;
-    int post_8_width = (width - pre_8_width) / 8;
+    int post_8_width = width / 8;
 
     /* if either surface has a non-zero alpha mask use that as our mask */
     Uint32 amask = info->src->Amask | info->dst->Amask;
+    /* find the index 0, 1, 2 or 3 of the alpha channel within the pixel
+     * this can vary depending on the channel order in the pixel format.
+     * e.g. ARGB vs RGBA or BGRA
+     */
+    char a_index = ((amask >> 8) == 0)    ? 0
+                   : ((amask >> 16) == 0) ? 1
+                   : ((amask >> 24) == 0) ? 2
+                                          : 3;
 
     __m256i *srcp256 = (__m256i *)info->s_pixels;
     __m256i *dstp256 = (__m256i *)info->d_pixels;
@@ -942,27 +950,34 @@ blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
         mm256_shuff_alpha_mask_A, mm256_shuff_alpha_mask_B;
 
     mm_zero = _mm_setzero_si128();
-    mm_ones = _mm_set_epi64x(0x0000000000000000, 0x0001000100010001);
+    mm_ones = _mm_set_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                           0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01);
 
     mm256_shuff_mask_A =
         _mm256_set_epi8(0x80, 23, 0x80, 22, 0x80, 21, 0x80, 20, 0x80, 19, 0x80,
                         18, 0x80, 17, 0x80, 16, 0x80, 7, 0x80, 6, 0x80, 5,
                         0x80, 4, 0x80, 3, 0x80, 2, 0x80, 1, 0x80, 0);
-
-    mm256_shuff_alpha_mask_A =
-        _mm256_set_epi8(0x80, 23, 0x80, 23, 0x80, 23, 0x80, 23, 0x80, 19, 0x80,
-                        19, 0x80, 19, 0x80, 19, 0x80, 7, 0x80, 7, 0x80, 7,
-                        0x80, 7, 0x80, 3, 0x80, 3, 0x80, 3, 0x80, 3);
+    /* use the alpha index to eventually grab the alpha channel of each pixel
+     */
+    mm256_shuff_alpha_mask_A = _mm256_set_epi8(
+        0x80, 20 + a_index, 0x80, 20 + a_index, 0x80, 20 + a_index, 0x80,
+        20 + a_index, 0x80, 16 + a_index, 0x80, 16 + a_index, 0x80,
+        16 + a_index, 0x80, 16 + a_index, 0x80, 4 + a_index, 0x80, 4 + a_index,
+        0x80, 4 + a_index, 0x80, 4 + a_index, 0x80, a_index, 0x80, a_index,
+        0x80, a_index, 0x80, a_index);
 
     mm256_shuff_mask_B =
         _mm256_set_epi8(0x80, 31, 0x80, 30, 0x80, 29, 0x80, 28, 0x80, 27, 0x80,
                         26, 0x80, 25, 0x80, 24, 0x80, 15, 0x80, 14, 0x80, 13,
                         0x80, 12, 0x80, 11, 0x80, 10, 0x80, 9, 0x80, 8);
-
-    mm256_shuff_alpha_mask_B =
-        _mm256_set_epi8(0x80, 31, 0x80, 31, 0x80, 31, 0x80, 31, 0x80, 27, 0x80,
-                        27, 0x80, 27, 0x80, 27, 0x80, 15, 0x80, 15, 0x80, 15,
-                        0x80, 15, 0x80, 11, 0x80, 11, 0x80, 11, 0x80, 11);
+    /* use the alpha index to eventually grab the alpha channel of each pixel
+     */
+    mm256_shuff_alpha_mask_B = _mm256_set_epi8(
+        0x80, 28 + a_index, 0x80, 28 + a_index, 0x80, 28 + a_index, 0x80,
+        28 + a_index, 0x80, 24 + a_index, 0x80, 24 + a_index, 0x80,
+        24 + a_index, 0x80, 24 + a_index, 0x80, 12 + a_index, 0x80,
+        12 + a_index, 0x80, 12 + a_index, 0x80, 12 + a_index, 0x80,
+        8 + a_index, 0x80, 8 + a_index, 0x80, 8 + a_index, 0x80, 8 + a_index);
 
     mm256_ones = _mm256_set_epi8(
         0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
@@ -1026,24 +1041,99 @@ blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
                     mm256_src = _mm256_loadu_si256(srcp256);
                     mm256_dst = _mm256_loadu_si256(dstp256);
 
-                    /* insert 8 pixel at a time blend here */
-
-                    /* do everything A set first */
+                    /* At the start we shuffle our initial 8 bit source &
+                     * destination pixel channels into a 16 bit configuration,
+                     * spaced out by 00s.
+                     * We need the room of 16bits to do a potential max size
+                     * 8bit x 8bit multiplication (e.g. 255 x 255 = 65,025).
+                     *
+                     * At the same time we are working backwards from our
+                     * final packing instruction that lets us squish 2 256bit
+                     * registers divided into 16 16bit values into one 256bit
+                     * register containing 32 8bit values (or 8 32bit pixels
+                     * worth of data).
+                     *
+                     * This is why we end up with the strange seeming initial
+                     * shuffling around of pixel channels into two 256 bit
+                     * registers with 00 gaps. The goal is to be able to
+                     * perform the necessary blend calculation on each channel
+                     * of all 8 pixels in as few operations as possible and
+                     * then leave them arranged ready to be packed back up
+                     * into a single 8 x 32bit pixel chunk of data again at
+                     * the end.
+                     */
+
+                    /* Do shuffle then blend to the A half first .
+                     */
+
+                    /* these shuffles prepare our source, destination and
+                     * alpha only channels for 16 bit operation and puts them
+                     * in the correct order for the final pack with the B half
+                     */
                     mm256_dstA =
                         _mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_A);
+                    /* mm256_dstA = (dst pixel 6:[00AA][00RR][00GG][00BB],
+                     *               dst pixel 5:[00AA][00RR][00GG][00BB],
+                     *               dst pixel 2:[00AA][00RR][00GG][00BB],
+                     *               dst pixel 1:[00AA][00RR][00GG][00BB])
+                     */
                     mm256_src_shuff =
                         _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_A);
+                    /* mm256_src_shuff = (src pixel 6:[00AA][00RR][00GG][00BB],
+                     *                    src pixel 5:[00AA][00RR][00GG][00BB],
+                     *                    src pixel 2:[00AA][00RR][00GG][00BB],
+                     *                    src pixel 1:[00AA][00RR][00GG][00BB])
+                     */
                     mm256_alpha = _mm256_shuffle_epi8(
                         mm256_src, mm256_shuff_alpha_mask_A);
+                    /* mm256_alpha = (src alpha 6:[00AA][00AA][00AA][00AA],
+                     *                src alpha 5:[00AA][00AA][00AA][00AA],
+                     *                src alpha 2:[00AA][00AA][00AA][00AA],
+                     *                src alpha 1:[00AA][00AA][00AA][00AA])
+                     */
+
+                    /* blend on A half, at 16bit size, starts here.
+                     * overall target blend (with colours and alpha represented
+                     * as values between 0 and 1) is:
+                     *
+                     * result = source.RGB + (dest.RGB * (1 - source.A))
+                     *
+                     * Optimised and rearranged for values between 0 and 255
+                     * the blend formula for a single colour channel is:
+                     *
+                     * (sC + dC - ((dC + 1) * sA >> 8))
+                     */
                     mm256_src_shuff =
                         _mm256_add_epi16(mm256_src_shuff, mm256_dstA);
+                    /* That was Source channels + Destination channels */
                     mm256_dstA = _mm256_add_epi16(mm256_dstA, mm256_ones);
+                    /* That was Destination channels plus one */
                     mm256_dstA = _mm256_mullo_epi16(mm256_alpha, mm256_dstA);
+                    /* That was each of the Destination channels plus one
+                     * multiplied by the Source Alpha channels.
+                     */
                     mm256_dstA = _mm256_srli_epi16(mm256_dstA, 8);
+                    /* That was the right shift by 8bits on the result
+                     * of the last two operations. It is, combined with the
+                     * addition of the ones above, effectively
+                     * a division operation by 255 for each channel putting
+                     * us back in the 8bit, 0 - 255 range (but still with 00)
+                     * padding taking us up to 16bit for now)
+                     */
 
                     mm256_dstA = _mm256_sub_epi16(mm256_src_shuff, mm256_dstA);
-
-                    /* now do B set */
+                    /* this is the final subtraction completing the original
+                     * colour channel blend formula. We now have blended
+                     * channel values sitting in the same 16 bit, 00 padded
+                     * arrangement of pixels as we did prior to the blend.
+                     */
+
+                    /* end of A half blend */
+
+                    /* now do B half shuffle then blend.
+                     * The register shapes are the same as for the A half but
+                     * with pixels 8,7,4 & 3 instead
+                     */
                     mm256_dstB =
                         _mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_B);
                     mm256_src_shuff =
@@ -1060,6 +1150,9 @@ blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
 
                     /* now pack A & B together */
                     mm256_dst = _mm256_packus_epi16(mm256_dstA, mm256_dstB);
+                    /* After this pack operation our pixels are pack in the
+                     * right order - 8,7,6,5,4,3,2,1 and the right 8bit size
+                     */
                     _mm256_storeu_si256(dstp256, mm256_dst);
 
                     srcp256++;
@@ -1075,8 +1168,7 @@ blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
 void
 blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
 {
-    RAISE_AVX2_RUNTIME_SSE2_COMPILED_WARNING();
-    blit_blend_premultiplied_sse2(info);
+    BAD_AVX2_FUNCTION_CALL;
 }
 #endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
           !defined(SDL_DISABLE_IMMINTRIN_H) */
diff --git a/test/surface_test.py b/test/surface_test.py
@@ -3651,6 +3651,47 @@ def test_premul_surf(
             )
         )
 
+    def test_blit_blend_premultiplied_pixelformats(self):
+        # first create two BGRA pixel format surfaces using
+        # tobytes & frombytes
+        argb_surf_a = pygame.Surface((64, 64), flags=pygame.SRCALPHA)
+        byte_surf_a = pygame.image.tobytes(argb_surf_a, "ARGB")
+        bgra_surf_a = pygame.image.frombytes(byte_surf_a, (64, 64), "ARGB")
+
+        argb_surf_b = pygame.Surface((64, 64), flags=pygame.SRCALPHA)
+        byte_surf_b = pygame.image.tobytes(argb_surf_b, "ARGB")
+        bgra_surf_b = pygame.image.frombytes(byte_surf_b, (64, 64), "ARGB")
+
+        argb_surf_a.fill((64, 0, 0, 128))
+        self.assertEqual(argb_surf_a.get_at((0, 0)),
+                         pygame.Color(64, 0, 0, 128))
+
+        # 128 green, 128 blue at 50% alpha, premultiplied
+        argb_surf_b.fill((0, 64, 64, 128))
+        self.assertEqual(argb_surf_b.get_at((0, 0)),
+                         pygame.Color(0, 64, 64, 128))
+
+        argb_surf_a.blit(argb_surf_b, (0, 0),
+                         special_flags=pygame.BLEND_PREMULTIPLIED)
+
+        self.assertEqual(argb_surf_a.get_at((0, 0)),
+                         pygame.Color(32, 64, 64, 192))
+
+        bgra_surf_a.fill((64, 0, 0, 128))  # 128 red at 50% alpha
+        self.assertEqual(bgra_surf_a.get_at((0, 0)),
+                         pygame.Color(64, 0, 0, 128))
+
+        # 128 green, 128 blue at 50% alpha, premultiplied
+        bgra_surf_b.fill((0, 64, 64, 128))
+        self.assertEqual(bgra_surf_b.get_at((0, 0)),
+                         pygame.Color(0, 64, 64, 128))
+
+        bgra_surf_a.blit(bgra_surf_b, (0, 0),
+                         special_flags=pygame.BLEND_PREMULTIPLIED)
+
+        self.assertEqual(bgra_surf_a.get_at((0, 0)),
+                         pygame.Color(32, 64, 64, 192))
+
     def test_blit_blend_big_rect(self):
         """test that an oversized rect works ok."""
         color = (1, 2, 3, 255)