Skip to content

Commit ad0a14d

Browse files
committed
Add varying pixel format support
This was added before but got lost somehow in the switchover midweek.
1 parent 9f3e97c commit ad0a14d

File tree

3 files changed

+153
-19
lines changed

3 files changed

+153
-19
lines changed

src_c/alphablit.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -581,6 +581,7 @@ SoftBlitPyGame(SDL_Surface *src, SDL_Rect *srcrect, SDL_Surface *dst,
581581
src->format->Rmask == dst->format->Rmask &&
582582
src->format->Gmask == dst->format->Gmask &&
583583
src->format->Bmask == dst->format->Bmask &&
584+
src->format->Amask == 0xFF000000 &&
584585
info.src_blend != SDL_BLENDMODE_NONE &&
585586
pg_HasSSE_NEON() && (src != dst)) {
586587
blit_blend_premultiplied_sse2(&info);

src_c/simd_blitters_avx2.c

Lines changed: 111 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -928,10 +928,18 @@ blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
928928
int dstpxskip = info->d_pxskip >> 2;
929929

930930
int pre_8_width = width % 8;
931-
int post_8_width = (width - pre_8_width) / 8;
931+
int post_8_width = width / 8;
932932

933933
/* if either surface has a non-zero alpha mask use that as our mask */
934934
Uint32 amask = info->src->Amask | info->dst->Amask;
935+
/* find the index 0, 1, 2 or 3 of the alpha channel within the pixel
936+
* this can vary depending on the channel order in the pixel format.
937+
* e.g. ARGB vs RGBA or BGRA
938+
*/
939+
char a_index = ((amask >> 8) == 0) ? 0
940+
: ((amask >> 16) == 0) ? 1
941+
: ((amask >> 24) == 0) ? 2
942+
: 3;
935943

936944
__m256i *srcp256 = (__m256i *)info->s_pixels;
937945
__m256i *dstp256 = (__m256i *)info->d_pixels;
@@ -942,27 +950,34 @@ blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
942950
mm256_shuff_alpha_mask_A, mm256_shuff_alpha_mask_B;
943951

944952
mm_zero = _mm_setzero_si128();
945-
mm_ones = _mm_set_epi64x(0x0000000000000000, 0x0001000100010001);
953+
mm_ones = _mm_set_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
954+
0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01);
946955

947956
mm256_shuff_mask_A =
948957
_mm256_set_epi8(0x80, 23, 0x80, 22, 0x80, 21, 0x80, 20, 0x80, 19, 0x80,
949958
18, 0x80, 17, 0x80, 16, 0x80, 7, 0x80, 6, 0x80, 5,
950959
0x80, 4, 0x80, 3, 0x80, 2, 0x80, 1, 0x80, 0);
951-
952-
mm256_shuff_alpha_mask_A =
953-
_mm256_set_epi8(0x80, 23, 0x80, 23, 0x80, 23, 0x80, 23, 0x80, 19, 0x80,
954-
19, 0x80, 19, 0x80, 19, 0x80, 7, 0x80, 7, 0x80, 7,
955-
0x80, 7, 0x80, 3, 0x80, 3, 0x80, 3, 0x80, 3);
960+
/* use the alpha index to eventually grab the alpha channel of each pixel
961+
*/
962+
mm256_shuff_alpha_mask_A = _mm256_set_epi8(
963+
0x80, 20 + a_index, 0x80, 20 + a_index, 0x80, 20 + a_index, 0x80,
964+
20 + a_index, 0x80, 16 + a_index, 0x80, 16 + a_index, 0x80,
965+
16 + a_index, 0x80, 16 + a_index, 0x80, 4 + a_index, 0x80, 4 + a_index,
966+
0x80, 4 + a_index, 0x80, 4 + a_index, 0x80, a_index, 0x80, a_index,
967+
0x80, a_index, 0x80, a_index);
956968

957969
mm256_shuff_mask_B =
958970
_mm256_set_epi8(0x80, 31, 0x80, 30, 0x80, 29, 0x80, 28, 0x80, 27, 0x80,
959971
26, 0x80, 25, 0x80, 24, 0x80, 15, 0x80, 14, 0x80, 13,
960972
0x80, 12, 0x80, 11, 0x80, 10, 0x80, 9, 0x80, 8);
961-
962-
mm256_shuff_alpha_mask_B =
963-
_mm256_set_epi8(0x80, 31, 0x80, 31, 0x80, 31, 0x80, 31, 0x80, 27, 0x80,
964-
27, 0x80, 27, 0x80, 27, 0x80, 15, 0x80, 15, 0x80, 15,
965-
0x80, 15, 0x80, 11, 0x80, 11, 0x80, 11, 0x80, 11);
973+
/* use the alpha index to eventually grab the alpha channel of each pixel
974+
*/
975+
mm256_shuff_alpha_mask_B = _mm256_set_epi8(
976+
0x80, 28 + a_index, 0x80, 28 + a_index, 0x80, 28 + a_index, 0x80,
977+
28 + a_index, 0x80, 24 + a_index, 0x80, 24 + a_index, 0x80,
978+
24 + a_index, 0x80, 24 + a_index, 0x80, 12 + a_index, 0x80,
979+
12 + a_index, 0x80, 12 + a_index, 0x80, 12 + a_index, 0x80,
980+
8 + a_index, 0x80, 8 + a_index, 0x80, 8 + a_index, 0x80, 8 + a_index);
966981

967982
mm256_ones = _mm256_set_epi8(
968983
0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
@@ -1026,24 +1041,99 @@ blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
10261041
mm256_src = _mm256_loadu_si256(srcp256);
10271042
mm256_dst = _mm256_loadu_si256(dstp256);
10281043

1029-
/* insert 8 pixel at a time blend here */
1030-
1031-
/* do everything A set first */
1044+
/* At the start we shuffle our initial 8 bit source &
1045+
* destination pixel channels into a 16 bit configuration,
1046+
* spaced out by 00s.
1047+
* We need the room of 16bits to do a potential max size
1048+
* 8bit x 8bit multiplication (e.g. 255 x 255 = 65,025).
1049+
*
1050+
* At the same time we are working backwards from our
1051+
* final packing instruction that lets us squish 2 256bit
1052+
* registers divided into 16 16bit values into one 256bit
1053+
* register containing 32 8bit values (or 8 32bit pixels
1054+
* worth of data).
1055+
*
1056+
* This is why we end up with the strange seeming initial
1057+
* shuffling around of pixel channels into two 256 bit
1058+
* registers with 00 gaps. The goal is to be able to
1059+
* perform the necessary blend calculation on each channel
1060+
* of all 8 pixels in as few operations as possible and
1061+
* then leave them arranged ready to be packed back up
1062+
* into a single 8 x 32bit pixel chunk of data again at
1063+
* the end.
1064+
*/
1065+
1066+
/* Do the shuffle, then the blend, on the A half first.
1067+
*/
1068+
1069+
/* these shuffles prepare our source, destination and
1070+
* alpha only channels for 16 bit operation and puts them
1071+
* in the correct order for the final pack with the B half
1072+
*/
10321073
mm256_dstA =
10331074
_mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_A);
1075+
/* mm256_dstA = (dst pixel 6:[00AA][00RR][00GG][00BB],
1076+
* dst pixel 5:[00AA][00RR][00GG][00BB],
1077+
* dst pixel 2:[00AA][00RR][00GG][00BB],
1078+
* dst pixel 1:[00AA][00RR][00GG][00BB])
1079+
*/
10341080
mm256_src_shuff =
10351081
_mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_A);
1082+
/* mm256_src_shuff = (src pixel 6:[00AA][00RR][00GG][00BB],
1083+
* src pixel 5:[00AA][00RR][00GG][00BB],
1084+
* src pixel 2:[00AA][00RR][00GG][00BB],
1085+
* src pixel 1:[00AA][00RR][00GG][00BB])
1086+
*/
10361087
mm256_alpha = _mm256_shuffle_epi8(
10371088
mm256_src, mm256_shuff_alpha_mask_A);
1089+
/* mm256_alpha = (src alpha 6:[00AA][00AA][00AA][00AA],
1090+
* src alpha 5:[00AA][00AA][00AA][00AA],
1091+
* src alpha 2:[00AA][00AA][00AA][00AA],
1092+
* src alpha 1:[00AA][00AA][00AA][00AA])
1093+
*/
1094+
1095+
/* blend on A half, at 16bit size, starts here.
1096+
* overall target blend (with colours and alpha represented
1097+
* as values between 0 and 1) is:
1098+
*
1099+
* result = source.RGB + (dest.RGB * (1 - source.A))
1100+
*
1101+
* Optimised and rearranged for values between 0 and 255
1102+
* the blend formula for a single colour channel is:
1103+
*
1104+
* (sC + dC - ((dC + 1) * sA >> 8))
1105+
*/
10381106
mm256_src_shuff =
10391107
_mm256_add_epi16(mm256_src_shuff, mm256_dstA);
1108+
/* That was Source channels + Destination channels */
10401109
mm256_dstA = _mm256_add_epi16(mm256_dstA, mm256_ones);
1110+
/* That was Destination channels plus one */
10411111
mm256_dstA = _mm256_mullo_epi16(mm256_alpha, mm256_dstA);
1112+
/* That was each of the Destination channels plus one
1113+
* multiplied by the Source Alpha channels.
1114+
*/
10421115
mm256_dstA = _mm256_srli_epi16(mm256_dstA, 8);
1116+
/* That was the right shift by 8bits on the result
1117+
* of the last two operations. It is, combined with the
1118+
* addition of the ones above, effectively
1119+
* a division operation by 255 for each channel putting
1120+
* us back in the 8bit, 0 - 255 range (but still with 00
1121+
* padding taking us up to 16bit for now)
1122+
*/
10431123

10441124
mm256_dstA = _mm256_sub_epi16(mm256_src_shuff, mm256_dstA);
1045-
1046-
/* now do B set */
1125+
/* this is the final subtraction completing the original
1126+
* colour channel blend formula. We now have blended
1127+
* channel values sitting in the same 16 bit, 00 padded
1128+
* arrangement of pixels as we did prior to the blend.
1129+
*/
1130+
1131+
/* end of A half blend */
1132+
1133+
/* now do B half shuffle then blend.
1134+
* The register shapes are the same as for the A half but
1135+
* with pixels 8,7,4 & 3 instead
1136+
*/
10471137
mm256_dstB =
10481138
_mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_B);
10491139
mm256_src_shuff =
@@ -1060,6 +1150,9 @@ blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
10601150

10611151
/* now pack A & B together */
10621152
mm256_dst = _mm256_packus_epi16(mm256_dstA, mm256_dstB);
1153+
/* After this pack operation our pixels are packed in the
1154+
* right order - 8,7,6,5,4,3,2,1 and the right 8bit size
1155+
*/
10631156
_mm256_storeu_si256(dstp256, mm256_dst);
10641157

10651158
srcp256++;
@@ -1075,8 +1168,7 @@ blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
10751168
void
10761169
blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
10771170
{
1078-
RAISE_AVX2_RUNTIME_SSE2_COMPILED_WARNING();
1079-
blit_blend_premultiplied_sse2(info);
1171+
BAD_AVX2_FUNCTION_CALL;
10801172
}
10811173
#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
10821174
!defined(SDL_DISABLE_IMMINTRIN_H) */

test/surface_test.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3651,6 +3651,47 @@ def test_premul_surf(
36513651
)
36523652
)
36533653

3654+
def test_blit_blend_premultiplied_pixelformats(self):
3655+
# first create two BGRA pixel format surfaces using
3656+
# tobytes & frombytes
3657+
argb_surf_a = pygame.Surface((64, 64), flags=pygame.SRCALPHA)
3658+
byte_surf_a = pygame.image.tobytes(argb_surf_a, "ARGB")
3659+
bgra_surf_a = pygame.image.frombytes(byte_surf_a, (64, 64), "ARGB")
3660+
3661+
argb_surf_b = pygame.Surface((64, 64), flags=pygame.SRCALPHA)
3662+
byte_surf_b = pygame.image.tobytes(argb_surf_b, "ARGB")
3663+
bgra_surf_b = pygame.image.frombytes(byte_surf_b, (64, 64), "ARGB")
3664+
3665+
argb_surf_a.fill((64, 0, 0, 128))
3666+
self.assertEqual(argb_surf_a.get_at((0, 0)),
3667+
pygame.Color(64, 0, 0, 128))
3668+
3669+
# 128 green, 128 blue at 50% alpha, premultiplied
3670+
argb_surf_b.fill((0, 64, 64, 128))
3671+
self.assertEqual(argb_surf_b.get_at((0, 0)),
3672+
pygame.Color(0, 64, 64, 128))
3673+
3674+
argb_surf_a.blit(argb_surf_b, (0, 0),
3675+
special_flags=pygame.BLEND_PREMULTIPLIED)
3676+
3677+
self.assertEqual(argb_surf_a.get_at((0, 0)),
3678+
pygame.Color(32, 64, 64, 192))
3679+
3680+
bgra_surf_a.fill((64, 0, 0, 128)) # 128 red at 50% alpha
3681+
self.assertEqual(bgra_surf_a.get_at((0, 0)),
3682+
pygame.Color(64, 0, 0, 128))
3683+
3684+
# 128 green, 128 blue at 50% alpha, premultiplied
3685+
bgra_surf_b.fill((0, 64, 64, 128))
3686+
self.assertEqual(bgra_surf_b.get_at((0, 0)),
3687+
pygame.Color(0, 64, 64, 128))
3688+
3689+
bgra_surf_a.blit(bgra_surf_b, (0, 0),
3690+
special_flags=pygame.BLEND_PREMULTIPLIED)
3691+
3692+
self.assertEqual(bgra_surf_a.get_at((0, 0)),
3693+
pygame.Color(32, 64, 64, 192))
3694+
36543695
def test_blit_blend_big_rect(self):
36553696
"""test that an oversized rect works ok."""
36563697
color = (1, 2, 3, 255)

0 commit comments

Comments
 (0)