
Commit 9f3e97c

Add an AVX2 version of the premultiplied alpha blend mode
1 parent a4fb81e commit 9f3e97c
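
For context on the arithmetic: premultiplied-alpha blending computes, per 8-bit channel, dst = src + dst * (255 - srcA) / 255. Both SIMD paths touched by this commit approximate the division by 255 as ((dst + 1) * srcA) >> 8, which is exact at srcA == 0 and srcA == 255 and never exceeds dst, so the subtraction cannot underflow. A scalar sketch of that per-channel step (written for this summary, not code from the commit):

#include <stdint.h>

/* Scalar model of the blend the SSE2/AVX2 blitters perform per channel.
   `covered` approximates dst * srcA / 255: the share of the destination
   hidden by the source pixel's alpha. */
static uint8_t
premul_blend_channel(uint8_t src, uint8_t dst, uint8_t src_alpha)
{
    unsigned covered = (((unsigned)dst + 1) * src_alpha) >> 8; /* <= dst */
    unsigned out = (unsigned)src + dst - covered;
    return (uint8_t)(out > 255 ? 255 : out); /* the vector pack saturates too */
}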

3 files changed: +192, -95 lines

src_c/alphablit.c

Lines changed: 19 additions & 95 deletions
@@ -109,10 +109,6 @@ blit_blend_rgba_max(SDL_BlitInfo *info);
 
 static void
 blit_blend_premultiplied(SDL_BlitInfo *info);
-#ifdef __MMX__
-static void
-blit_blend_premultiplied_mmx(SDL_BlitInfo *info);
-#endif /* __MMX__ */
 
 static int
 SoftBlitPyGame(SDL_Surface *src, SDL_Rect *srcrect, SDL_Surface *dst,
@@ -567,27 +563,32 @@ SoftBlitPyGame(SDL_Surface *src, SDL_Rect *srcrect, SDL_Surface *dst,
             break;
         }
         case PYGAME_BLEND_PREMULTIPLIED: {
+#if !defined(__EMSCRIPTEN__)
+#if SDL_BYTEORDER == SDL_LIL_ENDIAN
             if (src->format->BytesPerPixel == 4 &&
                 dst->format->BytesPerPixel == 4 &&
                 src->format->Rmask == dst->format->Rmask &&
                 src->format->Gmask == dst->format->Gmask &&
                 src->format->Bmask == dst->format->Bmask &&
-                info.src_blend != SDL_BLENDMODE_NONE) {
-#if defined(__MMX__) || defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)
+                info.src_blend != SDL_BLENDMODE_NONE &&
+                pg_has_avx2() && (src != dst)) {
+                blit_blend_premultiplied_avx2(&info);
+                break;
+            }
 #if PG_ENABLE_SSE_NEON
-                if (pg_HasSSE_NEON()) {
-                    blit_blend_premultiplied_sse2(&info);
-                    break;
-                }
-#endif /* PG_ENABLE_SSE_NEON */
-#ifdef __MMX__
-                if (SDL_HasMMX() == SDL_TRUE) {
-                    blit_blend_premultiplied_mmx(&info);
-                    break;
-                }
-#endif /*__MMX__*/
-#endif /*__MMX__ || __SSE2__ || PG_ENABLE_ARM_NEON*/
+            if (src->format->BytesPerPixel == 4 &&
+                dst->format->BytesPerPixel == 4 &&
+                src->format->Rmask == dst->format->Rmask &&
+                src->format->Gmask == dst->format->Gmask &&
+                src->format->Bmask == dst->format->Bmask &&
+                info.src_blend != SDL_BLENDMODE_NONE &&
+                pg_HasSSE_NEON() && (src != dst)) {
+                blit_blend_premultiplied_sse2(&info);
+                break;
             }
+#endif /* PG_ENABLE_SSE_NEON */
+#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */
+#endif /* __EMSCRIPTEN__ */
 
             blit_blend_premultiplied(&info);
             break;
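
Taken together, the rewritten dispatch above tries the widest vector path first and falls through to the portable blitter. The new (src != dst) guard keeps surfaces blitting onto themselves off the vector paths, and the whole ladder is compiled out on Emscripten and big-endian builds. Condensed sketch (fast_path_ok is shorthand invented here for the repeated 32-bit-pixel / matching-masks / src-blend test, not a variable in the commit):

if (fast_path_ok && pg_has_avx2() && src != dst)
    blit_blend_premultiplied_avx2(&info); /* new in this commit */
else if (fast_path_ok && pg_HasSSE_NEON() && src != dst)
    blit_blend_premultiplied_sse2(&info); /* existing SSE2/NEON blitter */
else
    blit_blend_premultiplied(&info); /* portable scalar fallback */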
@@ -1262,83 +1263,6 @@ blit_blend_rgba_max(SDL_BlitInfo *info)
     }
 }
 
-#ifdef __MMX__
-/* fast ARGB888->(A)RGB888 blending with pixel alpha */
-static void
-blit_blend_premultiplied_mmx(SDL_BlitInfo *info)
-{
-    int n;
-    int width = info->width;
-    int height = info->height;
-    Uint32 *srcp = (Uint32 *)info->s_pixels;
-    int srcskip = info->s_skip >> 2;
-    Uint32 *dstp = (Uint32 *)info->d_pixels;
-    int dstskip = info->d_skip >> 2;
-    SDL_PixelFormat *srcfmt = info->src;
-    Uint32 amask = srcfmt->Amask;
-    Uint32 ashift = srcfmt->Ashift;
-    Uint64 multmask2;
-
-    __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
-
-    mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
-    multmask2 = 0x00FF00FF00FF00FFULL;
-
-    while (height--) {
-        /* *INDENT-OFF* */
-        LOOP_UNROLLED4(
-            {
-                Uint32 alpha = *srcp & amask;
-                if (alpha == 0) {
-                    /* do nothing */
-                }
-                else if (alpha == amask) {
-                    *dstp = *srcp;
-                }
-                else {
-                    src1 = _mm_cvtsi32_si64(
-                        *srcp); /* src(ARGB) -> src1 (0000ARGB) */
-                    src1 =
-                        _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
-
-                    dst1 = _mm_cvtsi32_si64(
-                        *dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
-                    dst1 =
-                        _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
-
-                    mm_alpha = _mm_cvtsi32_si64(
-                        alpha); /* alpha -> mm_alpha (0000000A) */
-                    mm_alpha = _mm_srli_si64(
-                        mm_alpha,
-                        ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
-                    mm_alpha = _mm_unpacklo_pi16(
-                        mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
-                    mm_alpha2 = _mm_unpacklo_pi32(
-                        mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
-                    mm_alpha2 = _mm_xor_si64(
-                        mm_alpha2,
-                        *(__m64 *)&multmask2); /* 255 - mm_alpha -> mm_alpha */
-
-                    /* pre-multiplied alpha blend */
-                    dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
-                    dst1 = _mm_srli_pi16(dst1, 8);
-                    dst1 = _mm_add_pi16(src1, dst1);
-                    dst1 = _mm_packs_pu16(dst1, mm_zero);
-
-                    *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
-                }
-                ++srcp;
-                ++dstp;
-            },
-            n, width);
-        /* *INDENT-ON* */
-        srcp += srcskip;
-        dstp += dstskip;
-    }
-    _mm_empty();
-}
-#endif /*__MMX__*/
-
 static void
 blit_blend_premultiplied(SDL_BlitInfo *info)
 {
src_c/simd_blitters.h

Lines changed: 2 additions & 0 deletions
@@ -82,3 +82,5 @@ void
 blit_blend_rgba_min_avx2(SDL_BlitInfo *info);
 void
 blit_blend_rgb_min_avx2(SDL_BlitInfo *info);
+void
+blit_blend_premultiplied_avx2(SDL_BlitInfo *info);

src_c/simd_blitters_avx2.c

Lines changed: 171 additions & 0 deletions
@@ -909,3 +909,174 @@ blit_blend_rgb_min_avx2(SDL_BlitInfo *info)
 }
 #endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
           !defined(SDL_DISABLE_IMMINTRIN_H) */
+
+#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
+    !defined(SDL_DISABLE_IMMINTRIN_H)
+void
+blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
+{
+    int n;
+    int width = info->width;
+    int height = info->height;
+
+    Uint32 *srcp = (Uint32 *)info->s_pixels;
+    int srcskip = info->s_skip >> 2;
+    int srcpxskip = info->s_pxskip >> 2;
+
+    Uint32 *dstp = (Uint32 *)info->d_pixels;
+    int dstskip = info->d_skip >> 2;
+    int dstpxskip = info->d_pxskip >> 2;
+
+    int pre_8_width = width % 8;
+    int post_8_width = (width - pre_8_width) / 8;
+
+    /* if either surface has a non-zero alpha mask use that as our mask */
+    Uint32 amask = info->src->Amask | info->dst->Amask;
+
+    __m256i *srcp256 = (__m256i *)info->s_pixels;
+    __m256i *dstp256 = (__m256i *)info->d_pixels;
+
+    __m128i mm_src, mm_dst, mm_zero, mm_alpha, mm_sub_dst, mm_ones;
+    __m256i mm256_src, mm256_dst, mm256_shuff_mask_A, mm256_shuff_mask_B,
+        mm256_src_shuff, mm256_dstA, mm256_dstB, mm256_ones, mm256_alpha,
+        mm256_shuff_alpha_mask_A, mm256_shuff_alpha_mask_B;
+
+    mm_zero = _mm_setzero_si128();
+    mm_ones = _mm_set_epi64x(0x0000000000000000, 0x0001000100010001);
+
+    mm256_shuff_mask_A =
+        _mm256_set_epi8(0x80, 23, 0x80, 22, 0x80, 21, 0x80, 20, 0x80, 19, 0x80,
+                        18, 0x80, 17, 0x80, 16, 0x80, 7, 0x80, 6, 0x80, 5,
+                        0x80, 4, 0x80, 3, 0x80, 2, 0x80, 1, 0x80, 0);
+
+    mm256_shuff_alpha_mask_A =
+        _mm256_set_epi8(0x80, 23, 0x80, 23, 0x80, 23, 0x80, 23, 0x80, 19, 0x80,
+                        19, 0x80, 19, 0x80, 19, 0x80, 7, 0x80, 7, 0x80, 7,
+                        0x80, 7, 0x80, 3, 0x80, 3, 0x80, 3, 0x80, 3);
+
+    mm256_shuff_mask_B =
+        _mm256_set_epi8(0x80, 31, 0x80, 30, 0x80, 29, 0x80, 28, 0x80, 27, 0x80,
+                        26, 0x80, 25, 0x80, 24, 0x80, 15, 0x80, 14, 0x80, 13,
+                        0x80, 12, 0x80, 11, 0x80, 10, 0x80, 9, 0x80, 8);
+
+    mm256_shuff_alpha_mask_B =
+        _mm256_set_epi8(0x80, 31, 0x80, 31, 0x80, 31, 0x80, 31, 0x80, 27, 0x80,
+                        27, 0x80, 27, 0x80, 27, 0x80, 15, 0x80, 15, 0x80, 15,
+                        0x80, 15, 0x80, 11, 0x80, 11, 0x80, 11, 0x80, 11);
+
+    mm256_ones = _mm256_set_epi8(
+        0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
+        0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
+        0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01);
+
+    while (height--) {
+        if (pre_8_width > 0) {
+            /* one pixel at a time - same as current sse2 version */
+            LOOP_UNROLLED4(
+                {
+                    Uint32 alpha = *srcp & amask;
+                    if (alpha == 0) {
+                        /* do nothing */
+                    }
+                    else if (alpha == amask) {
+                        *dstp = *srcp;
+                    }
+                    else {
+                        mm_src = _mm_cvtsi32_si128(*srcp);
+                        /*mm_src = 0x000000000000000000000000AARRGGBB*/
+                        mm_src = _mm_unpacklo_epi8(mm_src, mm_zero);
+                        /*mm_src = 0x000000000000000000AA00RR00GG00BB*/
+                        mm_dst = _mm_cvtsi32_si128(*dstp);
+                        /*mm_dst = 0x000000000000000000000000AARRGGBB*/
+                        mm_dst = _mm_unpacklo_epi8(mm_dst, mm_zero);
+                        /*mm_dst = 0x000000000000000000AA00RR00GG00BB*/
+
+                        mm_alpha = _mm_cvtsi32_si128(alpha);
+                        /* alpha -> mm_alpha (000000000000A000) */
+                        mm_alpha = _mm_srli_si128(mm_alpha, 3);
+                        /* mm_alpha >> ashift -> mm_alpha(000000000000000A) */
+                        mm_alpha = _mm_unpacklo_epi16(mm_alpha, mm_alpha);
+                        /* 0000000000000A0A -> mm_alpha */
+                        mm_alpha = _mm_unpacklo_epi32(mm_alpha, mm_alpha);
+                        /* 000000000A0A0A0A -> mm_alpha2 */
+
+                        /* pre-multiplied alpha blend */
+                        mm_sub_dst = _mm_add_epi16(mm_dst, mm_ones);
+                        mm_sub_dst = _mm_mullo_epi16(mm_sub_dst, mm_alpha);
+                        mm_sub_dst = _mm_srli_epi16(mm_sub_dst, 8);
+                        mm_dst = _mm_add_epi16(mm_src, mm_dst);
+                        mm_dst = _mm_sub_epi16(mm_dst, mm_sub_dst);
+                        mm_dst = _mm_packus_epi16(mm_dst, mm_zero);
+
+                        *dstp = _mm_cvtsi128_si32(mm_dst);
+                    }
+
+                    srcp += srcpxskip;
+                    dstp += dstpxskip;
+                },
+                n, pre_8_width);
+        }
+        srcp256 = (__m256i *)srcp;
+        dstp256 = (__m256i *)dstp;
+        if (post_8_width > 0) {
+            /*8 pixels at a time, need to use shuffle to get everything
+              lined up - see mul for an example*/
+            LOOP_UNROLLED4(
+                {
+                    mm256_src = _mm256_loadu_si256(srcp256);
+                    mm256_dst = _mm256_loadu_si256(dstp256);
+
+                    /* insert 8 pixel at a time blend here */
+
+                    /* do everything A set first */
+                    mm256_dstA =
+                        _mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_A);
+                    mm256_src_shuff =
+                        _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_A);
+                    mm256_alpha = _mm256_shuffle_epi8(
+                        mm256_src, mm256_shuff_alpha_mask_A);
+                    mm256_src_shuff =
+                        _mm256_add_epi16(mm256_src_shuff, mm256_dstA);
+                    mm256_dstA = _mm256_add_epi16(mm256_dstA, mm256_ones);
+                    mm256_dstA = _mm256_mullo_epi16(mm256_alpha, mm256_dstA);
+                    mm256_dstA = _mm256_srli_epi16(mm256_dstA, 8);
+
+                    mm256_dstA = _mm256_sub_epi16(mm256_src_shuff, mm256_dstA);
+
+                    /* now do B set */
+                    mm256_dstB =
+                        _mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_B);
+                    mm256_src_shuff =
+                        _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_B);
+                    mm256_alpha = _mm256_shuffle_epi8(
+                        mm256_src, mm256_shuff_alpha_mask_B);
+                    mm256_src_shuff =
+                        _mm256_add_epi16(mm256_src_shuff, mm256_dstB);
+                    mm256_dstB = _mm256_add_epi16(mm256_dstB, mm256_ones);
+                    mm256_dstB = _mm256_mullo_epi16(mm256_alpha, mm256_dstB);
+                    mm256_dstB = _mm256_srli_epi16(mm256_dstB, 8);
+
+                    mm256_dstB = _mm256_sub_epi16(mm256_src_shuff, mm256_dstB);
+
+                    /* now pack A & B together */
+                    mm256_dst = _mm256_packus_epi16(mm256_dstA, mm256_dstB);
+                    _mm256_storeu_si256(dstp256, mm256_dst);
+
+                    srcp256++;
+                    dstp256++;
+                },
+                n, post_8_width);
+        }
+        srcp = (Uint32 *)srcp256 + srcskip;
+        dstp = (Uint32 *)dstp256 + dstskip;
+    }
+}
+#else
+void
+blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
+{
+    RAISE_AVX2_RUNTIME_SSE2_COMPILED_WARNING();
+    blit_blend_premultiplied_sse2(info);
+}
+#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
+          !defined(SDL_DISABLE_IMMINTRIN_H) */
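
A note on the shuffle masks above, since they carry most of the logic: _mm256_shuffle_epi8 indexes bytes within each 128-bit lane and writes zero wherever a mask byte has its high bit (0x80) set, so interleaving real byte indices with 0x80 zero-extends packed 8-bit channels into 16-bit words. Mask A widens the low 8 bytes of each lane (pixels 0-1 and 4-5), mask B the high 8 bytes (pixels 2-3 and 6-7), and the alpha masks broadcast byte 3 of each pixel (its alpha channel in this little-endian layout) across the four channel words; _mm256_packus_epi16 then saturate-packs the A and B halves back to bytes, while any width % 8 leftover pixels are handled by the scalar path at the top of each row. A standalone illustration of the widening trick, using the 128-bit SSSE3 variant for brevity (not code from the commit; build with -mssse3 or higher):

#include <immintrin.h>
#include <stdio.h>

int
main(void)
{
    /* two 0xAARRGGBB pixels in the low 8 bytes */
    __m128i px = _mm_set_epi32(0, 0, 0x80402010, 0xFF664422);
    /* interleave byte indices 0..7 with 0x80 (zero fill) to widen */
    __m128i widen = _mm_set_epi8(0x80, 7, 0x80, 6, 0x80, 5, 0x80, 4,
                                 0x80, 3, 0x80, 2, 0x80, 1, 0x80, 0);
    unsigned short out[8];
    _mm_storeu_si128((__m128i *)out, _mm_shuffle_epi8(px, widen));
    for (int i = 0; i < 8; i++)
        printf("%04x ", (unsigned)out[i]);
    printf("\n"); /* 0022 0044 0066 00ff 0010 0020 0040 0080 */
    return 0;
}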
