@@ -909,3 +909,174 @@ blit_blend_rgb_min_avx2(SDL_BlitInfo *info)
 }
 #endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
           !defined(SDL_DISABLE_IMMINTRIN_H) */
+
+#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
+    !defined(SDL_DISABLE_IMMINTRIN_H)
+void
+blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
+{
+    int n;
+    int width = info->width;
+    int height = info->height;
+
+    Uint32 *srcp = (Uint32 *)info->s_pixels;
+    int srcskip = info->s_skip >> 2;
+    int srcpxskip = info->s_pxskip >> 2;
+
+    Uint32 *dstp = (Uint32 *)info->d_pixels;
+    int dstskip = info->d_skip >> 2;
+    int dstpxskip = info->d_pxskip >> 2;
+
+    int pre_8_width = width % 8;
+    int post_8_width = (width - pre_8_width) / 8;
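+    /* each row blends the first pre_8_width pixels one at a time, then the
+       remaining post_8_width groups of 8 pixels with AVX2 */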
+
+    /* if either surface has a non-zero alpha mask use that as our mask */
+    Uint32 amask = info->src->Amask | info->dst->Amask;
+
+    __m256i *srcp256 = (__m256i *)info->s_pixels;
+    __m256i *dstp256 = (__m256i *)info->d_pixels;
+
+    __m128i mm_src, mm_dst, mm_zero, mm_alpha, mm_sub_dst, mm_ones;
+    __m256i mm256_src, mm256_dst, mm256_shuff_mask_A, mm256_shuff_mask_B,
+        mm256_src_shuff, mm256_dstA, mm256_dstB, mm256_ones, mm256_alpha,
+        mm256_shuff_alpha_mask_A, mm256_shuff_alpha_mask_B;
+
+    mm_zero = _mm_setzero_si128();
+    mm_ones = _mm_set_epi64x(0x0000000000000000, 0x0001000100010001);
+
+    mm256_shuff_mask_A =
+        _mm256_set_epi8(0x80, 23, 0x80, 22, 0x80, 21, 0x80, 20, 0x80, 19, 0x80,
+                        18, 0x80, 17, 0x80, 16, 0x80, 7, 0x80, 6, 0x80, 5,
+                        0x80, 4, 0x80, 3, 0x80, 2, 0x80, 1, 0x80, 0);
+
+    mm256_shuff_alpha_mask_A =
+        _mm256_set_epi8(0x80, 23, 0x80, 23, 0x80, 23, 0x80, 23, 0x80, 19, 0x80,
+                        19, 0x80, 19, 0x80, 19, 0x80, 7, 0x80, 7, 0x80, 7,
+                        0x80, 7, 0x80, 3, 0x80, 3, 0x80, 3, 0x80, 3);
+
+    mm256_shuff_mask_B =
+        _mm256_set_epi8(0x80, 31, 0x80, 30, 0x80, 29, 0x80, 28, 0x80, 27, 0x80,
+                        26, 0x80, 25, 0x80, 24, 0x80, 15, 0x80, 14, 0x80, 13,
+                        0x80, 12, 0x80, 11, 0x80, 10, 0x80, 9, 0x80, 8);
+
+    mm256_shuff_alpha_mask_B =
+        _mm256_set_epi8(0x80, 31, 0x80, 31, 0x80, 31, 0x80, 31, 0x80, 27, 0x80,
+                        27, 0x80, 27, 0x80, 27, 0x80, 15, 0x80, 15, 0x80, 15,
+                        0x80, 15, 0x80, 11, 0x80, 11, 0x80, 11, 0x80, 11);
+
+    mm256_ones = _mm256_set_epi8(
+        0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
+        0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
+        0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01);
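+
+    /* shuff_mask_A/B unpack the two low / two high pixels of each 128-bit
+       lane into 16-bit channels (a 0x80 control byte writes a zero byte),
+       the alpha masks broadcast each pixel's alpha across its four channels,
+       and mm_ones / mm256_ones supply the +1 used below to approximate
+       division by 255 with a right shift by 8 */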
+
+    while (height--) {
+        if (pre_8_width > 0) {
+            /* one pixel at a time - same as current sse2 version */
+            LOOP_UNROLLED4(
+                {
+                    Uint32 alpha = *srcp & amask;
+                    if (alpha == 0) {
+                        /* do nothing */
+                    }
+                    else if (alpha == amask) {
+                        *dstp = *srcp;
+                    }
+                    else {
+                        mm_src = _mm_cvtsi32_si128(*srcp);
+                        /* mm_src = 0x000000000000000000000000AARRGGBB */
+                        mm_src = _mm_unpacklo_epi8(mm_src, mm_zero);
+                        /* mm_src = 0x000000000000000000AA00RR00GG00BB */
+                        mm_dst = _mm_cvtsi32_si128(*dstp);
+                        /* mm_dst = 0x000000000000000000000000AARRGGBB */
+                        mm_dst = _mm_unpacklo_epi8(mm_dst, mm_zero);
+                        /* mm_dst = 0x000000000000000000AA00RR00GG00BB */
+
+                        mm_alpha = _mm_cvtsi32_si128(alpha);
+                        /* alpha -> mm_alpha (000000000000A000) */
+                        mm_alpha = _mm_srli_si128(mm_alpha, 3);
+                        /* mm_alpha >> 24 -> mm_alpha (000000000000000A) */
+                        mm_alpha = _mm_unpacklo_epi16(mm_alpha, mm_alpha);
+                        /* 0000000000000A0A -> mm_alpha */
+                        mm_alpha = _mm_unpacklo_epi32(mm_alpha, mm_alpha);
+                        /* 000000000A0A0A0A -> mm_alpha */
+
+                        /* pre-multiplied alpha blend:
+                           dst = src + dst - ((dst + 1) * srcA >> 8) */
+                        mm_sub_dst = _mm_add_epi16(mm_dst, mm_ones);
+                        mm_sub_dst = _mm_mullo_epi16(mm_sub_dst, mm_alpha);
+                        mm_sub_dst = _mm_srli_epi16(mm_sub_dst, 8);
+                        mm_dst = _mm_add_epi16(mm_src, mm_dst);
+                        mm_dst = _mm_sub_epi16(mm_dst, mm_sub_dst);
+                        mm_dst = _mm_packus_epi16(mm_dst, mm_zero);
+
+                        *dstp = _mm_cvtsi128_si32(mm_dst);
+                    }
+
+                    srcp += srcpxskip;
+                    dstp += dstpxskip;
+                },
+                n, pre_8_width);
+        }
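+        /* continue the row from wherever the scalar prefix stopped */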
+        srcp256 = (__m256i *)srcp;
+        dstp256 = (__m256i *)dstp;
+        if (post_8_width > 0) {
+            /* 8 pixels at a time, need to use shuffle to get everything
+               lined up - see mul for an example */
+            LOOP_UNROLLED4(
+                {
+                    mm256_src = _mm256_loadu_si256(srcp256);
+                    mm256_dst = _mm256_loadu_si256(dstp256);
+
+                    /* same blend as the scalar path, on 8 pixels split into
+                       an A set (low two pixels of each 128-bit lane) and a
+                       B set (high two):
+                       dst = src + dst - ((dst + 1) * srcA >> 8) */
+
+                    /* do everything A set first */
+                    mm256_dstA =
+                        _mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_A);
+                    mm256_src_shuff =
+                        _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_A);
+                    mm256_alpha = _mm256_shuffle_epi8(
+                        mm256_src, mm256_shuff_alpha_mask_A);
+                    mm256_src_shuff =
+                        _mm256_add_epi16(mm256_src_shuff, mm256_dstA);
+                    mm256_dstA = _mm256_add_epi16(mm256_dstA, mm256_ones);
+                    mm256_dstA = _mm256_mullo_epi16(mm256_alpha, mm256_dstA);
+                    mm256_dstA = _mm256_srli_epi16(mm256_dstA, 8);
+
+                    mm256_dstA = _mm256_sub_epi16(mm256_src_shuff, mm256_dstA);
+
+                    /* now do B set */
+                    mm256_dstB =
+                        _mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_B);
+                    mm256_src_shuff =
+                        _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_B);
+                    mm256_alpha = _mm256_shuffle_epi8(
+                        mm256_src, mm256_shuff_alpha_mask_B);
+                    mm256_src_shuff =
+                        _mm256_add_epi16(mm256_src_shuff, mm256_dstB);
+                    mm256_dstB = _mm256_add_epi16(mm256_dstB, mm256_ones);
+                    mm256_dstB = _mm256_mullo_epi16(mm256_alpha, mm256_dstB);
+                    mm256_dstB = _mm256_srli_epi16(mm256_dstB, 8);
+
+                    mm256_dstB = _mm256_sub_epi16(mm256_src_shuff, mm256_dstB);
+
+                    /* now pack A & B together */
+                    mm256_dst = _mm256_packus_epi16(mm256_dstA, mm256_dstB);
+                    _mm256_storeu_si256(dstp256, mm256_dst);
+
+                    srcp256++;
+                    dstp256++;
+                },
+                n, post_8_width);
+        }
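+        /* step both pointers past the row padding to the next scanline */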
+        srcp = (Uint32 *)srcp256 + srcskip;
+        dstp = (Uint32 *)dstp256 + dstskip;
+    }
+}
+#else
+void
+blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
+{
+    RAISE_AVX2_RUNTIME_SSE2_COMPILED_WARNING();
+    blit_blend_premultiplied_sse2(info);
+}
+#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
+          !defined(SDL_DISABLE_IMMINTRIN_H) */