@@ -928,10 +928,18 @@ blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
928928 int dstpxskip = info -> d_pxskip >> 2 ;
929929
930930 int pre_8_width = width % 8 ;
931- int post_8_width = ( width - pre_8_width ) / 8 ;
931+ int post_8_width = width / 8 ;
932932
933933 /* if either surface has a non-zero alpha mask use that as our mask */
934934 Uint32 amask = info -> src -> Amask | info -> dst -> Amask ;
935+ /* Find the index (0, 1, 2 or 3) of the alpha channel within the pixel.
936+ * This can vary depending on the channel order in the pixel format,
937+ * e.g. ARGB vs RGBA or BGRA.
938+ */
939+ char a_index = ((amask >> 8 ) == 0 ) ? 0
940+ : ((amask >> 16 ) == 0 ) ? 1
941+ : ((amask >> 24 ) == 0 ) ? 2
942+ : 3 ;
935943
936944 __m256i * srcp256 = (__m256i * )info -> s_pixels ;
937945 __m256i * dstp256 = (__m256i * )info -> d_pixels ;
@@ -942,27 +950,34 @@ blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
942950 mm256_shuff_alpha_mask_A , mm256_shuff_alpha_mask_B ;
943951
944952 mm_zero = _mm_setzero_si128 ();
945- mm_ones = _mm_set_epi64x (0x0000000000000000 , 0x0001000100010001 );
953+ mm_ones = _mm_set_epi8 (0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 , 0x00 ,
954+ 0x00 , 0x01 , 0x00 , 0x01 , 0x00 , 0x01 , 0x00 , 0x01 );
946955
947956 mm256_shuff_mask_A =
948957 _mm256_set_epi8 (0x80 , 23 , 0x80 , 22 , 0x80 , 21 , 0x80 , 20 , 0x80 , 19 , 0x80 ,
949958 18 , 0x80 , 17 , 0x80 , 16 , 0x80 , 7 , 0x80 , 6 , 0x80 , 5 ,
950959 0x80 , 4 , 0x80 , 3 , 0x80 , 2 , 0x80 , 1 , 0x80 , 0 );
951-
952- mm256_shuff_alpha_mask_A =
953- _mm256_set_epi8 (0x80 , 23 , 0x80 , 23 , 0x80 , 23 , 0x80 , 23 , 0x80 , 19 , 0x80 ,
954- 19 , 0x80 , 19 , 0x80 , 19 , 0x80 , 7 , 0x80 , 7 , 0x80 , 7 ,
955- 0x80 , 7 , 0x80 , 3 , 0x80 , 3 , 0x80 , 3 , 0x80 , 3 );
960+ /* use the alpha index to eventually grab the alpha channel of each pixel
961+ */
962+ mm256_shuff_alpha_mask_A = _mm256_set_epi8 (
963+ 0x80 , 20 + a_index , 0x80 , 20 + a_index , 0x80 , 20 + a_index , 0x80 ,
964+ 20 + a_index , 0x80 , 16 + a_index , 0x80 , 16 + a_index , 0x80 ,
965+ 16 + a_index , 0x80 , 16 + a_index , 0x80 , 4 + a_index , 0x80 , 4 + a_index ,
966+ 0x80 , 4 + a_index , 0x80 , 4 + a_index , 0x80 , a_index , 0x80 , a_index ,
967+ 0x80 , a_index , 0x80 , a_index );
956968
957969 mm256_shuff_mask_B =
958970 _mm256_set_epi8 (0x80 , 31 , 0x80 , 30 , 0x80 , 29 , 0x80 , 28 , 0x80 , 27 , 0x80 ,
959971 26 , 0x80 , 25 , 0x80 , 24 , 0x80 , 15 , 0x80 , 14 , 0x80 , 13 ,
960972 0x80 , 12 , 0x80 , 11 , 0x80 , 10 , 0x80 , 9 , 0x80 , 8 );
961-
962- mm256_shuff_alpha_mask_B =
963- _mm256_set_epi8 (0x80 , 31 , 0x80 , 31 , 0x80 , 31 , 0x80 , 31 , 0x80 , 27 , 0x80 ,
964- 27 , 0x80 , 27 , 0x80 , 27 , 0x80 , 15 , 0x80 , 15 , 0x80 , 15 ,
965- 0x80 , 15 , 0x80 , 11 , 0x80 , 11 , 0x80 , 11 , 0x80 , 11 );
973+ /* use the alpha index to eventually grab the alpha channel of each pixel
974+ */
975+ mm256_shuff_alpha_mask_B = _mm256_set_epi8 (
976+ 0x80 , 28 + a_index , 0x80 , 28 + a_index , 0x80 , 28 + a_index , 0x80 ,
977+ 28 + a_index , 0x80 , 24 + a_index , 0x80 , 24 + a_index , 0x80 ,
978+ 24 + a_index , 0x80 , 24 + a_index , 0x80 , 12 + a_index , 0x80 ,
979+ 12 + a_index , 0x80 , 12 + a_index , 0x80 , 12 + a_index , 0x80 ,
980+ 8 + a_index , 0x80 , 8 + a_index , 0x80 , 8 + a_index , 0x80 , 8 + a_index );
966981
967982 mm256_ones = _mm256_set_epi8 (
968983 0x00 , 0x01 , 0x00 , 0x01 , 0x00 , 0x01 , 0x00 , 0x01 , 0x00 , 0x01 , 0x00 , 0x01 ,
@@ -1026,24 +1041,99 @@ blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
10261041 mm256_src = _mm256_loadu_si256 (srcp256 );
10271042 mm256_dst = _mm256_loadu_si256 (dstp256 );
10281043
1029- /* insert 8 pixel at a time blend here */
1030-
1031- /* do everything A set first */
1044+ /* At the start we shuffle our initial 8 bit source &
1045+ * destination pixel channels into a 16 bit configuration,
1046+ * spaced out by 00s.
1047+ * We need the room of 16bits to do a potential max size
1048+ * 8bit x 8bit multiplication (e.g. 255 x 255 = 65,025).
1049+ *
1050+ * At the same time we are working backwards from our
1051+ * final packing instruction that lets us squish 2 256bit
1052+ * registers divided into 16 16bit values into one 256bit
1053+ * register containing 32 8bit values (or 8 32bit pixels
1054+ * worth of data).
1055+ *
1056+ * This is why we end up with the strange seeming initial
1057+ * shuffling around of pixel channels into two 256 bit
1058+ * registers with 00 gaps. The goal is to be able to
1059+ * perform the necessary blend calculation on each channel
1060+ * of all 8 pixels in as few operations as possible and
1061+ * then leave them arranged ready to be packed back up
1062+ * into a single 8 x 32bit pixel chunk of data again at
1063+ * the end.
1064+ */
1065+
1066+ /* Do the shuffle and then the blend on the A half first.
1067+ */
1068+
1069+ /* these shuffles prepare our source, destination and
1070+ * alpha only channels for 16 bit operation and puts them
1071+ * in the correct order for the final pack with the B half
1072+ */
10321073 mm256_dstA =
10331074 _mm256_shuffle_epi8 (mm256_dst , mm256_shuff_mask_A );
1075+ /* mm256_dstA = (dst pixel 6:[00AA][00RR][00GG][00BB],
1076+ * dst pixel 5:[00AA][00RR][00GG][00BB],
1077+ * dst pixel 2:[00AA][00RR][00GG][00BB],
1078+ * dst pixel 1:[00AA][00RR][00GG][00BB])
1079+ */
10341080 mm256_src_shuff =
10351081 _mm256_shuffle_epi8 (mm256_src , mm256_shuff_mask_A );
1082+ /* mm256_src_shuff = (src pixel 6:[00AA][00RR][00GG][00BB],
1083+ * src pixel 5:[00AA][00RR][00GG][00BB],
1084+ * src pixel 2:[00AA][00RR][00GG][00BB],
1085+ * src pixel 1:[00AA][00RR][00GG][00BB])
1086+ */
10361087 mm256_alpha = _mm256_shuffle_epi8 (
10371088 mm256_src , mm256_shuff_alpha_mask_A );
1089+ /* mm256_alpha = (src alpha 6:[00AA][00AA][00AA][00AA],
1090+ * src alpha 5:[00AA][00AA][00AA][00AA],
1091+ * src alpha 2:[00AA][00AA][00AA][00AA],
1092+ * src alpha 1:[00AA][00AA][00AA][00AA])
1093+ */
1094+
1095+ /* blend on A half, at 16bit size, starts here.
1096+ * overall target blend (with colours and alpha represented
1097+ * as values between 0 and 1) is:
1098+ *
1099+ * result = source.RGB + (dest.RGB * (1 - source.A))
1100+ *
1101+ * Optimised and rearranged for values between 0 and 255
1102+ * the blend formula for a single colour channel is:
1103+ *
1104+ * (sC + dC - ((dC + 1) * sA >> 8))
1105+ */
10381106 mm256_src_shuff =
10391107 _mm256_add_epi16 (mm256_src_shuff , mm256_dstA );
1108+ /* That was Source channels + Destination channels */
10401109 mm256_dstA = _mm256_add_epi16 (mm256_dstA , mm256_ones );
1110+ /* That was Destination channels plus one */
10411111 mm256_dstA = _mm256_mullo_epi16 (mm256_alpha , mm256_dstA );
1112+ /* That was each of the Destination channels plus one
1113+ * multiplied by the Source Alpha channels.
1114+ */
10421115 mm256_dstA = _mm256_srli_epi16 (mm256_dstA , 8 );
1116+ /* That was the right shift by 8bits on the result
1117+ * of the last two operations. It is, combined with the
1118+ * addition of the ones above, effectively
1119+ * a division operation by 255 for each channel putting
1120+ * us back in the 8bit, 0 - 255 range (but still with 00
1121+ * padding taking us up to 16bit for now)
1122+ */
10431123
10441124 mm256_dstA = _mm256_sub_epi16 (mm256_src_shuff , mm256_dstA );
1045-
1046- /* now do B set */
1125+ /* this is the final subtraction completing the original
1126+ * colour channel blend formula. We now have blended
1127+ * channel values sitting in the same 16 bit, 00 padded
1128+ * arrangement of pixels as we did prior to the blend.
1129+ */
1130+
1131+ /* end of A half blend */
1132+
1133+ /* now do B half shuffle then blend.
1134+ * The register shapes are the same as for the A half but
1135+ * with pixels 8,7,4 & 3 instead
1136+ */
10471137 mm256_dstB =
10481138 _mm256_shuffle_epi8 (mm256_dst , mm256_shuff_mask_B );
10491139 mm256_src_shuff =
@@ -1060,6 +1150,9 @@ blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
10601150
10611151 /* now pack A & B together */
10621152 mm256_dst = _mm256_packus_epi16 (mm256_dstA , mm256_dstB );
1153+ /* After this pack operation our pixels are packed in the
1154+ * right order - 8,7,6,5,4,3,2,1 and the right 8bit size
1155+ */
10631156 _mm256_storeu_si256 (dstp256 , mm256_dst );
10641157
10651158 srcp256 ++ ;
@@ -1075,8 +1168,7 @@ blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
10751168void
10761169blit_blend_premultiplied_avx2 (SDL_BlitInfo * info )
10771170{
1078- RAISE_AVX2_RUNTIME_SSE2_COMPILED_WARNING ();
1079- blit_blend_premultiplied_sse2 (info );
1171+ BAD_AVX2_FUNCTION_CALL ;
10801172}
10811173#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
10821174 !defined(SDL_DISABLE_IMMINTRIN_H) */
0 commit comments