@@ -1529,3 +1529,117 @@ blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
}
#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
          !defined(SDL_DISABLE_IMMINTRIN_H) */

#define PREMUL_ALPHA_CODE \
    /* extract the alpha */ \
    mm_alpha_in = _mm256_and_si256(mm_src, mm256_amask); \
    \
    /* redistribute the alphas across the R, G, B channels */ \
    alphaA = _mm256_shuffle_epi8(mm_src, shuffle_maskA); \
    alphaB = _mm256_shuffle_epi8(mm_src, shuffle_maskB); \
    \
    /* prep the pixels for 16-bit math */ \
    mm_srcA = _mm256_unpacklo_epi8(mm_src, mm_zero); \
    mm_srcB = _mm256_unpackhi_epi8(mm_src, mm_zero); \
    \
    mm_srcA = _mm256_add_epi16(mm_srcA, mm256_ones); \
    mm_srcB = _mm256_add_epi16(mm_srcB, mm256_ones); \
    \
    /* multiply the pixels by the alphas */ \
    mm_srcA = _mm256_mullo_epi16(mm_srcA, alphaA); \
    mm_srcB = _mm256_mullo_epi16(mm_srcB, alphaB); \
    \
    /* shift the pixels back down to 8-bit */ \
    mm_srcA = _mm256_srli_epi16(mm_srcA, 8); \
    mm_srcB = _mm256_srli_epi16(mm_srcB, 8); \
    \
    /* pack the pixels back together */ \
    mm_dst = _mm256_packus_epi16(mm_srcA, mm_srcB); \
    /* add the original alpha back in */ \
    mm_dst = _mm256_or_si256(mm_dst, mm_alpha_in);
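
/* For reference (not used anywhere): per 8-bit colour channel c with alpha a,
 * the macro above computes roughly
 *
 *     c_premul = ((c + 1) * a) >> 8;
 *
 * a divisionless approximation of c * a / 255 (255 * 255 still maps to 255),
 * while the pixel's original alpha byte is restored from mm_alpha_in. */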

#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
    !defined(SDL_DISABLE_IMMINTRIN_H)
void
premul_surf_color_by_alpha_avx2(SDL_Surface *src, SDL_Surface *dst)
{
    int i, height = src->h;
    const int width = src->w;
    const int n_iters_8 = width / 8;
    const int pxl_excess = width % 8;
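    /* pitch is in bytes; dividing by 4 gives the row stride in 32-bit pixels,
     * so the *_skip values are the padding to jump over at the end of a row */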
    const int src_skip = src->pitch / 4 - width;
    const int dst_skip = dst->pitch / 4 - width;
    const int src_exc_skip = pxl_excess + src_skip;
    const int dst_exc_skip = pxl_excess + dst_skip;

    Uint32 *srcp = (Uint32 *)src->pixels;
    Uint32 *dstp = (Uint32 *)dst->pixels;

    __m256i mm_src, mm_dst, alphaA, alphaB, mm_alpha_in;
    __m256i mm_srcA, mm_srcB;

    const __m256i mm256_amask = _mm256_set1_epi32(src->format->Amask);
    const __m256i mm_zero = _mm256_setzero_si256();
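    /* lane mask for the trailing width % 8 pixels: -1 (all bits set) selects
     * a lane for _mm256_maskload/maskstore_epi32, 0 leaves it untouched;
     * _mm256_set_epi32 lists lanes from highest to lowest */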
    const __m256i partial_mask =
        _mm256_set_epi32(0, pxl_excess > 6 ? -1 : 0, pxl_excess > 5 ? -1 : 0,
                         pxl_excess > 4 ? -1 : 0, pxl_excess > 3 ? -1 : 0,
                         pxl_excess > 2 ? -1 : 0, pxl_excess > 1 ? -1 : 0,
                         pxl_excess > 0 ? -1 : 0);
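    /* added to every 16-bit channel before the multiply so that
     * ((c + 1) * a) >> 8 maps c = 255, a = 255 back to exactly 255 */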
    const __m256i mm256_ones = _mm256_set1_epi16(0x0001);

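    /* byte offset (0-3) of the alpha channel inside each 32-bit pixel,
     * derived from the surface's Amask */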
    char _a_off = ((src->format->Amask >> 8) == 0)    ? 0
                  : ((src->format->Amask >> 16) == 0) ? 1
                  : ((src->format->Amask >> 24) == 0) ? 2
                                                      : 3;

    /* masks for shuffling the alpha to the RGB channels for multiplication */
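    /* a shuffle index of -1 produces a zero byte, so the alpha lane of
     * alphaA/alphaB stays 0 and the true alpha is OR-ed back in at the end */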
    const __m256i shuffle_maskA = _mm256_set_epi8(
        -1, -1, -1, 20 + _a_off, -1, 20 + _a_off, -1, 20 + _a_off, -1, -1, -1,
        16 + _a_off, -1, 16 + _a_off, -1, 16 + _a_off, -1, -1, -1, 4 + _a_off,
        -1, 4 + _a_off, -1, 4 + _a_off, -1, -1, -1, _a_off, -1, _a_off, -1,
        _a_off);

    const __m256i shuffle_maskB = _mm256_set_epi8(
        -1, -1, -1, 28 + _a_off, -1, 28 + _a_off, -1, 28 + _a_off, -1, -1, -1,
        24 + _a_off, -1, 24 + _a_off, -1, 24 + _a_off, -1, -1, -1, 12 + _a_off,
        -1, 12 + _a_off, -1, 12 + _a_off, -1, -1, -1, 8 + _a_off, -1,
        8 + _a_off, -1, 8 + _a_off);

    while (height--) {
        /* 8 pixels at a time */
        for (i = 0; i < n_iters_8; i++) {
            mm_src = _mm256_loadu_si256((__m256i *)srcp);

            PREMUL_ALPHA_CODE;

            _mm256_storeu_si256((__m256i *)dstp, mm_dst);

            dstp += 8;
            srcp += 8;
        }

        /* up to 7 pixels at a time */
        if (pxl_excess) {
            mm_src = _mm256_maskload_epi32((int *)srcp, partial_mask);

            PREMUL_ALPHA_CODE

            _mm256_maskstore_epi32((int *)dstp, partial_mask, mm_dst);

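            /* the *_exc_skip offsets already include the per-row padding,
             * so the plain skip adjustment below is bypassed via continue */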
            srcp += src_exc_skip;
            dstp += dst_exc_skip;
            continue;
        }

        srcp += src_skip;
        dstp += dst_skip;
    }
}
#else
void
premul_surf_color_by_alpha_avx2(SDL_Surface *src, SDL_Surface *dst)
{
    BAD_AVX2_FUNCTION_CALL;
}
#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
          !defined(SDL_DISABLE_IMMINTRIN_H) */
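
For context, a minimal caller sketch (not part of this patch): the dispatcher name premultiply_surface_alpha is hypothetical, and both surfaces are assumed to be same-sized 32-bit surfaces with a valid Amask. It guards the AVX2 routine behind a runtime CPU check and falls back to the same ((c + 1) * a) >> 8 per-channel math in scalar code.

#include <SDL.h>

/* Hypothetical dispatcher (illustrative only): use the AVX2 routine when the
 * CPU supports it, otherwise run the equivalent scalar loop. */
static void
premultiply_surface_alpha(SDL_Surface *src, SDL_Surface *dst)
{
    if (SDL_HasAVX2()) {
        premul_surf_color_by_alpha_avx2(src, dst);
        return;
    }

    for (int y = 0; y < src->h; y++) {
        Uint32 *srow = (Uint32 *)((Uint8 *)src->pixels + y * src->pitch);
        Uint32 *drow = (Uint32 *)((Uint8 *)dst->pixels + y * dst->pitch);
        for (int x = 0; x < src->w; x++) {
            Uint32 px = srow[x];
            Uint32 alpha = (px & src->format->Amask) >> src->format->Ashift;
            Uint32 out = px & src->format->Amask; /* keep the original alpha */

            for (int shift = 0; shift < 32; shift += 8) {
                Uint32 mask = 0xFFu << shift;
                if (mask == src->format->Amask)
                    continue;
                Uint32 c = (px & mask) >> shift;
                /* same approximation as the SIMD path: (c + 1) * a >> 8 */
                out |= (((c + 1) * alpha) >> 8) << shift;
            }
            drow[x] = out;
        }
    }
}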