Skip to content

Commit d732a44

Browse files
authored
AVX2 premul_alpha() (#2615)
* AVX2 premul_alpha() implementation
* minor changes
* finishing touches and comments
* format
* add bias back in
* fix for alpha shuffle masks
* optimization
1 parent 9111d6c commit d732a44

File tree

3 files changed

+120
-0
lines changed

3 files changed

+120
-0
lines changed

src_c/alphablit.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2839,6 +2839,10 @@ premul_surf_color_by_alpha(SDL_Surface *src, SDL_Surface *dst)
28392839
// since we know dst is a copy of src we can simplify the normal checks
28402840
#if !defined(__EMSCRIPTEN__)
28412841
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
2842+
if ((src->format->BytesPerPixel == 4) && pg_has_avx2()) {
2843+
premul_surf_color_by_alpha_avx2(src, dst);
2844+
return 0;
2845+
}
28422846
#if defined(__SSE2__)
28432847
if ((src->format->BytesPerPixel == 4) && SDL_HasSSE2()) {
28442848
premul_surf_color_by_alpha_sse2(src, dst);

src_c/simd_blitters.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,3 +81,5 @@ void
8181
blit_blend_rgb_min_avx2(SDL_BlitInfo *info);
8282
void
8383
blit_blend_premultiplied_avx2(SDL_BlitInfo *info);
84+
/* Premultiplies each pixel of src by its alpha channel, writing the result
 * to dst (AVX2 path for 32-bit/4-bytes-per-pixel surfaces). */
void
premul_surf_color_by_alpha_avx2(SDL_Surface *src, SDL_Surface *dst);

src_c/simd_blitters_avx2.c

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1529,3 +1529,117 @@ blit_blend_premultiplied_avx2(SDL_BlitInfo *info)
15291529
}
15301530
#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
15311531
!defined(SDL_DISABLE_IMMINTRIN_H) */
1532+
1533+
/* Premultiply kernel for one 256-bit register (8 ARGB pixels).
 * Expects these names in the enclosing scope: mm_src (input pixels),
 * mm_zero, mm256_amask, mm256_ones, shuffle_maskA/shuffle_maskB, and the
 * temporaries mm_alpha_in, alphaA, alphaB, mm_srcA, mm_srcB.
 * Result is left in mm_dst.
 * Per channel it computes (v + 1) * a >> 8, which approximates
 * v * a / 255 (the +1 is the bias added back in; see commit message). */
#define PREMUL_ALPHA_CODE                                        \
    /* extract the alpha */                                      \
    mm_alpha_in = _mm256_and_si256(mm_src, mm256_amask);         \
                                                                 \
    /* redistribute the alphas across the R, G, B channels */    \
    alphaA = _mm256_shuffle_epi8(mm_src, shuffle_maskA);         \
    alphaB = _mm256_shuffle_epi8(mm_src, shuffle_maskB);         \
                                                                 \
    /* prep the pixels for 16-bit math (zero-extend each byte) */\
    mm_srcA = _mm256_unpacklo_epi8(mm_src, mm_zero);             \
    mm_srcB = _mm256_unpackhi_epi8(mm_src, mm_zero);             \
                                                                 \
    /* add the +1 bias before multiplying */                     \
    mm_srcA = _mm256_add_epi16(mm_srcA, mm256_ones);             \
    mm_srcB = _mm256_add_epi16(mm_srcB, mm256_ones);             \
                                                                 \
    /* multiply the pixels by the alphas */                      \
    mm_srcA = _mm256_mullo_epi16(mm_srcA, alphaA);               \
    mm_srcB = _mm256_mullo_epi16(mm_srcB, alphaB);               \
                                                                 \
    /* shift the pixels back down to 8-bit */                    \
    mm_srcA = _mm256_srli_epi16(mm_srcA, 8);                     \
    mm_srcB = _mm256_srli_epi16(mm_srcB, 8);                     \
                                                                 \
    /* pack the pixels back together */                          \
    mm_dst = _mm256_packus_epi16(mm_srcA, mm_srcB);              \
    /* add the original (unmultiplied) alpha back in */          \
    mm_dst = _mm256_or_si256(mm_dst, mm_alpha_in);
1560+
1561+
#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
    !defined(SDL_DISABLE_IMMINTRIN_H)
/* Premultiplies each pixel of src by its alpha and stores the result in dst.
 * AVX2 implementation: processes 8 pixels per iteration, then handles the
 * up-to-7-pixel row remainder with masked load/store so no out-of-bounds
 * access occurs. Caller guarantees BytesPerPixel == 4 for both surfaces
 * (see the pg_has_avx2() gate in premul_surf_color_by_alpha). */
void
premul_surf_color_by_alpha_avx2(SDL_Surface *src, SDL_Surface *dst)
{
    int i, height = src->h;
    const int width = src->w;
    const int n_iters_8 = width / 8;   /* full 8-pixel chunks per row */
    const int pxl_excess = width % 8;  /* leftover pixels per row */
    /* pitch is in bytes; / 4 converts to a count of 32-bit pixels, so the
     * skips are the per-row padding in pixels */
    const int src_skip = src->pitch / 4 - width;
    const int dst_skip = dst->pitch / 4 - width;
    /* combined advance (remainder pixels + padding) for the masked tail */
    const int src_exc_skip = pxl_excess + src_skip;
    const int dst_exc_skip = pxl_excess + dst_skip;

    Uint32 *srcp = (Uint32 *)src->pixels;
    Uint32 *dstp = (Uint32 *)dst->pixels;

    __m256i mm_src, mm_dst, alphaA, alphaB, mm_alpha_in;
    __m256i mm_srcA, mm_srcB;

    const __m256i mm256_amask = _mm256_set1_epi32(src->format->Amask);
    const __m256i mm_zero = _mm256_setzero_si256();
    /* element i is all-ones iff pixel i of the remainder exists; used by
     * maskload/maskstore for the partial (tail) chunk */
    const __m256i partial_mask =
        _mm256_set_epi32(0, pxl_excess > 6 ? -1 : 0, pxl_excess > 5 ? -1 : 0,
                         pxl_excess > 4 ? -1 : 0, pxl_excess > 3 ? -1 : 0,
                         pxl_excess > 2 ? -1 : 0, pxl_excess > 1 ? -1 : 0,
                         pxl_excess > 0 ? -1 : 0);
    const __m256i mm256_ones = _mm256_set1_epi16(0x0001);

    /* byte offset of the alpha channel within a 32-bit pixel, derived from
     * Amask (0 = lowest byte ... 3 = highest byte) */
    char _a_off = ((src->format->Amask >> 8) == 0)    ? 0
                  : ((src->format->Amask >> 16) == 0) ? 1
                  : ((src->format->Amask >> 24) == 0) ? 2
                                                      : 3;

    /* masks for shuffling the alpha to the RGB channels for multiplication.
     * shuffle_epi8 works per 128-bit lane using only the low 4 bits of each
     * index, so 16/20/24/28 + _a_off address the same bytes in the upper
     * lane; -1 zeroes the 16-bit alpha slot so the multiply leaves alpha
     * untouched (the original alpha is OR-ed back in afterwards). */
    const __m256i shuffle_maskA = _mm256_set_epi8(
        -1, -1, -1, 20 + _a_off, -1, 20 + _a_off, -1, 20 + _a_off, -1, -1, -1,
        16 + _a_off, -1, 16 + _a_off, -1, 16 + _a_off, -1, -1, -1, 4 + _a_off,
        -1, 4 + _a_off, -1, 4 + _a_off, -1, -1, -1, _a_off, -1, _a_off, -1,
        _a_off);

    const __m256i shuffle_maskB = _mm256_set_epi8(
        -1, -1, -1, 28 + _a_off, -1, 28 + _a_off, -1, 28 + _a_off, -1, -1, -1,
        24 + _a_off, -1, 24 + _a_off, -1, 24 + _a_off, -1, -1, -1, 12 + _a_off,
        -1, 12 + _a_off, -1, 12 + _a_off, -1, -1, -1, 8 + _a_off, -1,
        8 + _a_off, -1, 8 + _a_off);

    while (height--) {
        /* 8 pixels at a time */
        for (i = 0; i < n_iters_8; i++) {
            mm_src = _mm256_loadu_si256((__m256i *)srcp);

            PREMUL_ALPHA_CODE;

            _mm256_storeu_si256((__m256i *)dstp, mm_dst);

            dstp += 8;
            srcp += 8;
        }

        /* up to 7 pixels at a time; masked so only valid lanes are
         * read/written */
        if (pxl_excess) {
            mm_src = _mm256_maskload_epi32((int *)srcp, partial_mask);

            PREMUL_ALPHA_CODE

            _mm256_maskstore_epi32((int *)dstp, partial_mask, mm_dst);

            srcp += src_exc_skip;
            dstp += dst_exc_skip;
            continue; /* skips the plain-skip advance below */
        }

        srcp += src_skip;
        dstp += dst_skip;
    }
}
#else
/* Fallback stub: this translation unit was built without AVX2 support, so
 * reaching here is a dispatch bug (runtime check should have prevented it). */
void
premul_surf_color_by_alpha_avx2(SDL_Surface *src, SDL_Surface *dst)
{
    BAD_AVX2_FUNCTION_CALL;
}
#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
          !defined(SDL_DISABLE_IMMINTRIN_H) */

0 commit comments

Comments
 (0)