Skip to content

Commit 9a5ff18

Browse files
authored
Merge pull request #2534 from pygame-community/simd-transform-invert
Add SIMD versions of the invert transform
2 parents 721ab32 + 63c6467 commit 9a5ff18

File tree

4 files changed

+210
-13
lines changed

4 files changed

+210
-13
lines changed

src_c/simd_transform.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,13 @@ filter_expand_X_SSE2(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch,
4040
void
4141
filter_expand_Y_SSE2(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
4242
int dstpitch, int srcheight, int dstheight);
43+
void
44+
invert_sse2(SDL_Surface *src, SDL_Surface *newsurf);
4345

4446
#endif /* (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)) */
4547

4648
// AVX2 functions
4749
void
4850
grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf);
51+
void
52+
invert_avx2(SDL_Surface *src, SDL_Surface *newsurf);

src_c/simd_transform_avx2.c

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,11 +214,100 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
214214
srcp256 = (__m256i *)srcp;
215215
}
216216
}
217+
218+
void
219+
invert_avx2(SDL_Surface *src, SDL_Surface *newsurf)
220+
{
221+
int s_row_skip = (src->pitch - src->w * 4) / 4;
222+
223+
// generate number of batches of pixels we need to loop through
224+
int pixel_batch_length = src->w * src->h;
225+
int num_batches = 1;
226+
if (s_row_skip > 0) {
227+
pixel_batch_length = src->w;
228+
num_batches = src->h;
229+
}
230+
231+
int remaining_pixels = pixel_batch_length % 8;
232+
int perfect_8_pixels = pixel_batch_length / 8;
233+
234+
int perfect_8_pixels_batch_counter = perfect_8_pixels;
235+
int remaining_pixels_batch_counter = remaining_pixels;
236+
237+
Uint32 *srcp = (Uint32 *)src->pixels;
238+
Uint32 *dstp = (Uint32 *)newsurf->pixels;
239+
240+
Uint32 amask = src->format->Amask;
241+
Uint32 rgbmask = ~amask;
242+
243+
__m256i *srcp256 = (__m256i *)src->pixels;
244+
__m256i *dstp256 = (__m256i *)newsurf->pixels;
245+
246+
__m256i mm256_src, mm256_dst, mm256_rgb_invert_mask, mm256_alpha,
247+
mm256_alpha_mask;
248+
249+
mm256_rgb_invert_mask = _mm256_set1_epi32(rgbmask);
250+
mm256_alpha_mask = _mm256_set1_epi32(amask);
251+
252+
__m256i _partial8_mask = _mm256_set_epi32(
253+
0x00, (remaining_pixels > 6) ? -1 : 0, (remaining_pixels > 5) ? -1 : 0,
254+
(remaining_pixels > 4) ? -1 : 0, (remaining_pixels > 3) ? -1 : 0,
255+
(remaining_pixels > 2) ? -1 : 0, (remaining_pixels > 1) ? -1 : 0,
256+
(remaining_pixels > 0) ? -1 : 0);
257+
258+
while (num_batches--) {
259+
perfect_8_pixels_batch_counter = perfect_8_pixels;
260+
remaining_pixels_batch_counter = remaining_pixels;
261+
while (perfect_8_pixels_batch_counter--) {
262+
mm256_src = _mm256_loadu_si256(srcp256);
263+
264+
/* pull out the alpha */
265+
mm256_alpha = _mm256_and_si256(mm256_src, mm256_alpha_mask);
266+
267+
/* do the invert */
268+
mm256_dst = _mm256_andnot_si256(mm256_src, mm256_rgb_invert_mask);
269+
270+
/* put the alpha back in*/
271+
mm256_dst = _mm256_or_si256(mm256_dst, mm256_alpha);
272+
273+
_mm256_storeu_si256(dstp256, mm256_dst);
274+
275+
srcp256++;
276+
dstp256++;
277+
}
278+
srcp = (Uint32 *)srcp256;
279+
dstp = (Uint32 *)dstp256;
280+
if (remaining_pixels_batch_counter > 0) {
281+
mm256_src = _mm256_maskload_epi32((int *)srcp, _partial8_mask);
282+
283+
/* pull out the alpha */
284+
mm256_alpha = _mm256_and_si256(mm256_src, mm256_alpha_mask);
285+
286+
/* do the invert */
287+
mm256_dst = _mm256_andnot_si256(mm256_src, mm256_rgb_invert_mask);
288+
289+
/* put the alpha back in*/
290+
mm256_dst = _mm256_or_si256(mm256_dst, mm256_alpha);
291+
292+
_mm256_maskstore_epi32((int *)dstp, _partial8_mask, mm256_dst);
293+
294+
srcp += remaining_pixels_batch_counter;
295+
dstp += remaining_pixels_batch_counter;
296+
}
297+
srcp += s_row_skip;
298+
srcp256 = (__m256i *)srcp;
299+
}
300+
}
217301
#else
218302
void
219303
grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
220304
{
221305
BAD_AVX2_FUNCTION_CALL;
222306
}
307+
void
308+
invert_avx2(SDL_Surface *src, SDL_Surface *newsurf)
309+
{
310+
BAD_AVX2_FUNCTION_CALL;
311+
}
223312
#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
224313
!defined(SDL_DISABLE_IMMINTRIN_H) */

src_c/simd_transform_sse2.c

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -610,4 +610,80 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
610610
srcp64 = (Uint64 *)srcp;
611611
}
612612
}
613+
614+
void
615+
invert_sse2(SDL_Surface *src, SDL_Surface *newsurf)
616+
{
617+
int s_row_skip = (src->pitch - src->w * 4) / 4;
618+
619+
// generate number of batches of pixels we need to loop through
620+
int pixel_batch_length = src->w * src->h;
621+
int num_batches = 1;
622+
if (s_row_skip > 0) {
623+
pixel_batch_length = src->w;
624+
num_batches = src->h;
625+
}
626+
int remaining_pixels = pixel_batch_length % 4;
627+
int perfect_4_pixels = pixel_batch_length / 4;
628+
629+
int perfect_4_pixels_batch_counter = perfect_4_pixels;
630+
int remaining_pixels_batch_counter = remaining_pixels;
631+
632+
Uint32 *srcp = (Uint32 *)src->pixels;
633+
Uint32 *dstp = (Uint32 *)newsurf->pixels;
634+
635+
__m128i mm_src, mm_dst, mm_alpha, mm_rgb_invert_mask, mm_alpha_mask;
636+
637+
__m128i *srcp128 = (__m128i *)src->pixels;
638+
__m128i *dstp128 = (__m128i *)newsurf->pixels;
639+
640+
mm_rgb_invert_mask = _mm_set1_epi32(~src->format->Amask);
641+
mm_alpha_mask = _mm_set1_epi32(src->format->Amask);
642+
643+
while (num_batches--) {
644+
perfect_4_pixels_batch_counter = perfect_4_pixels;
645+
remaining_pixels_batch_counter = remaining_pixels;
646+
while (perfect_4_pixels_batch_counter--) {
647+
mm_src = _mm_loadu_si128(srcp128);
648+
/*mm_src = 0xAARRGGBBAARRGGBBAARRGGBBAARRGGBB*/
649+
650+
/* pull out the alpha */
651+
mm_alpha = _mm_and_si128(mm_src, mm_alpha_mask);
652+
653+
/* do the invert */
654+
mm_dst = _mm_andnot_si128(mm_src, mm_rgb_invert_mask);
655+
656+
/* put the alpha back in*/
657+
mm_dst = _mm_or_si128(mm_dst, mm_alpha);
658+
659+
_mm_storeu_si128(dstp128, mm_dst);
660+
/*dstp = 0xAARRGGBBAARRGGBBAARRGGBBAARRGGBB*/
661+
srcp128++;
662+
dstp128++;
663+
}
664+
srcp = (Uint32 *)srcp128;
665+
dstp = (Uint32 *)dstp128;
666+
while (remaining_pixels_batch_counter--) {
667+
mm_src = _mm_cvtsi32_si128(*srcp);
668+
/*mm_src = 0x000000000000000000000000AARRGGBB*/
669+
670+
/* pull out the alpha */
671+
mm_alpha = _mm_and_si128(mm_src, mm_alpha_mask);
672+
673+
/* do the invert */
674+
mm_dst = _mm_andnot_si128(mm_src, mm_rgb_invert_mask);
675+
676+
/* put the alpha back in*/
677+
mm_dst = _mm_or_si128(mm_dst, mm_alpha);
678+
679+
*dstp = _mm_cvtsi128_si32(mm_dst);
680+
/*dstp = 0xAARRGGBB*/
681+
srcp++;
682+
dstp++;
683+
}
684+
srcp += s_row_skip;
685+
srcp128 = (__m128i *)srcp;
686+
}
687+
}
688+
613689
#endif /* __SSE2__ || PG_ENABLE_ARM_NEON*/

src_c/transform.c

Lines changed: 41 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3373,6 +3373,25 @@ surf_gaussian_blur(PyObject *self, PyObject *args, PyObject *kwargs)
33733373
return (PyObject *)pgSurface_New(new_surf);
33743374
}
33753375

3376+
void
3377+
invert_non_simd(SDL_Surface *src, SDL_Surface *newsurf)
3378+
{
3379+
int x, y;
3380+
for (y = 0; y < src->h; y++) {
3381+
for (x = 0; x < src->w; x++) {
3382+
Uint32 pixel;
3383+
Uint8 *pix;
3384+
SURF_GET_AT(pixel, src, x, y, (Uint8 *)src->pixels, src->format,
3385+
pix);
3386+
unsigned char r, g, b, a;
3387+
SDL_GetRGBA(pixel, src->format, &r, &g, &b, &a);
3388+
Uint32 new_pixel = SDL_MapRGBA(newsurf->format, ~r, ~g, ~b, a);
3389+
SURF_SET_AT(new_pixel, newsurf, x, y, (Uint8 *)newsurf->pixels,
3390+
newsurf->format, pix);
3391+
}
3392+
}
3393+
}
3394+
33763395
SDL_Surface *
33773396
invert(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj)
33783397
{
@@ -3399,21 +3418,30 @@ invert(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj)
33993418
PyExc_ValueError,
34003419
"Source and destination surfaces need the same format."));
34013420
}
3402-
3403-
int x, y;
3404-
for (y = 0; y < src->h; y++) {
3405-
for (x = 0; x < src->w; x++) {
3406-
Uint32 pixel;
3407-
Uint8 *pix;
3408-
SURF_GET_AT(pixel, src, x, y, (Uint8 *)src->pixels, src->format,
3409-
pix);
3410-
unsigned char r, g, b, a;
3411-
SDL_GetRGBA(pixel, src->format, &r, &g, &b, &a);
3412-
Uint32 new_pixel = SDL_MapRGBA(newsurf->format, ~r, ~g, ~b, a);
3413-
SURF_SET_AT(new_pixel, newsurf, x, y, (Uint8 *)newsurf->pixels,
3414-
newsurf->format, pix);
3421+
#if defined(__EMSCRIPTEN__)
3422+
invert_non_simd(src, newsurf);
3423+
#else // !defined(__EMSCRIPTEN__)
3424+
if (src->format->BytesPerPixel == 4 &&
3425+
src->format->Rmask == newsurf->format->Rmask &&
3426+
src->format->Gmask == newsurf->format->Gmask &&
3427+
src->format->Bmask == newsurf->format->Bmask &&
3428+
(src->pitch % 4 == 0) && (newsurf->pitch == (newsurf->w * 4))) {
3429+
if (pg_has_avx2()) {
3430+
invert_avx2(src, newsurf);
3431+
}
3432+
#if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)
3433+
else if (pg_HasSSE_NEON()) {
3434+
invert_sse2(src, newsurf);
3435+
}
3436+
#endif // defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)
3437+
else {
3438+
invert_non_simd(src, newsurf);
34153439
}
34163440
}
3441+
else {
3442+
invert_non_simd(src, newsurf);
3443+
}
3444+
#endif // !defined(__EMSCRIPTEN__)
34173445

34183446
SDL_UnlockSurface(newsurf);
34193447

0 commit comments

Comments
 (0)