Merge pull request #2534 from pygame-community/simd-transform-invert

MyreMylar · web-flow · commit 9a5ff186b981 · 2024-04-12T09:12:22.000+01:00
Add SIMD versions of the invert transform
diff --git a/src_c/simd_transform.h b/src_c/simd_transform.h
@@ -40,9 +40,13 @@ filter_expand_X_SSE2(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch,
 void
 filter_expand_Y_SSE2(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
                      int dstpitch, int srcheight, int dstheight);
+void
+invert_sse2(SDL_Surface *src, SDL_Surface *newsurf);
 
 #endif /* (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)) */
 
 // AVX2 functions
 void
 grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf);
+void
+invert_avx2(SDL_Surface *src, SDL_Surface *newsurf);
diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c
@@ -214,11 +214,100 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
         srcp256 = (__m256i *)srcp;
     }
 }
+
+void
+invert_avx2(SDL_Surface *src, SDL_Surface *newsurf)
+{
+    int s_row_skip = (src->pitch - src->w * 4) / 4;
+
+    // generate number of batches of pixels we need to loop through
+    int pixel_batch_length = src->w * src->h;
+    int num_batches = 1;
+    if (s_row_skip > 0) {
+        pixel_batch_length = src->w;
+        num_batches = src->h;
+    }
+
+    int remaining_pixels = pixel_batch_length % 8;
+    int perfect_8_pixels = pixel_batch_length / 8;
+
+    int perfect_8_pixels_batch_counter = perfect_8_pixels;
+    int remaining_pixels_batch_counter = remaining_pixels;
+
+    Uint32 *srcp = (Uint32 *)src->pixels;
+    Uint32 *dstp = (Uint32 *)newsurf->pixels;
+
+    Uint32 amask = src->format->Amask;
+    Uint32 rgbmask = ~amask;
+
+    __m256i *srcp256 = (__m256i *)src->pixels;
+    __m256i *dstp256 = (__m256i *)newsurf->pixels;
+
+    __m256i mm256_src, mm256_dst, mm256_rgb_invert_mask, mm256_alpha,
+        mm256_alpha_mask;
+
+    mm256_rgb_invert_mask = _mm256_set1_epi32(rgbmask);
+    mm256_alpha_mask = _mm256_set1_epi32(amask);
+
+    __m256i _partial8_mask = _mm256_set_epi32(
+        0x00, (remaining_pixels > 6) ? -1 : 0, (remaining_pixels > 5) ? -1 : 0,
+        (remaining_pixels > 4) ? -1 : 0, (remaining_pixels > 3) ? -1 : 0,
+        (remaining_pixels > 2) ? -1 : 0, (remaining_pixels > 1) ? -1 : 0,
+        (remaining_pixels > 0) ? -1 : 0);
+
+    while (num_batches--) {
+        perfect_8_pixels_batch_counter = perfect_8_pixels;
+        remaining_pixels_batch_counter = remaining_pixels;
+        while (perfect_8_pixels_batch_counter--) {
+            mm256_src = _mm256_loadu_si256(srcp256);
+
+            /* pull out the alpha */
+            mm256_alpha = _mm256_and_si256(mm256_src, mm256_alpha_mask);
+
+            /* do the invert */
+            mm256_dst = _mm256_andnot_si256(mm256_src, mm256_rgb_invert_mask);
+
+            /* put the alpha back in*/
+            mm256_dst = _mm256_or_si256(mm256_dst, mm256_alpha);
+
+            _mm256_storeu_si256(dstp256, mm256_dst);
+
+            srcp256++;
+            dstp256++;
+        }
+        srcp = (Uint32 *)srcp256;
+        dstp = (Uint32 *)dstp256;
+        if (remaining_pixels_batch_counter > 0) {
+            mm256_src = _mm256_maskload_epi32((int *)srcp, _partial8_mask);
+
+            /* pull out the alpha */
+            mm256_alpha = _mm256_and_si256(mm256_src, mm256_alpha_mask);
+
+            /* do the invert */
+            mm256_dst = _mm256_andnot_si256(mm256_src, mm256_rgb_invert_mask);
+
+            /* put the alpha back in*/
+            mm256_dst = _mm256_or_si256(mm256_dst, mm256_alpha);
+
+            _mm256_maskstore_epi32((int *)dstp, _partial8_mask, mm256_dst);
+
+            srcp += remaining_pixels_batch_counter;
+            dstp += remaining_pixels_batch_counter;
+        }
+        srcp += s_row_skip;
+        srcp256 = (__m256i *)srcp;
+    }
+}
 #else
 void
 grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
 {
     BAD_AVX2_FUNCTION_CALL;
 }
+void
+invert_avx2(SDL_Surface *src, SDL_Surface *newsurf)
+{
+    BAD_AVX2_FUNCTION_CALL;
+}
 #endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
           !defined(SDL_DISABLE_IMMINTRIN_H) */
diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c
@@ -610,4 +610,80 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
         srcp64 = (Uint64 *)srcp;
     }
 }
+
+void
+invert_sse2(SDL_Surface *src, SDL_Surface *newsurf)
+{
+    int s_row_skip = (src->pitch - src->w * 4) / 4;
+
+    // generate number of batches of pixels we need to loop through
+    int pixel_batch_length = src->w * src->h;
+    int num_batches = 1;
+    if (s_row_skip > 0) {
+        pixel_batch_length = src->w;
+        num_batches = src->h;
+    }
+    int remaining_pixels = pixel_batch_length % 4;
+    int perfect_4_pixels = pixel_batch_length / 4;
+
+    int perfect_4_pixels_batch_counter = perfect_4_pixels;
+    int remaining_pixels_batch_counter = remaining_pixels;
+
+    Uint32 *srcp = (Uint32 *)src->pixels;
+    Uint32 *dstp = (Uint32 *)newsurf->pixels;
+
+    __m128i mm_src, mm_dst, mm_alpha, mm_rgb_invert_mask, mm_alpha_mask;
+
+    __m128i *srcp128 = (__m128i *)src->pixels;
+    __m128i *dstp128 = (__m128i *)newsurf->pixels;
+
+    mm_rgb_invert_mask = _mm_set1_epi32(~src->format->Amask);
+    mm_alpha_mask = _mm_set1_epi32(src->format->Amask);
+
+    while (num_batches--) {
+        perfect_4_pixels_batch_counter = perfect_4_pixels;
+        remaining_pixels_batch_counter = remaining_pixels;
+        while (perfect_4_pixels_batch_counter--) {
+            mm_src = _mm_loadu_si128(srcp128);
+            /*mm_src = 0xAARRGGBBAARRGGBBAARRGGBBAARRGGBB*/
+
+            /* pull out the alpha */
+            mm_alpha = _mm_and_si128(mm_src, mm_alpha_mask);
+
+            /* do the invert */
+            mm_dst = _mm_andnot_si128(mm_src, mm_rgb_invert_mask);
+
+            /* put the alpha back in*/
+            mm_dst = _mm_or_si128(mm_dst, mm_alpha);
+
+            _mm_storeu_si128(dstp128, mm_dst);
+            /*dstp = 0xAARRGGBBAARRGGBBAARRGGBBAARRGGBB*/
+            srcp128++;
+            dstp128++;
+        }
+        srcp = (Uint32 *)srcp128;
+        dstp = (Uint32 *)dstp128;
+        while (remaining_pixels_batch_counter--) {
+            mm_src = _mm_cvtsi32_si128(*srcp);
+            /*mm_src = 0x000000000000000000000000AARRGGBB*/
+
+            /* pull out the alpha */
+            mm_alpha = _mm_and_si128(mm_src, mm_alpha_mask);
+
+            /* do the invert */
+            mm_dst = _mm_andnot_si128(mm_src, mm_rgb_invert_mask);
+
+            /* put the alpha back in*/
+            mm_dst = _mm_or_si128(mm_dst, mm_alpha);
+
+            *dstp = _mm_cvtsi128_si32(mm_dst);
+            /*dstp = 0xAARRGGBB*/
+            srcp++;
+            dstp++;
+        }
+        srcp += s_row_skip;
+        srcp128 = (__m128i *)srcp;
+    }
+}
+
 #endif /* __SSE2__ || PG_ENABLE_ARM_NEON*/
diff --git a/src_c/transform.c b/src_c/transform.c
@@ -3373,6 +3373,25 @@ surf_gaussian_blur(PyObject *self, PyObject *args, PyObject *kwargs)
     return (PyObject *)pgSurface_New(new_surf);
 }
 
+void
+invert_non_simd(SDL_Surface *src, SDL_Surface *newsurf)
+{
+    int x, y;
+    for (y = 0; y < src->h; y++) {
+        for (x = 0; x < src->w; x++) {
+            Uint32 pixel;
+            Uint8 *pix;
+            SURF_GET_AT(pixel, src, x, y, (Uint8 *)src->pixels, src->format,
+                        pix);
+            unsigned char r, g, b, a;
+            SDL_GetRGBA(pixel, src->format, &r, &g, &b, &a);
+            Uint32 new_pixel = SDL_MapRGBA(newsurf->format, ~r, ~g, ~b, a);
+            SURF_SET_AT(new_pixel, newsurf, x, y, (Uint8 *)newsurf->pixels,
+                        newsurf->format, pix);
+        }
+    }
+}
+
 SDL_Surface *
 invert(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj)
 {
@@ -3399,21 +3418,30 @@ invert(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj)
             PyExc_ValueError,
             "Source and destination surfaces need the same format."));
     }
-
-    int x, y;
-    for (y = 0; y < src->h; y++) {
-        for (x = 0; x < src->w; x++) {
-            Uint32 pixel;
-            Uint8 *pix;
-            SURF_GET_AT(pixel, src, x, y, (Uint8 *)src->pixels, src->format,
-                        pix);
-            unsigned char r, g, b, a;
-            SDL_GetRGBA(pixel, src->format, &r, &g, &b, &a);
-            Uint32 new_pixel = SDL_MapRGBA(newsurf->format, ~r, ~g, ~b, a);
-            SURF_SET_AT(new_pixel, newsurf, x, y, (Uint8 *)newsurf->pixels,
-                        newsurf->format, pix);
+#if defined(__EMSCRIPTEN__)
+    invert_non_simd(src, newsurf);
+#else  // !defined(__EMSCRIPTEN__)
+    if (src->format->BytesPerPixel == 4 &&
+        src->format->Rmask == newsurf->format->Rmask &&
+        src->format->Gmask == newsurf->format->Gmask &&
+        src->format->Bmask == newsurf->format->Bmask &&
+        (src->pitch % 4 == 0) && (newsurf->pitch == (newsurf->w * 4))) {
+        if (pg_has_avx2()) {
+            invert_avx2(src, newsurf);
+        }
+#if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)
+        else if (pg_HasSSE_NEON()) {
+            invert_sse2(src, newsurf);
+        }
+#endif  // defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)
+        else {
+            invert_non_simd(src, newsurf);
         }
     }
+    else {
+        invert_non_simd(src, newsurf);
+    }
+#endif  // !defined(__EMSCRIPTEN__)
 
     SDL_UnlockSurface(newsurf);