Proper fix for SATD_ANY_SIZE_MULTI_GENERIC and the AVX2 counterpart: add the 4x4 edge-block costs to costs_out and start the 8x8 pass at the width_mod_8 column offset, fixes #433

fador · fador · commit 36f7752d515a · 2025-06-10T09:08:13.000+03:00
diff --git a/src/strategies/avx2/picture-avx2.c b/src/strategies/avx2/picture-avx2.c
@@ -631,48 +631,51 @@ SATD_NXN_DUAL_AVX2(64)
   static cost_pixel_any_size_multi_func satd_any_size_## suffix; \
   static void satd_any_size_ ## suffix ( \
       int width, int height, \
-      const uint8_t **preds, \
+      const kvz_pixel **preds, \
       const int stride, \
-      const uint8_t *orig, \
+      const kvz_pixel *orig, \
       const int orig_stride, \
       unsigned num_modes, \
       unsigned *costs_out, \
       int8_t *valid) \
   { \
     unsigned sums[num_parallel_blocks] = { 0 }; \
-    const uint8_t *pred_ptrs[4] = { preds[0], preds[1], preds[2], preds[3] };\
-    const uint8_t *orig_ptr = orig; \
+    const kvz_pixel *pred_ptrs[4] = { preds[0], preds[1], preds[2], preds[3] };\
+    const kvz_pixel *orig_ptr = orig; \
     costs_out[0] = 0; costs_out[1] = 0; costs_out[2] = 0; costs_out[3] = 0; \
-    if (width % 8 != 0) { \
+    const int width_mod_8 = width % 8; \
+    if (width_mod_8 != 0) { \
+      const kvz_pixel *pred_ptrs_tmp[4] = { preds[0], preds[1], preds[2], preds[3] };\
       /* Process the first column using 4x4 blocks. */ \
       for (int y = 0; y < height; y += 4) { \
-        kvz_satd_4x4_subblock_ ## suffix(preds, stride, orig, orig_stride, sums); \
-            } \
-      orig_ptr += 4; \
+        kvz_satd_4x4_subblock_ ## suffix(pred_ptrs_tmp, stride, &orig[y*orig_stride], orig_stride, sums); \
+        for(int blk = 0; blk < num_parallel_blocks; ++blk){ pred_ptrs_tmp[blk] += 4*stride; costs_out[blk] += sums[blk]; }\
+      } \
       for(int blk = 0; blk < num_parallel_blocks; ++blk){\
         pred_ptrs[blk] += 4; \
-            }\
+      }\
       width -= 4; \
-            } \
+    } \
     if (height % 8 != 0) { \
+      const kvz_pixel *pred_ptrs_tmp[4] = { preds[0], preds[1], preds[2], preds[3] };\
       /* Process the first row using 4x4 blocks. */ \
       for (int x = 0; x < width; x += 4 ) { \
-        kvz_satd_4x4_subblock_ ## suffix(pred_ptrs, stride, orig_ptr, orig_stride, sums); \
-            } \
-      orig_ptr += 4 * orig_stride; \
+        kvz_satd_4x4_subblock_ ## suffix(pred_ptrs_tmp, stride, &orig_ptr[x], orig_stride, sums); \
+        for(int blk = 0; blk < num_parallel_blocks; ++blk){ pred_ptrs_tmp[blk] += 4;  costs_out[blk] += sums[blk]; }\
+      } \
       for(int blk = 0; blk < num_parallel_blocks; ++blk){\
         pred_ptrs[blk] += 4 * stride; \
-            }\
+      }\
       height -= 4; \
-        } \
+    } \
     /* The rest can now be processed with 8x8 blocks. */ \
-    for (int y = 0; y < height; y += 8) { \
-      orig_ptr = &orig[y * orig_stride]; \
-      pred_ptrs[0] = &preds[0][y * stride]; \
-      pred_ptrs[1] = &preds[1][y * stride]; \
-      pred_ptrs[2] = &preds[2][y * stride]; \
-      pred_ptrs[3] = &preds[3][y * stride]; \
-      for (int x = 0; x < width; x += 8) { \
+    for (int y = height % 8; y < height; y += 8) { \
+      orig_ptr = &orig[y * orig_stride + width_mod_8]; \
+      pred_ptrs[0] = &preds[0][y * stride + width_mod_8]; \
+      pred_ptrs[1] = &preds[1][y * stride + width_mod_8]; \
+      pred_ptrs[2] = &preds[2][y * stride + width_mod_8]; \
+      pred_ptrs[3] = &preds[3][y * stride + width_mod_8]; \
+      for (int x = width_mod_8; x < width; x += 8) { \
         satd_8x8_subblock_ ## suffix(pred_ptrs, stride, orig_ptr, orig_stride, sums); \
         orig_ptr += 8; \
         pred_ptrs[0] += 8; \
diff --git a/src/strategies/generic/picture-generic.c b/src/strategies/generic/picture-generic.c
@@ -417,14 +417,14 @@ SATD_DUAL_NXN(64, kvz_pixel)
     const kvz_pixel *pred_ptrs[4] = { preds[0], preds[1], preds[2], preds[3] };\
     const kvz_pixel *orig_ptr = orig; \
     costs_out[0] = 0; costs_out[1] = 0; costs_out[2] = 0; costs_out[3] = 0; \
-    if (width % 8 != 0) { \
+    const int width_mod_8 = width % 8; \
+    if (width_mod_8 != 0) { \
       const kvz_pixel *pred_ptrs_tmp[4] = { preds[0], preds[1], preds[2], preds[3] };\
       /* Process the first column using 4x4 blocks. */ \
       for (int y = 0; y < height; y += 4) { \
         kvz_satd_4x4_subblock_ ## suffix(pred_ptrs_tmp, stride, &orig[y*orig_stride], orig_stride, sums); \
-        for(int blk = 0; blk < num_parallel_blocks; ++blk){ pred_ptrs_tmp[blk] += 4*stride; }\
+        for(int blk = 0; blk < num_parallel_blocks; ++blk){ pred_ptrs_tmp[blk] += 4*stride; costs_out[blk] += sums[blk]; }\
       } \
-      orig_ptr += 4; \
       for(int blk = 0; blk < num_parallel_blocks; ++blk){\
         pred_ptrs[blk] += 4; \
       }\
@@ -435,22 +435,21 @@ SATD_DUAL_NXN(64, kvz_pixel)
       /* Process the first row using 4x4 blocks. */ \
       for (int x = 0; x < width; x += 4 ) { \
         kvz_satd_4x4_subblock_ ## suffix(pred_ptrs_tmp, stride, &orig_ptr[x], orig_stride, sums); \
-        for(int blk = 0; blk < num_parallel_blocks; ++blk){ pred_ptrs_tmp[blk] += 4; }\
+        for(int blk = 0; blk < num_parallel_blocks; ++blk){ pred_ptrs_tmp[blk] += 4;  costs_out[blk] += sums[blk]; }\
       } \
-      orig_ptr += 4 * orig_stride; \
       for(int blk = 0; blk < num_parallel_blocks; ++blk){\
         pred_ptrs[blk] += 4 * stride; \
       }\
       height -= 4; \
     } \
     /* The rest can now be processed with 8x8 blocks. */ \
-    for (int y = 0; y < height; y += 8) { \
-      orig_ptr = &orig[y * orig_stride]; \
-      pred_ptrs[0] = &preds[0][y * stride]; \
-      pred_ptrs[1] = &preds[1][y * stride]; \
-      pred_ptrs[2] = &preds[2][y * stride]; \
-      pred_ptrs[3] = &preds[3][y * stride]; \
-      for (int x = 0; x < width; x += 8) { \
+    for (int y = height % 8; y < height; y += 8) { \
+      orig_ptr = &orig[y * orig_stride + width_mod_8]; \
+      pred_ptrs[0] = &preds[0][y * stride + width_mod_8]; \
+      pred_ptrs[1] = &preds[1][y * stride + width_mod_8]; \
+      pred_ptrs[2] = &preds[2][y * stride + width_mod_8]; \
+      pred_ptrs[3] = &preds[3][y * stride + width_mod_8]; \
+      for (int x = width_mod_8; x < width; x += 8) { \
         satd_8x8_subblock_ ## suffix(pred_ptrs, stride, orig_ptr, orig_stride, sums); \
         orig_ptr += 8; \
         pred_ptrs[0] += 8; \