@@ -631,48 +631,51 @@ SATD_NXN_DUAL_AVX2(64)
631631 static cost_pixel_any_size_multi_func satd_any_size_## suffix; \
632632 static void satd_any_size_ ## suffix ( \
633633 int width, int height, \
634- const uint8_t **preds, \
634+ const kvz_pixel **preds, \
635635 const int stride, \
636- const uint8_t *orig, \
636+ const kvz_pixel *orig, \
637637 const int orig_stride, \
638638 unsigned num_modes, \
639639 unsigned *costs_out, \
640640 int8_t *valid) \
641641 { \
642642 unsigned sums[num_parallel_blocks] = { 0 }; \
643- const uint8_t *pred_ptrs[4] = { preds[0], preds[1], preds[2], preds[3] };\
644- const uint8_t *orig_ptr = orig; \
643+ const kvz_pixel *pred_ptrs[4] = { preds[0], preds[1], preds[2], preds[3] };\
644+ const kvz_pixel *orig_ptr = orig; \
645645 costs_out[0] = 0; costs_out[1] = 0; costs_out[2] = 0; costs_out[3] = 0; \
646- if (width % 8 != 0) { \
646+ const int width_mod_8 = width % 8; \
647+ if (width_mod_8 != 0) { \
648+ const kvz_pixel *pred_ptrs_tmp[4] = { preds[0], preds[1], preds[2], preds[3] };\
647649 /* Process the first column using 4x4 blocks. */ \
648650 for (int y = 0 ; y < height ; y += 4 ) { \
649- kvz_satd_4x4_subblock_ ## suffix (preds , stride, orig, orig_stride, sums); \
650- } \
651- orig_ptr += 4; \
651+ kvz_satd_4x4_subblock_ ## suffix (pred_ptrs_tmp , stride, & orig[y*orig_stride] , orig_stride, sums); \
652+ for(int blk = 0; blk < num_parallel_blocks; ++blk){ pred_ptrs_tmp[blk] += 4*stride; costs_out[blk] += sums[blk]; } \
653+ } \
652654 for(int blk = 0; blk < num_parallel_blocks; ++blk){\
653655 pred_ptrs[blk] += 4; \
654- }\
656+ }\
655657 width -= 4; \
656- } \
658+ } \
657659 if (height % 8 != 0) { \
660+ const kvz_pixel *pred_ptrs_tmp[4] = { preds[0], preds[1], preds[2], preds[3] };\
658661 /* Process the first row using 4x4 blocks. */ \
659662 for (int x = 0 ; x < width ; x += 4 ) { \
660- kvz_satd_4x4_subblock_ ## suffix (pred_ptrs , stride, orig_ptr, orig_stride, sums); \
661- } \
662- orig_ptr += 4 * orig_stride; \
663+ kvz_satd_4x4_subblock_ ## suffix (pred_ptrs_tmp , stride, & orig_ptr[x] , orig_stride, sums); \
664+ for(int blk = 0; blk < num_parallel_blocks; ++blk){ pred_ptrs_tmp[blk] += 4; costs_out[blk] += sums[blk]; } \
665+ } \
663666 for(int blk = 0; blk < num_parallel_blocks; ++blk){\
664667 pred_ptrs[blk] += 4 * stride; \
665- }\
668+ }\
666669 height -= 4; \
667- } \
670+ } \
668671 /* The rest can now be processed with 8x8 blocks. */ \
669- for (int y = 0 ; y < height ; y += 8 ) { \
670- orig_ptr = & orig [y * orig_stride ]; \
671- pred_ptrs [0 ] = & preds [0 ][y * stride ]; \
672- pred_ptrs [1 ] = & preds [1 ][y * stride ]; \
673- pred_ptrs [2 ] = & preds [2 ][y * stride ]; \
674- pred_ptrs [3 ] = & preds [3 ][y * stride ]; \
675- for (int x = 0 ; x < width ; x += 8 ) { \
672+ for (int y = height % 8 ; y < height ; y += 8 ) { \
673+ orig_ptr = & orig [y * orig_stride + width_mod_8 ]; \
674+ pred_ptrs [0 ] = & preds [0 ][y * stride + width_mod_8 ]; \
675+ pred_ptrs [1 ] = & preds [1 ][y * stride + width_mod_8 ]; \
676+ pred_ptrs [2 ] = & preds [2 ][y * stride + width_mod_8 ]; \
677+ pred_ptrs [3 ] = & preds [3 ][y * stride + width_mod_8 ]; \
678+ for (int x = width_mod_8 ; x < width ; x += 8 ) { \
676679 satd_8x8_subblock_ ## suffix (pred_ptrs, stride, orig_ptr, orig_stride, sums); \
677680 orig_ptr += 8; \
678681 pred_ptrs[0] += 8; \
0 commit comments