Skip to content

Commit 36f7752

Browse files
committed
Proper fix for SATD_ANY_SIZE_MULTI_GENERIC and the AVX2 counterpart, fixes #433
1 parent 77a6f6b commit 36f7752

File tree

2 files changed

+36
-34
lines changed

2 files changed

+36
-34
lines changed

src/strategies/avx2/picture-avx2.c

Lines changed: 25 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -631,48 +631,51 @@ SATD_NXN_DUAL_AVX2(64)
631631
static cost_pixel_any_size_multi_func satd_any_size_## suffix; \
632632
static void satd_any_size_ ## suffix ( \
633633
int width, int height, \
634-
const uint8_t **preds, \
634+
const kvz_pixel **preds, \
635635
const int stride, \
636-
const uint8_t *orig, \
636+
const kvz_pixel *orig, \
637637
const int orig_stride, \
638638
unsigned num_modes, \
639639
unsigned *costs_out, \
640640
int8_t *valid) \
641641
{ \
642642
unsigned sums[num_parallel_blocks] = { 0 }; \
643-
const uint8_t *pred_ptrs[4] = { preds[0], preds[1], preds[2], preds[3] };\
644-
const uint8_t *orig_ptr = orig; \
643+
const kvz_pixel *pred_ptrs[4] = { preds[0], preds[1], preds[2], preds[3] };\
644+
const kvz_pixel *orig_ptr = orig; \
645645
costs_out[0] = 0; costs_out[1] = 0; costs_out[2] = 0; costs_out[3] = 0; \
646-
if (width % 8 != 0) { \
646+
const int width_mod_8 = width % 8; \
647+
if (width_mod_8 != 0) { \
648+
const kvz_pixel *pred_ptrs_tmp[4] = { preds[0], preds[1], preds[2], preds[3] };\
647649
/* Process the first column using 4x4 blocks. */ \
648650
for (int y = 0; y < height; y += 4) { \
649-
kvz_satd_4x4_subblock_ ## suffix(preds, stride, orig, orig_stride, sums); \
650-
} \
651-
orig_ptr += 4; \
651+
kvz_satd_4x4_subblock_ ## suffix(pred_ptrs_tmp, stride, &orig[y*orig_stride], orig_stride, sums); \
652+
for(int blk = 0; blk < num_parallel_blocks; ++blk){ pred_ptrs_tmp[blk] += 4*stride; costs_out[blk] += sums[blk]; }\
653+
} \
652654
for(int blk = 0; blk < num_parallel_blocks; ++blk){\
653655
pred_ptrs[blk] += 4; \
654-
}\
656+
}\
655657
width -= 4; \
656-
} \
658+
} \
657659
if (height % 8 != 0) { \
660+
const kvz_pixel *pred_ptrs_tmp[4] = { preds[0], preds[1], preds[2], preds[3] };\
658661
/* Process the first row using 4x4 blocks. */ \
659662
for (int x = 0; x < width; x += 4 ) { \
660-
kvz_satd_4x4_subblock_ ## suffix(pred_ptrs, stride, orig_ptr, orig_stride, sums); \
661-
} \
662-
orig_ptr += 4 * orig_stride; \
663+
kvz_satd_4x4_subblock_ ## suffix(pred_ptrs_tmp, stride, &orig_ptr[x], orig_stride, sums); \
664+
for(int blk = 0; blk < num_parallel_blocks; ++blk){ pred_ptrs_tmp[blk] += 4; costs_out[blk] += sums[blk]; }\
665+
} \
663666
for(int blk = 0; blk < num_parallel_blocks; ++blk){\
664667
pred_ptrs[blk] += 4 * stride; \
665-
}\
668+
}\
666669
height -= 4; \
667-
} \
670+
} \
668671
/* The rest can now be processed with 8x8 blocks. */ \
669-
for (int y = 0; y < height; y += 8) { \
670-
orig_ptr = &orig[y * orig_stride]; \
671-
pred_ptrs[0] = &preds[0][y * stride]; \
672-
pred_ptrs[1] = &preds[1][y * stride]; \
673-
pred_ptrs[2] = &preds[2][y * stride]; \
674-
pred_ptrs[3] = &preds[3][y * stride]; \
675-
for (int x = 0; x < width; x += 8) { \
672+
for (int y = height % 8; y < height; y += 8) { \
673+
orig_ptr = &orig[y * orig_stride + width_mod_8]; \
674+
pred_ptrs[0] = &preds[0][y * stride + width_mod_8]; \
675+
pred_ptrs[1] = &preds[1][y * stride + width_mod_8]; \
676+
pred_ptrs[2] = &preds[2][y * stride + width_mod_8]; \
677+
pred_ptrs[3] = &preds[3][y * stride + width_mod_8]; \
678+
for (int x = width_mod_8; x < width; x += 8) { \
676679
satd_8x8_subblock_ ## suffix(pred_ptrs, stride, orig_ptr, orig_stride, sums); \
677680
orig_ptr += 8; \
678681
pred_ptrs[0] += 8; \

src/strategies/generic/picture-generic.c

Lines changed: 11 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -417,14 +417,14 @@ SATD_DUAL_NXN(64, kvz_pixel)
417417
const kvz_pixel *pred_ptrs[4] = { preds[0], preds[1], preds[2], preds[3] };\
418418
const kvz_pixel *orig_ptr = orig; \
419419
costs_out[0] = 0; costs_out[1] = 0; costs_out[2] = 0; costs_out[3] = 0; \
420-
if (width % 8 != 0) { \
420+
const int width_mod_8 = width % 8; \
421+
if (width_mod_8 != 0) { \
421422
const kvz_pixel *pred_ptrs_tmp[4] = { preds[0], preds[1], preds[2], preds[3] };\
422423
/* Process the first column using 4x4 blocks. */ \
423424
for (int y = 0; y < height; y += 4) { \
424425
kvz_satd_4x4_subblock_ ## suffix(pred_ptrs_tmp, stride, &orig[y*orig_stride], orig_stride, sums); \
425-
for(int blk = 0; blk < num_parallel_blocks; ++blk){ pred_ptrs_tmp[blk] += 4*stride; }\
426+
for(int blk = 0; blk < num_parallel_blocks; ++blk){ pred_ptrs_tmp[blk] += 4*stride; costs_out[blk] += sums[blk]; }\
426427
} \
427-
orig_ptr += 4; \
428428
for(int blk = 0; blk < num_parallel_blocks; ++blk){\
429429
pred_ptrs[blk] += 4; \
430430
}\
@@ -435,22 +435,21 @@ SATD_DUAL_NXN(64, kvz_pixel)
435435
/* Process the first row using 4x4 blocks. */ \
436436
for (int x = 0; x < width; x += 4 ) { \
437437
kvz_satd_4x4_subblock_ ## suffix(pred_ptrs_tmp, stride, &orig_ptr[x], orig_stride, sums); \
438-
for(int blk = 0; blk < num_parallel_blocks; ++blk){ pred_ptrs_tmp[blk] += 4; }\
438+
for(int blk = 0; blk < num_parallel_blocks; ++blk){ pred_ptrs_tmp[blk] += 4; costs_out[blk] += sums[blk]; }\
439439
} \
440-
orig_ptr += 4 * orig_stride; \
441440
for(int blk = 0; blk < num_parallel_blocks; ++blk){\
442441
pred_ptrs[blk] += 4 * stride; \
443442
}\
444443
height -= 4; \
445444
} \
446445
/* The rest can now be processed with 8x8 blocks. */ \
447-
for (int y = 0; y < height; y += 8) { \
448-
orig_ptr = &orig[y * orig_stride]; \
449-
pred_ptrs[0] = &preds[0][y * stride]; \
450-
pred_ptrs[1] = &preds[1][y * stride]; \
451-
pred_ptrs[2] = &preds[2][y * stride]; \
452-
pred_ptrs[3] = &preds[3][y * stride]; \
453-
for (int x = 0; x < width; x += 8) { \
446+
for (int y = height % 8; y < height; y += 8) { \
447+
orig_ptr = &orig[y * orig_stride + width_mod_8]; \
448+
pred_ptrs[0] = &preds[0][y * stride + width_mod_8]; \
449+
pred_ptrs[1] = &preds[1][y * stride + width_mod_8]; \
450+
pred_ptrs[2] = &preds[2][y * stride + width_mod_8]; \
451+
pred_ptrs[3] = &preds[3][y * stride + width_mod_8]; \
452+
for (int x = width_mod_8; x < width; x += 8) { \
454453
satd_8x8_subblock_ ## suffix(pred_ptrs, stride, orig_ptr, orig_stride, sums); \
455454
orig_ptr += 8; \
456455
pred_ptrs[0] += 8; \

0 commit comments

Comments
 (0)