Skip to content

Commit b129d9f

Browse files
committed
mc: Reduce stack use in {put,prep}_scaled_{bilin,8tap}
For the bilin cases, this seems to make things marginally faster (measured on x86_64; 7-25% faster with compiler autovectorization). For 8tap, it doesn't make much of a difference at all.

Before:                                      GCC      Clang
mc_scaled_8tap_regular_w128_8bpc_c:     115155.5    98549.3
mc_scaled_8tap_regular_w128_8bpc_ssse3:  17936.0    18411.1
mc_scaled_bilinear_w128_8bpc_c:          40290.0    51812.9
mc_scaled_bilinear_w128_8bpc_ssse3:      18243.9    18177.0

After:                                       GCC      Clang
mc_scaled_8tap_regular_w128_8bpc_c:     116304.3    99453.2
mc_scaled_8tap_regular_w128_8bpc_ssse3:  18387.0    18077.3
mc_scaled_bilinear_w128_8bpc_c:          37381.4    41145.0
mc_scaled_bilinear_w128_8bpc_ssse3:      18423.8    18031.6

(Benchmarked with the seed 0; the total runtime for the scaled benchmarks is significantly affected by the random seed.)

This reduces the stack usage of these functions from around 65 KB each, to less than 1 KB for bilin, and around 2 KB for 8tap.

With this in place, the required stack space for dav1d should be mostly identical across configurations; on x86_64 (both with and without assembly), it can run with 62 KB of stack, and on arm and aarch64, it can run with 58 KB of stack.
1 parent cd5bfa1 commit b129d9f

File tree

1 file changed

+134
-93
lines changed

1 file changed

+134
-93
lines changed

src/mc_tmpl.c

Lines changed: 134 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -84,18 +84,34 @@ prep_c(int16_t *tmp, const pixel *src, const ptrdiff_t src_stride,
8484
F[6] * src[x + +3 * stride] + \
8585
F[7] * src[x + +4 * stride])
8686

87+
#define FILTER_8TAP2(src, x, F) \
88+
(F[0] * src[0][x] + \
89+
F[1] * src[1][x] + \
90+
F[2] * src[2][x] + \
91+
F[3] * src[3][x] + \
92+
F[4] * src[4][x] + \
93+
F[5] * src[5][x] + \
94+
F[6] * src[6][x] + \
95+
F[7] * src[7][x])
96+
8797
#define DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh) \
8898
((FILTER_8TAP(src, x, F, stride) + ((1 << (sh)) >> 1)) >> (sh))
8999

90100
#define DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh) \
91101
((FILTER_8TAP(src, x, F, stride) + (rnd)) >> (sh))
92102

103+
#define DAV1D_FILTER_8TAP_RND3(src, x, F, sh) \
104+
((FILTER_8TAP2(src, x, F) + ((1 << (sh)) >> 1)) >> (sh))
105+
93106
#define DAV1D_FILTER_8TAP_CLIP(src, x, F, stride, sh) \
94107
iclip_pixel(DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh))
95108

96109
#define DAV1D_FILTER_8TAP_CLIP2(src, x, F, stride, rnd, sh) \
97110
iclip_pixel(DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh))
98111

112+
#define DAV1D_FILTER_8TAP_CLIP3(src, x, F, sh) \
113+
iclip_pixel(DAV1D_FILTER_8TAP_RND3(src, x, F, sh))
114+
99115
#define GET_H_FILTER(mx) \
100116
const int8_t *const fh = !(mx) ? NULL : w > 4 ? \
101117
dav1d_mc_subpel_filters[filter_type & 3][(mx) - 1] : \
@@ -179,43 +195,50 @@ put_8tap_scaled_c(pixel *dst, const ptrdiff_t dst_stride,
179195
{
180196
const int intermediate_bits = get_intermediate_bits(bitdepth_max);
181197
const int intermediate_rnd = (1 << intermediate_bits) >> 1;
182-
int tmp_h = (((h - 1) * dy + my) >> 10) + 8;
183-
int16_t mid[128 * (256 + 7)], *mid_ptr = mid;
198+
int16_t mid[128 * 8];
199+
int16_t *mid_ptrs[8];
200+
int in_y = -8;
184201
src_stride = PXSTRIDE(src_stride);
185202

186-
src -= src_stride * 3;
187-
do {
188-
int x;
189-
int imx = mx, ioff = 0;
190-
191-
for (x = 0; x < w; x++) {
192-
GET_H_FILTER(imx >> 6);
193-
mid_ptr[x] = fh ? DAV1D_FILTER_8TAP_RND(src, ioff, fh, 1,
194-
6 - intermediate_bits) :
195-
src[ioff] << intermediate_bits;
196-
imx += dx;
197-
ioff += imx >> 10;
198-
imx &= 0x3ff;
199-
}
203+
for (int i = 0; i < 8; i++)
204+
mid_ptrs[i] = &mid[128 * i];
200205

201-
mid_ptr += 128;
202-
src += src_stride;
203-
} while (--tmp_h);
206+
src -= src_stride * 3;
204207

205-
mid_ptr = mid + 128 * 3;
206208
for (int y = 0; y < h; y++) {
207209
int x;
208-
GET_V_FILTER(my >> 6);
210+
int src_y = my >> 10;
211+
GET_V_FILTER((my & 0x3ff) >> 6);
212+
213+
while (in_y < src_y) {
214+
int imx = mx, ioff = 0;
215+
int16_t *mid_ptr = mid_ptrs[0];
216+
217+
for (int i = 0; i < 7; i++)
218+
mid_ptrs[i] = mid_ptrs[i + 1];
219+
mid_ptrs[7] = mid_ptr;
220+
221+
for (x = 0; x < w; x++) {
222+
GET_H_FILTER(imx >> 6);
223+
mid_ptr[x] = fh ? DAV1D_FILTER_8TAP_RND(src, ioff, fh, 1,
224+
6 - intermediate_bits) :
225+
src[ioff] << intermediate_bits;
226+
imx += dx;
227+
ioff += imx >> 10;
228+
imx &= 0x3ff;
229+
}
230+
231+
src += src_stride;
232+
in_y++;
233+
}
209234

210235
for (x = 0; x < w; x++)
211-
dst[x] = fv ? DAV1D_FILTER_8TAP_CLIP(mid_ptr, x, fv, 128,
212-
6 + intermediate_bits) :
213-
iclip_pixel((mid_ptr[x] + intermediate_rnd) >>
236+
dst[x] = fv ? DAV1D_FILTER_8TAP_CLIP3(mid_ptrs, x, fv,
237+
6 + intermediate_bits) :
238+
iclip_pixel((mid_ptrs[3][x] + intermediate_rnd) >>
214239
intermediate_bits);
215240

216241
my += dy;
217-
mid_ptr += (my >> 10) * 128;
218-
my &= 0x3ff;
219242
dst += PXSTRIDE(dst_stride);
220243
}
221244
}
@@ -288,41 +311,48 @@ prep_8tap_scaled_c(int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
288311
HIGHBD_DECL_SUFFIX)
289312
{
290313
const int intermediate_bits = get_intermediate_bits(bitdepth_max);
291-
int tmp_h = (((h - 1) * dy + my) >> 10) + 8;
292-
int16_t mid[128 * (256 + 7)], *mid_ptr = mid;
314+
int16_t mid[128 * 8];
315+
int16_t *mid_ptrs[8];
316+
int in_y = -8;
293317
src_stride = PXSTRIDE(src_stride);
294318

295-
src -= src_stride * 3;
296-
do {
297-
int x;
298-
int imx = mx, ioff = 0;
299-
300-
for (x = 0; x < w; x++) {
301-
GET_H_FILTER(imx >> 6);
302-
mid_ptr[x] = fh ? DAV1D_FILTER_8TAP_RND(src, ioff, fh, 1,
303-
6 - intermediate_bits) :
304-
src[ioff] << intermediate_bits;
305-
imx += dx;
306-
ioff += imx >> 10;
307-
imx &= 0x3ff;
308-
}
319+
for (int i = 0; i < 8; i++)
320+
mid_ptrs[i] = &mid[128 * i];
309321

310-
mid_ptr += 128;
311-
src += src_stride;
312-
} while (--tmp_h);
322+
src -= src_stride * 3;
313323

314-
mid_ptr = mid + 128 * 3;
315324
for (int y = 0; y < h; y++) {
316325
int x;
317-
GET_V_FILTER(my >> 6);
326+
int src_y = my >> 10;
327+
GET_V_FILTER((my & 0x3ff) >> 6);
328+
329+
while (in_y < src_y) {
330+
int imx = mx, ioff = 0;
331+
int16_t *mid_ptr = mid_ptrs[0];
332+
333+
for (int i = 0; i < 7; i++)
334+
mid_ptrs[i] = mid_ptrs[i + 1];
335+
mid_ptrs[7] = mid_ptr;
336+
337+
for (x = 0; x < w; x++) {
338+
GET_H_FILTER(imx >> 6);
339+
mid_ptr[x] = fh ? DAV1D_FILTER_8TAP_RND(src, ioff, fh, 1,
340+
6 - intermediate_bits) :
341+
src[ioff] << intermediate_bits;
342+
imx += dx;
343+
ioff += imx >> 10;
344+
imx &= 0x3ff;
345+
}
346+
347+
src += src_stride;
348+
in_y++;
349+
}
318350

319351
for (x = 0; x < w; x++)
320-
tmp[x] = (fv ? DAV1D_FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6)
321-
: mid_ptr[x]) - PREP_BIAS;
352+
tmp[x] = (fv ? DAV1D_FILTER_8TAP_RND3(mid_ptrs, x, fv, 6)
353+
: mid_ptrs[3][x]) - PREP_BIAS;
322354

323355
my += dy;
324-
mid_ptr += (my >> 10) * 128;
325-
my &= 0x3ff;
326356
tmp += w;
327357
}
328358
}
@@ -392,6 +422,15 @@ filter_fns(sharp_smooth, DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_SMOOTH)
392422
#define FILTER_BILIN_CLIP(src, x, mxy, stride, sh) \
393423
iclip_pixel(FILTER_BILIN_RND(src, x, mxy, stride, sh))
394424

425+
#define FILTER_BILIN2(src1, src2, x, mxy) \
426+
(16 * src1[x] + ((mxy) * (src2[x] - src1[x])))
427+
428+
#define FILTER_BILIN_RND2(src1, src2, x, mxy, sh) \
429+
((FILTER_BILIN2(src1, src2, x, mxy) + ((1 << (sh)) >> 1)) >> (sh))
430+
431+
#define FILTER_BILIN_CLIP2(src1, src2, x, mxy, sh) \
432+
iclip_pixel(FILTER_BILIN_RND2(src1, src2, x, mxy, sh))
433+
395434
static void put_bilin_c(pixel *dst, ptrdiff_t dst_stride,
396435
const pixel *src, ptrdiff_t src_stride,
397436
const int w, int h, const int mx, const int my
@@ -456,36 +495,37 @@ static void put_bilin_scaled_c(pixel *dst, ptrdiff_t dst_stride,
456495
HIGHBD_DECL_SUFFIX)
457496
{
458497
const int intermediate_bits = get_intermediate_bits(bitdepth_max);
459-
int tmp_h = (((h - 1) * dy + my) >> 10) + 2;
460-
int16_t mid[128 * (256 + 1)], *mid_ptr = mid;
498+
int16_t mid[128 * 2];
499+
int in_y = -2;
461500

462501
do {
463502
int x;
464-
int imx = mx, ioff = 0;
465-
466-
for (x = 0; x < w; x++) {
467-
mid_ptr[x] = FILTER_BILIN_RND(src, ioff, imx >> 6, 1,
468-
4 - intermediate_bits);
469-
imx += dx;
470-
ioff += imx >> 10;
471-
imx &= 0x3ff;
472-
}
473-
474-
mid_ptr += 128;
475-
src += PXSTRIDE(src_stride);
476-
} while (--tmp_h);
503+
int y = my >> 10;
504+
int16_t *mid1 = &mid[(y & 1) * 128];
505+
int16_t *mid2 = &mid[((y + 1) & 1) * 128];
506+
int dmy = my & 0x3ff;
507+
508+
while (in_y < y) {
509+
int imx = mx, ioff = 0;
510+
int16_t *mid_ptr = &mid[(in_y & 1) * 128];
511+
512+
for (x = 0; x < w; x++) {
513+
mid_ptr[x] = FILTER_BILIN_RND(src, ioff, imx >> 6, 1,
514+
4 - intermediate_bits);
515+
imx += dx;
516+
ioff += imx >> 10;
517+
imx &= 0x3ff;
518+
}
477519

478-
mid_ptr = mid;
479-
do {
480-
int x;
520+
src += PXSTRIDE(src_stride);
521+
in_y++;
522+
}
481523

482524
for (x = 0; x < w; x++)
483-
dst[x] = FILTER_BILIN_CLIP(mid_ptr, x, my >> 6, 128,
525+
dst[x] = FILTER_BILIN_CLIP2(mid1, mid2, x, dmy >> 6,
484526
4 + intermediate_bits);
485527

486528
my += dy;
487-
mid_ptr += (my >> 10) * 128;
488-
my &= 0x3ff;
489529
dst += PXSTRIDE(dst_stride);
490530
} while (--h);
491531
}
@@ -551,35 +591,36 @@ static void prep_bilin_scaled_c(int16_t *tmp,
551591
const int dx, const int dy HIGHBD_DECL_SUFFIX)
552592
{
553593
const int intermediate_bits = get_intermediate_bits(bitdepth_max);
554-
int tmp_h = (((h - 1) * dy + my) >> 10) + 2;
555-
int16_t mid[128 * (256 + 1)], *mid_ptr = mid;
594+
int16_t mid[128 * 2];
595+
int in_y = -2;
556596

557597
do {
558598
int x;
559-
int imx = mx, ioff = 0;
560-
561-
for (x = 0; x < w; x++) {
562-
mid_ptr[x] = FILTER_BILIN_RND(src, ioff, imx >> 6, 1,
563-
4 - intermediate_bits);
564-
imx += dx;
565-
ioff += imx >> 10;
566-
imx &= 0x3ff;
567-
}
568-
569-
mid_ptr += 128;
570-
src += PXSTRIDE(src_stride);
571-
} while (--tmp_h);
599+
int y = my >> 10;
600+
int16_t *mid1 = &mid[(y & 1) * 128];
601+
int16_t *mid2 = &mid[((y + 1) & 1) * 128];
602+
int dmy = my & 0x3ff;
603+
604+
while (in_y < y) {
605+
int imx = mx, ioff = 0;
606+
int16_t *mid_ptr = &mid[(in_y & 1) * 128];
607+
608+
for (x = 0; x < w; x++) {
609+
mid_ptr[x] = FILTER_BILIN_RND(src, ioff, imx >> 6, 1,
610+
4 - intermediate_bits);
611+
imx += dx;
612+
ioff += imx >> 10;
613+
imx &= 0x3ff;
614+
}
572615

573-
mid_ptr = mid;
574-
do {
575-
int x;
616+
src += PXSTRIDE(src_stride);
617+
in_y++;
618+
}
576619

577620
for (x = 0; x < w; x++)
578-
tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my >> 6, 128, 4) - PREP_BIAS;
621+
tmp[x] = FILTER_BILIN_RND2(mid1, mid2, x, dmy >> 6, 4) - PREP_BIAS;
579622

580623
my += dy;
581-
mid_ptr += (my >> 10) * 128;
582-
my &= 0x3ff;
583624
tmp += w;
584625
} while (--h);
585626
}

0 commit comments

Comments
 (0)