Skip to content

Commit 8291a66

Browse files
committed
looprestoration: Use only 6 row buffer for wiener, like NEON/x86
This uses a separate function for combined horizontal and vertical filtering, without needing to write the intermediate results back to memory in between. This mostly serves as an example for how to adjust the logic for that case; unless we actually merge the horizontal and vertical filtering within the _hv function, we still need space for a 7th row on the stack within that function (which means we use just as much stack as before), but we also need one extra memcpy to write it into the right destination. In a build where the compiler is allowed to vectorize and inline the wiener functions into each other, this change actually reduces the final binary size by 4 KB, if the C version of the wiener filter is retained. This change makes the vectorized C code as fast as it was before with Clang 18; on Xcode Clang 16, it's 2x slower than it was before. Unfortunately, with GCC, this change makes the code a bit slower again.
1 parent a149f5c commit 8291a66

File tree

1 file changed

+79
-40
lines changed

1 file changed

+79
-40
lines changed

src/looprestoration_tmpl.c

Lines changed: 79 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -171,17 +171,77 @@ static void wiener_filter_v(pixel *p, uint16_t **ptrs, const int16_t fv[8],
171171
for (int i = 0; i < w; i++) {
172172
int sum = -round_offset;
173173

174-
for (int k = 0; k < 7; k++)
174+
// Only filter using 6 input rows. The 7th row is assumed to be
175+
// identical to the last one.
176+
//
177+
// This function is assumed to only be called at the end, when doing
178+
// padding at the bottom.
179+
for (int k = 0; k < 6; k++)
175180
sum += ptrs[k][i] * fv[k];
181+
sum += ptrs[5][i] * fv[6];
176182

177183
p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v);
178184
}
179185

180-
// Rotate the window of pointers
181-
uint16_t *tmp = ptrs[0];
186+
// Shift the pointers, but only update the first 5; the 6th pointer is kept
187+
// as it was before (and the 7th is implicitly identical to the 6th).
188+
for (int i = 0; i < 5; i++)
189+
ptrs[i] = ptrs[i + 1];
190+
}
191+
192+
static void wiener_filter_hv(pixel *p, uint16_t **ptrs, const pixel (*left)[4],
193+
const pixel *src, const int16_t filter[2][8],
194+
const int w, const enum LrEdgeFlags edges
195+
HIGHBD_DECL_SUFFIX)
196+
{
197+
const int bitdepth = bitdepth_from_max(bitdepth_max);
198+
199+
const int round_bits_v = 11 - (bitdepth == 12) * 2;
200+
const int rounding_off_v = 1 << (round_bits_v - 1);
201+
const int round_offset = 1 << (bitdepth + (round_bits_v - 1));
202+
203+
const int16_t *fh = filter[0];
204+
const int16_t *fv = filter[1];
205+
206+
// Do combined horizontal and vertical filtering; doing horizontal
207+
// filtering of one row, combined with vertical filtering of 6
208+
// preexisting rows and the newly filtered row.
209+
210+
// For simplicity in the C implementation, just do a separate call
211+
// of the horizontal filter, into a temporary buffer.
212+
uint16_t tmp[REST_UNIT_STRIDE];
213+
wiener_filter_h(tmp, left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
214+
215+
for (int i = 0; i < w; i++) {
216+
int sum = -round_offset;
217+
218+
// Filter using the 6 stored preexisting rows, and the newly
219+
// filtered one in tmp[].
220+
for (int k = 0; k < 6; k++)
221+
sum += ptrs[k][i] * fv[k];
222+
sum += tmp[i] * fv[6];
223+
// At this point, after having read all inputs at point [i], we
224+
// could overwrite [i] with the newly filtered data.
225+
226+
p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v);
227+
}
228+
229+
// For simplicity in the C implementation, just memcpy the newly
230+
// filtered row into ptrs[6]. Normally, in steady state filtering,
231+
// this output row, ptrs[6], is equal to ptrs[0]. However at startup,
232+
// at the top of the filtered area, we may have ptrs[0] equal to ptrs[1],
233+
// so we can't assume we can write into ptrs[0] but we need to keep
234+
// a separate pointer for the next row to write into.
235+
memcpy(ptrs[6], tmp, sizeof(uint16_t) * REST_UNIT_STRIDE);
236+
237+
// Rotate the window of pointers. Shift the 6 pointers downwards one step.
182238
for (int i = 0; i < 6; i++)
183239
ptrs[i] = ptrs[i + 1];
184-
ptrs[6] = tmp;
240+
// The topmost pointer, ptrs[6], which isn't used as input, is set to
241+
// ptrs[0], which will be used as output for the next _hv call.
242+
// At the start of the filtering, the caller may set ptrs[6] to the
243+
// right next buffer to fill in, instead.
244+
ptrs[6] = ptrs[0];
185245
}
186246

187247
// FIXME Could split into luma and chroma specific functions,
@@ -194,10 +254,11 @@ static void wiener_c(pixel *p, const ptrdiff_t stride,
194254
{
195255
// Values stored between horizontal and vertical filtering don't
196256
// fit in a uint8_t.
197-
uint16_t hor[7 * REST_UNIT_STRIDE];
198-
uint16_t *ptrs[7], *rows[7];
199-
for (int i = 0; i < 7; i++)
257+
uint16_t hor[6 * REST_UNIT_STRIDE];
258+
uint16_t *ptrs[7], *rows[6];
259+
for (int i = 0; i < 6; i++)
200260
rows[i] = &hor[i * REST_UNIT_STRIDE];
261+
const int16_t (*const filter)[8] = params->filter;
201262
const int16_t *fh = params->filter[0];
202263
const int16_t *fv = params->filter[1];
203264
const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
@@ -269,8 +330,8 @@ static void wiener_c(pixel *p, const ptrdiff_t stride,
269330
goto v3;
270331

271332
ptrs[6] = rows[3];
272-
wiener_filter_h(rows[3], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
273-
wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
333+
wiener_filter_hv(p, ptrs, left, src, filter, w, edges
334+
HIGHBD_TAIL_SUFFIX);
274335
left++;
275336
src += PXSTRIDE(stride);
276337
p += PXSTRIDE(stride);
@@ -279,8 +340,8 @@ static void wiener_c(pixel *p, const ptrdiff_t stride,
279340
goto v3;
280341

281342
ptrs[6] = rows[4];
282-
wiener_filter_h(rows[4], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
283-
wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
343+
wiener_filter_hv(p, ptrs, left, src, filter, w, edges
344+
HIGHBD_TAIL_SUFFIX);
284345
left++;
285346
src += PXSTRIDE(stride);
286347
p += PXSTRIDE(stride);
@@ -289,29 +350,10 @@ static void wiener_c(pixel *p, const ptrdiff_t stride,
289350
goto v3;
290351
}
291352

292-
ptrs[6] = rows[5];
293-
wiener_filter_h(rows[5], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
294-
wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
295-
left++;
296-
src += PXSTRIDE(stride);
297-
p += PXSTRIDE(stride);
298-
299-
if (--h <= 0)
300-
goto v3;
301-
302-
ptrs[6] = rows[6];
303-
wiener_filter_h(rows[6], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
304-
wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
305-
left++;
306-
src += PXSTRIDE(stride);
307-
p += PXSTRIDE(stride);
308-
309-
if (--h <= 0)
310-
goto v3;
311-
353+
ptrs[6] = ptrs[5] + REST_UNIT_STRIDE;
312354
do {
313-
wiener_filter_h(ptrs[6], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
314-
wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
355+
wiener_filter_hv(p, ptrs, left, src, filter, w, edges
356+
HIGHBD_TAIL_SUFFIX);
315357
left++;
316358
src += PXSTRIDE(stride);
317359
p += PXSTRIDE(stride);
@@ -320,26 +362,23 @@ static void wiener_c(pixel *p, const ptrdiff_t stride,
320362
if (!(edges & LR_HAVE_BOTTOM))
321363
goto v3;
322364

323-
wiener_filter_h(ptrs[6], NULL, lpf_bottom, fh, w, edges HIGHBD_TAIL_SUFFIX);
324-
wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
365+
wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges
366+
HIGHBD_TAIL_SUFFIX);
325367
lpf_bottom += PXSTRIDE(stride);
326368
p += PXSTRIDE(stride);
327369

328-
wiener_filter_h(ptrs[6], NULL, lpf_bottom, fh, w, edges HIGHBD_TAIL_SUFFIX);
329-
wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
370+
wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges
371+
HIGHBD_TAIL_SUFFIX);
330372
p += PXSTRIDE(stride);
331373
v1:
332-
ptrs[6] = ptrs[5];
333374
wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
334375

335376
return;
336377

337378
v3:
338-
ptrs[6] = ptrs[5];
339379
wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
340380
p += PXSTRIDE(stride);
341381
v2:
342-
ptrs[6] = ptrs[5];
343382
wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
344383
p += PXSTRIDE(stride);
345384
goto v1;

0 commit comments

Comments
 (0)