@@ -171,17 +171,77 @@ static void wiener_filter_v(pixel *p, uint16_t **ptrs, const int16_t fv[8],
171171 for (int i = 0 ; i < w ; i ++ ) {
172172 int sum = - round_offset ;
173173
174- for (int k = 0 ; k < 7 ; k ++ )
174+ // Only filter using 6 input rows. The 7th row is assumed to be
175+ // identical to the last one.
176+ //
177+ // This function is assumed to only be called at the end, when doing
178+ // padding at the bottom.
179+ for (int k = 0 ; k < 6 ; k ++ )
175180 sum += ptrs [k ][i ] * fv [k ];
181+ sum += ptrs [5 ][i ] * fv [6 ];
176182
177183 p [i ] = iclip_pixel ((sum + rounding_off_v ) >> round_bits_v );
178184 }
179185
180- // Rotate the window of pointers
181- uint16_t * tmp = ptrs [0 ];
186+ // Shift the pointers, but only update the first 5; the 6th pointer is kept
187+ // as it was before (and the 7th is implicitly identical to the 6th).
188+ for (int i = 0 ; i < 5 ; i ++ )
189+ ptrs [i ] = ptrs [i + 1 ];
190+ }
191+
192+ static void wiener_filter_hv (pixel * p , uint16_t * * ptrs , const pixel (* left )[4 ],
193+ const pixel * src , const int16_t filter [2 ][8 ],
194+ const int w , const enum LrEdgeFlags edges
195+ HIGHBD_DECL_SUFFIX )
196+ {
197+ const int bitdepth = bitdepth_from_max (bitdepth_max );
198+
199+ const int round_bits_v = 11 - (bitdepth == 12 ) * 2 ;
200+ const int rounding_off_v = 1 << (round_bits_v - 1 );
201+ const int round_offset = 1 << (bitdepth + (round_bits_v - 1 ));
202+
203+ const int16_t * fh = filter [0 ];
204+ const int16_t * fv = filter [1 ];
205+
206+ // Do combined horizontal and vertical filtering; doing horizontal
207+ // filtering of one row, combined with vertical filtering of 6
208+ // preexisting rows and the newly filtered row.
209+
210+ // For simplicity in the C implementation, just do a separate call
211+ // of the horizontal filter, into a temporary buffer.
212+ uint16_t tmp [REST_UNIT_STRIDE ];
213+ wiener_filter_h (tmp , left , src , fh , w , edges HIGHBD_TAIL_SUFFIX );
214+
215+ for (int i = 0 ; i < w ; i ++ ) {
216+ int sum = - round_offset ;
217+
218+ // Filter using the 6 stored preexisting rows, and the newly
219+ // filtered one in tmp[].
220+ for (int k = 0 ; k < 6 ; k ++ )
221+ sum += ptrs [k ][i ] * fv [k ];
222+ sum += tmp [i ] * fv [6 ];
223+ // At this point, after having read all inputs at point [i], we
224+ // could overwrite [i] with the newly filtered data.
225+
226+ p [i ] = iclip_pixel ((sum + rounding_off_v ) >> round_bits_v );
227+ }
228+
229+ // For simplicity in the C implementation, just memcpy the newly
230+ // filtered row into ptrs[6]. Normally, in steady state filtering,
231+ // this output row, ptrs[6], is equal to ptrs[0]. However at startup,
232+ // at the top of the filtered area, we may have ptrs[0] equal to ptrs[1],
233+ // so we can't assume we can write into ptrs[0]; instead we need to keep
234+ // a separate pointer for the next row to write into.
235+ memcpy (ptrs [6 ], tmp , sizeof (uint16_t ) * REST_UNIT_STRIDE );
236+
237+ // Rotate the window of pointers. Shift the 6 pointers downwards one step.
182238 for (int i = 0 ; i < 6 ; i ++ )
183239 ptrs [i ] = ptrs [i + 1 ];
184- ptrs [6 ] = tmp ;
240+ // The topmost pointer, ptrs[6], which isn't used as input, is set to
241+ // ptrs[0], which will be used as output for the next _hv call.
242+ // At the start of the filtering, the caller may instead override
243+ // ptrs[6] with the correct next buffer to fill in.
244+ ptrs [6 ] = ptrs [0 ];
185245}
186246
187247// FIXME Could split into luma and chroma specific functions,
@@ -194,10 +254,11 @@ static void wiener_c(pixel *p, const ptrdiff_t stride,
194254{
195255 // Values stored between horizontal and vertical filtering don't
196256 // fit in a uint8_t.
197- uint16_t hor [7 * REST_UNIT_STRIDE ];
198- uint16_t * ptrs [7 ], * rows [7 ];
199- for (int i = 0 ; i < 7 ; i ++ )
257+ uint16_t hor [6 * REST_UNIT_STRIDE ];
258+ uint16_t * ptrs [7 ], * rows [6 ];
259+ for (int i = 0 ; i < 6 ; i ++ )
200260 rows [i ] = & hor [i * REST_UNIT_STRIDE ];
261+ const int16_t (* const filter )[8 ] = params -> filter ;
201262 const int16_t * fh = params -> filter [0 ];
202263 const int16_t * fv = params -> filter [1 ];
203264 const pixel * lpf_bottom = lpf + 6 * PXSTRIDE (stride );
@@ -269,8 +330,8 @@ static void wiener_c(pixel *p, const ptrdiff_t stride,
269330 goto v3 ;
270331
271332 ptrs [6 ] = rows [3 ];
272- wiener_filter_h ( rows [ 3 ], left , src , fh , w , edges HIGHBD_TAIL_SUFFIX );
273- wiener_filter_v ( p , ptrs , fv , w HIGHBD_TAIL_SUFFIX );
333+ wiener_filter_hv ( p , ptrs , left , src , filter , w , edges
334+ HIGHBD_TAIL_SUFFIX );
274335 left ++ ;
275336 src += PXSTRIDE (stride );
276337 p += PXSTRIDE (stride );
@@ -279,8 +340,8 @@ static void wiener_c(pixel *p, const ptrdiff_t stride,
279340 goto v3 ;
280341
281342 ptrs [6 ] = rows [4 ];
282- wiener_filter_h ( rows [ 4 ], left , src , fh , w , edges HIGHBD_TAIL_SUFFIX );
283- wiener_filter_v ( p , ptrs , fv , w HIGHBD_TAIL_SUFFIX );
343+ wiener_filter_hv ( p , ptrs , left , src , filter , w , edges
344+ HIGHBD_TAIL_SUFFIX );
284345 left ++ ;
285346 src += PXSTRIDE (stride );
286347 p += PXSTRIDE (stride );
@@ -289,29 +350,10 @@ static void wiener_c(pixel *p, const ptrdiff_t stride,
289350 goto v3 ;
290351 }
291352
292- ptrs [6 ] = rows [5 ];
293- wiener_filter_h (rows [5 ], left , src , fh , w , edges HIGHBD_TAIL_SUFFIX );
294- wiener_filter_v (p , ptrs , fv , w HIGHBD_TAIL_SUFFIX );
295- left ++ ;
296- src += PXSTRIDE (stride );
297- p += PXSTRIDE (stride );
298-
299- if (-- h <= 0 )
300- goto v3 ;
301-
302- ptrs [6 ] = rows [6 ];
303- wiener_filter_h (rows [6 ], left , src , fh , w , edges HIGHBD_TAIL_SUFFIX );
304- wiener_filter_v (p , ptrs , fv , w HIGHBD_TAIL_SUFFIX );
305- left ++ ;
306- src += PXSTRIDE (stride );
307- p += PXSTRIDE (stride );
308-
309- if (-- h <= 0 )
310- goto v3 ;
311-
353+ ptrs [6 ] = ptrs [5 ] + REST_UNIT_STRIDE ;
312354 do {
313- wiener_filter_h ( ptrs [ 6 ] , left , src , fh , w , edges HIGHBD_TAIL_SUFFIX );
314- wiener_filter_v ( p , ptrs , fv , w HIGHBD_TAIL_SUFFIX );
355+ wiener_filter_hv ( p , ptrs , left , src , filter , w , edges
356+ HIGHBD_TAIL_SUFFIX );
315357 left ++ ;
316358 src += PXSTRIDE (stride );
317359 p += PXSTRIDE (stride );
@@ -320,26 +362,23 @@ static void wiener_c(pixel *p, const ptrdiff_t stride,
320362 if (!(edges & LR_HAVE_BOTTOM ))
321363 goto v3 ;
322364
323- wiener_filter_h ( ptrs [ 6 ] , NULL , lpf_bottom , fh , w , edges HIGHBD_TAIL_SUFFIX );
324- wiener_filter_v ( p , ptrs , fv , w HIGHBD_TAIL_SUFFIX );
365+ wiener_filter_hv ( p , ptrs , NULL , lpf_bottom , filter , w , edges
366+ HIGHBD_TAIL_SUFFIX );
325367 lpf_bottom += PXSTRIDE (stride );
326368 p += PXSTRIDE (stride );
327369
328- wiener_filter_h ( ptrs [ 6 ] , NULL , lpf_bottom , fh , w , edges HIGHBD_TAIL_SUFFIX );
329- wiener_filter_v ( p , ptrs , fv , w HIGHBD_TAIL_SUFFIX );
370+ wiener_filter_hv ( p , ptrs , NULL , lpf_bottom , filter , w , edges
371+ HIGHBD_TAIL_SUFFIX );
330372 p += PXSTRIDE (stride );
331373v1 :
332- ptrs [6 ] = ptrs [5 ];
333374 wiener_filter_v (p , ptrs , fv , w HIGHBD_TAIL_SUFFIX );
334375
335376 return ;
336377
337378v3 :
338- ptrs [6 ] = ptrs [5 ];
339379 wiener_filter_v (p , ptrs , fv , w HIGHBD_TAIL_SUFFIX );
340380 p += PXSTRIDE (stride );
341381v2 :
342- ptrs [6 ] = ptrs [5 ];
343382 wiener_filter_v (p , ptrs , fv , w HIGHBD_TAIL_SUFFIX );
344383 p += PXSTRIDE (stride );
345384 goto v1 ;
0 commit comments