@@ -84,18 +84,34 @@ prep_c(int16_t *tmp, const pixel *src, const ptrdiff_t src_stride,
8484 F[6] * src[x + +3 * stride] + \
8585 F[7] * src[x + +4 * stride])
8686
/*
 * Unrounded 8-tap weighted sum of column x taken across eight separately
 * addressed rows.
 *
 *   src  array of (at least) eight row pointers; src[i][x] is tap i
 *   x    column index within each row
 *   F    eight filter coefficients
 *
 * Note: there must be no whitespace between the macro name and '(' --
 * otherwise the preprocessor treats the definition as an object-like macro
 * and the call sites do not expand.
 */
#define FILTER_8TAP2(src, x, F) \
    ((F)[0] * (src)[0][x] + \
     (F)[1] * (src)[1][x] + \
     (F)[2] * (src)[2][x] + \
     (F)[3] * (src)[3][x] + \
     (F)[4] * (src)[4][x] + \
     (F)[5] * (src)[5][x] + \
     (F)[6] * (src)[6][x] + \
     (F)[7] * (src)[7][x])

/* Strided 8-tap sum, rounded to nearest by adding half of (1 << sh) and
 * shifting right by sh. */
#define DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh) \
    ((FILTER_8TAP(src, x, F, stride) + ((1 << (sh)) >> 1)) >> (sh))

/* Strided 8-tap sum with a caller-supplied rounding offset rnd, shifted
 * right by sh. */
#define DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh) \
    ((FILTER_8TAP(src, x, F, stride) + (rnd)) >> (sh))

/* Row-pointer variant of DAV1D_FILTER_8TAP_RND: same rounding, but the
 * taps come from FILTER_8TAP2 (src is an array of row pointers). */
#define DAV1D_FILTER_8TAP_RND3(src, x, F, sh) \
    ((FILTER_8TAP2(src, x, F) + ((1 << (sh)) >> 1)) >> (sh))

/* The CLIP variants pass the rounded result through iclip_pixel()
 * (defined elsewhere; presumably clamps to the valid pixel range). */
#define DAV1D_FILTER_8TAP_CLIP(src, x, F, stride, sh) \
    iclip_pixel(DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh))

#define DAV1D_FILTER_8TAP_CLIP2(src, x, F, stride, rnd, sh) \
    iclip_pixel(DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh))

#define DAV1D_FILTER_8TAP_CLIP3(src, x, F, sh) \
    iclip_pixel(DAV1D_FILTER_8TAP_RND3(src, x, F, sh))
114+
99115#define GET_H_FILTER (mx ) \
100116 const int8_t *const fh = !(mx) ? NULL : w > 4 ? \
101117 dav1d_mc_subpel_filters[filter_type & 3][(mx) - 1] : \
@@ -179,43 +195,50 @@ put_8tap_scaled_c(pixel *dst, const ptrdiff_t dst_stride,
179195{
180196 const int intermediate_bits = get_intermediate_bits (bitdepth_max );
181197 const int intermediate_rnd = (1 << intermediate_bits ) >> 1 ;
182- int tmp_h = (((h - 1 ) * dy + my ) >> 10 ) + 8 ;
183- int16_t mid [128 * (256 + 7 )], * mid_ptr = mid ;
198+ int16_t mid [128 * 8 ];
199+ int16_t * mid_ptrs [8 ];
200+ int in_y = -8 ;
184201 src_stride = PXSTRIDE (src_stride );
185202
186- src -= src_stride * 3 ;
187- do {
188- int x ;
189- int imx = mx , ioff = 0 ;
190-
191- for (x = 0 ; x < w ; x ++ ) {
192- GET_H_FILTER (imx >> 6 );
193- mid_ptr [x ] = fh ? DAV1D_FILTER_8TAP_RND (src , ioff , fh , 1 ,
194- 6 - intermediate_bits ) :
195- src [ioff ] << intermediate_bits ;
196- imx += dx ;
197- ioff += imx >> 10 ;
198- imx &= 0x3ff ;
199- }
203+ for (int i = 0 ; i < 8 ; i ++ )
204+ mid_ptrs [i ] = & mid [128 * i ];
200205
201- mid_ptr += 128 ;
202- src += src_stride ;
203- } while (-- tmp_h );
206+ src -= src_stride * 3 ;
204207
205- mid_ptr = mid + 128 * 3 ;
206208 for (int y = 0 ; y < h ; y ++ ) {
207209 int x ;
208- GET_V_FILTER (my >> 6 );
210+ int src_y = my >> 10 ;
211+ GET_V_FILTER ((my & 0x3ff ) >> 6 );
212+
213+ while (in_y < src_y ) {
214+ int imx = mx , ioff = 0 ;
215+ int16_t * mid_ptr = mid_ptrs [0 ];
216+
217+ for (int i = 0 ; i < 7 ; i ++ )
218+ mid_ptrs [i ] = mid_ptrs [i + 1 ];
219+ mid_ptrs [7 ] = mid_ptr ;
220+
221+ for (x = 0 ; x < w ; x ++ ) {
222+ GET_H_FILTER (imx >> 6 );
223+ mid_ptr [x ] = fh ? DAV1D_FILTER_8TAP_RND (src , ioff , fh , 1 ,
224+ 6 - intermediate_bits ) :
225+ src [ioff ] << intermediate_bits ;
226+ imx += dx ;
227+ ioff += imx >> 10 ;
228+ imx &= 0x3ff ;
229+ }
230+
231+ src += src_stride ;
232+ in_y ++ ;
233+ }
209234
210235 for (x = 0 ; x < w ; x ++ )
211- dst [x ] = fv ? DAV1D_FILTER_8TAP_CLIP ( mid_ptr , x , fv , 128 ,
212- 6 + intermediate_bits ) :
213- iclip_pixel ((mid_ptr [x ] + intermediate_rnd ) >>
236+ dst [x ] = fv ? DAV1D_FILTER_8TAP_CLIP3 ( mid_ptrs , x , fv ,
237+ 6 + intermediate_bits ) :
238+ iclip_pixel ((mid_ptrs [ 3 ] [x ] + intermediate_rnd ) >>
214239 intermediate_bits );
215240
216241 my += dy ;
217- mid_ptr += (my >> 10 ) * 128 ;
218- my &= 0x3ff ;
219242 dst += PXSTRIDE (dst_stride );
220243 }
221244}
@@ -288,41 +311,48 @@ prep_8tap_scaled_c(int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
288311 HIGHBD_DECL_SUFFIX )
289312{
290313 const int intermediate_bits = get_intermediate_bits (bitdepth_max );
291- int tmp_h = (((h - 1 ) * dy + my ) >> 10 ) + 8 ;
292- int16_t mid [128 * (256 + 7 )], * mid_ptr = mid ;
314+ int16_t mid [128 * 8 ];
315+ int16_t * mid_ptrs [8 ];
316+ int in_y = -8 ;
293317 src_stride = PXSTRIDE (src_stride );
294318
295- src -= src_stride * 3 ;
296- do {
297- int x ;
298- int imx = mx , ioff = 0 ;
299-
300- for (x = 0 ; x < w ; x ++ ) {
301- GET_H_FILTER (imx >> 6 );
302- mid_ptr [x ] = fh ? DAV1D_FILTER_8TAP_RND (src , ioff , fh , 1 ,
303- 6 - intermediate_bits ) :
304- src [ioff ] << intermediate_bits ;
305- imx += dx ;
306- ioff += imx >> 10 ;
307- imx &= 0x3ff ;
308- }
319+ for (int i = 0 ; i < 8 ; i ++ )
320+ mid_ptrs [i ] = & mid [128 * i ];
309321
310- mid_ptr += 128 ;
311- src += src_stride ;
312- } while (-- tmp_h );
322+ src -= src_stride * 3 ;
313323
314- mid_ptr = mid + 128 * 3 ;
315324 for (int y = 0 ; y < h ; y ++ ) {
316325 int x ;
317- GET_V_FILTER (my >> 6 );
326+ int src_y = my >> 10 ;
327+ GET_V_FILTER ((my & 0x3ff ) >> 6 );
328+
329+ while (in_y < src_y ) {
330+ int imx = mx , ioff = 0 ;
331+ int16_t * mid_ptr = mid_ptrs [0 ];
332+
333+ for (int i = 0 ; i < 7 ; i ++ )
334+ mid_ptrs [i ] = mid_ptrs [i + 1 ];
335+ mid_ptrs [7 ] = mid_ptr ;
336+
337+ for (x = 0 ; x < w ; x ++ ) {
338+ GET_H_FILTER (imx >> 6 );
339+ mid_ptr [x ] = fh ? DAV1D_FILTER_8TAP_RND (src , ioff , fh , 1 ,
340+ 6 - intermediate_bits ) :
341+ src [ioff ] << intermediate_bits ;
342+ imx += dx ;
343+ ioff += imx >> 10 ;
344+ imx &= 0x3ff ;
345+ }
346+
347+ src += src_stride ;
348+ in_y ++ ;
349+ }
318350
319351 for (x = 0 ; x < w ; x ++ )
320- tmp [x ] = (fv ? DAV1D_FILTER_8TAP_RND ( mid_ptr , x , fv , 128 , 6 )
321- : mid_ptr [x ]) - PREP_BIAS ;
352+ tmp [x ] = (fv ? DAV1D_FILTER_8TAP_RND3 ( mid_ptrs , x , fv , 6 )
353+ : mid_ptrs [ 3 ] [x ]) - PREP_BIAS ;
322354
323355 my += dy ;
324- mid_ptr += (my >> 10 ) * 128 ;
325- my &= 0x3ff ;
326356 tmp += w ;
327357 }
328358}
@@ -392,6 +422,15 @@ filter_fns(sharp_smooth, DAV1D_FILTER_8TAP_SHARP, DAV1D_FILTER_8TAP_SMOOTH)
/* Rounded strided bilinear sample passed through iclip_pixel() (defined
 * elsewhere; presumably clamps to the valid pixel range). */
#define FILTER_BILIN_CLIP(src, x, mxy, stride, sh) \
    iclip_pixel(FILTER_BILIN_RND(src, x, mxy, stride, sh))

/*
 * Unrounded bilinear blend of column x between two separately addressed
 * rows: 16 * src1[x] at mxy == 0, moving linearly towards 16 * src2[x] as
 * mxy approaches 16.
 *
 * Note: there must be no whitespace between the macro name and '(' --
 * otherwise the preprocessor treats the definition as an object-like macro.
 */
#define FILTER_BILIN2(src1, src2, x, mxy) \
    (16 * (src1)[x] + ((mxy) * ((src2)[x] - (src1)[x])))

/* FILTER_BILIN2 rounded to nearest by adding half of (1 << sh) and
 * shifting right by sh. */
#define FILTER_BILIN_RND2(src1, src2, x, mxy, sh) \
    ((FILTER_BILIN2(src1, src2, x, mxy) + ((1 << (sh)) >> 1)) >> (sh))

/* FILTER_BILIN_RND2 passed through iclip_pixel(). */
#define FILTER_BILIN_CLIP2(src1, src2, x, mxy, sh) \
    iclip_pixel(FILTER_BILIN_RND2(src1, src2, x, mxy, sh))
433+
395434static void put_bilin_c (pixel * dst , ptrdiff_t dst_stride ,
396435 const pixel * src , ptrdiff_t src_stride ,
397436 const int w , int h , const int mx , const int my
@@ -456,36 +495,37 @@ static void put_bilin_scaled_c(pixel *dst, ptrdiff_t dst_stride,
456495 HIGHBD_DECL_SUFFIX )
457496{
458497 const int intermediate_bits = get_intermediate_bits (bitdepth_max );
459- int tmp_h = ((( h - 1 ) * dy + my ) >> 10 ) + 2 ;
460- int16_t mid [ 128 * ( 256 + 1 )], * mid_ptr = mid ;
498+ int16_t mid [ 128 * 2 ] ;
499+ int in_y = -2 ;
461500
462501 do {
463502 int x ;
464- int imx = mx , ioff = 0 ;
465-
466- for (x = 0 ; x < w ; x ++ ) {
467- mid_ptr [x ] = FILTER_BILIN_RND (src , ioff , imx >> 6 , 1 ,
468- 4 - intermediate_bits );
469- imx += dx ;
470- ioff += imx >> 10 ;
471- imx &= 0x3ff ;
472- }
473-
474- mid_ptr += 128 ;
475- src += PXSTRIDE (src_stride );
476- } while (-- tmp_h );
503+ int y = my >> 10 ;
504+ int16_t * mid1 = & mid [(y & 1 ) * 128 ];
505+ int16_t * mid2 = & mid [((y + 1 ) & 1 ) * 128 ];
506+ int dmy = my & 0x3ff ;
507+
508+ while (in_y < y ) {
509+ int imx = mx , ioff = 0 ;
510+ int16_t * mid_ptr = & mid [(in_y & 1 ) * 128 ];
511+
512+ for (x = 0 ; x < w ; x ++ ) {
513+ mid_ptr [x ] = FILTER_BILIN_RND (src , ioff , imx >> 6 , 1 ,
514+ 4 - intermediate_bits );
515+ imx += dx ;
516+ ioff += imx >> 10 ;
517+ imx &= 0x3ff ;
518+ }
477519
478- mid_ptr = mid ;
479- do {
480- int x ;
520+ src += PXSTRIDE ( src_stride ) ;
521+ in_y ++ ;
522+ }
481523
482524 for (x = 0 ; x < w ; x ++ )
483- dst [x ] = FILTER_BILIN_CLIP ( mid_ptr , x , my >> 6 , 128 ,
525+ dst [x ] = FILTER_BILIN_CLIP2 ( mid1 , mid2 , x , dmy >> 6 ,
484526 4 + intermediate_bits );
485527
486528 my += dy ;
487- mid_ptr += (my >> 10 ) * 128 ;
488- my &= 0x3ff ;
489529 dst += PXSTRIDE (dst_stride );
490530 } while (-- h );
491531}
@@ -551,35 +591,36 @@ static void prep_bilin_scaled_c(int16_t *tmp,
551591 const int dx , const int dy HIGHBD_DECL_SUFFIX )
552592{
553593 const int intermediate_bits = get_intermediate_bits (bitdepth_max );
554- int tmp_h = ((( h - 1 ) * dy + my ) >> 10 ) + 2 ;
555- int16_t mid [ 128 * ( 256 + 1 )], * mid_ptr = mid ;
594+ int16_t mid [ 128 * 2 ] ;
595+ int in_y = -2 ;
556596
557597 do {
558598 int x ;
559- int imx = mx , ioff = 0 ;
560-
561- for (x = 0 ; x < w ; x ++ ) {
562- mid_ptr [x ] = FILTER_BILIN_RND (src , ioff , imx >> 6 , 1 ,
563- 4 - intermediate_bits );
564- imx += dx ;
565- ioff += imx >> 10 ;
566- imx &= 0x3ff ;
567- }
568-
569- mid_ptr += 128 ;
570- src += PXSTRIDE (src_stride );
571- } while (-- tmp_h );
599+ int y = my >> 10 ;
600+ int16_t * mid1 = & mid [(y & 1 ) * 128 ];
601+ int16_t * mid2 = & mid [((y + 1 ) & 1 ) * 128 ];
602+ int dmy = my & 0x3ff ;
603+
604+ while (in_y < y ) {
605+ int imx = mx , ioff = 0 ;
606+ int16_t * mid_ptr = & mid [(in_y & 1 ) * 128 ];
607+
608+ for (x = 0 ; x < w ; x ++ ) {
609+ mid_ptr [x ] = FILTER_BILIN_RND (src , ioff , imx >> 6 , 1 ,
610+ 4 - intermediate_bits );
611+ imx += dx ;
612+ ioff += imx >> 10 ;
613+ imx &= 0x3ff ;
614+ }
572615
573- mid_ptr = mid ;
574- do {
575- int x ;
616+ src += PXSTRIDE ( src_stride ) ;
617+ in_y ++ ;
618+ }
576619
577620 for (x = 0 ; x < w ; x ++ )
578- tmp [x ] = FILTER_BILIN_RND ( mid_ptr , x , my >> 6 , 128 , 4 ) - PREP_BIAS ;
621+ tmp [x ] = FILTER_BILIN_RND2 ( mid1 , mid2 , x , dmy >> 6 , 4 ) - PREP_BIAS ;
579622
580623 my += dy ;
581- mid_ptr += (my >> 10 ) * 128 ;
582- my &= 0x3ff ;
583624 tmp += w ;
584625 } while (-- h );
585626}
0 commit comments