@@ -532,18 +532,27 @@ const oapv_fn_dquant_t oapv_tbl_fn_dquant_avx[2] =
532532
533533void oapv_adjust_itrans_avx (int * src , int * dst , int itrans_diff_idx , int diff_step , int shift )
534534{
535- __m256i v0 = _mm256_set1_epi32 (diff_step );
536- __m256i v1 = _mm256_set1_epi32 (1 << (shift - 1 ));
537- __m256i s0 , s1 ;
535+ __m256i v0 = _mm256_set1_epi32 (( 1 << 16 ) | ( diff_step & 0xffff ) );
536+ __m256i v1 = _mm256_set1_epi16 (1 << (shift - 1 ));
537+ __m256i s0 , s1 , d , d0 , d1 ;
538538
539- for (int j = 0 ; j < 64 ; j += 8 ) {
539+ for (int j = 0 ; j < 64 ; j += 16 )
540+ {
540541 s0 = _mm256_loadu_si256 ((const __m256i * )(src + j ));
541- s1 = _mm256_loadu_si256 ((const __m256i * )(oapv_itrans_diff [itrans_diff_idx ] + j ));
542- s1 = _mm256_mullo_epi32 (s1 , v0 );
543- s1 = _mm256_add_epi32 (s1 , v1 );
544- s1 = _mm256_srai_epi32 (s1 , shift );
545- s1 = _mm256_add_epi32 (s0 , s1 );
546- _mm256_storeu_si256 ((__m256i * )(dst + j ), s1 );
542+ d = _mm256_loadu_si256 ((const __m256i * )(oapv_itrans_diff [itrans_diff_idx ] + j ));
543+
544+ d0 = _mm256_unpacklo_epi16 (d , v1 );
545+ d0 = _mm256_madd_epi16 (d0 , v0 );
546+ d0 = _mm256_srai_epi32 (d0 , shift );
547+ d0 = _mm256_add_epi32 (s0 , d0 );
548+ _mm256_storeu_si256 ((__m256i * )(dst + j ), d0 );
549+
550+ s1 = _mm256_loadu_si256 ((const __m256i * )(src + j + 8 ));
551+ d1 = _mm256_unpackhi_epi16 (d , v1 );
552+ d1 = _mm256_madd_epi16 (d1 , v0 );
553+ d1 = _mm256_srai_epi32 (d1 , shift );
554+ d1 = _mm256_add_epi32 (s1 , d1 );
555+ _mm256_storeu_si256 ((__m256i * )(dst + j + 8 ), d1 );
547556 }
548557}
549558
0 commit comments