4343 _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr))
4444#endif // !_mm256_loadu2_m128i
4545
46- static void oapv_tx_part_avx (s16 * src , s16 * dst , int shift , int line )
46+ static void oapv_tx_avx (s16 * src , int shift1 , int shift2 , int line )
4747{
4848 __m256i v0 , v1 , v2 , v3 , v4 , v5 , v6 , v7 ;
49- __m256i d0 , d1 , d2 , d3 ;
49+ __m256i d0 , d1 , d2 , d3 , d4 , d5 ;
5050 __m256i coeff [8 ];
5151 coeff [0 ] = _mm256_set1_epi16 (64 );
5252 coeff [1 ] = _mm256_set_epi16 (64 , -64 , -64 , 64 , 64 , -64 , -64 , 64 , 64 , -64 , -64 , 64 , 64 , -64 , -64 , 64 );
@@ -56,7 +56,8 @@ static void oapv_tx_part_avx(s16 *src, s16 *dst, int shift, int line)
5656 coeff [5 ] = _mm256_set_epi16 (-75 , 18 , 89 , 50 , -50 , -89 , -18 , 75 , -75 , 18 , 89 , 50 , -50 , -89 , -18 , 75 );
5757 coeff [6 ] = _mm256_set_epi16 (-50 , 89 , -18 , -75 , 75 , 18 , -89 , 50 , -50 , 89 , -18 , -75 , 75 , 18 , -89 , 50 );
5858 coeff [7 ] = _mm256_set_epi16 (-18 , 50 , -75 , 89 , -89 , 75 , -50 , 18 , -18 , 50 , -75 , 89 , -89 , 75 , -50 , 18 );
59- __m256i add = _mm256_set1_epi32 (1 << (shift - 1 ));
59+ __m256i add1 = _mm256_set1_epi32 (1 << (shift1 - 1 ));
60+ __m256i add2 = _mm256_set1_epi32 (1 << (shift2 - 1 ));
6061
6162 __m256i s0 , s1 , s2 , s3 ;
6263
@@ -67,38 +68,63 @@ static void oapv_tx_part_avx(s16 *src, s16 *dst, int shift, int line)
6768
6869 CALCU_2x8 (coeff [0 ], coeff [4 ], d0 , d1 );
6970 CALCU_2x8 (coeff [2 ], coeff [5 ], d2 , d3 );
70- CALCU_2x8_ADD_SHIFT (d0 , d1 , d2 , d3 , add , shift )
71+ CALCU_2x8_ADD_SHIFT (d0 , d1 , d2 , d3 , add1 , shift1 );
7172
72- d0 = _mm256_packs_epi32 (d0 , d1 );
73+ d0 = _mm256_packs_epi32 (d0 , d1 );
74+ d1 = _mm256_packs_epi32 (d2 , d3 );
75+
76+ d0 = _mm256_permute4x64_epi64 (d0 , 0xd8 );
77+ d1 = _mm256_permute4x64_epi64 (d1 , 0xd8 );
78+
79+ CALCU_2x8 (coeff [1 ], coeff [6 ], d2 , d3 );
80+ CALCU_2x8 (coeff [3 ], coeff [7 ], d4 , d5 );
81+ CALCU_2x8_ADD_SHIFT (d2 , d3 , d4 , d5 , add1 , shift1 );
82+
83+ d2 = _mm256_packs_epi32 (d2 , d3 );
84+ d3 = _mm256_packs_epi32 (d4 , d5 );
85+
86+ d2 = _mm256_permute4x64_epi64 (d2 , 0xd8 );
87+ d3 = _mm256_permute4x64_epi64 (d3 , 0xd8 );
88+
89+ s0 = _mm256_setr_m128i (_mm256_castsi256_si128 (d0 ), _mm256_castsi256_si128 (d2 ));
90+ s1 = _mm256_setr_m128i (_mm256_extracti128_si256 (d0 , 1 ), _mm256_extracti128_si256 (d2 , 1 ));
91+ s2 = _mm256_setr_m128i (_mm256_castsi256_si128 (d1 ), _mm256_castsi256_si128 (d3 ));
92+ s3 = _mm256_setr_m128i (_mm256_extracti128_si256 (d1 , 1 ), _mm256_extracti128_si256 (d3 , 1 ));
93+
94+ CALCU_2x8 (coeff [0 ], coeff [4 ], d0 , d1 );
95+ CALCU_2x8 (coeff [2 ], coeff [5 ], d2 , d3 );
96+ CALCU_2x8_ADD_SHIFT (d0 , d1 , d2 , d3 , add2 , shift2 )
97+
98+ d0 = _mm256_packs_epi32 (d0 , d1 );
7399 d1 = _mm256_packs_epi32 (d2 , d3 );
74100
75101 d0 = _mm256_permute4x64_epi64 (d0 , 0xd8 );
76102 d1 = _mm256_permute4x64_epi64 (d1 , 0xd8 );
77103
78- _mm_store_si128 ((__m128i * )dst , _mm256_castsi256_si128 (d0 ));
79- _mm_store_si128 ((__m128i * )(dst + 1 * line ), _mm256_extracti128_si256 (d0 , 1 ));
80- _mm_store_si128 ((__m128i * )(dst + 2 * line ), _mm256_castsi256_si128 (d1 ));
81- _mm_store_si128 ((__m128i * )(dst + 3 * line ), _mm256_extracti128_si256 (d1 , 1 ));
104+ _mm_store_si128 ((__m128i * )src , _mm256_castsi256_si128 (d0 ));
105+ _mm_store_si128 ((__m128i * )(src + 1 * line ), _mm256_extracti128_si256 (d0 , 1 ));
106+ _mm_store_si128 ((__m128i * )(src + 2 * line ), _mm256_castsi256_si128 (d1 ));
107+ _mm_store_si128 ((__m128i * )(src + 3 * line ), _mm256_extracti128_si256 (d1 , 1 ));
82108
83109 CALCU_2x8 (coeff [1 ], coeff [6 ], d0 , d1 );
84110 CALCU_2x8 (coeff [3 ], coeff [7 ], d2 , d3 );
85- CALCU_2x8_ADD_SHIFT (d0 , d1 , d2 , d3 , add , shift );
111+ CALCU_2x8_ADD_SHIFT (d0 , d1 , d2 , d3 , add2 , shift2 );
86112
87113 d0 = _mm256_packs_epi32 (d0 , d1 );
88114 d1 = _mm256_packs_epi32 (d2 , d3 );
89115
90116 d0 = _mm256_permute4x64_epi64 (d0 , 0xd8 );
91117 d1 = _mm256_permute4x64_epi64 (d1 , 0xd8 );
92118
93- _mm_store_si128 ((__m128i * )(dst + 4 * line ), _mm256_castsi256_si128 (d0 ));
94- _mm_store_si128 ((__m128i * )(dst + 5 * line ), _mm256_extracti128_si256 (d0 , 1 ));
95- _mm_store_si128 ((__m128i * )(dst + 6 * line ), _mm256_castsi256_si128 (d1 ));
96- _mm_store_si128 ((__m128i * )(dst + 7 * line ), _mm256_extracti128_si256 (d1 , 1 ));
119+ _mm_store_si128 ((__m128i * )(src + 4 * line ), _mm256_castsi256_si128 (d0 ));
120+ _mm_store_si128 ((__m128i * )(src + 5 * line ), _mm256_extracti128_si256 (d0 , 1 ));
121+ _mm_store_si128 ((__m128i * )(src + 6 * line ), _mm256_castsi256_si128 (d1 ));
122+ _mm_store_si128 ((__m128i * )(src + 7 * line ), _mm256_extracti128_si256 (d1 , 1 ));
97123}
98124
99125const oapv_fn_tx_t oapv_tbl_fn_txb_avx [2 ] =
100126{
101- oapv_tx_part_avx ,
127+ oapv_tx_avx ,
102128 NULL
103129};
104130
0 commit comments