@@ -819,12 +819,12 @@ void adm_decouple_avx2(AdmBuffer *buf, int w, int h, int stride,
819819 __m256 od_inv_64 = _mm256_mul_ps (inv_64 , _mm256_cvtepi32_ps (od ));
820820 __m256 rst_d_f = _mm256_mul_ps (kd_inv_32768 , od_inv_64 );
821821
822- __m256i gt0_rst_h_f = ( __m256i ) (_mm256_cmp_ps (rst_h_f , _mm256_setzero_ps (), 14 ));
823- __m256i lt0_rst_h_f = ( __m256i ) (_mm256_cmp_ps (rst_h_f , _mm256_setzero_ps (), 1 ));
824- __m256i gt0_rst_v_f = ( __m256i ) (_mm256_cmp_ps (rst_v_f , _mm256_setzero_ps (), 14 ));
825- __m256i lt0_rst_v_f = ( __m256i ) (_mm256_cmp_ps (rst_v_f , _mm256_setzero_ps (), 1 ));
826- __m256i gt0_rst_d_f = ( __m256i ) (_mm256_cmp_ps (rst_d_f , _mm256_setzero_ps (), 14 ));
827- __m256i lt0_rst_d_f = ( __m256i ) (_mm256_cmp_ps (rst_d_f , _mm256_setzero_ps (), 1 ));
822+ __m256i gt0_rst_h_f = _mm256_castps_si256 (_mm256_cmp_ps (rst_h_f , _mm256_setzero_ps (), 14 ));
823+ __m256i lt0_rst_h_f = _mm256_castps_si256 (_mm256_cmp_ps (rst_h_f , _mm256_setzero_ps (), 1 ));
824+ __m256i gt0_rst_v_f = _mm256_castps_si256 (_mm256_cmp_ps (rst_v_f , _mm256_setzero_ps (), 14 ));
825+ __m256i lt0_rst_v_f = _mm256_castps_si256 (_mm256_cmp_ps (rst_v_f , _mm256_setzero_ps (), 1 ));
826+ __m256i gt0_rst_d_f = _mm256_castps_si256 (_mm256_cmp_ps (rst_d_f , _mm256_setzero_ps (), 14 ));
827+ __m256i lt0_rst_d_f = _mm256_castps_si256 (_mm256_cmp_ps (rst_d_f , _mm256_setzero_ps (), 1 ));
828828
829829 __m256i mask_min_max_h = _mm256_or_si256 (gt0_rst_h_f , lt0_rst_h_f );
830830 __m256i mask_min_max_v = _mm256_or_si256 (gt0_rst_v_f , lt0_rst_v_f );
@@ -834,7 +834,7 @@ void adm_decouple_avx2(AdmBuffer *buf, int w, int h, int stride,
834834 __m256i mask_rst_v = _mm256_and_si256 (mask_min_max_v , angle_flag );
835835 __m256i mask_rst_d = _mm256_and_si256 (mask_min_max_d , angle_flag );
836836
837- __m256d adm_gain_d = _mm256_set1_pd (adm_enhn_gain_limit );
837+ __m256d adm_gain_d = _mm256_set1_pd (adm_enhn_gain_limit );
838838 __m256d rst_h_gainlo_d = _mm256_mul_pd (_mm256_cvtepi32_pd (_mm256_extractf128_si256 (rst_h , 0 )), adm_gain_d );
839839 __m256d rst_h_gainhi_d = _mm256_mul_pd (_mm256_cvtepi32_pd (_mm256_extractf128_si256 (rst_h , 1 )), adm_gain_d );
840840 __m256i rst_h_gain = _mm256_insertf128_si256 (_mm256_castsi128_si256 (_mm256_cvtpd_epi32 (rst_h_gainlo_d )), _mm256_cvtpd_epi32 (rst_h_gainhi_d ),1 );
@@ -2088,15 +2088,15 @@ float adm_cm_avx2(AdmBuffer *buf, int w, int h, int src_stride, int csf_a_stride
20882088 }
20892089 accum_inner_h_lo_256 = _mm256_add_epi64 (accum_inner_h_lo_256 , accum_inner_h_hi_256 );
20902090 __m128i r2_h = _mm_add_epi64 (_mm256_castsi256_si128 (accum_inner_h_lo_256 ), _mm256_extracti128_si256 (accum_inner_h_lo_256 , 1 ));
2091- int64_t res_h = r2_h [ 0 ] + r2_h [ 1 ] ;
2091+ int64_t res_h = _mm_extract_epi64 ( r2_h , 0 ) + _mm_extract_epi64 ( r2_h , 1 ) ;
20922092
20932093 accum_inner_v_lo_256 = _mm256_add_epi64 (accum_inner_v_lo_256 , accum_inner_v_hi_256 );
20942094 __m128i r2_v = _mm_add_epi64 (_mm256_castsi256_si128 (accum_inner_v_lo_256 ), _mm256_extracti128_si256 (accum_inner_v_lo_256 , 1 ));
2095- int64_t res_v = r2_v [ 0 ] + r2_v [ 1 ] ;
2095+ int64_t res_v = _mm_extract_epi64 ( r2_v , 0 ) + _mm_extract_epi64 ( r2_v , 1 ) ;
20962096
20972097 accum_inner_d_lo_256 = _mm256_add_epi64 (accum_inner_d_lo_256 , accum_inner_d_hi_256 );
20982098 __m128i r2_d = _mm_add_epi64 (_mm256_castsi256_si128 (accum_inner_d_lo_256 ), _mm256_extracti128_si256 (accum_inner_d_lo_256 , 1 ));
2099- int64_t res_d = r2_d [ 0 ] + r2_d [ 1 ] ;
2099+ int64_t res_d = _mm_extract_epi64 ( r2_d , 0 ) + _mm_extract_epi64 ( r2_d , 1 ) ;
21002100
21012101 for (j = end_col_mod6 ; j < end_col ; ++ j ) {
21022102 xh = src -> band_h [i * src_stride + j ] * i_rfactor [0 ];
@@ -2521,13 +2521,13 @@ float i4_adm_cm_avx2(AdmBuffer *buf, int w, int h, int src_stride, int csf_a_str
25212521 }
25222522
25232523 __m128i r2_h = _mm_add_epi64 (_mm256_castsi256_si128 (accum_inner_h_256 ), _mm256_extracti128_si256 (accum_inner_h_256 , 1 ));
2524- int64_t res_h = r2_h [ 0 ] + r2_h [ 1 ] ;
2524+ int64_t res_h = _mm_extract_epi64 ( r2_h , 0 ) + _mm_extract_epi64 ( r2_h , 1 ) ;
25252525
25262526 __m128i r2_v = _mm_add_epi64 (_mm256_castsi256_si128 (accum_inner_v_256 ), _mm256_extracti128_si256 (accum_inner_v_256 , 1 ));
2527- int64_t res_v = r2_v [ 0 ] + r2_v [ 1 ] ;
2527+ int64_t res_v = _mm_extract_epi64 ( r2_v , 0 ) + _mm_extract_epi64 ( r2_v , 1 ) ;
25282528
25292529 __m128i r2_d = _mm_add_epi64 (_mm256_castsi256_si128 (accum_inner_d_256 ), _mm256_extracti128_si256 (accum_inner_d_256 , 1 ));
2530- int64_t res_d = r2_d [ 0 ] + r2_d [ 1 ] ;
2530+ int64_t res_d = _mm_extract_epi64 ( r2_d , 0 ) + _mm_extract_epi64 ( r2_d , 1 ) ;
25312531
25322532 for (j = end_col_mod2 ; j < end_col ; ++ j )
25332533 {
@@ -3586,15 +3586,15 @@ float adm_csf_den_scale_avx2(const adm_dwt_band_t *src, int w, int h,
35863586
35873587 accum_inner_h_lo = _mm256_add_epi64 (accum_inner_h_lo , accum_inner_h_hi );
35883588 __m128i h_r2 = _mm_add_epi64 (_mm256_castsi256_si128 (accum_inner_h_lo ), _mm256_extracti128_si256 (accum_inner_h_lo , 1 ));
3589- uint64_t h_r1 = h_r2 [ 0 ] + h_r2 [ 1 ] ;
3589+ uint64_t h_r1 = _mm_extract_epi64 ( h_r2 , 0 ) + _mm_extract_epi64 ( h_r2 , 1 ) ;
35903590
35913591 accum_inner_v_lo = _mm256_add_epi64 (accum_inner_v_lo , accum_inner_v_hi );
35923592 __m128i v_r2 = _mm_add_epi64 (_mm256_castsi256_si128 (accum_inner_v_lo ), _mm256_extracti128_si256 (accum_inner_v_lo , 1 ));
3593- uint64_t v_r1 = v_r2 [ 0 ] + v_r2 [ 1 ] ;
3593+ uint64_t v_r1 = _mm_extract_epi64 ( v_r2 , 0 ) + _mm_extract_epi64 ( v_r2 , 1 ) ;
35943594
35953595 accum_inner_d_lo = _mm256_add_epi64 (accum_inner_d_lo , accum_inner_d_hi );
35963596 __m128i d_r2 = _mm_add_epi64 (_mm256_castsi256_si128 (accum_inner_d_lo ), _mm256_extracti128_si256 (accum_inner_d_lo , 1 ));
3597- uint64_t d_r1 = d_r2 [ 0 ] + d_r2 [ 1 ] ;
3597+ uint64_t d_r1 = _mm_extract_epi64 ( d_r2 , 0 ) + _mm_extract_epi64 ( d_r2 , 1 ) ;
35983598
35993599 for (int j = right_mod_8 ; j < right ; ++ j ) {
36003600 uint16_t h = (uint16_t )abs (src_h [j ]);
@@ -3992,13 +3992,13 @@ float adm_csf_den_s123_avx2(const i4_adm_dwt_band_t *src, int scale, int w, int
39923992 accum_inner_d_256 = _mm256_add_epi64 (accum_inner_d_256 , d_cu );
39933993 }
39943994 __m128i h_r2 = _mm_add_epi64 (_mm256_castsi256_si128 (accum_inner_h_256 ), _mm256_extracti128_si256 (accum_inner_h_256 , 1 ));
3995- uint64_t h_r1 = h_r2 [ 0 ] + h_r2 [ 1 ] ;
3995+ uint64_t h_r1 = _mm_extract_epi64 ( h_r2 , 0 ) + _mm_extract_epi64 ( h_r2 , 1 ) ;
39963996
39973997 __m128i d_r2 = _mm_add_epi64 (_mm256_castsi256_si128 (accum_inner_d_256 ), _mm256_extracti128_si256 (accum_inner_d_256 , 1 ));
3998- uint64_t d_r1 = d_r2 [ 0 ] + d_r2 [ 1 ] ;
3998+ uint64_t d_r1 = _mm_extract_epi64 ( d_r2 , 0 ) + _mm_extract_epi64 ( d_r2 , 1 ) ;
39993999
40004000 __m128i v_r2 = _mm_add_epi64 (_mm256_castsi256_si128 (accum_inner_v_256 ), _mm256_extracti128_si256 (accum_inner_v_256 , 1 ));
4001- uint64_t v_r1 = v_r2 [ 0 ] + v_r2 [ 1 ] ;
4001+ uint64_t v_r1 = _mm_extract_epi64 ( v_r2 , 0 ) + _mm_extract_epi64 ( v_r2 , 1 ) ;
40024002
40034003 for (int j = right_mod_4 ; j < right ; ++ j )
40044004 {
0 commit comments