Skip to content

Commit 46b5134

Browse files
committed
1 parent 9a2ca9b commit 46b5134

File tree

4 files changed

+34
-33
lines changed

4 files changed

+34
-33
lines changed

libvmaf/src/feature/x86/adm_avx2.c

Lines changed: 19 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -819,12 +819,12 @@ void adm_decouple_avx2(AdmBuffer *buf, int w, int h, int stride,
819819
__m256 od_inv_64 = _mm256_mul_ps(inv_64, _mm256_cvtepi32_ps(od));
820820
__m256 rst_d_f = _mm256_mul_ps(kd_inv_32768, od_inv_64);
821821

822-
__m256i gt0_rst_h_f = (__m256i)(_mm256_cmp_ps(rst_h_f, _mm256_setzero_ps(), 14));
823-
__m256i lt0_rst_h_f = (__m256i)(_mm256_cmp_ps(rst_h_f, _mm256_setzero_ps(), 1));
824-
__m256i gt0_rst_v_f = (__m256i)(_mm256_cmp_ps(rst_v_f, _mm256_setzero_ps(), 14));
825-
__m256i lt0_rst_v_f = (__m256i)(_mm256_cmp_ps(rst_v_f, _mm256_setzero_ps(), 1));
826-
__m256i gt0_rst_d_f = (__m256i)(_mm256_cmp_ps(rst_d_f, _mm256_setzero_ps(), 14));
827-
__m256i lt0_rst_d_f = (__m256i)(_mm256_cmp_ps(rst_d_f, _mm256_setzero_ps(), 1));
822+
__m256i gt0_rst_h_f = _mm256_castps_si256(_mm256_cmp_ps(rst_h_f, _mm256_setzero_ps(), 14));
823+
__m256i lt0_rst_h_f = _mm256_castps_si256(_mm256_cmp_ps(rst_h_f, _mm256_setzero_ps(), 1));
824+
__m256i gt0_rst_v_f = _mm256_castps_si256(_mm256_cmp_ps(rst_v_f, _mm256_setzero_ps(), 14));
825+
__m256i lt0_rst_v_f = _mm256_castps_si256(_mm256_cmp_ps(rst_v_f, _mm256_setzero_ps(), 1));
826+
__m256i gt0_rst_d_f = _mm256_castps_si256(_mm256_cmp_ps(rst_d_f, _mm256_setzero_ps(), 14));
827+
__m256i lt0_rst_d_f = _mm256_castps_si256(_mm256_cmp_ps(rst_d_f, _mm256_setzero_ps(), 1));
828828

829829
__m256i mask_min_max_h = _mm256_or_si256(gt0_rst_h_f, lt0_rst_h_f);
830830
__m256i mask_min_max_v = _mm256_or_si256(gt0_rst_v_f, lt0_rst_v_f);
@@ -834,7 +834,7 @@ void adm_decouple_avx2(AdmBuffer *buf, int w, int h, int stride,
834834
__m256i mask_rst_v = _mm256_and_si256(mask_min_max_v, angle_flag);
835835
__m256i mask_rst_d = _mm256_and_si256(mask_min_max_d, angle_flag);
836836

837-
__m256d adm_gain_d = _mm256_set1_pd(adm_enhn_gain_limit);
837+
__m256d adm_gain_d = _mm256_set1_pd(adm_enhn_gain_limit);
838838
__m256d rst_h_gainlo_d = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_extractf128_si256(rst_h, 0)), adm_gain_d);
839839
__m256d rst_h_gainhi_d = _mm256_mul_pd(_mm256_cvtepi32_pd(_mm256_extractf128_si256(rst_h, 1)), adm_gain_d);
840840
__m256i rst_h_gain = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm256_cvtpd_epi32(rst_h_gainlo_d)), _mm256_cvtpd_epi32(rst_h_gainhi_d),1);
@@ -2088,15 +2088,15 @@ float adm_cm_avx2(AdmBuffer *buf, int w, int h, int src_stride, int csf_a_stride
20882088
}
20892089
accum_inner_h_lo_256 = _mm256_add_epi64(accum_inner_h_lo_256, accum_inner_h_hi_256);
20902090
__m128i r2_h = _mm_add_epi64(_mm256_castsi256_si128(accum_inner_h_lo_256), _mm256_extracti128_si256(accum_inner_h_lo_256, 1));
2091-
int64_t res_h = r2_h[0] + r2_h[1];
2091+
int64_t res_h = _mm_extract_epi64(r2_h, 0) + _mm_extract_epi64(r2_h, 1);
20922092

20932093
accum_inner_v_lo_256 = _mm256_add_epi64(accum_inner_v_lo_256, accum_inner_v_hi_256);
20942094
__m128i r2_v = _mm_add_epi64(_mm256_castsi256_si128(accum_inner_v_lo_256), _mm256_extracti128_si256(accum_inner_v_lo_256, 1));
2095-
int64_t res_v = r2_v[0] + r2_v[1];
2095+
int64_t res_v = _mm_extract_epi64(r2_v, 0) + _mm_extract_epi64(r2_v, 1);
20962096

20972097
accum_inner_d_lo_256 = _mm256_add_epi64(accum_inner_d_lo_256, accum_inner_d_hi_256);
20982098
__m128i r2_d = _mm_add_epi64(_mm256_castsi256_si128(accum_inner_d_lo_256), _mm256_extracti128_si256(accum_inner_d_lo_256, 1));
2099-
int64_t res_d = r2_d[0] + r2_d[1];
2099+
int64_t res_d = _mm_extract_epi64(r2_d, 0) + _mm_extract_epi64(r2_d, 1);
21002100

21012101
for (j = end_col_mod6; j < end_col; ++j) {
21022102
xh = src->band_h[i * src_stride + j] * i_rfactor[0];
@@ -2521,13 +2521,13 @@ float i4_adm_cm_avx2(AdmBuffer *buf, int w, int h, int src_stride, int csf_a_str
25212521
}
25222522

25232523
__m128i r2_h = _mm_add_epi64(_mm256_castsi256_si128(accum_inner_h_256), _mm256_extracti128_si256(accum_inner_h_256, 1));
2524-
int64_t res_h = r2_h[0] + r2_h[1];
2524+
int64_t res_h = _mm_extract_epi64(r2_h, 0) + _mm_extract_epi64(r2_h, 1);
25252525

25262526
__m128i r2_v = _mm_add_epi64(_mm256_castsi256_si128(accum_inner_v_256), _mm256_extracti128_si256(accum_inner_v_256, 1));
2527-
int64_t res_v = r2_v[0] + r2_v[1];
2527+
int64_t res_v = _mm_extract_epi64(r2_v, 0) + _mm_extract_epi64(r2_v, 1);
25282528

25292529
__m128i r2_d = _mm_add_epi64(_mm256_castsi256_si128(accum_inner_d_256), _mm256_extracti128_si256(accum_inner_d_256, 1));
2530-
int64_t res_d = r2_d[0] + r2_d[1];
2530+
int64_t res_d = _mm_extract_epi64(r2_d, 0) + _mm_extract_epi64(r2_d, 1);
25312531

25322532
for (j = end_col_mod2; j < end_col; ++j)
25332533
{
@@ -3586,15 +3586,15 @@ float adm_csf_den_scale_avx2(const adm_dwt_band_t *src, int w, int h,
35863586

35873587
accum_inner_h_lo = _mm256_add_epi64(accum_inner_h_lo, accum_inner_h_hi);
35883588
__m128i h_r2 = _mm_add_epi64(_mm256_castsi256_si128(accum_inner_h_lo), _mm256_extracti128_si256(accum_inner_h_lo, 1));
3589-
uint64_t h_r1 = h_r2[0] + h_r2[1];
3589+
uint64_t h_r1 = _mm_extract_epi64(h_r2, 0) + _mm_extract_epi64(h_r2, 1);
35903590

35913591
accum_inner_v_lo = _mm256_add_epi64(accum_inner_v_lo, accum_inner_v_hi);
35923592
__m128i v_r2 = _mm_add_epi64(_mm256_castsi256_si128(accum_inner_v_lo), _mm256_extracti128_si256(accum_inner_v_lo, 1));
3593-
uint64_t v_r1 = v_r2[0] + v_r2[1];
3593+
uint64_t v_r1 = _mm_extract_epi64(v_r2, 0) + _mm_extract_epi64(v_r2, 1);
35943594

35953595
accum_inner_d_lo = _mm256_add_epi64(accum_inner_d_lo, accum_inner_d_hi);
35963596
__m128i d_r2 = _mm_add_epi64(_mm256_castsi256_si128(accum_inner_d_lo), _mm256_extracti128_si256(accum_inner_d_lo, 1));
3597-
uint64_t d_r1 = d_r2[0] + d_r2[1];
3597+
uint64_t d_r1 = _mm_extract_epi64(d_r2, 0) + _mm_extract_epi64(d_r2, 1);
35983598

35993599
for (int j = right_mod_8; j < right; ++j) {
36003600
uint16_t h = (uint16_t)abs(src_h[j]);
@@ -3992,13 +3992,13 @@ float adm_csf_den_s123_avx2(const i4_adm_dwt_band_t *src, int scale, int w, int
39923992
accum_inner_d_256 = _mm256_add_epi64(accum_inner_d_256, d_cu);
39933993
}
39943994
__m128i h_r2 = _mm_add_epi64(_mm256_castsi256_si128(accum_inner_h_256), _mm256_extracti128_si256(accum_inner_h_256, 1));
3995-
uint64_t h_r1 = h_r2[0] + h_r2[1];
3995+
uint64_t h_r1 = _mm_extract_epi64(h_r2, 0) + _mm_extract_epi64(h_r2, 1);
39963996

39973997
__m128i d_r2 = _mm_add_epi64(_mm256_castsi256_si128(accum_inner_d_256), _mm256_extracti128_si256(accum_inner_d_256, 1));
3998-
uint64_t d_r1 = d_r2[0] + d_r2[1];
3998+
uint64_t d_r1 = _mm_extract_epi64(d_r2, 0) + _mm_extract_epi64(d_r2, 1);
39993999

40004000
__m128i v_r2 = _mm_add_epi64(_mm256_castsi256_si128(accum_inner_v_256), _mm256_extracti128_si256(accum_inner_v_256, 1));
4001-
uint64_t v_r1 = v_r2[0] + v_r2[1];
4001+
uint64_t v_r1 = _mm_extract_epi64(v_r2, 0) + _mm_extract_epi64(v_r2, 1);
40024002

40034003
for (int j = right_mod_4; j < right; ++j)
40044004
{

libvmaf/src/feature/x86/adm_avx512.c

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1746,17 +1746,17 @@ float adm_cm_avx512(AdmBuffer *buf, int w, int h, int src_stride, int csf_a_stri
17461746
accum_inner_h_lo_512 = _mm512_add_epi64(accum_inner_h_lo_512, accum_inner_h_hi_512);
17471747
__m256i r4_h = _mm256_add_epi64(_mm512_castsi512_si256(accum_inner_h_lo_512), _mm512_extracti64x4_epi64(accum_inner_h_lo_512, 1));
17481748
__m128i r2_h = _mm_add_epi64(_mm256_castsi256_si128(r4_h), _mm256_extracti128_si256(r4_h, 1));
1749-
int64_t res_h = r2_h[0] + r2_h[1];
1749+
int64_t res_h = _mm_extract_epi64(r2_h, 0) + _mm_extract_epi64(r2_h, 1);
17501750

17511751
accum_inner_v_lo_512 = _mm512_add_epi64(accum_inner_v_lo_512, accum_inner_v_hi_512);
17521752
__m256i r4_v = _mm256_add_epi64(_mm512_castsi512_si256(accum_inner_v_lo_512), _mm512_extracti64x4_epi64(accum_inner_v_lo_512, 1));
17531753
__m128i r2_v = _mm_add_epi64(_mm256_castsi256_si128(r4_v), _mm256_extracti128_si256(r4_v, 1));
1754-
int64_t res_v = r2_v[0] + r2_v[1];
1754+
int64_t res_v = _mm_extract_epi64(r2_v, 0) + _mm_extract_epi64(r2_v, 1);
17551755

17561756
accum_inner_d_lo_512 = _mm512_add_epi64(accum_inner_d_lo_512, accum_inner_d_hi_512);
17571757
__m256i r4_d = _mm256_add_epi64(_mm512_castsi512_si256(accum_inner_d_lo_512), _mm512_extracti64x4_epi64(accum_inner_d_lo_512, 1));
17581758
__m128i r2_d = _mm_add_epi64(_mm256_castsi256_si128(r4_d), _mm256_extracti128_si256(r4_d, 1));
1759-
int64_t res_d = r2_d[0] + r2_d[1];
1759+
int64_t res_d = _mm_extract_epi64(r2_d, 0) + _mm_extract_epi64(r2_d, 1);
17601760

17611761
for (j = end_col_mod14; j < end_col; ++j) {
17621762
xh = src->band_h[i * src_stride + j] * i_rfactor[0];
@@ -2168,15 +2168,15 @@ float i4_adm_cm_avx512(AdmBuffer *buf, int w, int h, int src_stride, int csf_a_s
21682168

21692169
__m256i r4_h = _mm256_add_epi64(_mm512_castsi512_si256(accum_inner_h_512), _mm512_extracti64x4_epi64(accum_inner_h_512, 1));
21702170
__m128i r2_h = _mm_add_epi64(_mm256_castsi256_si128(r4_h), _mm256_extracti128_si256(r4_h, 1));
2171-
int64_t res_h = r2_h[0] + r2_h[1];
2171+
int64_t res_h = _mm_extract_epi64(r2_h, 0) + _mm_extract_epi64(r2_h, 1);
21722172

21732173
__m256i r4_v = _mm256_add_epi64(_mm512_castsi512_si256(accum_inner_v_512), _mm512_extracti64x4_epi64(accum_inner_v_512, 1));
21742174
__m128i r2_v = _mm_add_epi64(_mm256_castsi256_si128(r4_v), _mm256_extracti128_si256(r4_v, 1));
2175-
int64_t res_v = r2_v[0] + r2_v[1];
2175+
int64_t res_v = _mm_extract_epi64(r2_v, 0) + _mm_extract_epi64(r2_v, 1);
21762176

21772177
__m256i r4_d = _mm256_add_epi64(_mm512_castsi512_si256(accum_inner_d_512), _mm512_extracti64x4_epi64(accum_inner_d_512, 1));
21782178
__m128i r2_d = _mm_add_epi64(_mm256_castsi256_si128(r4_d), _mm256_extracti128_si256(r4_d, 1));
2179-
int64_t res_d = r2_d[0] + r2_d[1];
2179+
int64_t res_d = _mm_extract_epi64(r2_d, 0) + _mm_extract_epi64(r2_d, 1);
21802180

21812181
for (j = end_col_mod6; j < end_col; ++j)
21822182
{
@@ -3858,17 +3858,17 @@ float adm_csf_den_scale_avx512(const adm_dwt_band_t *src, int w, int h,
38583858
accum_inner_h_lo = _mm512_add_epi64(accum_inner_h_lo, accum_inner_h_hi);
38593859
__m256i h_r4 = _mm256_add_epi64(_mm512_castsi512_si256(accum_inner_h_lo), _mm512_extracti64x4_epi64(accum_inner_h_lo, 1));
38603860
__m128i h_r2 = _mm_add_epi64(_mm256_castsi256_si128(h_r4), _mm256_extracti64x2_epi64(h_r4, 1));
3861-
uint64_t h_r1 = h_r2[0] + h_r2[1];
3861+
uint64_t h_r1 = _mm_extract_epi64(h_r2, 0) + _mm_extract_epi64(h_r2, 1);
38623862

38633863
accum_inner_v_lo = _mm512_add_epi64(accum_inner_v_lo, accum_inner_v_hi);
38643864
__m256i v_r4 = _mm256_add_epi64(_mm512_castsi512_si256(accum_inner_v_lo), _mm512_extracti64x4_epi64(accum_inner_v_lo, 1));
38653865
__m128i v_r2 = _mm_add_epi64(_mm256_castsi256_si128(v_r4), _mm256_extracti64x2_epi64(v_r4, 1));
3866-
uint64_t v_r1 = v_r2[0] + v_r2[1];
3866+
uint64_t v_r1 = _mm_extract_epi64(v_r2, 0) + _mm_extract_epi64(v_r2, 1);
38673867

38683868
accum_inner_d_lo = _mm512_add_epi64(accum_inner_d_lo, accum_inner_d_hi);
38693869
__m256i d_r4 = _mm256_add_epi64(_mm512_castsi512_si256(accum_inner_d_lo), _mm512_extracti64x4_epi64(accum_inner_d_lo, 1));
38703870
__m128i d_r2 = _mm_add_epi64(_mm256_castsi256_si128(d_r4), _mm256_extracti64x2_epi64(d_r4, 1));
3871-
uint64_t d_r1 = d_r2[0] + d_r2[1];
3871+
uint64_t d_r1 = _mm_extract_epi64(d_r2, 0) + _mm_extract_epi64(d_r2, 1);
38723872

38733873
for (int j = right_mod_16; j < right; ++j) {
38743874
uint16_t h = (uint16_t)abs(src_h[j]);
@@ -3985,15 +3985,15 @@ float adm_csf_den_s123_avx512(const i4_adm_dwt_band_t *src, int scale, int w, in
39853985
}
39863986
__m256i h_r4 = _mm256_add_epi64(_mm512_castsi512_si256(accum_inner_h_512), _mm512_extracti64x4_epi64(accum_inner_h_512, 1));
39873987
__m128i h_r2 = _mm_add_epi64(_mm256_castsi256_si128(h_r4), _mm256_extracti64x2_epi64(h_r4, 1));
3988-
uint64_t h_r1 = h_r2[0] + h_r2[1];
3988+
uint64_t h_r1 = _mm_extract_epi64(h_r2, 0) + _mm_extract_epi64(h_r2, 1);
39893989

39903990
__m256i d_r4 = _mm256_add_epi64(_mm512_castsi512_si256(accum_inner_d_512), _mm512_extracti64x4_epi64(accum_inner_d_512, 1));
39913991
__m128i d_r2 = _mm_add_epi64(_mm256_castsi256_si128(d_r4), _mm256_extracti64x2_epi64(d_r4, 1));
3992-
uint64_t d_r1 = d_r2[0] + d_r2[1];
3992+
uint64_t d_r1 = _mm_extract_epi64(d_r2, 0) + _mm_extract_epi64(d_r2, 1);
39933993

39943994
__m256i v_r4 = _mm256_add_epi64(_mm512_castsi512_si256(accum_inner_v_512), _mm512_extracti64x4_epi64(accum_inner_v_512, 1));
39953995
__m128i v_r2 = _mm_add_epi64(_mm256_castsi256_si128(v_r4), _mm256_extracti64x2_epi64(v_r4, 1));
3996-
uint64_t v_r1 = v_r2[0] + v_r2[1];
3996+
uint64_t v_r1 = _mm_extract_epi64(v_r2, 0) + _mm_extract_epi64(v_r2, 1);
39973997

39983998
for (int j = right_mod_8; j < right; ++j)
39993999
{

libvmaf/src/feature/x86/motion_avx2.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -534,7 +534,8 @@ void sad_avx2(VmafPicture *pic_a, VmafPicture *pic_b, uint64_t *sad)
534534
a += (pic_a->stride[0] / 2);
535535
b += (pic_b->stride[0] / 2);
536536
}
537-
uint64_t r1 = final_accum[0] + final_accum[1] + final_accum[2] + final_accum[3];
537+
uint64_t r1 = _mm256_extract_epi64(final_accum, 0) + _mm256_extract_epi64(final_accum, 1) +
538+
_mm256_extract_epi64(final_accum, 2) + _mm256_extract_epi64(final_accum, 3);
538539

539540
*sad += r1;
540541
}

libvmaf/src/feature/x86/motion_avx512.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -450,7 +450,7 @@ void sad_avx512(VmafPicture *pic_a, VmafPicture *pic_b, uint64_t *sad)
450450
}
451451
__m256i r4 = _mm256_add_epi64(_mm512_castsi512_si256(final_accum), _mm512_extracti64x4_epi64(final_accum, 1));
452452
__m128i r2 = _mm_add_epi64(_mm256_castsi256_si128(r4), _mm256_extracti64x2_epi64(r4, 1));
453-
uint64_t r1 = r2[0] + r2[1];
453+
uint64_t r1 = _mm_extract_epi64(r2, 0) + _mm_extract_epi64(r2, 1);
454454

455455
*sad += r1;
456456

0 commit comments

Comments
 (0)