@@ -13,16 +13,16 @@ static inline void L2SqrStep(uint8_t *&pVect1, uint8_t *&pVect2, __m512i &sum) {
1313 __m512i vb = _mm512_loadu_epi8 (pVect2); // AVX512BW
1414 pVect2 += 64 ;
1515
16- __m512i va_hi = _mm512_unpackhi_epi8 (va, _mm512_setzero_si512 ()); // AVX512BW
17- __m512i vb_hi = _mm512_unpackhi_epi8 (vb, _mm512_setzero_si512 ());
18- __m512i diff_hi = _mm512_sub_epi16 (va_hi, vb_hi);
19- sum = _mm512_dpwssd_epi32 (sum, diff_hi, diff_hi);
20-
2116 __m512i va_lo = _mm512_unpacklo_epi8 (va, _mm512_setzero_si512 ()); // AVX512BW
2217 __m512i vb_lo = _mm512_unpacklo_epi8 (vb, _mm512_setzero_si512 ());
2318 __m512i diff_lo = _mm512_sub_epi16 (va_lo, vb_lo);
2419 sum = _mm512_dpwssd_epi32 (sum, diff_lo, diff_lo);
2520
21+ __m512i va_hi = _mm512_unpackhi_epi8 (va, _mm512_setzero_si512 ()); // AVX512BW
22+ __m512i vb_hi = _mm512_unpackhi_epi8 (vb, _mm512_setzero_si512 ());
23+ __m512i diff_hi = _mm512_sub_epi16 (va_hi, vb_hi);
24+ sum = _mm512_dpwssd_epi32 (sum, diff_hi, diff_hi);
25+
2626 // _mm512_dpwssd_epi32(src, a, b)
2727 // Multiply groups of 2 adjacent pairs of signed 16-bit integers in `a` with corresponding
2828 // 16-bit integers in `b`, producing 2 intermediate signed 32-bit results. Sum these 2 results
0 commit comments