Skip to content

Commit 6443294

Browse files
committed
Unpack lo before hi
1 parent dfd51f3 commit 6443294

File tree

2 files changed

+9
-9
lines changed

2 files changed

+9
-9
lines changed

src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,14 @@ static inline void InnerProductStep(uint8_t *&pVect1, uint8_t *&pVect2, __m512i
1313
__m512i vb = _mm512_loadu_epi8(pVect2); // AVX512BW
1414
pVect2 += 64;
1515

16-
__m512i va_hi = _mm512_unpackhi_epi8(va, _mm512_setzero_si512()); // AVX512BW
17-
__m512i vb_hi = _mm512_unpackhi_epi8(vb, _mm512_setzero_si512());
18-
sum = _mm512_dpwssd_epi32(sum, va_hi, vb_hi);
19-
2016
__m512i va_lo = _mm512_unpacklo_epi8(va, _mm512_setzero_si512()); // AVX512BW
2117
__m512i vb_lo = _mm512_unpacklo_epi8(vb, _mm512_setzero_si512());
2218
sum = _mm512_dpwssd_epi32(sum, va_lo, vb_lo);
2319

20+
__m512i va_hi = _mm512_unpackhi_epi8(va, _mm512_setzero_si512()); // AVX512BW
21+
__m512i vb_hi = _mm512_unpackhi_epi8(vb, _mm512_setzero_si512());
22+
sum = _mm512_dpwssd_epi32(sum, va_hi, vb_hi);
23+
2424
// _mm512_dpwssd_epi32(src, a, b)
2525
// Multiply groups of 2 adjacent pairs of signed 16-bit integers in `a` with corresponding
2626
// 16-bit integers in `b`, producing 2 intermediate signed 32-bit results. Sum these 2 results

src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_UINT8.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,16 @@ static inline void L2SqrStep(uint8_t *&pVect1, uint8_t *&pVect2, __m512i &sum) {
1313
__m512i vb = _mm512_loadu_epi8(pVect2); // AVX512BW
1414
pVect2 += 64;
1515

16-
__m512i va_hi = _mm512_unpackhi_epi8(va, _mm512_setzero_si512()); // AVX512BW
17-
__m512i vb_hi = _mm512_unpackhi_epi8(vb, _mm512_setzero_si512());
18-
__m512i diff_hi = _mm512_sub_epi16(va_hi, vb_hi);
19-
sum = _mm512_dpwssd_epi32(sum, diff_hi, diff_hi);
20-
2116
__m512i va_lo = _mm512_unpacklo_epi8(va, _mm512_setzero_si512()); // AVX512BW
2217
__m512i vb_lo = _mm512_unpacklo_epi8(vb, _mm512_setzero_si512());
2318
__m512i diff_lo = _mm512_sub_epi16(va_lo, vb_lo);
2419
sum = _mm512_dpwssd_epi32(sum, diff_lo, diff_lo);
2520

21+
__m512i va_hi = _mm512_unpackhi_epi8(va, _mm512_setzero_si512()); // AVX512BW
22+
__m512i vb_hi = _mm512_unpackhi_epi8(vb, _mm512_setzero_si512());
23+
__m512i diff_hi = _mm512_sub_epi16(va_hi, vb_hi);
24+
sum = _mm512_dpwssd_epi32(sum, diff_hi, diff_hi);
25+
2626
// _mm512_dpwssd_epi32(src, a, b)
2727
// Multiply groups of 2 adjacent pairs of signed 16-bit integers in `a` with corresponding
2828
// 16-bit integers in `b`, producing 2 intermediate signed 32-bit results. Sum these 2 results

0 commit comments

Comments (0)