Skip to content

Commit 1682b5f

Browse files
committed
Correct the pointer-alignment changes to rans4x32 AVX512
The O0 and O1 encoders had incorrect arguments to *storeu_epi16. The (1<<pc2)-1 is creating a bit-mask of which 16-bit quantities to store (it's the "_mask_" part of the function call), so we can't naively double the popcount used to build it; only the byte offset applied to the pointer needs doubling. The O1 decoder missed doubling the 'sp' increment when we're decoding a TF_SHIFT_O1_FAST data-stream.
1 parent 604924d commit 1682b5f

File tree

1 file changed

+14
-14
lines changed

1 file changed

+14
-14
lines changed

htscodecs/rANS_static32x16pr_avx512.c

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -222,20 +222,20 @@ unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in,
222222
SET512(xmax, SB);
223223

224224
uint16_t gt_mask1 = _mm512_cmpgt_epi32_mask(Rv1, xmax1);
225-
int pc1 = _mm_popcnt_u32(gt_mask1) * 2;
225+
int pc1 = _mm_popcnt_u32(gt_mask1);
226226
__m512i Rp1 = _mm512_and_si512(Rv1, _mm512_set1_epi32(0xffff));
227227
__m512i Rp2 = _mm512_and_si512(Rv2, _mm512_set1_epi32(0xffff));
228228
uint16_t gt_mask2 = _mm512_cmpgt_epi32_mask(Rv2, xmax2);
229229
SET512(SDv, SD);
230-
int pc2 = _mm_popcnt_u32(gt_mask2) * 2;
230+
int pc2 = _mm_popcnt_u32(gt_mask2);
231231

232232
Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1);
233233
Rp2 = _mm512_maskz_compress_epi32(gt_mask2, Rp2);
234234

235-
_mm512_mask_cvtepi32_storeu_epi16(ptr-pc2, pc2-1, Rp2);
236-
ptr -= pc2;
237-
_mm512_mask_cvtepi32_storeu_epi16(ptr-pc1, pc1-1, Rp1);
238-
ptr -= pc1;
235+
_mm512_mask_cvtepi32_storeu_epi16(ptr-pc2*2, (1<<pc2)-1, Rp2);
236+
ptr -= pc2*2;
237+
_mm512_mask_cvtepi32_storeu_epi16(ptr-pc1*2, (1<<pc1)-1, Rp1);
238+
ptr -= pc1*2;
239239

240240
SET512(rfv, SA);
241241
Rv1 = _mm512_mask_srli_epi32(Rv1, gt_mask1, Rv1, 16);
@@ -688,20 +688,20 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
688688
SET512x(xmax, x_max); // high latency
689689

690690
uint16_t gt_mask1 = _mm512_cmpgt_epi32_mask(Rv1, xmax1);
691-
int pc1 = _mm_popcnt_u32(gt_mask1) * 2;
691+
int pc1 = _mm_popcnt_u32(gt_mask1);
692692
__m512i Rp1 = _mm512_and_si512(Rv1, _mm512_set1_epi32(0xffff));
693693
__m512i Rp2 = _mm512_and_si512(Rv2, _mm512_set1_epi32(0xffff));
694694
uint16_t gt_mask2 = _mm512_cmpgt_epi32_mask(Rv2, xmax2);
695695
SET512x(SDv, cmpl_freq); // good
696-
int pc2 = _mm_popcnt_u32(gt_mask2) * 2;
696+
int pc2 = _mm_popcnt_u32(gt_mask2);
697697

698698
Rp1 = _mm512_maskz_compress_epi32(gt_mask1, Rp1);
699699
Rp2 = _mm512_maskz_compress_epi32(gt_mask2, Rp2);
700700

701-
_mm512_mask_cvtepi32_storeu_epi16(ptr-pc2, pc2-1, Rp2);
702-
ptr -= pc2;
703-
_mm512_mask_cvtepi32_storeu_epi16(ptr-pc1, pc1-1, Rp1);
704-
ptr -= pc1;
701+
_mm512_mask_cvtepi32_storeu_epi16(ptr-pc2*2, (1<<pc2)-1, Rp2);
702+
ptr -= pc2*2;
703+
_mm512_mask_cvtepi32_storeu_epi16(ptr-pc1*2, (1<<pc1)-1, Rp1);
704+
ptr -= pc1*2;
705705

706706
Rv1 = _mm512_mask_srli_epi32(Rv1, gt_mask1, Rv1, 16);
707707
Rv2 = _mm512_mask_srli_epi32(Rv2, gt_mask2, Rv2, 16);
@@ -1087,11 +1087,11 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
10871087

10881088
__m512i renorm_words1 = _mm512_cvtepu16_epi32
10891089
(_mm256_loadu_si256((const __m256i *)sp));
1090-
sp += _mm_popcnt_u32(_imask1);
1090+
sp += _mm_popcnt_u32(_imask1) * 2;
10911091

10921092
__m512i renorm_words2 = _mm512_cvtepu16_epi32
10931093
(_mm256_loadu_si256((const __m256i *)sp));
1094-
sp += _mm_popcnt_u32(_imask2);
1094+
sp += _mm_popcnt_u32(_imask2) * 2;
10951095

10961096
__m512i _renorm_vals1 =
10971097
_mm512_maskz_expand_epi32(_imask1, renorm_words1);

0 commit comments

Comments
 (0)