@@ -222,20 +222,20 @@ unsigned char *rans_compress_O0_32x16_avx512(unsigned char *in,
222222 SET512 (xmax , SB );
223223
224224 uint16_t gt_mask1 = _mm512_cmpgt_epi32_mask (Rv1 , xmax1 );
225- int pc1 = _mm_popcnt_u32 (gt_mask1 ) * 2 ;
225+ int pc1 = _mm_popcnt_u32 (gt_mask1 );
226226 __m512i Rp1 = _mm512_and_si512 (Rv1 , _mm512_set1_epi32 (0xffff ));
227227 __m512i Rp2 = _mm512_and_si512 (Rv2 , _mm512_set1_epi32 (0xffff ));
228228 uint16_t gt_mask2 = _mm512_cmpgt_epi32_mask (Rv2 , xmax2 );
229229 SET512 (SDv , SD );
230- int pc2 = _mm_popcnt_u32 (gt_mask2 ) * 2 ;
230+ int pc2 = _mm_popcnt_u32 (gt_mask2 );
231231
232232 Rp1 = _mm512_maskz_compress_epi32 (gt_mask1 , Rp1 );
233233 Rp2 = _mm512_maskz_compress_epi32 (gt_mask2 , Rp2 );
234234
235- _mm512_mask_cvtepi32_storeu_epi16 (ptr - pc2 , pc2 - 1 , Rp2 );
236- ptr -= pc2 ;
237- _mm512_mask_cvtepi32_storeu_epi16 (ptr - pc1 , pc1 - 1 , Rp1 );
238- ptr -= pc1 ;
235+ _mm512_mask_cvtepi32_storeu_epi16 (ptr - pc2 * 2 , ( 1 << pc2 ) - 1 , Rp2 );
236+ ptr -= pc2 * 2 ;
237+ _mm512_mask_cvtepi32_storeu_epi16 (ptr - pc1 * 2 , ( 1 << pc1 ) - 1 , Rp1 );
238+ ptr -= pc1 * 2 ;
239239
240240 SET512 (rfv , SA );
241241 Rv1 = _mm512_mask_srli_epi32 (Rv1 , gt_mask1 , Rv1 , 16 );
@@ -688,20 +688,20 @@ unsigned char *rans_compress_O1_32x16_avx512(unsigned char *in,
688688 SET512x (xmax , x_max ); // high latency
689689
690690 uint16_t gt_mask1 = _mm512_cmpgt_epi32_mask (Rv1 , xmax1 );
691- int pc1 = _mm_popcnt_u32 (gt_mask1 ) * 2 ;
691+ int pc1 = _mm_popcnt_u32 (gt_mask1 );
692692 __m512i Rp1 = _mm512_and_si512 (Rv1 , _mm512_set1_epi32 (0xffff ));
693693 __m512i Rp2 = _mm512_and_si512 (Rv2 , _mm512_set1_epi32 (0xffff ));
694694 uint16_t gt_mask2 = _mm512_cmpgt_epi32_mask (Rv2 , xmax2 );
695695 SET512x (SDv , cmpl_freq ); // good
696- int pc2 = _mm_popcnt_u32 (gt_mask2 ) * 2 ;
696+ int pc2 = _mm_popcnt_u32 (gt_mask2 );
697697
698698 Rp1 = _mm512_maskz_compress_epi32 (gt_mask1 , Rp1 );
699699 Rp2 = _mm512_maskz_compress_epi32 (gt_mask2 , Rp2 );
700700
701- _mm512_mask_cvtepi32_storeu_epi16 (ptr - pc2 , pc2 - 1 , Rp2 );
702- ptr -= pc2 ;
703- _mm512_mask_cvtepi32_storeu_epi16 (ptr - pc1 , pc1 - 1 , Rp1 );
704- ptr -= pc1 ;
701+ _mm512_mask_cvtepi32_storeu_epi16 (ptr - pc2 * 2 , ( 1 << pc2 ) - 1 , Rp2 );
702+ ptr -= pc2 * 2 ;
703+ _mm512_mask_cvtepi32_storeu_epi16 (ptr - pc1 * 2 , ( 1 << pc1 ) - 1 , Rp1 );
704+ ptr -= pc1 * 2 ;
705705
706706 Rv1 = _mm512_mask_srli_epi32 (Rv1 , gt_mask1 , Rv1 , 16 );
707707 Rv2 = _mm512_mask_srli_epi32 (Rv2 , gt_mask2 , Rv2 , 16 );
@@ -1087,11 +1087,11 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
10871087
10881088 __m512i renorm_words1 = _mm512_cvtepu16_epi32
10891089 (_mm256_loadu_si256 ((const __m256i * )sp ));
1090- sp += _mm_popcnt_u32 (_imask1 );
1090+ sp += _mm_popcnt_u32 (_imask1 ) * 2 ;
10911091
10921092 __m512i renorm_words2 = _mm512_cvtepu16_epi32
10931093 (_mm256_loadu_si256 ((const __m256i * )sp ));
1094- sp += _mm_popcnt_u32 (_imask2 );
1094+ sp += _mm_popcnt_u32 (_imask2 ) * 2 ;
10951095
10961096 __m512i _renorm_vals1 =
10971097 _mm512_maskz_expand_epi32 (_imask1 , renorm_words1 );
0 commit comments