@@ -33,8 +33,10 @@ static inline __m512i loadu_epi16_avx512(const void* mem_address)
3333static void
3434compress_prb_avx512 (compressed_prb& c_prb, const int16_t * uncompr_samples, uint8_t exponent, unsigned data_width)
3535{
36+ const __mmask32 load_mask = 0x00ffffff ;
37+
3638 // Load from memory.
37- __m512i rb_epi16 = loadu_epi16_avx512 ( uncompr_samples);
39+ __m512i rb_epi16 = _mm512_maskz_loadu_epi16 (load_mask, uncompr_samples);
3840
3941 // Apply exponent (compress).
4042 __m512i rb_shifted_epi16 = _mm512_srai_epi16 (rb_epi16, exponent);
@@ -102,10 +104,10 @@ void iq_compression_bfp_avx512::compress(span<compressed_prb> output,
102104 }
103105
104106 // Process the remaining PRBs (one PRB at a time),
105- // except the last one - to avoid reading behind the input data memory.
106- for ( size_t rb_index_end = output. size () - 1 ; rb != rb_index_end; ++rb) {
107- const __m512i AVX512_ZERO = _mm512_set1_epi16 ( 0 ) ;
108- __m512i rb_epi16 = loadu_epi16_avx512 ( &input_quantized[sample_idx]);
107+ for ( size_t rb_index_end = output. size (); rb != rb_index_end; ++rb) {
108+ const __m512i AVX512_ZERO = _mm512_set1_epi16 ( 0 );
109+ const __mmask32 load_mask = 0x00ffffff ;
110+ __m512i rb_epi16 = _mm512_maskz_loadu_epi16 (load_mask, &input_quantized[sample_idx]);
109111
110112 // Determine BFP exponent and extract it from the first byte of the first 128bit lane.
111113 __m512i exp_epu32 = mm512::determine_bfp_exponent (rb_epi16, AVX512_ZERO, AVX512_ZERO, params.data_width );
@@ -118,13 +120,6 @@ void iq_compression_bfp_avx512::compress(span<compressed_prb> output,
118120
119121 sample_idx += NOF_SAMPLES_PER_PRB;
120122 }
121-
122- // Use generic implementation for the remaining resource blocks.
123- for (size_t rb_index_end = output.size (); rb != rb_index_end; ++rb) {
124- const auto * start_it = input_quantized.begin () + sample_idx;
125- compress_prb_generic (output[rb], {start_it, NOF_SAMPLES_PER_PRB}, params.data_width );
126- sample_idx += NOF_SAMPLES_PER_PRB;
127- }
128123}
129124
130125void iq_compression_bfp_avx512::decompress (span<cf_t > output,
0 commit comments