 
 #include "pseudo_random_generator_impl.h"
 #include "srsran/support/math_utils.h"
-#include <cassert>
-#include <cstring>
+#include "srsran/support/srsran_assert.h"
 
-#if HAVE_SSE
+#if __SSE3__
 #include <immintrin.h>
-#endif // HAVE_SSE
+#endif // __SSE3__
 
-#ifdef HAVE_NEON
+#ifdef __aarch64__
 #include <arm_neon.h>
-#endif
+#endif // __aarch64__
 
 using namespace srsran;
 
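Note on the guard change: HAVE_SSE and HAVE_NEON were build-system definitions, whereas __SSE3__ and __aarch64__ are predefined by the compiler for the active target, so the vector paths now follow the actual compilation flags (AArch64 mandates Advanced SIMD, so __aarch64__ implies NEON availability). A minimal, self-contained sketch of this detection idiom; the function name is illustrative, not part of this file:

// Illustrative dispatch on compiler-defined target macros.
inline const char* simd_backend()
{
#if defined(__SSE3__)
  return "sse3"; // defined by GCC/Clang when -msse3 or higher is enabled
#elif defined(__aarch64__)
  return "neon"; // Advanced SIMD is mandatory on AArch64
#else
  return "generic";
#endif
}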
@@ -118,7 +117,7 @@ void pseudo_random_generator_impl::generate(span<float> out, float value)
     uint32_t c = (uint32_t)(x1 ^ x2);
 
     uint32_t j = 0;
-#ifdef HAVE_SSE
+#ifdef __SSE3__
     for (; j < SEQUENCE_PAR_BITS - 3; j += 4) {
       // Preloads bits of interest in the 4 LSB
       __m128i mask = _mm_set1_epi32(c >> j);
@@ -140,8 +139,8 @@ void pseudo_random_generator_impl::generate(span<float> out, float value)
 
       _mm_storeu_ps(&out[i + j], v);
     }
-#endif
-#ifdef HAVE_NEON
+#endif // __SSE3__
+#ifdef __aarch64__
     for (; j < SEQUENCE_PAR_BITS - 3; j += 4) {
       // Preloads bits of interest in the 4 LSB
       int32x4_t mask_s32 = vdupq_n_s32(c >> j);
@@ -165,7 +164,7 @@ void pseudo_random_generator_impl::generate(span<float> out, float value)
 
       vst1q_f32(&out[i + j], v);
     }
-#endif // HAVE_NEON
+#endif // __aarch64__
     // Finish the parallel bits with generic code
     for (; j != SEQUENCE_PAR_BITS; ++j) {
       FLOAT_U32_XOR(out[i + j], value, (c << (31U - j)) & 0x80000000);
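The generic tail above relies on FLOAT_U32_XOR, which maps sequence bit j to +value or -value by XORing the float's sign bit; (c << (31U - j)) & 0x80000000 moves bit j into the sign-bit position. A scalar sketch of the same idiom, assuming only standard C++ (the helper name is illustrative):

#include <cstdint>
#include <cstring>

// Flip the sign bit of `value` when `sign_mask` is 0x80000000, producing
// +value or -value without a branch. std::memcpy is the well-defined way
// to reinterpret the float's bits.
inline float xor_sign(float value, uint32_t sign_mask)
{
  uint32_t bits;
  std::memcpy(&bits, &value, sizeof(bits));
  bits ^= sign_mask;
  std::memcpy(&value, &bits, sizeof(bits));
  return value;
}

// Usage, mirroring the loop above:
// out[i + j] = xor_sign(value, (c << (31U - j)) & 0x80000000);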
@@ -188,7 +187,10 @@ void pseudo_random_generator_impl::generate(span<float> out, float value)
 
 void pseudo_random_generator_impl::apply_xor(bit_buffer& out, const bit_buffer& in)
 {
-  assert(in.size() == out.size());
+  srsran_assert(in.size() == out.size(),
+                "Input size (i.e., {}) and output size (i.e., {}) must be equal.",
+                in.size(),
+                out.size());
 
   static constexpr unsigned BITS_PER_BYTE = 8;
   static constexpr unsigned NOF_PAR_BYTES = 3;
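The switch from assert to srsran_assert trades a bare abort for a formatted diagnostic that reports both sizes. As a rough sketch of the formatted-assert pattern (a hypothetical macro, not srsran_assert's real definition, which lives in srsran/support/srsran_assert.h and uses {}-style formatting):

#include <cstdio>
#include <cstdlib>

// Hypothetical printf-style stand-in for a formatted assert macro.
#define sketch_assert(cond, fmt, ...)                                   \
  do {                                                                  \
    if (!(cond)) {                                                      \
      std::fprintf(stderr, "Assertion failed: " fmt "\n", __VA_ARGS__); \
      std::abort();                                                     \
    }                                                                   \
  } while (false)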
@@ -252,98 +254,121 @@ void pseudo_random_generator_impl::apply_xor(bit_buffer& out, const bit_buffer&
 
 void pseudo_random_generator_impl::apply_xor(span<uint8_t> out, span<const uint8_t> in)
 {
-  assert(in.size() == out.size());
+  srsran_assert(in.size() == out.size(),
+                "Input size (i.e., {}) and output size (i.e., {}) must be equal.",
+                in.size(),
+                out.size());
+
+  // The optimal number of parallel bits to process is 16.
+  static constexpr unsigned nof_par_bits = 16;
 
   unsigned i      = 0;
   unsigned length = in.size();
 
-  if (length >= SEQUENCE_PAR_BITS) {
-    for (unsigned max_i = length - (SEQUENCE_PAR_BITS - 1); i < max_i; i += SEQUENCE_PAR_BITS) {
-      uint32_t c = (uint32_t)(x1 ^ x2);
+  for (unsigned i_end = (length / nof_par_bits) * nof_par_bits; i != i_end; i += nof_par_bits) {
+    uint32_t c = (x1 ^ x2);
 
-      uint32_t j = 0;
-#ifdef HAVE_SSE
-      if (SEQUENCE_PAR_BITS >= 16) {
-        // Preloads bits of interest in the 16 LSB
-        __m128i mask = _mm_set1_epi32(c);
-        mask = _mm_shuffle_epi8(mask, _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1));
+    uint32_t j = 0;
+#ifdef __SSE3__
+    if (nof_par_bits >= 16) {
+      // Preloads bits of interest in the 16 LSB
+      __m128i mask = _mm_set1_epi32(c);
+      mask = _mm_shuffle_epi8(mask, _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1));
 
-        // Masks each bit
-        // mask = _mm_and_si128( mask, _mm_set_epi64x(0x0102040810204080, 0x0102040810204080));
-        mask = _mm_and_si128(mask, _mm_set_epi64x(0x8040201008040201, 0x8040201008040201));
+      // Masks each bit.
+      mask = _mm_and_si128(mask, _mm_set_epi64x(0x8040201008040201, 0x8040201008040201));
 
-        // Get non zero mask
-        mask = _mm_cmpeq_epi8(mask, _mm_set_epi64x(0x8040201008040201, 0x8040201008040201));
+      // Get non zero mask.
+      mask = _mm_cmpeq_epi8(mask, _mm_set_epi64x(0x8040201008040201, 0x8040201008040201));
 
-        // Reduce to 1s and 0s
-        mask = _mm_and_si128(mask, _mm_set1_epi8(1));
+      // Reduce to 1s and 0s.
+      mask = _mm_and_si128(mask, _mm_set1_epi8(1));
 
-        // Load input
-        __m128i v = _mm_loadu_si128((__m128i*)(&in[i + j]));
+      // Load input.
+      __m128i v = _mm_loadu_si128((__m128i*)(&in[i + j]));
 
-        // Apply XOR
-        v = _mm_xor_si128(mask, v);
+      // Apply XOR.
+      v = _mm_xor_si128(mask, v);
 
-        _mm_storeu_si128((__m128i*)(&out[i + j]), v);
+      // Store output.
+      _mm_storeu_si128((__m128i*)(&out[i + j]), v);
 
-        // Increment bit counter `j`
-        j += 16;
-      }
-#endif
-#ifdef HAVE_NEON
-      if (SEQUENCE_PAR_BITS >= 16) {
-        // Preloads bits of interest in the 16 LSB
-        uint32x2_t c_dup_u32 = vdup_n_u32(c);
-        uint8x16_t mask_u8 = vcombine_u8(vdup_lane_u8(vreinterpret_u8_u32(c_dup_u32), 0),
-                                         vdup_lane_u8(vreinterpret_u8_u32(c_dup_u32), 1));
+      // Increment bit counter within the word.
+      j += 16;
 
-        // Create bit masks
-        const uint8_t bit_masks[8] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
-        const uint8x16_t bit_masks_u8 = vcombine_u8(vcreate_u8(*(reinterpret_cast<const uint64_t*>(bit_masks))),
-                                                    vcreate_u8(*(reinterpret_cast<const uint64_t*>(bit_masks))));
-        // Mask each bit
-        mask_u8 = vandq_u8(mask_u8, bit_masks_u8);
+      // Shift c.
+      c = c >> 16U;
+    }
+#endif // __SSE3__
+#ifdef __aarch64__
+    if (nof_par_bits >= 16) {
+      // Preloads bits of interest in the 16 LSB.
+      uint32x2_t c_dup_u32 = vdup_n_u32(c);
+      uint8x16_t mask_u8 =
+          vcombine_u8(vdup_lane_u8(vreinterpret_u8_u32(c_dup_u32), 0), vdup_lane_u8(vreinterpret_u8_u32(c_dup_u32), 1));
 
-        // Get non zero mask
-        mask_u8 = vceqq_u8(mask_u8, bit_masks_u8);
+      // Create bit masks.
+      const uint8_t bit_masks[8] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
+      const uint8x16_t bit_masks_u8 = vcombine_u8(vcreate_u8(*(reinterpret_cast<const uint64_t*>(bit_masks))),
+                                                  vcreate_u8(*(reinterpret_cast<const uint64_t*>(bit_masks))));
+      // Mask each bit.
+      mask_u8 = vandq_u8(mask_u8, bit_masks_u8);
 
-        // Reduce to 1s and 0s
-        mask_u8 = vandq_u8(mask_u8, vdupq_n_u8(1));
+      // Get non zero mask.
+      mask_u8 = vceqq_u8(mask_u8, bit_masks_u8);
 
-        // Load input
-        uint8x16_t v = vld1q_u8(&in[i + j]);
+      // Reduce to 1s and 0s.
+      mask_u8 = vandq_u8(mask_u8, vdupq_n_u8(1));
 
-        // Apply XOR
-        v = veorq_u8(mask_u8, v);
+      // Load input.
+      uint8x16_t v = vld1q_u8(&in[i + j]);
 
-        vst1q_u8(&out[i + j], v);
+      // Apply XOR.
+      v = veorq_u8(mask_u8, v);
 
-        // Increment bit counter `j`
-        j += 16;
-      }
-#endif // HAVE_NEON
-      for (; j < SEQUENCE_PAR_BITS; j++) {
-        out[i + j] = in[i + j] ^ ((c >> j) & 1U);
-      }
+      // Store output.
+      vst1q_u8(&out[i + j], v);
 
-      // Step sequences
-      x1 = step_par_x1(x1);
-      x2 = step_par_x2(x2);
+      // Increment bit counter within the word.
+      j += 16;
+
+      // Shift c.
+      c = c >> 16U;
+    }
+#endif // __aarch64__
+
+    // Apply mask to remainder bits.
+    for (; j != nof_par_bits; ++j) {
+      out[i + j] = in[i + j] ^ (c & 1U);
+      c = c >> 1U;
     }
+
+    // Step sequences.
+    x1 = step_par_x1(x1, nof_par_bits);
+    x2 = step_par_x2(x2, nof_par_bits);
   }
 
-  for (; i != length; ++i) {
-    out[i] = in[i] ^ ((x1 ^ x2) & 1U);
+  // Number of remainder bits.
+  unsigned remainder = length - i;
 
-    // Step sequences
-    x1 = step_x1(x1);
-    x2 = step_x2(x2);
+  // Apply remainder bits.
+  uint32_t c = (x1 ^ x2);
+  for (; i != length; ++i) {
+    out[i] = in[i] ^ (c & 1U);
+    c = c >> 1U;
   }
+
+  // Step remainder bits.
+  x1 = step_par_x1(x1, remainder);
+  x2 = step_par_x2(x2, remainder);
 }
 
 void pseudo_random_generator_impl::apply_xor(span<log_likelihood_ratio> out, span<const log_likelihood_ratio> in)
 {
-  assert(in.size() == out.size());
+  srsran_assert(in.size() == out.size(),
+                "Input size (i.e., {}) and output size (i.e., {}) must be equal.",
+                in.size(),
+                out.size());
 
   unsigned i      = 0;
   unsigned length = in.size();
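For orientation, the restructured byte-oriented apply_xor above consumes the sequence word c = x1 ^ x2 LSB-first, 16 bits per iteration; the SSE shuffle/and/cmpeq chain and its NEON counterpart merely expand 16 of those bits into 16 bytes holding 0 or 1 so that a single vector XOR scrambles 16 output bytes at once. A scalar sketch of the per-block behavior (the helper name and free-function form are illustrative):

#include <cstdint>

// Scalar equivalent of one 16-byte SIMD block: bit k of `seq_word`
// (LSB-first) scrambles output byte k, exactly like the expanded
// 0/1 byte mask XORed in the vector paths.
inline void xor_block_scalar(uint8_t* out, const uint8_t* in, uint32_t seq_word, unsigned nof_bits)
{
  for (unsigned j = 0; j != nof_bits; ++j) {
    out[j] = in[j] ^ (seq_word & 1U);
    seq_word >>= 1U;
  }
}

This also shows why the generic remainder loop shifts c down one bit at a time instead of indexing with (c >> j): when a vector path has run, c has already been shifted right by 16.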
@@ -353,7 +378,7 @@ void pseudo_random_generator_impl::apply_xor(span<log_likelihood_ratio> out, spa
     uint32_t c = (uint32_t)(x1 ^ x2);
 
     uint32_t j = 0;
-#ifdef HAVE_SSE
+#ifdef __SSE3__
     if (SEQUENCE_PAR_BITS >= 16) {
       // Preloads bits of interest in the 16 LSB
       __m128i mask = _mm_set1_epi32(c);
@@ -381,8 +406,8 @@ void pseudo_random_generator_impl::apply_xor(span<log_likelihood_ratio> out, spa
       // Increment bit counter `j`
       j += 16;
     }
-#endif
-#ifdef HAVE_NEON
+#endif // __SSE3__
+#ifdef __aarch64__
     if (SEQUENCE_PAR_BITS >= 16) {
       // Preloads bits of interest in the 16 LSB
       uint32x2_t c_dup_u32 = vdup_n_u32(c);
@@ -415,7 +440,7 @@ void pseudo_random_generator_impl::apply_xor(span<log_likelihood_ratio> out, spa
       // Increment bit counter `j`
       j += 16;
     }
-#endif // HAVE_NEON
+#endif // __aarch64__
     for (; j != SEQUENCE_PAR_BITS; ++j) {
       out[i + j] = in[i + j].to_value_type() * (((c >> j) & 1U) ? -1 : +1);
     }
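Context for the sequence stepping used throughout this file: per TS 38.211 section 5.2.1, the scrambling bit is c(n) = (x1(n + Nc) + x2(n + Nc)) mod 2 with Nc = 1600, and the two length-31 registers advance as x1(n + 31) = (x1(n + 3) + x1(n)) mod 2 and x2(n + 31) = (x2(n + 3) + x2(n + 2) + x2(n + 1) + x2(n)) mod 2. A single-step scalar sketch, assuming the 31-bit state sits in the low bits of a uint32_t with bit 0 the oldest; the actual step_par_x1/step_par_x2 members advance many bits per call:

#include <cstdint>

// One Gold-sequence step for each register (illustrative, single bit).
inline uint32_t step_x1_scalar(uint32_t x1)
{
  uint32_t fb = ((x1 >> 3) ^ x1) & 1U; // x1(n+3) + x1(n) mod 2
  return (x1 >> 1) | (fb << 30);       // shift out the oldest bit, insert feedback
}

inline uint32_t step_x2_scalar(uint32_t x2)
{
  uint32_t fb = ((x2 >> 3) ^ (x2 >> 2) ^ (x2 >> 1) ^ x2) & 1U;
  return (x2 >> 1) | (fb << 30);
}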