Commit 7edede8

phy: optimize PRN
1 parent d8b96d2 commit 7edede8

File tree

1 file changed (+103 -78 lines)

lib/phy/upper/sequence_generators/pseudo_random_generator_impl.cpp

Lines changed: 103 additions & 78 deletions
@@ -10,16 +10,15 @@
 
 #include "pseudo_random_generator_impl.h"
 #include "srsran/support/math_utils.h"
-#include <cassert>
-#include <cstring>
+#include "srsran/support/srsran_assert.h"
 
-#if HAVE_SSE
+#if __SSE3__
 #include <immintrin.h>
-#endif // HAVE_SSE
+#endif // __SSE3__
 
-#ifdef HAVE_NEON
+#ifdef __aarch64__
 #include <arm_neon.h>
-#endif
+#endif // __aarch64__
 
 using namespace srsran;
 
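Note on the guard change (general compiler behavior, not stated in the diff itself): __SSE3__ and __aarch64__ are predefined by GCC and Clang themselves whenever SSE3 code generation is enabled or the target is 64-bit Arm, so the includes no longer depend on build-system flags such as HAVE_SSE and HAVE_NEON. A minimal, illustrative way to check which paths a given toolchain would compile:

#include <cstdio>

// Illustrative only: report which SIMD guards the compiler enables.
int main()
{
#ifdef __SSE3__
  std::puts("__SSE3__ defined: the SSE path is compiled");
#endif
#ifdef __aarch64__
  std::puts("__aarch64__ defined: the NEON path is compiled");
#endif
  return 0;
}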
@@ -118,7 +117,7 @@ void pseudo_random_generator_impl::generate(span<float> out, float value)
       uint32_t c = (uint32_t)(x1 ^ x2);
 
       uint32_t j = 0;
-#ifdef HAVE_SSE
+#ifdef __SSE3__
       for (; j < SEQUENCE_PAR_BITS - 3; j += 4) {
         // Preloads bits of interest in the 4 LSB
         __m128i mask = _mm_set1_epi32(c >> j);
@@ -140,8 +139,8 @@ void pseudo_random_generator_impl::generate(span<float> out, float value)
 
         _mm_storeu_ps(&out[i + j], v);
       }
-#endif
-#ifdef HAVE_NEON
+#endif // __SSE3__
+#ifdef __aarch64__
       for (; j < SEQUENCE_PAR_BITS - 3; j += 4) {
         // Preloads bits of interest in the 4 LSB
         int32x4_t mask_s32 = vdupq_n_s32(c >> j);
@@ -165,7 +164,7 @@ void pseudo_random_generator_impl::generate(span<float> out, float value)
 
         vst1q_f32(&out[i + j], v);
       }
-#endif // HAVE_NEON
+#endif // __aarch64__
       // Finish the parallel bits with generic code
       for (; j != SEQUENCE_PAR_BITS; ++j) {
         FLOAT_U32_XOR(out[i + j], value, (c << (31U - j)) & 0x80000000);
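For context on the generic tail loop above: the expression (c << (31U - j)) & 0x80000000 moves bit j of the sequence word c into the IEEE-754 sign-bit position, so FLOAT_U32_XOR turns out[i + j] into +value or -value. A scalar sketch of the same idea, with a hypothetical helper standing in for the macro:

#include <cstdint>
#include <cstring>

// Illustrative: flip the sign of `value` when bit `j` of `c` is one.
static float xor_sign_bit(float value, uint32_t c, uint32_t j)
{
  uint32_t bits;
  std::memcpy(&bits, &value, sizeof(bits)); // Reinterpret the float's bits.
  bits ^= (c << (31U - j)) & 0x80000000U;   // XOR bit j of c into the sign bit.
  std::memcpy(&value, &bits, sizeof(value));
  return value;
}

The SSE and NEON branches above apply the same sign flip to four lanes at a time.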
@@ -188,7 +187,10 @@ void pseudo_random_generator_impl::generate(span<float> out, float value)
 
 void pseudo_random_generator_impl::apply_xor(bit_buffer& out, const bit_buffer& in)
 {
-  assert(in.size() == out.size());
+  srsran_assert(in.size() == out.size(),
+                "Input size (i.e., {}) and output size (i.e., {}) must be equal.",
+                in.size(),
+                out.size());
 
   static constexpr unsigned BITS_PER_BYTE = 8;
   static constexpr unsigned NOF_PAR_BYTES = 3;
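Background on x1 and x2 (from TS 38.211 Section 5.2.1, not from this diff): the generator is the length-31 Gold sequence c(n) = (x1(n + Nc) + x2(n + Nc)) mod 2 with Nc = 1600, where x1(n + 31) = (x1(n + 3) + x1(n)) mod 2 and x2(n + 31) = (x2(n + 3) + x2(n + 2) + x2(n + 1) + x2(n)) mod 2. A single-bit step could look like the sketch below, assuming the 31-bit state sits in the low bits of a uint32_t; the project's actual step_x1/step_x2 helpers may use a different representation.

#include <cstdint>

// Illustrative one-bit step of each m-sequence; bit k of `state` holds x(n + k).
static uint32_t step_x1_sketch(uint32_t state)
{
  uint32_t fb = ((state >> 3U) ^ state) & 1U; // x1(n + 31) from taps 3 and 0.
  return (state >> 1U) | (fb << 30U);         // Shift down, insert new bit at 30.
}

static uint32_t step_x2_sketch(uint32_t state)
{
  uint32_t fb = ((state >> 3U) ^ (state >> 2U) ^ (state >> 1U) ^ state) & 1U;
  return (state >> 1U) | (fb << 30U);
}

The XOR of the two states, (x1 ^ x2), then carries up to 31 sequence bits in its low bits, which is what the parallel loops below consume.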
@@ -252,98 +254,121 @@ void pseudo_random_generator_impl::apply_xor(bit_buffer& out, const bit_buffer& in)
 
 void pseudo_random_generator_impl::apply_xor(span<uint8_t> out, span<const uint8_t> in)
 {
-  assert(in.size() == out.size());
+  srsran_assert(in.size() == out.size(),
+                "Input size (i.e., {}) and output size (i.e., {}) must be equal.",
+                in.size(),
+                out.size());
+
+  // The optimal number of parallel bits to process is 16.
+  static constexpr unsigned nof_par_bits = 16;
 
   unsigned i = 0;
   unsigned length = in.size();
 
-  if (length >= SEQUENCE_PAR_BITS) {
-    for (unsigned max_i = length - (SEQUENCE_PAR_BITS - 1); i < max_i; i += SEQUENCE_PAR_BITS) {
-      uint32_t c = (uint32_t)(x1 ^ x2);
+  for (unsigned i_end = (length / nof_par_bits) * nof_par_bits; i != i_end; i += nof_par_bits) {
+    uint32_t c = (x1 ^ x2);
 
-      uint32_t j = 0;
-#ifdef HAVE_SSE
-      if (SEQUENCE_PAR_BITS >= 16) {
-        // Preloads bits of interest in the 16 LSB
-        __m128i mask = _mm_set1_epi32(c);
-        mask = _mm_shuffle_epi8(mask, _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1));
+    uint32_t j = 0;
+#ifdef __SSE3__
+    if (nof_par_bits >= 16) {
+      // Preloads bits of interest in the 16 LSB
+      __m128i mask = _mm_set1_epi32(c);
+      mask = _mm_shuffle_epi8(mask, _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1));
 
-        // Masks each bit
-        // mask = _mm_and_si128( mask, _mm_set_epi64x(0x0102040810204080, 0x0102040810204080));
-        mask = _mm_and_si128(mask, _mm_set_epi64x(0x8040201008040201, 0x8040201008040201));
+      // Masks each bit.
+      mask = _mm_and_si128(mask, _mm_set_epi64x(0x8040201008040201, 0x8040201008040201));
 
-        // Get non zero mask
-        mask = _mm_cmpeq_epi8(mask, _mm_set_epi64x(0x8040201008040201, 0x8040201008040201));
+      // Get non zero mask.
+      mask = _mm_cmpeq_epi8(mask, _mm_set_epi64x(0x8040201008040201, 0x8040201008040201));
 
-        // Reduce to 1s and 0s
-        mask = _mm_and_si128(mask, _mm_set1_epi8(1));
+      // Reduce to 1s and 0s.
+      mask = _mm_and_si128(mask, _mm_set1_epi8(1));
 
-        // Load input
-        __m128i v = _mm_loadu_si128((__m128i*)(&in[i + j]));
+      // Load input.
+      __m128i v = _mm_loadu_si128((__m128i*)(&in[i + j]));
 
-        // Apply XOR
-        v = _mm_xor_si128(mask, v);
+      // Apply XOR.
+      v = _mm_xor_si128(mask, v);
 
-        _mm_storeu_si128((__m128i*)(&out[i + j]), v);
+      // Store output.
+      _mm_storeu_si128((__m128i*)(&out[i + j]), v);
 
-        // Increment bit counter `j`
-        j += 16;
-      }
-#endif
-#ifdef HAVE_NEON
-      if (SEQUENCE_PAR_BITS >= 16) {
-        // Preloads bits of interest in the 16 LSB
-        uint32x2_t c_dup_u32 = vdup_n_u32(c);
-        uint8x16_t mask_u8 = vcombine_u8(vdup_lane_u8(vreinterpret_u8_u32(c_dup_u32), 0),
-                                         vdup_lane_u8(vreinterpret_u8_u32(c_dup_u32), 1));
+      // Increment bit counter within the word.
+      j += 16;
 
+      // Shift c.
+      c = c >> 16U;
+    }
+#endif // __SSE3__
+#ifdef __aarch64__
+    if (nof_par_bits >= 16) {
+      // Preloads bits of interest in the 16 LSB.
+      uint32x2_t c_dup_u32 = vdup_n_u32(c);
+      uint8x16_t mask_u8 =
+          vcombine_u8(vdup_lane_u8(vreinterpret_u8_u32(c_dup_u32), 0), vdup_lane_u8(vreinterpret_u8_u32(c_dup_u32), 1));
 
-        // Create bit masks
-        const uint8_t bit_masks[8] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
-        const uint8x16_t bit_masks_u8 = vcombine_u8(vcreate_u8(*(reinterpret_cast<const uint64_t*>(bit_masks))),
-                                                    vcreate_u8(*(reinterpret_cast<const uint64_t*>(bit_masks))));
-        // Mask each bit
-        mask_u8 = vandq_u8(mask_u8, bit_masks_u8);
+      // Shift c.
+      c = c >> 16U;
+    }
+#endif // __SSE3__
+#ifdef __aarch64__
+    if (nof_par_bits >= 16) {
+      // Create bit masks.
+      const uint8_t bit_masks[8] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
+      const uint8x16_t bit_masks_u8 = vcombine_u8(vcreate_u8(*(reinterpret_cast<const uint64_t*>(bit_masks))),
+                                                  vcreate_u8(*(reinterpret_cast<const uint64_t*>(bit_masks))));
+      // Mask each bit.
+      mask_u8 = vandq_u8(mask_u8, bit_masks_u8);
 
-        // Get non zero mask
-        mask_u8 = vceqq_u8(mask_u8, bit_masks_u8);
+      // Get non zero mask.
+      mask_u8 = vceqq_u8(mask_u8, bit_masks_u8);
 
-        // Reduce to 1s and 0s
-        mask_u8 = vandq_u8(mask_u8, vdupq_n_u8(1));
+      // Reduce to 1s and 0s.
+      mask_u8 = vandq_u8(mask_u8, vdupq_n_u8(1));
 
-        // Load input
-        uint8x16_t v = vld1q_u8(&in[i + j]);
+      // Load input.
+      uint8x16_t v = vld1q_u8(&in[i + j]);
 
-        // Apply XOR
-        v = veorq_u8(mask_u8, v);
+      // Apply XOR.
+      v = veorq_u8(mask_u8, v);
 
-        vst1q_u8(&out[i + j], v);
+      // Store output.
+      vst1q_u8(&out[i + j], v);
 
-        // Increment bit counter `j`
-        j += 16;
-      }
-#endif // HAVE_NEON
-      for (; j < SEQUENCE_PAR_BITS; j++) {
-        out[i + j] = in[i + j] ^ ((c >> j) & 1U);
-      }
+      // Increment bit counter within the word.
+      j += 16;
+
+      // Shift c.
+      c = c >> 16U;
+    }
+#endif // __aarch64__
+
+    // Apply mask to remainder bits.
+    for (; j != nof_par_bits; ++j) {
+      out[i + j] = in[i + j] ^ (c & 1U);
+      c = c >> 1U;
     }
 
-      // Step sequences
-      x1 = step_par_x1(x1);
-      x2 = step_par_x2(x2);
-    }
+    // Step sequences.
+    x1 = step_par_x1(x1, nof_par_bits);
+    x2 = step_par_x2(x2, nof_par_bits);
   }
 
-  for (; i != length; ++i) {
-    out[i] = in[i] ^ ((x1 ^ x2) & 1U);
+  // Number of remainder bits.
+  unsigned remainder = length - i;
 
-    // Step sequences
-    x1 = step_x1(x1);
-    x2 = step_x2(x2);
+  // Apply remainder bits.
+  uint32_t c = (x1 ^ x2);
+  for (; i != length; ++i) {
+    out[i] = in[i] ^ (c & 1U);
+    c = c >> 1U;
   }
+
+  // Step remainder bits.
+  x1 = step_par_x1(x1, remainder);
+  x2 = step_par_x2(x2, remainder);
 }
 
 void pseudo_random_generator_impl::apply_xor(span<log_likelihood_ratio> out, span<const log_likelihood_ratio> in)
 {
-  assert(in.size() == out.size());
+  srsran_assert(in.size() == out.size(),
+                "Input size (i.e., {}) and output size (i.e., {}) must be equal.",
+                in.size(),
+                out.size());
 
   unsigned i = 0;
   unsigned length = in.size();
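What the SSE and NEON bodies in this hunk compute, as a plain scalar model: the 16 least significant bits of c are spread into 16 bytes holding 0 or 1 (shuffle, AND against per-byte bit masks, compare, reduce), then XOR-ed onto 16 input bytes in one shot. The vector code replaces this loop:

#include <cstdint>

// Illustrative scalar equivalent of one 16-byte SIMD iteration.
static void xor_16_bits(uint8_t* out, const uint8_t* in, uint32_t c)
{
  for (unsigned bit = 0; bit != 16; ++bit) {
    // Spread bit `bit` of c into a full 0/1 byte, then XOR onto the input.
    out[bit] = in[bit] ^ static_cast<uint8_t>((c >> bit) & 1U);
  }
}

Two design points are visible in the rewrite: the scalar loops now consume c LSB-first (c = c >> 1U) instead of indexing (c >> j) & 1U, and step_par_x1/step_par_x2 gained a bit-count argument (versus the old step_par_x1(x1)), which suggests the state can now be advanced by a caller-chosen number of bits, letting both the 16-bit main loop and the final remainder run reuse the same parallel step instead of a per-bit step_x1/step_x2 path.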
@@ -353,7 +378,7 @@ void pseudo_random_generator_impl::apply_xor(span<log_likelihood_ratio> out, span<const log_likelihood_ratio> in)
       uint32_t c = (uint32_t)(x1 ^ x2);
 
       uint32_t j = 0;
-#ifdef HAVE_SSE
+#ifdef __SSE3__
       if (SEQUENCE_PAR_BITS >= 16) {
         // Preloads bits of interest in the 16 LSB
         __m128i mask = _mm_set1_epi32(c);
@@ -381,8 +406,8 @@ void pseudo_random_generator_impl::apply_xor(span<log_likelihood_ratio> out, span<const log_likelihood_ratio> in)
         // Increment bit counter `j`
         j += 16;
       }
-#endif
-#ifdef HAVE_NEON
+#endif // __SSE3__
+#ifdef __aarch64__
       if (SEQUENCE_PAR_BITS >= 16) {
         // Preloads bits of interest in the 16 LSB
         uint32x2_t c_dup_u32 = vdup_n_u32(c);
@@ -415,7 +440,7 @@ void pseudo_random_generator_impl::apply_xor(span<log_likelihood_ratio> out, span<const log_likelihood_ratio> in)
         // Increment bit counter `j`
         j += 16;
       }
-#endif // HAVE_NEON
+#endif // __aarch64__
       for (; j != SEQUENCE_PAR_BITS; ++j) {
         out[i + j] = in[i + j].to_value_type() * (((c >> j) & 1U) ? -1 : +1);
       }
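The tail loop shows the soft-bit convention of this overload: a scrambling bit of one flips the sign of the log-likelihood ratio rather than XOR-ing a hard bit. A scalar sketch, with int8_t standing in for the value type behind log_likelihood_ratio::to_value_type():

#include <cstdint>

// Illustrative LLR descrambling: bit j of `c` negates the j-th soft bit.
// `nof_bits` must not exceed the number of valid sequence bits held in `c`.
static void apply_xor_llr_sketch(int8_t* out, const int8_t* in, unsigned nof_bits, uint32_t c)
{
  for (unsigned j = 0; j != nof_bits; ++j) {
    out[j] = ((c >> j) & 1U) ? static_cast<int8_t>(-in[j]) : in[j];
  }
}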
