Skip to content

Commit 9463865

Browse files
committed
phy: direct precoder mapping in BF16
phy: AVX512 precoder could use BF16 extensions apply clang-format patch phy: fix precoder compilation for AVX2 phy: review documentation phy: review resource grid documentation phy: fix compilation
1 parent b3a6145 commit 9463865

25 files changed

+416
-142
lines changed

include/srsran/phy/generic_functions/precoding/channel_precoder.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ class channel_precoder
4646
/// of RE per layer of the input buffer.
4747
/// \remark An assertion is triggered if the precoding matrix dimensions are not consistent with input buffer size and
4848
/// the number of antenna ports of the output buffer.
49-
virtual void apply_layer_map_and_precoding(re_buffer_writer<>& output,
49+
virtual void apply_layer_map_and_precoding(re_buffer_writer<cbf16_t>& output,
5050
span<const ci8_t> input,
5151
const precoding_weight_matrix& precoding) const = 0;
5252
};

include/srsran/phy/support/re_buffer.h

Lines changed: 67 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,6 @@ class modular_re_buffer_reader : public re_buffer_reader<T>
290290
/// \param[in] nof_slices Number of slices.
291291
/// \param[in] nof_re Number of resource elements.
292292
/// \remark An assertion is triggered if the number of slices exceeds \ref max_nof_slices.
293-
/// \remark An assertion is triggered if the number of resource elements exceeds \ref max_nof_re.
294293
void resize(unsigned nof_slices_, unsigned nof_re_)
295294
{
296295
nof_slices = nof_slices_;
@@ -344,4 +343,71 @@ class modular_re_buffer_reader : public re_buffer_reader<T>
344343
std::vector<span<const T>> data;
345344
};
346345

346+
/// \brief Implements a modular resource element buffer writer.
347+
///
348+
/// In this implementation, each slice is a view to an external block of contiguous REs that must be loaded with the
349+
/// \ref set_slice method.
350+
///
351+
/// \tparam T Resource element type.
352+
template <unsigned MaxNofSlices, typename T = cf_t>
353+
class modular_re_buffer_writer : public re_buffer_writer<T>
354+
{
355+
public:
356+
/// \brief Resizes the buffer view to a desired number of RE and slices.
357+
/// \param[in] nof_slices Number of slices.
358+
/// \param[in] nof_re Number of resource elements.
359+
/// \remark An assertion is triggered if the number of slices exceeds \ref max_nof_slices.
360+
void resize(unsigned nof_slices_, unsigned nof_re_)
361+
{
362+
nof_slices = nof_slices_;
363+
nof_re = nof_re_;
364+
srsran_assert(nof_slices <= data.size(),
365+
"The number of slices (i.e., {}) exceeds the maximum (i.e., {}).",
366+
nof_slices,
367+
data.size());
368+
369+
// Empty all slices.
370+
std::fill_n(data.begin(), nof_slices, span<T>());
371+
}
372+
373+
/// \brief Sets the view for a given slice.
374+
/// \param[in] i_slice Slice identifier.
375+
/// \param[in] view Slice view.
376+
/// \remark An assertion is triggered if the view size is not equal to the number of resource elements.
377+
void set_slice(unsigned i_slice, span<T> view)
378+
{
379+
srsran_assert(view.size() == nof_re,
380+
"The view size (i.e., {}) must be equal to the number of resource elements (i.e., {}).",
381+
view.size(),
382+
nof_re);
383+
data[i_slice] = view;
384+
}
385+
386+
// See interface for documentation.
387+
unsigned get_nof_slices() const override { return nof_slices; }
388+
389+
// See interface for documentation.
390+
unsigned get_nof_re() const override { return nof_re; }
391+
392+
// See interface for documentation.
393+
span<T> get_slice(unsigned i_slice) override
394+
{
395+
srsran_assert(i_slice < nof_slices,
396+
"The slice index (i.e., {}) exceeds the number of slices (i.e., {}).",
397+
i_slice,
398+
nof_slices);
399+
srsran_assert(!data[i_slice].empty(), "Data for slice {} is empty.", i_slice);
400+
return data[i_slice];
401+
}
402+
403+
private:
404+
/// Current number of slices.
405+
unsigned nof_slices;
406+
/// Current number of resource elements.
407+
unsigned nof_re;
408+
409+
/// Internal data storage.
410+
std::array<span<T>, MaxNofSlices> data;
411+
};
412+
347413
} // namespace srsran

include/srsran/phy/support/resource_grid_writer.h

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ class resource_grid_writer : public resource_grid_base
3737
/// \param[in] mask Bitset denoting the subcarriers to be written (if \c true), starting from \c k_init.
3838
/// \param[in] symbols Symbols to be written into the resource grid.
3939
/// \return A view to the unused entries of \c symbols.
40-
/// \note The number of elements of \c mask shall be equal to or greater than the resource grid number of subcarriers.
40+
/// \note The number of elements of \c mask shall be equal to or lower than the resource grid number of subcarriers.
4141
/// \note The number of elements of \c symbols shall be equal to or greater than the number of true elements in
4242
/// \c mask.
4343
virtual span<const cf_t> put(unsigned port,
@@ -46,6 +46,24 @@ class resource_grid_writer : public resource_grid_base
4646
const bounded_bitset<NRE * MAX_RB>& mask,
4747
span<const cf_t> symbols) = 0;
4848

49+
/// \brief Puts a number of resource elements in the resource grid at the given port and symbol using a bounded bitset
50+
/// to indicate which subcarriers are allocated and which are not.
51+
///
52+
/// \param[in] port Port index.
53+
/// \param[in] l Symbol index.
54+
/// \param[in] k_init Initial subcarrier index.
55+
/// \param[in] mask Bitset denoting the subcarriers to be written (if \c true), starting from \c k_init.
56+
/// \param[in] symbols Symbols to be written into the resource grid.
57+
/// \return A view to the unused entries of \c symbols.
58+
/// \note The number of elements of \c mask shall be equal to or lower than the resource grid number of subcarriers.
59+
/// \note The number of elements of \c symbols shall be equal to or greater than the number of true elements in
60+
/// \c mask.
61+
virtual span<const cbf16_t> put(unsigned port,
62+
unsigned l,
63+
unsigned k_init,
64+
const bounded_bitset<NRE * MAX_RB>& mask,
65+
span<const cbf16_t> symbols) = 0;
66+
4967
/// \brief Puts a consecutive number of resource elements for the given \c port and symbol \c l, starting at \c
5068
/// k_init.
5169
///
@@ -68,6 +86,13 @@ class resource_grid_writer : public resource_grid_base
6886
/// \note The RE positions given \c k_init, the number of elements in \c symbols and the \c stride shall be within the
6987
/// resource grid number of subcarriers.
7088
virtual void put(unsigned port, unsigned l, unsigned k_init, unsigned stride, span<const cf_t> symbols) = 0;
89+
90+
/// \brief Gets a read-write view of an OFDM symbol for a given port.
91+
///
92+
/// \param[in] port Port index.
93+
/// \param[in] l OFDM symbol index.
94+
/// \return Resource grid view.
95+
virtual span<cbf16_t> get_view(unsigned port, unsigned l) = 0;
7196
};
7297

7398
} // namespace srsran

lib/phy/generic_functions/precoding/channel_precoder_avx2.cpp

Lines changed: 28 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,24 @@ static inline void layer4_map_and_ci8_to_cf(simd_cf_interleaved& out_l0,
181181
from_ci8_to_cf(out_l0, out_l1, out_l2, out_l3, tmp);
182182
}
183183

184-
void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<>& output,
184+
inline __m128i ps_to_cbf16(simd_cf_interleaved in)
185+
{
186+
const __m256i bias = _mm256_set1_epi32(0x7fff);
187+
const __m256i one = _mm256_set1_epi32(0x1);
188+
189+
__m256i a_i32 = _mm256_castps_si256(in);
190+
191+
// Round to nearest even.
192+
a_i32 = _mm256_add_epi32(a_i32, _mm256_add_epi32(bias, _mm256_and_si256(_mm256_srli_epi32(a_i32, 16), one)));
193+
194+
// Shift right 16 bits.
195+
a_i32 = _mm256_srai_epi32(a_i32, 16);
196+
197+
// Pack both parts in 32-bit registers.
198+
return _mm_packs_epi32(_mm256_extractf128_si256(a_i32, 0), _mm256_extractf128_si256(a_i32, 1));
199+
}
200+
201+
void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<cbf16_t>& output,
185202
span<const ci8_t> input,
186203
const precoding_weight_matrix& precoding) const
187204
{
@@ -193,7 +210,7 @@ void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<>&
193210
simd_cf_t weights[precoding_constants::MAX_NOF_PORTS][precoding_constants::MAX_NOF_LAYERS];
194211

195212
// Views to store the precoded symbols.
196-
span<cf_t> outputs[precoding_constants::MAX_NOF_PORTS];
213+
span<cbf16_t> outputs[precoding_constants::MAX_NOF_PORTS];
197214

198215
for (unsigned i_port = 0; i_port != nof_ports; ++i_port) {
199216
span<const cf_t> port_coeff = precoding.get_port_coefficients(i_port);
@@ -221,10 +238,10 @@ void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<>&
221238
simd_cf_interleaved result3 = infp_3 * weights[i_port][0];
222239

223240
// Store.
224-
_mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result0);
225-
_mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + AVX2_CF_SIZE]), result1);
226-
_mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + (2 * AVX2_CF_SIZE)]), result2);
227-
_mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + (3 * AVX2_CF_SIZE)]), result3);
241+
_mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re + 0 * AVX2_CF_SIZE]), ps_to_cbf16(result0));
242+
_mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re + 1 * AVX2_CF_SIZE]), ps_to_cbf16(result1));
243+
_mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re + 2 * AVX2_CF_SIZE]), ps_to_cbf16(result2));
244+
_mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re + 3 * AVX2_CF_SIZE]), ps_to_cbf16(result3));
228245
}
229246
}
230247
}
@@ -245,8 +262,8 @@ void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<>&
245262
simd_cf_interleaved result1 = infp1_l0 * weights[i_port][0] + infp1_l1 * weights[i_port][1];
246263

247264
// Store.
248-
_mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result0);
249-
_mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + AVX2_CF_SIZE]), result1);
265+
_mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re]), ps_to_cbf16(result0));
266+
_mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re + AVX2_CF_SIZE]), ps_to_cbf16(result1));
250267
}
251268
}
252269
}
@@ -270,7 +287,7 @@ void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<>&
270287
infp_0 * weights[i_port][0] + infp_1 * weights[i_port][1] + infp_2 * weights[i_port][2];
271288

272289
// Store.
273-
_mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result);
290+
_mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re]), ps_to_cbf16(result));
274291
}
275292
}
276293
}
@@ -291,7 +308,7 @@ void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<>&
291308
infp_2 * weights[i_port][2] + infp_3 * weights[i_port][3];
292309

293310
// Store.
294-
_mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result);
311+
_mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re]), ps_to_cbf16(result));
295312
}
296313
}
297314
}
@@ -300,7 +317,7 @@ void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<>&
300317
for (; i_re != nof_re; ++i_re) {
301318
for (unsigned i_port = 0; i_port != nof_ports; ++i_port) {
302319
span<const cf_t> port_weights = precoding.get_port_coefficients(i_port);
303-
span<cf_t> port_re = output.get_slice(i_port);
320+
span<cbf16_t> port_re = output.get_slice(i_port);
304321

305322
cf_t sum = to_cf(input[nof_layers * i_re]) * port_weights[0];
306323
for (unsigned i_layer = 1; i_layer != nof_layers; ++i_layer) {

lib/phy/generic_functions/precoding/channel_precoder_avx2.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ class channel_precoder_avx2 : public channel_precoder_impl
2727
span<const cf_t> port_weights) const override;
2828

2929
// See interface for documentation.
30-
void apply_layer_map_and_precoding(re_buffer_writer<>& output,
30+
void apply_layer_map_and_precoding(re_buffer_writer<cbf16_t>& output,
3131
span<const ci8_t> input,
3232
const precoding_weight_matrix& precoding) const override;
3333
};

lib/phy/generic_functions/precoding/channel_precoder_avx512.cpp

Lines changed: 36 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,27 @@ static inline void layer4_map_and_ci8_to_cf(simd_cf_interleaved& out0,
203203
from_ci8_to_cf(out0, out1, out2, out3, tmp);
204204
}
205205

206+
inline __m256i ps_to_cbf16(simd_cf_interleaved in)
207+
{
208+
#if __AVX512BF16__
209+
return (__m256i)_mm512_cvtneps_pbh(in);
210+
#else // __AVX512BF16__
211+
const __m512i bias = _mm512_set1_epi32(0x7fff);
212+
const __m512i one = _mm512_set1_epi32(0x1);
213+
214+
__m512i a_i32 = _mm512_castps_si512(in);
215+
216+
// Round to nearest even.
217+
a_i32 = _mm512_add_epi32(a_i32, _mm512_add_epi32(bias, _mm512_and_si512(_mm512_srli_epi32(a_i32, 16), one)));
218+
219+
// Shift right 16 bits.
220+
a_i32 = _mm512_srli_epi32(a_i32, 16);
221+
222+
// Pack both parts in 32-bit registers.
223+
return _mm512_cvtepi32_epi16(a_i32);
224+
#endif // __AVX512BF16__
225+
}
226+
206227
void channel_precoder_avx512::apply_precoding_port(span<cf_t> port_re,
207228
const re_buffer_reader<>& input_re,
208229
span<const cf_t> port_weights) const
@@ -254,7 +275,7 @@ void channel_precoder_avx512::apply_precoding_port(span<cf_t> por
254275
}
255276
}
256277

257-
void channel_precoder_avx512::apply_layer_map_and_precoding(re_buffer_writer<>& output,
278+
void channel_precoder_avx512::apply_layer_map_and_precoding(re_buffer_writer<cbf16_t>& output,
258279
span<const ci8_t> input,
259280
const precoding_weight_matrix& precoding) const
260281
{
@@ -263,8 +284,8 @@ void channel_precoder_avx512::apply_layer_map_and_precoding(re_buffer_writer<>&
263284
unsigned nof_ports = precoding.get_nof_ports();
264285
unsigned i_re = 0;
265286

266-
simd_cf_t weights[4][4];
267-
span<cf_t> outputs[4];
287+
simd_cf_t weights[4][4];
288+
span<cbf16_t> outputs[4];
268289
for (unsigned i_port = 0; i_port != nof_ports; ++i_port) {
269290
span<const cf_t> port_coeff = precoding.get_port_coefficients(i_port);
270291
outputs[i_port] = output.get_slice(i_port);
@@ -286,10 +307,10 @@ void channel_precoder_avx512::apply_layer_map_and_precoding(re_buffer_writer<>&
286307
simd_cf_interleaved result2 = infp_2 * weights[i_port][0];
287308
simd_cf_interleaved result3 = infp_3 * weights[i_port][0];
288309

289-
_mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result0);
290-
_mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + 8]), result1);
291-
_mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + 16]), result2);
292-
_mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + 24]), result3);
310+
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re + 0]), ps_to_cbf16(result0));
311+
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re + 8]), ps_to_cbf16(result1));
312+
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re + 16]), ps_to_cbf16(result2));
313+
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re + 24]), ps_to_cbf16(result3));
293314
}
294315
}
295316
}
@@ -306,8 +327,8 @@ void channel_precoder_avx512::apply_layer_map_and_precoding(re_buffer_writer<>&
306327
simd_cf_interleaved result0 = infp_0 * weights[i_port][0] + infp_2 * weights[i_port][1];
307328
simd_cf_interleaved result1 = infp_1 * weights[i_port][0] + infp_3 * weights[i_port][1];
308329

309-
_mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result0);
310-
_mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + AVX512_CF_SIZE]), result1);
330+
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re]), ps_to_cbf16(result0));
331+
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re + AVX512_CF_SIZE]), ps_to_cbf16(result1));
311332
}
312333
}
313334
}
@@ -327,7 +348,7 @@ void channel_precoder_avx512::apply_layer_map_and_precoding(re_buffer_writer<>&
327348
for (unsigned i_port = 0; i_port != nof_ports; ++i_port) {
328349
simd_cf_interleaved result =
329350
infp_0 * weights[i_port][0] + infp_1 * weights[i_port][1] + infp_2 * weights[i_port][2];
330-
_mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result);
351+
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re]), ps_to_cbf16(result));
331352
}
332353
}
333354
}
@@ -343,21 +364,23 @@ void channel_precoder_avx512::apply_layer_map_and_precoding(re_buffer_writer<>&
343364
for (unsigned i_port = 0; i_port != nof_ports; ++i_port) {
344365
simd_cf_interleaved result = infp_0 * weights[i_port][0] + infp_1 * weights[i_port][1] +
345366
infp_2 * weights[i_port][2] + infp_3 * weights[i_port][3];
346-
_mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result);
367+
368+
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re]), ps_to_cbf16(result));
347369
}
348370
}
349371
}
350372

373+
// Generic implementation.
351374
for (; i_re != nof_re; ++i_re) {
352375
for (unsigned i_port = 0; i_port != nof_ports; ++i_port) {
353376
span<const cf_t> port_weights = precoding.get_port_coefficients(i_port);
354-
span<cf_t> port_re = output.get_slice(i_port);
377+
span<cbf16_t> port_re = output.get_slice(i_port);
355378

356379
cf_t sum = to_cf(input[nof_layers * i_re]) * port_weights[0];
357380
for (unsigned i_layer = 1; i_layer != nof_layers; ++i_layer) {
358381
sum += to_cf(input[nof_layers * i_re + i_layer]) * port_weights[i_layer];
359382
}
360-
port_re[i_re] = sum;
383+
port_re[i_re] = to_cbf16(sum);
361384
}
362385
}
363386
}

lib/phy/generic_functions/precoding/channel_precoder_avx512.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ class channel_precoder_avx512 : public channel_precoder_impl
2727

2828
public:
2929
// See interface for documentation.
30-
void apply_layer_map_and_precoding(re_buffer_writer<>& output,
30+
void apply_layer_map_and_precoding(re_buffer_writer<cbf16_t>& output,
3131
span<const ci8_t> input,
3232
const precoding_weight_matrix& precoding) const override;
3333
};

lib/phy/generic_functions/precoding/channel_precoder_generic.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ void channel_precoder_generic::apply_precoding_port(span<cf_t> po
3535
}
3636
}
3737

38-
void channel_precoder_generic::apply_layer_map_and_precoding(re_buffer_writer<>& output,
38+
void channel_precoder_generic::apply_layer_map_and_precoding(re_buffer_writer<cbf16_t>& output,
3939
span<const ci8_t> input,
4040
const precoding_weight_matrix& precoding) const
4141
{
@@ -46,7 +46,7 @@ void channel_precoder_generic::apply_layer_map_and_precoding(re_buffer_writer<>&
4646
for (unsigned i_re = 0; i_re != nof_re; ++i_re) {
4747
for (unsigned i_port = 0; i_port != nof_ports; ++i_port) {
4848
span<const cf_t> port_weights = precoding.get_port_coefficients(i_port);
49-
span<cf_t> port_re = output.get_slice(i_port);
49+
span<cbf16_t> port_re = output.get_slice(i_port);
5050

5151
cf_t sum = to_cf(input[nof_layers * i_re]) * port_weights[0];
5252
for (unsigned i_layer = 1; i_layer != nof_layers; ++i_layer) {

lib/phy/generic_functions/precoding/channel_precoder_generic.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ class channel_precoder_generic : public channel_precoder_impl
2727

2828
public:
2929
// See interface for documentation.
30-
void apply_layer_map_and_precoding(re_buffer_writer<>& output,
30+
void apply_layer_map_and_precoding(re_buffer_writer<cbf16_t>& output,
3131
span<const ci8_t> input,
3232
const precoding_weight_matrix& precoding) const override;
3333
};

0 commit comments

Comments
 (0)