srsran
diff --git a/‎include/srsran/phy/generic_functions/precoding/channel_precoder.h‎
Lines changed: 1 addition & 1 deletion b/‎include/srsran/phy/generic_functions/precoding/channel_precoder.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/srsran/phy/support/re_buffer.h‎
Lines changed: 67 additions & 1 deletion b/‎include/srsran/phy/support/re_buffer.h‎
Lines changed: 67 additions & 1 deletion
diff --git a/‎include/srsran/phy/support/resource_grid_writer.h‎
Lines changed: 26 additions & 1 deletion b/‎include/srsran/phy/support/resource_grid_writer.h‎
Lines changed: 26 additions & 1 deletion
diff --git a/‎lib/phy/generic_functions/precoding/channel_precoder_avx2.cpp‎
Lines changed: 28 additions & 11 deletions b/‎lib/phy/generic_functions/precoding/channel_precoder_avx2.cpp‎
Lines changed: 28 additions & 11 deletions
diff --git a/‎lib/phy/generic_functions/precoding/channel_precoder_avx2.h‎
Lines changed: 1 addition & 1 deletion b/‎lib/phy/generic_functions/precoding/channel_precoder_avx2.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎lib/phy/generic_functions/precoding/channel_precoder_avx512.cpp‎
Lines changed: 36 additions & 13 deletions b/‎lib/phy/generic_functions/precoding/channel_precoder_avx512.cpp‎
Lines changed: 36 additions & 13 deletions
diff --git a/‎lib/phy/generic_functions/precoding/channel_precoder_avx512.h‎
Lines changed: 1 addition & 1 deletion b/‎lib/phy/generic_functions/precoding/channel_precoder_avx512.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎lib/phy/generic_functions/precoding/channel_precoder_generic.cpp‎
Lines changed: 2 additions & 2 deletions b/‎lib/phy/generic_functions/precoding/channel_precoder_generic.cpp‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎lib/phy/generic_functions/precoding/channel_precoder_generic.h‎
Lines changed: 1 addition & 1 deletion b/‎lib/phy/generic_functions/precoding/channel_precoder_generic.h‎
Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ class channel_precoder
   /// of RE per layer of the input buffer.
   /// \remark An assertion is triggered if the precoding matrix dimensions are not consistent with input buffer size and
   /// the number of antenna ports of the output buffer.
-  virtual void apply_layer_map_and_precoding(re_buffer_writer<>&            output,
+  virtual void apply_layer_map_and_precoding(re_buffer_writer<cbf16_t>&     output,
                                              span<const ci8_t>              input,
                                              const precoding_weight_matrix& precoding) const = 0;
 };
 
@@ -290,7 +290,6 @@ class modular_re_buffer_reader : public re_buffer_reader<T>
   /// \param[in] nof_slices Number of slices.
   /// \param[in] nof_re     Number of resource elements.
   /// \remark An assertion is triggered if the number of slices exceeds \ref max_nof_slices.
-  /// \remark An assertion is triggered if the number of resource elements exceeds \ref max_nof_re.
   void resize(unsigned nof_slices_, unsigned nof_re_)
   {
     nof_slices = nof_slices_;
@@ -344,4 +343,71 @@ class modular_re_buffer_reader : public re_buffer_reader<T>
   std::vector<span<const T>> data;
 };
 
+/// \brief Implements a modular resource element buffer writer.
+///
+/// In this implementation, each slice is a view to an external block of contiguous REs that must be loaded with the
+/// \ref set_slice method.
+///
+/// \tparam MaxNofSlices Maximum number of slices that the buffer can hold.
+/// \tparam T            Resource element type.
+template <unsigned MaxNofSlices, typename T = cf_t>
+class modular_re_buffer_writer : public re_buffer_writer<T>
+{
+public:
+  /// \brief Resizes the buffer view to a desired number of RE and slices.
+  ///
+  /// The views of all the slices within the new number of slices are emptied. They must be loaded again with
+  /// \ref set_slice before being accessed through \ref get_slice.
+  ///
+  /// \param[in] nof_slices_ Number of slices.
+  /// \param[in] nof_re_     Number of resource elements.
+  /// \remark An assertion is triggered if the number of slices exceeds \c MaxNofSlices.
+  void resize(unsigned nof_slices_, unsigned nof_re_)
+  {
+    // Validate the new dimensions before modifying the buffer state.
+    srsran_assert(nof_slices_ <= data.size(),
+                  "The number of slices (i.e., {}) exceeds the maximum (i.e., {}).",
+                  nof_slices_,
+                  data.size());
+    nof_slices = nof_slices_;
+    nof_re     = nof_re_;
+
+    // Empty all slices.
+    std::fill_n(data.begin(), nof_slices, span<T>());
+  }
+
+  /// \brief Sets the view for a given slice.
+  /// \param[in] i_slice Slice identifier.
+  /// \param[in] view    Slice view.
+  /// \remark An assertion is triggered if the slice identifier is not lower than the current number of slices.
+  /// \remark An assertion is triggered if the view size is not equal to the number of resource elements.
+  void set_slice(unsigned i_slice, span<T> view)
+  {
+    // Guard the access to the internal storage, a slice outside the current dimensions cannot be read back.
+    srsran_assert(i_slice < nof_slices,
+                  "The slice index (i.e., {}) exceeds the number of slices (i.e., {}).",
+                  i_slice,
+                  nof_slices);
+    srsran_assert(view.size() == nof_re,
+                  "The view size (i.e., {}) must be equal to the number of resource elements (i.e., {}).",
+                  view.size(),
+                  nof_re);
+    data[i_slice] = view;
+  }
+
+  // See interface for documentation.
+  unsigned get_nof_slices() const override { return nof_slices; }
+
+  // See interface for documentation.
+  unsigned get_nof_re() const override { return nof_re; }
+
+  // See interface for documentation.
+  span<T> get_slice(unsigned i_slice) override
+  {
+    srsran_assert(i_slice < nof_slices,
+                  "The slice index (i.e., {}) exceeds the number of slices (i.e., {}).",
+                  i_slice,
+                  nof_slices);
+    // A slice that has not been loaded since the last resize is empty and cannot be written.
+    srsran_assert(!data[i_slice].empty(), "Data for slice {} is empty.", i_slice);
+    return data[i_slice];
+  }
+
+private:
+  /// Current number of slices. It is zero until \ref resize is called.
+  unsigned nof_slices = 0;
+  /// Current number of resource elements. It is zero until \ref resize is called.
+  unsigned nof_re = 0;
+
+  /// Internal data storage. It contains one view per slice.
+  std::array<span<T>, MaxNofSlices> data;
+};
+
 } // namespace srsran
@@ -37,7 +37,7 @@ class resource_grid_writer : public resource_grid_base
   /// \param[in] mask    Bitset denoting the subcarriers to be written (if \c true), starting from \c k_init.
   /// \param[in] symbols Symbols to be written into the resource grid.
   /// \return A view to the unused entries of \c symbols.
-  /// \note The number of elements of \c mask shall be equal to or greater than the resource grid number of subcarriers.
+  /// \note The number of elements of \c mask shall be equal to or lower than the resource grid number of subcarriers.
   /// \note The number of elements of \c symbols shall be equal to or greater than the number of true elements in
   /// \c mask.
   virtual span<const cf_t> put(unsigned                            port,
@@ -46,6 +46,24 @@ class resource_grid_writer : public resource_grid_base
                                const bounded_bitset<NRE * MAX_RB>& mask,
                                span<const cf_t>                    symbols) = 0;
 
+  /// \brief Puts a number of resource elements in the resource grid at the given port and symbol using a bounded bitset
+  /// to indicate which subcarriers are allocated and which are not.
+  ///
+  /// This overload takes the symbols as \c cbf16_t values.
+  ///
+  /// \param[in] port    Port index.
+  /// \param[in] l       Symbol index.
+  /// \param[in] k_init  Initial subcarrier index.
+  /// \param[in] mask    Bitset denoting the subcarriers to be written (if \c true), starting from \c k_init.
+  /// \param[in] symbols Symbols to be written into the resource grid.
+  /// \return A view to the unused entries of \c symbols.
+  /// \note The number of elements of \c mask shall be equal to or lower than the resource grid number of subcarriers.
+  /// \note The number of elements of \c symbols shall be equal to or greater than the number of true elements in
+  /// \c mask.
+  virtual span<const cbf16_t> put(unsigned                            port,
+                                  unsigned                            l,
+                                  unsigned                            k_init,
+                                  const bounded_bitset<NRE * MAX_RB>& mask,
+                                  span<const cbf16_t>                 symbols) = 0;
+
   /// \brief Puts a consecutive number of resource elements for the given \c port and symbol \c l, starting at \c
   /// k_init.
   ///
@@ -68,6 +86,13 @@ class resource_grid_writer : public resource_grid_base
   /// \note The RE positions given \c k_init, the number of elements in \c symbols and the \c stride shall be within the
   /// resource grid number of subcarriers.
   virtual void put(unsigned port, unsigned l, unsigned k_init, unsigned stride, span<const cf_t> symbols) = 0;
+
+  /// \brief Gets a read-write view of an OFDM symbol for a given port.
+  ///
+  /// \param[in] port Port index.
+  /// \param[in] l    OFDM symbol index.
+  /// \return A writable view of \c cbf16_t entries for the given port and OFDM symbol.
+  virtual span<cbf16_t> get_view(unsigned port, unsigned l) = 0;
 };
 
 } // namespace srsran
@@ -181,7 +181,24 @@ static inline void layer4_map_and_ci8_to_cf(simd_cf_interleaved& out_l0,
   from_ci8_to_cf(out_l0, out_l1, out_l2, out_l3, tmp);
 }
 
-void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<>&            output,
+/// \brief Converts packed single-precision values into packed 16-bit brain floating point values.
+///
+/// Each of the eight 32-bit lanes is rounded to the nearest even value representable with its 16 most significant
+/// bits, and only those 16 bits are kept.
+///
+/// \param[in] in Register containing eight packed 32-bit floating point values.
+/// \return A 128-bit register with the eight converted 16-bit values, in the same order as the input lanes.
+/// \note NaN and infinity receive no special treatment, the rounding bias is added with a plain integer addition.
+static inline __m128i ps_to_cbf16(simd_cf_interleaved in)
+{
+  const __m256i bias = _mm256_set1_epi32(0x7fff);
+  const __m256i one  = _mm256_set1_epi32(0x1);
+
+  // Reinterpret the floating point lanes as 32-bit integers, no value conversion takes place.
+  __m256i a_i32 = _mm256_castps_si256(in);
+
+  // Round to nearest even: add 0x7fff plus the least significant bit of the part that is kept.
+  a_i32 = _mm256_add_epi32(a_i32, _mm256_add_epi32(bias, _mm256_and_si256(_mm256_srli_epi32(a_i32, 16), one)));
+
+  // Shift right 16 bits. The arithmetic shift leaves every lane within the signed 16-bit range.
+  a_i32 = _mm256_srai_epi32(a_i32, 16);
+
+  // Narrow the 32-bit lanes to 16 bits. The signed saturation of the pack never alters a value because of the
+  // arithmetic shift above.
+  return _mm_packs_epi32(_mm256_extractf128_si256(a_i32, 0), _mm256_extractf128_si256(a_i32, 1));
+}
+
+void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<cbf16_t>&     output,
                                                           span<const ci8_t>              input,
                                                           const precoding_weight_matrix& precoding) const
 {
@@ -193,7 +210,7 @@ void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<>&
   simd_cf_t weights[precoding_constants::MAX_NOF_PORTS][precoding_constants::MAX_NOF_LAYERS];
 
   // Views to store the precoded symbols.
-  span<cf_t> outputs[precoding_constants::MAX_NOF_PORTS];
+  span<cbf16_t> outputs[precoding_constants::MAX_NOF_PORTS];
 
   for (unsigned i_port = 0; i_port != nof_ports; ++i_port) {
     span<const cf_t> port_coeff = precoding.get_port_coefficients(i_port);
@@ -221,10 +238,10 @@ void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<>&
         simd_cf_interleaved result3 = infp_3 * weights[i_port][0];
 
         // Store.
-        _mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result0);
-        _mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + AVX2_CF_SIZE]), result1);
-        _mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + (2 * AVX2_CF_SIZE)]), result2);
-        _mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + (3 * AVX2_CF_SIZE)]), result3);
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re + 0 * AVX2_CF_SIZE]), ps_to_cbf16(result0));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re + 1 * AVX2_CF_SIZE]), ps_to_cbf16(result1));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re + 2 * AVX2_CF_SIZE]), ps_to_cbf16(result2));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re + 3 * AVX2_CF_SIZE]), ps_to_cbf16(result3));
       }
     }
   }
@@ -245,8 +262,8 @@ void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<>&
         simd_cf_interleaved result1 = infp1_l0 * weights[i_port][0] + infp1_l1 * weights[i_port][1];
 
         // Store.
-        _mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result0);
-        _mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + AVX2_CF_SIZE]), result1);
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re]), ps_to_cbf16(result0));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re + AVX2_CF_SIZE]), ps_to_cbf16(result1));
       }
     }
   }
@@ -270,7 +287,7 @@ void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<>&
             infp_0 * weights[i_port][0] + infp_1 * weights[i_port][1] + infp_2 * weights[i_port][2];
 
         // Store.
-        _mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result);
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re]), ps_to_cbf16(result));
       }
     }
   }
@@ -291,7 +308,7 @@ void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<>&
                                      infp_2 * weights[i_port][2] + infp_3 * weights[i_port][3];
 
         // Store.
-        _mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result);
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re]), ps_to_cbf16(result));
       }
     }
   }
@@ -300,7 +317,7 @@ void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<>&
   for (; i_re != nof_re; ++i_re) {
     for (unsigned i_port = 0; i_port != nof_ports; ++i_port) {
       span<const cf_t> port_weights = precoding.get_port_coefficients(i_port);
-      span<cf_t>       port_re      = output.get_slice(i_port);
+      span<cbf16_t>    port_re      = output.get_slice(i_port);
 
       cf_t sum = to_cf(input[nof_layers * i_re]) * port_weights[0];
       for (unsigned i_layer = 1; i_layer != nof_layers; ++i_layer) {
 
@@ -27,7 +27,7 @@ class channel_precoder_avx2 : public channel_precoder_impl
                             span<const cf_t>          port_weights) const override;
 
   // See interface for documentation.
-  void apply_layer_map_and_precoding(re_buffer_writer<>&            output,
+  void apply_layer_map_and_precoding(re_buffer_writer<cbf16_t>&     output,
                                      span<const ci8_t>              input,
                                      const precoding_weight_matrix& precoding) const override;
 };
 
@@ -203,6 +203,27 @@ static inline void layer4_map_and_ci8_to_cf(simd_cf_interleaved& out0,
   from_ci8_to_cf(out0, out1, out2, out3, tmp);
 }
 
+/// \brief Converts packed single-precision values into packed 16-bit brain floating point values.
+///
+/// If the compiler targets AVX512-BF16, the conversion uses the dedicated instruction. Otherwise, each of the sixteen
+/// 32-bit lanes is rounded to the nearest even value representable with its 16 most significant bits, and only those
+/// 16 bits are kept.
+///
+/// \param[in] in Register containing sixteen packed 32-bit floating point values.
+/// \return A 256-bit register with the sixteen converted 16-bit values, in the same order as the input lanes.
+/// \note In the fallback path, NaN and infinity receive no special treatment, the rounding bias is added with a plain
+/// integer addition.
+static inline __m256i ps_to_cbf16(simd_cf_interleaved in)
+{
+#if defined(__AVX512BF16__)
+  // Native conversion. The cast only reinterprets the resultant register as an integer vector.
+  return (__m256i)_mm512_cvtneps_pbh(in);
+#else  // defined(__AVX512BF16__)
+  const __m512i bias = _mm512_set1_epi32(0x7fff);
+  const __m512i one  = _mm512_set1_epi32(0x1);
+
+  // Reinterpret the floating point lanes as 32-bit integers, no value conversion takes place.
+  __m512i a_i32 = _mm512_castps_si512(in);
+
+  // Round to nearest even: add 0x7fff plus the least significant bit of the part that is kept.
+  a_i32 = _mm512_add_epi32(a_i32, _mm512_add_epi32(bias, _mm512_and_si512(_mm512_srli_epi32(a_i32, 16), one)));
+
+  // Shift right 16 bits.
+  a_i32 = _mm512_srli_epi32(a_i32, 16);
+
+  // Narrow each 32-bit lane to its 16 least significant bits.
+  return _mm512_cvtepi32_epi16(a_i32);
+#endif // defined(__AVX512BF16__)
+}
+
 void channel_precoder_avx512::apply_precoding_port(span<cf_t>                port_re,
                                                    const re_buffer_reader<>& input_re,
                                                    span<const cf_t>          port_weights) const
@@ -254,7 +275,7 @@ void channel_precoder_avx512::apply_precoding_port(span<cf_t>                por
   }
 }
 
-void channel_precoder_avx512::apply_layer_map_and_precoding(re_buffer_writer<>&            output,
+void channel_precoder_avx512::apply_layer_map_and_precoding(re_buffer_writer<cbf16_t>&     output,
                                                             span<const ci8_t>              input,
                                                             const precoding_weight_matrix& precoding) const
 {
@@ -263,8 +284,8 @@ void channel_precoder_avx512::apply_layer_map_and_precoding(re_buffer_writer<>&
   unsigned nof_ports  = precoding.get_nof_ports();
   unsigned i_re       = 0;
 
-  simd_cf_t  weights[4][4];
-  span<cf_t> outputs[4];
+  simd_cf_t     weights[4][4];
+  span<cbf16_t> outputs[4];
   for (unsigned i_port = 0; i_port != nof_ports; ++i_port) {
     span<const cf_t> port_coeff = precoding.get_port_coefficients(i_port);
     outputs[i_port]             = output.get_slice(i_port);
@@ -286,10 +307,10 @@ void channel_precoder_avx512::apply_layer_map_and_precoding(re_buffer_writer<>&
         simd_cf_interleaved result2 = infp_2 * weights[i_port][0];
         simd_cf_interleaved result3 = infp_3 * weights[i_port][0];
 
-        _mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result0);
-        _mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + 8]), result1);
-        _mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + 16]), result2);
-        _mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + 24]), result3);
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re + 0]), ps_to_cbf16(result0));
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re + 8]), ps_to_cbf16(result1));
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re + 16]), ps_to_cbf16(result2));
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re + 24]), ps_to_cbf16(result3));
       }
     }
   }
@@ -306,8 +327,8 @@ void channel_precoder_avx512::apply_layer_map_and_precoding(re_buffer_writer<>&
         simd_cf_interleaved result0 = infp_0 * weights[i_port][0] + infp_2 * weights[i_port][1];
         simd_cf_interleaved result1 = infp_1 * weights[i_port][0] + infp_3 * weights[i_port][1];
 
-        _mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result0);
-        _mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + AVX512_CF_SIZE]), result1);
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re]), ps_to_cbf16(result0));
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re + AVX512_CF_SIZE]), ps_to_cbf16(result1));
       }
     }
   }
@@ -327,7 +348,7 @@ void channel_precoder_avx512::apply_layer_map_and_precoding(re_buffer_writer<>&
       for (unsigned i_port = 0; i_port != nof_ports; ++i_port) {
         simd_cf_interleaved result =
             infp_0 * weights[i_port][0] + infp_1 * weights[i_port][1] + infp_2 * weights[i_port][2];
-        _mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result);
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re]), ps_to_cbf16(result));
       }
     }
   }
@@ -343,21 +364,23 @@ void channel_precoder_avx512::apply_layer_map_and_precoding(re_buffer_writer<>&
       for (unsigned i_port = 0; i_port != nof_ports; ++i_port) {
         simd_cf_interleaved result = infp_0 * weights[i_port][0] + infp_1 * weights[i_port][1] +
                                      infp_2 * weights[i_port][2] + infp_3 * weights[i_port][3];
-        _mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result);
+
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re]), ps_to_cbf16(result));
       }
     }
   }
 
+  // Generic implementation.
   for (; i_re != nof_re; ++i_re) {
     for (unsigned i_port = 0; i_port != nof_ports; ++i_port) {
       span<const cf_t> port_weights = precoding.get_port_coefficients(i_port);
-      span<cf_t>       port_re      = output.get_slice(i_port);
+      span<cbf16_t>    port_re      = output.get_slice(i_port);
 
       cf_t sum = to_cf(input[nof_layers * i_re]) * port_weights[0];
       for (unsigned i_layer = 1; i_layer != nof_layers; ++i_layer) {
         sum += to_cf(input[nof_layers * i_re + i_layer]) * port_weights[i_layer];
       }
-      port_re[i_re] = sum;
+      port_re[i_re] = to_cbf16(sum);
     }
   }
 }
@@ -27,7 +27,7 @@ class channel_precoder_avx512 : public channel_precoder_impl
 
 public:
   // See interface for documentation.
-  void apply_layer_map_and_precoding(re_buffer_writer<>&            output,
+  void apply_layer_map_and_precoding(re_buffer_writer<cbf16_t>&     output,
                                      span<const ci8_t>              input,
                                      const precoding_weight_matrix& precoding) const override;
 };
 
@@ -35,7 +35,7 @@ void channel_precoder_generic::apply_precoding_port(span<cf_t>                po
   }
 }
 
-void channel_precoder_generic::apply_layer_map_and_precoding(re_buffer_writer<>&            output,
+void channel_precoder_generic::apply_layer_map_and_precoding(re_buffer_writer<cbf16_t>&     output,
                                                              span<const ci8_t>              input,
                                                              const precoding_weight_matrix& precoding) const
 {
@@ -46,7 +46,7 @@ void channel_precoder_generic::apply_layer_map_and_precoding(re_buffer_writer<>&
   for (unsigned i_re = 0; i_re != nof_re; ++i_re) {
     for (unsigned i_port = 0; i_port != nof_ports; ++i_port) {
       span<const cf_t> port_weights = precoding.get_port_coefficients(i_port);
-      span<cf_t>       port_re      = output.get_slice(i_port);
+      span<cbf16_t>    port_re      = output.get_slice(i_port);
 
       cf_t sum = to_cf(input[nof_layers * i_re]) * port_weights[0];
       for (unsigned i_layer = 1; i_layer != nof_layers; ++i_layer) {
 
@@ -27,7 +27,7 @@ class channel_precoder_generic : public channel_precoder_impl
 
 public:
   // See interface for documentation.
-  void apply_layer_map_and_precoding(re_buffer_writer<>&            output,
+  void apply_layer_map_and_precoding(re_buffer_writer<cbf16_t>&     output,
                                      span<const ci8_t>              input,
                                      const precoding_weight_matrix& precoding) const override;
 };
Original file line number	Diff line number	Diff line change
`@@ -181,7 +181,24 @@ static inline void layer4_map_and_ci8_to_cf(simd_cf_interleaved& out_l0,`
`181`	`181`	`from_ci8_to_cf(out_l0, out_l1, out_l2, out_l3, tmp);`
`182`	`182`	`}`
`183`	`183`
`184`		`-void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<>& output,`
	`184`	`+inline __m128i ps_to_cbf16(simd_cf_interleaved in)`
	`185`	`+{`
	`186`	`+ const __m256i bias = _mm256_set1_epi32(0x7fff);`
	`187`	`+ const __m256i one = _mm256_set1_epi32(0x1);`
	`188`	`+`
	`189`	`+ __m256i a_i32 = _mm256_castps_si256(in);`
	`190`	`+`
	`191`	`+ // Round to nearest even.`
	`192`	`+ a_i32 = _mm256_add_epi32(a_i32, _mm256_add_epi32(bias, _mm256_and_si256(_mm256_srli_epi32(a_i32, 16), one)));`
	`193`	`+`
	`194`	`+ // Shift right 16 bits.`
	`195`	`+ a_i32 = _mm256_srai_epi32(a_i32, 16);`
	`196`	`+`
	`197`	`+ // Pack both parts in 32-bit registers.`
	`198`	`+ return _mm_packs_epi32(_mm256_extractf128_si256(a_i32, 0), _mm256_extractf128_si256(a_i32, 1));`
	`199`	`+}`
	`200`	`+`
	`201`	`+void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<cbf16_t>& output,`
`185`	`202`	`span<const ci8_t> input,`
`186`	`203`	`const precoding_weight_matrix& precoding) const`
`187`	`204`	`{`
`@@ -193,7 +210,7 @@ void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<>&`
`193`	`210`	`simd_cf_t weights[precoding_constants::MAX_NOF_PORTS][precoding_constants::MAX_NOF_LAYERS];`
`194`	`211`
`195`	`212`	`// Views to store the precoded symbols.`
`196`		`- span<cf_t> outputs[precoding_constants::MAX_NOF_PORTS];`
	`213`	`+ span<cbf16_t> outputs[precoding_constants::MAX_NOF_PORTS];`
`197`	`214`
`198`	`215`	`for (unsigned i_port = 0; i_port != nof_ports; ++i_port) {`
`199`	`216`	`span<const cf_t> port_coeff = precoding.get_port_coefficients(i_port);`
`@@ -221,10 +238,10 @@ void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<>&`
`221`	`238`	`simd_cf_interleaved result3 = infp_3 * weights[i_port][0];`
`222`	`239`
`223`	`240`	`// Store.`
`224`		`- _mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result0);`
`225`		`- _mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + AVX2_CF_SIZE]), result1);`
`226`		`- _mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + (2 * AVX2_CF_SIZE)]), result2);`
`227`		`- _mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + (3 * AVX2_CF_SIZE)]), result3);`
	`241`	`+ _mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re + 0 * AVX2_CF_SIZE]), ps_to_cbf16(result0));`
	`242`	`+ _mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re + 1 * AVX2_CF_SIZE]), ps_to_cbf16(result1));`
	`243`	`+ _mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re + 2 * AVX2_CF_SIZE]), ps_to_cbf16(result2));`
	`244`	`+ _mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re + 3 * AVX2_CF_SIZE]), ps_to_cbf16(result3));`
`228`	`245`	`}`
`229`	`246`	`}`
`230`	`247`	`}`
`@@ -245,8 +262,8 @@ void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<>&`
`245`	`262`	`simd_cf_interleaved result1 = infp1_l0 * weights[i_port][0] + infp1_l1 * weights[i_port][1];`
`246`	`263`
`247`	`264`	`// Store.`
`248`		`- _mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result0);`
`249`		`- _mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + AVX2_CF_SIZE]), result1);`
	`265`	`+ _mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re]), ps_to_cbf16(result0));`
	`266`	`+ _mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re + AVX2_CF_SIZE]), ps_to_cbf16(result1));`
`250`	`267`	`}`
`251`	`268`	`}`
`252`	`269`	`}`
`@@ -270,7 +287,7 @@ void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<>&`
`270`	`287`	`infp_0 * weights[i_port][0] + infp_1 * weights[i_port][1] + infp_2 * weights[i_port][2];`
`271`	`288`
`272`	`289`	`// Store.`
`273`		`- _mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result);`
	`290`	`+ _mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re]), ps_to_cbf16(result));`
`274`	`291`	`}`
`275`	`292`	`}`
`276`	`293`	`}`
`@@ -291,7 +308,7 @@ void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<>&`
`291`	`308`	`infp_2 * weights[i_port][2] + infp_3 * weights[i_port][3];`
`292`	`309`
`293`	`310`	`// Store.`
`294`		`- _mm256_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result);`
	`311`	`+ _mm_storeu_si128(reinterpret_cast<__m128i*>(&outputs[i_port][i_re]), ps_to_cbf16(result));`
`295`	`312`	`}`
`296`	`313`	`}`
`297`	`314`	`}`
`@@ -300,7 +317,7 @@ void channel_precoder_avx2::apply_layer_map_and_precoding(re_buffer_writer<>&`
`300`	`317`	`for (; i_re != nof_re; ++i_re) {`
`301`	`318`	`for (unsigned i_port = 0; i_port != nof_ports; ++i_port) {`
`302`	`319`	`span<const cf_t> port_weights = precoding.get_port_coefficients(i_port);`
`303`		`- span<cf_t> port_re = output.get_slice(i_port);`
	`320`	`+ span<cbf16_t> port_re = output.get_slice(i_port);`
`304`	`321`
`305`	`322`	`cf_t sum = to_cf(input[nof_layers * i_re]) * port_weights[0];`
`306`	`323`	`for (unsigned i_layer = 1; i_layer != nof_layers; ++i_layer) {`
Original file line number	Diff line number	Diff line change
`@@ -203,6 +203,27 @@ static inline void layer4_map_and_ci8_to_cf(simd_cf_interleaved& out0,`
`203`	`203`	`from_ci8_to_cf(out0, out1, out2, out3, tmp);`
`204`	`204`	`}`
`205`	`205`
	`206`	`+inline __m256i ps_to_cbf16(simd_cf_interleaved in)`
	`207`	`+{`
	`208`	`+#if __AVX512BF16__`
	`209`	`+ return (__m256i)_mm512_cvtneps_pbh(in);`
	`210`	`+#else // __AVX512BF16__`
	`211`	`+ const __m512i bias = _mm512_set1_epi32(0x7fff);`
	`212`	`+ const __m512i one = _mm512_set1_epi32(0x1);`
	`213`	`+`
	`214`	`+ __m512i a_i32 = _mm512_castps_si512(in);`
	`215`	`+`
	`216`	`+ // Round to nearest even.`
	`217`	`+ a_i32 = _mm512_add_epi32(a_i32, _mm512_add_epi32(bias, _mm512_and_si512(_mm512_srli_epi32(a_i32, 16), one)));`
	`218`	`+`
	`219`	`+ // Shift right 16 bits.`
	`220`	`+ a_i32 = _mm512_srli_epi32(a_i32, 16);`
	`221`	`+`
	`222`	`+ // Pack both parts in 32-bit registers.`
	`223`	`+ return _mm512_cvtepi32_epi16(a_i32);`
	`224`	`+#endif // __AVX512BF16__`
	`225`	`+}`
	`226`	`+`
`206`	`227`	`void channel_precoder_avx512::apply_precoding_port(span<cf_t> port_re,`
`207`	`228`	`const re_buffer_reader<>& input_re,`
`208`	`229`	`span<const cf_t> port_weights) const`
`@@ -254,7 +275,7 @@ void channel_precoder_avx512::apply_precoding_port(span<cf_t> por`
`254`	`275`	`}`
`255`	`276`	`}`
`256`	`277`
`257`		`-void channel_precoder_avx512::apply_layer_map_and_precoding(re_buffer_writer<>& output,`
	`278`	`+void channel_precoder_avx512::apply_layer_map_and_precoding(re_buffer_writer<cbf16_t>& output,`
`258`	`279`	`span<const ci8_t> input,`
`259`	`280`	`const precoding_weight_matrix& precoding) const`
`260`	`281`	`{`
`@@ -263,8 +284,8 @@ void channel_precoder_avx512::apply_layer_map_and_precoding(re_buffer_writer<>&`
`263`	`284`	`unsigned nof_ports = precoding.get_nof_ports();`
`264`	`285`	`unsigned i_re = 0;`
`265`	`286`
`266`		`- simd_cf_t weights[4][4];`
`267`		`- span<cf_t> outputs[4];`
	`287`	`+ simd_cf_t weights[4][4];`
	`288`	`+ span<cbf16_t> outputs[4];`
`268`	`289`	`for (unsigned i_port = 0; i_port != nof_ports; ++i_port) {`
`269`	`290`	`span<const cf_t> port_coeff = precoding.get_port_coefficients(i_port);`
`270`	`291`	`outputs[i_port] = output.get_slice(i_port);`
`@@ -286,10 +307,10 @@ void channel_precoder_avx512::apply_layer_map_and_precoding(re_buffer_writer<>&`
`286`	`307`	`simd_cf_interleaved result2 = infp_2 * weights[i_port][0];`
`287`	`308`	`simd_cf_interleaved result3 = infp_3 * weights[i_port][0];`
`288`	`309`
`289`		`- _mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result0);`
`290`		`- _mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + 8]), result1);`
`291`		`- _mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + 16]), result2);`
`292`		`- _mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + 24]), result3);`
	`310`	`+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re + 0]), ps_to_cbf16(result0));`
	`311`	`+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re + 8]), ps_to_cbf16(result1));`
	`312`	`+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re + 16]), ps_to_cbf16(result2));`
	`313`	`+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re + 24]), ps_to_cbf16(result3));`
`293`	`314`	`}`
`294`	`315`	`}`
`295`	`316`	`}`
`@@ -306,8 +327,8 @@ void channel_precoder_avx512::apply_layer_map_and_precoding(re_buffer_writer<>&`
`306`	`327`	`simd_cf_interleaved result0 = infp_0 * weights[i_port][0] + infp_2 * weights[i_port][1];`
`307`	`328`	`simd_cf_interleaved result1 = infp_1 * weights[i_port][0] + infp_3 * weights[i_port][1];`
`308`	`329`
`309`		`- _mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result0);`
`310`		`- _mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re + AVX512_CF_SIZE]), result1);`
	`330`	`+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re]), ps_to_cbf16(result0));`
	`331`	`+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re + AVX512_CF_SIZE]), ps_to_cbf16(result1));`
`311`	`332`	`}`
`312`	`333`	`}`
`313`	`334`	`}`
`@@ -327,7 +348,7 @@ void channel_precoder_avx512::apply_layer_map_and_precoding(re_buffer_writer<>&`
`327`	`348`	`for (unsigned i_port = 0; i_port != nof_ports; ++i_port) {`
`328`	`349`	`simd_cf_interleaved result =`
`329`	`350`	`infp_0 * weights[i_port][0] + infp_1 * weights[i_port][1] + infp_2 * weights[i_port][2];`
`330`		`- _mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result);`
	`351`	`+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re]), ps_to_cbf16(result));`
`331`	`352`	`}`
`332`	`353`	`}`
`333`	`354`	`}`
`@@ -343,21 +364,23 @@ void channel_precoder_avx512::apply_layer_map_and_precoding(re_buffer_writer<>&`
`343`	`364`	`for (unsigned i_port = 0; i_port != nof_ports; ++i_port) {`
`344`	`365`	`simd_cf_interleaved result = infp_0 * weights[i_port][0] + infp_1 * weights[i_port][1] +`
`345`	`366`	`infp_2 * weights[i_port][2] + infp_3 * weights[i_port][3];`
`346`		`- _mm512_storeu_ps(reinterpret_cast<float*>(&outputs[i_port][i_re]), result);`
	`367`	`+`
	`368`	`+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(&outputs[i_port][i_re]), ps_to_cbf16(result));`
`347`	`369`	`}`
`348`	`370`	`}`
`349`	`371`	`}`
`350`	`372`
	`373`	`+ // Generic implementation.`
`351`	`374`	`for (; i_re != nof_re; ++i_re) {`
`352`	`375`	`for (unsigned i_port = 0; i_port != nof_ports; ++i_port) {`
`353`	`376`	`span<const cf_t> port_weights = precoding.get_port_coefficients(i_port);`
`354`		`- span<cf_t> port_re = output.get_slice(i_port);`
	`377`	`+ span<cbf16_t> port_re = output.get_slice(i_port);`
`355`	`378`
`356`	`379`	`cf_t sum = to_cf(input[nof_layers * i_re]) * port_weights[0];`
`357`	`380`	`for (unsigned i_layer = 1; i_layer != nof_layers; ++i_layer) {`
`358`	`381`	`sum += to_cf(input[nof_layers * i_re + i_layer]) * port_weights[i_layer];`
`359`	`382`	`}`
`360`		`- port_re[i_re] = sum;`
	`383`	`+ port_re[i_re] = to_cbf16(sum);`
`361`	`384`	`}`
`362`	`385`	`}`
`363`	`386`	`}`
Original file line number	Diff line number	Diff line change
`@@ -35,7 +35,7 @@ void channel_precoder_generic::apply_precoding_port(span<cf_t> po`
`35`	`35`	`}`
`36`	`36`	`}`
`37`	`37`
`38`		`-void channel_precoder_generic::apply_layer_map_and_precoding(re_buffer_writer<>& output,`
	`38`	`+void channel_precoder_generic::apply_layer_map_and_precoding(re_buffer_writer<cbf16_t>& output,`
`39`	`39`	`span<const ci8_t> input,`
`40`	`40`	`const precoding_weight_matrix& precoding) const`
`41`	`41`	`{`
`@@ -46,7 +46,7 @@ void channel_precoder_generic::apply_layer_map_and_precoding(re_buffer_writer<>&`
`46`	`46`	`for (unsigned i_re = 0; i_re != nof_re; ++i_re) {`
`47`	`47`	`for (unsigned i_port = 0; i_port != nof_ports; ++i_port) {`
`48`	`48`	`span<const cf_t> port_weights = precoding.get_port_coefficients(i_port);`
`49`		`- span<cf_t> port_re = output.get_slice(i_port);`
	`49`	`+ span<cbf16_t> port_re = output.get_slice(i_port);`
`50`	`50`
`51`	`51`	`cf_t sum = to_cf(input[nof_layers * i_re]) * port_weights[0];`
`52`	`52`	`for (unsigned i_layer = 1; i_layer != nof_layers; ++i_layer) {`