Skip to content

Commit c4cc9a5

Browse files
committed
Batching the zs avoids pipeline stalls when using the broadcast
1 parent b1bdbe1 commit c4cc9a5

File tree

1 file changed

+12
-10
lines changed

1 file changed

+12
-10
lines changed

src/spreadinterp.cpp

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -722,16 +722,18 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
722722
static constexpr auto horner_coeffs = get_horner_coeffs_200<FLT, w>();
723723
alignas(alignment) static constexpr auto padded_coeffs =
724724
pad_2D_array_with_zeros<FLT, nc, w, padded_ns>(horner_coeffs);
725-
alignas(alignment) const std::array<batch_t, nc - 1> pow_z =
726-
[](const FLT z) constexpr noexcept {
727-
std::array<batch_t, nc - 1> zs_v{};
728-
auto sz = z;
729-
for (uint8_t i = 0; i < nc - 1; ++i) {
730-
zs_v[i] = batch_t(sz);
731-
sz *= z;
732-
}
733-
return zs_v;
734-
}(z);
725+
const std::array<batch_t, nc - 1> pow_z = [](const FLT z) constexpr noexcept {
726+
std::array<FLT, nc - 1> zs{};
727+
std::array<batch_t, nc - 1> zs_v{};
728+
zs[0] = z;
729+
for (uint8_t i = 1; i < nc - 1; ++i) {
730+
zs[i] = zs[i - 1] * z;
731+
}
732+
for (uint8_t i = 0; i < nc - 1; ++i) {
733+
zs_v[i] = batch_t::broadcast(zs[i]);
734+
}
735+
return zs_v;
736+
}(z);
735737
for (uint8_t i = 0; i < w; i += avx_size) {
736738
auto k = batch_t::load_aligned(padded_coeffs[0].data() + i);
737739
for (uint8_t j = 1; j < nc; ++j) {

0 commit comments

Comments
 (0)