
Commit d04ffcf

cleanup a bit, a bit slower
1 parent c9fded5 · commit d04ffcf

File tree

1 file changed: +37 -53 lines

src/spreadinterp.cpp

Lines changed: 37 additions & 53 deletions
@@ -794,67 +794,51 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
   if constexpr (use_ker_sym) {
     static constexpr uint8_t tail = w % simd_size;
     static constexpr uint8_t if_odd_degree = ((nc + 1) % 2);
-    static const simd_type zerov(0.0);
+    static constexpr uint8_t offset_start = tail ? w - tail : w - simd_size;
+    static constexpr uint8_t end_idx = (w + (tail > 0)) / 2;
     const simd_type zv(z);
     const simd_type z2v = zv * zv;

-    // no xsimd::shuffle needed if tail is zero
-    if constexpr (tail) {
-      // some xsimd constant for shuffle
-      static constexpr auto shuffle_batch =
-          xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t,
-                                     shuffle_index<tail>>();
-
-      // process simd vecs
-      simd_type k_odd, k_even, k_prev, k_sym = zerov;
-      for (uint8_t i = 0, offset = w - tail; i < (w + 1) / 2;
-           i += simd_size, offset -= simd_size) {
-        k_odd = if_odd_degree ? simd_type::load_aligned(padded_coeffs[0].data() + i)
-                              : zerov;
-        k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i);
-        for (uint8_t j = 1 + if_odd_degree; j < nc; j += 2) {
-          const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i);
-          k_odd = xsimd::fma(k_odd, z2v, cji_odd);
-          const auto cji_even =
-              simd_type::load_aligned(padded_coeffs[j + 1].data() + i);
-          k_even = xsimd::fma(k_even, z2v, cji_even);
+    // some xsimd constant for shuffle or reverse
+    static constexpr auto shuffle_batch = []() constexpr noexcept {
+      if constexpr (tail) {
+        return xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t,
+                                          shuffle_index<tail>>();
+      } else {
+        return xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t,
+                                          reverse_index<simd_size>>();
+      }
+    }();
+
+    // process simd vecs
+    simd_type k_odd, k_even, k_prev, k_sym{0};
+    for (uint8_t i = 0, offset = offset_start; i < end_idx;
+         i += simd_size, offset -= simd_size) {
+      k_odd = [i]() constexpr noexcept {
+        if constexpr (if_odd_degree) {
+          return simd_type::load_aligned(padded_coeffs[0].data() + i);
+        } else {
+          return simd_type{0};
         }
-        // left part
-        xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i);
-        // right part symmetric to the left part
-        if (offset >= (w + 1) / 2) {
+      }();
+      k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i);
+      for (uint8_t j = 1 + if_odd_degree; j < nc; j += 2) {
+        const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i);
+        k_odd = xsimd::fma(k_odd, z2v, cji_odd);
+        const auto cji_even = simd_type::load_aligned(padded_coeffs[j + 1].data() + i);
+        k_even = xsimd::fma(k_even, z2v, cji_even);
+      }
+      // left part
+      xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i);
+      // right part symmetric to the left part
+      if (offset >= end_idx) {
+        if constexpr (tail) {
           // to use an aligned store, we need to shuffle the previous k_sym and the current k_sym
           k_prev = k_sym;
           k_sym = xsimd::fnma(k_odd, zv, k_even);
           xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset);
-        }
-      }
-    } else {
-      // xsimd constants for reverse
-      static constexpr auto reverse_batch =
-          xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t,
-                                     reverse_index<simd_size>>();
-
-      // process simd vecs
-      for (uint8_t i = 0, offset = w - simd_size; i < w / 2;
-           i += simd_size, offset -= simd_size) {
-        auto k_odd = if_odd_degree
-                         ? simd_type::load_aligned(padded_coeffs[0].data() + i)
-                         : zerov;
-        auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i);
-        for (uint8_t j = 1 + if_odd_degree; j < nc; j += 2) {
-          const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i);
-          k_odd = xsimd::fma(k_odd, z2v, cji_odd);
-          const auto cji_even =
-              simd_type::load_aligned(padded_coeffs[j + 1].data() + i);
-          k_even = xsimd::fma(k_even, z2v, cji_even);
-        }
-        // left part
-        xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i);
-        // right part symmetric to the left part
-        if (offset >= w / 2) {
-          // reverse the order for symmetric part
-          xsimd::swizzle(xsimd::fnma(k_odd, zv, k_even), reverse_batch)
+        } else {
+          xsimd::swizzle(xsimd::fnma(k_odd, zv, k_even), shuffle_batch)
               .store_aligned(ker + offset);
         }
       }
0 commit comments
