Skip to content

Commit 3b91e0b

Browse files
committed
Extract duplication utility, allowing compilation without AVX-512
1 parent 908b92f commit 3b91e0b

File tree

2 files changed

+27
-7
lines changed

2 files changed

+27
-7
lines changed

cpp/lib/Filter.cpp

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -67,11 +67,6 @@ void HouLiFilterCached1DVector::operator()(Grid::View::C_XY view) const {
6767

6868
static_assert(view.stride(0) == 1); // contiguous in kx
6969

70-
// permutation indices
71-
static_assert(C_WIDTH == 4); // idx are hardcoded
72-
VIdx const lower_idx = {0, 0, 1, 1, 2, 2, 3, 3};
73-
VIdx const upper_idx = {4, 4, 5, 5, 6, 6, 7, 7};
74-
7570
for (int ky = 0; ky < grid.KY; ++ky) {
7671
// avoid std::vector dereference inside loop:
7772
// broadcast fy value into vector
@@ -93,11 +88,11 @@ void HouLiFilterCached1DVector::operator()(Grid::View::C_XY view) const {
9388
VReal vfx_full{fx_addr};
9489

9590
// Permute lower factors, multiply lower input
96-
VReal lower_fx = _mm512_permutex2var_pd(vfx_full, lower_idx, vfx_full);
91+
VReal lower_fx = duplicateLower(vfx_full);
9792
eve::store(input_lower * lower_fx * vfy, view_addr);
9893

9994
// Permute upper factors, multiply upper input
100-
VReal upper_fx = _mm512_permutex2var_pd(vfx_full, upper_idx, vfx_full);
95+
VReal upper_fx = duplicateUpper(vfx_full);
10196
eve::store(input_upper * upper_fx * vfy, upper_view_addr);
10297
}
10398

@@ -107,4 +102,26 @@ void HouLiFilterCached1DVector::operator()(Grid::View::C_XY view) const {
107102
}
108103
}
109104
}
105+
106+
#ifdef AVX512_ENABLED
107+
// AVX-512 variant (this definition sits in the AVX512_ENABLED branch).
// Returns a vector holding each of the four low lanes of src twice, in order:
// {s0, s0, s1, s1, s2, s2, s3, s3}.
HouLiFilterCached1DVector::VReal HouLiFilterCached1DVector::duplicateLower(VReal src) const {
108+
static_assert(R_WIDTH == 8); // the index list below is hardcoded for 8 lanes
109+
const VIdx lower_idx{0, 0, 1, 1, 2, 2, 3, 3}; // source lane for each output lane
110+
return _mm512_permutex2var_pd(src, lower_idx, src); // both table operands are src, so each index picks a lane of src
111+
}
112+
// AVX-512 variant (this definition sits in the AVX512_ENABLED branch).
// Returns a vector holding each of the four high lanes of src twice, in order:
// {s4, s4, s5, s5, s6, s6, s7, s7}.
HouLiFilterCached1DVector::VReal HouLiFilterCached1DVector::duplicateUpper(VReal src) const {
113+
static_assert(R_WIDTH == 8); // the index list below is hardcoded for 8 lanes
114+
const VIdx upper_idx{4, 4, 5, 5, 6, 6, 7, 7}; // source lane for each output lane
115+
return _mm512_permutex2var_pd(src, upper_idx, src); // both table operands are src, so each index picks a lane of src
116+
}
117+
#else
118+
// Fallback variant, compiled when AVX512_ENABLED is not defined; it requires 2-lane vectors.
// Returns {s0, s0}: the low lane of src in both output lanes.
HouLiFilterCached1DVector::VReal HouLiFilterCached1DVector::duplicateLower(VReal src) const {
119+
static_assert(R_WIDTH == 2); // the 128-bit unpack below only handles 2 lanes
120+
return _mm_unpacklo_pd(src, src); // takes the low lane of each operand -> {s0, s0}
121+
}
122+
// Fallback variant, compiled when AVX512_ENABLED is not defined; it requires 2-lane vectors.
// Returns {s1, s1}: the high lane of src in both output lanes.
HouLiFilterCached1DVector::VReal HouLiFilterCached1DVector::duplicateUpper(VReal src) const {
123+
static_assert(R_WIDTH == 2); // the 128-bit unpack below only handles 2 lanes
124+
return _mm_unpackhi_pd(src, src); // takes the high lane of each operand -> {s1, s1}
125+
}
126+
#endif
110127
} // namespace ahr

cpp/lib/Filter.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,5 +45,8 @@ class HouLiFilterCached1DVector : HouLiFilterCached1D {
4545
using VReal = eve::wide<Real>;
4646
static auto constexpr R_WIDTH = VReal::size();
4747
static auto constexpr C_WIDTH = VReal::size() / 2;
48+
49+
VReal duplicateLower(VReal src) const; ///< repeat each lane of the lower half of src twice, in place (s0, s0, s1, s1, ...)
50+
VReal duplicateUpper(VReal src) const; ///< repeat each lane of the upper half of src twice, in place (e.g. s4, s4, s5, s5, ... for 8 lanes)
4851
};
4952
} // namespace ahr

0 commit comments

Comments
 (0)