Skip to content

Commit 908b92f

Browse files
committed
2 vectors along kx at a time, single vector to load values in, use permutes to rearrange
1 parent 20d7e0f commit 908b92f

File tree

3 files changed

+51
-19
lines changed

3 files changed

+51
-19
lines changed

cpp/lib/Filter.cpp

Lines changed: 48 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include "Filter.hpp"
66

77
#include <eve/module/core.hpp>
8+
#include <immintrin.h>
89

910
namespace ahr {
1011
void HouLiFilter::operator()(Grid::View::C_XY view) const {
@@ -43,28 +44,61 @@ void HouLiFilterCached1D::operator()(Grid::View::C_XY view) const {
4344
});
4445
}
4546

46-
HouLiFilterCached1DVector::HouLiFilterCached1DVector(Grid const &grid)
47-
: HouLiFilterCached1D(grid), factors_x_duped(2 * grid.KX) {
48-
assert(grid.KY % C_WIDTH == 0);
49-
for (Dim kx = 0; kx < grid.KX; ++kx) {
50-
factors_x_duped[kx * 2] = factors_x[kx];
51-
factors_x_duped[kx * 2 + 1] = factors_x[kx];
52-
}
47+
HouLiFilterCached1DVector::HouLiFilterCached1DVector(Grid const &grid) : HouLiFilterCached1D(grid) {
48+
assert(grid.KX > R_WIDTH);
5349
}
5450

5551
void HouLiFilterCached1DVector::operator()(Grid::View::C_XY view) const {
52+
// This method applies the HouLi filter to view using vector instructions.
53+
// An array of contiguous complex numbers is simply treated as a real array
54+
// with double the length.
55+
// The code is vectorized along the kx (contiguous) dimension.
56+
// We load R_WIDTH factors_x and expand them into two registers, duplicating each element.
57+
// That way, each two consecutive real numbers (real and imaginary parts)
58+
// get multiplied with the same factor.
59+
// Finally, we read the complex numbers, multiply them by the kx- and ky-factors, and write them back.
60+
// Duplication demo:
61+
// vfx_full = {kx, kx+1, kx+2, kx+3, kx+4, kx+5, kx+6, kx+7}
62+
// into
63+
// lower_fx = {kx, kx, kx+1, kx+1, kx+2, kx+2, kx+3, kx+3}
64+
// upper_fx = {kx+4, kx+4, kx+5, kx+5, kx+6, kx+6, kx+7, kx+7}
65+
//
66+
// TODO multiple fy at once to better reuse fx
67+
68+
static_assert(view.stride(0) == 1); // contiguous in kx
69+
70+
// permutation indices
71+
static_assert(C_WIDTH == 4); // idx are hardcoded
72+
VIdx const lower_idx = {0, 0, 1, 1, 2, 2, 3, 3};
73+
VIdx const upper_idx = {4, 4, 5, 5, 6, 6, 7, 7};
74+
5675
for (int ky = 0; ky < grid.KY; ++ky) {
76+
// avoid std::vector dereference inside loop:
77+
// broadcast fy value into vector
5778
VReal vfy{factors_y[ky]};
79+
// prepare iteration address for fx
80+
Real const *fx_addr = factors_x.data();
5881

59-
// avoid vector dereference inside loop
60-
Real *fx_addr = factors_x_duped.data();
6182
int kx = 0;
62-
for (; kx <= grid.KX - C_WIDTH; kx += C_WIDTH, fx_addr += R_WIDTH) {
63-
Real *view_addr = (Real *)&view(kx, ky);
64-
VReal input{view_addr};
65-
VReal vfx{fx_addr};
83+
// Make sure the last element in the 2nd vector isn't past the end
84+
// Process 1 vector of factors at a time (2 vectors of complex)
85+
for (; kx <= grid.KX - R_WIDTH; kx += R_WIDTH, fx_addr += R_WIDTH) {
86+
// get the addresses of the two vectors we're reading from and writing back to
87+
auto *view_addr = (Real *)&view(kx, ky);
88+
auto *upper_view_addr = view_addr + R_WIDTH;
89+
VReal input_lower{view_addr};
90+
VReal input_upper{upper_view_addr};
91+
92+
// Load factors
93+
VReal vfx_full{fx_addr};
94+
95+
// Permute lower factors, multiply lower input
96+
VReal lower_fx = _mm512_permutex2var_pd(vfx_full, lower_idx, vfx_full);
97+
eve::store(input_lower * lower_fx * vfy, view_addr);
6698

67-
eve::store(input * vfx * vfy, view_addr);
99+
// Permute upper factors, multiply upper input
100+
VReal upper_fx = _mm512_permutex2var_pd(vfx_full, upper_idx, vfx_full);
101+
eve::store(input_upper * upper_fx * vfy, upper_view_addr);
68102
}
69103

70104
// tail

cpp/lib/Filter.hpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,9 @@ class HouLiFilterCached1DVector : HouLiFilterCached1D {
4141
void operator()(Grid::View::C_XY view) const;
4242

4343
private:
44+
using VIdx = eve::wide<long long>;
4445
using VReal = eve::wide<Real>;
4546
static auto constexpr R_WIDTH = VReal::size();
4647
static auto constexpr C_WIDTH = VReal::size() / 2;
47-
48-
/// A pre-expanded vector of 2d factors
49-
std::vector<Real> factors_x_duped;
5048
};
51-
5249
} // namespace ahr

cpp/test/filter.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "Filter.hpp"
22
#include "grid.hpp"
33

4+
#include "debug.hpp"
45
#include "util.hpp"
56

67
#include <gtest/gtest.h>
@@ -9,7 +10,7 @@ namespace ahr {
910

1011
template <typename TestedFilter> class TestFilter : public ::testing::Test {
1112
protected:
12-
Grid grid{5, 9, 16};
13+
Grid grid{5, 32, 32};
1314
HouLiFilter filter{grid};
1415
TestedFilter filter_t{grid};
1516
};

0 commit comments

Comments
 (0)