Skip to content

Commit ce0fb6a

Browse files
committed
Add tile along KY dimension, significant speed improvement (30-60%)
1 parent 3b91e0b commit ce0fb6a

File tree

2 files changed

+20
-10
lines changed

2 files changed

+20
-10
lines changed

cpp/lib/Filter.cpp

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,15 @@ void HouLiFilterCached1D::operator()(Grid::View::C_XY view) const {
4646

4747
HouLiFilterCached1DVector::HouLiFilterCached1DVector(Grid const &grid) : HouLiFilterCached1D(grid) {
4848
assert(grid.KX > R_WIDTH);
49+
assert(grid.KY % KY_TILE == 0);
4950
}
5051

5152
void HouLiFilterCached1DVector::operator()(Grid::View::C_XY view) const {
5253
// This method applies the HouLi filter to view using vector instructions.
5354
// An array of contiguous complex numbers is simply treated as a real array
5455
// with double the length.
5556
// The code is vectorized along the kx (contiguous) dimension.
57+
// To reuse kx-factors, we process KY_TILE rows at a time.
5658
// We load R_WIDTH factors_x and expand them into two registers, duplicating each element.
5759
// That way, each two consecutive real numbers (real and imaginary parts)
5860
// get multiplied with the same factor.
@@ -63,14 +65,16 @@ void HouLiFilterCached1DVector::operator()(Grid::View::C_XY view) const {
6365
// lower_fx = {kx, kx, kx+1, kx+1, kx+2, kx+2, kx+3, kx+3}
6466
// upper_fx = {kx+4, kx+4, kx+5, kx+5, kx+6, kx+6, kx+7, kx+7}
6567
//
66-
// TODO multiple fy at once to better reuse fx
6768

6869
static_assert(view.stride(0) == 1); // contiguous in kx
6970

70-
for (int ky = 0; ky < grid.KY; ++ky) {
71+
for (int ky = 0; ky < grid.KY; ky += KY_TILE) {
7172
// avoid std::vector dereference inside loop:
7273
// broadcast fy value into vector
73-
VReal vfy{factors_y[ky]};
74+
std::array<VReal, KY_TILE> vfy;
75+
for (int i = 0; i < KY_TILE; ++i) {
76+
vfy[i] = factors_y[ky + i];
77+
}
7478
// prepare iteration address for fx
7579
Real const *fx_addr = factors_x.data();
7680

@@ -79,26 +83,30 @@ void HouLiFilterCached1DVector::operator()(Grid::View::C_XY view) const {
7983
// Process 1 vector of factors at a time (2 vectors of complex)
8084
for (; kx <= grid.KX - R_WIDTH; kx += R_WIDTH, fx_addr += R_WIDTH) {
8185
// get address for two vectors we're writing to
82-
auto *view_addr = (Real *)&view(kx, ky);
83-
auto *upper_view_addr = view_addr + R_WIDTH;
84-
VReal input_lower{view_addr};
85-
VReal input_upper{upper_view_addr};
86+
auto lower_view_addr = [&](int i) { return (Real *)&view(kx, ky + i); };
87+
auto upper_view_addr = [&](int i) { return (Real *)&view(kx + C_WIDTH, ky + i); };
8688

8789
// Load factors
8890
VReal vfx_full{fx_addr};
8991

9092
// Permute lower factors, multiply lower input
9193
VReal lower_fx = duplicateLower(vfx_full);
92-
eve::store(input_lower * lower_fx * vfy, view_addr);
94+
for (int i = 0; i < KY_TILE; ++i) {
95+
eve::store(VReal{lower_view_addr(i)} * lower_fx * vfy[i], lower_view_addr(i));
96+
}
9397

9498
// Permute upper factors, multiply upper input
9599
VReal upper_fx = duplicateUpper(vfx_full);
96-
eve::store(input_upper * upper_fx * vfy, upper_view_addr);
100+
for (int i = 0; i < KY_TILE; ++i) {
101+
eve::store(VReal{upper_view_addr(i)} * upper_fx * vfy[i], upper_view_addr(i));
102+
}
97103
}
98104

99105
// tail
100106
for (; kx < grid.KX; ++kx) {
101-
view(kx, ky) *= factors_x[kx] * factors_y[ky];
107+
for (int i = 0; i < KY_TILE; ++i) {
108+
view(kx, ky + i) *= factors_x[kx] * factors_y[ky + i];
109+
}
102110
}
103111
}
104112
}

cpp/lib/Filter.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ class HouLiFilterCached1DVector : HouLiFilterCached1D {
4646
static auto constexpr R_WIDTH = VReal::size();
4747
static auto constexpr C_WIDTH = VReal::size() / 2;
4848

49+
static auto constexpr KY_TILE = 4;
50+
4951
VReal duplicateLower(VReal src) const; ///< duplicate lower half of src
5052
VReal duplicateUpper(VReal src) const; ///< duplicate upper half of src
5153
};

0 commit comments

Comments
 (0)