@@ -46,13 +46,15 @@ void HouLiFilterCached1D::operator()(Grid::View::C_XY view) const {
46
46
47
47
HouLiFilterCached1DVector::HouLiFilterCached1DVector (Grid const &grid) : HouLiFilterCached1D(grid) {
48
48
assert (grid.KX > R_WIDTH);
49
+ assert (grid.KY % KY_TILE == 0 );
49
50
}
50
51
51
52
void HouLiFilterCached1DVector::operator ()(Grid::View::C_XY view) const {
52
53
// This method applies the HouLi filter to view using vector instructions.
53
54
// An array of contiguous complex numbers is simply treated as a real array
54
55
// with double the length.
55
56
// The code is vectorized along the kx (continuous) dimension.
57
+ // To reuse kx-factors, we process KY_TILE rows at a time.
56
58
// We load R_WIDTH factors_x and expand them into two registers, duplicating each element.
57
59
// That way, each two consecutive real numbers (real and imaginary parts)
58
60
// get multiplied with the same factor.
@@ -63,14 +65,16 @@ void HouLiFilterCached1DVector::operator()(Grid::View::C_XY view) const {
63
65
// lower_fx = {kx, kx, kx+1, kx+1, kx+2, kx+2, kx+3, kx+3}
64
66
// upper_fx = {kx+4, kx+4, kx+5, kx+5, kx+6, kx+6, kx+7, kx+7}
65
67
//
66
- // TODO multiple fy at once to better reuse fx
67
68
68
69
static_assert (view.stride (0 ) == 1 ); // contiguous in kx
69
70
70
- for (int ky = 0 ; ky < grid.KY ; ++ky ) {
71
+ for (int ky = 0 ; ky < grid.KY ; ky += KY_TILE ) {
71
72
// avoid std::vector dereference inside loop:
72
73
// broadcast fy value into vector
73
- VReal vfy{factors_y[ky]};
74
+ std::array<VReal, KY_TILE> vfy;
75
+ for (int i = 0 ; i < KY_TILE; ++i) {
76
+ vfy[i] = factors_y[ky + i];
77
+ }
74
78
// prepare iteration address for fx
75
79
Real const *fx_addr = factors_x.data ();
76
80
@@ -79,26 +83,30 @@ void HouLiFilterCached1DVector::operator()(Grid::View::C_XY view) const {
79
83
// Process 1 vector of factors at a time (2 vectors of complex)
80
84
for (; kx <= grid.KX - R_WIDTH; kx += R_WIDTH, fx_addr += R_WIDTH) {
81
85
// get address for two vectors we're writing to
82
- auto *view_addr = (Real *)&view (kx, ky);
83
- auto *upper_view_addr = view_addr + R_WIDTH;
84
- VReal input_lower{view_addr};
85
- VReal input_upper{upper_view_addr};
86
+ auto lower_view_addr = [&](int i) { return (Real *)&view (kx, ky + i); };
87
+ auto upper_view_addr = [&](int i) { return (Real *)&view (kx + C_WIDTH, ky + i); };
86
88
87
89
// Load factors
88
90
VReal vfx_full{fx_addr};
89
91
90
92
// Permute lower factors, multiply lower input
91
93
VReal lower_fx = duplicateLower (vfx_full);
92
- eve::store (input_lower * lower_fx * vfy, view_addr);
94
+ for (int i = 0 ; i < KY_TILE; ++i) {
95
+ eve::store (VReal{lower_view_addr (i)} * lower_fx * vfy[i], lower_view_addr (i));
96
+ }
93
97
94
98
// Permute upper factors, multiply upper input
95
99
VReal upper_fx = duplicateUpper (vfx_full);
96
- eve::store (input_upper * upper_fx * vfy, upper_view_addr);
100
+ for (int i = 0 ; i < KY_TILE; ++i) {
101
+ eve::store (VReal{upper_view_addr (i)} * upper_fx * vfy[i], upper_view_addr (i));
102
+ }
97
103
}
98
104
99
105
// tail
100
106
for (; kx < grid.KX ; ++kx) {
101
- view (kx, ky) *= factors_x[kx] * factors_y[ky];
107
+ for (int i = 0 ; i < KY_TILE; ++i) {
108
+ view (kx, ky + i) *= factors_x[kx] * factors_y[ky + i];
109
+ }
102
110
}
103
111
}
104
112
}
0 commit comments