|
5 | 5 | #include "Filter.hpp"
|
6 | 6 |
|
7 | 7 | #include <eve/module/core.hpp>
|
| 8 | +#include <immintrin.h> |
8 | 9 |
|
9 | 10 | namespace ahr {
|
10 | 11 | void HouLiFilter::operator()(Grid::View::C_XY view) const {
|
@@ -43,28 +44,61 @@ void HouLiFilterCached1D::operator()(Grid::View::C_XY view) const {
|
43 | 44 | });
|
44 | 45 | }
|
45 | 46 |
|
46 |
| -HouLiFilterCached1DVector::HouLiFilterCached1DVector(Grid const &grid) |
47 |
| - : HouLiFilterCached1D(grid), factors_x_duped(2 * grid.KX) { |
48 |
| - assert(grid.KY % C_WIDTH == 0); |
49 |
| - for (Dim kx = 0; kx < grid.KX; ++kx) { |
50 |
| - factors_x_duped[kx * 2] = factors_x[kx]; |
51 |
| - factors_x_duped[kx * 2 + 1] = factors_x[kx]; |
52 |
| - } |
| 47 | +HouLiFilterCached1DVector::HouLiFilterCached1DVector(Grid const &grid) : HouLiFilterCached1D(grid) { |
| 48 | + assert(grid.KX > R_WIDTH); |
53 | 49 | }
|
54 | 50 |
|
55 | 51 | void HouLiFilterCached1DVector::operator()(Grid::View::C_XY view) const {
|
| 52 | + // This method applies the HouLi filter to view using vector instructions. |
| 53 | + // An array of contiguous complex numbers is simply treated as a real array |
| 54 | + // with double the length. |
| 55 | + // The code is vectorized along the kx (continuous) dimension. |
| 56 | + // We load R_WIDTH factors_x and expand them into two registers, duplicating each element. |
| 57 | + // That way, each two consecutive real numbers (real and imaginary parts) |
| 58 | + // get multiplied with the same factor. |
| 59 | + // Finally, we read the complex numbers, multiply with kx- and ky-factors, and write it back. |
| 60 | + // Duplication demo: |
| 61 | + // vfx_full = {kx, kx+1, kx+2, kx+3, kx+4, kx+5, kx+6, kx+7} |
| 62 | + // into |
| 63 | + // lower_fx = {kx, kx, kx+1, kx+1, kx+2, kx+2, kx+3, kx+3} |
| 64 | + // upper_fx = {kx+4, kx+4, kx+5, kx+5, kx+6, kx+6, kx+7, kx+7} |
| 65 | + // |
| 66 | + // TODO multiple fy at once to better reuse fx |
| 67 | + |
| 68 | + static_assert(view.stride(0) == 1); // contiguous in kx |
| 69 | + |
| 70 | + // permutation indices |
| 71 | + static_assert(C_WIDTH == 4); // idx are hardcoded |
| 72 | + VIdx const lower_idx = {0, 0, 1, 1, 2, 2, 3, 3}; |
| 73 | + VIdx const upper_idx = {4, 4, 5, 5, 6, 6, 7, 7}; |
| 74 | + |
56 | 75 | for (int ky = 0; ky < grid.KY; ++ky) {
|
| 76 | + // avoid std::vector dereference inside loop: |
| 77 | + // broadcast fy value into vector |
57 | 78 | VReal vfy{factors_y[ky]};
|
| 79 | + // prepare iteration address for fx |
| 80 | + Real const *fx_addr = factors_x.data(); |
58 | 81 |
|
59 |
| - // avoid vector dereference inside loop |
60 |
| - Real *fx_addr = factors_x_duped.data(); |
61 | 82 | int kx = 0;
|
62 |
| - for (; kx <= grid.KX - C_WIDTH; kx += C_WIDTH, fx_addr += R_WIDTH) { |
63 |
| - Real *view_addr = (Real *)&view(kx, ky); |
64 |
| - VReal input{view_addr}; |
65 |
| - VReal vfx{fx_addr}; |
| 83 | + // Make sure the last element in the 2nd vector isn't past the end |
| 84 | + // Process 1 vector of factors at a time (2 vectors of complex) |
| 85 | + for (; kx <= grid.KX - R_WIDTH; kx += R_WIDTH, fx_addr += R_WIDTH) { |
| 86 | + // get address for two vectors we're writing to |
| 87 | + auto *view_addr = (Real *)&view(kx, ky); |
| 88 | + auto *upper_view_addr = view_addr + R_WIDTH; |
| 89 | + VReal input_lower{view_addr}; |
| 90 | + VReal input_upper{upper_view_addr}; |
| 91 | + |
| 92 | + // Load factors |
| 93 | + VReal vfx_full{fx_addr}; |
| 94 | + |
| 95 | + // Permute lower factors, multiply lower input |
| 96 | + VReal lower_fx = _mm512_permutex2var_pd(vfx_full, lower_idx, vfx_full); |
| 97 | + eve::store(input_lower * lower_fx * vfy, view_addr); |
66 | 98 |
|
67 |
| - eve::store(input * vfx * vfy, view_addr); |
| 99 | + // Permute upper factors, multiply upper input |
| 100 | + VReal upper_fx = _mm512_permutex2var_pd(vfx_full, upper_idx, vfx_full); |
| 101 | + eve::store(input_upper * upper_fx * vfy, upper_view_addr); |
68 | 102 | }
|
69 | 103 |
|
70 | 104 | // tail
|
|
0 commit comments