Process 2 vectors along kx at a time: load the factors into a single vector and use permutes to rearrange them

ProExpertProg · ProExpertProg · commit 908b92f193d4 · 2024-11-24T15:08:15.000-05:00
diff --git a/cpp/lib/Filter.cpp b/cpp/lib/Filter.cpp
@@ -5,6 +5,7 @@
 #include "Filter.hpp"
 
 #include <eve/module/core.hpp>
+#include <immintrin.h>
 
 namespace ahr {
 void HouLiFilter::operator()(Grid::View::C_XY view) const {
@@ -43,28 +44,61 @@ void HouLiFilterCached1D::operator()(Grid::View::C_XY view) const {
   });
 }
 
-HouLiFilterCached1DVector::HouLiFilterCached1DVector(Grid const &grid)
-    : HouLiFilterCached1D(grid), factors_x_duped(2 * grid.KX) {
-  assert(grid.KY % C_WIDTH == 0);
-  for (Dim kx = 0; kx < grid.KX; ++kx) {
-    factors_x_duped[kx * 2] = factors_x[kx];
-    factors_x_duped[kx * 2 + 1] = factors_x[kx];
-  }
+HouLiFilterCached1DVector::HouLiFilterCached1DVector(Grid const &grid) : HouLiFilterCached1D(grid) {
+  assert(grid.KX > R_WIDTH);
 }
 
 void HouLiFilterCached1DVector::operator()(Grid::View::C_XY view) const {
+  // This method applies the HouLi filter to view using vector instructions.
+  // An array of contiguous complex numbers is simply treated as a real array
+  // with double the length.
+  // The code is vectorized along the kx (contiguous) dimension.
+  // We load R_WIDTH factors_x and expand them into two registers, duplicating each element.
+  // That way, each two consecutive real numbers (real and imaginary parts)
+  // get multiplied with the same factor.
+  // Finally, we read the complex numbers, multiply them with the kx- and ky-factors, and write them back.
+  // Duplication demo:
+  // vfx_full = {kx, kx+1, kx+2, kx+3, kx+4, kx+5, kx+6, kx+7}
+  // into
+  // lower_fx = {kx, kx, kx+1, kx+1, kx+2, kx+2, kx+3, kx+3}
+  // upper_fx = {kx+4, kx+4, kx+5, kx+5, kx+6, kx+6, kx+7, kx+7}
+  //
+  // TODO: process multiple fy at once to better reuse fx
+
+  static_assert(view.stride(0) == 1); // contiguous in kx
+
+  // permutation indices
+  static_assert(C_WIDTH == 4); // idx are hardcoded
+  VIdx const lower_idx = {0, 0, 1, 1, 2, 2, 3, 3};
+  VIdx const upper_idx = {4, 4, 5, 5, 6, 6, 7, 7};
+
   for (int ky = 0; ky < grid.KY; ++ky) {
+    // avoid std::vector dereference inside loop:
+    // broadcast fy value into vector
     VReal vfy{factors_y[ky]};
+    // prepare iteration address for fx
+    Real const *fx_addr = factors_x.data();
 
-    // avoid vector dereference inside loop
-    Real *fx_addr = factors_x_duped.data();
     int kx = 0;
-    for (; kx <= grid.KX - C_WIDTH; kx += C_WIDTH, fx_addr += R_WIDTH) {
-      Real *view_addr = (Real *)&view(kx, ky);
-      VReal input{view_addr};
-      VReal vfx{fx_addr};
+    // Make sure the last element in the 2nd vector isn't past the end
+    // Process 1 vector of factors at a time (2 vectors of complex)
+    for (; kx <= grid.KX - R_WIDTH; kx += R_WIDTH, fx_addr += R_WIDTH) {
+      // get address for two vectors we're writing to
+      auto *view_addr = (Real *)&view(kx, ky);
+      auto *upper_view_addr = view_addr + R_WIDTH;
+      VReal input_lower{view_addr};
+      VReal input_upper{upper_view_addr};
+
+      // Load factors
+      VReal vfx_full{fx_addr};
+
+      // Permute lower factors, multiply lower input
+      VReal lower_fx = _mm512_permutex2var_pd(vfx_full, lower_idx, vfx_full);
+      eve::store(input_lower * lower_fx * vfy, view_addr);
 
-      eve::store(input * vfx * vfy, view_addr);
+      // Permute upper factors, multiply upper input
+      VReal upper_fx = _mm512_permutex2var_pd(vfx_full, upper_idx, vfx_full);
+      eve::store(input_upper * upper_fx * vfy, upper_view_addr);
     }
 
     // tail
diff --git a/cpp/lib/Filter.hpp b/cpp/lib/Filter.hpp
@@ -41,12 +41,9 @@ class HouLiFilterCached1DVector : HouLiFilterCached1D {
   void operator()(Grid::View::C_XY view) const;
 
 private:
+  using VIdx = eve::wide<long long>;
   using VReal = eve::wide<Real>;
   static auto constexpr R_WIDTH = VReal::size();
   static auto constexpr C_WIDTH = VReal::size() / 2;
-
-  /// A pre-expanded vector of 2d factors
-  std::vector<Real> factors_x_duped;
 };
-
 } // namespace ahr
diff --git a/cpp/test/filter.cpp b/cpp/test/filter.cpp
@@ -1,6 +1,7 @@
 #include "Filter.hpp"
 #include "grid.hpp"
 
+#include "debug.hpp"
 #include "util.hpp"
 
 #include <gtest/gtest.h>
@@ -9,7 +10,7 @@ namespace ahr {
 
 template <typename TestedFilter> class TestFilter : public ::testing::Test {
 protected:
-  Grid grid{5, 9, 16};
+  Grid grid{5, 32, 32};
   HouLiFilter filter{grid};
   TestedFilter filter_t{grid};
 };