KernelU_Pruned: initial work on pruned version of Kernel-U.

ribesstefano · ribesstefano · commit 32b21d4e54d5 · 2021-09-25T20:24:04.000+02:00
diff --git a/include/kernel/u_kernel.h b/include/kernel/u_kernel.h
@@ -199,7 +199,6 @@ void KernelU(const int num_active_inputs,
     const int input_size,
     const int num_refinements[params::N],
     const bool pad_output,
-    // hls::stream<typename params::IndexU_Type>& z_idx_port,
     hls::stream<typename params::VectTuAxiPacketType>& x_port,
     hls::stream<typename params::VectTuAxiPacketType>& u_port,
     hls::stream<typename WrapperAxisG::PacketType>& xu_port) {
@@ -210,13 +209,14 @@ void KernelU(const int num_active_inputs,
 #pragma HLS STABLE variable=u_port
 #pragma HLS STABLE variable=xu_port
   typedef typename params::ActivationD ActivationType;
-  const int kNumTilesU = input_size / params::Tu;
-  const int kMaxNumTilesU = params::I / params::Tu;
-  const int kStreamDepth_X = 2 + kMaxNumTilesU * params::N;
-  const int kStreamDepth_U = 8 + kMaxNumTilesU * params::N;
-  const int kStreamDepth_XU = 2 + params::G;
-  assert(num_active_inputs <= params::N);
+  const unsigned int kNumTilesU = input_size / params::Tu;
+  const unsigned int kMaxNumTilesU = params::I / params::Tu;
+  const unsigned int kStreamDepth_X = 2 + kMaxNumTilesU * params::N;
+  const unsigned int kStreamDepth_U = 8 + kMaxNumTilesU * params::N;
+  const unsigned int kStreamDepth_XU = 2 + params::G;
   assert(num_active_inputs > 0);
+  assert(kNumTilesU > 0);
+  assert(num_active_inputs <= params::N);
   assert(params::I % params::Tu == 0);
   assert(input_size % params::Tu == 0);
   assert(input_size <= params::I);
@@ -344,6 +344,180 @@ void KernelU(const int num_active_inputs,
     }
   }
 }
+
+// Pruned Kernel-U: multiplies the x tiles selected through uz_idx_port with u tiles and emits G-wide results on xu_port.
+template <
+  typename params,
+  typename WrapperAxisG = svd::AxiStreamPort<params::VectG_AxiWidth>
+>
+void KernelU_Pruned(const int num_active_inputs, // in [1, params::N] (asserted)
+    const int input_size, // multiple of params::Tu, at most params::I (asserted)
+    const int num_refinements[params::N], // non-decreasing over the active inputs (asserted)
+    const int num_zero_tiles_u, // NOTE(review): not read anywhere in this body yet
+    hls::stream<typename params::VectGZTuAxiPacketType>& uz_idx_port, // tile indices, popped in X_DMA_dispatcher
+    hls::stream<typename params::VectTuAxiPacketType>& x_port, // popped in X_DAM_in
+    hls::stream<typename params::VectTuAxiPacketType>& u_port, // popped in U_DMA
+    hls::stream<typename WrapperAxisG::PacketType>& xu_port) { // written in XU_DMA
+#pragma HLS TOP name=KernelU
+#pragma HLS DATAFLOW
+#pragma HLS INLINE
+#pragma HLS STABLE variable=x_port
+#pragma HLS STABLE variable=u_port
+#pragma HLS STABLE variable=xu_port
+  typedef typename params::ActivationD ActivationType;
+  const unsigned int kNumTilesU = input_size / params::Tu; // tiles used per input
+  const unsigned int kMaxNumTilesU = params::I / params::Tu; // tile capacity of x_buffer
+  const unsigned int kStreamDepth_X = 2 + kMaxNumTilesU * params::N; // depth of x_stream
+  const unsigned int kStreamDepth_U = 8 + kMaxNumTilesU * params::N; // depth of u_streams
+  const unsigned int kStreamDepth_XU = 2 + params::G; // depth of xu_streams
+  assert(num_active_inputs > 0);
+  assert(kNumTilesU > 0);
+  assert(num_active_inputs <= params::N);
+  assert(params::I % params::Tu == 0);
+  assert(input_size % params::Tu == 0);
+  assert(input_size <= params::I);
+  assert(kNumTilesU <= kMaxNumTilesU);
+  auto uz_axis = svd::AxiStreamPort<params::NumGTuBitsAligned>(uz_idx_port);
+  auto x_axis = svd::AxiStreamPort<params::VectTuAxiWidth>(x_port);
+  auto u_axis = svd::AxiStreamPort<params::VectTuAxiWidth>(u_port);
+  auto xu_axis = svd::AxiStreamInterface<WrapperAxisG>(xu_port);
+  hls::stream<typename params::VectTuType> x_stream[params::G];
+  hls::stream<typename params::VectTuType> u_streams[params::G];
+  hls::stream<ActivationType> xu_streams[params::G];
+  ActivationType x_buffer[params::N][params::Tu][kMaxNumTilesU]; // [input][element in tile][tile]
+#pragma HLS STREAM variable=x_stream depth=kStreamDepth_X
+#pragma HLS STREAM variable=u_streams depth=kStreamDepth_U
+#pragma HLS STREAM variable=xu_streams depth=kStreamDepth_XU
+#pragma HLS ARRAY_PARTITION variable=u_streams complete dim=1
+#pragma HLS ARRAY_PARTITION variable=x_buffer complete dim=1
+#pragma HLS ARRAY_PARTITION variable=x_buffer complete dim=2
+#pragma HLS BIND_STORAGE variable=x_buffer type=ram_t2p impl=bram latency=1
+  /*
+   * R_total is the number of (refinement, input) pairs to process. With the
+   * Rs in ascending order (asserted in the loop below) and N active inputs:
+   *   R_total = R0 * N + (R1-R0) * (N-1) + (R2-R1) * (N-2) + ...
+   *
+   * Example with N = 3 and R0 = 2, R1 = 3, R2 = 6:
+   *  - input 0 is processed for refinements 0..1
+   *  - input 1 is processed for refinements 0..2
+   *  - input 2 is processed for refinements 0..5
+   *
+   * So it becomes:
+   *
+   * R_total = 2 * 3 + (3-2) * (3-1) + (6-3) * (3-2) = 11
+   * R_max is the largest entry among the active refinement counts.
+   */
+  // ===========================================================================
+  // TODO: Same as non-pruned version -> wrap into a function
+  // ===========================================================================
+  int R_max = num_refinements[0];
+  int R_total = num_refinements[0] * num_active_inputs; // Running total, completed by the loop below.
+  Get_Total_R:
+  for (int i = 1; i < num_active_inputs; ++i) {
+#pragma HLS PIPELINE II=1 style=frp
+    if (num_refinements[i] > R_max) {
+      R_max = num_refinements[i];
+    }
+    assert(num_refinements[i] >= num_refinements[i - 1]); // refinement counts must be non-decreasing
+    R_total += (num_refinements[i] - num_refinements[i - 1]) * (num_active_inputs - i);
+  }
+
+  // Added: stage every active input's tiles in x_buffer. NOTE(review): label likely meant X_DMA_in.
+  X_DAM_in:
+  for (int i = 0; i < num_active_inputs; ++i) {
+    for (int j = 0; j < kNumTilesU; ++j) {
+#pragma HLS LOOP_FLATTEN      
+#pragma HLS PIPELINE II=1 style=frp
+      auto x_val = x_axis.template PopVector<ActivationType, params::Tu>();
+      for (int k = 0; k < params::Tu; ++k) {
+        x_buffer[i][k][j] = x_val[k];
+      }
+    }
+  }
+
+  // Changed: replay buffered x tiles, picking for each of the G lanes the tile named by z_idx.
+  int R_prev = 0; // refinement count of the previous input
+  X_DMA_dispatcher:
+  for (int ii = 0; ii < num_active_inputs; ++ii) {
+    for (int i = 0; i < num_refinements[ii] - R_prev; ++i) { // refinements shared by inputs ii..num_active_inputs-1
+      assert(num_refinements[ii] - R_prev >= 1);
+      for (int j = 0; j < kNumTilesU; ++j) {
+        // Read z_idx. NOTE(review): popped as ActivationType although used as a tile index; confirm the element type.
+        auto z_idx = uz_axis.template PopVector<ActivationType, params::G>();
+        for (int k = 0; k < num_active_inputs - ii; ++k) {
+#pragma HLS PIPELINE II=1 style=frp
+          assert(num_active_inputs - ii >= 1);
+          assert(k + ii < params::N);
+          for (int kk = 0; kk < params::G; ++kk) {
+            typename params::VectTuType x_val;
+            for (int jj = 0; jj < params::Tu; ++jj) {
+              x_val[jj] = x_buffer[k + ii][jj][z_idx[kk]]; // z_idx[kk] selects the tile for lane kk
+            }
+            x_stream[kk] << x_val;
+          }
+        }
+      }
+    }
+    R_prev = num_refinements[ii];
+  }
+
+  // ===========================================================================
+  // TODO: Same as non-pruned version -> wrap into a function
+  // ===========================================================================
+  U_DMA:
+  for (int i = 0; i < R_max; ++i) {
+#pragma HLS LOOP_TRIPCOUNT min=params::R max=params::R
+    for (int j = 0; j < kNumTilesU; ++j) {
+      for (int k = 0; k < params::G; ++k) {
+        auto u_val = u_axis.template PopVector<ActivationType, params::Tu>();
+        for (int ii = 0; ii < num_active_inputs; ++ii) { // replicate u_val for every input still refining at i
+#pragma HLS PIPELINE II=1 style=frp
+          if (i < num_refinements[ii]) {
+            u_streams[k] << u_val;
+          }
+        }
+      }
+    }
+  }
+
+  // Changed: per lane, multiply the x and u tiles and pass the product through adder_tree (presumably a sum).
+  U_Kernel:
+  for (int i = 0; i < R_total; ++i) { // one pass per (refinement, input) pair
+    for (int j = 0; j < kNumTilesU; ++j) {
+#pragma HLS PIPELINE II=1 style=frp
+      for (int k = 0; k < params::G; ++k) {
+        xu_streams[k] << hlsutils::adder_tree<ActivationType, params::Tu>(
+          x_stream[k].read() * u_streams[k].read());
+      }
+    }
+  }
+  
+  // ===========================================================================
+  // TODO: Same as non-pruned version -> wrap into a function
+  // ===========================================================================
+  int iter_cnt = 0; // number of output vectors pushed so far
+  XU_DMA:
+  for (int i = 0; i < R_max; ++i) {
+    typename params::VectG_Type xu_out[params::N] = {typename params::VectG_Type(0)};
+#pragma HLS ARRAY_PARTITION variable=xu_out complete dim=1
+    for (int j = 0; j < kNumTilesU; ++j) {
+      for (int k = 0; k < num_active_inputs; ++k) {
+#pragma HLS PIPELINE II=1 style=frp
+        for (int ii = 0; ii < params::G; ++ii) {
+          if (i < num_refinements[k]) {
+            xu_out[k][ii] += xu_streams[ii].read();
+#pragma HLS BIND_OP variable=xu_out[k][ii] op=add impl=dsp
+          }
+        }
+        if (i < num_refinements[k] && j == kNumTilesU - 1) { // emit once the last tile is accumulated
+          const bool kIsLast = iter_cnt == R_total - 1; // true only for the R_total-th push
+          xu_axis.template PushVector<ActivationType, params::G>(xu_out[k], kIsLast);
+          ++iter_cnt;
+        }
+      }
+    }
+  } 
+}
 #endif // end __VITIS_HLS__
 
 } // svd
@@ -404,6 +578,8 @@ void HlsKernelU(const int num_active_inputs,
   const int input_size,
   const int num_refinements[testu::params::N],
   const bool pad_output,
+  // const int num_zero_tiles_u,
+  // hls::stream<ap_uint<testu::NumTuBits> >& uz_idx_port,
   hls::stream<typename testu::params::VectTuAxiPacketType>& x_port,
   hls::stream<typename testu::params::VectTuAxiPacketType>& u_port,
   hls::stream<typename testu::params::VectG_AxiPacketType>& xu_port);
diff --git a/include/layers/lstm/hls/lstm_svd.h b/include/layers/lstm/hls/lstm_svd.h
@@ -407,4 +407,32 @@ void HlsWrapperLstmSvd(
     typename svd::lstm_params::ActivationD* h_curr,
     typename svd::lstm_params::ActivationD* c_curr);
 
+extern "C" void C_WrapperLstmSvd( // C-linkage entry point; defined in src/layers/lstm/hls/lstm_svd.cpp
+    const int num_timesteps,
+    const int num_active_inputs,
+    const int input_size,
+    const int output_size,
+    const int num_refinements[svd::lstm_params::N],
+    const int num_zero_tiles_u,
+    const int num_zero_tiles_v,
+    // Current gates: input x, the u/s/v buffers and the uz/vz index buffers (all read-only)
+    const float* x_in,
+    const float* u_cur_in,
+    const float* s_cur_in,
+    const float* v_cur_in,
+    const int* uz_idx_cur_in,
+    const int* vz_idx_cur_in,
+    // Recurrent gates: input h, the u/s/v buffers and the uz/vz index buffers (all read-only)
+    const float* h_in,
+    const float* u_rec_in,
+    const float* s_rec_in,
+    const float* v_rec_in,
+    const int* uz_idx_rec_in,
+    const int* vz_idx_rec_in,
+    // Non-linearities: bias_in and c_prev_in are read-only; h_curr_in and c_curr_in are the only writable buffers
+    const float* bias_in,
+    const float* c_prev_in,
+    float* h_curr_in,
+    float* c_curr_in);
+
 #endif // end LSTM_HLS_LSTM_SVD_H_
diff --git a/include/svd_params.h b/include/svd_params.h
@@ -37,6 +37,11 @@ template <int Ni, int Ii, int Hi, int Ri, int Tui, int Tvi, int ZTui = 0,
   typename WeightD_tp = ap_fixed<8, 3>,
   typename AccumulationD_tp = ap_fixed<16, 3> >
 struct SvdParameters {
+  static_assert(Ni > 0, "ERROR. Found negative value: N <= 0");
+  static_assert(Ii > 0, "ERROR. Found negative value: I <= 0");
+  static_assert(Hi > 0, "ERROR. Found negative value: H <= 0");
+  static_assert(Tui > 0, "ERROR. Found negative value: Tu <= 0");
+  static_assert(Tvi > 0, "ERROR. Found negative value: Tv <= 0");
   static const int N = Ni;
   static const int I = Ii;
   static const int H = Hi;
@@ -51,24 +56,28 @@ struct SvdParameters {
   static const int PeU = MaxNumTu - ZTu;
   static const int PeV = H / MaxNumTv;
 private:
-  static const int TuBits_tmp = hlsutils::log2<MaxNumTu>::value;
-  static const int TvBits_tmp = hlsutils::log2<MaxNumTv>::value;
+  static const int NumTuBits_tmp = hlsutils::log2<MaxNumTu>::value;
+  static const int NumTvBits_tmp = hlsutils::log2<MaxNumTv>::value;
 public:
-  static const int TuBits = TuBits_tmp > 0 ? TuBits_tmp : 1;
-  static const int TvBits = TvBits_tmp > 0 ? TvBits_tmp : 1;
-  typedef ap_uint<MaxNumTu> IndexU_Type;
-  typedef ap_uint<MaxNumTv> IndexV_Type;
+  static const int NumTuBits = NumTuBits_tmp > 0 ? NumTuBits_tmp : 1;
+  static const int NumTvBits = NumTvBits_tmp > 0 ? NumTvBits_tmp : 1;
+  static const int NumTuBitsAligned = (NumTuBits + 7) & (-8); // align to 8bit
+  static const int NumTvBitsAligned = (NumTvBits + 7) & (-8); // align to 8bit
+  static const int NumGTuBitsAligned = (NumTuBits * G + 7) & (-8); // align to 8bit
+  static const int NumGTvBitsAligned = (NumTvBits * G + 7) & (-8); // align to 8bit
+  typedef ap_uint<NumTuBits> IndexU_Type; // deprecated
+  typedef ap_uint<NumTvBits> IndexV_Type; // deprecated
   typedef ap_uint<MaxNumTu> UnzD;
   typedef ap_uint<MaxNumTv> VnzD;
-  typedef ap_uint<TuBits> UnzIdxD;
-  typedef ap_uint<TvBits> VnzIdxD;
+  typedef ap_uint<NumTuBits> UnzIdxD;
+  typedef ap_uint<NumTvBits> VnzIdxD;
   typedef ActivationD_tp ActivationD;
   typedef WeightD_tp WeightD;
   typedef AccumulationD_tp AccumulationD;
   typedef hls::stream<UnzD> UnzS;
   typedef hls::stream<VnzD> VnzS;
-  typedef hls::stream<ap_uint<TuBits> > UnzIdxS;
-  typedef hls::stream<ap_uint<TvBits> > VnzIdxS;
+  typedef hls::stream<ap_uint<NumTuBits> > UnzIdxS;
+  typedef hls::stream<ap_uint<NumTvBits> > VnzIdxS;
   typedef hls::stream<ActivationD> ActivationS;
   typedef hls::stream<WeightD> WeightS;
   typedef hls::stream<AccumulationD> AccumulationS;
@@ -87,6 +96,10 @@ struct SvdParameters {
   static const int VectG_AxiWidth = ActivationWidth * G;
   static const int VectGN_AxiWidth = ActivationWidth * G * N;
   static const int VectGTvAxiWidth = ActivationWidth * G * Tv;
+
+  typedef typename svd::AxiStreamPort<NumGTuBitsAligned>::PacketType VectGZTuAxiPacketType;
+  typedef typename svd::AxiStreamPort<NumGTvBitsAligned>::PacketType VectGZTvAxiPacketType;
+
   typedef typename svd::AxiStreamPort<VectTuAxiWidth>::PacketType VectTuAxiPacketType;
   typedef typename svd::AxiStreamPort<VectTvAxiWidth>::PacketType VectTvAxiPacketType;
   typedef typename svd::AxiStreamPort<VectN_AxiWidth>::PacketType VectN_AxiPacketType;
diff --git a/include/testbenches/test_u_kernel_pruned.h b/include/testbenches/test_u_kernel_pruned.h
@@ -0,0 +1,7 @@
+#ifndef TESTBENCHES_TEST_U_KERNEL_PRUNED_H_
+#define TESTBENCHES_TEST_U_KERNEL_PRUNED_H_
+// Header for the pruned Kernel-U testbench; the guard is named after this file so it cannot clash with another header's guard.
+#include "kernel/u_kernel.h"
+#include "hls_utils/hls_debugging.h"
+
+#endif // end TESTBENCHES_TEST_U_KERNEL_PRUNED_H_
diff --git a/src/layers/lstm/hls/lstm_svd.cpp b/src/layers/lstm/hls/lstm_svd.cpp
@@ -416,7 +416,6 @@ void HlsLstmSvd(const int num_active_inputs,
     const int input_size,
     const int output_size,
     const int num_refinements[svd::lstm_params::N],
-    // const hls::vector<int, svd::lstm_params::N> num_refinements,
     // Current Gates
     hls::stream<typename svd::lstm_params::VectTuAxiPacketType>& x_port,
     hls::stream<typename svd::lstm_params::VectTuAxiPacketType>& u_cur_port,
@@ -557,16 +556,22 @@ extern "C" void C_WrapperLstmSvd(
     const int input_size,
     const int output_size,
     const int num_refinements[svd::lstm_params::N],
+    const int num_zero_tiles_u,
+    const int num_zero_tiles_v,
     // Current Gates
     const float* x_in,
     const float* u_cur_in,
     const float* s_cur_in,
     const float* v_cur_in,
+    const int* uz_idx_cur_in,
+    const int* vz_idx_cur_in,
     // Recurrent Gates
     const float* h_in,
     const float* u_rec_in,
     const float* s_rec_in,
     const float* v_rec_in,
+    const int* uz_idx_rec_in,
+    const int* vz_idx_rec_in,
     // Non-Linearities
     const float* bias_in,
     const float* c_prev_in,
diff --git a/src/svd.cpp b/src/svd.cpp
@@ -15,6 +15,7 @@ int main(int argc, char const *argv[]) {
 
   const bool kTestSoftwareAccelerator = false;
   const int kN = 2;
+  const int kNumActiveInputs = 1;
   const int kR = svd::lstm_params::R;
   const int kI = svd::lstm_params::I;
   const int kH = svd::lstm_params::H;
@@ -178,6 +179,36 @@ int main(int argc, char const *argv[]) {
         storage.get_h(j));
     }
   }
+
+  // int num_refinements[kN] = {kR};
+  // C_WrapperLstmSvd(
+  //   NUM_TIMESTEPS,
+  //   kNumActiveInputs,
+  //   kI,
+  //   kH,
+  //   num_refinements,
+  //   kZTu,
+  //   kZTv,
+  //   // Current Gates
+  //   x_in,
+  //   u_cur_in,
+  //   s_cur_in,
+  //   v_cur_in,
+  //   uz_idx_cur_in,
+  //   vz_idx_cur_in,
+  //   // Recurrent Gates
+  //   h_in,
+  //   u_rec_in,
+  //   s_rec_in,
+  //   v_rec_in,
+  //   uz_idx_rec_in,
+  //   vz_idx_rec_in,
+  //   // Non-Linearities
+  //   bias_in,
+  //   c_prev_in,
+  //   h_curr_in,
+  //   c_curr_in);
+
   storage.ResetLstmOutputs();
   std::cout << "Cleaning up." << std::endl;
   delete[] h_prev_hls;
diff --git a/src/testbenches/CMakeLists.txt b/src/testbenches/CMakeLists.txt
diff --git a/src/testbenches/test_u_kernel_pruned.cpp b/src/testbenches/test_u_kernel_pruned.cpp