sw: add unrolled kernels for dp-fdotp and dp-faxpy

Navaneeth-KunhiPurayil · Navaneeth-KunhiPurayil · commit cefd1b70274e · 2025-12-12T18:09:04.000+01:00
diff --git a/sw/spatzBenchmarks/CMakeLists.txt b/sw/spatzBenchmarks/CMakeLists.txt
@@ -24,6 +24,10 @@ include_directories(${SNRUNTIME_INCLUDE_DIRS})
 
 add_compile_options(-O3 -g -ffunction-sections)
 
+# Build the benchmarks with their unrolled kernels when UNROLL is set
+if(UNROLL)
+  add_definitions(-DUNROLL)
+endif()
 
 # Macro to regenerate the golden values and compile a module
 macro(add_spatz_test_oneParam name file param1)
diff --git a/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.c b/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.c
@@ -45,6 +45,76 @@ void faxpy_v64b(const double a, const double *x, const double *y,
   } while (avl > 0);
 }
 
+// Unrolled (factor 2) 64-bit AXPY: y = a * x + y
+//
+// Each loop iteration processes up to two strips. The second strip uses its
+// own vector registers (v16/v24), so its loads are issued before the first
+// strip's result (v8) is stored.
+//
+//   a   : scalar multiplier
+//   x   : input vector, avl elements
+//   y   : input/output vector, avl elements; overwritten in place by the
+//         vector stores even though the pointer is const-qualified
+//   avl : number of elements to process
+void faxpy_v64b_unrl(const double a, const double *x, const double *y, unsigned int avl) {
+
+  unsigned int vl0;
+  unsigned int vl1;
+
+  // Stripmine, two strips per iteration
+  do {
+    // First strip: set the vl
+    asm volatile("vsetvli %0, %1, e64, m8, ta, ma" : "=r"(vl0) : "r"(avl));
+
+    // Load vectors
+    asm volatile("vle64.v v0, (%0)" ::"r"(x));
+    asm volatile("vle64.v v8, (%0)" ::"r"(y));
+
+    // Multiply-accumulate
+    asm volatile("vfmacc.vf v8, %0, v0" ::"f"(a));
+    avl -= vl0;
+
+    if (avl == 0) {
+      // No second strip: vl is still vl0, store the result and finish
+      asm volatile("vse64.v v8, (%0)" ::"r"(y));
+      break;
+    }
+
+    // Second strip starts vl0 elements after the first one
+    const double *x1 = x + vl0;
+    const double *y1 = y + vl0;
+
+    // Set the vl; on the tail vl1 can be shorter than vl0
+    asm volatile("vsetvli %0, %1, e64, m8, ta, ma" : "=r"(vl1) : "r"(avl));
+
+    // Load vectors
+    asm volatile("vle64.v v16, (%0)" ::"r"(x1));
+    asm volatile("vle64.v v24, (%0)" ::"r"(y1));
+
+    // Multiply-accumulate
+    asm volatile("vfmacc.vf v24, %0, v16" ::"f"(a));
+
+    // Store the first strip with its own length, otherwise a shorter tail
+    // strip would truncate it
+    if (vl1 != vl0) {
+      asm volatile("vsetvli zero, %0, e64, m8, ta, ma" ::"r"(vl0));
+    }
+    asm volatile("vse64.v v8, (%0)" ::"r"(y));
+
+    // Store the second strip with its own length
+    if (vl1 != vl0) {
+      asm volatile("vsetvli zero, %0, e64, m8, ta, ma" ::"r"(vl1));
+    }
+    asm volatile("vse64.v v24, (%0)" ::"r"(y1));
+    avl -= vl1;
+
+    // Bump pointers past both strips
+    x = x1 + vl1;
+    y = y1 + vl1;
+
+  } while (avl > 0);
+}
+
 // 32-bit AXPY: y = a * x + y
 void faxpy_v32b(const float a, const float *x, const float *y,
                 unsigned int avl) {
diff --git a/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.h b/sw/spatzBenchmarks/dp-faxpy/kernel/faxpy.h
@@ -21,6 +21,9 @@
 
 inline void faxpy_v64b(const double a, const double *x, const double *y,
                        unsigned int avl) __attribute__((always_inline));
+// Unrolled variant of faxpy_v64b: y = a * x + y on 64-bit elements
+inline void faxpy_v64b_unrl(const double a, const double *x, const double *y,
+                       unsigned int avl) __attribute__((always_inline));
 inline void faxpy_v32b(const float a, const float *x, const float *y,
                        unsigned int avl) __attribute__((always_inline));
 inline void faxpy_v16b(const _Float16 a, const _Float16 *x, const _Float16 *y,
diff --git a/sw/spatzBenchmarks/dp-faxpy/main.c b/sw/spatzBenchmarks/dp-faxpy/main.c
@@ -62,6 +62,7 @@ int main() {
 
     snrt_dma_start_1d(x, axpy_X_dram, dim * sizeof(double));
     snrt_dma_start_1d(y, axpy_Y_dram, dim * sizeof(double));
+    snrt_dma_wait_all();
   }
 
   // Wait for all cores to finish
@@ -83,7 +84,11 @@ int main() {
     timer = benchmark_get_cycle();
 
   // Call AXPY
+#ifdef UNROLL
+  faxpy_v64b_unrl(*a, x_int, y_int, dim_core);
+#else
   faxpy_v64b(*a, x_int, y_int, dim_core);
+#endif
 
   // Wait for all cores to finish
   snrt_cluster_hw_barrier();
diff --git a/sw/spatzBenchmarks/dp-fdotp/kernel/fdotp.c b/sw/spatzBenchmarks/dp-fdotp/kernel/fdotp.c
@@ -59,6 +59,97 @@ double fdotp_v64b(const double *a, const double *b, unsigned int avl) {
   return red;
 }
 
+// Unrolled 64-bit dot-product: sum of a[i] * b[i] over avl elements
+//
+// The stripmine loop is unrolled into three stages that rotate the operand
+// registers (v8/v16 -> v0/v8 -> v16/v0). With m8 only v0, v8 and v16 are
+// free next to the accumulator v24, so register re-allocation is partial.
+//
+//   a, b : input vectors, avl elements each
+//   avl  : number of elements; 0 yields 0.0
+double fdotp_v64b_m8_unrl(const double *a, const double *b, unsigned int avl) {
+  const unsigned int orig_avl = avl;
+  unsigned int vl;
+
+  double red;
+
+  // With avl == 0 every vector instruction below runs with vl == 0 and
+  // writes nothing, which would leave the result undefined
+  if (avl == 0) return 0.0;
+
+  // Stripmine and accumulate a partial reduced vector. The strips use
+  // tail-undisturbed (tu) so that a shorter tail strip cannot clobber the
+  // partial sums that longer strips left in the upper elements of v24.
+  do {
+    // Set the vl
+    asm volatile("vsetvli %0, %1, e64, m8, tu, ma" : "=r"(vl) : "r"(avl));
+
+    // Load chunk a and b
+    asm volatile("vle64.v v8,  (%0)" ::"r"(a));
+    asm volatile("vle64.v v16, (%0)" ::"r"(b));
+
+    // Multiply and accumulate
+    if (avl == orig_avl) {
+      asm volatile("vfmul.vv v24, v8, v16");
+    } else {
+      asm volatile("vfmacc.vv v24, v8, v16");
+    }
+
+    // Bump pointers
+    a += vl;
+    b += vl;
+    avl -= vl;
+
+    if (avl == 0) break;
+
+    // Set the vl
+    asm volatile("vsetvli %0, %1, e64, m8, tu, ma" : "=r"(vl) : "r"(avl));
+
+    // Load chunk a and b
+    asm volatile("vle64.v v0, (%0)" ::"r"(a));
+    asm volatile("vle64.v v8, (%0)" ::"r"(b));
+
+    // Multiply and accumulate
+    asm volatile("vfmacc.vv v24, v0, v8");
+
+    // Bump pointers
+    a += vl;
+    b += vl;
+    avl -= vl;
+
+    if (avl == 0) break;
+
+    // Set the vl
+    asm volatile("vsetvli %0, %1, e64, m8, tu, ma" : "=r"(vl) : "r"(avl));
+
+    // Load chunk a and b
+    asm volatile("vle64.v v16, (%0)" ::"r"(a));
+    asm volatile("vle64.v v0, (%0)" ::"r"(b));
+
+    // Multiply and accumulate
+    asm volatile("vfmacc.vv v24, v0, v16");
+
+    // Bump pointers
+    a += vl;
+    b += vl;
+    avl -= vl;
+  } while (avl > 0);
+
+  // vl now holds the length of the last strip. Restore the length of the
+  // first strip (vsetvli is deterministic for the same AVL) so that the
+  // reduction covers every element of v24 that holds a partial sum.
+  asm volatile("vsetvli zero, %0, e64, m8, ta, ma" ::"r"(orig_avl));
+
+  // Clean the accumulator
+  asm volatile("vmv.s.x v0, zero");
+
+  // Reduce and return
+  asm volatile("vfredusum.vs v0, v24, v0");
+  asm volatile("vfmv.f.s %0, v0" : "=f"(red));
+
+  return red;
+}
+
 // 32-bit dot-product: a * b
 float fdotp_v32b(const float *a, const float *b, unsigned int avl) {
   const unsigned int orig_avl = avl;
diff --git a/sw/spatzBenchmarks/dp-fdotp/kernel/fdotp.h b/sw/spatzBenchmarks/dp-fdotp/kernel/fdotp.h
@@ -21,6 +21,9 @@
 
 inline double fdotp_v64b(const double *a, const double *b, unsigned int avl)
     __attribute__((always_inline));
+// Unrolled variant of fdotp_v64b (m8 register groups): 64-bit dot-product
+inline double fdotp_v64b_m8_unrl(const double *a, const double *b, unsigned int avl)
+    __attribute__((always_inline));
 inline float fdotp_v32b(const float *a, const float *b, unsigned int avl)
     __attribute__((always_inline));
 inline _Float16 fdotp_v16b(const _Float16 *a, const _Float16 *b,
diff --git a/sw/spatzBenchmarks/dp-fdotp/main.c b/sw/spatzBenchmarks/dp-fdotp/main.c
@@ -82,7 +82,11 @@ int main() {
 
   // Calculate dotp
   double acc;
+#ifdef UNROLL
+  acc = fdotp_v64b_m8_unrl(a_int, b_int, dim);
+#else
   acc = fdotp_v64b(a_int, b_int, dim);
+#endif
   result[cid] = acc;
 
   // Wait for all cores to finish