Add fp_peak benchmark

jiegec · jiegec · commit 5d41fea8ee18 · 2025-03-18T12:44:26.000+08:00
diff --git a/meson.build b/meson.build
@@ -70,6 +70,7 @@ progs = [
 	['btb_size_basic', false, true, true],
 	['elimination', false, true, true],
 	['find_branch_misses_pmu', false, false, true],
+	['fp_peak', false, false, true],
 	['ghr_size', false, true, true],
 	['itlb_size', false, true, true],
 	['phr_branch_bits_location', true, false, true],
diff --git a/src/fp_peak.cpp b/src/fp_peak.cpp
@@ -0,0 +1,116 @@
+#include "include/utils.h"
+#include <assert.h>
+#include <cstddef>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <vector>
+
+// Signature shared by all generated gadgets.
+// args: loop count
+typedef void (*gadget)(size_t);
+// Table of gadget entry points; the assembly that defines it is emitted by
+// fp_peak_gen.cpp.
+extern "C" gadget fp_peak_gadgets[];
+
+// Times each gadget emitted by fp_peak_gen.cpp and writes min/avg/max flops
+// per cycle to fp_peak.csv. Returns 1 if fp_peak.csv cannot be opened.
+int main(int argc, char *argv[]) {
+  // must match fp_peak_gen.cpp: each gadget loop body holds `repeat` FMAs
+  const int repeat = 1000;
+  const int loop_count = 1000;
+  const int iterations = 100; // timed runs per pattern
+  const int warmup = 10;      // leading runs excluded from the statistics
+
+#ifdef HOST_AARCH64
+  const int num_patterns = 6;
+  // writable buffers: the SVE names are filled in at runtime below
+  char patterns[][20] = {
+      "32-bit SP FMADD",  "64-bit DP FMADD", "128-bit SP ASIMD",
+      "128-bit DP ASIMD", "xxxx-bit SP SVE", "xxxx-bit DP SVE",
+  };
+
+  // flops per instruction: number of lanes * 2 (multiply + add)
+  int coef[] = {
+      32 / 32 * 2,  // 32-bit SP
+      64 / 64 * 2,  // 64-bit DP
+      128 / 32 * 2, // 128-bit SP
+      128 / 64 * 2, // 128-bit DP
+      0,            // ?-bit SP, set at runtime
+      0,            // ?-bit DP, set at runtime
+  };
+#else
+  const int num_patterns = 4;
+  const char *patterns[] = {
+      "256-bit SP FMA",
+      "256-bit DP FMA",
+      "512-bit SP AVX512F",
+      "512-bit DP AVX512F",
+  };
+
+  // flops per instruction: number of lanes * 2 (multiply + add)
+  int coef[] = {
+      256 / 32 * 2, // 256-bit SP
+      256 / 64 * 2, // 256-bit DP
+      512 / 32 * 2, // 512-bit SP
+      512 / 64 * 2, // 512-bit DP
+  };
+#endif
+
+  bind_to_core();
+  setup_perf_cycles();
+  FILE *fp = fopen("fp_peak.csv", "w");
+  if (!fp) {
+    perror("fopen fp_peak.csv");
+    return 1;
+  }
+
+  fprintf(fp, "pattern,min,avg,max\n");
+  // NOTE(review): all gadgets run unconditionally; host support is assumed
+  for (int pattern = 0; pattern < num_patterns; pattern++) {
+#ifdef HOST_AARCH64
+    // read sve length in runtime
+    if (pattern == 4) {
+      uint64_t len = 0; // 32-bit elements per vector
+      asm __volatile__(".arch armv9-a+sve\ncntw %0" : "=r"(len));
+      snprintf(patterns[pattern], sizeof(patterns[pattern]), "%lu-bit SP SVE",
+               (unsigned long)(len * 32));
+      coef[pattern] = len * 2;
+    } else if (pattern == 5) {
+      uint64_t len = 0; // 64-bit elements per vector
+      asm __volatile__(".arch armv9-a+sve\ncntd %0" : "=r"(len));
+      snprintf(patterns[pattern], sizeof(patterns[pattern]), "%lu-bit DP SVE",
+               (unsigned long)(len * 64));
+      coef[pattern] = len * 2;
+    }
+#endif
+
+    double sum = 0, min = 0, max = 0;
+    int samples = 0;
+    // run several times
+    for (int i = 0; i < iterations; i++) {
+      uint64_t begin = perf_read_cycles();
+      fp_peak_gadgets[pattern](loop_count);
+      uint64_t elapsed = perf_read_cycles() - begin;
+      if (i < warmup) {
+        continue; // skip warmup
+      }
+
+      // cycles per FMA instruction -> floating point operations per cycle
+      double flops =
+          (double)coef[pattern] / ((double)elapsed / loop_count / repeat);
+      min = (samples == 0 || flops < min) ? flops : min;
+      max = (samples == 0 || flops > max) ? flops : max;
+      sum += flops;
+      samples++;
+    }
+
+    fprintf(fp, "%s,%.2lf,%.2lf,%.2lf\n", patterns[pattern], min,
+            sum / samples, max);
+    fflush(fp);
+  }
+
+  fclose(fp);
+  printf("Results are written to fp_peak.csv\n");
+  return 0;
+}
diff --git a/src/fp_peak_gen.cpp b/src/fp_peak_gen.cpp
@@ -0,0 +1,96 @@
+#include "include/utils.h"
+#include <cstdio>
+
+// Emits the fp_peak gadgets and their entry point table as assembly into the
+// file named by argv[1]. Returns 1 if that path is missing or cannot be opened.
+int main(int argc, char *argv[]) {
+  if (argc < 2) {
+    fprintf(stderr, "usage: fp_peak_gen <output assembly file>\n");
+    return 1;
+  }
+  FILE *fp = fopen(argv[1], "w");
+  if (!fp) {
+    perror(argv[1]);
+    return 1;
+  }
+#ifdef HOST_AARCH64
+  int num_patterns = 6;
+#else
+  int num_patterns = 4;
+#endif
+  // FMA instructions per loop body; fp_peak.cpp assumes the same value
+  int repeat = 1000;
+
+  // args: loop count
+  fprintf(fp, ".text\n");
+  for (int pattern = 0; pattern < num_patterns; pattern++) {
+    // entry
+    fprintf(fp, ".global fp_peak_%d\n", pattern);
+    fprintf(fp, ".balign 32\n");
+    fprintf(fp, "fp_peak_%d:\n", pattern);
+#if defined(HOST_AARCH64)
+    // spill q8-q15: the loop body below overwrites them
+    fprintf(fp, "\tsub sp, sp, #0x100\n");
+    fprintf(fp, "\tstp q8, q9, [sp, #0x0]\n");
+    fprintf(fp, "\tstp q10, q11, [sp, #0x20]\n");
+    fprintf(fp, "\tstp q12, q13, [sp, #0x40]\n");
+    fprintf(fp, "\tstp q14, q15, [sp, #0x60]\n");
+
+    if (pattern == 4 || pattern == 5) {
+      fprintf(fp, "\t.arch armv9-a+sve\n");
+      fprintf(fp, "\tptrue p0.b\n");
+    }
+
+    fprintf(fp, "\t1:\n");
+    for (int i = 0; i < repeat; i++) {
+      if (pattern < 2) {
+        // scalar fma using FMADD: 0 = single precision, 1 = double precision
+        char r = pattern == 0 ? 's' : 'd';
+        fprintf(fp, "\tfmadd %c%d, %c0, %c1, %c2\n", r, (i % 16) + 3, r, r, r);
+      } else if (pattern < 4) {
+        // 128-bit fma using ASIMD: 2 = single precision, 3 = double precision
+        const char *t = pattern == 2 ? "4s" : "2d";
+        fprintf(fp, "\tfmla v%d.%s, v0.%s, v1.%s\n", (i % 16) + 2, t, t, t);
+      } else if (pattern < 6) {
+        // fma using SVE: 4 = single precision, 5 = double precision
+        char t = pattern == 4 ? 's' : 'd';
+        fprintf(fp, "\tfmla z%d.%c, p0/m, z0.%c, z1.%c\n", (i % 16) + 2, t, t,
+                t);
+      }
+    }
+
+    fprintf(fp, "\tsubs x0, x0, #1\n");
+    fprintf(fp, "\tbne 1b\n");
+
+    fprintf(fp, "\tldp q8, q9, [sp, #0x0]\n");
+    fprintf(fp, "\tldp q10, q11, [sp, #0x20]\n");
+    fprintf(fp, "\tldp q12, q13, [sp, #0x40]\n");
+    fprintf(fp, "\tldp q14, q15, [sp, #0x60]\n");
+    fprintf(fp, "\tadd sp, sp, #0x100\n");
+    fprintf(fp, "\tret\n");
+#endif
+#if defined(HOST_AMD64)
+    // patterns 0/1 use 256-bit ymm, 2/3 use 512-bit zmm; even = SP, odd = DP
+    const char *reg = pattern < 2 ? "ymm" : "zmm";
+    const char *suffix = pattern % 2 == 0 ? "ps" : "pd";
+    fprintf(fp, "\t1:\n");
+    for (int i = 0; i < repeat; i++) {
+      // rotate the destination over registers 2..15
+      fprintf(fp, "\tvfmadd213%s %%%s0, %%%s1, %%%s%d\n", suffix, reg, reg, reg,
+              (i % 14) + 2);
+    }
+
+    fprintf(fp, "\tdec %%rdi\n");
+    fprintf(fp, "\tjnz 1b\n");
+
+    fprintf(fp, "\tret\n");
+#endif
+  }
+
+  define_gadgets_array(fp, "fp_peak_gadgets");
+  for (int pattern = 0; pattern < num_patterns; pattern++) {
+    add_gadget(fp, "fp_peak_%d", pattern);
+  }
+  fclose(fp);
+  return 0;
+}