Tomasulo: Add floating-point benchmarks and FP hazard correctness tests

adambagley · adambagley · commit 0e403a11804b · 2026-02-21T19:41:32.000-05:00
tomasulo_perf gets 6 new double-precision benchmarks (Bench 8-13):
  dependent/independent FADD.D and FMUL.D chains, dependent FMADD.D
  chain, and mixed FP+INT cross-unit parallelism measurement.

  tomasulo_test gets Test 11 with 15 assertions (14 FP via TEST_FP, 1 integer
  via TEST) covering RAW/WAR/WAW hazards on FP registers, FP MUL-ADD
  forwarding, FP-INT crossover via fcvt, dependent FMADD.D chains, and
  independent FP op parallelism.
  Adds a TEST_FP macro that converts double results to int via fcvt.w.d
  for comparison without needing FP printf support.
diff --git a/sw/apps/tomasulo_perf/tomasulo_perf.c b/sw/apps/tomasulo_perf/tomasulo_perf.c
@@ -28,14 +28,22 @@
  * Uses hardware cycle and instret counters (Zicntr CSRs) for measurement.
  * IPC is reported as IPC*100 (integer, so 150 means IPC = 1.50).
  *
- * Benchmarks:
+ * Benchmarks (integer):
  *   1. Dependent ADD chain      (worst-case ILP: serialized)
  *   2. Independent ADD chains   (best-case ILP: fully parallel)
  *   3. Dependent MUL chain      (long-latency serialized)
  *   4. Independent MUL chains   (long-latency parallel)
  *   5. Mixed MUL + ADD          (latency hiding)
  *   6. Load-store throughput    (memory subsystem)
  *   7. Branch-heavy loop        (branch prediction + OOO)
+ *
+ * Benchmarks (floating-point, double-precision):
+ *   8. Dependent FADD.D chain   (FP ALU serialized)
+ *   9. Independent FADD.D chains (FP ALU parallel)
+ *  10. Dependent FMUL.D chain   (FP MUL serialized)
+ *  11. Independent FMUL.D chains (FP MUL parallel)
+ *  12. Dependent FMADD.D chain  (fused multiply-add, key for numerics)
+ *  13. Mixed FP + INT           (cross-unit parallelism)
  */
 
 #include "csr.h"
@@ -218,12 +226,149 @@ int main(void)
     i1 = rdinstret();
     print_result(c1 - c0, i1 - i0);
 
+    /* ===================================================================== */
+    /* Floating-Point Benchmarks                                             */
+    /* ===================================================================== */
+    uart_printf("\n--- Floating-Point Benchmarks (double-precision) ---\n\n");
+
+    /* ===================================================================== */
+    /* Benchmark 8: Dependent FADD.D chain (100 instructions)                */
+    /* Each FADD.D reads the result of the previous one - no ILP possible.   */
+    /* FP analogue of Bench 1.                                               */
+    /* ===================================================================== */
+    uart_printf("Bench 8: Dependent FADD.D chain (100 instrs)\n");
+    {
+        double accum = 1.0, incr = 0.5;
+        c0 = rdcycle();
+        i0 = rdinstret();
+        __asm__ volatile(".rept 100\n"
+                         "fadd.d %[a], %[a], %[i]\n"
+                         ".endr\n"
+                         : [a] "+f"(accum)
+                         : [i] "f"(incr));
+        c1 = rdcycle();
+        i1 = rdinstret();
+        print_result(c1 - c0, i1 - i0);
+    }
+
+    /* ===================================================================== */
+    /* Benchmark 9: Independent FADD.D chains (4 x 25 = 100 instructions)    */
+    /* 4 chains with no cross-dependencies - ideal for OOO execution.        */
+    /* FP analogue of Bench 2.                                               */
+    /* ===================================================================== */
+    uart_printf("Bench 9: Independent FADD.D chains (4x25 = 100 instrs)\n");
+    {
+        double a0 = 1.0, a1 = 2.0, a2 = 3.0, a3 = 4.0;
+        double inc = 0.5;
+        c0 = rdcycle();
+        i0 = rdinstret();
+        __asm__ volatile(".rept 25\n"
+                         "fadd.d %[a0], %[a0], %[inc]\n"
+                         "fadd.d %[a1], %[a1], %[inc]\n"
+                         "fadd.d %[a2], %[a2], %[inc]\n"
+                         "fadd.d %[a3], %[a3], %[inc]\n"
+                         ".endr\n"
+                         : [a0] "+f"(a0), [a1] "+f"(a1), [a2] "+f"(a2), [a3] "+f"(a3)
+                         : [inc] "f"(inc));
+        c1 = rdcycle();
+        i1 = rdinstret();
+        print_result(c1 - c0, i1 - i0);
+    }
+
+    /* ===================================================================== */
+    /* Benchmark 10: Dependent FMUL.D chain (50 instructions)                */
+    /* FMUL.D has multi-cycle latency; dependent chain is very slow.         */
+    /* Multiply by 1.0 to keep value stable. FP analogue of Bench 3.        */
+    /* ===================================================================== */
+    uart_printf("Bench 10: Dependent FMUL.D chain (50 instrs)\n");
+    {
+        double accum = 2.0, factor = 1.0;
+        c0 = rdcycle();
+        i0 = rdinstret();
+        __asm__ volatile(".rept 50\n"
+                         "fmul.d %[a], %[a], %[f]\n"
+                         ".endr\n"
+                         : [a] "+f"(accum)
+                         : [f] "f"(factor));
+        c1 = rdcycle();
+        i1 = rdinstret();
+        print_result(c1 - c0, i1 - i0);
+    }
+
+    /* ===================================================================== */
+    /* Benchmark 11: Independent FMUL.D chains (4 x 12 = 48 instructions)   */
+    /* 4 independent FMUL.D chains. FP analogue of Bench 4.                 */
+    /* ===================================================================== */
+    uart_printf("Bench 11: Independent FMUL.D chains (4x12 = 48 instrs)\n");
+    {
+        double m0 = 1.0, m1 = 2.0, m2 = 3.0, m3 = 4.0;
+        double factor = 1.0;
+        c0 = rdcycle();
+        i0 = rdinstret();
+        __asm__ volatile(".rept 12\n"
+                         "fmul.d %[m0], %[m0], %[f]\n"
+                         "fmul.d %[m1], %[m1], %[f]\n"
+                         "fmul.d %[m2], %[m2], %[f]\n"
+                         "fmul.d %[m3], %[m3], %[f]\n"
+                         ".endr\n"
+                         : [m0] "+f"(m0), [m1] "+f"(m1), [m2] "+f"(m2), [m3] "+f"(m3)
+                         : [f] "f"(factor));
+        c1 = rdcycle();
+        i1 = rdinstret();
+        print_result(c1 - c0, i1 - i0);
+    }
+
+    /* ===================================================================== */
+    /* Benchmark 12: Dependent FMADD.D chain (50 instructions)               */
+    /* Fused multiply-add: accum = accum * 1.0 + 0.5, serialized.           */
+    /* Key for numerical workloads (BLAS, FFT, etc.).                        */
+    /* ===================================================================== */
+    uart_printf("Bench 12: Dependent FMADD.D chain (50 instrs)\n");
+    {
+        double accum = 0.0, mul_one = 1.0, add_half = 0.5;
+        c0 = rdcycle();
+        i0 = rdinstret();
+        __asm__ volatile(".rept 50\n"
+                         "fmadd.d %[a], %[a], %[m], %[c]\n"
+                         ".endr\n"
+                         : [a] "+f"(accum)
+                         : [m] "f"(mul_one), [c] "f"(add_half));
+        c1 = rdcycle();
+        i1 = rdinstret();
+        print_result(c1 - c0, i1 - i0);
+    }
+
+    /* ===================================================================== */
+    /* Benchmark 13: Mixed FP + INT (50 pairs = 100 instructions)            */
+    /* Tests cross-unit parallelism: FP and INT units should work in         */
+    /* parallel since there are no data dependencies between them.           */
+    /* ===================================================================== */
+    uart_printf("Bench 13: Mixed FP+INT (50 pairs = 100 instrs)\n");
+    {
+        double fp_acc = 1.0, fp_inc = 0.5;
+        c0 = rdcycle();
+        i0 = rdinstret();
+        __asm__ volatile("addi t0, zero, 0\n"
+                         "addi t1, zero, 1\n"
+                         ".rept 50\n"
+                         "fadd.d %[fa], %[fa], %[fi]\n"
+                         "add    t0, t0, t1\n"
+                         ".endr\n"
+                         : [fa] "+f"(fp_acc)
+                         : [fi] "f"(fp_inc)
+                         : "t0", "t1");
+        c1 = rdcycle();
+        i1 = rdinstret();
+        print_result(c1 - c0, i1 - i0);
+    }
+
     /* ===================================================================== */
     /* Summary                                                               */
     /* ===================================================================== */
     uart_printf("\n============================================================\n");
     uart_printf("  Performance measurement complete.\n");
-    uart_printf("  Compare Bench 1 vs 2 (ADD) and Bench 3 vs 4 (MUL)\n");
+    uart_printf("  INT: Compare Bench 1 vs 2 (ADD) and Bench 3 vs 4 (MUL)\n");
+    uart_printf("  FP:  Compare Bench 8 vs 9 (FADD) and Bench 10 vs 11 (FMUL)\n");
     uart_printf("  to see the IPC benefit of out-of-order execution.\n");
     uart_printf("============================================================\n\n");
 
diff --git a/sw/apps/tomasulo_test/tomasulo_test.c b/sw/apps/tomasulo_test/tomasulo_test.c
@@ -34,6 +34,7 @@
  *   8. Complex mixed dependency chains
  *   9. Branch with loop - speculative execution / branch prediction
  *  10. CDB contention - multiple simultaneous completions
+ *  11. FP hazards - RAW/WAR/WAW/crossover with double-precision FP
  */
 
 #include "uart.h"
@@ -58,6 +59,16 @@ static uint32_t tests_failed;
         }                                                                                          \
     } while (0)
 
+/* Truncate a double toward zero (fcvt.w.d, rtz) to int32 and compare it with */
+/* TEST; fractional parts are discarded. Avoids needing FP printf support.    */
+#define TEST_FP(name, fp_result, expected_int)                                                     \
+    do {                                                                                           \
+        int32_t _iv;                                                                               \
+        double _fr = (fp_result);                                                                  \
+        __asm__ volatile("fcvt.w.d %0, %1, rtz" : "=r"(_iv) : "f"(_fr));                           \
+        TEST(name, (uint32_t) _iv, (uint32_t) (expected_int));                                     \
+    } while (0)
+
 /* ========================================================================== */
 /* Test 1: RAW (Read After Write) Hazard                                      */
 /* Tests: Data forwarding through CDB, reservation station waiting            */
@@ -539,6 +550,91 @@ static void test_cdb_contention(void)
     uart_printf(" done\n");
 }
 
+/* ========================================================================== */
+/* Test 11: Floating-Point Hazards (double-precision)                         */
+/* Tests: FP RAW/WAR/WAW, FP-INT crossover, FMADD chain, independent FP ops  */
+/* ========================================================================== */
+
+static void test_fp_hazards(void)
+{
+    uart_printf("Test 11: FP hazards...");
+
+    /* FP RAW chain: each FADD.D reads the previous result */
+    double fa, fb, fc;
+    __asm__ volatile("fadd.d %[fa], %[v1], %[v2]\n" /* fa = 1.0 + 2.0 = 3.0 */
+                     "fadd.d %[fb], %[fa], %[v4]\n" /* fb = 3.0 + 4.0 = 7.0  (RAW) */
+                     "fadd.d %[fc], %[fb], %[v8]\n" /* fc = 7.0 + 8.0 = 15.0 (RAW) */
+                     : [fa] "=&f"(fa), [fb] "=&f"(fb), [fc] "=&f"(fc)
+                     : [v1] "f"(1.0), [v2] "f"(2.0), [v4] "f"(4.0), [v8] "f"(8.0));
+    TEST_FP("FP RAW fa", fa, 3);
+    TEST_FP("FP RAW fb", fb, 7);
+    TEST_FP("FP RAW fc", fc, 15);
+
+    /* FP MUL→ADD RAW: FMUL.D produces, FADD.D consumes */
+    double fp, fs;
+    __asm__ volatile("fmul.d %[p], %[a], %[b]\n" /* fp = 3.0 * 4.0 = 12.0 */
+                     "fadd.d %[s], %[p], %[c]\n" /* fs = 12.0 + 1.0 = 13.0 (RAW) */
+                     : [p] "=&f"(fp), [s] "=&f"(fs)
+                     : [a] "f"(3.0), [b] "f"(4.0), [c] "f"(1.0));
+    TEST_FP("FP MUL-ADD product", fp, 12);
+    TEST_FP("FP MUL-ADD sum", fs, 13);
+
+    /* FP WAR: read src, then overwrite it */
+    double fp_res;
+    double fp_src = 5.0;
+    __asm__ volatile("fadd.d %[res], %[src], %[src]\n" /* res = 5.0 + 5.0 = 10.0 */
+                     "fmul.d %[src], %[z], %[z]\n"     /* WAR: overwrite src = 0*0 = 0 */
+                     : [res] "=&f"(fp_res), [src] "+f"(fp_src)
+                     : [z] "f"(0.0));
+    TEST_FP("FP WAR result", fp_res, 10);
+    TEST_FP("FP WAR src overwritten", fp_src, 0);
+
+    /* FP WAW: only the final write survives; "=&f" keeps w off the input regs */
+    double fw;
+    __asm__ volatile("fadd.d %[w], %[v1], %[z]\n" /* 1.0 */
+                     "fadd.d %[w], %[v2], %[z]\n" /* WAW: 2.0 */
+                     "fadd.d %[w], %[v3], %[z]\n" /* WAW: 3.0 (final) */
+                     : [w] "=&f"(fw)
+                     : [v1] "f"(1.0), [v2] "f"(2.0), [v3] "f"(3.0), [z] "f"(0.0));
+    TEST_FP("FP WAW final", fw, 3);
+
+    /* FP-INT crossover: INT produces value, FP consumes via convert */
+    uint32_t int_val;
+    double fp_from_int;
+    __asm__ volatile("addi %[iv], zero, 7\n"             /* INT: iv = 7 */
+                     "fcvt.d.w %[fv], %[iv]\n"           /* Convert to FP: 7.0 */
+                     "fadd.d   %[fv], %[fv], %[three]\n" /* FP: 7.0 + 3.0 = 10.0 */
+                     : [iv] "=&r"(int_val), [fv] "=&f"(fp_from_int)
+                     : [three] "f"(3.0));
+    TEST("FP-INT crossover int_val", int_val, 7);
+    TEST_FP("FP-INT crossover fp result", fp_from_int, 10);
+
+    /* FMADD.D dependent chain: accum = accum * 1.0 + addend */
+    double fma_acc;
+    __asm__ volatile("fmul.d  %[a], %[z], %[z]\n"          /* accum = 0.0 */
+                     "fmadd.d %[a], %[a], %[one], %[v2]\n" /* 0*1+2 = 2.0 */
+                     "fmadd.d %[a], %[a], %[one], %[v3]\n" /* 2*1+3 = 5.0 */
+                     "fmadd.d %[a], %[a], %[one], %[v4]\n" /* 5*1+4 = 9.0 */
+                     : [a] "=&f"(fma_acc)
+                     : [z] "f"(0.0), [one] "f"(1.0), [v2] "f"(2.0), [v3] "f"(3.0), [v4] "f"(4.0));
+    TEST_FP("FMADD chain", fma_acc, 9);
+
+    /* 4 independent FADD.D ops - all can execute in parallel */
+    double ia, ib, ic, id;
+    __asm__ volatile("fadd.d %[a], %[v1], %[v2]\n" /* 1+2 = 3 */
+                     "fadd.d %[b], %[v3], %[v4]\n" /* 3+4 = 7 */
+                     "fadd.d %[c], %[v5], %[v1]\n" /* 5+1 = 6 */
+                     "fadd.d %[d], %[v2], %[v3]\n" /* 2+3 = 5 */
+                     : [a] "=&f"(ia), [b] "=&f"(ib), [c] "=&f"(ic), [d] "=&f"(id)
+                     : [v1] "f"(1.0), [v2] "f"(2.0), [v3] "f"(3.0), [v4] "f"(4.0), [v5] "f"(5.0));
+    TEST_FP("FP indep a", ia, 3);
+    TEST_FP("FP indep b", ib, 7);
+    TEST_FP("FP indep c", ic, 6);
+    TEST_FP("FP indep d", id, 5);
+
+    uart_printf(" done\n");
+}
+
 /* ========================================================================== */
 /* Main Entry Point                                                           */
 /* ========================================================================== */
@@ -560,6 +656,7 @@ int main(void)
     test_complex_deps();
     test_branch_loop();
     test_cdb_contention();
+    test_fp_hazards();
 
     uart_printf("\n------------------------------------------------------------\n");
     uart_printf(