Skip to content

Commit 0e403a1

Browse files
committed
Tomasulo: Add floating-point benchmarks and FP hazard correctness tests
tomasulo_perf gets 6 new double-precision benchmarks (Bench 8-13): dependent/independent FADD.D and FMUL.D chains, dependent FMADD.D chain, and mixed FP+INT cross-unit parallelism measurement. tomasulo_test gets Test 11 with 16 FP assertions covering RAW/WAR/WAW hazards on FP registers, FP MUL-ADD forwarding, FP-INT crossover via fcvt, dependent FMADD.D chains, and independent FP op parallelism. Adds a TEST_FP macro that converts double results to int via fcvt.w.d for comparison without needing FP printf support.
1 parent 09d6f0e commit 0e403a1

File tree

2 files changed

+244
-2
lines changed

2 files changed

+244
-2
lines changed

sw/apps/tomasulo_perf/tomasulo_perf.c

Lines changed: 147 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,22 @@
2828
* Uses hardware cycle and instret counters (Zicntr CSRs) for measurement.
2929
* IPC is reported as IPC*100 (integer, so 150 means IPC = 1.50).
3030
*
31-
* Benchmarks:
31+
* Benchmarks (integer):
3232
* 1. Dependent ADD chain (worst-case ILP: serialized)
3333
* 2. Independent ADD chains (best-case ILP: fully parallel)
3434
* 3. Dependent MUL chain (long-latency serialized)
3535
* 4. Independent MUL chains (long-latency parallel)
3636
* 5. Mixed MUL + ADD (latency hiding)
3737
* 6. Load-store throughput (memory subsystem)
3838
* 7. Branch-heavy loop (branch prediction + OOO)
39+
*
40+
* Benchmarks (floating-point, double-precision):
41+
* 8. Dependent FADD.D chain (FP ALU serialized)
42+
* 9. Independent FADD.D chains (FP ALU parallel)
43+
* 10. Dependent FMUL.D chain (FP MUL serialized)
44+
* 11. Independent FMUL.D chains (FP MUL parallel)
45+
* 12. Dependent FMADD.D chain (fused multiply-add, key for numerics)
46+
* 13. Mixed FP + INT (cross-unit parallelism)
3947
*/
4048

4149
#include "csr.h"
@@ -218,12 +226,149 @@ int main(void)
218226
i1 = rdinstret();
219227
print_result(c1 - c0, i1 - i0);
220228

229+
/* ===================================================================== */
230+
/* Floating-Point Benchmarks */
231+
/* ===================================================================== */
232+
uart_printf("\n--- Floating-Point Benchmarks (double-precision) ---\n\n");
233+
234+
/* ===================================================================== */
235+
/* Benchmark 8: Dependent FADD.D chain (100 instructions) */
236+
/* Each FADD.D reads the result of the previous one - no ILP possible. */
237+
/* FP analogue of Bench 1. */
238+
/* ===================================================================== */
239+
uart_printf("Bench 8: Dependent FADD.D chain (100 instrs)\n");
240+
{
241+
double accum = 1.0, incr = 0.5;
242+
c0 = rdcycle();
243+
i0 = rdinstret();
244+
__asm__ volatile(".rept 100\n"
245+
"fadd.d %[a], %[a], %[i]\n"
246+
".endr\n"
247+
: [a] "+f"(accum)
248+
: [i] "f"(incr));
249+
c1 = rdcycle();
250+
i1 = rdinstret();
251+
print_result(c1 - c0, i1 - i0);
252+
}
253+
254+
/* ===================================================================== */
255+
/* Benchmark 9: Independent FADD.D chains (4 x 25 = 100 instructions) */
256+
/* 4 chains with no cross-dependencies - ideal for OOO execution. */
257+
/* FP analogue of Bench 2. */
258+
/* ===================================================================== */
259+
uart_printf("Bench 9: Independent FADD.D chains (4x25 = 100 instrs)\n");
260+
{
261+
double a0 = 1.0, a1 = 2.0, a2 = 3.0, a3 = 4.0;
262+
double inc = 0.5;
263+
c0 = rdcycle();
264+
i0 = rdinstret();
265+
__asm__ volatile(".rept 25\n"
266+
"fadd.d %[a0], %[a0], %[inc]\n"
267+
"fadd.d %[a1], %[a1], %[inc]\n"
268+
"fadd.d %[a2], %[a2], %[inc]\n"
269+
"fadd.d %[a3], %[a3], %[inc]\n"
270+
".endr\n"
271+
: [a0] "+f"(a0), [a1] "+f"(a1), [a2] "+f"(a2), [a3] "+f"(a3)
272+
: [inc] "f"(inc));
273+
c1 = rdcycle();
274+
i1 = rdinstret();
275+
print_result(c1 - c0, i1 - i0);
276+
}
277+
278+
/* ===================================================================== */
279+
/* Benchmark 10: Dependent FMUL.D chain (50 instructions) */
280+
/* FMUL.D has multi-cycle latency; dependent chain is very slow. */
281+
/* Multiply by 1.0 to keep value stable. FP analogue of Bench 3. */
282+
/* ===================================================================== */
283+
uart_printf("Bench 10: Dependent FMUL.D chain (50 instrs)\n");
284+
{
285+
double accum = 2.0, factor = 1.0;
286+
c0 = rdcycle();
287+
i0 = rdinstret();
288+
__asm__ volatile(".rept 50\n"
289+
"fmul.d %[a], %[a], %[f]\n"
290+
".endr\n"
291+
: [a] "+f"(accum)
292+
: [f] "f"(factor));
293+
c1 = rdcycle();
294+
i1 = rdinstret();
295+
print_result(c1 - c0, i1 - i0);
296+
}
297+
298+
/* ===================================================================== */
299+
/* Benchmark 11: Independent FMUL.D chains (4 x 12 = 48 instructions) */
300+
/* 4 independent FMUL.D chains. FP analogue of Bench 4. */
301+
/* ===================================================================== */
302+
uart_printf("Bench 11: Independent FMUL.D chains (4x12 = 48 instrs)\n");
303+
{
304+
double m0 = 1.0, m1 = 2.0, m2 = 3.0, m3 = 4.0;
305+
double factor = 1.0;
306+
c0 = rdcycle();
307+
i0 = rdinstret();
308+
__asm__ volatile(".rept 12\n"
309+
"fmul.d %[m0], %[m0], %[f]\n"
310+
"fmul.d %[m1], %[m1], %[f]\n"
311+
"fmul.d %[m2], %[m2], %[f]\n"
312+
"fmul.d %[m3], %[m3], %[f]\n"
313+
".endr\n"
314+
: [m0] "+f"(m0), [m1] "+f"(m1), [m2] "+f"(m2), [m3] "+f"(m3)
315+
: [f] "f"(factor));
316+
c1 = rdcycle();
317+
i1 = rdinstret();
318+
print_result(c1 - c0, i1 - i0);
319+
}
320+
321+
/* ===================================================================== */
322+
/* Benchmark 12: Dependent FMADD.D chain (50 instructions) */
323+
/* Fused multiply-add: accum = accum * 1.0 + 0.5, serialized. */
324+
/* Key for numerical workloads (BLAS, FFT, etc.). */
325+
/* ===================================================================== */
326+
uart_printf("Bench 12: Dependent FMADD.D chain (50 instrs)\n");
327+
{
328+
double accum = 0.0, mul_one = 1.0, add_half = 0.5;
329+
c0 = rdcycle();
330+
i0 = rdinstret();
331+
__asm__ volatile(".rept 50\n"
332+
"fmadd.d %[a], %[a], %[m], %[c]\n"
333+
".endr\n"
334+
: [a] "+f"(accum)
335+
: [m] "f"(mul_one), [c] "f"(add_half));
336+
c1 = rdcycle();
337+
i1 = rdinstret();
338+
print_result(c1 - c0, i1 - i0);
339+
}
340+
341+
/* ===================================================================== */
342+
/* Benchmark 13: Mixed FP + INT (50 pairs = 100 instructions) */
343+
/* Tests cross-unit parallelism: FP and INT units should work in */
344+
/* parallel since there are no data dependencies between them. */
345+
/* ===================================================================== */
346+
uart_printf("Bench 13: Mixed FP+INT (50 pairs = 100 instrs)\n");
347+
{
348+
double fp_acc = 1.0, fp_inc = 0.5;
349+
c0 = rdcycle();
350+
i0 = rdinstret();
351+
__asm__ volatile("addi t0, zero, 0\n"
352+
"addi t1, zero, 1\n"
353+
".rept 50\n"
354+
"fadd.d %[fa], %[fa], %[fi]\n"
355+
"add t0, t0, t1\n"
356+
".endr\n"
357+
: [fa] "+f"(fp_acc)
358+
: [fi] "f"(fp_inc)
359+
: "t0", "t1");
360+
c1 = rdcycle();
361+
i1 = rdinstret();
362+
print_result(c1 - c0, i1 - i0);
363+
}
364+
221365
/* ===================================================================== */
222366
/* Summary */
223367
/* ===================================================================== */
224368
uart_printf("\n============================================================\n");
225369
uart_printf(" Performance measurement complete.\n");
226-
uart_printf(" Compare Bench 1 vs 2 (ADD) and Bench 3 vs 4 (MUL)\n");
370+
uart_printf(" INT: Compare Bench 1 vs 2 (ADD) and Bench 3 vs 4 (MUL)\n");
371+
uart_printf(" FP: Compare Bench 8 vs 9 (FADD) and Bench 10 vs 11 (FMUL)\n");
227372
uart_printf(" to see the IPC benefit of out-of-order execution.\n");
228373
uart_printf("============================================================\n\n");
229374

sw/apps/tomasulo_test/tomasulo_test.c

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
* 8. Complex mixed dependency chains
3535
* 9. Branch with loop - speculative execution / branch prediction
3636
* 10. CDB contention - multiple simultaneous completions
37+
* 11. FP hazards - RAW/WAR/WAW/crossover with double-precision FP
3738
*/
3839

3940
#include "uart.h"
@@ -58,6 +59,16 @@ static uint32_t tests_failed;
5859
} \
5960
} while (0)
6061

62+
/*
 * TEST_FP(name, fp_result, expected_int)
 *
 * Checks a double-precision value without requiring FP printf support:
 * fp_result is evaluated once, converted to a signed 32-bit integer with
 * fcvt.w.d using the static "rtz" (round toward zero) rounding mode, and the
 * integer is then handed to the existing TEST macro for comparison against
 * expected_int. Both sides are cast to uint32_t before being passed to TEST.
 */
#define TEST_FP(name, fp_result, expected_int)                                 \
    do {                                                                       \
        int32_t test_fp_as_int_;                                               \
        double test_fp_value_ = (fp_result);                                   \
        __asm__ volatile("fcvt.w.d %[out], %[in], rtz"                         \
                         : [out] "=r"(test_fp_as_int_)                         \
                         : [in] "f"(test_fp_value_));                          \
        TEST(name, (uint32_t) test_fp_as_int_, (uint32_t) (expected_int));     \
    } while (0)
71+
6172
/* ========================================================================== */
6273
/* Test 1: RAW (Read After Write) Hazard */
6374
/* Tests: Data forwarding through CDB, reservation station waiting */
@@ -539,6 +550,91 @@ static void test_cdb_contention(void)
539550
uart_printf(" done\n");
540551
}
541552

553+
/* ========================================================================== */
554+
/* Test 11: Floating-Point Hazards (double-precision) */
555+
/* Tests: FP RAW/WAR/WAW, FP-INT crossover, FMADD chain, independent FP ops */
556+
/* ========================================================================== */
557+
558+
/*
 * Test 11: double-precision floating-point hazard checks.
 *
 * Runs a series of short inline-assembly sequences on FP registers and checks
 * each result with TEST_FP (which compares the value after conversion to a
 * 32-bit integer) or TEST for the one integer result. The sequences cover:
 *   - a three-deep FADD.D RAW chain,
 *   - FMUL.D feeding FADD.D,
 *   - WAR (a source register overwritten after being read),
 *   - WAW (three back-to-back writes to one register),
 *   - an integer value crossing into the FP side via fcvt.d.w,
 *   - a dependent FMADD.D chain,
 *   - four mutually independent FADD.D operations.
 *
 * Register-constraint note: any output that is written before the last input
 * of the same asm statement has been read is marked early-clobber ("&").
 * Without it the compiler is allowed to place that output in the same register
 * as an input, which would corrupt later reads inside the sequence.
 */
static void test_fp_hazards(void)
{
    uart_printf("Test 11: FP hazards...");

    /* FP RAW chain: each FADD.D reads the previous result */
    double fa, fb, fc;
    __asm__ volatile("fadd.d %[fa], %[v1], %[v2]\n" /* fa = 1.0 + 2.0 = 3.0 */
                     "fadd.d %[fb], %[fa], %[v4]\n" /* fb = 3.0 + 4.0 = 7.0 (RAW) */
                     "fadd.d %[fc], %[fb], %[v8]\n" /* fc = 7.0 + 8.0 = 15.0 (RAW) */
                     : [fa] "=&f"(fa), [fb] "=&f"(fb), [fc] "=&f"(fc)
                     : [v1] "f"(1.0), [v2] "f"(2.0), [v4] "f"(4.0), [v8] "f"(8.0));
    TEST_FP("FP RAW fa", fa, 3);
    TEST_FP("FP RAW fb", fb, 7);
    TEST_FP("FP RAW fc", fc, 15);

    /* FP MUL->ADD RAW: FMUL.D produces, FADD.D consumes */
    double fp, fs;
    __asm__ volatile("fmul.d %[p], %[a], %[b]\n" /* fp = 3.0 * 4.0 = 12.0 */
                     "fadd.d %[s], %[p], %[c]\n" /* fs = 12.0 + 1.0 = 13.0 (RAW) */
                     : [p] "=&f"(fp), [s] "=&f"(fs)
                     : [a] "f"(3.0), [b] "f"(4.0), [c] "f"(1.0));
    TEST_FP("FP MUL-ADD product", fp, 12);
    TEST_FP("FP MUL-ADD sum", fs, 13);

    /* FP WAR: read src, then overwrite it */
    double fp_res;
    double fp_src = 5.0;
    __asm__ volatile("fadd.d %[res], %[src], %[src]\n" /* res = 5.0 + 5.0 = 10.0 */
                     "fmul.d %[src], %[z], %[z]\n"     /* WAR: overwrite src = 0*0 = 0 */
                     : [res] "=&f"(fp_res), [src] "+f"(fp_src)
                     : [z] "f"(0.0));
    TEST_FP("FP WAR result", fp_res, 10);
    TEST_FP("FP WAR src overwritten", fp_src, 0);

    /* FP WAW: multiple writes, only final value survives */
    double fw;
    __asm__ volatile("fadd.d %[w], %[v1], %[z]\n" /* 1.0 */
                     "fadd.d %[w], %[v2], %[z]\n" /* WAW: 2.0 */
                     "fadd.d %[w], %[v3], %[z]\n" /* WAW: 3.0 (final) */
                     /*
                      * Early-clobber is required: w is written by the first
                      * FADD.D while v2, v3 and z are still read afterwards.
                      * With plain "=f" the compiler could alias w with one of
                      * them and the sequence would compute a wrong value.
                      */
                     : [w] "=&f"(fw)
                     : [v1] "f"(1.0), [v2] "f"(2.0), [v3] "f"(3.0), [z] "f"(0.0));
    TEST_FP("FP WAW final", fw, 3);

    /* FP-INT crossover: INT produces value, FP consumes via convert */
    uint32_t int_val;
    double fp_from_int;
    __asm__ volatile("addi %[iv], zero, 7\n"            /* INT: iv = 7 */
                     "fcvt.d.w %[fv], %[iv]\n"          /* Convert to FP: 7.0 */
                     "fadd.d %[fv], %[fv], %[three]\n"  /* FP: 7.0 + 3.0 = 10.0 */
                     : [iv] "=&r"(int_val), [fv] "=&f"(fp_from_int)
                     : [three] "f"(3.0));
    TEST("FP-INT crossover int_val", int_val, 7);
    TEST_FP("FP-INT crossover fp result", fp_from_int, 10);

    /* FMADD.D dependent chain: accum = accum * 1.0 + addend */
    double fma_acc;
    __asm__ volatile("fmul.d %[a], %[z], %[z]\n"             /* accum = 0.0 */
                     "fmadd.d %[a], %[a], %[one], %[v2]\n"   /* 0*1+2 = 2.0 */
                     "fmadd.d %[a], %[a], %[one], %[v3]\n"   /* 2*1+3 = 5.0 */
                     "fmadd.d %[a], %[a], %[one], %[v4]\n"   /* 5*1+4 = 9.0 */
                     : [a] "=&f"(fma_acc)
                     : [z] "f"(0.0), [one] "f"(1.0), [v2] "f"(2.0), [v3] "f"(3.0), [v4] "f"(4.0));
    TEST_FP("FMADD chain", fma_acc, 9);

    /* 4 independent FADD.D ops - no data dependencies between them */
    double ia, ib, ic, id;
    __asm__ volatile("fadd.d %[a], %[v1], %[v2]\n" /* 1+2 = 3 */
                     "fadd.d %[b], %[v3], %[v4]\n" /* 3+4 = 7 */
                     "fadd.d %[c], %[v5], %[v1]\n" /* 5+1 = 6 */
                     "fadd.d %[d], %[v2], %[v3]\n" /* 2+3 = 5 */
                     : [a] "=&f"(ia), [b] "=&f"(ib), [c] "=&f"(ic), [d] "=&f"(id)
                     : [v1] "f"(1.0), [v2] "f"(2.0), [v3] "f"(3.0), [v4] "f"(4.0), [v5] "f"(5.0));
    TEST_FP("FP indep a", ia, 3);
    TEST_FP("FP indep b", ib, 7);
    TEST_FP("FP indep c", ic, 6);
    TEST_FP("FP indep d", id, 5);

    uart_printf(" done\n");
}
637+
542638
/* ========================================================================== */
543639
/* Main Entry Point */
544640
/* ========================================================================== */
@@ -560,6 +656,7 @@ int main(void)
560656
test_complex_deps();
561657
test_branch_loop();
562658
test_cdb_contention();
659+
test_fp_hazards();
563660

564661
uart_printf("\n------------------------------------------------------------\n");
565662
uart_printf(

0 commit comments

Comments
 (0)