Skip to content

Commit 5d41fea

Browse files
committed
Add fp_peak benchmark
1 parent 13e347c commit 5d41fea

File tree

3 files changed

+213
-0
lines changed

3 files changed

+213
-0
lines changed

meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ progs = [
7070
['btb_size_basic', false, true, true],
7171
['elimination', false, true, true],
7272
['find_branch_misses_pmu', false, false, true],
73+
['fp_peak', false, false, true],
7374
['ghr_size', false, true, true],
7475
['itlb_size', false, true, true],
7576
['phr_branch_bits_location', true, false, true],

src/fp_peak.cpp

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
#include "include/utils.h"
2+
#include <assert.h>
3+
#include <cstddef>
4+
#include <stdio.h>
5+
#include <stdlib.h>
6+
#include <unistd.h>
7+
#include <vector>
8+
9+
// A gadget is one generated benchmark routine; its only argument is the
// number of loop iterations to execute.
using gadget = void (*)(size_t);

// Table of gadgets, one entry per pattern. The symbol comes from the assembly
// emitted by fp_peak_gen.cpp, hence the C linkage.
extern "C" gadget fp_peak_gadgets[];
15+
16+
int main(int argc, char *argv[]) {
17+
// match fp_peak_gen.cpp
18+
int repeat = 1000;
19+
int loop_count = 1000;
20+
21+
#ifdef HOST_AARCH64
22+
int num_patterns = 6;
23+
char patterns[][20] = {
24+
"32-bit SP FMADD", "64-bit DP FMADD", "128-bit SP ASIMD",
25+
"128-bit DP ASIMD", "xxxx-bit SP SVE", "xxxx-bit DP SVE",
26+
};
27+
28+
int coef[] = {
29+
32 / 32 * 2, // 32-bit SP
30+
64 / 64 * 2, // 64-bit DP
31+
128 / 32 * 2, // 128-bit SP
32+
128 / 64 * 2, // 128-bit DP
33+
0, // ?-bit SP
34+
0, // ?-bit DP
35+
};
36+
#else
37+
int num_patterns = 4;
38+
const char *patterns[] = {
39+
"256-bit SP FMA",
40+
"256-bit DP FMA",
41+
"512-bit SP AVX512F",
42+
"512-bit DP AVX512F",
43+
};
44+
45+
int coef[] = {
46+
256 / 32 * 2, // 256-bit SP
47+
256 / 64 * 2, // 256-bit DP
48+
512 / 32 * 2, // 512-bit SP
49+
512 / 64 * 2, // 512-bit DP
50+
};
51+
#endif
52+
53+
bind_to_core();
54+
setup_perf_cycles();
55+
FILE *fp = fopen("fp_peak.csv", "w");
56+
assert(fp);
57+
58+
int gadget_index = 0;
59+
fprintf(fp, "pattern,min,avg,max\n");
60+
for (int pattern = 0; pattern < num_patterns; pattern++) {
61+
std::vector<double> history;
62+
int iterations = 100;
63+
history.reserve(iterations);
64+
65+
#ifdef HOST_AARCH64
66+
// read sve length in runtime
67+
if (pattern == 4) {
68+
uint64_t len = 0;
69+
asm __volatile__(".arch armv9-a+sve\ncntw %0" : "=r"(len));
70+
sprintf(patterns[pattern], "%ld-bit SP SVE", len * 32);
71+
coef[pattern] = len * 2;
72+
} else if (pattern == 5) {
73+
uint64_t len = 0;
74+
asm __volatile__(".arch armv9-a+sve\ncntd %0" : "=r"(len));
75+
sprintf(patterns[pattern], "%ld-bit DP SVE", len * 64);
76+
coef[pattern] = len * 2;
77+
}
78+
#endif
79+
80+
double sum = 0;
81+
// run several times
82+
for (int i = 0; i < iterations; i++) {
83+
uint64_t begin = perf_read_cycles();
84+
fp_peak_gadgets[gadget_index](loop_count);
85+
uint64_t elapsed = perf_read_cycles() - begin;
86+
87+
// skip warmup
88+
if (i >= 10) {
89+
double time =
90+
(double)coef[pattern] / ((double)elapsed / loop_count / repeat);
91+
history.push_back(time);
92+
sum += time;
93+
}
94+
}
95+
96+
double min = history[0];
97+
double max = history[0];
98+
for (size_t i = 0; i < history.size(); i++) {
99+
if (min > history[i]) {
100+
min = history[i];
101+
}
102+
if (max < history[i]) {
103+
max = history[i];
104+
}
105+
}
106+
107+
fprintf(fp, "%s,%.2lf,%.2lf,%.2lf\n", patterns[pattern], min,
108+
sum / history.size(), max);
109+
fflush(fp);
110+
111+
gadget_index++;
112+
}
113+
114+
printf("Results are written to fp_peak.csv\n");
115+
return 0;
116+
}

src/fp_peak_gen.cpp

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
#include "include/utils.h"
2+
#include <cstdio>
3+
4+
int main(int argc, char *argv[]) {
5+
FILE *fp = fopen(argv[1], "w");
6+
assert(fp);
7+
#ifdef HOST_AARCH64
8+
int num_patterns = 6;
9+
#else
10+
int num_patterns = 4;
11+
#endif
12+
int repeat = 1000;
13+
14+
// args: loop count
15+
fprintf(fp, ".text\n");
16+
for (int pattern = 0; pattern < num_patterns; pattern++) {
17+
// entry
18+
fprintf(fp, ".global fp_peak_%d\n", pattern);
19+
fprintf(fp, ".balign 32\n");
20+
fprintf(fp, "fp_peak_%d:\n", pattern);
21+
#if defined(HOST_AARCH64)
22+
fprintf(fp, "\tsub sp, sp, #0x100\n");
23+
fprintf(fp, "\tstp q8, q9, [sp, #0x0]\n");
24+
fprintf(fp, "\tstp q10, q11, [sp, #0x20]\n");
25+
fprintf(fp, "\tstp q12, q13, [sp, #0x40]\n");
26+
fprintf(fp, "\tstp q14, q15, [sp, #0x60]\n");
27+
28+
if (pattern == 4 || pattern == 5) {
29+
fprintf(fp, "\t.arch armv9-a+sve\n");
30+
fprintf(fp, "\tptrue p0.b\n");
31+
}
32+
33+
fprintf(fp, "\t1:\n");
34+
for (int i = 0; i < repeat; i++) {
35+
if (pattern == 0) {
36+
// single precision 32-bit fma using FMADD
37+
fprintf(fp, "\tfmadd s%d, s0, s1, s2\n", (i % 16) + 3);
38+
} else if (pattern == 1) {
39+
// double precision 64-bit fma using FMADD
40+
fprintf(fp, "\tfmadd d%d, d0, d1, d2\n", (i % 16) + 3);
41+
} else if (pattern == 2) {
42+
// single precision 128-bit fma using ASIMD
43+
fprintf(fp, "\tfmla v%d.4s, v0.4s, v1.4s\n", (i % 16) + 2);
44+
} else if (pattern == 3) {
45+
// double precision 128-bit fma using ASIMD
46+
fprintf(fp, "\tfmla v%d.2d, v0.2d, v1.2d\n", (i % 16) + 2);
47+
} else if (pattern == 4) {
48+
// single precision fma using SVE
49+
fprintf(fp, "\tfmla z%d.s, p0/m, z0.s, z1.s\n", (i % 16) + 2);
50+
} else if (pattern == 5) {
51+
// double precision fma using SVE
52+
fprintf(fp, "\tfmla z%d.d, p0/m, z0.d, z1.d\n", (i % 16) + 2);
53+
}
54+
}
55+
56+
fprintf(fp, "\tsubs x0, x0, #1\n");
57+
fprintf(fp, "\tbne 1b\n");
58+
59+
fprintf(fp, "\tldp q8, q9, [sp, #0x0]\n");
60+
fprintf(fp, "\tldp q10, q11, [sp, #0x20]\n");
61+
fprintf(fp, "\tldp q12, q13, [sp, #0x40]\n");
62+
fprintf(fp, "\tldp q14, q15, [sp, #0x60]\n");
63+
fprintf(fp, "\tadd sp, sp, #0x100\n");
64+
fprintf(fp, "\tret\n");
65+
#endif
66+
#if defined(HOST_AMD64)
67+
fprintf(fp, "\t1:\n");
68+
for (int i = 0; i < repeat; i++) {
69+
if (pattern == 0) {
70+
// single precision 256-bit fma using FMA
71+
fprintf(fp, "\tvfmadd213ps %%ymm0, %%ymm1, %%ymm%d\n", (i % 14) + 2);
72+
} else if (pattern == 1) {
73+
// double precision 256-bit fma using FMA
74+
fprintf(fp, "\tvfmadd213pd %%ymm0, %%ymm1, %%ymm%d\n", (i % 14) + 2);
75+
} else if (pattern == 2) {
76+
// single precision 512-bit fma using AVX512F
77+
fprintf(fp, "\tvfmadd213ps %%zmm0, %%zmm1, %%zmm%d\n", (i % 14) + 2);
78+
} else if (pattern == 3) {
79+
// double precision 512-bit fma using AVX512F
80+
fprintf(fp, "\tvfmadd213pd %%zmm0, %%zmm1, %%zmm%d\n", (i % 14) + 2);
81+
}
82+
}
83+
84+
fprintf(fp, "\tdec %%rdi\n");
85+
fprintf(fp, "\tjnz 1b\n");
86+
87+
fprintf(fp, "\tret\n");
88+
#endif
89+
}
90+
91+
define_gadgets_array(fp, "fp_peak_gadgets");
92+
for (int pattern = 0; pattern < num_patterns; pattern++) {
93+
add_gadget(fp, "fp_peak_%d", pattern);
94+
}
95+
return 0;
96+
}

0 commit comments

Comments
 (0)