Skip to content

Commit c227a4b

Browse files
Disable loop unrolling in the throughput benchmark loop by default
1 parent 351d398 commit c227a4b

File tree

3 files changed

+41
-0
lines changed

3 files changed

+41
-0
lines changed

libc/benchmarks/gpu/CMakeLists.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ add_subdirectory(timing)
22

33
add_custom_target(gpu-benchmark)
44

5+
# Controls whether the throughput benchmark loops may be unrolled by the
# compiler. When OFF (the default), add_benchmark appends
# -DLIBC_GPU_BENCHMARKS_DISABLE_UNROLL=1 to each benchmark's COMPILE_OPTIONS;
# the GPU timing headers check that macro and, when it is defined, place
# `#pragma clang loop unroll(disable)` in front of their throughput loops.
# Set to ON to leave the macro undefined.
option(LIBC_GPU_BENCHMARKS_ALLOW_UNROLL "Allow compiler loop unrolling in throughput loops" OFF)
6+
57
function(add_benchmark benchmark_name)
68
cmake_parse_arguments(
79
"BENCHMARK"
@@ -14,6 +16,12 @@ function(add_benchmark benchmark_name)
1416
if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS)
1517
message(FATAL_ERROR "target does not support clock")
1618
endif()
19+
20+
set(benchmark_extra_flags "")
21+
if(NOT LIBC_GPU_BENCHMARKS_ALLOW_UNROLL)
22+
list(APPEND benchmark_extra_flags "-DLIBC_GPU_BENCHMARKS_DISABLE_UNROLL=1")
23+
endif()
24+
1725
add_libc_hermetic(
1826
${benchmark_name}
1927
IS_GPU_BENCHMARK
@@ -26,6 +34,7 @@ function(add_benchmark benchmark_name)
2634
${BENCHMARK_UNPARSED_ARGUMENTS}
2735
COMPILE_OPTIONS
2836
-flto
37+
${benchmark_extra_flags}
2938
)
3039
get_fq_target_name(${benchmark_name} fq_target_name)
3140
set(fq_build_target_name ${fq_target_name}.__build__)

libc/benchmarks/gpu/timing/amdgpu/timing.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,10 @@ throughput_baseline(const cpp::array<T, N> &inputs) {
117117
asm("" ::"s"(start));
118118

119119
T result{};
120+
121+
#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
122+
#pragma clang loop unroll(disable)
123+
#endif
120124
for (auto input : inputs) {
121125
asm("" ::"v"(input));
122126
result = input;
@@ -146,6 +150,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
146150
asm("" ::"s"(start));
147151

148152
T result{};
153+
154+
#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
155+
#pragma clang loop unroll(disable)
156+
#endif
149157
for (auto input : inputs) {
150158
asm("" ::"v"(input));
151159
result = f(input);
@@ -174,6 +182,10 @@ static LIBC_INLINE uint64_t throughput_baseline(
174182
asm("" ::"s"(start));
175183

176184
T result{};
185+
186+
#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
187+
#pragma clang loop unroll(disable)
188+
#endif
177189
for (size_t i = 0; i < N; i++) {
178190
T x = inputs1[i];
179191
T y = inputs2[i];
@@ -206,6 +218,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
206218
asm("" ::"s"(start));
207219

208220
T result{};
221+
222+
#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
223+
#pragma clang loop unroll(disable)
224+
#endif
209225
for (size_t i = 0; i < N; i++) {
210226
T x = inputs1[i];
211227
T y = inputs2[i];

libc/benchmarks/gpu/timing/nvptx/timing.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,10 @@ throughput_baseline(const cpp::array<T, N> &inputs) {
106106
asm("" ::"llr"(start));
107107

108108
T result{};
109+
110+
#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
111+
#pragma clang loop unroll(disable)
112+
#endif
109113
for (auto input : inputs) {
110114
asm("" ::"r"(input));
111115
result = input;
@@ -135,6 +139,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
135139
asm("" ::"llr"(start));
136140

137141
T result{};
142+
143+
#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
144+
#pragma clang loop unroll(disable)
145+
#endif
138146
for (auto input : inputs) {
139147
asm("" ::"r"(input));
140148
result = f(input);
@@ -163,6 +171,10 @@ static LIBC_INLINE uint64_t throughput_baseline(
163171
asm("" ::"llr"(start));
164172

165173
T result{};
174+
175+
#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
176+
#pragma clang loop unroll(disable)
177+
#endif
166178
for (size_t i = 0; i < N; i++) {
167179
T x = inputs1[i];
168180
T y = inputs2[i];
@@ -195,6 +207,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
195207
asm("" ::"llr"(start));
196208

197209
T result{};
210+
211+
#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
212+
#pragma clang loop unroll(disable)
213+
#endif
198214
for (size_t i = 0; i < N; i++) {
199215
T x = inputs1[i];
200216
T y = inputs2[i];

0 commit comments

Comments
 (0)