Skip to content

Commit 4da8053

Browse files
Remove unroll toggle; make throughput loop non-unrolled unconditionally
1 parent: c227a4b · commit: 4da8053

File tree

3 files changed: 8 additions and 33 deletions.

libc/benchmarks/gpu/CMakeLists.txt

Lines changed: 0 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -2,8 +2,6 @@ add_subdirectory(timing)
22

33
add_custom_target(gpu-benchmark)
44

5-
option(LIBC_GPU_BENCHMARKS_ALLOW_UNROLL "Allow compiler loop unrolling in throughput loops" OFF)
6-
75
function(add_benchmark benchmark_name)
86
cmake_parse_arguments(
97
"BENCHMARK"
@@ -16,12 +14,6 @@ function(add_benchmark benchmark_name)
1614
if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS)
1715
message(FATAL_ERROR "target does not support clock")
1816
endif()
19-
20-
set(benchmark_extra_flags "")
21-
if(NOT LIBC_GPU_BENCHMARKS_ALLOW_UNROLL)
22-
list(APPEND benchmark_extra_flags "-DLIBC_GPU_BENCHMARKS_DISABLE_UNROLL=1")
23-
endif()
24-
2517
add_libc_hermetic(
2618
${benchmark_name}
2719
IS_GPU_BENCHMARK
@@ -34,7 +26,6 @@ function(add_benchmark benchmark_name)
3426
${BENCHMARK_UNPARSED_ARGUMENTS}
3527
COMPILE_OPTIONS
3628
-flto
37-
${benchmark_extra_flags}
3829
)
3930
get_fq_target_name(${benchmark_name} fq_target_name)
4031
set(fq_build_target_name ${fq_target_name}.__build__)

libc/benchmarks/gpu/timing/amdgpu/timing.h

Lines changed: 4 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -118,9 +118,7 @@ throughput_baseline(const cpp::array<T, N> &inputs) {
118118

119119
T result{};
120120

121-
#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
122-
#pragma clang loop unroll(disable)
123-
#endif
121+
#pragma clang loop unroll(disable)
124122
for (auto input : inputs) {
125123
asm("" ::"v"(input));
126124
result = input;
@@ -151,9 +149,7 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
151149

152150
T result{};
153151

154-
#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
155-
#pragma clang loop unroll(disable)
156-
#endif
152+
#pragma clang loop unroll(disable)
157153
for (auto input : inputs) {
158154
asm("" ::"v"(input));
159155
result = f(input);
@@ -183,9 +179,7 @@ static LIBC_INLINE uint64_t throughput_baseline(
183179

184180
T result{};
185181

186-
#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
187-
#pragma clang loop unroll(disable)
188-
#endif
182+
#pragma clang loop unroll(disable)
189183
for (size_t i = 0; i < N; i++) {
190184
T x = inputs1[i];
191185
T y = inputs2[i];
@@ -219,9 +213,7 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
219213

220214
T result{};
221215

222-
#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
223-
#pragma clang loop unroll(disable)
224-
#endif
216+
#pragma clang loop unroll(disable)
225217
for (size_t i = 0; i < N; i++) {
226218
T x = inputs1[i];
227219
T y = inputs2[i];

libc/benchmarks/gpu/timing/nvptx/timing.h

Lines changed: 4 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -107,9 +107,7 @@ throughput_baseline(const cpp::array<T, N> &inputs) {
107107

108108
T result{};
109109

110-
#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
111-
#pragma clang loop unroll(disable)
112-
#endif
110+
#pragma clang loop unroll(disable)
113111
for (auto input : inputs) {
114112
asm("" ::"r"(input));
115113
result = input;
@@ -140,9 +138,7 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
140138

141139
T result{};
142140

143-
#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
144-
#pragma clang loop unroll(disable)
145-
#endif
141+
#pragma clang loop unroll(disable)
146142
for (auto input : inputs) {
147143
asm("" ::"r"(input));
148144
result = f(input);
@@ -172,9 +168,7 @@ static LIBC_INLINE uint64_t throughput_baseline(
172168

173169
T result{};
174170

175-
#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
176-
#pragma clang loop unroll(disable)
177-
#endif
171+
#pragma clang loop unroll(disable)
178172
for (size_t i = 0; i < N; i++) {
179173
T x = inputs1[i];
180174
T y = inputs2[i];
@@ -208,9 +202,7 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
208202

209203
T result{};
210204

211-
#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
212-
#pragma clang loop unroll(disable)
213-
#endif
205+
#pragma clang loop unroll(disable)
214206
for (size_t i = 0; i < N; i++) {
215207
T x = inputs1[i];
216208
T y = inputs2[i];

0 commit comments

Comments
 (0)