Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions libc/benchmarks/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ add_subdirectory(timing)

add_custom_target(gpu-benchmark)

# Build-time switch for the GPU throughput benchmarks.
#   OFF (default): add_benchmark() compiles each benchmark with
#                  -DLIBC_GPU_BENCHMARKS_DISABLE_UNROLL=1.
#   ON:            that define is not added.
option(
  LIBC_GPU_BENCHMARKS_ALLOW_UNROLL
  "Allow compiler loop unrolling in throughput loops"
  OFF
)

function(add_benchmark benchmark_name)
cmake_parse_arguments(
"BENCHMARK"
Expand All @@ -14,6 +16,12 @@ function(add_benchmark benchmark_name)
if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS)
message(FATAL_ERROR "target does not support clock")
endif()

# Extra compile options for this benchmark. The list is empty when loop
# unrolling is allowed; otherwise it carries the define that asks the
# benchmark sources to disable unrolling.
if(LIBC_GPU_BENCHMARKS_ALLOW_UNROLL)
  set(benchmark_extra_flags "")
else()
  set(benchmark_extra_flags "-DLIBC_GPU_BENCHMARKS_DISABLE_UNROLL=1")
endif()

add_libc_hermetic(
${benchmark_name}
IS_GPU_BENCHMARK
Expand All @@ -26,6 +34,7 @@ function(add_benchmark benchmark_name)
${BENCHMARK_UNPARSED_ARGUMENTS}
COMPILE_OPTIONS
-flto
${benchmark_extra_flags}
)
get_fq_target_name(${benchmark_name} fq_target_name)
set(fq_build_target_name ${fq_target_name}.__build__)
Expand Down
16 changes: 16 additions & 0 deletions libc/benchmarks/gpu/timing/amdgpu/timing.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,10 @@ throughput_baseline(const cpp::array<T, N> &inputs) {
asm("" ::"s"(start));

T result{};

#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
#pragma clang loop unroll(disable)
#endif
for (auto input : inputs) {
asm("" ::"v"(input));
result = input;
Expand Down Expand Up @@ -146,6 +150,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
asm("" ::"s"(start));

T result{};

#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
#pragma clang loop unroll(disable)
#endif
for (auto input : inputs) {
asm("" ::"v"(input));
result = f(input);
Expand Down Expand Up @@ -174,6 +182,10 @@ static LIBC_INLINE uint64_t throughput_baseline(
asm("" ::"s"(start));

T result{};

#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
#pragma clang loop unroll(disable)
#endif
for (size_t i = 0; i < N; i++) {
T x = inputs1[i];
T y = inputs2[i];
Expand Down Expand Up @@ -206,6 +218,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
asm("" ::"s"(start));

T result{};

#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
#pragma clang loop unroll(disable)
#endif
for (size_t i = 0; i < N; i++) {
T x = inputs1[i];
T y = inputs2[i];
Expand Down
16 changes: 16 additions & 0 deletions libc/benchmarks/gpu/timing/nvptx/timing.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,10 @@ throughput_baseline(const cpp::array<T, N> &inputs) {
asm("" ::"llr"(start));

T result{};

#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
#pragma clang loop unroll(disable)
#endif
for (auto input : inputs) {
asm("" ::"r"(input));
result = input;
Expand Down Expand Up @@ -135,6 +139,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
asm("" ::"llr"(start));

T result{};

#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
#pragma clang loop unroll(disable)
#endif
for (auto input : inputs) {
asm("" ::"r"(input));
result = f(input);
Expand Down Expand Up @@ -163,6 +171,10 @@ static LIBC_INLINE uint64_t throughput_baseline(
asm("" ::"llr"(start));

T result{};

#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
#pragma clang loop unroll(disable)
#endif
for (size_t i = 0; i < N; i++) {
T x = inputs1[i];
T y = inputs2[i];
Expand Down Expand Up @@ -195,6 +207,10 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
asm("" ::"llr"(start));

T result{};

#if defined(LIBC_GPU_BENCHMARKS_DISABLE_UNROLL)
#pragma clang loop unroll(disable)
#endif
for (size_t i = 0; i < N; i++) {
T x = inputs1[i];
T y = inputs2[i];
Expand Down
Loading