diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h index b4a174f729817..8b92584b39230 100644 --- a/libc/benchmarks/gpu/timing/amdgpu/timing.h +++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h @@ -117,6 +117,8 @@ throughput_baseline(const cpp::array &inputs) { asm("" ::"s"(start)); T result{}; + +#pragma clang loop unroll(disable) for (auto input : inputs) { asm("" ::"v"(input)); result = input; @@ -146,6 +148,8 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array &inputs) { asm("" ::"s"(start)); T result{}; + +#pragma clang loop unroll(disable) for (auto input : inputs) { asm("" ::"v"(input)); result = f(input); @@ -174,6 +178,8 @@ static LIBC_INLINE uint64_t throughput_baseline( asm("" ::"s"(start)); T result{}; + +#pragma clang loop unroll(disable) for (size_t i = 0; i < N; i++) { T x = inputs1[i]; T y = inputs2[i]; @@ -206,6 +212,8 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array &inputs1, asm("" ::"s"(start)); T result{}; + +#pragma clang loop unroll(disable) for (size_t i = 0; i < N; i++) { T x = inputs1[i]; T y = inputs2[i]; diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h index 0c93a67129b8d..944d3732eae65 100644 --- a/libc/benchmarks/gpu/timing/nvptx/timing.h +++ b/libc/benchmarks/gpu/timing/nvptx/timing.h @@ -106,6 +106,8 @@ throughput_baseline(const cpp::array &inputs) { asm("" ::"llr"(start)); T result{}; + +#pragma clang loop unroll(disable) for (auto input : inputs) { asm("" ::"r"(input)); result = input; @@ -135,6 +137,8 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array &inputs) { asm("" ::"llr"(start)); T result{}; + +#pragma clang loop unroll(disable) for (auto input : inputs) { asm("" ::"r"(input)); result = f(input); @@ -163,6 +167,8 @@ static LIBC_INLINE uint64_t throughput_baseline( asm("" ::"llr"(start)); T result{}; + +#pragma clang loop unroll(disable) for (size_t i = 0; i < N; i++) { T x = inputs1[i]; T y = inputs2[i]; @@ -195,6 +201,8 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array &inputs1, asm("" ::"llr"(start)); T result{}; + +#pragma clang loop unroll(disable) for (size_t i = 0; i < N; i++) { T x = inputs1[i]; T y = inputs2[i];