Skip to content

Commit de59e7b

Browse files
committed
[libc] Fix GPU benchmarking
1 parent cfa918b commit de59e7b

File tree

6 files changed

+106
-55
lines changed

6 files changed

+106
-55
lines changed

libc/benchmarks/gpu/LibcGpuBenchmark.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77
#include "src/__support/GPU/utils.h"
88
#include "src/__support/fixedvector.h"
99
#include "src/__support/macros/config.h"
10+
#include "src/__support/time/gpu/time_utils.h"
1011
#include "src/stdio/printf.h"
1112
#include "src/stdlib/srand.h"
12-
#include "src/time/gpu/time_utils.h"
1313

1414
namespace LIBC_NAMESPACE_DECL {
1515
namespace benchmarks {

libc/benchmarks/gpu/src/math/atan2_benchmark.cpp

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,8 @@
33
#include "src/math/atan2.h"
44
#include "src/stdlib/rand.h"
55

6-
#ifdef NVPTX_MATH_FOUND
7-
#include "src/math/nvptx/declarations.h"
8-
#endif
9-
10-
#ifdef AMDGPU_MATH_FOUND
11-
#include "src/math/amdgpu/declarations.h"
6+
#if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND)
7+
#include "platform.h"
128
#endif
139

1410
#define BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, N) \
@@ -33,15 +29,15 @@ BENCH(double, Atan2TwoPow30, LIBC_NAMESPACE::atan2, 0, 30);
3329
BENCH(double, Atan2Large, LIBC_NAMESPACE::atan2, 30, 1000);
3430

3531
#ifdef NVPTX_MATH_FOUND
36-
BENCH(double, NvAtan2, LIBC_NAMESPACE::__nv_atan2, -1023, 1023);
37-
BENCH(double, NvAtan2TwoPi, LIBC_NAMESPACE::__nv_atan2, -10, 3);
38-
BENCH(double, NvAtan2TwoPow30, LIBC_NAMESPACE::__nv_atan2, 0, 30);
39-
BENCH(double, NvAtan2Large, LIBC_NAMESPACE::__nv_atan2, 30, 1000);
32+
BENCH(double, NvAtan2, __nv_atan2, -1023, 1023);
33+
BENCH(double, NvAtan2TwoPi, __nv_atan2, -10, 3);
34+
BENCH(double, NvAtan2TwoPow30, __nv_atan2, 0, 30);
35+
BENCH(double, NvAtan2Large, __nv_atan2, 30, 1000);
4036
#endif
4137

4238
#ifdef AMDGPU_MATH_FOUND
43-
BENCH(double, AmdAtan2, LIBC_NAMESPACE::__ocml_atan2_f64, -1023, 1023);
44-
BENCH(double, AmdAtan2TwoPi, LIBC_NAMESPACE::__ocml_atan2_f64, -10, 3);
45-
BENCH(double, AmdAtan2TwoPow30, LIBC_NAMESPACE::__ocml_atan2_f64, 0, 30);
46-
BENCH(double, AmdAtan2Large, LIBC_NAMESPACE::__ocml_atan2_f64, 30, 1000);
39+
BENCH(double, AmdAtan2, __ocml_atan2_f64, -1023, 1023);
40+
BENCH(double, AmdAtan2TwoPi, __ocml_atan2_f64, -10, 3);
41+
BENCH(double, AmdAtan2TwoPow30, __ocml_atan2_f64, 0, 30);
42+
BENCH(double, AmdAtan2Large, __ocml_atan2_f64, 30, 1000);
4743
#endif
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
//===-- GPU platform definitions for vendor math support ------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
#ifndef LLVM_LIBC_SRC_MATH_AMDGPU_PLATFORM_H
9+
#define LLVM_LIBC_SRC_MATH_AMDGPU_PLATFORM_H
10+
#include "src/__support/macros/attributes.h"
11+
#include "src/__support/macros/config.h"
12+
#include <stdint.h>
13+
14+
namespace LIBC_NAMESPACE_DECL {
15+
16+
#ifdef LIBC_TARGET_ARCH_IS_AMDGPU
17+
// The ROCm device library uses control globals to alter codegen for the
18+
// different targets. To avoid needing to link them in manually we simply
19+
// define them here.
20+
extern "C" {
21+
extern const LIBC_INLINE_VAR uint8_t __oclc_unsafe_math_opt = 0;
22+
extern const LIBC_INLINE_VAR uint8_t __oclc_daz_opt = 0;
23+
extern const LIBC_INLINE_VAR uint8_t __oclc_correctly_rounded_sqrt32 = 1;
24+
extern const LIBC_INLINE_VAR uint8_t __oclc_finite_only_opt = 0;
25+
extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9000;
26+
}
27+
28+
// These aliases cause clang to emit the control constants with ODR linkage.
29+
// This allows us to link against the symbols without preventing them from being
30+
// optimized out or causing symbol collisions.
31+
[[gnu::alias("__oclc_unsafe_math_opt")]] const uint8_t __oclc_unsafe_math_opt__;
32+
[[gnu::alias("__oclc_daz_opt")]] const uint8_t __oclc_daz_opt__;
33+
[[gnu::alias("__oclc_correctly_rounded_sqrt32")]] const uint8_t
34+
__oclc_correctly_rounded_sqrt32__;
35+
[[gnu::alias("__oclc_finite_only_opt")]] const uint8_t __oclc_finite_only_opt__;
36+
[[gnu::alias("__oclc_ISA_version")]] const uint32_t __oclc_ISA_version__;
37+
#endif
38+
} // namespace LIBC_NAMESPACE_DECL
39+
40+
// Forward declarations for the vendor math libraries.
41+
extern "C" {
42+
#ifdef AMDGPU_MATH_FOUND
43+
double __ocml_sin_f64(double);
44+
float __ocml_sin_f32(float);
45+
double __ocml_atan2_f64(double, double);
46+
float __ocml_atan2_f32(float, float);
47+
#endif
48+
49+
#ifdef NVPTX_MATH_FOUND
50+
double __nv_sin(double);
51+
float __nv_sinf(float);
52+
double __nv_atan2(double, double);
53+
float __nv_atan2f(float, float);
54+
#endif
55+
}
56+
57+
#endif // LLVM_LIBC_SRC_MATH_AMDGPU_PLATFORM_H

libc/benchmarks/gpu/src/math/sin_benchmark.cpp

Lines changed: 18 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,8 @@
88
#include "src/math/sinf.h"
99
#include "src/stdlib/rand.h"
1010

11-
#ifdef NVPTX_MATH_FOUND
12-
#include "src/math/nvptx/declarations.h"
13-
#endif
14-
15-
#ifdef AMDGPU_MATH_FOUND
16-
#include "src/math/amdgpu/declarations.h"
11+
#if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND)
12+
#include "platform.h"
1713
#endif
1814

1915
// BENCHMARK() expects a function with no parameters that returns a
@@ -42,17 +38,17 @@ BENCH(double, SinTwoPow30, LIBC_NAMESPACE::sin, 0, 30);
4238
BENCH(double, SinVeryLarge, LIBC_NAMESPACE::sin, 30, 1000);
4339

4440
#ifdef NVPTX_MATH_FOUND
45-
BENCH(double, NvSin, LIBC_NAMESPACE::__nv_sin, -1023, 1023);
46-
BENCH(double, NvSinTwoPi, LIBC_NAMESPACE::__nv_sin, -10, 3);
47-
BENCH(double, NvSinTwoPow30, LIBC_NAMESPACE::__nv_sin, 0, 30);
48-
BENCH(double, NvSinVeryLarge, LIBC_NAMESPACE::__nv_sin, 30, 1000);
41+
BENCH(double, NvSin, __nv_sin, -1023, 1023);
42+
BENCH(double, NvSinTwoPi, __nv_sin, -10, 3);
43+
BENCH(double, NvSinTwoPow30, __nv_sin, 0, 30);
44+
BENCH(double, NvSinVeryLarge, __nv_sin, 30, 1000);
4945
#endif
5046

5147
#ifdef AMDGPU_MATH_FOUND
52-
BENCH(double, AmdSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023);
53-
BENCH(double, AmdSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3);
54-
BENCH(double, AmdSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30);
55-
BENCH(double, AmdSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000);
48+
BENCH(double, AmdSin, __ocml_sin_f64, -1023, 1023);
49+
BENCH(double, AmdSinTwoPi, __ocml_sin_f64, -10, 3);
50+
BENCH(double, AmdSinTwoPow30, __ocml_sin_f64, 0, 30);
51+
BENCH(double, AmdSinVeryLarge, __ocml_sin_f64, 30, 1000);
5652
#endif
5753

5854
BENCH(float, Sinf, LIBC_NAMESPACE::sinf, -127, 128);
@@ -61,15 +57,15 @@ BENCH(float, SinfTwoPow30, LIBC_NAMESPACE::sinf, 0, 30);
6157
BENCH(float, SinfVeryLarge, LIBC_NAMESPACE::sinf, 30, 120);
6258

6359
#ifdef NVPTX_MATH_FOUND
64-
BENCH(float, NvSinf, LIBC_NAMESPACE::__nv_sinf, -127, 128);
65-
BENCH(float, NvSinfTwoPi, LIBC_NAMESPACE::__nv_sinf, -10, 3);
66-
BENCH(float, NvSinfTwoPow30, LIBC_NAMESPACE::__nv_sinf, 0, 30);
67-
BENCH(float, NvSinfVeryLarge, LIBC_NAMESPACE::__nv_sinf, 30, 120);
60+
BENCH(float, NvSinf, __nv_sinf, -127, 128);
61+
BENCH(float, NvSinfTwoPi, __nv_sinf, -10, 3);
62+
BENCH(float, NvSinfTwoPow30, __nv_sinf, 0, 30);
63+
BENCH(float, NvSinfVeryLarge, __nv_sinf, 30, 120);
6864
#endif
6965

7066
#ifdef AMDGPU_MATH_FOUND
71-
BENCH(float, AmdSinf, LIBC_NAMESPACE::__ocml_sin_f32, -127, 128);
72-
BENCH(float, AmdSinfTwoPi, LIBC_NAMESPACE::__ocml_sin_f32, -10, 3);
73-
BENCH(float, AmdSinfTwoPow30, LIBC_NAMESPACE::__ocml_sin_f32, 0, 30);
74-
BENCH(float, AmdSinfVeryLarge, LIBC_NAMESPACE::__ocml_sin_f32, 30, 120);
67+
BENCH(float, AmdSinf, __ocml_sin_f32, -127, 128);
68+
BENCH(float, AmdSinfTwoPi, __ocml_sin_f32, -10, 3);
69+
BENCH(float, AmdSinfTwoPow30, __ocml_sin_f32, 0, 30);
70+
BENCH(float, AmdSinfVeryLarge, __ocml_sin_f32, 30, 120);
7571
#endif

libc/benchmarks/gpu/timing/amdgpu/timing.h

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
1111

1212
#include "src/__support/CPP/array.h"
13+
#include "src/__support/CPP/atomic.h"
1314
#include "src/__support/CPP/type_traits.h"
1415
#include "src/__support/GPU/utils.h"
1516
#include "src/__support/common.h"
@@ -24,7 +25,7 @@ namespace LIBC_NAMESPACE_DECL {
2425
// allows us to subtract the constant-time overhead from the latency to
2526
// obtain a true result. This can vary with system load.
2627
[[gnu::noinline]] static LIBC_INLINE uint64_t overhead() {
27-
gpu::memory_fence();
28+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
2829
uint64_t start = gpu::processor_clock();
2930
uint32_t result = 0.0;
3031
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
@@ -44,13 +45,13 @@ template <typename F, typename T>
4445
T arg = storage;
4546

4647
// The AMDGPU architecture needs to wait on pending results.
47-
gpu::memory_fence();
48+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
4849
// Get the current timestamp from the clock.
4950
uint64_t start = gpu::processor_clock();
5051

5152
// This forces the compiler to load the input argument and run the clock
5253
// cycle counter before the profiling region.
53-
asm("" ::"s"(start));
54+
asm("" : "+v"(arg) : "s"(start));
5455

5556
// Run the function under test and return its value.
5657
auto result = f(arg);
@@ -71,7 +72,7 @@ template <typename F, typename T>
7172
// ordering.
7273
uint64_t stop = gpu::processor_clock();
7374
asm("" ::"s"(stop));
74-
gpu::memory_fence();
75+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
7576

7677
// Return the time elapsed.
7778
return stop - start;
@@ -84,7 +85,7 @@ template <typename F, typename T1, typename T2>
8485
T1 arg1 = storage1;
8586
T2 arg2 = storage2;
8687

87-
gpu::memory_fence();
88+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
8889
uint64_t start = gpu::processor_clock();
8990

9091
asm("" ::"s"(start));
@@ -100,7 +101,7 @@ template <typename F, typename T1, typename T2>
100101

101102
uint64_t stop = gpu::processor_clock();
102103
asm("" ::"s"(stop));
103-
gpu::memory_fence();
104+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
104105

105106
return stop - start;
106107
}
@@ -111,7 +112,7 @@ template <typename F, typename T, size_t N>
111112
throughput(F f, const cpp::array<T, N> &inputs) {
112113
asm("" ::"v"(&inputs));
113114

114-
gpu::memory_fence();
115+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
115116
uint64_t start = gpu::processor_clock();
116117

117118
asm("" ::"s"(start));
@@ -124,7 +125,7 @@ throughput(F f, const cpp::array<T, N> &inputs) {
124125

125126
uint64_t stop = gpu::processor_clock();
126127
asm("" ::"s"(stop));
127-
gpu::memory_fence();
128+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
128129

129130
// Return the time elapsed.
130131
return stop - start;
@@ -136,7 +137,7 @@ template <typename F, typename T, size_t N>
136137
F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
137138
asm("" ::"v"(&inputs1), "v"(&inputs2));
138139

139-
gpu::memory_fence();
140+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
140141
uint64_t start = gpu::processor_clock();
141142

142143
asm("" ::"s"(start));
@@ -149,7 +150,7 @@ template <typename F, typename T, size_t N>
149150

150151
uint64_t stop = gpu::processor_clock();
151152
asm("" ::"s"(stop));
152-
gpu::memory_fence();
153+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
153154

154155
// Return the time elapsed.
155156
return stop - start;

libc/benchmarks/gpu/timing/nvptx/timing.h

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
1111

1212
#include "src/__support/CPP/array.h"
13+
#include "src/__support/CPP/atomic.h"
1314
#include "src/__support/CPP/type_traits.h"
1415
#include "src/__support/GPU/utils.h"
1516
#include "src/__support/common.h"
@@ -46,7 +47,7 @@ template <typename F, typename T>
4647
T arg = storage;
4748

4849
// Get the current timestamp from the clock.
49-
gpu::memory_fence();
50+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
5051
uint64_t start = gpu::processor_clock();
5152

5253
// This forces the compiler to load the input argument and run the clock cycle
@@ -63,7 +64,7 @@ template <typename F, typename T>
6364
// Obtain the current timestamp after running the calculation and force
6465
// ordering.
6566
uint64_t stop = gpu::processor_clock();
66-
gpu::memory_fence();
67+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
6768
asm("" ::"r"(stop));
6869
volatile T output = result;
6970

@@ -78,7 +79,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
7879
T1 arg = storage;
7980
T2 arg2 = storage2;
8081

81-
gpu::memory_fence();
82+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
8283
uint64_t start = gpu::processor_clock();
8384

8485
asm("" ::"llr"(start));
@@ -88,7 +89,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
8889
asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
8990

9091
uint64_t stop = gpu::processor_clock();
91-
gpu::memory_fence();
92+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
9293
asm("" ::"r"(stop));
9394
volatile auto output = result;
9495

@@ -101,7 +102,7 @@ template <typename F, typename T, size_t N>
101102
throughput(F f, const cpp::array<T, N> &inputs) {
102103
asm("" ::"r"(&inputs));
103104

104-
gpu::memory_fence();
105+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
105106
uint64_t start = gpu::processor_clock();
106107

107108
asm("" ::"llr"(start));
@@ -114,7 +115,7 @@ throughput(F f, const cpp::array<T, N> &inputs) {
114115
}
115116

116117
uint64_t stop = gpu::processor_clock();
117-
gpu::memory_fence();
118+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
118119
asm("" ::"r"(stop));
119120
volatile auto output = result;
120121

@@ -128,7 +129,7 @@ template <typename F, typename T, size_t N>
128129
F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
129130
asm("" ::"r"(&inputs1), "r"(&inputs2));
130131

131-
gpu::memory_fence();
132+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
132133
uint64_t start = gpu::processor_clock();
133134

134135
asm("" ::"llr"(start));
@@ -140,7 +141,7 @@ template <typename F, typename T, size_t N>
140141
}
141142

142143
uint64_t stop = gpu::processor_clock();
143-
gpu::memory_fence();
144+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
144145
asm("" ::"r"(stop));
145146
volatile auto output = result;
146147

0 commit comments

Comments
 (0)