Skip to content

Commit 6e60a3d

Browse files
Fix throughput overhead
1 parent aa6a03d commit 6e60a3d

File tree

6 files changed

+187
-36
lines changed

6 files changed

+187
-36
lines changed

libc/benchmarks/gpu/LibcGpuBenchmark.cpp

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -176,21 +176,14 @@ benchmark(const BenchmarkOptions &options,
176176
uint64_t min = UINT64_MAX;
177177
uint64_t max = 0;
178178

179-
uint64_t overhead = UINT64_MAX;
180-
int overhead_iterations = 10;
181-
for (int i = 0; i < overhead_iterations; i++)
182-
overhead = cpp::min(overhead, LIBC_NAMESPACE::overhead());
183-
184179
uint32_t call_index = 0;
185180

186181
for (int64_t time_budget = options.max_duration; time_budget >= 0;) {
187182
RefinableRuntimeEstimator sample_estimator;
188183

189184
const clock_t start = clock();
190185
while (sample_estimator.get_iterations() < iterations) {
191-
auto wrapper_intermediate = wrapper_func(call_index++);
192-
uint64_t current_result =
193-
wrapper_intermediate < overhead ? 0 : wrapper_intermediate - overhead;
186+
auto current_result = wrapper_func(call_index++);
194187
max = cpp::max(max, current_result);
195188
min = cpp::min(min, current_result);
196189
sample_estimator.update(current_result);

libc/benchmarks/gpu/LibcGpuBenchmark.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ template <typename T> class MathPerf {
248248
}
249249

250250
public:
251+
// Returns cycles-per-call (lower is better)
251252
template <size_t N = 1>
252253
static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp,
253254
uint32_t call_index) {
@@ -265,6 +266,7 @@ template <typename T> class MathPerf {
265266
return total_time / N;
266267
}
267268

269+
// Returns cycles-per-call (lower is better)
268270
template <size_t N = 1>
269271
static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
270272
int arg1_max_exp, int arg2_min_exp,

libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ add_header_library(
77
libc.src.__support.common
88
libc.src.__support.macros.config
99
libc.src.__support.macros.attributes
10-
libc.src.__support.CPP.type_traits
10+
libc.src.__support.CPP.algorithm
1111
libc.src.__support.CPP.array
12+
libc.src.__support.CPP.type_traits
1213
)

libc/benchmarks/gpu/timing/amdgpu/timing.h

Lines changed: 92 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
1111

1212
#include "hdr/stdint_proxy.h"
13+
#include "src/__support/CPP/algorithm.h"
1314
#include "src/__support/CPP/array.h"
1415
#include "src/__support/CPP/atomic.h"
1516
#include "src/__support/CPP/type_traits.h"
@@ -105,54 +106,131 @@ template <typename F, typename T1, typename T2>
105106
return stop - start;
106107
}
107108

108-
// Provides throughput benchmarking.
109-
template <typename F, typename T, size_t N>
110-
[[gnu::noinline]] static LIBC_INLINE uint64_t
111-
throughput(F f, const cpp::array<T, N> &inputs) {
109+
// Provides the *baseline* for throughput: measures loop and measurement costs
110+
// without calling the f function
111+
template <typename T, size_t N>
112+
static LIBC_INLINE uint64_t
113+
throughput_baseline(const cpp::array<T, N> &inputs) {
112114
asm("" ::"v"(&inputs));
113115

114116
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
115117
uint64_t start = gpu::processor_clock();
116118

117119
asm("" ::"s"(start));
118120

121+
T result{};
119122
for (auto input : inputs) {
120-
auto result = f(input);
123+
asm("" ::"v"(input));
124+
result = input;
125+
asm("" ::"v"(result));
126+
}
127+
128+
uint64_t stop = gpu::processor_clock();
129+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
130+
asm("" ::"s"(stop));
131+
132+
volatile auto output = result;
133+
(void)output;
134+
135+
return stop - start;
136+
}
137+
138+
// Provides throughput benchmarking
139+
template <typename F, typename T, size_t N>
140+
static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
141+
uint64_t baseline = UINT64_MAX;
142+
for (int i = 0; i < 5; ++i)
143+
baseline = cpp::min(baseline, throughput_baseline<T, N>(inputs));
144+
145+
asm("" ::"v"(&inputs));
146+
147+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
148+
uint64_t start = gpu::processor_clock();
121149

150+
asm("" ::"s"(start));
151+
152+
T result{};
153+
for (auto input : inputs) {
154+
asm("" ::"v"(input));
155+
result = f(input);
122156
asm("" ::"v"(result));
123157
}
124158

125159
uint64_t stop = gpu::processor_clock();
160+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
126161
asm("" ::"s"(stop));
162+
163+
volatile auto output = result;
164+
(void)output;
165+
166+
const uint64_t measured = stop - start;
167+
return measured > baseline ? (measured - baseline) : 0;
168+
}
169+
170+
// Provides the *baseline* for throughput with 2 arguments: measures loop and
171+
// measurement costs without calling the f function
172+
template <typename T, size_t N>
173+
static LIBC_INLINE uint64_t throughput_baseline(
174+
const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
175+
asm("" ::"v"(&inputs1), "v"(&inputs2));
176+
127177
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
178+
uint64_t start = gpu::processor_clock();
179+
180+
asm("" ::"s"(start));
181+
182+
T result{};
183+
for (size_t i = 0; i < N; i++) {
184+
T x = inputs1[i];
185+
T y = inputs2[i];
186+
asm("" ::"v"(x), "v"(y));
187+
result = x;
188+
asm("" ::"v"(result));
189+
}
190+
191+
uint64_t stop = gpu::processor_clock();
192+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
193+
asm("" ::"s"(stop));
194+
195+
volatile auto output = result;
196+
(void)output;
128197

129-
// Return the time elapsed.
130198
return stop - start;
131199
}
132200

133201
// Provides throughput benchmarking for 2 arguments (e.g. atan2())
134202
template <typename F, typename T, size_t N>
135-
[[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
136-
F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
203+
static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
204+
const cpp::array<T, N> &inputs2) {
205+
uint64_t baseline = UINT64_MAX;
206+
for (int i = 0; i < 5; ++i)
207+
baseline = cpp::min(baseline, throughput_baseline<T, N>(inputs1, inputs2));
208+
137209
asm("" ::"v"(&inputs1), "v"(&inputs2));
138210

139211
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
140212
uint64_t start = gpu::processor_clock();
141213

142214
asm("" ::"s"(start));
143215

144-
for (size_t i = 0; i < inputs1.size(); i++) {
145-
auto result = f(inputs1[i], inputs2[i]);
146-
216+
T result{};
217+
for (size_t i = 0; i < N; i++) {
218+
T x = inputs1[i];
219+
T y = inputs2[i];
220+
asm("" ::"v"(x), "v"(y));
221+
result = f(x, y);
147222
asm("" ::"v"(result));
148223
}
149224

150225
uint64_t stop = gpu::processor_clock();
151-
asm("" ::"s"(stop));
152226
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
227+
asm("" ::"s"(stop));
153228

154-
// Return the time elapsed.
155-
return stop - start;
229+
volatile auto output = result;
230+
(void)output;
231+
232+
const uint64_t measured = stop - start;
233+
return measured > baseline ? (measured - baseline) : 0;
156234
}
157235

158236
} // namespace LIBC_NAMESPACE_DECL

libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ add_header_library(
77
libc.src.__support.common
88
libc.src.__support.macros.config
99
libc.src.__support.macros.attributes
10-
libc.src.__support.CPP.type_traits
10+
libc.src.__support.CPP.algorithm
1111
libc.src.__support.CPP.array
12+
libc.src.__support.CPP.type_traits
1213
)

libc/benchmarks/gpu/timing/nvptx/timing.h

Lines changed: 88 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
1111

1212
#include "hdr/stdint_proxy.h"
13+
#include "src/__support/CPP/algorithm.h"
1314
#include "src/__support/CPP/array.h"
1415
#include "src/__support/CPP/atomic.h"
1516
#include "src/__support/CPP/type_traits.h"
@@ -95,18 +96,50 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
9596
return stop - start;
9697
}
9798

98-
// Provides throughput benchmarking.
99+
// Provides the *baseline* for throughput: measures loop and measurement costs
100+
// without calling the f function
101+
template <typename T, size_t N>
102+
static LIBC_INLINE uint64_t
103+
throughput_baseline(const cpp::array<T, N> &inputs) {
104+
asm("" ::"r"(&inputs));
105+
106+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
107+
uint64_t start = gpu::processor_clock();
108+
109+
asm("" ::"llr"(start));
110+
111+
T result{};
112+
for (auto input : inputs) {
113+
asm("" ::"r"(input));
114+
result = input;
115+
asm("" ::"r"(result));
116+
}
117+
118+
uint64_t stop = gpu::processor_clock();
119+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
120+
asm("" ::"r"(stop));
121+
122+
volatile auto output = result;
123+
(void)output;
124+
125+
return stop - start;
126+
}
127+
128+
// Provides throughput benchmarking
99129
template <typename F, typename T, size_t N>
100-
[[gnu::noinline]] static LIBC_INLINE uint64_t
101-
throughput(F f, const cpp::array<T, N> &inputs) {
130+
static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs) {
131+
uint64_t baseline = UINT64_MAX;
132+
for (int i = 0; i < 5; ++i)
133+
baseline = cpp::min(baseline, throughput_baseline<T, N>(inputs));
134+
102135
asm("" ::"r"(&inputs));
103136

104137
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
105138
uint64_t start = gpu::processor_clock();
106139

107140
asm("" ::"llr"(start));
108141

109-
uint64_t result;
142+
T result{};
110143
for (auto input : inputs) {
111144
asm("" ::"r"(input));
112145
result = f(input);
@@ -116,37 +149,80 @@ throughput(F f, const cpp::array<T, N> &inputs) {
116149
uint64_t stop = gpu::processor_clock();
117150
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
118151
asm("" ::"r"(stop));
152+
119153
volatile auto output = result;
154+
(void)output;
155+
156+
const uint64_t measured = stop - start;
157+
return measured > baseline ? (measured - baseline) : 0;
158+
}
159+
160+
// Provides the *baseline* for throughput with 2 arguments: measures loop and
161+
// measurement costs without calling the f function
162+
template <typename T, size_t N>
163+
static LIBC_INLINE uint64_t throughput_baseline(
164+
const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
165+
asm("" ::"r"(&inputs1), "r"(&inputs2));
166+
167+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
168+
uint64_t start = gpu::processor_clock();
169+
170+
asm("" ::"llr"(start));
171+
172+
T result{};
173+
for (size_t i = 0; i < N; i++) {
174+
T x = inputs1[i];
175+
T y = inputs2[i];
176+
asm("" ::"r"(x), "r"(y));
177+
result = x;
178+
asm("" ::"r"(result));
179+
}
180+
181+
uint64_t stop = gpu::processor_clock();
182+
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
183+
asm("" ::"r"(stop));
184+
185+
volatile auto output = result;
186+
(void)output;
120187

121-
// Return the time elapsed.
122188
return stop - start;
123189
}
124190

125191
// Provides throughput benchmarking for 2 arguments (e.g. atan2())
126192
template <typename F, typename T, size_t N>
127-
[[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
128-
F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
193+
static LIBC_INLINE uint64_t throughput(F f, const cpp::array<T, N> &inputs1,
194+
const cpp::array<T, N> &inputs2) {
195+
uint64_t baseline = UINT64_MAX;
196+
for (int i = 0; i < 5; ++i)
197+
baseline = cpp::min(baseline, throughput_baseline<T, N>(inputs1, inputs2));
198+
129199
asm("" ::"r"(&inputs1), "r"(&inputs2));
130200

131201
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
132202
uint64_t start = gpu::processor_clock();
133203

134204
asm("" ::"llr"(start));
135205

136-
uint64_t result;
137-
for (size_t i = 0; i < inputs1.size(); i++) {
138-
result = f(inputs1[i], inputs2[i]);
206+
T result{};
207+
for (size_t i = 0; i < N; i++) {
208+
T x = inputs1[i];
209+
T y = inputs2[i];
210+
asm("" ::"r"(x), "r"(y));
211+
result = f(x, y);
139212
asm("" ::"r"(result));
140213
}
141214

142215
uint64_t stop = gpu::processor_clock();
143216
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
144217
asm("" ::"r"(stop));
218+
145219
volatile auto output = result;
220+
(void)output;
146221

147-
// Return the time elapsed.
148-
return stop - start;
222+
const uint64_t measured = stop - start;
223+
return measured > baseline ? (measured - baseline) : 0;
149224
}
225+
150226
} // namespace LIBC_NAMESPACE_DECL
151227

152228
#endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX

0 commit comments

Comments
 (0)