Commit 97991fd

Add: In-register FMA benchmarks for GPUs

1 parent b5d4610
2 files changed: +114 -0 lines

less_slow.cpp

Lines changed: 32 additions & 0 deletions
@@ -2133,9 +2133,36 @@ static void theoretic_tops_cuda( //
     state.counters["TOP"] = benchmark::Counter(tops_per_gpu * state.iterations(), benchmark::Counter::kIsRate);
 }
 
+extern __global__ void tops_f32f32_sm60fma_16x16x16_loop128_cuda_kernel();
+extern __global__ void tops_f64f64_sm60fma_16x16x16_loop128_cuda_kernel();
+extern __global__ void tops_i32i32_sm60fma_16x16x16_loop128_cuda_kernel();
+extern __global__ void tops_i64i64_sm60fma_16x16x16_loop128_cuda_kernel();
+
+BENCHMARK_CAPTURE(                                                                         //
+    theoretic_tops_cuda, f32f32_sm60fma, tops_f32f32_sm60fma_16x16x16_loop128_cuda_kernel, //
+    16, 16, 16, 60, 128, tensor_core_scale_t::single_k)
+    ->MinTime(10);
+BENCHMARK_CAPTURE(                                                                         //
+    theoretic_tops_cuda, f64f64_sm60fma, tops_f64f64_sm60fma_16x16x16_loop128_cuda_kernel, //
+    16, 16, 16, 60, 128, tensor_core_scale_t::single_k)
+    ->MinTime(10);
+BENCHMARK_CAPTURE(                                                                         //
+    theoretic_tops_cuda, i32i32_sm60fma, tops_i32i32_sm60fma_16x16x16_loop128_cuda_kernel, //
+    16, 16, 16, 60, 128, tensor_core_scale_t::single_k)
+    ->MinTime(10);
+BENCHMARK_CAPTURE(                                                                         //
+    theoretic_tops_cuda, i64i64_sm60fma, tops_i64i64_sm60fma_16x16x16_loop128_cuda_kernel, //
+    16, 16, 16, 60, 128, tensor_core_scale_t::single_k)
+    ->MinTime(10);
+
+extern __global__ void tops_f16f16_sm70fma_16x16x16_loop128_cuda_kernel();
 extern __global__ void tops_f16f16_sm70wmma_16x16x16_loop128_cuda_kernel();
 extern __global__ void tops_f16f32_sm70wmma_16x16x16_loop128_cuda_kernel();
 
+BENCHMARK_CAPTURE(                                                                         //
+    theoretic_tops_cuda, f16f16_sm70fma, tops_f16f16_sm70fma_16x16x16_loop128_cuda_kernel, //
+    16, 16, 16, 70, 128, tensor_core_scale_t::single_k)
+    ->MinTime(10);
 BENCHMARK_CAPTURE(                                                                           //
     theoretic_tops_cuda, f16f16_sm70wmma, tops_f16f16_sm70wmma_16x16x16_loop128_cuda_kernel, //
     16, 16, 16, 70, 128, tensor_core_scale_t::warp_k)
@@ -2162,11 +2189,16 @@ BENCHMARK_CAPTURE(
     8, 8, 128, 75, 128, tensor_core_scale_t::warp_k)
     ->MinTime(10);
 
+extern __global__ void tops_bf16bf16_sm80fma_16x16x16_loop128_cuda_kernel();
 extern __global__ void tops_bf16f32_sm80wmma_16x16x16_loop128_cuda_kernel();
 extern __global__ void tops_tf32f32_sm80wmma_16x16x8_loop128_cuda_kernel();
 extern __global__ void tops_f64f64_sm80wmma_8x8x4_loop128_cuda_kernel();
 extern __global__ void tops_b1i32and_sm80wmma_8x8x128_loop128_cuda_kernel();
 
+BENCHMARK_CAPTURE(                                                                             //
+    theoretic_tops_cuda, bf16bf16_sm80fma, tops_bf16bf16_sm80fma_16x16x16_loop128_cuda_kernel, //
+    16, 16, 16, 80, 128, tensor_core_scale_t::single_k)
+    ->MinTime(10);
 BENCHMARK_CAPTURE(                                                                             //
     theoretic_tops_cuda, bf16f32_sm80wmma, tops_bf16f32_sm80wmma_16x16x16_loop128_cuda_kernel, //
     16, 16, 16, 80, 128, tensor_core_scale_t::warp_k)
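
The new captures all route through the same `theoretic_tops_cuda` harness, of which only the "TOP" counter line is visible in the first hunk above. As a mental model, here is a minimal hedged sketch of what such a harness could look like; the signature, the launch shape (one block per SM, 128 threads), and the `tops_per_gpu` accounting are illustrative assumptions, not the repository's actual code:

    // Hypothetical harness: signature and launch shape are assumptions.
    // Each thread is assumed to perform `repetitions` products of m x n x k
    // tiles, i.e. 2 * m * n * k scalar operations per repetition.
    static void theoretic_tops_cuda_sketch(          //
        benchmark::State &state, void (*kernel)(),   //
        std::size_t m, std::size_t n, std::size_t k, //
        int min_sm_level, std::size_t repetitions) {

        cudaDeviceProp props;
        cudaGetDeviceProperties(&props, 0);
        if (props.major * 10 + props.minor < min_sm_level) {
            state.SkipWithError("GPU architecture is too old for this kernel");
            return;
        }

        // Saturate the device with one block per Streaming Multiprocessor
        dim3 grid(props.multiProcessorCount), block(128);
        for (auto _ : state) {
            // `cudaLaunchKernel` keeps this host file compilable without nvcc;
            // the kernels take no arguments, so the argument array is null
            cudaLaunchKernel((void const *)kernel, grid, block, nullptr, 0, nullptr);
            cudaDeviceSynchronize();
        }

        double tops_per_gpu = 2.0 * m * n * k * repetitions * grid.x * block.x;
        state.counters["TOP"] = benchmark::Counter(tops_per_gpu * state.iterations(), benchmark::Counter::kIsRate);
    }

The trailing `tensor_core_scale_t::single_k` argument presumably tells the harness that every thread computes its own m x n x k product, as opposed to the `warp_k` mode of the WMMA benchmarks, where one tile product is spread across a 32-thread warp and the operation count must be scaled accordingly.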

less_slow.cu

Lines changed: 82 additions & 0 deletions
@@ -135,6 +135,88 @@ void reverse_and_sort_with_cub(std::uint32_t *device_pointer, std::size_t array_
 
 #pragma region Numerics
 
+/**
+ * @brief On-device @b Fused-Multiply-Add operator, which, for most numeric
+ * types and GPUs, lowers to a single PTX instruction.
+ */
+struct fma_t {
+    template <typename scalar_type_>
+    inline __device__ scalar_type_ operator()(scalar_type_ a, scalar_type_ b, scalar_type_ c) const noexcept {
+        return c + a * b;
+    }
+};
+
+/**
+ * To benchmark matrix-multiplication throughput we could start with a
+ * traditional GEMM kernel, fetching data into shared memory and then
+ * running a tiled mat-mul. That, however, may end up benchmarking the L2
+ * throughput rather than the ALUs on the device. So we start with a simpler
+ * kernel that operates on small tiles of data already in registers.
+ */
+template <typename input_type_, typename output_type_, int matrix_side_, int repetitions_,
+          typename fma_operator_ = fma_t>
+__device__ void tops_fma_cuda_kernel() {
+
+    // In-register arrays, all allocated as local variables and intentionally
+    // left uninitialized: only the throughput matters, not the values
+    input_type_ a_tile[matrix_side_][matrix_side_], b_tile[matrix_side_][matrix_side_];
+    output_type_ c_tile[matrix_side_][matrix_side_];
+
+    // Repeatedly perform FMA-like operations
+    fma_operator_ fma_operator;
+    for (int r = 0; r < repetitions_; ++r) {
+        for (int i = 0; i < matrix_side_; ++i)
+            for (int j = 0; j < matrix_side_; ++j)
+                for (int k = 0; k < matrix_side_; ++k)
+                    c_tile[i][j] = fma_operator(a_tile[i][k], b_tile[k][j], c_tile[i][j]);
+    }
+
+    // Prevent dead-code elimination by writing one result out
+    if (threadIdx.x == 0 && blockIdx.x == 0) {
+        volatile output_type_ sink = c_tile[0][0]; // A dummy volatile store should be enough
+        (void)sink;
+    }
+}
+
+__global__ void tops_f32f32_sm60fma_16x16x16_loop128_cuda_kernel() { tops_fma_cuda_kernel<float, float, 16, 128>(); }
+
+__global__ void tops_f64f64_sm60fma_16x16x16_loop128_cuda_kernel() { tops_fma_cuda_kernel<double, double, 16, 128>(); }
+
+__global__ void tops_f16f16_sm70fma_16x16x16_loop128_cuda_kernel() {
+#if (__CUDA_ARCH__ >= 700)
+    struct f16_fma_t {
+        inline __device__ half operator()(half a, half b, half c) const noexcept { return __hfma(a, b, c); }
+    };
+    tops_fma_cuda_kernel<half, half, 16, 128, f16_fma_t>();
+#endif
+}
+
+__global__ void tops_bf16bf16_sm80fma_16x16x16_loop128_cuda_kernel() {
+#if (__CUDA_ARCH__ >= 800)
+    struct bf16_fma_t {
+        inline __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) const noexcept {
+            return __hfma(a, b, c);
+        }
+    };
+    tops_fma_cuda_kernel<__nv_bfloat16, __nv_bfloat16, 16, 128, bf16_fma_t>();
+#endif
+}
+
+/**
+ * Aside from floating-point numbers, similar operations are often performed
+ * on integer inputs. While graphics cards historically struggled with those,
+ * today they have outstanding performance and can be used in a variety of
+ * @b combinatorial problems, from encryption and Ethereum mining to Graph
+ * processing, Integer Programming, and Bioinformatics, or the more
+ * mainstream @b AI-Inference of quantized models.
+ */
+__global__ void tops_i32i32_sm60fma_16x16x16_loop128_cuda_kernel() {
+    tops_fma_cuda_kernel<std::int32_t, std::int32_t, 16, 128>();
+}
+
+__global__ void tops_i64i64_sm60fma_16x16x16_loop128_cuda_kernel() {
+    tops_fma_cuda_kernel<std::int64_t, std::int64_t, 16, 128>();
+}
+
 /**
  * Starting with Nvidia Volta GPUs, specialized "Tensor Cores" @b (TC) are
  * added for faster matrix multiplications. These Tensor Cores are much faster
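
With `matrix_side_ = 16` and `repetitions_ = 128`, each thread issues 16^3 * 128 = 524,288 FMAs, comfortably amortizing the kernel-launch overhead. For `float` and `double` the generic `fma_t` typically lowers to a single `fma.rn.f32` or `fma.rn.f64` PTX instruction, while a 64-bit integer multiply-add has no single-instruction hardware path and expands into several instructions, exactly the kind of gap these benchmarks are meant to expose. To eyeball one of these kernels outside of the Google Benchmark harness, a standalone timing sketch could look like the following; the launch shape is an assumption, and the snippet belongs in a `.cu` file compiled by `nvcc`:

    #include <cstdio>
    #include <cuda_runtime.h>

    extern __global__ void tops_f32f32_sm60fma_16x16x16_loop128_cuda_kernel();

    int main() {
        cudaDeviceProp props;
        cudaGetDeviceProperties(&props, 0);
        dim3 grid(props.multiProcessorCount), block(128); // Assumed launch shape

        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        cudaEventRecord(start);
        tops_f32f32_sm60fma_16x16x16_loop128_cuda_kernel<<<grid, block>>>();
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);

        float milliseconds = 0;
        cudaEventElapsedTime(&milliseconds, start, stop);

        // 2 ops per FMA, 16^3 FMAs per repetition, 128 repetitions per thread
        double ops = 2.0 * 16 * 16 * 16 * 128 * (double)grid.x * block.x;
        std::printf("~%.2f TFLOPS\n", ops / (milliseconds * 1e-3) / 1e12);
        return 0;
    }

One caveat: `nvcc` can keep the 16x16 tiles entirely in registers only while register pressure allows; past the per-thread register limit the tiles spill into local memory, at which point the numbers start measuring memory traffic rather than raw ALU throughput.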
