Skip to content

Commit 85f78c3

Browse files
committed
Fix: Counting TOPS across TC generations
This commit adds `tensor_core_scale_t`, that affects the way TOPS are computed.
1 parent 28e639e commit 85f78c3

File tree

2 files changed

+175
-69
lines changed

2 files changed

+175
-69
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -434,7 +434,7 @@ if(USE_NVIDIA_CCCL)
434434
# target_link_libraries(less_slow PRIVATE nvidia::cutlass::cutlass)
435435

436436
# List the PTX files you want to copy
437-
set(PTX_FILES less_slow_sm70.ptx less_slow_sm90a.ptx)
437+
set(PTX_FILES less_slow_sm70.ptx less_slow_sm80.ptx less_slow_sm90a.ptx)
438438

439439
# Loop over each PTX file and add a custom command to copy it
440440
foreach(PTX ${PTX_FILES})

less_slow.cpp

Lines changed: 174 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -2046,15 +2046,67 @@ BENCHMARK_CAPTURE(theoretic_tops, i7_amx_avx512, tops_i7_amx_avx512fma_asm_kerne
20462046
#if _LESS_SLOW_WITH_CUDA
20472047
#include <cuda.h>
20482048

2049+
/**
2050+
* Different generations of matrix multiplication instructions on GPUs use
2051+
* different synchronization/cooperation scales across generations.
2052+
*/
2053+
enum class tensor_core_scale_t : int {
2054+
unknown_k = 0,
2055+
2056+
/**
2057+
* Before Volta, individual CUDA cores would compute matrix multiplication
2058+
* as many individual scalar FMA operations over tiles in shared cache.
2059+
* Applies to SM levels @b <7.0.
2060+
*/
2061+
single_k = 1,
2062+
/**
2063+
* On Volta and newer, 8 consecutive threads compute the MMA together.
2064+
* Applies to SM level @b ≥7.0.
2065+
*/
2066+
quadpair_k = 8,
2067+
/**
2068+
* On Ampere and newer, 32 consecutive threads in a single warp compute
2069+
* WMMA together. Applies to SM level @b ≥8.0.
2070+
*/
2071+
warp_k = 32,
2072+
/**
2073+
* On Hopper and newer, 128 consecutive threads in 4 consecutive warps
2074+
* compute larger Warp Group MMA together. Applies to SM level @b ≥9.0.
2075+
*/
2076+
warpgroup_k = 128,
2077+
2078+
};
2079+
2080+
tensor_core_scale_t tensor_core_scale(int sm_capability) {
2081+
if (sm_capability >= 90) return tensor_core_scale_t::warpgroup_k;
2082+
if (sm_capability >= 80) return tensor_core_scale_t::warp_k;
2083+
if (sm_capability >= 70) return tensor_core_scale_t::quadpair_k;
2084+
return tensor_core_scale_t::single_k;
2085+
}
2086+
2087+
/**
2088+
* @brief Runs the benchmark loop for precompiled CUDA C++ kernels using
2089+
* the high-level @b runtime API. It counts TOPS (Tensor Operations Per
2090+
* Second) as the number of scalar multiplications in $A * B$, ignoring
2091+
* the $D$ additive part of $A * B + D$.
2092+
*
2093+
* @param m,n,k Dimensions of matrices multiplied by one instruction.
2094+
* @param required_capability GPU's Streaming Multiprocessor generation needed.
2095+
* @param scale Number of threads in each block, computing MMA collectively.
2096+
*/
20492097
static void theoretic_tops_cuda( //
20502098
bm::State &state, __global__ void (*kernel)(), //
20512099
std::size_t m, std::size_t n, std::size_t k, //
2052-
std::size_t repetitions, int required_capability) {
2100+
int required_capability, //
2101+
std::size_t repetitions, //
2102+
tensor_core_scale_t scale = tensor_core_scale_t::unknown_k) {
20532103

2054-
cudaDeviceProp prop;
2055-
cudaGetDeviceProperties(&prop, 0);
2056-
int const blocks = prop.multiProcessorCount;
2057-
int const threads_per_block = prop.warpSize;
2104+
cudaDeviceProp properties;
2105+
cudaGetDeviceProperties(&properties, 0);
2106+
int const blocks = properties.multiProcessorCount;
2107+
// On Hopper and newer, 4 warps need to synchronize WGMMAs.
2108+
int const threads_per_block = properties.warpSize * 4;
2109+
if (scale == tensor_core_scale_t::unknown_k) scale = tensor_core_scale(required_capability);
20582110

20592111
for (auto _ : state) {
20602112
// A typical CUDA kernel invocation would look like this:
@@ -2075,68 +2127,103 @@ static void theoretic_tops_cuda( //
20752127
cudaDeviceSynchronize();
20762128
}
20772129

2130+
std::size_t const threads = static_cast<std::size_t>(blocks * threads_per_block);
20782131
std::size_t const tops_per_cycle = m * n * k * 2 * repetitions;
2079-
std::size_t const tops_per_gpu = tops_per_cycle * blocks; //? Warps compute each tile product collectively!
2132+
std::size_t const tops_per_gpu = tops_per_cycle * threads / static_cast<std::size_t>(scale);
20802133
state.counters["TOP"] = benchmark::Counter(tops_per_gpu * state.iterations(), benchmark::Counter::kIsRate);
20812134
}
20822135

2083-
extern __global__ void tops_f16f16_sm70tc_16x16x16_loop128_cuda_kernel();
2084-
extern __global__ void tops_f16f32_sm70tc_16x16x16_loop128_cuda_kernel();
2136+
extern __global__ void tops_f16f16_sm70wmma_16x16x16_loop128_cuda_kernel();
2137+
extern __global__ void tops_f16f32_sm70wmma_16x16x16_loop128_cuda_kernel();
20852138

2086-
extern __global__ void tops_u8i32_sm75tc_16x16x16_loop128_cuda_kernel();
2087-
extern __global__ void tops_u4i32_sm75tc_8x8x32_loop128_cuda_kernel();
2088-
extern __global__ void tops_b1i32xor_sm75tc_8x8x128_loop128_cuda_kernel();
2139+
BENCHMARK_CAPTURE( //
2140+
theoretic_tops_cuda, f16f16_sm70wmma, tops_f16f16_sm70wmma_16x16x16_loop128_cuda_kernel, //
2141+
16, 16, 16, 70, 128, tensor_core_scale_t::warp_k)
2142+
->MinTime(10);
2143+
BENCHMARK_CAPTURE( //
2144+
theoretic_tops_cuda, f16f32_sm70wmma, tops_f16f32_sm70wmma_16x16x16_loop128_cuda_kernel, //
2145+
16, 16, 16, 70, 128, tensor_core_scale_t::warp_k)
2146+
->MinTime(10);
20892147

2090-
extern __global__ void tops_bf16f32_sm80tc_16x16x16_loop128_cuda_kernel();
2091-
extern __global__ void tops_tf32f32_sm80tc_16x16x8_loop128_cuda_kernel();
2092-
extern __global__ void tops_f64f64_sm80tc_8x8x4_loop128_cuda_kernel();
2093-
extern __global__ void tops_b1i32and_sm80tc_8x8x128_loop128_cuda_kernel();
2148+
extern __global__ void tops_u8i32_sm75wmma_16x16x16_loop128_cuda_kernel();
2149+
extern __global__ void tops_u4i32_sm75wmma_8x8x32_loop128_cuda_kernel();
2150+
extern __global__ void tops_b1i32xor_sm75wmma_8x8x128_loop128_cuda_kernel();
20942151

2095-
BENCHMARK_CAPTURE( //
2096-
theoretic_tops_cuda, f16f16_sm70tc, tops_f16f16_sm70tc_16x16x16_loop128_cuda_kernel, //
2097-
16, 16, 16, 128, 70)
2152+
BENCHMARK_CAPTURE( //
2153+
theoretic_tops_cuda, u8i32_sm75wmma, tops_u8i32_sm75wmma_16x16x16_loop128_cuda_kernel, //
2154+
16, 16, 16, 75, 128, tensor_core_scale_t::warp_k)
20982155
->MinTime(10);
20992156
BENCHMARK_CAPTURE( //
2100-
theoretic_tops_cuda, f16f32_sm70tc, tops_f16f32_sm70tc_16x16x16_loop128_cuda_kernel, //
2101-
16, 16, 16, 128, 70)
2102-
->MinTime(10);
2103-
BENCHMARK_CAPTURE( //
2104-
theoretic_tops_cuda, u8i32_sm75tc, tops_u8i32_sm75tc_16x16x16_loop128_cuda_kernel, //
2105-
16, 16, 16, 128, 75)
2157+
theoretic_tops_cuda, u4i32_sm75wmma, tops_u4i32_sm75wmma_8x8x32_loop128_cuda_kernel, //
2158+
8, 8, 32, 75, 128, tensor_core_scale_t::warp_k)
21062159
->MinTime(10);
2107-
BENCHMARK_CAPTURE( //
2108-
theoretic_tops_cuda, u4i32_sm75tc, tops_u4i32_sm75tc_8x8x32_loop128_cuda_kernel, //
2109-
8, 8, 32, 128, 75)
2160+
BENCHMARK_CAPTURE( //
2161+
theoretic_tops_cuda, b1i32xor_sm75wmma, tops_b1i32xor_sm75wmma_8x8x128_loop128_cuda_kernel, //
2162+
8, 8, 128, 75, 128, tensor_core_scale_t::warp_k)
21102163
->MinTime(10);
2111-
BENCHMARK_CAPTURE( //
2112-
theoretic_tops_cuda, b1i32xor_sm75tc, tops_b1i32xor_sm75tc_8x8x128_loop128_cuda_kernel, //
2113-
8, 8, 128, 128, 75)
2164+
2165+
extern __global__ void tops_bf16f32_sm80wmma_16x16x16_loop128_cuda_kernel();
2166+
extern __global__ void tops_tf32f32_sm80wmma_16x16x8_loop128_cuda_kernel();
2167+
extern __global__ void tops_f64f64_sm80wmma_8x8x4_loop128_cuda_kernel();
2168+
extern __global__ void tops_b1i32and_sm80wmma_8x8x128_loop128_cuda_kernel();
2169+
2170+
BENCHMARK_CAPTURE( //
2171+
theoretic_tops_cuda, bf16f32_sm80wmma, tops_bf16f32_sm80wmma_16x16x16_loop128_cuda_kernel, //
2172+
16, 16, 16, 80, 128, tensor_core_scale_t::warp_k)
21142173
->MinTime(10);
2115-
BENCHMARK_CAPTURE( //
2116-
theoretic_tops_cuda, bf16f32_sm80tc, tops_bf16f32_sm80tc_16x16x16_loop128_cuda_kernel, //
2117-
16, 16, 16, 128, 80)
2174+
BENCHMARK_CAPTURE( //
2175+
theoretic_tops_cuda, tf32f32_sm80wmma, tops_tf32f32_sm80wmma_16x16x8_loop128_cuda_kernel, //
2176+
16, 16, 8, 80, 128, tensor_core_scale_t::warp_k)
21182177
->MinTime(10);
21192178
BENCHMARK_CAPTURE( //
2120-
theoretic_tops_cuda, tf32f32_sm80tc, tops_tf32f32_sm80tc_16x16x8_loop128_cuda_kernel, //
2121-
16, 16, 8, 128, 80)
2179+
theoretic_tops_cuda, f64f64_sm80wmma, tops_f64f64_sm80wmma_8x8x4_loop128_cuda_kernel, //
2180+
8, 8, 4, 80, 128, tensor_core_scale_t::warp_k)
21222181
->MinTime(10);
2123-
BENCHMARK_CAPTURE( //
2124-
theoretic_tops_cuda, f64f64_sm80tc, tops_f64f64_sm80tc_8x8x4_loop128_cuda_kernel, //
2125-
8, 8, 4, 128, 80)
2182+
BENCHMARK_CAPTURE( //
2183+
theoretic_tops_cuda, b1i32and_sm80wmma, tops_b1i32and_sm80wmma_8x8x128_loop128_cuda_kernel, //
2184+
8, 8, 128, 80, 128, tensor_core_scale_t::warp_k)
21262185
->MinTime(10);
2127-
BENCHMARK_CAPTURE( //
2128-
theoretic_tops_cuda, b1i32and_sm80tc, tops_b1i32and_sm80tc_8x8x128_loop128_cuda_kernel, //
2129-
8, 8, 128, 128, 80)
2186+
2187+
extern __global__ void tops_f16f32_sm90wgmma_64x256x16_loop128_cuda_kernel();
2188+
extern __global__ void tops_bf16f32_sm90wgmma_64x256x16_loop128_cuda_kernel();
2189+
extern __global__ void tops_tf32f32_sm90wgmma_64x256x16_loop128_cuda_kernel();
2190+
2191+
BENCHMARK_CAPTURE( //
2192+
theoretic_tops_cuda, f16f32_sm90wgmma, tops_f16f32_sm90wgmma_64x256x16_loop128_cuda_kernel, //
2193+
64, 256, 16, 90, 128, tensor_core_scale_t::warpgroup_k)
2194+
->MinTime(10);
2195+
BENCHMARK_CAPTURE( //
2196+
theoretic_tops_cuda, bf16f32_sm90wgmma, tops_bf16f32_sm90wgmma_64x256x16_loop128_cuda_kernel, //
2197+
64, 256, 16, 90, 128, tensor_core_scale_t::warpgroup_k)
2198+
->MinTime(10);
2199+
BENCHMARK_CAPTURE( //
2200+
theoretic_tops_cuda, tf32f32_sm90wgmma, tops_tf32f32_sm90wgmma_64x256x16_loop128_cuda_kernel, //
2201+
64, 256, 16, 90, 128, tensor_core_scale_t::warpgroup_k)
21302202
->MinTime(10);
21312203

2132-
#include <filesystem>
2204+
#include <filesystem> // `std::filesystem::absolute` to locate PTX IR file
21332205

2206+
/**
2207+
* @brief Runs the benchmark loop for precompiled CUDA C++ kernels using
2208+
* the low-level @b driver API. It counts TOPS (Tensor Operations Per
2209+
* Second) as the number of scalar multiplications in $A * B$, ignoring
2210+
* the $D$ additive part of $A * B + D$.
2211+
*
2212+
* @param m,n,k Dimensions of matrices multiplied by one instruction.
2213+
* @param required_capability GPU's Streaming Multiprocessor generation needed.
2214+
*
2215+
* @param file_name The name of the @b `.ptx` file in current directory.
2216+
* @param kernel_name The name of a specific @b `.visible` entry function.
2217+
* @param scale Number of threads in each block, computing MMA collectively.
2218+
*/
21342219
static void theoretic_tops_ptx( //
21352220
bm::State &state, //
21362221
std::string file_name, //
21372222
std::string kernel_name, //
21382223
std::size_t m, std::size_t n, std::size_t k, //
2139-
std::size_t repetitions, int required_capability) {
2224+
int required_capability, //
2225+
std::size_t repetitions, //
2226+
tensor_core_scale_t scale = tensor_core_scale_t::unknown_k) {
21402227

21412228
// Resolve the absolute path to the PTX file
21422229
std::string ptx_file = file_name;
@@ -2233,17 +2320,23 @@ static void theoretic_tops_ptx( //
22332320
return;
22342321
}
22352322

2236-
// Set kernel launch configuration
2323+
// Set kernel launch configuration, same way as in `theoretic_tops_cuda`.
22372324
dim3 grid_dim(num_sms);
2238-
dim3 block_dim(warp_size);
2325+
dim3 block_dim(warp_size * 4);
22392326
void *kernel_args[] = {nullptr};
22402327

2328+
// We need shared memory for matrix multiplications on Hopper:
2329+
// - on V100 we have 96 KB per SM
2330+
// - on H100 we have 228 KB per SM
2331+
unsigned int shared_memory = 0; // 32 * 1024;
2332+
if (scale == tensor_core_scale_t::unknown_k) scale = tensor_core_scale(required_capability);
2333+
22412334
for (auto _ : state) {
22422335
result = cuLaunchKernel( //
22432336
kernel, //
22442337
grid_dim.x, grid_dim.y, grid_dim.z, //
22452338
block_dim.x, block_dim.y, block_dim.z, //
2246-
0, nullptr, kernel_args, nullptr);
2339+
shared_memory, nullptr, kernel_args, nullptr);
22472340
if (result != CUDA_SUCCESS) {
22482341
state.SkipWithError("Failed to launch the kernel: " + last_error_string());
22492342
break;
@@ -2255,55 +2348,68 @@ static void theoretic_tops_ptx( //
22552348
}
22562349
}
22572350

2351+
std::size_t const threads = static_cast<std::size_t>(grid_dim.x * block_dim.x);
22582352
std::size_t const tops_per_cycle = m * n * k * 2 * repetitions;
2259-
std::size_t const tops_per_gpu = tops_per_cycle * num_sms; //? Warps compute each tile product collectively!
2353+
std::size_t const tops_per_gpu = tops_per_cycle * threads / static_cast<std::size_t>(scale);
22602354
state.counters["TOP"] = benchmark::Counter(tops_per_gpu * state.iterations(), benchmark::Counter::kIsRate);
22612355

22622356
// Clean up
22632357
cuModuleUnload(module_);
22642358
cuCtxDestroy(context);
22652359
}
22662360

2267-
BENCHMARK_CAPTURE( //
2268-
theoretic_tops_ptx, f16f16_sm70tc, //
2269-
"less_slow_sm70.ptx", "tops_f16f16_sm70tc_16x16x16_loop128_ptx_kernel", //
2270-
16, 16, 16, 128, 70)
2361+
BENCHMARK_CAPTURE( //
2362+
theoretic_tops_ptx, f16f16_sm70mma, //
2363+
"less_slow_sm70.ptx", "tops_f16f16_sm70mma_8x8x4_loop128_ptx_kernel", //
2364+
16, 16, 16, 70, 128, tensor_core_scale_t::quadpair_k)
22712365
->MinTime(10);
22722366

2273-
BENCHMARK_CAPTURE( //
2274-
theoretic_tops_ptx, f16f16_sm90tc, //
2275-
"less_slow_sm90a.ptx", "tops_f16f16_sm90tc_16x16x16_loop128_ptx_kernel", //
2276-
16, 16, 16, 128, 90)
2367+
BENCHMARK_CAPTURE( //
2368+
theoretic_tops_ptx, f16f32_sm70mma, //
2369+
"less_slow_sm70.ptx", "tops_f16f32_sm70mma_8x8x4_loop128_ptx_kernel", //
2370+
16, 16, 16, 70, 128, tensor_core_scale_t::quadpair_k)
2371+
->MinTime(10);
2372+
2373+
BENCHMARK_CAPTURE( //
2374+
theoretic_tops_ptx, f16f16_sm80wmma, //
2375+
"less_slow_sm80.ptx", "tops_f16f16_sm80wmma_16x16x16_loop128_ptx_kernel", //
2376+
16, 16, 16, 80, 128, tensor_core_scale_t::warp_k)
2377+
->MinTime(10);
2378+
2379+
BENCHMARK_CAPTURE( //
2380+
theoretic_tops_ptx, f16f32_sm80wmma, //
2381+
"less_slow_sm80.ptx", "tops_f16f32_sm80wmma_16x16x16_loop128_ptx_kernel", //
2382+
16, 16, 16, 80, 128, tensor_core_scale_t::warp_k)
22772383
->MinTime(10);
22782384

22792385
BENCHMARK_CAPTURE( //
2280-
theoretic_tops_ptx, f64f64_sm90tc, //
2281-
"less_slow_sm90a.ptx", "tops_f64f64_sm90tc_8x8x4_loop128_ptx_kernel", //
2282-
8, 8, 4, 128, 90)
2386+
theoretic_tops_ptx, f64f64_sm80mma, //
2387+
"less_slow_sm80.ptx", "tops_f64f64_sm80mma_8x8x4_loop128_ptx_kernel", //
2388+
8, 8, 4, 80, 128, tensor_core_scale_t::quadpair_k)
22832389
->MinTime(10);
22842390

2285-
BENCHMARK_CAPTURE( //
2286-
theoretic_tops_ptx, tf32f32_sm90tc, //
2287-
"less_slow_sm90a.ptx", "tops_tf32f32_sm90tc_16x16x8_loop128_ptx_kernel", //
2288-
16, 16, 8, 128, 90)
2391+
BENCHMARK_CAPTURE( //
2392+
theoretic_tops_ptx, tf32f32_sm80wmma, //
2393+
"less_slow_sm80.ptx", "tops_tf32f32_sm80wmma_16x16x8_loop128_ptx_kernel", //
2394+
16, 16, 8, 80, 128, tensor_core_scale_t::warp_k)
22892395
->MinTime(10);
22902396

22912397
BENCHMARK_CAPTURE( //
2292-
theoretic_tops_ptx, tf32f32_sm90tc_wgmma_smallest, //
2398+
theoretic_tops_ptx, tf32f32_sm90wgmma_smallest, //
22932399
"less_slow_sm90a.ptx", "tops_tf32f32_sm90tc_m64n16k8_loop128_ptx_kernel", //
2294-
64, 16, 8, 128, 90)
2400+
64, 16, 8, 90, 128, tensor_core_scale_t::warpgroup_k)
22952401
->MinTime(10);
22962402

22972403
BENCHMARK_CAPTURE( //
2298-
theoretic_tops_ptx, tf32f32_sm90tc_wgmma_largest, //
2404+
theoretic_tops_ptx, tf32f32_sm90wgmma, //
22992405
"less_slow_sm90a.ptx", "tops_tf32f32_sm90tc_m64n256k8_loop128_ptx_kernel", //
2300-
64, 256, 8, 128, 90)
2406+
64, 256, 8, 90, 128)
23012407
->MinTime(10);
23022408

23032409
BENCHMARK_CAPTURE( //
2304-
theoretic_tops_ptx, b1i32and_sm90tc_wgmma, //
2410+
theoretic_tops_ptx, b1i32and_sm90wgmma, //
23052411
"less_slow_sm90a.ptx", "tops_b1i32and_sm90tc_m64n256k256_loop128_ptx_kernel", //
2306-
64, 256, 256, 128, 90)
2412+
64, 256, 256, 90, 128, tensor_core_scale_t::warpgroup_k)
23072413
->MinTime(10);
23082414

23092415
/**

0 commit comments

Comments
 (0)