@@ -2046,15 +2046,67 @@ BENCHMARK_CAPTURE(theoretic_tops, i7_amx_avx512, tops_i7_amx_avx512fma_asm_kerne
 #if _LESS_SLOW_WITH_CUDA
 #include <cuda.h>
 
+/**
+ * Different generations of GPU matrix-multiplication instructions use
+ * different synchronization/cooperation scales.
+ */
+enum class tensor_core_scale_t : int {
+    unknown_k = 0,
+
+    /**
+     * Before Volta, individual CUDA cores would compute matrix multiplication
+     * as many individual scalar FMA operations over tiles in shared cache.
+     * Applies to SM levels @b <7.0.
+     */
+    single_k = 1,
+    /**
+     * On Volta and newer, 8 consecutive threads compute the MMA together.
+     * Applies to SM levels @b ≥7.0.
+     */
+    quadpair_k = 8,
+    /**
+     * On Ampere and newer, 32 consecutive threads in a single warp compute
+     * WMMA together. Applies to SM levels @b ≥8.0.
+     */
+    warp_k = 32,
+    /**
+     * On Hopper and newer, 128 consecutive threads in 4 consecutive warps
+     * compute larger Warp Group MMA together. Applies to SM levels @b ≥9.0.
+     */
+    warpgroup_k = 128,
+};
+
+tensor_core_scale_t tensor_core_scale(int sm_capability) {
+    if (sm_capability >= 90) return tensor_core_scale_t::warpgroup_k;
+    if (sm_capability >= 80) return tensor_core_scale_t::warp_k;
+    if (sm_capability >= 70) return tensor_core_scale_t::quadpair_k;
+    return tensor_core_scale_t::single_k;
+}
+
+/**
+ * @brief Runs the benchmark loop for precompiled CUDA C++ kernels using
+ * the high-level @b runtime API. It counts TOPS (Tensor Operations Per
+ * Second) as the number of scalar multiplications in $A * B$, ignoring
+ * the additive $D$ part of $A * B + D$.
+ *
+ * @param m,n,k Dimensions of the matrices multiplied by one instruction.
+ * @param required_capability Minimal GPU Streaming Multiprocessor generation needed.
+ * @param scale Number of threads cooperating to compute one MMA tile.
+ */
 static void theoretic_tops_cuda(                   //
     bm::State &state, __global__ void (*kernel)(), //
     std::size_t m, std::size_t n, std::size_t k,   //
-    std::size_t repetitions, int required_capability) {
+    int required_capability,                       //
+    std::size_t repetitions,                       //
+    tensor_core_scale_t scale = tensor_core_scale_t::unknown_k) {
 
-    cudaDeviceProp prop;
-    cudaGetDeviceProperties(&prop, 0);
-    int const blocks = prop.multiProcessorCount;
-    int const threads_per_block = prop.warpSize;
+    cudaDeviceProp properties;
+    cudaGetDeviceProperties(&properties, 0);
+    int const blocks = properties.multiProcessorCount;
+    // On Hopper and newer, 4 warps need to synchronize WGMMAs.
+    int const threads_per_block = properties.warpSize * 4;
+    if (scale == tensor_core_scale_t::unknown_k) scale = tensor_core_scale(required_capability);
 
     for (auto _ : state) {
         // A typical CUDA kernel invocation would look like this:
@@ -2075,68 +2127,103 @@ static void theoretic_tops_cuda( //
         cudaDeviceSynchronize();
     }
 
+    std::size_t const threads = static_cast<std::size_t>(blocks * threads_per_block);
     std::size_t const tops_per_cycle = m * n * k * 2 * repetitions;
-    std::size_t const tops_per_gpu = tops_per_cycle * blocks; // ? Warps compute each tile product collectively!
+    std::size_t const tops_per_gpu = tops_per_cycle * threads / static_cast<std::size_t>(scale);
     state.counters["TOP"] = benchmark::Counter(tops_per_gpu * state.iterations(), benchmark::Counter::kIsRate);
 }
 
-extern __global__ void tops_f16f16_sm70tc_16x16x16_loop128_cuda_kernel();
-extern __global__ void tops_f16f32_sm70tc_16x16x16_loop128_cuda_kernel();
+extern __global__ void tops_f16f16_sm70wmma_16x16x16_loop128_cuda_kernel();
+extern __global__ void tops_f16f32_sm70wmma_16x16x16_loop128_cuda_kernel();
 
-extern __global__ void tops_u8i32_sm75tc_16x16x16_loop128_cuda_kernel();
-extern __global__ void tops_u4i32_sm75tc_8x8x32_loop128_cuda_kernel();
-extern __global__ void tops_b1i32xor_sm75tc_8x8x128_loop128_cuda_kernel();
+BENCHMARK_CAPTURE(                                                                           //
+    theoretic_tops_cuda, f16f16_sm70wmma, tops_f16f16_sm70wmma_16x16x16_loop128_cuda_kernel, //
+    16, 16, 16, 70, 128, tensor_core_scale_t::warp_k)
+    ->MinTime(10);
+BENCHMARK_CAPTURE(                                                                           //
+    theoretic_tops_cuda, f16f32_sm70wmma, tops_f16f32_sm70wmma_16x16x16_loop128_cuda_kernel, //
+    16, 16, 16, 70, 128, tensor_core_scale_t::warp_k)
+    ->MinTime(10);
 
-extern __global__ void tops_bf16f32_sm80tc_16x16x16_loop128_cuda_kernel();
-extern __global__ void tops_tf32f32_sm80tc_16x16x8_loop128_cuda_kernel();
-extern __global__ void tops_f64f64_sm80tc_8x8x4_loop128_cuda_kernel();
-extern __global__ void tops_b1i32and_sm80tc_8x8x128_loop128_cuda_kernel();
+extern __global__ void tops_u8i32_sm75wmma_16x16x16_loop128_cuda_kernel();
+extern __global__ void tops_u4i32_sm75wmma_8x8x32_loop128_cuda_kernel();
+extern __global__ void tops_b1i32xor_sm75wmma_8x8x128_loop128_cuda_kernel();
 
-BENCHMARK_CAPTURE(                                                                         //
-    theoretic_tops_cuda, f16f16_sm70tc, tops_f16f16_sm70tc_16x16x16_loop128_cuda_kernel,   //
-    16, 16, 16, 128, 70)
+BENCHMARK_CAPTURE(                                                                         //
+    theoretic_tops_cuda, u8i32_sm75wmma, tops_u8i32_sm75wmma_16x16x16_loop128_cuda_kernel, //
+    16, 16, 16, 75, 128, tensor_core_scale_t::warp_k)
     ->MinTime(10);
 BENCHMARK_CAPTURE(                                                                         //
-    theoretic_tops_cuda, f16f32_sm70tc, tops_f16f32_sm70tc_16x16x16_loop128_cuda_kernel,   //
-    16, 16, 16, 128, 70)
-    ->MinTime(10);
-BENCHMARK_CAPTURE(                                                                         //
-    theoretic_tops_cuda, u8i32_sm75tc, tops_u8i32_sm75tc_16x16x16_loop128_cuda_kernel,     //
-    16, 16, 16, 128, 75)
+    theoretic_tops_cuda, u4i32_sm75wmma, tops_u4i32_sm75wmma_8x8x32_loop128_cuda_kernel,   //
+    8, 8, 32, 75, 128, tensor_core_scale_t::warp_k)
     ->MinTime(10);
-BENCHMARK_CAPTURE(                                                                         //
-    theoretic_tops_cuda, u4i32_sm75tc, tops_u4i32_sm75tc_8x8x32_loop128_cuda_kernel,       //
-    8, 8, 32, 128, 75)
+BENCHMARK_CAPTURE(                                                                             //
+    theoretic_tops_cuda, b1i32xor_sm75wmma, tops_b1i32xor_sm75wmma_8x8x128_loop128_cuda_kernel, //
+    8, 8, 128, 75, 128, tensor_core_scale_t::warp_k)
     ->MinTime(10);
-BENCHMARK_CAPTURE(                                                                          //
-    theoretic_tops_cuda, b1i32xor_sm75tc, tops_b1i32xor_sm75tc_8x8x128_loop128_cuda_kernel, //
-    8, 8, 128, 128, 75)
+
+extern __global__ void tops_bf16f32_sm80wmma_16x16x16_loop128_cuda_kernel();
+extern __global__ void tops_tf32f32_sm80wmma_16x16x8_loop128_cuda_kernel();
+extern __global__ void tops_f64f64_sm80wmma_8x8x4_loop128_cuda_kernel();
+extern __global__ void tops_b1i32and_sm80wmma_8x8x128_loop128_cuda_kernel();
+
+BENCHMARK_CAPTURE(                                                                             //
+    theoretic_tops_cuda, bf16f32_sm80wmma, tops_bf16f32_sm80wmma_16x16x16_loop128_cuda_kernel, //
+    16, 16, 16, 80, 128, tensor_core_scale_t::warp_k)
     ->MinTime(10);
-BENCHMARK_CAPTURE(                                                                          //
-    theoretic_tops_cuda, bf16f32_sm80tc, tops_bf16f32_sm80tc_16x16x16_loop128_cuda_kernel,  //
-    16, 16, 16, 128, 80)
+BENCHMARK_CAPTURE(                                                                            //
+    theoretic_tops_cuda, tf32f32_sm80wmma, tops_tf32f32_sm80wmma_16x16x8_loop128_cuda_kernel, //
+    16, 16, 8, 80, 128, tensor_core_scale_t::warp_k)
     ->MinTime(10);
 BENCHMARK_CAPTURE(                                                                         //
-    theoretic_tops_cuda, tf32f32_sm80tc, tops_tf32f32_sm80tc_16x16x8_loop128_cuda_kernel,  //
-    16, 16, 8, 128, 80)
+    theoretic_tops_cuda, f64f64_sm80wmma, tops_f64f64_sm80wmma_8x8x4_loop128_cuda_kernel,  //
+    8, 8, 4, 80, 128, tensor_core_scale_t::warp_k)
     ->MinTime(10);
-BENCHMARK_CAPTURE(                                                                         //
-    theoretic_tops_cuda, f64f64_sm80tc, tops_f64f64_sm80tc_8x8x4_loop128_cuda_kernel,      //
-    8, 8, 4, 128, 80)
+BENCHMARK_CAPTURE(                                                                             //
+    theoretic_tops_cuda, b1i32and_sm80wmma, tops_b1i32and_sm80wmma_8x8x128_loop128_cuda_kernel, //
+    8, 8, 128, 80, 128, tensor_core_scale_t::warp_k)
     ->MinTime(10);
-BENCHMARK_CAPTURE(                                                                          //
-    theoretic_tops_cuda, b1i32and_sm80tc, tops_b1i32and_sm80tc_8x8x128_loop128_cuda_kernel, //
-    8, 8, 128, 128, 80)
+
+extern __global__ void tops_f16f32_sm90wgmma_64x256x16_loop128_cuda_kernel();
+extern __global__ void tops_bf16f32_sm90wgmma_64x256x16_loop128_cuda_kernel();
+extern __global__ void tops_tf32f32_sm90wgmma_64x256x16_loop128_cuda_kernel();
+
+BENCHMARK_CAPTURE(                                                                              //
+    theoretic_tops_cuda, f16f32_sm90wgmma, tops_f16f32_sm90wgmma_64x256x16_loop128_cuda_kernel, //
+    64, 256, 16, 90, 128, tensor_core_scale_t::warpgroup_k)
+    ->MinTime(10);
+BENCHMARK_CAPTURE(                                                                                //
+    theoretic_tops_cuda, bf16f32_sm90wgmma, tops_bf16f32_sm90wgmma_64x256x16_loop128_cuda_kernel, //
+    64, 256, 16, 90, 128, tensor_core_scale_t::warpgroup_k)
+    ->MinTime(10);
+BENCHMARK_CAPTURE(                                                                                //
+    theoretic_tops_cuda, tf32f32_sm90wgmma, tops_tf32f32_sm90wgmma_64x256x16_loop128_cuda_kernel, //
+    64, 256, 16, 90, 128, tensor_core_scale_t::warpgroup_k)
     ->MinTime(10);
 
-#include <filesystem>
+#include <filesystem> // `std::filesystem::absolute` to locate PTX IR file
 
+/**
+ * @brief Runs the benchmark loop for precompiled CUDA C++ kernels using
+ * the low-level @b driver API. It counts TOPS (Tensor Operations Per
+ * Second) as the number of scalar multiplications in $A * B$, ignoring
+ * the additive $D$ part of $A * B + D$.
+ *
+ * @param file_name The name of the @b `.ptx` file in the current directory.
+ * @param kernel_name The name of a specific @b `.visible` entry function.
+ * @param m,n,k Dimensions of the matrices multiplied by one instruction.
+ * @param required_capability Minimal GPU Streaming Multiprocessor generation needed.
+ * @param scale Number of threads cooperating to compute one MMA tile.
+ */
 static void theoretic_tops_ptx(                  //
     bm::State &state,                            //
     std::string file_name,                       //
     std::string kernel_name,                     //
     std::size_t m, std::size_t n, std::size_t k, //
-    std::size_t repetitions, int required_capability) {
+    int required_capability,                     //
+    std::size_t repetitions,                     //
+    tensor_core_scale_t scale = tensor_core_scale_t::unknown_k) {
 
     // Resolve the absolute path to the PTX file
     std::string ptx_file = file_name;
@@ -2233,17 +2320,23 @@ static void theoretic_tops_ptx( //
         return;
     }
 
-    // Set kernel launch configuration
+    // Set the kernel launch configuration, the same way as in `theoretic_tops_cuda`.
     dim3 grid_dim(num_sms);
-    dim3 block_dim(warp_size);
+    dim3 block_dim(warp_size * 4);
     void *kernel_args[] = {nullptr};
 
+    // We may need shared memory for matrix multiplications on Hopper:
+    // - on V100 we have 96 KB per SM
+    // - on H100 we have 228 KB per SM
+    unsigned int shared_memory = 0; // 32 * 1024;
+    if (scale == tensor_core_scale_t::unknown_k) scale = tensor_core_scale(required_capability);
+
     for (auto _ : state) {
         result = cuLaunchKernel(                       //
             kernel,                                    //
             grid_dim.x, grid_dim.y, grid_dim.z,        //
             block_dim.x, block_dim.y, block_dim.z,     //
-            0, nullptr, kernel_args, nullptr);
+            shared_memory, nullptr, kernel_args, nullptr);
         if (result != CUDA_SUCCESS) {
             state.SkipWithError("Failed to launch the kernel: " + last_error_string());
             break;
@@ -2255,55 +2348,68 @@ static void theoretic_tops_ptx( //
         }
     }
 
+    std::size_t const threads = static_cast<std::size_t>(grid_dim.x * block_dim.x);
     std::size_t const tops_per_cycle = m * n * k * 2 * repetitions;
-    std::size_t const tops_per_gpu = tops_per_cycle * num_sms; // ? Warps compute each tile product collectively!
+    std::size_t const tops_per_gpu = tops_per_cycle * threads / static_cast<std::size_t>(scale);
     state.counters["TOP"] = benchmark::Counter(tops_per_gpu * state.iterations(), benchmark::Counter::kIsRate);
 
     // Clean up
     cuModuleUnload(module_);
     cuCtxDestroy(context);
 }
 
-BENCHMARK_CAPTURE(                                                            //
-    theoretic_tops_ptx, f16f16_sm70tc,                                        //
-    "less_slow_sm70.ptx", "tops_f16f16_sm70tc_16x16x16_loop128_ptx_kernel",   //
-    16, 16, 16, 128, 70)
+BENCHMARK_CAPTURE(                                                            //
+    theoretic_tops_ptx, f16f16_sm70mma,                                       //
+    "less_slow_sm70.ptx", "tops_f16f16_sm70mma_8x8x4_loop128_ptx_kernel",     //
+    16, 16, 16, 70, 128, tensor_core_scale_t::quadpair_k)
     ->MinTime(10);
 
-BENCHMARK_CAPTURE(                                                            //
-    theoretic_tops_ptx, f16f16_sm90tc,                                        //
-    "less_slow_sm90a.ptx", "tops_f16f16_sm90tc_16x16x16_loop128_ptx_kernel",  //
-    16, 16, 16, 128, 90)
+BENCHMARK_CAPTURE(                                                            //
+    theoretic_tops_ptx, f16f32_sm70mma,                                       //
+    "less_slow_sm70.ptx", "tops_f16f32_sm70mma_8x8x4_loop128_ptx_kernel",     //
+    16, 16, 16, 70, 128, tensor_core_scale_t::quadpair_k)
+    ->MinTime(10);
+
+BENCHMARK_CAPTURE(                                                            //
+    theoretic_tops_ptx, f16f16_sm80wmma,                                      //
+    "less_slow_sm80.ptx", "tops_f16f16_sm80wmma_16x16x16_loop128_ptx_kernel", //
+    16, 16, 16, 80, 128, tensor_core_scale_t::warp_k)
+    ->MinTime(10);
+
+BENCHMARK_CAPTURE(                                                            //
+    theoretic_tops_ptx, f16f32_sm80wmma,                                      //
+    "less_slow_sm80.ptx", "tops_f16f32_sm80wmma_16x16x16_loop128_ptx_kernel", //
+    16, 16, 16, 80, 128, tensor_core_scale_t::warp_k)
     ->MinTime(10);
 
 BENCHMARK_CAPTURE(                                                            //
-    theoretic_tops_ptx, f64f64_sm90tc,                                        //
-    "less_slow_sm90a.ptx", "tops_f64f64_sm90tc_8x8x4_loop128_ptx_kernel",     //
-    8, 8, 4, 128, 90)
+    theoretic_tops_ptx, f64f64_sm80mma,                                       //
+    "less_slow_sm80.ptx", "tops_f64f64_sm80mma_8x8x4_loop128_ptx_kernel",     //
+    8, 8, 4, 80, 128, tensor_core_scale_t::quadpair_k)
     ->MinTime(10);
 
-BENCHMARK_CAPTURE(                                                            //
-    theoretic_tops_ptx, tf32f32_sm90tc,                                       //
-    "less_slow_sm90a.ptx", "tops_tf32f32_sm90tc_16x16x8_loop128_ptx_kernel",  //
-    16, 16, 8, 128, 90)
+BENCHMARK_CAPTURE(                                                            //
+    theoretic_tops_ptx, tf32f32_sm80wmma,                                     //
+    "less_slow_sm80.ptx", "tops_tf32f32_sm80wmma_16x16x8_loop128_ptx_kernel", //
+    16, 16, 8, 80, 128, tensor_core_scale_t::warp_k)
     ->MinTime(10);
 
 BENCHMARK_CAPTURE(                                                            //
-    theoretic_tops_ptx, tf32f32_sm90tc_wgmma_smallest,                        //
+    theoretic_tops_ptx, tf32f32_sm90wgmma_smallest,                           //
     "less_slow_sm90a.ptx", "tops_tf32f32_sm90tc_m64n16k8_loop128_ptx_kernel", //
-    64, 16, 8, 128, 90)
+    64, 16, 8, 90, 128, tensor_core_scale_t::warpgroup_k)
     ->MinTime(10);
 
 BENCHMARK_CAPTURE(                                                             //
-    theoretic_tops_ptx, tf32f32_sm90tc_wgmma_largest,                          //
+    theoretic_tops_ptx, tf32f32_sm90wgmma,                                     //
     "less_slow_sm90a.ptx", "tops_tf32f32_sm90tc_m64n256k8_loop128_ptx_kernel", //
-    64, 256, 8, 128, 90)
+    64, 256, 8, 90, 128)
     ->MinTime(10);
 
 BENCHMARK_CAPTURE(                                                                //
-    theoretic_tops_ptx, b1i32and_sm90tc_wgmma,                                    //
+    theoretic_tops_ptx, b1i32and_sm90wgmma,                                       //
     "less_slow_sm90a.ptx", "tops_b1i32and_sm90tc_m64n256k256_loop128_ptx_kernel", //
-    64, 256, 256, 128, 90)
+    64, 256, 256, 90, 128, tensor_core_scale_t::warpgroup_k)
     ->MinTime(10);
 
 /**
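Note: the `tops_*_cuda_kernel` symbols above are only declared `extern` in this translation unit; their definitions live in a separate `.cu` file compiled per SM generation, which this diff doesn't show. As a rough illustration only, not the repository's actual code, a warp-level WMMA benchmark kernel for the f16f16 16x16x16 shape could look like the following sketch (hypothetical kernel name, assuming `<mma.h>` and SM 7.0+):

#include <cuda_fp16.h> // `half` scalar type
#include <mma.h>       // `nvcuda::wmma` warp-level matrix operations

__global__ void tops_f16f16_wmma_16x16x16_loop128_sketch_kernel() {
    using namespace nvcuda;
    // Keep all fragments in registers, so only Tensor Core throughput is timed.
    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a;
    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> b;
    wmma::fragment<wmma::accumulator, 16, 16, 16, half> c;
    wmma::fill_fragment(a, __float2half(1.f));
    wmma::fill_fragment(b, __float2half(1.f));
    wmma::fill_fragment(c, __float2half(0.f));
    // 128 dependent MMAs per launch match the `loop128` suffix and the
    // `repetitions` argument used in the host-side TOPS math above.
    for (int i = 0; i != 128; ++i) wmma::mma_sync(c, a, b, c);
    // A real kernel must keep the result observable (e.g., store `c` to
    // global memory), or the compiler may elide the whole loop.
}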