@@ -2400,6 +2400,18 @@ BENCHMARK_CAPTURE( //
     64, 256, 16, 90, 128, tensor_core_scale_t::warpgroup_k)
     ->MinTime(10);
 
+BENCHMARK_CAPTURE(                                                              //
+    theoretic_tops_ptx, bf16f32_sm90wgmma,                                      //
+    "less_slow_sm90a.ptx", "tops_bf16f32_sm90tc_m64n256k16_loop128_ptx_kernel", //
+    64, 256, 16, 90, 128, tensor_core_scale_t::warpgroup_k)
+    ->MinTime(10);
+
+BENCHMARK_CAPTURE(                                                              //
+    theoretic_tops_ptx, tf32f32_sm90wgmma,                                      //
+    "less_slow_sm90a.ptx", "tops_tf32f32_sm90tc_m64n256k8_loop128_ptx_kernel",  //
+    64, 256, 8, 90, 128, tensor_core_scale_t::warpgroup_k)
+    ->MinTime(10);
+
 BENCHMARK_CAPTURE(                                                              //
     theoretic_tops_ptx, b1i32and_sm90wgmma,                                     //
     "less_slow_sm90a.ptx", "tops_b1i32and_sm90tc_m64n256k256_loop128_ptx_kernel", //
@@ -2414,8 +2426,8 @@ BENCHMARK_CAPTURE(
  *    number recommended in the datasheet. Similar for double-precision.
  *
  *  - The highest-precision "properly accelerated" type - TF32, will yield only
- *    @b 75 TOPs when using the old warp-level primitives, but will skyrocket
- *    to @b 300 TOPS when using the Warp-Group MMA, @b 60% of the recommended.
+ *    @b 25 TOPS when using the old warp-level primitives, but will skyrocket
+ *    to @b 600 TOPS when using the Warp-Group-level MMA.
  */
 
 #endif
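The 24x jump for TF32 tracks the tile size a single instruction feeds to the Tensor Cores: a warp-level `mma.sync` covers an m16n8k8 tile per warp, while Hopper's warp-group `wgmma.mma_async` covers m64n256k8 across 128 threads and reads its inputs through shared-memory descriptors. A hedged sketch of the two TF32 instruction forms, with placeholder registers; consult the PTX ISA manual for the exact operand lists:

    // Warp-level: one warp issues one small m16n8k8 TF32 tile at a time.
    mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
        {%f0, %f1, %f2, %f3},  // D accumulator fragment
        {%r0, %r1, %r2, %r3},  // A fragment, tf32 values packed into .b32 registers
        {%r4, %r5},            // B fragment
        {%f0, %f1, %f2, %f3};  // C accumulator

    // Warp-group-level: 128 threads cooperate on one m64n256k8 tile and can
    // overlap the asynchronous MMA with staging the next operands.
    wgmma.mma_async.sync.aligned.m64n256k8.f32.tf32.tf32
        {%f0, /* ...126 more... */ %f127}, // 128 accumulator registers per thread
        %desc_a, %desc_b,                  // 64-bit shared-memory matrix descriptors
        p, 1, 1;                           // scale-d predicate, imm-scale-a, imm-scale-b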
@@ -3124,14 +3136,14 @@ BENCHMARK(cublas_tops<int8_t, int32_t>)->RangeMultiplier(2)->Range(8, 16384)->Co
  *
  *                              Datasheet       MMA kernels     cuBLAS
  *
- *  - `f64`                     @b 67 T         @b 3.3 T        @b 60 T
- *  - `f32`                     @b 67 T         @b 51 T         @b 49 T
- *  - `tf32`                    @b 500 T        @b 21 T         -
- *  - `bf16`                    @b 1'000 T      @b 51 T         -
- *  - `f16`                     @b 1'000 T      -               @b 764 T
+ *  - `f64`                     @b 67 T         @b 17 T         @b 60 T
+ *  - `f32`                     @b 67 T         -               @b 49 T
+ *  - `tf32`                    @b 500 T        @b 520 T        -
+ *  - `bf16`                    @b 1'000 T      @b 1'047 T      -
+ *  - `f16`                     @b 1'000 T      @b 1'056 T      @b 764 T
  *  - `i8` & `u8`               @b 2'000 T      -               @b 122 T
- *  - `b1` XOR-based            -               @b 39 T         -
- *  - `b1` AND-based            -               @b 143 T        -
+ *  - `b1` XOR-based            -               @b 79 T         -
+ *  - `b1` AND-based            -               @b 8'439 T      -
 *
 *  For comparison, on AMD MI 300X accelerators:
 *  - 80 T arithmetic and 160 T matrix multiplications for `f64`.
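As a reference for the cuBLAS column, the `cublas_tops` fixture numbers presumably come from plain dense GEMMs like the sketch below, assuming square device-resident matrices and skipping status checks:

    #include <cublas_v2.h>
    #include <cuda_fp16.h>

    // One f16 GEMM contributes roughly 2 * n^3 useful FLOPs.
    void gemm_f16(cublasHandle_t handle, int n,                //
                  __half const *a, __half const *b, __half *c) {
        __half alpha = __float2half(1.f), beta = __float2half(0.f);
        cublasGemmEx(                                  //
            handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, //
            &alpha, a, CUDA_R_16F, n,                  //
            b, CUDA_R_16F, n,                          //
            &beta, c, CUDA_R_16F, n,                   //
            CUBLAS_COMPUTE_16F, CUBLAS_GEMM_DEFAULT);
    }

Dividing `2 * n^3` by the wall-clock time per call at the top of the `Range(8, 16384)` sweep is presumably what produces numbers like the @b 764 T for `f16`.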