diff --git a/benchmarks/bench_groupwise_gemm_fp8_blackwell.py b/benchmarks/bench_groupwise_gemm_fp8_blackwell.py
index 3175d78a11..451bb902e6 100644
--- a/benchmarks/bench_groupwise_gemm_fp8_blackwell.py
+++ b/benchmarks/bench_groupwise_gemm_fp8_blackwell.py
@@ -167,10 +167,12 @@ def bench_groupwise_gemm_fp8_blackwell(m, n, k, in_dtype, out_dtype):
     b_scale = torch.rand((k // 128, n // 128), dtype=torch.float32, device="cuda")
 
     out = torch.empty((m, n), dtype=out_dtype, device="cuda")
-    gemm_fp8_nt_groupwise(a, b, a_scale, b_scale, out=out)
+    gemm_fp8_nt_groupwise(a, b, a_scale, b_scale, out=out, scale_major_mode="MN")
 
     measurements = bench_gpu_time(
-        lambda: gemm_fp8_nt_groupwise(a, b, a_scale, b_scale, out=out)
+        lambda: gemm_fp8_nt_groupwise(
+            a, b, a_scale, b_scale, out=out, scale_major_mode="MN"
+        )
     )
     ms = np.median(measurements)
     tflops_per_second = 2 * m * n * k * 1e-9 / ms
diff --git a/benchmarks/bench_tgv_gemm.py b/benchmarks/bench_tgv_gemm.py
index 6b5f1c97d6..b5574aa71c 100755
--- a/benchmarks/bench_tgv_gemm.py
+++ b/benchmarks/bench_tgv_gemm.py
@@ -65,7 +65,7 @@ def test_tgv_gemm_bf16_sm100_perf():
 
     for m, n, k, has_bias, description in test_cases:
         print(f"\n--- {description}: M={m}, N={n}, K={k}, has_bias={has_bias} ---")
-
+        flops = m * n * k * 2 / 1e12
         # Create tensors
         A = torch.randn(m, k, device="cuda", dtype=torch.bfloat16)
         B = torch.randn(n, k, device="cuda", dtype=torch.bfloat16).t()
@@ -99,7 +99,9 @@ def test_tgv_gemm_bf16_sm100_perf():
         torch.cuda.synchronize()
         end_time = time.time()
         cublas_avg_time = (end_time - start_time) / 100
-        print(f"CUBLAS average time: {cublas_avg_time * 1000:.6f} ms")
+        print(
+            f"CUBLAS average time: {cublas_avg_time * 1000:.6f} ms, {flops / cublas_avg_time:.3f} TFLOPS"
+        )
 
         # Warmup
         with autotune(tune_mode=True):
@@ -128,7 +130,7 @@ def test_tgv_gemm_bf16_sm100_perf():
         tgv_avg_time = (end_time - start_time) / 100
 
         print(
-            f"TGV average time: {tgv_avg_time * 1000:.6f} ms, speedup: {cublas_avg_time / tgv_avg_time:.2f}x"
+            f"TGV average time: {tgv_avg_time * 1000:.6f} ms, {flops / tgv_avg_time:.3f} TFLOPS, speedup: {cublas_avg_time / tgv_avg_time:.2f}x"
         )
 
         # Test with PDL
@@ -151,7 +153,7 @@ def test_tgv_gemm_bf16_sm100_perf():
         pdl_avg_time = (end_time - start_time) / 100
 
         print(
-            f"PDL average time: {pdl_avg_time * 1000:.6f} ms, speedup: {cublas_avg_time / pdl_avg_time:.2f}x"
+            f"PDL average time: {pdl_avg_time * 1000:.6f} ms, {flops / pdl_avg_time:.3f} TFLOPS, speedup: {cublas_avg_time / pdl_avg_time:.2f}x"
        )

        # Store results for CSV
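
Note (not part of the patch): the TFLOPS figures added in both benchmarks use the same arithmetic, counting a GEMM as 2 * m * n * k floating-point operations and dividing by the measured wall time. A minimal sketch of that bookkeeping, assuming the elapsed time is an averaged wall-clock time in seconds as in bench_tgv_gemm.py; the function name here is illustrative only:

    # Sketch only: mirrors the flops / elapsed-time arithmetic in the patch above.
    def gemm_tflops(m: int, n: int, k: int, elapsed_s: float) -> float:
        flops = 2 * m * n * k             # one multiply and one add per (m, n, k) triple
        return flops / elapsed_s / 1e12   # FLOP/s -> TFLOP/s

    # Example: an 8192 x 8192 x 8192 GEMM finishing in 1 ms is roughly 1099 TFLOPS.
    print(f"{gemm_tflops(8192, 8192, 8192, 1e-3):.3f} TFLOPS")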