flashinfer-ai · bkryu · Oct 8, 2025 · Oct 7, 2025 · Oct 8, 2025 · gemini-code-assist
diff --git a/benchmarks/bench_groupwise_gemm_fp8_blackwell.py b/benchmarks/bench_groupwise_gemm_fp8_blackwell.py
@@ -167,10 +167,12 @@ def bench_groupwise_gemm_fp8_blackwell(m, n, k, in_dtype, out_dtype):
     b_scale = torch.rand((k // 128, n // 128), dtype=torch.float32, device="cuda")
 
     out = torch.empty((m, n), dtype=out_dtype, device="cuda")
-    gemm_fp8_nt_groupwise(a, b, a_scale, b_scale, out=out)
+    gemm_fp8_nt_groupwise(a, b, a_scale, b_scale, out=out, scale_major_mode="MN")
 
     measurements = bench_gpu_time(
-        lambda: gemm_fp8_nt_groupwise(a, b, a_scale, b_scale, out=out)
+        lambda: gemm_fp8_nt_groupwise(
+            a, b, a_scale, b_scale, out=out, scale_major_mode="MN"
+        )
     )
     ms = np.median(measurements)
     tflops_per_second = 2 * m * n * k * 1e-9 / ms

diff --git a/benchmarks/bench_tgv_gemm.py b/benchmarks/bench_tgv_gemm.py
@@ -65,7 +65,7 @@ def test_tgv_gemm_bf16_sm100_perf():
 
     for m, n, k, has_bias, description in test_cases:
         print(f"\n--- {description}: M={m}, N={n}, K={k}, has_bias={has_bias} ---")
-
+        flops = m * n * k * 2 / 1e12
-        flops = m * n * k * 2 / 1e12
+        FLOPS_PER_MAC = 2
+        TFLOPS_SCALE = 1e12
+        flops = (m * n * k * FLOPS_PER_MAC) / TFLOPS_SCALE
         # Create tensors
         A = torch.randn(m, k, device="cuda", dtype=torch.bfloat16)
         B = torch.randn(n, k, device="cuda", dtype=torch.bfloat16).t()
@@ -99,7 +99,9 @@ def test_tgv_gemm_bf16_sm100_perf():
         torch.cuda.synchronize()
         end_time = time.time()
         cublas_avg_time = (end_time - start_time) / 100
-        print(f"CUBLAS average time: {cublas_avg_time * 1000:.6f} ms")
+        print(
+            f"CUBLAS average time: {cublas_avg_time * 1000:.6f} ms, {flops / cublas_avg_time:.3f} TFLOPS"
+        )
 
         # Warmup
         with autotune(tune_mode=True):
@@ -128,7 +130,7 @@ def test_tgv_gemm_bf16_sm100_perf():
 
         tgv_avg_time = (end_time - start_time) / 100
         print(
-            f"TGV average time: {tgv_avg_time * 1000:.6f} ms, speedup: {cublas_avg_time / tgv_avg_time:.2f}x"
+            f"TGV average time: {tgv_avg_time * 1000:.6f} ms, {flops / tgv_avg_time:.3f} TFLOPS, speedup: {cublas_avg_time / tgv_avg_time:.2f}x"
         )
 
         # Test with PDL
@@ -151,7 +153,7 @@ def test_tgv_gemm_bf16_sm100_perf():
 
         pdl_avg_time = (end_time - start_time) / 100
         print(
-            f"PDL average time: {pdl_avg_time * 1000:.6f} ms, speedup: {cublas_avg_time / pdl_avg_time:.2f}x"
+            f"PDL average time: {pdl_avg_time * 1000:.6f} ms, {flops / pdl_avg_time:.3f} TFLOPS, speedup: {cublas_avg_time / pdl_avg_time:.2f}x"
         )
 
         # Store results for CSV