@@ -65,7 +65,7 @@ def test_tgv_gemm_bf16_sm100_perf():
65
65
66
66
for m , n , k , has_bias , description in test_cases :
67
67
print (f"\n --- { description } : M={ m } , N={ n } , K={ k } , has_bias={ has_bias } ---" )
68
-
68
+ flops = m * n * k * 2 / 1e12
69
69
# Create tensors
70
70
A = torch .randn (m , k , device = "cuda" , dtype = torch .bfloat16 )
71
71
B = torch .randn (n , k , device = "cuda" , dtype = torch .bfloat16 ).t ()
@@ -99,7 +99,7 @@ def test_tgv_gemm_bf16_sm100_perf():
99
99
torch .cuda .synchronize ()
100
100
end_time = time .time ()
101
101
cublas_avg_time = (end_time - start_time ) / 100
102
- print (f"CUBLAS average time: { cublas_avg_time * 1000 :.6f} ms" )
102
+ print (f"CUBLAS average time: { cublas_avg_time * 1000 :.6f} ms, { flops / cublas_avg_time :.3f } TFLOPS " )
103
103
104
104
# Warmup
105
105
with autotune (tune_mode = True ):
@@ -128,7 +128,7 @@ def test_tgv_gemm_bf16_sm100_perf():
128
128
129
129
tgv_avg_time = (end_time - start_time ) / 100
130
130
print (
131
- f"TGV average time: { tgv_avg_time * 1000 :.6f} ms, speedup: { cublas_avg_time / tgv_avg_time :.2f} x"
131
+ f"TGV average time: { tgv_avg_time * 1000 :.6f} ms, { flops / tgv_avg_time :.3f } TFLOPS, speedup: { cublas_avg_time / tgv_avg_time :.2f} x"
132
132
)
133
133
134
134
# Test with PDL
@@ -151,7 +151,7 @@ def test_tgv_gemm_bf16_sm100_perf():
151
151
152
152
pdl_avg_time = (end_time - start_time ) / 100
153
153
print (
154
- f"PDL average time: { pdl_avg_time * 1000 :.6f} ms, speedup: { cublas_avg_time / pdl_avg_time :.2f} x"
154
+ f"PDL average time: { pdl_avg_time * 1000 :.6f} ms, { flops / pdl_avg_time :.3f } TFLOPS, speedup: { cublas_avg_time / pdl_avg_time :.2f} x"
155
155
)
156
156
157
157
# Store results for CSV
0 commit comments