feature: enable cublas for fp4 gemm when cudnn == 9.11.1 or >= 9.13 (#1405)

ttyio · web-flow · commit 7ce370237ab9 · 2025-08-06T20:18:36.000-07:00
diff --git a/flashinfer/gemm.py b/flashinfer/gemm.py
@@ -968,6 +968,20 @@ def _check_cudnn_fp4_availability():
         )
 
 
+def _is_cublas_fp4_available_in_cudnn():
+    """Check if cuBLAS backend for FP4 GEMM is available in cuDNN."""
+    _check_cudnn_availability()  # defined outside this hunk; presumably raises when cuDNN is unusable - confirm
+    # The cuBLAS-backed FP4 GEMM path is treated as available only on cuDNN backend 9.11.1 exactly, or on 9.13.0
+    # and newer. Versions are compared as integers (9.11.1 -> 91101), so any other value, e.g. 91200, yields False.
+    backend_version = cudnn.backend_version()
+    CUDNN_VERSION_9_11_1 = 91101
+    CUDNN_VERSION_9_13_0 = 91300
+    return (
+        backend_version == CUDNN_VERSION_9_11_1
+        or backend_version >= CUDNN_VERSION_9_13_0
+    )
+
+
 def _get_native_fp4_dtype():
     """get native fp4 datatype if supported in the torch, otherwise return uint8."""
     if hasattr(torch, "float4_e2m1fn_x2"):
@@ -1084,8 +1098,11 @@ def build_cudnn_gemm_block_scale_dequantize_graph(
         graph.validate()
         graph.build_operation_graph()
         graph.create_execution_plans([cudnn.heur_mode.A, cudnn.heur_mode.B])
-        # WAR: the alpha (contains the global scale) is not supported by the cuBLAS backend, need to deselect it.
-        graph.deselect_engines(["eng0"])
+
+        # WAR: The alpha (contains the global scale) is not supported by the cuBLAS backend (eng0)
+        # in older cuDNN versions, so we deselect it.
+        if not _is_cublas_fp4_available_in_cudnn():
+            graph.deselect_engines(["eng0"])
         graph.check_support()
         graph.build_plans()