[TRTLLM-9989][fix] Disable tvm_ffi for CuteDSL nvFP4 dense GEMM. (#10040)

hyukn · web-flow · commit 18b335d5844d · 2025-12-17T00:41:26.000-08:00
Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py b/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py
@@ -371,7 +371,7 @@ class CuteDSLNVFP4BlackwellLinear(TunableRunner):
         def __init__(self,
                      output_dtype: torch.dtype,
                      to_userbuffers: bool = False,
-                     use_tvm_ffi: bool = True):
+                     use_tvm_ffi: bool = False):
             super().__init__()
 
             if output_dtype != torch.bfloat16:
@@ -775,7 +775,7 @@ def cute_dsl_nvfp4_gemm_blackwell(
         alpha: torch.Tensor,
         output_dtype: torch.dtype,
         to_userbuffers: bool = False,
-        use_tvm_ffi: bool = True,
+        use_tvm_ffi: bool = False,
     ) -> torch.Tensor:
         """CuteDSL-based NVFP4 GEMM optimized for Blackwell.
 
@@ -825,7 +825,7 @@ def _(
         alpha: torch.Tensor,  # Match custom op signature
         output_dtype: torch.dtype,
         to_userbuffers: bool = False,
-        use_tvm_ffi: bool = True,
+        use_tvm_ffi: bool = False,
     ):
         # [m, k]
         shape = list(mat_a.shape)