From e9fb82abb6a6219740d4834b51cf5ae6f4e183bf Mon Sep 17 00:00:00 2001
From: Yukun He <23156053+hyukn@users.noreply.github.com>
Date: Tue, 30 Dec 2025 09:21:15 +0000
Subject: [PATCH] [TRTLLM-9661][chore] Further reduce tuning time for cuteDSL
 nvFP4 dense gemm.

Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com>
---
 tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py | 4 +++-
 tensorrt_llm/_torch/modules/linear.py                 | 5 +++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py b/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py
index 06f01fb1f71..15342506f2e 100644
--- a/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py
+++ b/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py
@@ -479,7 +479,9 @@ def get_valid_tactics(
             (4, 4),
         ]
         swap_ab_candidates = [True, False]
-        use_prefetch_candidates = [True, False]
+        # prune: prefetch is beneficial only when K is large enough
+        use_prefetch_candidates = [True, False
+                                   ] if real_k >= 16384 else [False]
 
         valid_tactics = []
         for mma_tiler_mn, cluster_shape_mn, swap_ab, use_prefetch in itertools.product(
diff --git a/tensorrt_llm/_torch/modules/linear.py b/tensorrt_llm/_torch/modules/linear.py
index 44daa25eb3c..2fe4bdf7d3b 100644
--- a/tensorrt_llm/_torch/modules/linear.py
+++ b/tensorrt_llm/_torch/modules/linear.py
@@ -2105,6 +2105,11 @@ def __init__(
             'cutlass', 'cublaslt', 'cuda_core'
         ]
 
+        # Add cutedsl to the allowed backends if tp size is greater than or equal to 4,
+        # because distributed tuning can decrease the tuning time by tp_size.
+        if self.tp_size >= 4 and 'cutedsl' not in self.nvfp4_allowed_backends:
+            self.nvfp4_allowed_backends.append('cutedsl')
+
         local_in_features = in_features
         local_out_features = out_features
 