diff --git a/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py b/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py
index 6a7b36e4c3c..5f368bfac1b 100644
--- a/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py
+++ b/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py
@@ -454,7 +454,9 @@ def get_valid_tactics(
                 (4, 4),
             ]
             swap_ab_candidates = [True, False]
-            use_prefetch_candidates = [True, False]
+            # prune: prefetch is beneficial only when K is large enough
+            use_prefetch_candidates = [True, False
+                                       ] if real_k >= 16384 else [False]
 
             valid_tactics = []
             for mma_tiler_mn, cluster_shape_mn, swap_ab, use_prefetch in itertools.product(
diff --git a/tensorrt_llm/_torch/modules/linear.py b/tensorrt_llm/_torch/modules/linear.py
index 44daa25eb3c..2fe4bdf7d3b 100644
--- a/tensorrt_llm/_torch/modules/linear.py
+++ b/tensorrt_llm/_torch/modules/linear.py
@@ -2105,6 +2105,11 @@ def __init__(
             'cutlass', 'cublaslt', 'cuda_core'
         ]
 
+        # Allow the cutedsl backend when tp_size >= 4, because distributed
+        # tuning can reduce the tuning time by a factor of tp_size.
+        if self.tp_size >= 4 and 'cutedsl' not in self.nvfp4_allowed_backends:
+            self.nvfp4_allowed_backends.append('cutedsl')
+
         local_in_features = in_features
         local_out_features = out_features