NVIDIA · hyukn · Dec 30, 2025 · Dec 31, 2025 · Jan 1, 2026
@@ -454,7 +454,9 @@ def get_valid_tactics(
                 (4, 4),
             ]
             swap_ab_candidates = [True, False]
-            use_prefetch_candidates = [True, False]
+            # Prune: prefetch is only beneficial when K is large (real_k >= 16384).
+            use_prefetch_candidates = [True, False
+                                       ] if real_k >= 16384 else [False]
 
             valid_tactics = []
             for mma_tiler_mn, cluster_shape_mn, swap_ab, use_prefetch in itertools.product(

@@ -2105,6 +2105,11 @@ def __init__(
             'cutlass', 'cublaslt', 'cuda_core'
         ]
 
+        # Allow the 'cutedsl' backend when tp_size >= 4, because distributed
+        # tuning can reduce the tuning time by a factor of tp_size.
+        if self.tp_size >= 4 and 'cutedsl' not in self.nvfp4_allowed_backends:
+            self.nvfp4_allowed_backends.append('cutedsl')
+
         local_in_features = in_features
         local_out_features = out_features