diff --git a/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py b/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py index 6a7b36e4c3c..5f368bfac1b 100644 --- a/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py @@ -454,7 +454,9 @@ def get_valid_tactics( (4, 4), ] swap_ab_candidates = [True, False] - use_prefetch_candidates = [True, False] + # prune: prefetch is beneficial only when K is large enough + use_prefetch_candidates = [True, False + ] if real_k >= 16384 else [False] valid_tactics = [] for mma_tiler_mn, cluster_shape_mn, swap_ab, use_prefetch in itertools.product( diff --git a/tensorrt_llm/_torch/modules/linear.py b/tensorrt_llm/_torch/modules/linear.py index 44daa25eb3c..2fe4bdf7d3b 100644 --- a/tensorrt_llm/_torch/modules/linear.py +++ b/tensorrt_llm/_torch/modules/linear.py @@ -2105,6 +2105,11 @@ def __init__( 'cutlass', 'cublaslt', 'cuda_core' ] + # Add cutedsl to the allowed backends if tp size is greater than or equal to 4, + # because distributed tuning can decrease the tuning time by tp_size. + if self.tp_size >= 4 and 'cutedsl' not in self.nvfp4_allowed_backends: + self.nvfp4_allowed_backends.append('cutedsl') + local_in_features = in_features local_out_features = out_features