From e9fb82abb6a6219740d4834b51cf5ae6f4e183bf Mon Sep 17 00:00:00 2001
From: Yukun He <23156053+hyukn@users.noreply.github.com>
Date: Tue, 30 Dec 2025 09:21:15 +0000
Subject: [PATCH] [TRTLLM-9661][chore] Further reduce tuning time for cuteDSL
 nvFP4 dense gemm.

Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com>
---
 tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py | 4 +++-
 tensorrt_llm/_torch/modules/linear.py                 | 5 +++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py b/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py
index 06f01fb1f71..15342506f2e 100644
--- a/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py
+++ b/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py
@@ -479,7 +479,9 @@ def get_valid_tactics(
             (4, 4),
         ]
         swap_ab_candidates = [True, False]
-        use_prefetch_candidates = [True, False]
+        # prune: prefetch is beneficial only when K is large enough
+        use_prefetch_candidates = [True, False
+                                   ] if real_k >= 16384 else [False]
 
         valid_tactics = []
         for mma_tiler_mn, cluster_shape_mn, swap_ab, use_prefetch in itertools.product(
diff --git a/tensorrt_llm/_torch/modules/linear.py b/tensorrt_llm/_torch/modules/linear.py
index 44daa25eb3c..2fe4bdf7d3b 100644
--- a/tensorrt_llm/_torch/modules/linear.py
+++ b/tensorrt_llm/_torch/modules/linear.py
@@ -2105,6 +2105,11 @@ def __init__(
             'cutlass', 'cublaslt', 'cuda_core'
         ]
 
+        # Add cutedsl to the allowed backends if tp size is greater than or equal to 4,
+        # because distributed tuning can decrease the tuning time by tp_size.
+        if self.tp_size >= 4 and 'cutedsl' not in self.nvfp4_allowed_backends:
+            self.nvfp4_allowed_backends.append('cutedsl')
+
         local_in_features = in_features
         local_out_features = out_features
 