From 0c27f02bf74ec54ea45f8d2f85942318c5570613 Mon Sep 17 00:00:00 2001 From: Anthony Chang <27950904+rosenrodt@users.noreply.github.com> Date: Sun, 14 Dec 2025 16:58:24 +0800 Subject: [PATCH 1/3] TRTLLM MoE maps to lower tuning buckets when ep>1 Signed-off-by: Anthony Chang <27950904+rosenrodt@users.noreply.github.com> --- tensorrt_llm/_torch/autotuner.py | 49 ++++- .../custom_ops/trtllm_gen_custom_ops.py | 195 +++++++++++------- tests/unittest/_torch/misc/test_autotuner.py | 71 +++++++ 3 files changed, 229 insertions(+), 86 deletions(-) diff --git a/tensorrt_llm/_torch/autotuner.py b/tensorrt_llm/_torch/autotuner.py index 33ef41af8a9..6d20df8bbaf 100644 --- a/tensorrt_llm/_torch/autotuner.py +++ b/tensorrt_llm/_torch/autotuner.py @@ -49,12 +49,15 @@ class DynamicTensorSpec: input_idx: The index of the input tensor. dim_idx: The index of the dimension to tune. gen_tuning_buckets: A tuple of values to try or a function generating values. - map_to_tuning_buckets: A function to map dimensions to valid values during inference. + map_to_tuning_buckets: A function to map dimensions to valid values during tuning. + map_to_runtime_buckets: A function to map dimensions to valid values during inference. + If None, use map_to_tuning_buckets. """ input_idx: int dim_idx: int gen_tuning_buckets: Union[Tuple[int], Callable] = () map_to_tuning_buckets: Callable = lambda x: x + map_to_runtime_buckets: Optional[Callable] = None @dataclass(slots=True, unsafe_hash=True) @@ -392,6 +395,7 @@ def search_cache( runners: List[TunableRunner], input_shapes: Tuple[torch.Size], tuning_config: TuningConfig, + use_tuning_mapping: bool = False, ) -> Tuple[bool, int, int, Dict[str, Any], OptimizationProfile]: """Search for cached profiling results matching the current configuration. @@ -399,6 +403,8 @@ def search_cache( custom_op (str): The name of the custom operation to be tuned runners (List[TunableRunner]): List of candidate implementations to profile profile (OptimizationProfile): Optimization profile + use_tuning_mapping: If True, use map_to_tuning_buckets for cache key. + If False, use map_to_runtime_buckets for runtime cache lookups. 
Returns: A tuple containing: @@ -406,8 +412,10 @@ def search_cache( runner_id is the index in the current runners list """ for idx, r in enumerate(runners): - if (cache_key := self.get_cache_key(custom_op, r, input_shapes, - tuning_config)) in self.cache: + if (cache_key := + self.get_cache_key(custom_op, r, input_shapes, + tuning_config, + use_tuning_mapping)) in self.cache: # Return the current index in runners list, not the cached runner_id cached_runner_id, tactic, min_time = self.cache[cache_key] return True, idx, tactic, min_time @@ -420,6 +428,7 @@ def get_cache_key( runner: TunableRunner, input_shapes: Tuple[torch.Size], tuning_config: TuningConfig, + use_tuning_mapping: bool = False, ) -> Tuple: return ( custom_op, @@ -430,6 +439,7 @@ def get_cache_key( tuning_config.dynamic_tensor_specs, tuning_config.constraint_specs, tuning_config.tune_max_num_tokens, + use_tuning_mapping, ), ) @@ -841,7 +851,12 @@ def choose_one( for p in profiles: tensors = self._prepare_input_tensors(p, inputs) is_cache_hit, *_ = self.profiling_cache.search_cache( - custom_op, runners, p.get_opt_shapes(), tuning_config) + custom_op, + runners, + p.get_opt_shapes(), + tuning_config, + use_tuning_mapping=True, + ) if not is_cache_hit: # Initialize runner and tactic as None in case of no valid tactic or runners are found best_runner_id, best_tactic, min_time, has_tuning_failure_occurred = self._profile_runners( @@ -928,8 +943,11 @@ def _profile_runners( # Record the failed profiling combinations self.stats.failed_profiling_count[custom_op].add( self.profiling_cache.get_cache_key( - custom_op, runner, profile.get_opt_shapes(), - tuning_config)) + custom_op, + runner, + profile.get_opt_shapes(), + tuning_config, + use_tuning_mapping=True)) # Set time_measured to inf to notify the failure of the tactic. This can happen when `get_valid_tactics` mistakenly return wrong tactics # or some runtime error occurs during profiling. @@ -942,8 +960,11 @@ def _profile_runners( if best_runner_id is not None: # At least one valid (runner, tactic) pair is found cache_key = self.profiling_cache.get_cache_key( - custom_op, runners[best_runner_id], profile.get_opt_shapes(), - tuning_config) + custom_op, + runners[best_runner_id], + profile.get_opt_shapes(), + tuning_config, + use_tuning_mapping=True) self._debug_logger( f"[Autotuner] Profiling runner={runners[best_runner_id]}, tactic={best_tactic} for cache_key={cache_key}." @@ -1164,6 +1185,7 @@ def _find_nearest_profile( dynamic_tensor_specs: Tuple[DynamicTensorSpec, ...], constraint_specs: Tuple[ConstraintSpec, ...], tune_max_num_tokens: int = None, + use_tuning_mapping: bool = False, ) -> Tuple: """Find the nearest optimization profile for given inputs User can define their own nearest profile generation method to reduce the host overhead. @@ -1171,6 +1193,8 @@ def _find_nearest_profile( Args: shapes: Tuple of input tensor shapes tuning_config: Tuning configuration + use_tuning_mapping: If True, use map_to_tuning_buckets to store tuning cache. + If False, use map_to_runtime_buckets for runtime cache lookups. 
Return: Tuple: A tuple containing: @@ -1180,9 +1204,12 @@ def _find_nearest_profile( base_profile = list(list(shape) for shape in shapes) for spec in dynamic_tensor_specs: - base_profile[spec.input_idx][ - spec.dim_idx] = spec.map_to_tuning_buckets( - base_profile[spec.input_idx][spec.dim_idx]) + + bucket_mapper = spec.map_to_tuning_buckets + if not use_tuning_mapping and spec.map_to_runtime_buckets is not None: + bucket_mapper = spec.map_to_runtime_buckets + base_profile[spec.input_idx][spec.dim_idx] = bucket_mapper( + base_profile[spec.input_idx][spec.dim_idx]) if tune_max_num_tokens is not None: base_profile[spec.input_idx][spec.dim_idx] = min( diff --git a/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py b/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py index a8236d88fcf..7755f2aba65 100644 --- a/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py @@ -207,8 +207,8 @@ def __init__(self, num_experts: int, top_k: int, n_group: Optional[int], self.routing_method_type = routing_method_type self.do_finalize = do_finalize - FP4BlockScaleMoERunner.tuning_config = FP4BlockScaleMoERunner.get_tuning_config( - ) + self.tuning_config = FP4BlockScaleMoERunner.get_tuning_config( + self.num_experts // self.local_num_experts) # The unique_id is used by the autotuner to get the cache key, so we hash on members # that influence tactic validity here. e.g. we are tuning FC1 and FC2 so the routing type does not matter @@ -269,17 +269,24 @@ def get_valid_tactics(self, inputs: List[torch.Tensor], return tactics @classmethod - def get_dynamic_tensor_specs(cls) -> Tuple[DynamicTensorSpec, ...]: + def get_dynamic_tensor_specs(cls, + ep_size: int) -> Tuple[DynamicTensorSpec, ...]: HIDDEN_STATES_IDX = 2 TUNED_DIM = 0 MAX_PROFILE_BUCKET = 4096 m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) - round_rule = lambda x: min(last_positive_power_of_2(x), - MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, TUNED_DIM, m_values, - round_rule), ) + def round_rule(x: int, ep_size_: int) -> int: + value = last_positive_power_of_2(x) // ep_size_ + return min(max(1, value), MAX_PROFILE_BUCKET) + + specs = (DynamicTensorSpec( + HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=lambda x: round_rule(x, 1), + map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) return specs @@ -340,9 +347,9 @@ def _constrain_fp4_linear_layout(shapes: Tuple[torch.Size]) -> int: @classmethod @lru_cache(maxsize=None) - def get_tuning_config(cls) -> TuningConfig: + def get_tuning_config(cls, ep_size: int) -> TuningConfig: - dynamic_tensor_specs = cls.get_dynamic_tensor_specs() + dynamic_tensor_specs = cls.get_dynamic_tensor_specs(ep_size) constraint_specs = cls.get_constraint_specs() tuning_config = TuningConfig(dynamic_tensor_specs=dynamic_tensor_specs, @@ -399,7 +406,7 @@ def fp4_block_scale_moe_runner( topk_ids=topk_ids, hidden_states=hidden_states, routing_logits=routing_logits, - base_tuning_config=FP4BlockScaleMoERunner.get_tuning_config(), + base_tuning_config=kernel_runner.tuning_config, top_k=top_k, num_experts=num_experts, n_group=n_group, @@ -550,8 +557,8 @@ def __init__( self.routed_scaling_factor = routed_scaling_factor self.routing_method_type = routing_method_type - FP8BlockScaleMoERunner.tuning_config = FP8BlockScaleMoERunner.get_tuning_config( - ) + self.tuning_config = FP8BlockScaleMoERunner.get_tuning_config( + self.num_experts // self.local_num_experts) # The unique_id is used by the 
autotuner to get the cache key, so we hash on members # that influence tactic validity here. e.g. we are tuning FC1 and FC2 so the routing @@ -608,18 +615,24 @@ def get_valid_tactics(self, inputs: List[torch.Tensor], return tactics @classmethod - def get_dynamic_tensor_specs(cls) -> Tuple[DynamicTensorSpec, ...]: + def get_dynamic_tensor_specs(cls, + ep_size: int) -> Tuple[DynamicTensorSpec, ...]: HIDDEN_STATES_IDX = 2 TUNED_DIM = 0 - MAX_PROFILE_BUCKET = 4096 m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) - round_rule = lambda x: min(last_positive_power_of_2(x), - MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, TUNED_DIM, m_values, - round_rule), ) + def round_rule(x: int, ep_size_: int) -> int: + value = last_positive_power_of_2(x) // ep_size_ + return min(max(1, value), MAX_PROFILE_BUCKET) + + specs = (DynamicTensorSpec( + HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=lambda x: round_rule(x, 1), + map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) return specs @@ -662,9 +675,9 @@ def _constrain_to_num_tokens(shapes: Tuple[torch.Size]) -> int: @classmethod @lru_cache(maxsize=None) - def get_tuning_config(cls) -> TuningConfig: + def get_tuning_config(cls, ep_size: int) -> TuningConfig: - dynamic_tensor_specs = cls.get_dynamic_tensor_specs() + dynamic_tensor_specs = cls.get_dynamic_tensor_specs(ep_size) constraint_specs = cls.get_constraint_specs() tuning_config = TuningConfig(dynamic_tensor_specs=dynamic_tensor_specs, @@ -696,19 +709,17 @@ def fp8_block_scale_moe_runner( topk_ids: Optional[torch.Tensor] = None) -> torch.Tensor: tuner = AutoTuner.get() - kernel_runners = [ - FP8BlockScaleMoERunner( - num_experts, - top_k, - n_group, - topk_group, - intermediate_size, - local_expert_offset, - local_num_experts, - routed_scaling_factor, - routing_method_type, - ) - ] + kernel_runner = FP8BlockScaleMoERunner( + num_experts, + top_k, + n_group, + topk_group, + intermediate_size, + local_expert_offset, + local_num_experts, + routed_scaling_factor, + routing_method_type, + ) # Prepare dummy topk tensors and hook for AutoTuner profiling routing_logits_for_tuner, topk_weights_for_tuner, topk_ids_for_tuner, tuning_config_with_hook = \ @@ -718,7 +729,7 @@ def fp8_block_scale_moe_runner( topk_ids=topk_ids, hidden_states=hidden_states, routing_logits=routing_logits, - base_tuning_config=FP8BlockScaleMoERunner.get_tuning_config(), + base_tuning_config=kernel_runner.tuning_config, top_k=top_k, num_experts=num_experts, n_group=n_group, @@ -742,7 +753,7 @@ def fp8_block_scale_moe_runner( kernel_runner, best_tactic = tuner.choose_one( "trtllm::fp8_block_scale_moe_runner", - kernel_runners, + [kernel_runner], tuning_config_with_hook, input_tensors_for_tuner, ) @@ -827,8 +838,8 @@ def __init__(self, num_experts: int, top_k: int, n_group: Optional[int], self.routing_method_type = routing_method_type self.act_type = act_type - MxE4m3MxE2m1BlockScaleMoERunner.tuning_config = MxE4m3MxE2m1BlockScaleMoERunner.get_tuning_config( - ) + self.tuning_config = MxE4m3MxE2m1BlockScaleMoERunner.get_tuning_config( + self.num_experts // self.local_num_experts) # The unique_id is used by the autotuner to get the cache key, so we hash on members # that influence tactic validity here. e.g. 
we are tuning FC1 and FC2 so the routing @@ -899,15 +910,24 @@ def get_valid_tactics(self, inputs: List[torch.Tensor], return tactics @classmethod - def get_dynamic_tensor_specs(cls) -> Tuple[DynamicTensorSpec, ...]: + def get_dynamic_tensor_specs(cls, + ep_size: int) -> Tuple[DynamicTensorSpec, ...]: HIDDEN_STATES_IDX = 2 TUNED_DIM = 0 + MAX_PROFILE_BUCKET = 4096 - m_values = get_last_power_of_2_num_tokens_buckets(4096) - round_rule = lambda x: min(last_positive_power_of_2(x), 4096) + m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) + + def round_rule(x: int, ep_size_: int) -> int: + value = last_positive_power_of_2(x) // ep_size_ + return min(max(1, value), MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, TUNED_DIM, m_values, - round_rule), ) + specs = (DynamicTensorSpec( + HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=lambda x: round_rule(x, 1), + map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) return specs @@ -961,9 +981,9 @@ def _constrain_routing_logits(shapes: Tuple[torch.Size]) -> int: @classmethod @lru_cache(maxsize=None) - def get_tuning_config(cls) -> TuningConfig: + def get_tuning_config(cls, ep_size: int) -> TuningConfig: - dynamic_tensor_specs = cls.get_dynamic_tensor_specs() + dynamic_tensor_specs = cls.get_dynamic_tensor_specs(ep_size) constraint_specs = cls.get_constraint_specs() tuning_config = TuningConfig(dynamic_tensor_specs=dynamic_tensor_specs, @@ -1028,7 +1048,7 @@ def mxe4m3_mxe2m1_block_scale_moe_runner( topk_ids=topk_ids, hidden_states=hidden_states, routing_logits=routing_logits, - base_tuning_config=MxE4m3MxE2m1BlockScaleMoERunner.get_tuning_config(), + base_tuning_config=kernel_runner.tuning_config, top_k=top_k, num_experts=num_experts, n_group=n_group, @@ -1118,8 +1138,8 @@ def __init__(self, num_experts: int, top_k: int, n_group: Optional[int], self.routing_method_type = routing_method_type self.act_type = act_type - E4m3MxE2m1BlockScaleMoERunner.tuning_config = E4m3MxE2m1BlockScaleMoERunner.get_tuning_config( - ) + self.tuning_config = E4m3MxE2m1BlockScaleMoERunner.get_tuning_config( + self.num_experts // self.local_num_experts) # The unique_id is used by the autotuner to get the cache key, so we hash on members # that influence tactic validity here. e.g. 
we are tuning FC1 and FC2 so the routing @@ -1190,15 +1210,24 @@ def get_valid_tactics(self, inputs: List[torch.Tensor], return tactics @classmethod - def get_dynamic_tensor_specs(cls) -> Tuple[DynamicTensorSpec, ...]: + def get_dynamic_tensor_specs(cls, + ep_size: int) -> Tuple[DynamicTensorSpec, ...]: HIDDEN_STATES_IDX = 2 TUNED_DIM = 0 + MAX_PROFILE_BUCKET = 4096 - m_values = get_last_power_of_2_num_tokens_buckets(4096) - round_rule = lambda x: min(last_positive_power_of_2(x), 4096) + m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) + + def round_rule(x: int, ep_size_: int) -> int: + value = last_positive_power_of_2(x) // ep_size_ + return min(max(1, value), MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, TUNED_DIM, m_values, - round_rule), ) + specs = (DynamicTensorSpec( + HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=lambda x: round_rule(x, 1), + map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) return specs @@ -1232,9 +1261,9 @@ def _constrain_routing_logits(shapes: Tuple[torch.Size]) -> int: @classmethod @lru_cache(maxsize=None) - def get_tuning_config(cls) -> TuningConfig: + def get_tuning_config(cls, ep_size: int) -> TuningConfig: - dynamic_tensor_specs = cls.get_dynamic_tensor_specs() + dynamic_tensor_specs = cls.get_dynamic_tensor_specs(ep_size) constraint_specs = cls.get_constraint_specs() tuning_config = TuningConfig(dynamic_tensor_specs=dynamic_tensor_specs, @@ -1300,7 +1329,7 @@ def e4m3_mxe2m1_block_scale_moe_runner( topk_ids=topk_ids, hidden_states=hidden_states, routing_logits=routing_logits, - base_tuning_config=E4m3MxE2m1BlockScaleMoERunner.get_tuning_config(), + base_tuning_config=kernel_runner.tuning_config, top_k=top_k, num_experts=num_experts, n_group=n_group, @@ -1389,8 +1418,8 @@ def __init__(self, num_experts: int, top_k: int, n_group: Optional[int], self.routing_method_type = routing_method_type self.act_type = act_type - Bf16MxE2m1BlockScaleMoERunner.tuning_config = Bf16MxE2m1BlockScaleMoERunner.get_tuning_config( - ) + self.tuning_config = Bf16MxE2m1BlockScaleMoERunner.get_tuning_config( + self.num_experts // self.local_num_experts) # The unique_id is used by the autotuner to get the cache key, so we hash on members # that influence tactic validity here. e.g. 
we are tuning FC1 and FC2 so the routing @@ -1459,15 +1488,24 @@ def get_valid_tactics(self, inputs: List[torch.Tensor], return tactics @classmethod - def get_dynamic_tensor_specs(cls) -> Tuple[DynamicTensorSpec, ...]: + def get_dynamic_tensor_specs(cls, + ep_size: int) -> Tuple[DynamicTensorSpec, ...]: HIDDEN_STATES_IDX = 2 TUNED_DIM = 0 + MAX_PROFILE_BUCKET = 4096 - m_values = get_last_power_of_2_num_tokens_buckets(4096) - round_rule = lambda x: min(last_positive_power_of_2(x), 4096) + m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) + + def round_rule(x: int, ep_size_: int) -> int: + value = last_positive_power_of_2(x) // ep_size_ + return min(max(1, value), MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, TUNED_DIM, m_values, - round_rule), ) + specs = (DynamicTensorSpec( + HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=lambda x: round_rule(x, 1), + map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) return specs @@ -1501,9 +1539,9 @@ def _constrain_routing_logits(shapes: Tuple[torch.Size]) -> int: @classmethod @lru_cache(maxsize=None) - def get_tuning_config(cls) -> TuningConfig: + def get_tuning_config(cls, ep_size: int) -> TuningConfig: - dynamic_tensor_specs = cls.get_dynamic_tensor_specs() + dynamic_tensor_specs = cls.get_dynamic_tensor_specs(ep_size) constraint_specs = cls.get_constraint_specs() tuning_config = TuningConfig(dynamic_tensor_specs=dynamic_tensor_specs, @@ -1566,7 +1604,7 @@ def bf16_mxe2m1_block_scale_moe_runner( topk_ids=topk_ids, hidden_states=hidden_states, routing_logits=routing_logits, - base_tuning_config=Bf16MxE2m1BlockScaleMoERunner.get_tuning_config(), + base_tuning_config=kernel_runner.tuning_config, top_k=top_k, num_experts=num_experts, n_group=n_group, @@ -1650,8 +1688,8 @@ def __init__(self, num_experts: int, top_k: int, n_group: Optional[int], self.do_finalize = do_finalize self.act_type = act_type - FP8FP4BlockScaleMoERunner.tuning_config = FP8FP4BlockScaleMoERunner.get_tuning_config( - ) + self.tuning_config = FP8FP4BlockScaleMoERunner.get_tuning_config( + self.num_experts // self.local_num_experts) def unique_id(self): return ( @@ -1713,17 +1751,24 @@ def get_valid_tactics(self, inputs: List[torch.Tensor], return tactics @classmethod - def get_dynamic_tensor_specs(cls) -> Tuple[DynamicTensorSpec, ...]: + def get_dynamic_tensor_specs(cls, + ep_size: int) -> Tuple[DynamicTensorSpec, ...]: HIDDEN_STATES_IDX = 2 TUNED_DIM = 0 MAX_PROFILE_BUCKET = 4096 m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) - round_rule = lambda x: min(last_positive_power_of_2(x), - MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, TUNED_DIM, m_values, - round_rule), ) + def round_rule(x: int, ep_size_: int) -> int: + value = last_positive_power_of_2(x) // ep_size_ + return min(max(1, value), MAX_PROFILE_BUCKET) + + specs = (DynamicTensorSpec( + HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=lambda x: round_rule(x, 1), + map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) return specs @@ -1759,9 +1804,9 @@ def _constrain_to_num_tokens(shapes: Tuple[torch.Size]) -> int: @classmethod @lru_cache(maxsize=None) - def get_tuning_config(cls) -> TuningConfig: + def get_tuning_config(cls, ep_size: int) -> TuningConfig: - dynamic_tensor_specs = cls.get_dynamic_tensor_specs() + dynamic_tensor_specs = cls.get_dynamic_tensor_specs(ep_size) constraint_specs = cls.get_constraint_specs() tuning_config = TuningConfig(dynamic_tensor_specs=dynamic_tensor_specs, 
@@ -1820,7 +1865,7 @@ def fp8_fp4_block_scale_moe_runner( topk_ids=topk_ids, hidden_states=hidden_states, routing_logits=routing_logits, - base_tuning_config=FP8FP4BlockScaleMoERunner.get_tuning_config(), + base_tuning_config=kernel_runner.tuning_config, top_k=top_k, num_experts=num_experts, n_group=n_group, diff --git a/tests/unittest/_torch/misc/test_autotuner.py b/tests/unittest/_torch/misc/test_autotuner.py index a6116d544f2..da760ac5e65 100644 --- a/tests/unittest/_torch/misc/test_autotuner.py +++ b/tests/unittest/_torch/misc/test_autotuner.py @@ -171,6 +171,77 @@ def test_autotuner_cache_basic(): m //= 2 +def test_runtime_bucket_mapping(): + """Test that map_to_runtime_buckets correctly maps runtime sizes to tuning buckets. + + This test demonstrates the distinction between map_to_tuning_buckets and map_to_runtime_buckets: + - map_to_tuning_buckets: used during tuning to store cache keys with raw bucket values + - map_to_runtime_buckets: used during runtime to map input sizes to tuning buckets + + With sparsity=0.25, the buffer contains 25% actual work: + - Tuning stores buckets: 1, 2, 4, 8, 16, 32 + - Runtime buffer 4 -> maps to bucket int(4 * 0.25) = 1 + - Runtime buffer 16 -> maps to bucket int(16 * 0.25) = 4 + + In MoE EP, the input buffer is allocated for worst-case but sparsely filled. + Using map_to_runtime_buckets allows us to map buffer size to actual work size. + """ + w = torch.randn(64, 128) + tuner = AutoTuner.get() + tuner.clear_cache() + + # Sparsity indicates the fraction of buffer containing valid work + def bucket_mapper(x: int, sparsity: float) -> int: + return int(x * sparsity) + + tuning_config = TuningConfig(dynamic_tensor_specs=(DynamicTensorSpec( + input_idx=0, + dim_idx=0, + gen_tuning_buckets=get_power_of_2_num_tokens_buckets(M), + map_to_tuning_buckets=lambda x: bucket_mapper(x, sparsity=1.0), + map_to_runtime_buckets=lambda x: bucket_mapper(x, sparsity=0.25)), ), ) + + with autotune(): + tuner.choose_one("test_runtime_bucket_mapping", [GemmRunner()], + tuning_config, [torch.randn(1, 64), w]) + + # Verify cache entries use raw tuning bucket values, not deflated values + cache_entries = tuner.profiling_cache.get_specific_custom_op( + "test_runtime_bucket_mapping") + + # Extract the first dimension of the first input shape from each cache key + assert len(cache_entries) == len(tuning_config.dynamic_tensor_specs[0].gen_tuning_buckets), \ + f"Expected len(({len(tuning_config.dynamic_tensor_specs[0].gen_tuning_buckets)}) cache entries, got len({(cache_entries)})" + + # Test runtime mapping: buffer size is mapped via map_to_runtime_buckets + # to find the correct tuning bucket based on actual work size + test_cases = [ + # size 4 -> valid work size (4*0.25)=1, tactic 0 since 1 <= M//2 + (4, 1, 0), + # size 8 -> valid work size (8*0.25)=2, tactic 0 since 2 <= M//2 + (8, 2, 0), + # size 16 -> valid work size (16*0.25)=4, tactic 0 since 4 <= M//2 + (16, 4, 0), + # size 32 -> valid work size (32*0.25)=8, tactic 0 since 8 <= M//2 + (32, 8, 0), + # size 64 -> valid work size (64*0.25)=16, tactic 0 since 16 <= M//2 + (64, 16, 0), + # size 128 -> valid work size (128*0.25)=32, tactic 1 since 32 > M//2 + (128, 32, 1), + # size 256 -> valid work size (256*0.25)=64, tactic -1 since 64 > M + (256, 64, -1), + ] + + for buffer_size, valid_size, expected_tactic in test_cases: + # Verify cache lookup succeeds with the mapped bucket + x = torch.randn(buffer_size, 64) + runner, tactic = tuner.choose_one("test_runtime_bucket_mapping", + [GemmRunner()], tuning_config, [x, w]) + 
assert ( + tactic == expected_tactic + ), f"buffer size={buffer_size} -> valid work size={valid_size}, expected tactic {expected_tactic} but got {tactic}" + + def test_autotuner_try_block(): class PartialCrashedRunner(TunableRunner): From 4121b670051ec0aa7ba52b0f606b2f0197fb37cc Mon Sep 17 00:00:00 2001 From: Anthony Chang <27950904+rosenrodt@users.noreply.github.com> Date: Fri, 19 Dec 2025 11:47:53 +0800 Subject: [PATCH 2/3] [breaking] map_to_tuning_buckets maps input during inference only Signed-off-by: Anthony Chang <27950904+rosenrodt@users.noreply.github.com> --- tensorrt_llm/_torch/autotuner.py | 57 +++++++------ .../custom_ops/trtllm_gen_custom_ops.py | 84 ++++++++----------- tests/unittest/_torch/misc/test_autotuner.py | 43 +++++----- 3 files changed, 86 insertions(+), 98 deletions(-) diff --git a/tensorrt_llm/_torch/autotuner.py b/tensorrt_llm/_torch/autotuner.py index 6d20df8bbaf..f2cc354b641 100644 --- a/tensorrt_llm/_torch/autotuner.py +++ b/tensorrt_llm/_torch/autotuner.py @@ -49,15 +49,12 @@ class DynamicTensorSpec: input_idx: The index of the input tensor. dim_idx: The index of the dimension to tune. gen_tuning_buckets: A tuple of values to try or a function generating values. - map_to_tuning_buckets: A function to map dimensions to valid values during tuning. - map_to_runtime_buckets: A function to map dimensions to valid values during inference. - If None, use map_to_tuning_buckets. + map_to_tuning_buckets: A function to map dimensions to tuning buckets during inference. """ input_idx: int dim_idx: int gen_tuning_buckets: Union[Tuple[int], Callable] = () map_to_tuning_buckets: Callable = lambda x: x - map_to_runtime_buckets: Optional[Callable] = None @dataclass(slots=True, unsafe_hash=True) @@ -84,7 +81,7 @@ class TuningConfig: should be tuned to optimize performance. Each spec defines: - Which input tensor dimension is dynamic - How to generate tuning values - - How to map dimensions to valid values during inference + - How to map dimensions to tuning values during inference Example: >>> config = TuningConfig( @@ -395,7 +392,7 @@ def search_cache( runners: List[TunableRunner], input_shapes: Tuple[torch.Size], tuning_config: TuningConfig, - use_tuning_mapping: bool = False, + apply_map_to_tuning_buckets: bool = True, ) -> Tuple[bool, int, int, Dict[str, Any], OptimizationProfile]: """Search for cached profiling results matching the current configuration. @@ -403,8 +400,8 @@ def search_cache( custom_op (str): The name of the custom operation to be tuned runners (List[TunableRunner]): List of candidate implementations to profile profile (OptimizationProfile): Optimization profile - use_tuning_mapping: If True, use map_to_tuning_buckets for cache key. - If False, use map_to_runtime_buckets for runtime cache lookups. + apply_map_to_tuning_buckets: If True, apply map_to_tuning_buckets for runtime cache lookups. + If False, use raw bucket values for tuning cache storage. 
Returns: A tuple containing: @@ -412,10 +409,9 @@ def search_cache( runner_id is the index in the current runners list """ for idx, r in enumerate(runners): - if (cache_key := - self.get_cache_key(custom_op, r, input_shapes, - tuning_config, - use_tuning_mapping)) in self.cache: + if (cache_key := self.get_cache_key( + custom_op, r, input_shapes, tuning_config, + apply_map_to_tuning_buckets)) in self.cache: # Return the current index in runners list, not the cached runner_id cached_runner_id, tactic, min_time = self.cache[cache_key] return True, idx, tactic, min_time @@ -428,7 +424,7 @@ def get_cache_key( runner: TunableRunner, input_shapes: Tuple[torch.Size], tuning_config: TuningConfig, - use_tuning_mapping: bool = False, + apply_map_to_tuning_buckets: bool = True, ) -> Tuple: return ( custom_op, @@ -439,7 +435,7 @@ def get_cache_key( tuning_config.dynamic_tensor_specs, tuning_config.constraint_specs, tuning_config.tune_max_num_tokens, - use_tuning_mapping, + apply_map_to_tuning_buckets, ), ) @@ -805,7 +801,11 @@ def choose_one( input_shapes = tuple(self._get_input_sizes(inputs)) is_cache_hit, best_runner_id, best_tactic, min_time = self.profiling_cache.search_cache( - custom_op, runners, input_shapes, tuning_config) + custom_op, + runners, + input_shapes, + tuning_config, + apply_map_to_tuning_buckets=True) # Early return if it's not tuning, use cache found one or fallback one if not self.is_tuning_mode: @@ -855,7 +855,7 @@ def choose_one( runners, p.get_opt_shapes(), tuning_config, - use_tuning_mapping=True, + apply_map_to_tuning_buckets=False, ) if not is_cache_hit: # Initialize runner and tactic as None in case of no valid tactic or runners are found @@ -947,7 +947,7 @@ def _profile_runners( runner, profile.get_opt_shapes(), tuning_config, - use_tuning_mapping=True)) + apply_map_to_tuning_buckets=False)) # Set time_measured to inf to notify the failure of the tactic. This can happen when `get_valid_tactics` mistakenly return wrong tactics # or some runtime error occurs during profiling. @@ -964,7 +964,7 @@ def _profile_runners( runners[best_runner_id], profile.get_opt_shapes(), tuning_config, - use_tuning_mapping=True) + apply_map_to_tuning_buckets=False) self._debug_logger( f"[Autotuner] Profiling runner={runners[best_runner_id]}, tactic={best_tactic} for cache_key={cache_key}." @@ -1141,8 +1141,7 @@ def _optimization_profiles( # Add the current input value as one of the opt values opt_shapes = set(opt_shapes) opt_shapes.add( - spec.map_to_tuning_buckets( - base_profile.shapes[spec.input_idx][spec.dim_idx].val)) + base_profile.shapes[spec.input_idx][spec.dim_idx].val) opt_shapes = sorted(list(opt_shapes)) opt_shapes_max = tuple(opt_shapes[1:]) + (float('inf'), ) opt_shapes_max = { @@ -1185,7 +1184,7 @@ def _find_nearest_profile( dynamic_tensor_specs: Tuple[DynamicTensorSpec, ...], constraint_specs: Tuple[ConstraintSpec, ...], tune_max_num_tokens: int = None, - use_tuning_mapping: bool = False, + apply_map_to_tuning_buckets: bool = True, ) -> Tuple: """Find the nearest optimization profile for given inputs User can define their own nearest profile generation method to reduce the host overhead. @@ -1193,8 +1192,8 @@ def _find_nearest_profile( Args: shapes: Tuple of input tensor shapes tuning_config: Tuning configuration - use_tuning_mapping: If True, use map_to_tuning_buckets to store tuning cache. - If False, use map_to_runtime_buckets for runtime cache lookups. + apply_map_to_tuning_buckets: If True, apply map_to_tuning_buckets for runtime cache lookups. 
+ If False, use raw bucket values for tuning cache storage. Return: Tuple: A tuple containing: @@ -1204,12 +1203,12 @@ def _find_nearest_profile( base_profile = list(list(shape) for shape in shapes) for spec in dynamic_tensor_specs: - - bucket_mapper = spec.map_to_tuning_buckets - if not use_tuning_mapping and spec.map_to_runtime_buckets is not None: - bucket_mapper = spec.map_to_runtime_buckets - base_profile[spec.input_idx][spec.dim_idx] = bucket_mapper( - base_profile[spec.input_idx][spec.dim_idx]) + # During runtime: apply map_to_tuning_buckets to map input to bucket + # During tuning: no mapper, use raw bucket value + if apply_map_to_tuning_buckets: + base_profile[spec.input_idx][ + spec.dim_idx] = spec.map_to_tuning_buckets( + base_profile[spec.input_idx][spec.dim_idx]) if tune_max_num_tokens is not None: base_profile[spec.input_idx][spec.dim_idx] = min( diff --git a/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py b/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py index 7755f2aba65..7802ba4243a 100644 --- a/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py @@ -277,16 +277,14 @@ def get_dynamic_tensor_specs(cls, m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) - def round_rule(x: int, ep_size_: int) -> int: - value = last_positive_power_of_2(x) // ep_size_ + def round_rule(x: int) -> int: + value = last_positive_power_of_2(x) // ep_size return min(max(1, value), MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec( - HIDDEN_STATES_IDX, - TUNED_DIM, - m_values, - map_to_tuning_buckets=lambda x: round_rule(x, 1), - map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) + specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=round_rule), ) return specs @@ -623,16 +621,14 @@ def get_dynamic_tensor_specs(cls, m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) - def round_rule(x: int, ep_size_: int) -> int: - value = last_positive_power_of_2(x) // ep_size_ + def round_rule(x: int) -> int: + value = last_positive_power_of_2(x) // ep_size return min(max(1, value), MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec( - HIDDEN_STATES_IDX, - TUNED_DIM, - m_values, - map_to_tuning_buckets=lambda x: round_rule(x, 1), - map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) + specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=round_rule), ) return specs @@ -918,16 +914,14 @@ def get_dynamic_tensor_specs(cls, m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) - def round_rule(x: int, ep_size_: int) -> int: - value = last_positive_power_of_2(x) // ep_size_ + def round_rule(x: int) -> int: + value = last_positive_power_of_2(x) // ep_size return min(max(1, value), MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec( - HIDDEN_STATES_IDX, - TUNED_DIM, - m_values, - map_to_tuning_buckets=lambda x: round_rule(x, 1), - map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) + specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=round_rule), ) return specs @@ -1218,16 +1212,14 @@ def get_dynamic_tensor_specs(cls, m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) - def round_rule(x: int, ep_size_: int) -> int: - value = last_positive_power_of_2(x) // ep_size_ + def round_rule(x: int) -> int: + value = last_positive_power_of_2(x) // ep_size return min(max(1, value), MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec( - 
HIDDEN_STATES_IDX, - TUNED_DIM, - m_values, - map_to_tuning_buckets=lambda x: round_rule(x, 1), - map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) + specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=round_rule), ) return specs @@ -1496,16 +1488,14 @@ def get_dynamic_tensor_specs(cls, m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) - def round_rule(x: int, ep_size_: int) -> int: - value = last_positive_power_of_2(x) // ep_size_ + def round_rule(x: int) -> int: + value = last_positive_power_of_2(x) // ep_size return min(max(1, value), MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec( - HIDDEN_STATES_IDX, - TUNED_DIM, - m_values, - map_to_tuning_buckets=lambda x: round_rule(x, 1), - map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) + specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=round_rule), ) return specs @@ -1759,16 +1749,14 @@ def get_dynamic_tensor_specs(cls, m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) - def round_rule(x: int, ep_size_: int) -> int: - value = last_positive_power_of_2(x) // ep_size_ + def round_rule(x: int) -> int: + value = last_positive_power_of_2(x) // ep_size return min(max(1, value), MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec( - HIDDEN_STATES_IDX, - TUNED_DIM, - m_values, - map_to_tuning_buckets=lambda x: round_rule(x, 1), - map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) + specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=round_rule), ) return specs diff --git a/tests/unittest/_torch/misc/test_autotuner.py b/tests/unittest/_torch/misc/test_autotuner.py index da760ac5e65..fcbf6550bdf 100644 --- a/tests/unittest/_torch/misc/test_autotuner.py +++ b/tests/unittest/_torch/misc/test_autotuner.py @@ -38,18 +38,21 @@ def test_multi_dynamic_dims(): tuner = autotuner.AutoTuner() x = torch.rand([5, 1024]) - w = torch.rand([7, 19]) + w = torch.rand([7, 9]) dynamic_tensor_specs = ( DynamicTensorSpec(0, 0, [1, 3, 5]), DynamicTensorSpec(0, 1, [16, 24, 1024]), - DynamicTensorSpec(1, 1, [3, 7, 9], lambda x: x // 2), + # map_to_tuning_buckets is only applied at runtime, not during tuning + DynamicTensorSpec(1, + 1, [3, 7, 9], + map_to_tuning_buckets=lambda x: x // 2), ) profiles = tuner._optimization_profiles( tuning_config=TuningConfig(dynamic_tensor_specs=dynamic_tensor_specs), inputs=[x, w]) # choice(0, 0) * choice(0, 1) * choice(1, 1) - # 3 * 3 * 3 = 27, because 19 is mapped to 9 and already inside the bucket + # 3 * 3 * 3 = 27, input value 9 is already inside the bucket assert len(profiles) == 27 sample_0 = OptimizationProfile(shapes=[[ DynamicDim(min=1, opt=1, max=3), @@ -171,47 +174,45 @@ def test_autotuner_cache_basic(): m //= 2 -def test_runtime_bucket_mapping(): - """Test that map_to_runtime_buckets correctly maps runtime sizes to tuning buckets. +def test_bucket_mapping(): + """Test that map_to_tuning_buckets correctly maps runtime sizes to tuning buckets. 
- This test demonstrates the distinction between map_to_tuning_buckets and map_to_runtime_buckets: - - map_to_tuning_buckets: used during tuning to store cache keys with raw bucket values - - map_to_runtime_buckets: used during runtime to map input sizes to tuning buckets + This test demonstrates the single mapper approach: + - During tuning: NO mapper is applied, raw bucket values are used as cache keys + - During runtime: map_to_tuning_buckets is applied to map buffer size to actual work size With sparsity=0.25, the buffer contains 25% actual work: - - Tuning stores buckets: 1, 2, 4, 8, 16, 32 + - Tuning stores buckets: 1, 2, 4, 8, 16, 32 as raw cache keys - Runtime buffer 4 -> maps to bucket int(4 * 0.25) = 1 - Runtime buffer 16 -> maps to bucket int(16 * 0.25) = 4 In MoE EP, the input buffer is allocated for worst-case but sparsely filled. - Using map_to_runtime_buckets allows us to map buffer size to actual work size. + Using map_to_tuning_buckets allows us to map buffer size to actual work size at runtime. """ w = torch.randn(64, 128) tuner = AutoTuner.get() tuner.clear_cache() # Sparsity indicates the fraction of buffer containing valid work - def bucket_mapper(x: int, sparsity: float) -> int: - return int(x * sparsity) + sparsity = 0.25 tuning_config = TuningConfig(dynamic_tensor_specs=(DynamicTensorSpec( input_idx=0, dim_idx=0, gen_tuning_buckets=get_power_of_2_num_tokens_buckets(M), - map_to_tuning_buckets=lambda x: bucket_mapper(x, sparsity=1.0), - map_to_runtime_buckets=lambda x: bucket_mapper(x, sparsity=0.25)), ), ) + map_to_tuning_buckets=lambda x: int(x * sparsity)), ), ) with autotune(): - tuner.choose_one("test_runtime_bucket_mapping", [GemmRunner()], - tuning_config, [torch.randn(1, 64), w]) + tuner.choose_one("test_bucket_mapping", [GemmRunner()], tuning_config, + [torch.randn(1, 64), w]) - # Verify cache entries use raw tuning bucket values, not deflated values + # Verify cache entries use raw tuning bucket values cache_entries = tuner.profiling_cache.get_specific_custom_op( - "test_runtime_bucket_mapping") + "test_bucket_mapping") # Extract the first dimension of the first input shape from each cache key assert len(cache_entries) == len(tuning_config.dynamic_tensor_specs[0].gen_tuning_buckets), \ - f"Expected len(({len(tuning_config.dynamic_tensor_specs[0].gen_tuning_buckets)}) cache entries, got len({(cache_entries)})" + f"Expected {len(tuning_config.dynamic_tensor_specs[0].gen_tuning_buckets)} cache entries, got {len(cache_entries)}" # Test runtime mapping: buffer size is mapped via map_to_runtime_buckets # to find the correct tuning bucket based on actual work size @@ -235,8 +236,8 @@ def bucket_mapper(x: int, sparsity: float) -> int: for buffer_size, valid_size, expected_tactic in test_cases: # Verify cache lookup succeeds with the mapped bucket x = torch.randn(buffer_size, 64) - runner, tactic = tuner.choose_one("test_runtime_bucket_mapping", - [GemmRunner()], tuning_config, [x, w]) + runner, tactic = tuner.choose_one("test_bucket_mapping", [GemmRunner()], + tuning_config, [x, w]) assert ( tactic == expected_tactic ), f"buffer size={buffer_size} -> valid work size={valid_size}, expected tactic {expected_tactic} but got {tactic}" From 2a4eb963d50f1b58ed457202cd6db9d3f5e6261e Mon Sep 17 00:00:00 2001 From: Anthony Chang <27950904+rosenrodt@users.noreply.github.com> Date: Fri, 2 Jan 2026 12:14:52 +0800 Subject: [PATCH 3/3] clip input shape to max tunable token count Signed-off-by: Anthony Chang <27950904+rosenrodt@users.noreply.github.com> --- 
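Note (illustration only, not part of the commit): a minimal sketch of the clamping this patch adds in _optimization_profiles, under the assumption that tune_max_num_tokens and the bucket set behave as in the hunk below; collect_opt_shapes and the example numbers are hypothetical and only stand in for the real code paths.

    from typing import Iterable, Optional

    def collect_opt_shapes(gen_tuning_buckets: Iterable[int],
                           current_val: int,
                           tune_max_num_tokens: Optional[int]) -> list:
        # Start from the configured tuning buckets.
        opt_shapes = set(gen_tuning_buckets)
        if tune_max_num_tokens is not None:
            # Clip the current input value so an oversized input reuses the
            # largest tunable bucket instead of creating an extra profile.
            opt_shapes.add(min(tune_max_num_tokens, current_val))
        else:
            opt_shapes.add(current_val)
        return sorted(opt_shapes)

    # Buckets up to 64, current batch of 100 tokens, tune_max_num_tokens=64:
    # the result stays [1, 2, 4, 8, 16, 32, 64]; without clipping it would also
    # contain 100 and generate a profile beyond the tunable range.
    assert collect_opt_shapes((1, 2, 4, 8, 16, 32, 64), 100, 64) == [1, 2, 4, 8, 16, 32, 64]
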
tensorrt_llm/_torch/autotuner.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/_torch/autotuner.py b/tensorrt_llm/_torch/autotuner.py index f2cc354b641..41caf648adf 100644 --- a/tensorrt_llm/_torch/autotuner.py +++ b/tensorrt_llm/_torch/autotuner.py @@ -1140,8 +1140,15 @@ def _optimization_profiles( opt_shapes = spec.gen_tuning_buckets # Add the current input value as one of the opt values opt_shapes = set(opt_shapes) - opt_shapes.add( - base_profile.shapes[spec.input_idx][spec.dim_idx].val) + if tuning_config.tune_max_num_tokens is not None: + opt_shapes.add( + min( + tuning_config.tune_max_num_tokens, + base_profile.shapes[spec.input_idx][spec.dim_idx].val, + )) + else: + opt_shapes.add( + base_profile.shapes[spec.input_idx][spec.dim_idx].val) opt_shapes = sorted(list(opt_shapes)) opt_shapes_max = tuple(opt_shapes[1:]) + (float('inf'), ) opt_shapes_max = {
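
Reviewer note (editorial illustration, not part of the series): a self-contained sketch of the bucket mapping that trtllm_gen_custom_ops.py ends up with after these patches. last_positive_power_of_2 is re-implemented here only for illustration, ep_size is passed explicitly rather than captured in the closure as in the patch, and the token counts are assumed values. At runtime the hidden_states token dimension is rounded down to a power of two, divided by ep_size (each EP rank handles only its share of the routed tokens, so the buffer is sparsely filled), and clamped to [1, MAX_PROFILE_BUCKET]; during tuning the raw bucket values are used as cache keys directly.

    MAX_PROFILE_BUCKET = 4096

    def last_positive_power_of_2(x: int) -> int:
        # Largest power of two <= x (assumes x >= 1); stand-in for the TRT-LLM helper.
        p = 1
        while p * 2 <= x:
            p *= 2
        return p

    def round_rule(x: int, ep_size: int) -> int:
        # Runtime mapping: round down to a power of two, account for EP sharding,
        # then clamp to the tunable range.
        value = last_positive_power_of_2(x) // ep_size
        return min(max(1, value), MAX_PROFILE_BUCKET)

    # A worst-case buffer sized for 1024 tokens on an ep_size=4 rank now looks up
    # the 256-token tuning bucket instead of the 1024-token one.
    assert round_rule(1024, ep_size=4) == 256
    assert round_rule(3, ep_size=8) == 1    # clamped to the minimum bucket
    assert round_rule(1 << 20, ep_size=1) == MAX_PROFILE_BUCKET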