
Commit fb30e65

[breaking] map_to_tuning_buckets maps input during inference only

Signed-off-by: Anthony Chang <[email protected]>

1 parent 89024d0

3 files changed: +86, -98 lines

tensorrt_llm/_torch/autotuner.py

Lines changed: 28 additions & 29 deletions
@@ -46,15 +46,12 @@ class DynamicTensorSpec:
         input_idx: The index of the input tensor.
         dim_idx: The index of the dimension to tune.
         gen_tuning_buckets: A tuple of values to try or a function generating values.
-        map_to_tuning_buckets: A function to map dimensions to valid values during tuning.
-        map_to_runtime_buckets: A function to map dimensions to valid values during inference.
-            If None, use map_to_tuning_buckets.
+        map_to_tuning_buckets: A function to map dimensions to tuning buckets during inference.
     """
     input_idx: int
     dim_idx: int
     gen_tuning_buckets: Union[Tuple[int], Callable] = ()
     map_to_tuning_buckets: Callable = lambda x: x
-    map_to_runtime_buckets: Optional[Callable] = None
 
 
 @dataclass(slots=True, unsafe_hash=True)
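This hunk carries the breaking change: `map_to_runtime_buckets` is removed, and the surviving `map_to_tuning_buckets` now runs only at inference time, while tuning profiles the values from `gen_tuning_buckets` as-is. A self-contained stand-in mirroring the post-change fields; the decorator reuse, the concrete buckets, and the power-of-two mapper are illustrative assumptions, not from this commit:

```python
# Stand-in mirroring the post-change dataclass fields above; the decorator,
# buckets, and power-of-two mapper are illustrative assumptions.
from dataclasses import dataclass
from typing import Callable, Tuple, Union


@dataclass(slots=True, unsafe_hash=True)
class DynamicTensorSpec:
    input_idx: int
    dim_idx: int
    gen_tuning_buckets: Union[Tuple[int], Callable] = ()
    map_to_tuning_buckets: Callable = lambda x: x  # applied at inference only


spec = DynamicTensorSpec(
    input_idx=0,                          # first input tensor
    dim_idx=0,                            # its num-tokens dimension
    gen_tuning_buckets=(1, 2, 4, 8, 16),  # profiled raw during tuning
    map_to_tuning_buckets=lambda x: 1 << (x.bit_length() - 1),
)
```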
@@ -81,7 +78,7 @@ class TuningConfig:
         should be tuned to optimize performance. Each spec defines:
         - Which input tensor dimension is dynamic
         - How to generate tuning values
-        - How to map dimensions to valid values during inference
+        - How to map dimensions to tuning values during inference
 
     Example:
         >>> config = TuningConfig(
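The docstring example is truncated in this view; here is a hedged completion using only names that appear elsewhere in this diff (`dynamic_tensor_specs` is read by `get_cache_key` below). It assumes `TuningConfig` accepts its attributes as keyword arguments, as a dataclass typically does, and omits other arguments such as `constraint_specs`:

```python
# Hedged completion of the truncated docstring example above; assumes
# TuningConfig takes its attributes as keyword arguments.
from tensorrt_llm._torch.autotuner import DynamicTensorSpec, TuningConfig

config = TuningConfig(
    dynamic_tensor_specs=(
        DynamicTensorSpec(
            input_idx=0,
            dim_idx=0,
            gen_tuning_buckets=(1, 2, 4, 8),
            map_to_tuning_buckets=lambda x: 1 << (x.bit_length() - 1),
        ),
    ),
)
```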
@@ -392,27 +389,26 @@ def search_cache(
         runners: List[TunableRunner],
         input_shapes: Tuple[torch.Size],
         tuning_config: TuningConfig,
-        use_tuning_mapping: bool = False,
+        apply_map_to_tuning_buckets: bool = True,
     ) -> Tuple[bool, int, int, Dict[str, Any], OptimizationProfile]:
         """Search for cached profiling results matching the current configuration.
 
         Args:
             custom_op (str): The name of the custom operation to be tuned
             runners (List[TunableRunner]): List of candidate implementations to profile
             profile (OptimizationProfile): Optimization profile
-            use_tuning_mapping: If True, use map_to_tuning_buckets for cache key.
-                If False, use map_to_runtime_buckets for runtime cache lookups.
+            apply_map_to_tuning_buckets: If True, apply map_to_tuning_buckets for runtime cache lookups.
+                If False, use raw bucket values for tuning cache storage.
 
         Returns:
             A tuple containing:
                 [is_cache_hit, runner_id, tactic, stored_profile]
                 runner_id is the index in the current runners list
         """
         for idx, r in enumerate(runners):
-            if (cache_key :=
-                    self.get_cache_key(custom_op, r, input_shapes,
-                                       tuning_config,
-                                       use_tuning_mapping)) in self.cache:
+            if (cache_key := self.get_cache_key(
+                    custom_op, r, input_shapes, tuning_config,
+                    apply_map_to_tuning_buckets)) in self.cache:
                 # Return the current index in runners list, not the cached runner_id
                 cached_runner_id, tactic, min_time = self.cache[cache_key]
                 return True, idx, tactic, min_time
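The contract in the updated docstring is that tuning stores results keyed by raw bucket values, while a runtime lookup first snaps the live dimension onto a bucket via `map_to_tuning_buckets`, so both phases converge on the same shape. A runnable sketch of that equivalence; the dict-based cache and the key layout are simplifications, not the real `get_cache_key` tuple:

```python
# Simplified model of the lookup contract: tuning stores raw bucket keys,
# runtime maps the live shape onto a bucket before looking up. Key layout is
# illustrative; the real key is built by get_cache_key below.
def map_to_tuning_buckets(x: int) -> int:
    return 1 << (x.bit_length() - 1)  # last power of two <= x


cache = {}
for bucket in (1, 2, 4, 8, 16, 32, 64):  # tuning: buckets profiled as-is
    cache[("my_op", bucket)] = f"tactic_for_{bucket}"

runtime_m = 50  # live num-tokens at inference
assert cache[("my_op", map_to_tuning_buckets(runtime_m))] == "tactic_for_32"
```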
@@ -425,7 +421,7 @@ def get_cache_key(
         runner: TunableRunner,
         input_shapes: Tuple[torch.Size],
         tuning_config: TuningConfig,
-        use_tuning_mapping: bool = False,
+        apply_map_to_tuning_buckets: bool = True,
     ) -> Tuple:
         return (
             custom_op,
@@ -436,7 +432,7 @@ def get_cache_key(
                 tuning_config.dynamic_tensor_specs,
                 tuning_config.constraint_specs,
                 tuning_config.tune_max_num_tokens,
-                use_tuning_mapping,
+                apply_map_to_tuning_buckets,
             ),
         )
 
@@ -789,7 +785,11 @@ def choose_one(
 
         input_shapes = tuple(self._get_input_sizes(inputs))
         is_cache_hit, best_runner_id, best_tactic, min_time = self.profiling_cache.search_cache(
-            custom_op, runners, input_shapes, tuning_config)
+            custom_op,
+            runners,
+            input_shapes,
+            tuning_config,
+            apply_map_to_tuning_buckets=True)
 
         # Early return if it's not tuning, use cache found one or fallback one
         if not self.is_tuning_mode:
@@ -831,7 +831,7 @@ def choose_one(
                     runners,
                     p.get_opt_shapes(),
                     tuning_config,
-                    use_tuning_mapping=True,
+                    apply_map_to_tuning_buckets=False,
                 )
                 if not is_cache_hit:
                     # Initialize runner and tactic as None in case of no valid tactic or runners are found
@@ -923,7 +923,7 @@ def _profile_runners(
                     runner,
                     profile.get_opt_shapes(),
                     tuning_config,
-                    use_tuning_mapping=True))
+                    apply_map_to_tuning_buckets=False))
 
             # Set time_measured to inf to notify the failure of the tactic. This can happen when `get_valid_tactics` mistakenly return wrong tactics
             # or some runtime error occurs during profiling.
@@ -940,7 +940,7 @@ def _profile_runners(
                 runners[best_runner_id],
                 profile.get_opt_shapes(),
                 tuning_config,
-                use_tuning_mapping=True)
+                apply_map_to_tuning_buckets=False)
 
             self._debug_logger(
                 f"[Autotuner] Profiling runner={runners[best_runner_id]}, tactic={best_tactic} for cache_key={cache_key}."
@@ -1117,8 +1117,7 @@ def _optimization_profiles(
             # Add the current input value as one of the opt values
             opt_shapes = set(opt_shapes)
             opt_shapes.add(
-                spec.map_to_tuning_buckets(
-                    base_profile.shapes[spec.input_idx][spec.dim_idx].val))
+                base_profile.shapes[spec.input_idx][spec.dim_idx].val)
             opt_shapes = sorted(list(opt_shapes))
             opt_shapes_max = tuple(opt_shapes[1:]) + (float('inf'), )
             opt_shapes_max = {
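Since the mapper no longer runs during tuning, `_optimization_profiles` now adds the live input value to the candidate set unmapped. A short runnable sketch of the assembly in this hunk, with simplified names standing in for the spec and profile objects:

```python
# Sketch of the opt-shape assembly after this change: the current value joins
# the generated buckets raw, with no spec.map_to_tuning_buckets(...) applied.
gen_buckets = (1, 2, 4, 8, 16)
current_val = 6                   # live dimension value when tuning starts
opt_shapes = set(gen_buckets)
opt_shapes.add(current_val)       # added as-is
opt_shapes = sorted(opt_shapes)   # [1, 2, 4, 6, 8, 16]
opt_shapes_max = tuple(opt_shapes[1:]) + (float('inf'),)
```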
@@ -1161,16 +1160,16 @@ def _find_nearest_profile(
         dynamic_tensor_specs: Tuple[DynamicTensorSpec, ...],
         constraint_specs: Tuple[ConstraintSpec, ...],
         tune_max_num_tokens: int = None,
-        use_tuning_mapping: bool = False,
+        apply_map_to_tuning_buckets: bool = True,
     ) -> Tuple:
         """Find the nearest optimization profile for given inputs
         User can define their own nearest profile generation method to reduce the host overhead.
 
         Args:
             shapes: Tuple of input tensor shapes
             tuning_config: Tuning configuration
-            use_tuning_mapping: If True, use map_to_tuning_buckets to store tuning cache.
-                If False, use map_to_runtime_buckets for runtime cache lookups.
+            apply_map_to_tuning_buckets: If True, apply map_to_tuning_buckets for runtime cache lookups.
+                If False, use raw bucket values for tuning cache storage.
 
         Return:
             Tuple: A tuple containing:
@@ -1180,12 +1179,12 @@ def _find_nearest_profile(
         base_profile = list(list(shape) for shape in shapes)
 
         for spec in dynamic_tensor_specs:
-
-            bucket_mapper = spec.map_to_tuning_buckets
-            if not use_tuning_mapping and spec.map_to_runtime_buckets is not None:
-                bucket_mapper = spec.map_to_runtime_buckets
-            base_profile[spec.input_idx][spec.dim_idx] = bucket_mapper(
-                base_profile[spec.input_idx][spec.dim_idx])
+            # During runtime: apply map_to_tuning_buckets to map input to bucket
+            # During tuning: no mapper, use raw bucket value
+            if apply_map_to_tuning_buckets:
+                base_profile[spec.input_idx][
+                    spec.dim_idx] = spec.map_to_tuning_buckets(
+                        base_profile[spec.input_idx][spec.dim_idx])
 
             if tune_max_num_tokens is not None:
                 base_profile[spec.input_idx][spec.dim_idx] = min(
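The branch above is the heart of the change: one boolean now decides whether the mapper runs at all. A minimal self-contained illustration of the toggle; the helper names are mine, not the module's:

```python
# Minimal illustration of the toggle in _find_nearest_profile: True maps the
# live dimension onto a bucket, False passes the already-bucketed value through.
def nearest_dim(val, mapper, apply_map_to_tuning_buckets: bool):
    return mapper(val) if apply_map_to_tuning_buckets else val


power_of_2 = lambda x: 1 << (x.bit_length() - 1)
assert nearest_dim(50, power_of_2, True) == 32   # runtime cache lookup
assert nearest_dim(32, power_of_2, False) == 32  # tuning-time storage, raw
```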

tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py

Lines changed: 36 additions & 48 deletions
@@ -277,16 +277,14 @@ def get_dynamic_tensor_specs(cls,
 
         m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)
 
-        def round_rule(x: int, ep_size_: int) -> int:
-            value = last_positive_power_of_2(x) // ep_size_
+        def round_rule(x: int) -> int:
+            value = last_positive_power_of_2(x) // ep_size
             return min(max(1, value), MAX_PROFILE_BUCKET)
 
-        specs = (DynamicTensorSpec(
-            HIDDEN_STATES_IDX,
-            TUNED_DIM,
-            m_values,
-            map_to_tuning_buckets=lambda x: round_rule(x, 1),
-            map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), )
+        specs = (DynamicTensorSpec(HIDDEN_STATES_IDX,
+                                   TUNED_DIM,
+                                   m_values,
+                                   map_to_tuning_buckets=round_rule), )
 
         return specs
 
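With `map_to_runtime_buckets` gone, the single `round_rule` must encode the runtime rounding itself, including the `ep_size` division that the deleted `lambda x: round_rule(x, ep_size)` used to apply, and it now closes over `ep_size` from the enclosing scope instead of taking it as a parameter. This same change repeats in the five hunks below. A runnable check of the arithmetic; `ep_size=4` and `MAX_PROFILE_BUCKET=4096` are illustrative values, not from the commit:

```python
# Worked check of round_rule as rewritten above; ep_size and
# MAX_PROFILE_BUCKET values are illustrative assumptions.
ep_size = 4
MAX_PROFILE_BUCKET = 4096


def last_positive_power_of_2(x: int) -> int:
    return 1 << (x.bit_length() - 1)


def round_rule(x: int) -> int:  # closes over ep_size, as in the diff
    value = last_positive_power_of_2(x) // ep_size
    return min(max(1, value), MAX_PROFILE_BUCKET)


assert round_rule(300) == 64  # 256 // 4, within [1, MAX_PROFILE_BUCKET]
assert round_rule(2) == 1     # 2 // 4 == 0, clamped up to 1
```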
@@ -623,16 +621,14 @@ def get_dynamic_tensor_specs(cls,
 
         m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)
 
-        def round_rule(x: int, ep_size_: int) -> int:
-            value = last_positive_power_of_2(x) // ep_size_
+        def round_rule(x: int) -> int:
+            value = last_positive_power_of_2(x) // ep_size
             return min(max(1, value), MAX_PROFILE_BUCKET)
 
-        specs = (DynamicTensorSpec(
-            HIDDEN_STATES_IDX,
-            TUNED_DIM,
-            m_values,
-            map_to_tuning_buckets=lambda x: round_rule(x, 1),
-            map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), )
+        specs = (DynamicTensorSpec(HIDDEN_STATES_IDX,
+                                   TUNED_DIM,
+                                   m_values,
+                                   map_to_tuning_buckets=round_rule), )
 
         return specs
 
@@ -918,16 +914,14 @@ def get_dynamic_tensor_specs(cls,
 
         m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)
 
-        def round_rule(x: int, ep_size_: int) -> int:
-            value = last_positive_power_of_2(x) // ep_size_
+        def round_rule(x: int) -> int:
+            value = last_positive_power_of_2(x) // ep_size
             return min(max(1, value), MAX_PROFILE_BUCKET)
 
-        specs = (DynamicTensorSpec(
-            HIDDEN_STATES_IDX,
-            TUNED_DIM,
-            m_values,
-            map_to_tuning_buckets=lambda x: round_rule(x, 1),
-            map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), )
+        specs = (DynamicTensorSpec(HIDDEN_STATES_IDX,
+                                   TUNED_DIM,
+                                   m_values,
+                                   map_to_tuning_buckets=round_rule), )
 
         return specs
 
@@ -1218,16 +1212,14 @@ def get_dynamic_tensor_specs(cls,
 
         m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)
 
-        def round_rule(x: int, ep_size_: int) -> int:
-            value = last_positive_power_of_2(x) // ep_size_
+        def round_rule(x: int) -> int:
+            value = last_positive_power_of_2(x) // ep_size
             return min(max(1, value), MAX_PROFILE_BUCKET)
 
-        specs = (DynamicTensorSpec(
-            HIDDEN_STATES_IDX,
-            TUNED_DIM,
-            m_values,
-            map_to_tuning_buckets=lambda x: round_rule(x, 1),
-            map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), )
+        specs = (DynamicTensorSpec(HIDDEN_STATES_IDX,
+                                   TUNED_DIM,
+                                   m_values,
+                                   map_to_tuning_buckets=round_rule), )
 
         return specs
 
@@ -1496,16 +1488,14 @@ def get_dynamic_tensor_specs(cls,
 
         m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)
 
-        def round_rule(x: int, ep_size_: int) -> int:
-            value = last_positive_power_of_2(x) // ep_size_
+        def round_rule(x: int) -> int:
+            value = last_positive_power_of_2(x) // ep_size
             return min(max(1, value), MAX_PROFILE_BUCKET)
 
-        specs = (DynamicTensorSpec(
-            HIDDEN_STATES_IDX,
-            TUNED_DIM,
-            m_values,
-            map_to_tuning_buckets=lambda x: round_rule(x, 1),
-            map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), )
+        specs = (DynamicTensorSpec(HIDDEN_STATES_IDX,
+                                   TUNED_DIM,
+                                   m_values,
+                                   map_to_tuning_buckets=round_rule), )
 
         return specs
 
@@ -1759,16 +1749,14 @@ def get_dynamic_tensor_specs(cls,
 
         m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)
 
-        def round_rule(x: int, ep_size_: int) -> int:
-            value = last_positive_power_of_2(x) // ep_size_
+        def round_rule(x: int) -> int:
+            value = last_positive_power_of_2(x) // ep_size
             return min(max(1, value), MAX_PROFILE_BUCKET)
 
-        specs = (DynamicTensorSpec(
-            HIDDEN_STATES_IDX,
-            TUNED_DIM,
-            m_values,
-            map_to_tuning_buckets=lambda x: round_rule(x, 1),
-            map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), )
+        specs = (DynamicTensorSpec(HIDDEN_STATES_IDX,
+                                   TUNED_DIM,
+                                   m_values,
+                                   map_to_tuning_buckets=round_rule), )
 
         return specs
 