65 changes: 49 additions & 16 deletions tensorrt_llm/_torch/autotuner.py
@@ -49,7 +49,7 @@ class DynamicTensorSpec:
input_idx: The index of the input tensor.
dim_idx: The index of the dimension to tune.
gen_tuning_buckets: A tuple of values to try or a function generating values.
map_to_tuning_buckets: A function to map dimensions to valid values during inference.
map_to_tuning_buckets: A function to map dimensions to tuning buckets during inference.
"""
input_idx: int
dim_idx: int
@@ -81,7 +81,7 @@ class TuningConfig:
should be tuned to optimize performance. Each spec defines:
- Which input tensor dimension is dynamic
- How to generate tuning values
- How to map dimensions to valid values during inference
- How to map dimensions to tuning values during inference

Example:
>>> config = TuningConfig(
@@ -392,22 +392,26 @@ def search_cache(
runners: List[TunableRunner],
input_shapes: Tuple[torch.Size],
tuning_config: TuningConfig,
apply_map_to_tuning_buckets: bool = True,
) -> Tuple[bool, int, int, Dict[str, Any], OptimizationProfile]:
"""Search for cached profiling results matching the current configuration.

Args:
custom_op (str): The name of the custom operation to be tuned
runners (List[TunableRunner]): List of candidate implementations to profile
profile (OptimizationProfile): Optimization profile
apply_map_to_tuning_buckets: If True, apply map_to_tuning_buckets for runtime cache lookups.
If False, use raw bucket values for tuning cache storage.

Returns:
A tuple containing:
[is_cache_hit, runner_id, tactic, stored_profile]
runner_id is the index in the current runners list
"""
for idx, r in enumerate(runners):
if (cache_key := self.get_cache_key(custom_op, r, input_shapes,
tuning_config)) in self.cache:
if (cache_key := self.get_cache_key(
custom_op, r, input_shapes, tuning_config,
apply_map_to_tuning_buckets)) in self.cache:
# Return the current index in runners list, not the cached runner_id
cached_runner_id, tactic, min_time = self.cache[cache_key]
return True, idx, tactic, min_time
@@ -420,6 +424,7 @@ def get_cache_key(
runner: TunableRunner,
input_shapes: Tuple[torch.Size],
tuning_config: TuningConfig,
apply_map_to_tuning_buckets: bool = True,
) -> Tuple:
return (
custom_op,
Expand All @@ -430,6 +435,7 @@ def get_cache_key(
tuning_config.dynamic_tensor_specs,
tuning_config.constraint_specs,
tuning_config.tune_max_num_tokens,
apply_map_to_tuning_buckets,
),
)

@@ -795,7 +801,11 @@ def choose_one(

input_shapes = tuple(self._get_input_sizes(inputs))
is_cache_hit, best_runner_id, best_tactic, min_time = self.profiling_cache.search_cache(
custom_op, runners, input_shapes, tuning_config)
custom_op,
runners,
input_shapes,
tuning_config,
apply_map_to_tuning_buckets=True)

# Early return if it's not tuning, use cache found one or fallback one
if not self.is_tuning_mode:
@@ -841,7 +851,12 @@ def choose_one(
for p in profiles:
tensors = self._prepare_input_tensors(p, inputs)
is_cache_hit, *_ = self.profiling_cache.search_cache(
custom_op, runners, p.get_opt_shapes(), tuning_config)
custom_op,
runners,
p.get_opt_shapes(),
tuning_config,
apply_map_to_tuning_buckets=False,
Collaborator
Hi @rosenrodt, for TRTLLM Gen MoE, why should we disable apply_map_to_tuning_buckets when doing autotuning? Does this affect other operators?

Collaborator
After discussion with @hyukn, I understand the case for TRTLLM Gen now. The problem is that:

  • When doing autotuning, the per-rank workload is "full", assuming all tokens activate experts on the local rank.
  • When doing inference, the per-rank workload is approximated by full_workload/ep_size.

Normally, we close this gap with inputs_hook, which modifies the inputs during autotuning. Specific to your case, you can modify the inputs so that the workload is divided by ep_size.

You may refer to the CuteDSL implementation:

```python
def generate_num_tokens_per_expert(self, num_tokens: int) -> List[int]:
    average_num_tokens_per_expert = num_tokens * self.top_k / self.num_experts
    balance = 0
    num_tokens_per_expert = []
    for i in range(self.num_local_experts):
        balance += average_num_tokens_per_expert
        if balance <= 1e-3:
            continue
        curr_num_tokens = int(balance) + 1
        num_tokens_per_expert.append(curr_num_tokens)
        balance -= curr_num_tokens
    return num_tokens_per_expert
```

Currently, this PR introduces an inconsistency between the autotuning and inference shapes, which is a bit concerning.

Collaborator
@hyukn Dec 24, 2025
@syuoni provides a better option here. Thanks a lot for the suggestion!

The current process of assembling the autotuner cache key is:

  • Generate tuning_buckets as a list of profiles.
  • Generate dummy input tensors according to the shape information in the profiles.
  • Apply input_pre_hook to the input tensors, and use the post-hook inputs as the inputs for runner.forward.
  • Generate the cache key with the map_to_tuning_bucket method.

By defining input_pre_hook, we always generate tensors whose shapes correspond to the correct workloads for runner.forward, while the shapes stored in the cache remain the original bucket shapes (before input_pre_hook). This means we can keep map_to_tuning_bucket as a simple bucket-mapping method instead of dividing by ep_size to adjust the workload model (see the sketch below).

This actually extends the usage of input_pre_hook (originally I only wanted to use it to manipulate the tensor data), but it truly works. I think we should also revise the docstring to clarify this usage.
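To make this concrete, here is a minimal sketch of the idea. It assumes a pre-hook that receives the dummy input tensors generated from a bucket shape and returns the tensors actually fed to runner.forward; the hook name, the tensor layout, and the EP_SIZE constant are assumptions for illustration, not the actual TRTLLM autotuner API:

```python
from typing import List

import torch

EP_SIZE = 8  # assumed expert-parallel size, for illustration only


def moe_inputs_pre_hook(inputs: List[torch.Tensor]) -> List[torch.Tensor]:
    """Sketch of a pre-hook that models the per-rank MoE workload.

    The dummy tensors are generated from the original bucket shape; this hook
    shrinks the token dimension to roughly full_workload / ep_size before
    runner.forward is called, while the cache key keeps the original bucket
    shape. map_to_tuning_buckets can then stay a plain bucket-rounding
    function.
    """
    hidden_states = inputs[0]  # assume tokens are on dim 0 of the first input
    per_rank_tokens = max(1, hidden_states.shape[0] // EP_SIZE)
    adjusted = list(inputs)
    adjusted[0] = hidden_states[:per_rank_tokens]
    return adjusted
```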

Collaborator Author

@syuoni I fully agree with the input_pre_hook() approach used in CuteDSL, but I think it is not directly applicable to TRTLLM MoE.

Let's look at the two main changes in this PR:

First, map_to_tuning_bucket() should not be applied during tuning, and this PR addresses that by applying it only during inference. Do you agree that we should keep this change, @syuoni?

  • The autotuner tunes the buckets coming solely from gen_tuning_buckets, without involving map_to_tuning_bucket(). The map_to_tuning_bucket() then maps those buckets to cache keys, which is not the intended behavior, as discussed with @hyukn.
  • This change should not affect the existing ops.

Second (this is the controversial part): TRTLLM MoE repurposes map_to_tuning_bucket() to account for workload sparsity, which is either convenient or confusing depending on how you look at it (a small illustrative sketch follows the list below). Long story short: routing information is not exposed in the TRTLLM MoE interface, so fully adopting CuteDSL's approach would require substantial rewrites. I would suggest we defer your suggested approach to a later PR if it turns out to be necessary. @syuoni @hyukn let me know what you think :D

  • The approach in CuteDSL MoE is sensible. CuteDSL MoE as a module appears to accept routing information at a more granular level (tile_idx_to_group_idx, tile_idx_to_mn_limit, etc.).
  • TRTLLM MoE accepts either routing_logits or topk_id/top_weights and computes the routing information internally.
  • TRTLLM MoE cannot adopt the same approach without substantial rewrites of how it interacts with the caller.
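
For reference, here is a small self-contained sketch of the two mapping styles discussed above. The bucket values and the EP_SIZE constant are made up for illustration and this is not the actual TRTLLM MoE code; it only shows why a mapper that folds in the 1/ep_size workload approximation belongs to inference-time lookup, which is what apply_map_to_tuning_buckets controls:

```python
import bisect

TUNING_BUCKETS = (64, 128, 256, 512, 1024)  # illustrative gen_tuning_buckets
EP_SIZE = 8                                  # illustrative expert-parallel size


def map_to_bucket(num_tokens: int) -> int:
    """Plain mapping: round a runtime token count down to the nearest
    profiled bucket (clamped to the smallest bucket)."""
    idx = bisect.bisect_right(TUNING_BUCKETS, num_tokens) - 1
    return TUNING_BUCKETS[max(idx, 0)]


def map_to_bucket_with_ep(num_tokens: int) -> int:
    """Repurposed mapping: approximate the per-rank workload as
    num_tokens / EP_SIZE before bucketing. Applying this while storing
    tuning results would shift the cache keys away from the profiled
    buckets, hence apply_map_to_tuning_buckets=False during tuning."""
    return map_to_bucket(max(1, num_tokens // EP_SIZE))


# Tuning stores results under the raw buckets (64, 128, ...); at inference a
# batch of 1024 tokens with EP_SIZE=8 looks up the bucket for 128 tokens.
assert map_to_bucket_with_ep(1024) == 128
```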

)
if not is_cache_hit:
# Initialize runner and tactic as None in case of no valid tactic or runners are found
best_runner_id, best_tactic, min_time, has_tuning_failure_occurred = self._profile_runners(
@@ -928,8 +943,11 @@ def _profile_runners(
# Record the failed profiling combinations
self.stats.failed_profiling_count[custom_op].add(
self.profiling_cache.get_cache_key(
custom_op, runner, profile.get_opt_shapes(),
tuning_config))
custom_op,
runner,
profile.get_opt_shapes(),
tuning_config,
apply_map_to_tuning_buckets=False))

# Set time_measured to inf to notify the failure of the tactic. This can happen when `get_valid_tactics` mistakenly return wrong tactics
# or some runtime error occurs during profiling.
@@ -942,8 +960,11 @@
if best_runner_id is not None:
# At least one valid (runner, tactic) pair is found
cache_key = self.profiling_cache.get_cache_key(
custom_op, runners[best_runner_id], profile.get_opt_shapes(),
tuning_config)
custom_op,
runners[best_runner_id],
profile.get_opt_shapes(),
tuning_config,
apply_map_to_tuning_buckets=False)

self._debug_logger(
f"[Autotuner] Profiling runner={runners[best_runner_id]}, tactic={best_tactic} for cache_key={cache_key}."
@@ -1119,9 +1140,15 @@ def _optimization_profiles(
opt_shapes = spec.gen_tuning_buckets
# Add the current input value as one of the opt values
opt_shapes = set(opt_shapes)
opt_shapes.add(
spec.map_to_tuning_buckets(
base_profile.shapes[spec.input_idx][spec.dim_idx].val))
if tuning_config.tune_max_num_tokens is not None:
opt_shapes.add(
min(
tuning_config.tune_max_num_tokens,
base_profile.shapes[spec.input_idx][spec.dim_idx].val,
))
else:
opt_shapes.add(
base_profile.shapes[spec.input_idx][spec.dim_idx].val)
opt_shapes = sorted(list(opt_shapes))
opt_shapes_max = tuple(opt_shapes[1:]) + (float('inf'), )
opt_shapes_max = {
@@ -1164,13 +1191,16 @@ def _find_nearest_profile(
dynamic_tensor_specs: Tuple[DynamicTensorSpec, ...],
constraint_specs: Tuple[ConstraintSpec, ...],
tune_max_num_tokens: int = None,
apply_map_to_tuning_buckets: bool = True,
) -> Tuple:
"""Find the nearest optimization profile for given inputs
User can define their own nearest profile generation method to reduce the host overhead.

Args:
shapes: Tuple of input tensor shapes
tuning_config: Tuning configuration
apply_map_to_tuning_buckets: If True, apply map_to_tuning_buckets for runtime cache lookups.
If False, use raw bucket values for tuning cache storage.

Return:
Tuple: A tuple containing:
@@ -1180,9 +1210,12 @@
base_profile = list(list(shape) for shape in shapes)

for spec in dynamic_tensor_specs:
base_profile[spec.input_idx][
spec.dim_idx] = spec.map_to_tuning_buckets(
base_profile[spec.input_idx][spec.dim_idx])
# During runtime: apply map_to_tuning_buckets to map input to bucket
# During tuning: no mapper, use raw bucket value
if apply_map_to_tuning_buckets:
base_profile[spec.input_idx][
spec.dim_idx] = spec.map_to_tuning_buckets(
base_profile[spec.input_idx][spec.dim_idx])

if tune_max_num_tokens is not None:
base_profile[spec.input_idx][spec.dim_idx] = min(