Use a single bucket mapper; map input sizes down to tuning buckets according to the expected fill rate

rosenrodt · rosenrodt · commit aba5ea62c207 · 2025-12-18T17:16:14.000+08:00
diff --git a/tensorrt_llm/_torch/autotuner.py b/tensorrt_llm/_torch/autotuner.py
@@ -46,15 +46,12 @@ class DynamicTensorSpec:
         input_idx: The index of the input tensor.
         dim_idx: The index of the dimension to tune.
         gen_tuning_buckets: A tuple of values to try or a function generating values.
-        map_to_tuning_buckets: A function to map dimensions to valid values during tuning.
-        map_to_runtime_buckets: A function to map dimensions to valid values during inference.
-          If None, use map_to_tuning_buckets.
+        map_to_tuning_buckets: A function to map dimensions to valid values during inference.
     """
     input_idx: int
     dim_idx: int
     gen_tuning_buckets: Union[Tuple[int], Callable] = ()
     map_to_tuning_buckets: Callable = lambda x: x
-    map_to_runtime_buckets: Optional[Callable] = None
 
 
 @dataclass(slots=True, unsafe_hash=True)
@@ -392,27 +389,22 @@ def search_cache(
         runners: List[TunableRunner],
         input_shapes: Tuple[torch.Size],
         tuning_config: TuningConfig,
-        use_tuning_mapping: bool = False,
     ) -> Tuple[bool, int, int, Dict[str, Any], OptimizationProfile]:
         """Search for cached profiling results matching the current configuration.
 
         Args:
             custom_op (str): The name of the custom operation to be tuned
             runners (List[TunableRunner]): List of candidate implementations to profile
             profile (OptimizationProfile): Optimization profile
-            use_tuning_mapping: If True, use map_to_tuning_buckets for cache key.
-                If False, use map_to_runtime_buckets for runtime cache lookups.
 
         Returns:
             A tuple containing:
             [is_cache_hit, runner_id, tactic, stored_profile]
             runner_id is the index in the current runners list
         """
         for idx, r in enumerate(runners):
-            if (cache_key :=
-                    self.get_cache_key(custom_op, r, input_shapes,
-                                       tuning_config,
-                                       use_tuning_mapping)) in self.cache:
+            if (cache_key := self.get_cache_key(custom_op, r, input_shapes,
+                                                tuning_config)) in self.cache:
                 # Return the current index in runners list, not the cached runner_id
                 cached_runner_id, tactic, min_time = self.cache[cache_key]
                 return True, idx, tactic, min_time
@@ -425,7 +417,6 @@ def get_cache_key(
         runner: TunableRunner,
         input_shapes: Tuple[torch.Size],
         tuning_config: TuningConfig,
-        use_tuning_mapping: bool = False,
     ) -> Tuple:
         return (
             custom_op,
@@ -436,7 +427,6 @@ def get_cache_key(
                 tuning_config.dynamic_tensor_specs,
                 tuning_config.constraint_specs,
                 tuning_config.tune_max_num_tokens,
-                use_tuning_mapping,
             ),
         )
 
@@ -827,12 +817,7 @@ def choose_one(
             for p in profiles:
                 tensors = self._prepare_input_tensors(p, inputs)
                 is_cache_hit, *_ = self.profiling_cache.search_cache(
-                    custom_op,
-                    runners,
-                    p.get_opt_shapes(),
-                    tuning_config,
-                    use_tuning_mapping=True,
-                )
+                    custom_op, runners, p.get_opt_shapes(), tuning_config)
                 if not is_cache_hit:
                     # Initialize runner and tactic as None in case of no valid tactic or runners are found
                     best_runner_id, best_tactic, min_time, has_tuning_failure_occurred = self._profile_runners(
@@ -919,11 +904,8 @@ def _profile_runners(
                     # Record the failed profiling combinations
                     self.stats.failed_profiling_count[custom_op].add(
                         self.profiling_cache.get_cache_key(
-                            custom_op,
-                            runner,
-                            profile.get_opt_shapes(),
-                            tuning_config,
-                            use_tuning_mapping=True))
+                            custom_op, runner, profile.get_opt_shapes(),
+                            tuning_config))
 
                     # Set time_measured to inf to notify the failure of the tactic. This can happen when `get_valid_tactics` mistakenly return wrong tactics
                     # or some runtime error occurs during profiling.
@@ -936,11 +918,8 @@ def _profile_runners(
         if best_runner_id is not None:
             # At least one valid (runner, tactic) pair is found
             cache_key = self.profiling_cache.get_cache_key(
-                custom_op,
-                runners[best_runner_id],
-                profile.get_opt_shapes(),
-                tuning_config,
-                use_tuning_mapping=True)
+                custom_op, runners[best_runner_id], profile.get_opt_shapes(),
+                tuning_config)
 
             self._debug_logger(
                 f"[Autotuner] Profiling runner={runners[best_runner_id]}, tactic={best_tactic} for cache_key={cache_key}."
@@ -1161,16 +1140,13 @@ def _find_nearest_profile(
         dynamic_tensor_specs: Tuple[DynamicTensorSpec, ...],
         constraint_specs: Tuple[ConstraintSpec, ...],
         tune_max_num_tokens: int = None,
-        use_tuning_mapping: bool = False,
     ) -> Tuple:
         """Find the nearest optimization profile for given inputs
         User can define their own nearest profile generation method to reduce the host overhead.
 
         Args:
             shapes: Tuple of input tensor shapes
             tuning_config: Tuning configuration
-            use_tuning_mapping: If True, use map_to_tuning_buckets to store tuning cache.
-                If False, use map_to_runtime_buckets for runtime cache lookups.
 
         Return:
             Tuple: A tuple containing:
@@ -1180,12 +1156,9 @@ def _find_nearest_profile(
         base_profile = list(list(shape) for shape in shapes)
 
         for spec in dynamic_tensor_specs:
-
-            bucket_mapper = spec.map_to_tuning_buckets
-            if not use_tuning_mapping and spec.map_to_runtime_buckets is not None:
-                bucket_mapper = spec.map_to_runtime_buckets
-            base_profile[spec.input_idx][spec.dim_idx] = bucket_mapper(
-                base_profile[spec.input_idx][spec.dim_idx])
+            base_profile[spec.input_idx][
+                spec.dim_idx] = spec.map_to_tuning_buckets(
+                    base_profile[spec.input_idx][spec.dim_idx])
 
             if tune_max_num_tokens is not None:
                 base_profile[spec.input_idx][spec.dim_idx] = min(
diff --git a/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py b/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py
@@ -273,20 +273,18 @@ def get_dynamic_tensor_specs(cls,
                                  ep_size: int) -> Tuple[DynamicTensorSpec, ...]:
         HIDDEN_STATES_IDX = 2
         TUNED_DIM = 0
-        MAX_PROFILE_BUCKET = 4096
 
+        # Extend max profiled bucket by ep_size
+        MAX_PROFILE_BUCKET = 4096 * ep_size
         m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)
 
-        def round_rule(x: int, ep_size_: int) -> int:
-            return min(
-                last_positive_power_of_2(x) // ep_size_, MAX_PROFILE_BUCKET)
+        # 1/ep_size is the expected token fill rate
+        # The fill rate maps the buffer size to the expected token count, i.e. the actual work
+        round_rule = lambda x: min(last_positive_power_of_2(x // ep_size),
+                                   MAX_PROFILE_BUCKET)
 
-        specs = (DynamicTensorSpec(
-            HIDDEN_STATES_IDX,
-            TUNED_DIM,
-            m_values,
-            map_to_tuning_buckets=lambda x: round_rule(x, 1),
-            map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), )
+        specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, TUNED_DIM, m_values,
+                                   round_rule), )
 
         return specs
 
@@ -619,20 +617,18 @@ def get_dynamic_tensor_specs(cls,
                                  ep_size: int) -> Tuple[DynamicTensorSpec, ...]:
         HIDDEN_STATES_IDX = 2
         TUNED_DIM = 0
-        MAX_PROFILE_BUCKET = 4096
 
+        # Extend max profiled bucket by ep_size
+        MAX_PROFILE_BUCKET = 4096 * ep_size
         m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)
 
-        def round_rule(x: int, ep_size_: int) -> int:
-            return min(
-                last_positive_power_of_2(x) // ep_size_, MAX_PROFILE_BUCKET)
+        # 1/ep_size is the expected token fill rate
+        # The fill rate maps the buffer size to the expected token count, i.e. the actual work
+        round_rule = lambda x: min(last_positive_power_of_2(x // ep_size),
+                                   MAX_PROFILE_BUCKET)
 
-        specs = (DynamicTensorSpec(
-            HIDDEN_STATES_IDX,
-            TUNED_DIM,
-            m_values,
-            map_to_tuning_buckets=lambda x: round_rule(x, 1),
-            map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), )
+        specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, TUNED_DIM, m_values,
+                                   round_rule), )
 
         return specs
 
@@ -914,20 +910,18 @@ def get_dynamic_tensor_specs(cls,
                                  ep_size: int) -> Tuple[DynamicTensorSpec, ...]:
         HIDDEN_STATES_IDX = 2
         TUNED_DIM = 0
-        MAX_PROFILE_BUCKET = 4096
 
+        # Extend max profiled bucket by ep_size
+        MAX_PROFILE_BUCKET = 4096 * ep_size
         m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)
 
-        def round_rule(x: int, ep_size_: int) -> int:
-            return min(
-                last_positive_power_of_2(x) // ep_size_, MAX_PROFILE_BUCKET)
+        # 1/ep_size is the expected token fill rate
+        # The fill rate maps the buffer size to the expected token count, i.e. the actual work
+        round_rule = lambda x: min(last_positive_power_of_2(x // ep_size),
+                                   MAX_PROFILE_BUCKET)
 
-        specs = (DynamicTensorSpec(
-            HIDDEN_STATES_IDX,
-            TUNED_DIM,
-            m_values,
-            map_to_tuning_buckets=lambda x: round_rule(x, 1),
-            map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), )
+        specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, TUNED_DIM, m_values,
+                                   round_rule), )
 
         return specs
 
@@ -1214,20 +1208,18 @@ def get_dynamic_tensor_specs(cls,
                                  ep_size: int) -> Tuple[DynamicTensorSpec, ...]:
         HIDDEN_STATES_IDX = 2
         TUNED_DIM = 0
-        MAX_PROFILE_BUCKET = 4096
 
+        # Extend max profiled bucket by ep_size
+        MAX_PROFILE_BUCKET = 4096 * ep_size
         m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)
 
-        def round_rule(x: int, ep_size_: int) -> int:
-            return min(
-                last_positive_power_of_2(x) // ep_size_, MAX_PROFILE_BUCKET)
+        # 1/ep_size is the expected token fill rate
+        # The fill rate maps the buffer size to the expected token count, i.e. the actual work
+        round_rule = lambda x: min(last_positive_power_of_2(x // ep_size),
+                                   MAX_PROFILE_BUCKET)
 
-        specs = (DynamicTensorSpec(
-            HIDDEN_STATES_IDX,
-            TUNED_DIM,
-            m_values,
-            map_to_tuning_buckets=lambda x: round_rule(x, 1),
-            map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), )
+        specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, TUNED_DIM, m_values,
+                                   round_rule), )
 
         return specs
 
@@ -1492,20 +1484,18 @@ def get_dynamic_tensor_specs(cls,
                                  ep_size: int) -> Tuple[DynamicTensorSpec, ...]:
         HIDDEN_STATES_IDX = 2
         TUNED_DIM = 0
-        MAX_PROFILE_BUCKET = 4096
 
+        # Extend max profiled bucket by ep_size
+        MAX_PROFILE_BUCKET = 4096 * ep_size
         m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)
 
-        def round_rule(x: int, ep_size_: int) -> int:
-            return min(
-                last_positive_power_of_2(x) // ep_size_, MAX_PROFILE_BUCKET)
+        # 1/ep_size is the expected token fill rate
+        # The fill rate maps the buffer size to the expected token count, i.e. the actual work
+        round_rule = lambda x: min(last_positive_power_of_2(x // ep_size),
+                                   MAX_PROFILE_BUCKET)
 
-        specs = (DynamicTensorSpec(
-            HIDDEN_STATES_IDX,
-            TUNED_DIM,
-            m_values,
-            map_to_tuning_buckets=lambda x: round_rule(x, 1),
-            map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), )
+        specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, TUNED_DIM, m_values,
+                                   round_rule), )
 
         return specs
 
@@ -1755,20 +1745,18 @@ def get_dynamic_tensor_specs(cls,
                                  ep_size: int) -> Tuple[DynamicTensorSpec, ...]:
         HIDDEN_STATES_IDX = 2
         TUNED_DIM = 0
-        MAX_PROFILE_BUCKET = 4096
 
+        # Extend max profiled bucket by ep_size
+        MAX_PROFILE_BUCKET = 4096 * ep_size
         m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)
 
-        def round_rule(x: int, ep_size_: int) -> int:
-            return min(
-                last_positive_power_of_2(x) // ep_size_, MAX_PROFILE_BUCKET)
+        # 1/ep_size is the expected token fill rate
+        # The fill rate maps the buffer size to the expected token count, i.e. the actual work
+        round_rule = lambda x: min(last_positive_power_of_2(x // ep_size),
+                                   MAX_PROFILE_BUCKET)
 
-        specs = (DynamicTensorSpec(
-            HIDDEN_STATES_IDX,
-            TUNED_DIM,
-            m_values,
-            map_to_tuning_buckets=lambda x: round_rule(x, 1),
-            map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), )
+        specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, TUNED_DIM, m_values,
+                                   round_rule), )
 
         return specs
 
diff --git a/tests/unittest/_torch/misc/test_autotuner.py b/tests/unittest/_torch/misc/test_autotuner.py
@@ -171,82 +171,6 @@ def test_autotuner_cache_basic():
         m //= 2
 
 
-def test_runtime_bucket_mapping():
-    """Test that map_to_runtime_buckets correctly maps runtime sizes to tuning buckets.
-
-    This test demonstrates the distinction between map_to_tuning_buckets and map_to_runtime_buckets:
-    - map_to_tuning_buckets: used during tuning to store cache keys with raw bucket values
-    - map_to_runtime_buckets: used during runtime to map input sizes to tuning buckets
-
-    With inflate_factor=4:
-    - Tuning stores buckets: 1, 2, 4, 8, 16, 32
-    - Runtime input 4 -> maps to bucket 1 via round_rule(4) = 4 // 4 = 1
-    - Runtime input 16 -> maps to bucket 4 via round_rule(16) = 16 // 4 = 4
-
-    In MoE EP, the input buffer size is inflated by factor of the EP size to expect the worse case.
-    Using map_to_runtime_buckets allows us to adjust the expected token count, instead of maximum
-    possible token count.
-    """
-    w = torch.randn(64, 128)
-    tuner = AutoTuner.get()
-    tuner.clear_cache()
-
-    # The factor indicating the input shape is inflated by X
-    def bucket_mapper(x: int, inflate_factor: int) -> int:
-        return x // inflate_factor
-
-    tuning_config = TuningConfig(dynamic_tensor_specs=(DynamicTensorSpec(
-        input_idx=0,
-        dim_idx=0,
-        gen_tuning_buckets=get_power_of_2_num_tokens_buckets,
-        map_to_tuning_buckets=lambda x: bucket_mapper(x, inflate_factor=1),
-        map_to_runtime_buckets=lambda x: bucket_mapper(x, inflate_factor=4)),
-                                                       ), )
-
-    runners = [GemmRunner()]
-
-    # Tune with M=32, which generates buckets 1, 2, 4, 8, 16, 32
-    with autotune():
-        tuner.choose_one("test_runtime_bucket_mapping", runners, tuning_config,
-                         [torch.randn(M, 64), w])
-
-    # Verify cache entries use raw tuning bucket values, not deflated values
-    cache_entries = tuner.profiling_cache.get_specific_custom_op(
-        "test_runtime_bucket_mapping")
-
-    # Extract the first dimension of the first input shape from each cache key
-    assert len(cache_entries) == 6, \
-        f"Expected 6 cache entries (buckets 1, 2, 4, 8, 16, 32), got {len(cache_entries)}"
-
-    # Test runtime mapping: input size should be mapped via map_to_runtime_buckets
-    # to find the correct tuning bucket
-    test_cases = [
-        # size 4 maps to bucket 4//4 = 1, tactic 0 (1 <= M // 2)
-        (4, 1, 0),
-        # size 8 maps to bucket 8//4 = 2, tactic 0 (2 <= M // 2)
-        (8, 2, 0),
-        # size 16 maps to bucket 16//4 = 4, tactic 0 (4 <= M // 2)
-        (16, 4, 0),
-        # size 32 maps to bucket 32//4 = 8, tactic 0 (8 <= M // 2)
-        (32, 8, 0),
-        # size 64 maps to bucket 64//4 = 16, tactic 0 (16 <= M // 2)
-        (64, 16, 0),
-        # size 128 maps to bucket 128//4 = 32, tactic 1 (32 > M // 2)
-        (128, 32, 1),
-        # size 256 maps to bucket 256//4 = 64, tactic -1 (64 > M)
-        (256, 64, -1),
-    ]
-
-    for input_size, expected_bucket, expected_tactic in test_cases:
-        # Verify cache lookup succeeds with the mapped bucket
-        x = torch.randn(input_size, 64)
-        runner, tactic = tuner.choose_one("test_runtime_bucket_mapping",
-                                          runners, tuning_config, [x, w])
-        assert (
-            tactic == expected_tactic
-        ), f"Cache mismatch for input_size={input_size}, expected to map to bucket {expected_tactic} but got {tactic}"
-
-
 def test_autotuner_try_block():
 
     class PartialCrashedRunner(TunableRunner):