From 0c27f02bf74ec54ea45f8d2f85942318c5570613 Mon Sep 17 00:00:00 2001 From: Anthony Chang <27950904+rosenrodt@users.noreply.github.com> Date: Sun, 14 Dec 2025 16:58:24 +0800 Subject: [PATCH 1/3] TRTLLM MoE maps to lower tuning buckets when ep>1 Signed-off-by: Anthony Chang <27950904+rosenrodt@users.noreply.github.com> --- tensorrt_llm/_torch/autotuner.py | 49 ++++- .../custom_ops/trtllm_gen_custom_ops.py | 195 +++++++++++------- tests/unittest/_torch/misc/test_autotuner.py | 71 +++++++ 3 files changed, 229 insertions(+), 86 deletions(-) diff --git a/tensorrt_llm/_torch/autotuner.py b/tensorrt_llm/_torch/autotuner.py index 33ef41af8a9..6d20df8bbaf 100644 --- a/tensorrt_llm/_torch/autotuner.py +++ b/tensorrt_llm/_torch/autotuner.py @@ -49,12 +49,15 @@ class DynamicTensorSpec: input_idx: The index of the input tensor. dim_idx: The index of the dimension to tune. gen_tuning_buckets: A tuple of values to try or a function generating values. - map_to_tuning_buckets: A function to map dimensions to valid values during inference. + map_to_tuning_buckets: A function to map dimensions to valid values during tuning. + map_to_runtime_buckets: A function to map dimensions to valid values during inference. + If None, use map_to_tuning_buckets. """ input_idx: int dim_idx: int gen_tuning_buckets: Union[Tuple[int], Callable] = () map_to_tuning_buckets: Callable = lambda x: x + map_to_runtime_buckets: Optional[Callable] = None @dataclass(slots=True, unsafe_hash=True) @@ -392,6 +395,7 @@ def search_cache( runners: List[TunableRunner], input_shapes: Tuple[torch.Size], tuning_config: TuningConfig, + use_tuning_mapping: bool = False, ) -> Tuple[bool, int, int, Dict[str, Any], OptimizationProfile]: """Search for cached profiling results matching the current configuration. @@ -399,6 +403,8 @@ def search_cache( custom_op (str): The name of the custom operation to be tuned runners (List[TunableRunner]): List of candidate implementations to profile profile (OptimizationProfile): Optimization profile + use_tuning_mapping: If True, use map_to_tuning_buckets for cache key. + If False, use map_to_runtime_buckets for runtime cache lookups. 
Returns: A tuple containing: @@ -406,8 +412,10 @@ def search_cache( runner_id is the index in the current runners list """ for idx, r in enumerate(runners): - if (cache_key := self.get_cache_key(custom_op, r, input_shapes, - tuning_config)) in self.cache: + if (cache_key := + self.get_cache_key(custom_op, r, input_shapes, + tuning_config, + use_tuning_mapping)) in self.cache: # Return the current index in runners list, not the cached runner_id cached_runner_id, tactic, min_time = self.cache[cache_key] return True, idx, tactic, min_time @@ -420,6 +428,7 @@ def get_cache_key( runner: TunableRunner, input_shapes: Tuple[torch.Size], tuning_config: TuningConfig, + use_tuning_mapping: bool = False, ) -> Tuple: return ( custom_op, @@ -430,6 +439,7 @@ def get_cache_key( tuning_config.dynamic_tensor_specs, tuning_config.constraint_specs, tuning_config.tune_max_num_tokens, + use_tuning_mapping, ), ) @@ -841,7 +851,12 @@ def choose_one( for p in profiles: tensors = self._prepare_input_tensors(p, inputs) is_cache_hit, *_ = self.profiling_cache.search_cache( - custom_op, runners, p.get_opt_shapes(), tuning_config) + custom_op, + runners, + p.get_opt_shapes(), + tuning_config, + use_tuning_mapping=True, + ) if not is_cache_hit: # Initialize runner and tactic as None in case of no valid tactic or runners are found best_runner_id, best_tactic, min_time, has_tuning_failure_occurred = self._profile_runners( @@ -928,8 +943,11 @@ def _profile_runners( # Record the failed profiling combinations self.stats.failed_profiling_count[custom_op].add( self.profiling_cache.get_cache_key( - custom_op, runner, profile.get_opt_shapes(), - tuning_config)) + custom_op, + runner, + profile.get_opt_shapes(), + tuning_config, + use_tuning_mapping=True)) # Set time_measured to inf to notify the failure of the tactic. This can happen when `get_valid_tactics` mistakenly return wrong tactics # or some runtime error occurs during profiling. @@ -942,8 +960,11 @@ def _profile_runners( if best_runner_id is not None: # At least one valid (runner, tactic) pair is found cache_key = self.profiling_cache.get_cache_key( - custom_op, runners[best_runner_id], profile.get_opt_shapes(), - tuning_config) + custom_op, + runners[best_runner_id], + profile.get_opt_shapes(), + tuning_config, + use_tuning_mapping=True) self._debug_logger( f"[Autotuner] Profiling runner={runners[best_runner_id]}, tactic={best_tactic} for cache_key={cache_key}." @@ -1164,6 +1185,7 @@ def _find_nearest_profile( dynamic_tensor_specs: Tuple[DynamicTensorSpec, ...], constraint_specs: Tuple[ConstraintSpec, ...], tune_max_num_tokens: int = None, + use_tuning_mapping: bool = False, ) -> Tuple: """Find the nearest optimization profile for given inputs User can define their own nearest profile generation method to reduce the host overhead. @@ -1171,6 +1193,8 @@ def _find_nearest_profile( Args: shapes: Tuple of input tensor shapes tuning_config: Tuning configuration + use_tuning_mapping: If True, use map_to_tuning_buckets to store tuning cache. + If False, use map_to_runtime_buckets for runtime cache lookups. 
Return: Tuple: A tuple containing: @@ -1180,9 +1204,12 @@ def _find_nearest_profile( base_profile = list(list(shape) for shape in shapes) for spec in dynamic_tensor_specs: - base_profile[spec.input_idx][ - spec.dim_idx] = spec.map_to_tuning_buckets( - base_profile[spec.input_idx][spec.dim_idx]) + + bucket_mapper = spec.map_to_tuning_buckets + if not use_tuning_mapping and spec.map_to_runtime_buckets is not None: + bucket_mapper = spec.map_to_runtime_buckets + base_profile[spec.input_idx][spec.dim_idx] = bucket_mapper( + base_profile[spec.input_idx][spec.dim_idx]) if tune_max_num_tokens is not None: base_profile[spec.input_idx][spec.dim_idx] = min( diff --git a/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py b/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py index a8236d88fcf..7755f2aba65 100644 --- a/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py @@ -207,8 +207,8 @@ def __init__(self, num_experts: int, top_k: int, n_group: Optional[int], self.routing_method_type = routing_method_type self.do_finalize = do_finalize - FP4BlockScaleMoERunner.tuning_config = FP4BlockScaleMoERunner.get_tuning_config( - ) + self.tuning_config = FP4BlockScaleMoERunner.get_tuning_config( + self.num_experts // self.local_num_experts) # The unique_id is used by the autotuner to get the cache key, so we hash on members # that influence tactic validity here. e.g. we are tuning FC1 and FC2 so the routing type does not matter @@ -269,17 +269,24 @@ def get_valid_tactics(self, inputs: List[torch.Tensor], return tactics @classmethod - def get_dynamic_tensor_specs(cls) -> Tuple[DynamicTensorSpec, ...]: + def get_dynamic_tensor_specs(cls, + ep_size: int) -> Tuple[DynamicTensorSpec, ...]: HIDDEN_STATES_IDX = 2 TUNED_DIM = 0 MAX_PROFILE_BUCKET = 4096 m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) - round_rule = lambda x: min(last_positive_power_of_2(x), - MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, TUNED_DIM, m_values, - round_rule), ) + def round_rule(x: int, ep_size_: int) -> int: + value = last_positive_power_of_2(x) // ep_size_ + return min(max(1, value), MAX_PROFILE_BUCKET) + + specs = (DynamicTensorSpec( + HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=lambda x: round_rule(x, 1), + map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) return specs @@ -340,9 +347,9 @@ def _constrain_fp4_linear_layout(shapes: Tuple[torch.Size]) -> int: @classmethod @lru_cache(maxsize=None) - def get_tuning_config(cls) -> TuningConfig: + def get_tuning_config(cls, ep_size: int) -> TuningConfig: - dynamic_tensor_specs = cls.get_dynamic_tensor_specs() + dynamic_tensor_specs = cls.get_dynamic_tensor_specs(ep_size) constraint_specs = cls.get_constraint_specs() tuning_config = TuningConfig(dynamic_tensor_specs=dynamic_tensor_specs, @@ -399,7 +406,7 @@ def fp4_block_scale_moe_runner( topk_ids=topk_ids, hidden_states=hidden_states, routing_logits=routing_logits, - base_tuning_config=FP4BlockScaleMoERunner.get_tuning_config(), + base_tuning_config=kernel_runner.tuning_config, top_k=top_k, num_experts=num_experts, n_group=n_group, @@ -550,8 +557,8 @@ def __init__( self.routed_scaling_factor = routed_scaling_factor self.routing_method_type = routing_method_type - FP8BlockScaleMoERunner.tuning_config = FP8BlockScaleMoERunner.get_tuning_config( - ) + self.tuning_config = FP8BlockScaleMoERunner.get_tuning_config( + self.num_experts // self.local_num_experts) # The unique_id is used by the 
autotuner to get the cache key, so we hash on members # that influence tactic validity here. e.g. we are tuning FC1 and FC2 so the routing @@ -608,18 +615,24 @@ def get_valid_tactics(self, inputs: List[torch.Tensor], return tactics @classmethod - def get_dynamic_tensor_specs(cls) -> Tuple[DynamicTensorSpec, ...]: + def get_dynamic_tensor_specs(cls, + ep_size: int) -> Tuple[DynamicTensorSpec, ...]: HIDDEN_STATES_IDX = 2 TUNED_DIM = 0 - MAX_PROFILE_BUCKET = 4096 m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) - round_rule = lambda x: min(last_positive_power_of_2(x), - MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, TUNED_DIM, m_values, - round_rule), ) + def round_rule(x: int, ep_size_: int) -> int: + value = last_positive_power_of_2(x) // ep_size_ + return min(max(1, value), MAX_PROFILE_BUCKET) + + specs = (DynamicTensorSpec( + HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=lambda x: round_rule(x, 1), + map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) return specs @@ -662,9 +675,9 @@ def _constrain_to_num_tokens(shapes: Tuple[torch.Size]) -> int: @classmethod @lru_cache(maxsize=None) - def get_tuning_config(cls) -> TuningConfig: + def get_tuning_config(cls, ep_size: int) -> TuningConfig: - dynamic_tensor_specs = cls.get_dynamic_tensor_specs() + dynamic_tensor_specs = cls.get_dynamic_tensor_specs(ep_size) constraint_specs = cls.get_constraint_specs() tuning_config = TuningConfig(dynamic_tensor_specs=dynamic_tensor_specs, @@ -696,19 +709,17 @@ def fp8_block_scale_moe_runner( topk_ids: Optional[torch.Tensor] = None) -> torch.Tensor: tuner = AutoTuner.get() - kernel_runners = [ - FP8BlockScaleMoERunner( - num_experts, - top_k, - n_group, - topk_group, - intermediate_size, - local_expert_offset, - local_num_experts, - routed_scaling_factor, - routing_method_type, - ) - ] + kernel_runner = FP8BlockScaleMoERunner( + num_experts, + top_k, + n_group, + topk_group, + intermediate_size, + local_expert_offset, + local_num_experts, + routed_scaling_factor, + routing_method_type, + ) # Prepare dummy topk tensors and hook for AutoTuner profiling routing_logits_for_tuner, topk_weights_for_tuner, topk_ids_for_tuner, tuning_config_with_hook = \ @@ -718,7 +729,7 @@ def fp8_block_scale_moe_runner( topk_ids=topk_ids, hidden_states=hidden_states, routing_logits=routing_logits, - base_tuning_config=FP8BlockScaleMoERunner.get_tuning_config(), + base_tuning_config=kernel_runner.tuning_config, top_k=top_k, num_experts=num_experts, n_group=n_group, @@ -742,7 +753,7 @@ def fp8_block_scale_moe_runner( kernel_runner, best_tactic = tuner.choose_one( "trtllm::fp8_block_scale_moe_runner", - kernel_runners, + [kernel_runner], tuning_config_with_hook, input_tensors_for_tuner, ) @@ -827,8 +838,8 @@ def __init__(self, num_experts: int, top_k: int, n_group: Optional[int], self.routing_method_type = routing_method_type self.act_type = act_type - MxE4m3MxE2m1BlockScaleMoERunner.tuning_config = MxE4m3MxE2m1BlockScaleMoERunner.get_tuning_config( - ) + self.tuning_config = MxE4m3MxE2m1BlockScaleMoERunner.get_tuning_config( + self.num_experts // self.local_num_experts) # The unique_id is used by the autotuner to get the cache key, so we hash on members # that influence tactic validity here. e.g. 
we are tuning FC1 and FC2 so the routing @@ -899,15 +910,24 @@ def get_valid_tactics(self, inputs: List[torch.Tensor], return tactics @classmethod - def get_dynamic_tensor_specs(cls) -> Tuple[DynamicTensorSpec, ...]: + def get_dynamic_tensor_specs(cls, + ep_size: int) -> Tuple[DynamicTensorSpec, ...]: HIDDEN_STATES_IDX = 2 TUNED_DIM = 0 + MAX_PROFILE_BUCKET = 4096 - m_values = get_last_power_of_2_num_tokens_buckets(4096) - round_rule = lambda x: min(last_positive_power_of_2(x), 4096) + m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) + + def round_rule(x: int, ep_size_: int) -> int: + value = last_positive_power_of_2(x) // ep_size_ + return min(max(1, value), MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, TUNED_DIM, m_values, - round_rule), ) + specs = (DynamicTensorSpec( + HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=lambda x: round_rule(x, 1), + map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) return specs @@ -961,9 +981,9 @@ def _constrain_routing_logits(shapes: Tuple[torch.Size]) -> int: @classmethod @lru_cache(maxsize=None) - def get_tuning_config(cls) -> TuningConfig: + def get_tuning_config(cls, ep_size: int) -> TuningConfig: - dynamic_tensor_specs = cls.get_dynamic_tensor_specs() + dynamic_tensor_specs = cls.get_dynamic_tensor_specs(ep_size) constraint_specs = cls.get_constraint_specs() tuning_config = TuningConfig(dynamic_tensor_specs=dynamic_tensor_specs, @@ -1028,7 +1048,7 @@ def mxe4m3_mxe2m1_block_scale_moe_runner( topk_ids=topk_ids, hidden_states=hidden_states, routing_logits=routing_logits, - base_tuning_config=MxE4m3MxE2m1BlockScaleMoERunner.get_tuning_config(), + base_tuning_config=kernel_runner.tuning_config, top_k=top_k, num_experts=num_experts, n_group=n_group, @@ -1118,8 +1138,8 @@ def __init__(self, num_experts: int, top_k: int, n_group: Optional[int], self.routing_method_type = routing_method_type self.act_type = act_type - E4m3MxE2m1BlockScaleMoERunner.tuning_config = E4m3MxE2m1BlockScaleMoERunner.get_tuning_config( - ) + self.tuning_config = E4m3MxE2m1BlockScaleMoERunner.get_tuning_config( + self.num_experts // self.local_num_experts) # The unique_id is used by the autotuner to get the cache key, so we hash on members # that influence tactic validity here. e.g. 
we are tuning FC1 and FC2 so the routing @@ -1190,15 +1210,24 @@ def get_valid_tactics(self, inputs: List[torch.Tensor], return tactics @classmethod - def get_dynamic_tensor_specs(cls) -> Tuple[DynamicTensorSpec, ...]: + def get_dynamic_tensor_specs(cls, + ep_size: int) -> Tuple[DynamicTensorSpec, ...]: HIDDEN_STATES_IDX = 2 TUNED_DIM = 0 + MAX_PROFILE_BUCKET = 4096 - m_values = get_last_power_of_2_num_tokens_buckets(4096) - round_rule = lambda x: min(last_positive_power_of_2(x), 4096) + m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) + + def round_rule(x: int, ep_size_: int) -> int: + value = last_positive_power_of_2(x) // ep_size_ + return min(max(1, value), MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, TUNED_DIM, m_values, - round_rule), ) + specs = (DynamicTensorSpec( + HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=lambda x: round_rule(x, 1), + map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) return specs @@ -1232,9 +1261,9 @@ def _constrain_routing_logits(shapes: Tuple[torch.Size]) -> int: @classmethod @lru_cache(maxsize=None) - def get_tuning_config(cls) -> TuningConfig: + def get_tuning_config(cls, ep_size: int) -> TuningConfig: - dynamic_tensor_specs = cls.get_dynamic_tensor_specs() + dynamic_tensor_specs = cls.get_dynamic_tensor_specs(ep_size) constraint_specs = cls.get_constraint_specs() tuning_config = TuningConfig(dynamic_tensor_specs=dynamic_tensor_specs, @@ -1300,7 +1329,7 @@ def e4m3_mxe2m1_block_scale_moe_runner( topk_ids=topk_ids, hidden_states=hidden_states, routing_logits=routing_logits, - base_tuning_config=E4m3MxE2m1BlockScaleMoERunner.get_tuning_config(), + base_tuning_config=kernel_runner.tuning_config, top_k=top_k, num_experts=num_experts, n_group=n_group, @@ -1389,8 +1418,8 @@ def __init__(self, num_experts: int, top_k: int, n_group: Optional[int], self.routing_method_type = routing_method_type self.act_type = act_type - Bf16MxE2m1BlockScaleMoERunner.tuning_config = Bf16MxE2m1BlockScaleMoERunner.get_tuning_config( - ) + self.tuning_config = Bf16MxE2m1BlockScaleMoERunner.get_tuning_config( + self.num_experts // self.local_num_experts) # The unique_id is used by the autotuner to get the cache key, so we hash on members # that influence tactic validity here. e.g. 
we are tuning FC1 and FC2 so the routing @@ -1459,15 +1488,24 @@ def get_valid_tactics(self, inputs: List[torch.Tensor], return tactics @classmethod - def get_dynamic_tensor_specs(cls) -> Tuple[DynamicTensorSpec, ...]: + def get_dynamic_tensor_specs(cls, + ep_size: int) -> Tuple[DynamicTensorSpec, ...]: HIDDEN_STATES_IDX = 2 TUNED_DIM = 0 + MAX_PROFILE_BUCKET = 4096 - m_values = get_last_power_of_2_num_tokens_buckets(4096) - round_rule = lambda x: min(last_positive_power_of_2(x), 4096) + m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) + + def round_rule(x: int, ep_size_: int) -> int: + value = last_positive_power_of_2(x) // ep_size_ + return min(max(1, value), MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, TUNED_DIM, m_values, - round_rule), ) + specs = (DynamicTensorSpec( + HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=lambda x: round_rule(x, 1), + map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) return specs @@ -1501,9 +1539,9 @@ def _constrain_routing_logits(shapes: Tuple[torch.Size]) -> int: @classmethod @lru_cache(maxsize=None) - def get_tuning_config(cls) -> TuningConfig: + def get_tuning_config(cls, ep_size: int) -> TuningConfig: - dynamic_tensor_specs = cls.get_dynamic_tensor_specs() + dynamic_tensor_specs = cls.get_dynamic_tensor_specs(ep_size) constraint_specs = cls.get_constraint_specs() tuning_config = TuningConfig(dynamic_tensor_specs=dynamic_tensor_specs, @@ -1566,7 +1604,7 @@ def bf16_mxe2m1_block_scale_moe_runner( topk_ids=topk_ids, hidden_states=hidden_states, routing_logits=routing_logits, - base_tuning_config=Bf16MxE2m1BlockScaleMoERunner.get_tuning_config(), + base_tuning_config=kernel_runner.tuning_config, top_k=top_k, num_experts=num_experts, n_group=n_group, @@ -1650,8 +1688,8 @@ def __init__(self, num_experts: int, top_k: int, n_group: Optional[int], self.do_finalize = do_finalize self.act_type = act_type - FP8FP4BlockScaleMoERunner.tuning_config = FP8FP4BlockScaleMoERunner.get_tuning_config( - ) + self.tuning_config = FP8FP4BlockScaleMoERunner.get_tuning_config( + self.num_experts // self.local_num_experts) def unique_id(self): return ( @@ -1713,17 +1751,24 @@ def get_valid_tactics(self, inputs: List[torch.Tensor], return tactics @classmethod - def get_dynamic_tensor_specs(cls) -> Tuple[DynamicTensorSpec, ...]: + def get_dynamic_tensor_specs(cls, + ep_size: int) -> Tuple[DynamicTensorSpec, ...]: HIDDEN_STATES_IDX = 2 TUNED_DIM = 0 MAX_PROFILE_BUCKET = 4096 m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) - round_rule = lambda x: min(last_positive_power_of_2(x), - MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, TUNED_DIM, m_values, - round_rule), ) + def round_rule(x: int, ep_size_: int) -> int: + value = last_positive_power_of_2(x) // ep_size_ + return min(max(1, value), MAX_PROFILE_BUCKET) + + specs = (DynamicTensorSpec( + HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=lambda x: round_rule(x, 1), + map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) return specs @@ -1759,9 +1804,9 @@ def _constrain_to_num_tokens(shapes: Tuple[torch.Size]) -> int: @classmethod @lru_cache(maxsize=None) - def get_tuning_config(cls) -> TuningConfig: + def get_tuning_config(cls, ep_size: int) -> TuningConfig: - dynamic_tensor_specs = cls.get_dynamic_tensor_specs() + dynamic_tensor_specs = cls.get_dynamic_tensor_specs(ep_size) constraint_specs = cls.get_constraint_specs() tuning_config = TuningConfig(dynamic_tensor_specs=dynamic_tensor_specs, 
@@ -1820,7 +1865,7 @@ def fp8_fp4_block_scale_moe_runner( topk_ids=topk_ids, hidden_states=hidden_states, routing_logits=routing_logits, - base_tuning_config=FP8FP4BlockScaleMoERunner.get_tuning_config(), + base_tuning_config=kernel_runner.tuning_config, top_k=top_k, num_experts=num_experts, n_group=n_group, diff --git a/tests/unittest/_torch/misc/test_autotuner.py b/tests/unittest/_torch/misc/test_autotuner.py index a6116d544f2..da760ac5e65 100644 --- a/tests/unittest/_torch/misc/test_autotuner.py +++ b/tests/unittest/_torch/misc/test_autotuner.py @@ -171,6 +171,77 @@ def test_autotuner_cache_basic(): m //= 2 +def test_runtime_bucket_mapping(): + """Test that map_to_runtime_buckets correctly maps runtime sizes to tuning buckets. + + This test demonstrates the distinction between map_to_tuning_buckets and map_to_runtime_buckets: + - map_to_tuning_buckets: used during tuning to store cache keys with raw bucket values + - map_to_runtime_buckets: used during runtime to map input sizes to tuning buckets + + With sparsity=0.25, the buffer contains 25% actual work: + - Tuning stores buckets: 1, 2, 4, 8, 16, 32 + - Runtime buffer 4 -> maps to bucket int(4 * 0.25) = 1 + - Runtime buffer 16 -> maps to bucket int(16 * 0.25) = 4 + + In MoE EP, the input buffer is allocated for worst-case but sparsely filled. + Using map_to_runtime_buckets allows us to map buffer size to actual work size. + """ + w = torch.randn(64, 128) + tuner = AutoTuner.get() + tuner.clear_cache() + + # Sparsity indicates the fraction of buffer containing valid work + def bucket_mapper(x: int, sparsity: float) -> int: + return int(x * sparsity) + + tuning_config = TuningConfig(dynamic_tensor_specs=(DynamicTensorSpec( + input_idx=0, + dim_idx=0, + gen_tuning_buckets=get_power_of_2_num_tokens_buckets(M), + map_to_tuning_buckets=lambda x: bucket_mapper(x, sparsity=1.0), + map_to_runtime_buckets=lambda x: bucket_mapper(x, sparsity=0.25)), ), ) + + with autotune(): + tuner.choose_one("test_runtime_bucket_mapping", [GemmRunner()], + tuning_config, [torch.randn(1, 64), w]) + + # Verify cache entries use raw tuning bucket values, not deflated values + cache_entries = tuner.profiling_cache.get_specific_custom_op( + "test_runtime_bucket_mapping") + + # Extract the first dimension of the first input shape from each cache key + assert len(cache_entries) == len(tuning_config.dynamic_tensor_specs[0].gen_tuning_buckets), \ + f"Expected len(({len(tuning_config.dynamic_tensor_specs[0].gen_tuning_buckets)}) cache entries, got len({(cache_entries)})" + + # Test runtime mapping: buffer size is mapped via map_to_runtime_buckets + # to find the correct tuning bucket based on actual work size + test_cases = [ + # size 4 -> valid work size (4*0.25)=1, tactic 0 since 1 <= M//2 + (4, 1, 0), + # size 8 -> valid work size (8*0.25)=2, tactic 0 since 2 <= M//2 + (8, 2, 0), + # size 16 -> valid work size (16*0.25)=4, tactic 0 since 4 <= M//2 + (16, 4, 0), + # size 32 -> valid work size (32*0.25)=8, tactic 0 since 8 <= M//2 + (32, 8, 0), + # size 64 -> valid work size (64*0.25)=16, tactic 0 since 16 <= M//2 + (64, 16, 0), + # size 128 -> valid work size (128*0.25)=32, tactic 1 since 32 > M//2 + (128, 32, 1), + # size 256 -> valid work size (256*0.25)=64, tactic -1 since 64 > M + (256, 64, -1), + ] + + for buffer_size, valid_size, expected_tactic in test_cases: + # Verify cache lookup succeeds with the mapped bucket + x = torch.randn(buffer_size, 64) + runner, tactic = tuner.choose_one("test_runtime_bucket_mapping", + [GemmRunner()], tuning_config, [x, w]) + 
assert ( + tactic == expected_tactic + ), f"buffer size={buffer_size} -> valid work size={valid_size}, expected tactic {expected_tactic} but got {tactic}" + + def test_autotuner_try_block(): class PartialCrashedRunner(TunableRunner): From 4121b670051ec0aa7ba52b0f606b2f0197fb37cc Mon Sep 17 00:00:00 2001 From: Anthony Chang <27950904+rosenrodt@users.noreply.github.com> Date: Fri, 19 Dec 2025 11:47:53 +0800 Subject: [PATCH 2/3] [breaking] map_to_tuning_buckets maps input during inference only Signed-off-by: Anthony Chang <27950904+rosenrodt@users.noreply.github.com> --- tensorrt_llm/_torch/autotuner.py | 57 +++++++------ .../custom_ops/trtllm_gen_custom_ops.py | 84 ++++++++----------- tests/unittest/_torch/misc/test_autotuner.py | 43 +++++----- 3 files changed, 86 insertions(+), 98 deletions(-) diff --git a/tensorrt_llm/_torch/autotuner.py b/tensorrt_llm/_torch/autotuner.py index 6d20df8bbaf..f2cc354b641 100644 --- a/tensorrt_llm/_torch/autotuner.py +++ b/tensorrt_llm/_torch/autotuner.py @@ -49,15 +49,12 @@ class DynamicTensorSpec: input_idx: The index of the input tensor. dim_idx: The index of the dimension to tune. gen_tuning_buckets: A tuple of values to try or a function generating values. - map_to_tuning_buckets: A function to map dimensions to valid values during tuning. - map_to_runtime_buckets: A function to map dimensions to valid values during inference. - If None, use map_to_tuning_buckets. + map_to_tuning_buckets: A function to map dimensions to tuning buckets during inference. """ input_idx: int dim_idx: int gen_tuning_buckets: Union[Tuple[int], Callable] = () map_to_tuning_buckets: Callable = lambda x: x - map_to_runtime_buckets: Optional[Callable] = None @dataclass(slots=True, unsafe_hash=True) @@ -84,7 +81,7 @@ class TuningConfig: should be tuned to optimize performance. Each spec defines: - Which input tensor dimension is dynamic - How to generate tuning values - - How to map dimensions to valid values during inference + - How to map dimensions to tuning values during inference Example: >>> config = TuningConfig( @@ -395,7 +392,7 @@ def search_cache( runners: List[TunableRunner], input_shapes: Tuple[torch.Size], tuning_config: TuningConfig, - use_tuning_mapping: bool = False, + apply_map_to_tuning_buckets: bool = True, ) -> Tuple[bool, int, int, Dict[str, Any], OptimizationProfile]: """Search for cached profiling results matching the current configuration. @@ -403,8 +400,8 @@ def search_cache( custom_op (str): The name of the custom operation to be tuned runners (List[TunableRunner]): List of candidate implementations to profile profile (OptimizationProfile): Optimization profile - use_tuning_mapping: If True, use map_to_tuning_buckets for cache key. - If False, use map_to_runtime_buckets for runtime cache lookups. + apply_map_to_tuning_buckets: If True, apply map_to_tuning_buckets for runtime cache lookups. + If False, use raw bucket values for tuning cache storage. 
Returns: A tuple containing: @@ -412,10 +409,9 @@ def search_cache( runner_id is the index in the current runners list """ for idx, r in enumerate(runners): - if (cache_key := - self.get_cache_key(custom_op, r, input_shapes, - tuning_config, - use_tuning_mapping)) in self.cache: + if (cache_key := self.get_cache_key( + custom_op, r, input_shapes, tuning_config, + apply_map_to_tuning_buckets)) in self.cache: # Return the current index in runners list, not the cached runner_id cached_runner_id, tactic, min_time = self.cache[cache_key] return True, idx, tactic, min_time @@ -428,7 +424,7 @@ def get_cache_key( runner: TunableRunner, input_shapes: Tuple[torch.Size], tuning_config: TuningConfig, - use_tuning_mapping: bool = False, + apply_map_to_tuning_buckets: bool = True, ) -> Tuple: return ( custom_op, @@ -439,7 +435,7 @@ def get_cache_key( tuning_config.dynamic_tensor_specs, tuning_config.constraint_specs, tuning_config.tune_max_num_tokens, - use_tuning_mapping, + apply_map_to_tuning_buckets, ), ) @@ -805,7 +801,11 @@ def choose_one( input_shapes = tuple(self._get_input_sizes(inputs)) is_cache_hit, best_runner_id, best_tactic, min_time = self.profiling_cache.search_cache( - custom_op, runners, input_shapes, tuning_config) + custom_op, + runners, + input_shapes, + tuning_config, + apply_map_to_tuning_buckets=True) # Early return if it's not tuning, use cache found one or fallback one if not self.is_tuning_mode: @@ -855,7 +855,7 @@ def choose_one( runners, p.get_opt_shapes(), tuning_config, - use_tuning_mapping=True, + apply_map_to_tuning_buckets=False, ) if not is_cache_hit: # Initialize runner and tactic as None in case of no valid tactic or runners are found @@ -947,7 +947,7 @@ def _profile_runners( runner, profile.get_opt_shapes(), tuning_config, - use_tuning_mapping=True)) + apply_map_to_tuning_buckets=False)) # Set time_measured to inf to notify the failure of the tactic. This can happen when `get_valid_tactics` mistakenly return wrong tactics # or some runtime error occurs during profiling. @@ -964,7 +964,7 @@ def _profile_runners( runners[best_runner_id], profile.get_opt_shapes(), tuning_config, - use_tuning_mapping=True) + apply_map_to_tuning_buckets=False) self._debug_logger( f"[Autotuner] Profiling runner={runners[best_runner_id]}, tactic={best_tactic} for cache_key={cache_key}." @@ -1141,8 +1141,7 @@ def _optimization_profiles( # Add the current input value as one of the opt values opt_shapes = set(opt_shapes) opt_shapes.add( - spec.map_to_tuning_buckets( - base_profile.shapes[spec.input_idx][spec.dim_idx].val)) + base_profile.shapes[spec.input_idx][spec.dim_idx].val) opt_shapes = sorted(list(opt_shapes)) opt_shapes_max = tuple(opt_shapes[1:]) + (float('inf'), ) opt_shapes_max = { @@ -1185,7 +1184,7 @@ def _find_nearest_profile( dynamic_tensor_specs: Tuple[DynamicTensorSpec, ...], constraint_specs: Tuple[ConstraintSpec, ...], tune_max_num_tokens: int = None, - use_tuning_mapping: bool = False, + apply_map_to_tuning_buckets: bool = True, ) -> Tuple: """Find the nearest optimization profile for given inputs User can define their own nearest profile generation method to reduce the host overhead. @@ -1193,8 +1192,8 @@ def _find_nearest_profile( Args: shapes: Tuple of input tensor shapes tuning_config: Tuning configuration - use_tuning_mapping: If True, use map_to_tuning_buckets to store tuning cache. - If False, use map_to_runtime_buckets for runtime cache lookups. + apply_map_to_tuning_buckets: If True, apply map_to_tuning_buckets for runtime cache lookups. 
+ If False, use raw bucket values for tuning cache storage. Return: Tuple: A tuple containing: @@ -1204,12 +1203,12 @@ def _find_nearest_profile( base_profile = list(list(shape) for shape in shapes) for spec in dynamic_tensor_specs: - - bucket_mapper = spec.map_to_tuning_buckets - if not use_tuning_mapping and spec.map_to_runtime_buckets is not None: - bucket_mapper = spec.map_to_runtime_buckets - base_profile[spec.input_idx][spec.dim_idx] = bucket_mapper( - base_profile[spec.input_idx][spec.dim_idx]) + # During runtime: apply map_to_tuning_buckets to map input to bucket + # During tuning: no mapper, use raw bucket value + if apply_map_to_tuning_buckets: + base_profile[spec.input_idx][ + spec.dim_idx] = spec.map_to_tuning_buckets( + base_profile[spec.input_idx][spec.dim_idx]) if tune_max_num_tokens is not None: base_profile[spec.input_idx][spec.dim_idx] = min( diff --git a/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py b/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py index 7755f2aba65..7802ba4243a 100644 --- a/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py @@ -277,16 +277,14 @@ def get_dynamic_tensor_specs(cls, m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) - def round_rule(x: int, ep_size_: int) -> int: - value = last_positive_power_of_2(x) // ep_size_ + def round_rule(x: int) -> int: + value = last_positive_power_of_2(x) // ep_size return min(max(1, value), MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec( - HIDDEN_STATES_IDX, - TUNED_DIM, - m_values, - map_to_tuning_buckets=lambda x: round_rule(x, 1), - map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) + specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=round_rule), ) return specs @@ -623,16 +621,14 @@ def get_dynamic_tensor_specs(cls, m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) - def round_rule(x: int, ep_size_: int) -> int: - value = last_positive_power_of_2(x) // ep_size_ + def round_rule(x: int) -> int: + value = last_positive_power_of_2(x) // ep_size return min(max(1, value), MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec( - HIDDEN_STATES_IDX, - TUNED_DIM, - m_values, - map_to_tuning_buckets=lambda x: round_rule(x, 1), - map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) + specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=round_rule), ) return specs @@ -918,16 +914,14 @@ def get_dynamic_tensor_specs(cls, m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) - def round_rule(x: int, ep_size_: int) -> int: - value = last_positive_power_of_2(x) // ep_size_ + def round_rule(x: int) -> int: + value = last_positive_power_of_2(x) // ep_size return min(max(1, value), MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec( - HIDDEN_STATES_IDX, - TUNED_DIM, - m_values, - map_to_tuning_buckets=lambda x: round_rule(x, 1), - map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) + specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=round_rule), ) return specs @@ -1218,16 +1212,14 @@ def get_dynamic_tensor_specs(cls, m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) - def round_rule(x: int, ep_size_: int) -> int: - value = last_positive_power_of_2(x) // ep_size_ + def round_rule(x: int) -> int: + value = last_positive_power_of_2(x) // ep_size return min(max(1, value), MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec( - 
HIDDEN_STATES_IDX, - TUNED_DIM, - m_values, - map_to_tuning_buckets=lambda x: round_rule(x, 1), - map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) + specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=round_rule), ) return specs @@ -1496,16 +1488,14 @@ def get_dynamic_tensor_specs(cls, m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) - def round_rule(x: int, ep_size_: int) -> int: - value = last_positive_power_of_2(x) // ep_size_ + def round_rule(x: int) -> int: + value = last_positive_power_of_2(x) // ep_size return min(max(1, value), MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec( - HIDDEN_STATES_IDX, - TUNED_DIM, - m_values, - map_to_tuning_buckets=lambda x: round_rule(x, 1), - map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) + specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=round_rule), ) return specs @@ -1759,16 +1749,14 @@ def get_dynamic_tensor_specs(cls, m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET) - def round_rule(x: int, ep_size_: int) -> int: - value = last_positive_power_of_2(x) // ep_size_ + def round_rule(x: int) -> int: + value = last_positive_power_of_2(x) // ep_size return min(max(1, value), MAX_PROFILE_BUCKET) - specs = (DynamicTensorSpec( - HIDDEN_STATES_IDX, - TUNED_DIM, - m_values, - map_to_tuning_buckets=lambda x: round_rule(x, 1), - map_to_runtime_buckets=lambda x: round_rule(x, ep_size)), ) + specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, + TUNED_DIM, + m_values, + map_to_tuning_buckets=round_rule), ) return specs diff --git a/tests/unittest/_torch/misc/test_autotuner.py b/tests/unittest/_torch/misc/test_autotuner.py index da760ac5e65..fcbf6550bdf 100644 --- a/tests/unittest/_torch/misc/test_autotuner.py +++ b/tests/unittest/_torch/misc/test_autotuner.py @@ -38,18 +38,21 @@ def test_multi_dynamic_dims(): tuner = autotuner.AutoTuner() x = torch.rand([5, 1024]) - w = torch.rand([7, 19]) + w = torch.rand([7, 9]) dynamic_tensor_specs = ( DynamicTensorSpec(0, 0, [1, 3, 5]), DynamicTensorSpec(0, 1, [16, 24, 1024]), - DynamicTensorSpec(1, 1, [3, 7, 9], lambda x: x // 2), + # map_to_tuning_buckets is only applied at runtime, not during tuning + DynamicTensorSpec(1, + 1, [3, 7, 9], + map_to_tuning_buckets=lambda x: x // 2), ) profiles = tuner._optimization_profiles( tuning_config=TuningConfig(dynamic_tensor_specs=dynamic_tensor_specs), inputs=[x, w]) # choice(0, 0) * choice(0, 1) * choice(1, 1) - # 3 * 3 * 3 = 27, because 19 is mapped to 9 and already inside the bucket + # 3 * 3 * 3 = 27, input value 9 is already inside the bucket assert len(profiles) == 27 sample_0 = OptimizationProfile(shapes=[[ DynamicDim(min=1, opt=1, max=3), @@ -171,47 +174,45 @@ def test_autotuner_cache_basic(): m //= 2 -def test_runtime_bucket_mapping(): - """Test that map_to_runtime_buckets correctly maps runtime sizes to tuning buckets. +def test_bucket_mapping(): + """Test that map_to_tuning_buckets correctly maps runtime sizes to tuning buckets. 
- This test demonstrates the distinction between map_to_tuning_buckets and map_to_runtime_buckets: - - map_to_tuning_buckets: used during tuning to store cache keys with raw bucket values - - map_to_runtime_buckets: used during runtime to map input sizes to tuning buckets + This test demonstrates the single mapper approach: + - During tuning: NO mapper is applied, raw bucket values are used as cache keys + - During runtime: map_to_tuning_buckets is applied to map buffer size to actual work size With sparsity=0.25, the buffer contains 25% actual work: - - Tuning stores buckets: 1, 2, 4, 8, 16, 32 + - Tuning stores buckets: 1, 2, 4, 8, 16, 32 as raw cache keys - Runtime buffer 4 -> maps to bucket int(4 * 0.25) = 1 - Runtime buffer 16 -> maps to bucket int(16 * 0.25) = 4 In MoE EP, the input buffer is allocated for worst-case but sparsely filled. - Using map_to_runtime_buckets allows us to map buffer size to actual work size. + Using map_to_tuning_buckets allows us to map buffer size to actual work size at runtime. """ w = torch.randn(64, 128) tuner = AutoTuner.get() tuner.clear_cache() # Sparsity indicates the fraction of buffer containing valid work - def bucket_mapper(x: int, sparsity: float) -> int: - return int(x * sparsity) + sparsity = 0.25 tuning_config = TuningConfig(dynamic_tensor_specs=(DynamicTensorSpec( input_idx=0, dim_idx=0, gen_tuning_buckets=get_power_of_2_num_tokens_buckets(M), - map_to_tuning_buckets=lambda x: bucket_mapper(x, sparsity=1.0), - map_to_runtime_buckets=lambda x: bucket_mapper(x, sparsity=0.25)), ), ) + map_to_tuning_buckets=lambda x: int(x * sparsity)), ), ) with autotune(): - tuner.choose_one("test_runtime_bucket_mapping", [GemmRunner()], - tuning_config, [torch.randn(1, 64), w]) + tuner.choose_one("test_bucket_mapping", [GemmRunner()], tuning_config, + [torch.randn(1, 64), w]) - # Verify cache entries use raw tuning bucket values, not deflated values + # Verify cache entries use raw tuning bucket values cache_entries = tuner.profiling_cache.get_specific_custom_op( - "test_runtime_bucket_mapping") + "test_bucket_mapping") # Extract the first dimension of the first input shape from each cache key assert len(cache_entries) == len(tuning_config.dynamic_tensor_specs[0].gen_tuning_buckets), \ - f"Expected len(({len(tuning_config.dynamic_tensor_specs[0].gen_tuning_buckets)}) cache entries, got len({(cache_entries)})" + f"Expected {len(tuning_config.dynamic_tensor_specs[0].gen_tuning_buckets)} cache entries, got {len(cache_entries)}" # Test runtime mapping: buffer size is mapped via map_to_runtime_buckets # to find the correct tuning bucket based on actual work size @@ -235,8 +236,8 @@ def bucket_mapper(x: int, sparsity: float) -> int: for buffer_size, valid_size, expected_tactic in test_cases: # Verify cache lookup succeeds with the mapped bucket x = torch.randn(buffer_size, 64) - runner, tactic = tuner.choose_one("test_runtime_bucket_mapping", - [GemmRunner()], tuning_config, [x, w]) + runner, tactic = tuner.choose_one("test_bucket_mapping", [GemmRunner()], + tuning_config, [x, w]) assert ( tactic == expected_tactic ), f"buffer size={buffer_size} -> valid work size={valid_size}, expected tactic {expected_tactic} but got {tactic}" From 2a4eb963d50f1b58ed457202cd6db9d3f5e6261e Mon Sep 17 00:00:00 2001 From: Anthony Chang <27950904+rosenrodt@users.noreply.github.com> Date: Fri, 2 Jan 2026 12:14:52 +0800 Subject: [PATCH 3/3] clip input shape to max tunable token count Signed-off-by: Anthony Chang <27950904+rosenrodt@users.noreply.github.com> --- 
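Note (illustration only, not part of the commit): a minimal sketch of the clamping this patch adds in _optimization_profiles, under the assumption that tune_max_num_tokens and the bucket set behave as in the hunk below; collect_opt_shapes and the example numbers are hypothetical and only stand in for the real code paths.

    from typing import Iterable, Optional

    def collect_opt_shapes(gen_tuning_buckets: Iterable[int],
                           current_val: int,
                           tune_max_num_tokens: Optional[int]) -> list:
        # Start from the configured tuning buckets.
        opt_shapes = set(gen_tuning_buckets)
        if tune_max_num_tokens is not None:
            # Clip the current input value so an oversized input reuses the
            # largest tunable bucket instead of creating an extra profile.
            opt_shapes.add(min(tune_max_num_tokens, current_val))
        else:
            opt_shapes.add(current_val)
        return sorted(opt_shapes)

    # Buckets up to 64, current batch of 100 tokens, tune_max_num_tokens=64:
    # the result stays [1, 2, 4, 8, 16, 32, 64]; without clipping it would also
    # contain 100 and generate a profile beyond the tunable range.
    assert collect_opt_shapes((1, 2, 4, 8, 16, 32, 64), 100, 64) == [1, 2, 4, 8, 16, 32, 64]
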
tensorrt_llm/_torch/autotuner.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/_torch/autotuner.py b/tensorrt_llm/_torch/autotuner.py index f2cc354b641..41caf648adf 100644 --- a/tensorrt_llm/_torch/autotuner.py +++ b/tensorrt_llm/_torch/autotuner.py @@ -1140,8 +1140,15 @@ def _optimization_profiles( opt_shapes = spec.gen_tuning_buckets # Add the current input value as one of the opt values opt_shapes = set(opt_shapes) - opt_shapes.add( - base_profile.shapes[spec.input_idx][spec.dim_idx].val) + if tuning_config.tune_max_num_tokens is not None: + opt_shapes.add( + min( + tuning_config.tune_max_num_tokens, + base_profile.shapes[spec.input_idx][spec.dim_idx].val, + )) + else: + opt_shapes.add( + base_profile.shapes[spec.input_idx][spec.dim_idx].val) opt_shapes = sorted(list(opt_shapes)) opt_shapes_max = tuple(opt_shapes[1:]) + (float('inf'), ) opt_shapes_max = {
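
Reviewer note (editorial illustration, not part of the series): a self-contained sketch of the bucket mapping that trtllm_gen_custom_ops.py ends up with after these patches. last_positive_power_of_2 is re-implemented here only for illustration, ep_size is passed explicitly rather than captured in the closure as in the patch, and the token counts are assumed values. At runtime the hidden_states token dimension is rounded down to a power of two, divided by ep_size (each EP rank handles only its share of the routed tokens, so the buffer is sparsely filled), and clamped to [1, MAX_PROFILE_BUCKET]; during tuning the raw bucket values are used as cache keys directly.

    MAX_PROFILE_BUCKET = 4096

    def last_positive_power_of_2(x: int) -> int:
        # Largest power of two <= x (assumes x >= 1); stand-in for the TRT-LLM helper.
        p = 1
        while p * 2 <= x:
            p *= 2
        return p

    def round_rule(x: int, ep_size: int) -> int:
        # Runtime mapping: round down to a power of two, account for EP sharding,
        # then clamp to the tunable range.
        value = last_positive_power_of_2(x) // ep_size
        return min(max(1, value), MAX_PROFILE_BUCKET)

    # A worst-case buffer sized for 1024 tokens on an ep_size=4 rank now looks up
    # the 256-token tuning bucket instead of the 1024-token one.
    assert round_rule(1024, ep_size=4) == 256
    assert round_rule(3, ep_size=8) == 1    # clamped to the minimum bucket
    assert round_rule(1 << 20, ep_size=1) == MAX_PROFILE_BUCKET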