Skip to content

Commit ebae79d

Browse files
committed
[TRTLLM-9615][feat] Support PP in the distributed tuning system
Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com>
1 parent 5d71f66 commit ebae79d

File tree

3 files changed

+46
-4
lines changed

3 files changed

+46
-4
lines changed

tensorrt_llm/_torch/autotuner.py

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ class TuningConfig:
125125
inputs_pre_hook: Callable = None
126126
use_cold_l2_cache: bool = False
127127
use_cuda_graph: bool = True
128-
distributed_tuning_strategy: DistributedTuningStrategy = DistributedTuningStrategy.INDEPENDENT
128+
distributed_tuning_strategy: DistributedTuningStrategy = DistributedTuningStrategy.BROADCAST
129129

130130

131131
@dataclass(unsafe_hash=True)
@@ -358,7 +358,7 @@ class AutoTunerProfilingCache:
358358
"""
359359

360360
def __init__(self):
361-
self.cache = {}
361+
self.cache: Dict[Tuple, Tuple] = dict()
362362

363363
# Cache metadata for local storage and validation
364364
self.lib_version = tensorrt_llm.__version__
@@ -430,7 +430,7 @@ def get_cache_key(
430430
),
431431
)
432432

433-
def merge_cache_data(self, cache_data: Dict[str, Any]):
433+
def merge_cache_data(self, cache_data: Dict[Tuple, Tuple]):
434434
self.cache.update(cache_data)
435435

436436
def get_specific_custom_op(self, custom_op: str) -> Dict[Tuple, Tuple]:
@@ -615,6 +615,8 @@ def __init__(self, warmup=2, repeat=10, stream_delay_micro_secs=1000):
615615
self._last_capture: Optional['AutoTuner.TacticsCapture'] = None
616616

617617
# Distributed tuning state
618+
self._map_op_to_distributed_strategy: Dict[
619+
str, DistributedTuningStrategy] = {}
618620
self._dist: Optional[Distributed] = None
619621
self.mapping: Mapping = Mapping()
620622

@@ -801,6 +803,10 @@ def choose_one(
801803
assert all([isinstance(r, TunableRunner) for r in runners]), \
802804
"All Given runners must be subclass of TunableRunner"
803805

806+
# Record the distributed tuning strategy for the custom_op
807+
self._map_op_to_distributed_strategy[
808+
custom_op] = tuning_config.distributed_tuning_strategy
809+
804810
tuning_start_time = time.perf_counter()
805811
profiles = self._optimization_profiles(tuning_config, inputs)
806812

@@ -1510,3 +1516,24 @@ def _should_current_rank_tune(self,
15101516
f"[AutoTuner] Unknown distributed tuning strategy: {strategy}, falling back to independent"
15111517
)
15121518
return True
1519+
1520+
def cache_sync_pp_recv(self):
1521+
if self.mapping.has_pp() and not self.mapping.is_first_pp_rank:
1522+
profiling_cache = self._dist.recv_object(
1523+
self.mapping.prev_pp_rank())
1524+
self.profiling_cache.merge_cache_data(profiling_cache)
1525+
1526+
def cache_sync_pp_send(self):
1527+
# Ops with the INDEPENDENT strategy shall not be sent
1528+
if self.mapping.has_pp() and not self.mapping.is_last_pp_rank:
1529+
dependent_custom_ops = [
1530+
op for op, strategy in
1531+
self._map_op_to_distributed_strategy.items()
1532+
if strategy != DistributedTuningStrategy.INDEPENDENT
1533+
]
1534+
dependent_custom_ops_cache = dict()
1535+
for op in dependent_custom_ops:
1536+
dependent_custom_ops_cache.update(
1537+
self.profiling_cache.get_specific_custom_op(op))
1538+
self._dist.send_object(dependent_custom_ops_cache,
1539+
self.mapping.next_pp_rank())

tensorrt_llm/_torch/custom_ops/torch_custom_ops.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,14 @@ def _(
693693

694694
class NVFP4GemmUnifiedRunner(TunableRunner):
695695
runner_dict = dict()
696+
tuning_config = TuningConfig(
697+
dynamic_tensor_specs=(DynamicTensorSpec(
698+
0, 0, get_last_power_of_2_num_tokens_buckets,
699+
last_positive_power_of_2), ),
700+
constraint_specs=(ConstraintSpec(2, 0, fp4_scale_infer_shape), ),
701+
# nested tuning should always be independent
702+
distributed_tuning_strategy=DistributedTuningStrategy.INDEPENDENT,
703+
)
696704

697705
def __init__(self, to_userbuffers: bool, output_dtype: torch.dtype,
698706
allowed_backends: List[str]):
@@ -943,7 +951,7 @@ def nvfp4_gemm(
943951
_, best_tactic = tuner.choose_one(
944952
"trtllm::nvfp4_gemm::gemm",
945953
[runner],
946-
FP4GemmRunner.
954+
NVFP4GemmUnifiedRunner.
947955
tuning_config, # All runners use the same tuning_config
948956
[act_fp4, weight, act_sf, weight_scale, alpha],
949957
)

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -649,9 +649,16 @@ def _run_autotuner_warmup(self, resource_manager: ResourceManager):
649649
if self.is_draft_model and isinstance(
650650
spec_resource_manager, Eagle3ResourceManager):
651651
spec_resource_manager.is_first_draft = True
652+
# Sync the cache before the forward pass on all but the first pp rank
653+
AutoTuner.get().cache_sync_pp_recv()
654+
652655
self.forward(batch,
653656
new_tensors_device=None,
654657
resource_manager=resource_manager)
658+
659+
# Sync the cache after the forward pass on all but the last pp rank
660+
AutoTuner.get().cache_sync_pp_send()
661+
655662
torch.cuda.synchronize()
656663

657664
logger.info(

0 commit comments

Comments
 (0)