Skip to content

Commit 12bf10c

Browse files
hyukn authored and videodanchik committed
[TRTLLM-9615][feat] Support synchronization through PP ranks in the distributed tuning system (NVIDIA#10011)
Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com>
Signed-off-by: Daniil Kulko <kulkodaniil@gmail.com>
1 parent 88c4955 commit 12bf10c

File tree

3 files changed

+64
-6
lines changed

3 files changed

+64
-6
lines changed

tensorrt_llm/_torch/autotuner.py

Lines changed: 45 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@
2222
from tensorrt_llm.logger import logger
2323
from tensorrt_llm.mapping import Mapping
2424

25+
# Unique tag to avoid collisions with other comms
26+
PP_COMM_TAG_AUTOTUNING = 30000
27+
2528

2629
class DistributedTuningStrategy(enum.Enum):
2730
"""
@@ -358,7 +361,7 @@ class AutoTunerProfilingCache:
358361
"""
359362

360363
def __init__(self):
361-
self.cache = {}
364+
self.cache: Dict[Tuple, Tuple] = dict()
362365

363366
# Cache metadata for local storage and validation
364367
self.lib_version = tensorrt_llm.__version__
@@ -430,7 +433,7 @@ def get_cache_key(
430433
),
431434
)
432435

433-
def merge_cache_data(self, cache_data: Dict[str, Any]):
436+
def merge_cache_data(self, cache_data: Dict[Tuple, Tuple]):
434437
self.cache.update(cache_data)
435438

436439
def get_specific_custom_op(self, custom_op: str) -> Dict[Tuple, Tuple]:
@@ -615,7 +618,10 @@ def __init__(self, warmup=2, repeat=10, stream_delay_micro_secs=1000):
615618
self._last_capture: Optional['AutoTuner.TacticsCapture'] = None
616619

617620
# Dsitributed tuning state
621+
self._map_op_to_distributed_strategy: Dict[
622+
str, DistributedTuningStrategy] = {}
618623
self._dist: Optional[Distributed] = None
624+
self._has_received_cache: bool = False
619625
self.mapping: Mapping = Mapping()
620626

621627
@classmethod
@@ -624,9 +630,6 @@ def get(cls):
624630
cls._instance = AutoTuner()
625631
return cls._instance
626632

627-
def set_mapping(self, mapping: Mapping = None):
628-
self.mapping = mapping
629-
630633
class TacticsCapture:
631634
"""Object returned by capture() that can be iterated to get all tactic combinations.
632635
@@ -797,10 +800,18 @@ def choose_one(
797800
if self.is_tuning_mode and is_cache_hit:
798801
return (runners[best_runner_id], best_tactic)
799802

803+
# PP rank does not have cache hit, so we try to receive the cache from the previous rank
804+
# Notice that only under tuning mode, pp_recv will be called
805+
self.cache_pp_recv()
806+
800807
assert len(runners) > 0, "At least one runner is required"
801808
assert all([isinstance(r, TunableRunner) for r in runners]), \
802809
"All Given runners must be subclass of TunableRunner"
803810

811+
# Record the distributed tuning strategy for the custom_op
812+
self._map_op_to_distributed_strategy[
813+
custom_op] = tuning_config.distributed_tuning_strategy
814+
804815
tuning_start_time = time.perf_counter()
805816
profiles = self._optimization_profiles(tuning_config, inputs)
806817

@@ -1507,3 +1518,32 @@ def _should_current_rank_tune(self,
15071518
f"[AutoTuner] Unknown distributed tuning strategy: {strategy}, falling back to independent"
15081519
)
15091520
return True
1521+
1522+
def cache_pp_recv(self):
1523+
if self.mapping.has_pp() and not self.mapping.is_first_pp_rank(
1524+
) and not self._has_received_cache:
1525+
self._debug_logger(
1526+
f"[AutoTuner] Receiving cache data from previous pp rank {self.mapping.prev_pp_rank()}"
1527+
)
1528+
profiling_cache = self._dist.recv_object(
1529+
src=self.mapping.prev_pp_rank(),
1530+
tag=PP_COMM_TAG_AUTOTUNING,
1531+
)
1532+
# Guarantee that only receive cache once during a single warm-up run
1533+
# Notice that this flag should be reset after each warm-up run because isend is always called
1534+
self._has_received_cache = True
1535+
self.profiling_cache.merge_cache_data(profiling_cache)
1536+
1537+
def cache_pp_send(self):
1538+
if self.mapping.has_pp() and not self.mapping.is_last_pp_rank():
1539+
self._debug_logger(
1540+
f"[AutoTuner] Sending cache data to next pp rank {self.mapping.next_pp_rank()}"
1541+
)
1542+
self._dist.isend_object(
1543+
self.profiling_cache.cache,
1544+
dest=self.mapping.next_pp_rank(),
1545+
tag=PP_COMM_TAG_AUTOTUNING,
1546+
).wait()
1547+
1548+
def clean_pp_flag(self):
1549+
self._has_received_cache = False

tensorrt_llm/_torch/custom_ops/torch_custom_ops.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,14 @@ def _(
693693

694694
class NVFP4GemmUnifiedRunner(TunableRunner):
695695
runner_dict = dict()
696+
tuning_config = TuningConfig(
697+
dynamic_tensor_specs=(DynamicTensorSpec(
698+
0, 0, get_last_power_of_2_num_tokens_buckets,
699+
last_positive_power_of_2), ),
700+
constraint_specs=(ConstraintSpec(2, 0, fp4_scale_infer_shape), ),
701+
# nested tuning should always be independent
702+
distributed_tuning_strategy=DistributedTuningStrategy.INDEPENDENT,
703+
)
696704

697705
def __init__(self, to_userbuffers: bool, output_dtype: torch.dtype,
698706
allowed_backends: List[str]):
@@ -943,7 +951,7 @@ def nvfp4_gemm(
943951
_, best_tactic = tuner.choose_one(
944952
"trtllm::nvfp4_gemm::gemm",
945953
[runner],
946-
FP4GemmRunner.
954+
NVFP4GemmUnifiedRunner.
947955
tuning_config, # All runners use the same tuning_config
948956
[act_fp4, weight, act_sf, weight_scale, alpha],
949957
)

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -667,9 +667,19 @@ def _run_autotuner_warmup(self, resource_manager: ResourceManager):
667667
if self.is_draft_model and isinstance(
668668
spec_resource_manager, Eagle3ResourceManager):
669669
spec_resource_manager.is_first_draft = True
670+
670671
self.forward(batch,
671672
new_tensors_device=None,
672673
resource_manager=resource_manager)
674+
675+
# pp_recv in AutoTuner choose_one will never be called if there is no tuning op during the forward pass.
676+
# So we need to make an extra call to consume the previous rank's pp_send to guarantee that the previous rank's pp_send is released.
677+
AutoTuner.get().cache_pp_recv()
678+
# Send the cache after the tuning process to the next PP rank
679+
AutoTuner.get().cache_pp_send()
680+
# Clean the pp flag to avoid deadlock with synchronous send/recv
681+
AutoTuner.get().clean_pp_flag()
682+
673683
torch.cuda.synchronize()
674684

675685
logger.info(

0 commit comments

Comments
 (0)