
Commit 8966bc3

replace tp_allgather with tp_cp_allgather where apt
Signed-off-by: Balaram Buddharaju <[email protected]>
1 parent e66b3c5 commit 8966bc3

File tree

6 files changed, +24 -11 lines changed

tensorrt_llm/_torch/autotuner.py

Lines changed: 1 addition & 1 deletion
@@ -1547,7 +1547,7 @@ def _maybe_sync_cache_data(self, strategy: DistributedTuningStrategy,
     def _merge_cache_data(self, custom_op: str):
         cache_data = self.profiling_cache.get_specific_custom_op(custom_op)
         merged_cache_data = dict()
-        all_cache_data = self._dist.tp_allgather(obj=cache_data)
+        all_cache_data = self._dist.tp_cp_allgather(obj=cache_data)

         for data in all_cache_data:
             for key, value in data.items():

tensorrt_llm/_torch/distributed/communicator.py

Lines changed: 9 additions & 1 deletion
@@ -149,11 +149,19 @@ def tp_cp_allgather(self, obj):
         First gathers within CP group, then across TP groups, returning
         a flattened list with tp_size * cp_size entries.
         """
+        # Gather across CP dimension.
         if self.cp_size > 1:
             obj = self.cp_allgather(obj)
+        else:
+            obj = [obj]  # Wrap to match cp_allgather output format.
+
+        # Gather across TP dimension.
         if self.tp_size > 1:
             obj = self.tp_allgather(obj)
-        # Flatten: [[cp0, cp1], [cp0, cp1], ...] -> [tp0_cp0, tp0_cp1, tp1_cp0, ...].
+        else:
+            obj = [obj]  # Wrap to match tp_allgather output format.
+
+        # Flatten: [[cp0, cp1], [cp0, cp1], ...] -> [tp0_cp0, tp0_cp1, tp1_cp0, ...]
         return [entry for tp_group in obj for entry in tp_group]
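
The two-stage gather above can be exercised without a distributed backend. Below is a minimal sketch (plain Python, not TRT-LLM code) that stubs the per-group gathers with lists for assumed sizes tp_size = 2 and cp_size = 2, just to show the tp-major, cp-minor ordering of the flattened result.

    # Sketch only: simulate the shapes that cp_allgather/tp_allgather would
    # produce, then flatten exactly as tp_cp_allgather does.
    tp_size, cp_size = 2, 2

    # After the CP gather, obj is one list with cp_size entries.
    cp_gathered = [f"cp{c}" for c in range(cp_size)]

    # After the TP gather, obj is one CP-group list per TP rank.
    tp_gathered = [[f"tp{t}_{entry}" for entry in cp_gathered]
                   for t in range(tp_size)]

    # Flatten: [[cp0, cp1], [cp0, cp1]] -> [tp0_cp0, tp0_cp1, tp1_cp0, tp1_cp1].
    flattened = [entry for tp_group in tp_gathered for entry in tp_group]
    print(flattened)  # tp_size * cp_size = 4 entries, TP-major order.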

tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py

Lines changed: 2 additions & 2 deletions
@@ -246,7 +246,7 @@ def maybe_get_cuda_graph(
         can_run_cuda_graph = batch.can_run_cuda_graph
         batch_size = batch.batch_size
         if self.enabled and self.config.enable_attention_dp and self.config.mapping.tp_size > 1:
-            all_can_graph_batch = self.config.dist.tp_allgather(
+            all_can_graph_batch = self.config.dist.tp_cp_allgather(
                 [can_run_cuda_graph, batch_size])
             is_all_gen_only = all(all_can_graph[0]
                                   for all_can_graph in all_can_graph_batch)
@@ -408,7 +408,7 @@ def _get_padded_batch(self, batch: ScheduledRequests,
         new_batch_size = batch_size

         if self.enabled and self.config.enable_attention_dp and self.config.mapping.tp_size > 1:
-            graph_batch_size = self.config.dist.tp_allgather(
+            graph_batch_size = self.config.dist.tp_cp_allgather(
                 [can_run_cuda_graph, batch_size])
             all_can_graph = all(graph_batch[0]
                                 for graph_batch in graph_batch_size)
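
As a hypothetical illustration of how the gathered pairs are consumed here (the data below is made up; only the variable names mirror the diff): with tp_cp_allgather, each entry is one rank's [can_run_cuda_graph, batch_size] pair, so the CUDA-graph decision now spans tp_size * cp_size ranks instead of tp_size alone.

    # Sketch with fabricated values for a tp_size = 2, cp_size = 2 run.
    all_can_graph_batch = [
        [True, 8],   # (tp0, cp0)
        [True, 8],   # (tp0, cp1)
        [True, 4],   # (tp1, cp0)
        [False, 4],  # (tp1, cp1)
    ]

    # Every (tp, cp) rank must be able to run the CUDA graph.
    can_graph_everywhere = all(entry[0] for entry in all_can_graph_batch)
    print(can_graph_everywhere)  # False: one rank cannot run the graph.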

tensorrt_llm/_torch/pyexecutor/executor_request_queue.py

Lines changed: 2 additions & 0 deletions
@@ -370,6 +370,8 @@ def _fetch_new_requests_attention_dp(
         num_active_tokens = sum(
             [req.py_orig_prompt_len for req in activate_requests])

+        # Note: We use tp_allgather even for CP assuming that all CP ranks in a
+        # DP group have the same num_active_tokens and num_active_requests.
         responses_list = self.dist.tp_allgather(
             [len(activate_requests), num_active_tokens])

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 5 additions & 5 deletions
@@ -1369,7 +1369,7 @@ def get_padded_piecewise_tokens(tokens):
                         max(attn_all_rank_num_tokens)
                         <= max_captured_num_tokens)
                 all_ranks_can_run_piecewise_cuda_graph = list(
-                    self.dist.tp_allgather(can_run_piecewise_cuda_graph))
+                    self.dist.tp_cp_allgather(can_run_piecewise_cuda_graph))
                 if all(all_ranks_can_run_piecewise_cuda_graph):
                     padded_num_tokens = get_padded_piecewise_tokens(
                         max(attn_all_rank_num_tokens))
@@ -1536,7 +1536,7 @@ def _prepare_incremental_update_metadata(
             # Handle distributed spec metadata
             if enable_attention_dp:
                 sequence_lengths = spec_metadata.seq_lens
-                all_rank_num_tokens = self.dist.tp_allgather(
+                all_rank_num_tokens = self.dist.tp_cp_allgather(
                     [spec_metadata.num_tokens,
                      len(sequence_lengths)])
                 spec_metadata.all_rank_num_tokens = [
@@ -2691,7 +2691,7 @@ def previous_seq_slots_device():
             inputs['spec_metadata'] = spec_metadata

             if self.enable_attention_dp:
-                all_rank_num_tokens = self.dist.tp_allgather(
+                all_rank_num_tokens = self.dist.tp_cp_allgather(
                     [spec_metadata.num_tokens,
                      len(sequence_lengths)])

@@ -2856,7 +2856,7 @@ def _prepare_tp_inputs_no_cache(
             # support attention dp
             if self.enable_attention_dp:
                 if spec_metadata is not None:
-                    all_rank_num_tokens = self.dist.tp_allgather([
+                    all_rank_num_tokens = self.dist.tp_cp_allgather([
                         attn_metadata.num_tokens, spec_metadata.num_tokens,
                         len(sequence_lengths)
                     ])
@@ -2871,7 +2871,7 @@ def _prepare_tp_inputs_no_cache(
                     spec_metadata.all_rank_num_tokens = spec_all_rank_num_tokens
                     spec_metadata.all_rank_num_seqs = all_rank_num_seqs
                 else:
-                    all_rank_num_tokens = self.dist.tp_allgather(
+                    all_rank_num_tokens = self.dist.tp_cp_allgather(
                         attn_metadata.num_tokens)
                     attn_metadata.all_rank_num_tokens = all_rank_num_tokens
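
The code that unpacks all_rank_num_tokens is not part of this diff, so the following is only an assumed sketch of the common pattern: the gathered [num_tokens, num_seqs] pairs (one per (tp, cp) rank after this change) are split back into per-rank lists.

    # Assumed sketch with fabricated values; one pair per (tp, cp) rank.
    gathered = [[128, 4], [96, 3], [128, 4], [64, 2]]

    all_rank_num_tokens = [pair[0] for pair in gathered]  # [128, 96, 128, 64]
    all_rank_num_seqs = [pair[1] for pair in gathered]    # [4, 3, 4, 2]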

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 5 additions & 2 deletions
@@ -1207,7 +1207,8 @@ def wait_on_pp_send_handles(self, microbatch_id):
     def _can_queue(self, scheduled_batch):

         if self.enable_attention_dp:
-            tp_batch_sizes = self.dist.tp_allgather(scheduled_batch.batch_size)
+            tp_batch_sizes = self.dist.tp_cp_allgather(
+                scheduled_batch.batch_size)
             can_queue = 0 not in tp_batch_sizes
         else:
             can_queue = scheduled_batch.batch_size > 0
@@ -1552,7 +1553,7 @@ def _executor_loop_overlap(self):
                 if self.enable_attention_dp:
                     local_can_forward = self.executor_request_queue.num_fetch_requests + \
                         len(scheduled_batch.generation_requests) >= self.benchmark_req_queues_size
-                    all_can_forward = self.dist.tp_allgather(
+                    all_can_forward = self.dist.tp_cp_allgather(
                         local_can_forward)
                     if all(all_can_forward):
                         can_forward = True
@@ -1924,6 +1925,8 @@ def _balance_adp_requests(self, context_requests: list[LlmRequest],
         num_scheduled_tokens = sum(
             [len(req.get_tokens(0))
              for req in context_requests]) + num_scheduled_generation_requests
+        # Note: We use tp_allgather instead of tp_cp_allgather because we want to
+        # balance the requests across DP ranks, not CP ranks within those DP ranks.
         responses_list = self.dist.tp_allgather([
             num_scheduled_context_requests, num_scheduled_generation_requests,
             num_scheduled_tokens
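
A small illustrative contrast (stubbed numbers, not TRT-LLM code) of the two collectives used above, assuming tp_size = 2 and cp_size = 2 with attention DP: _balance_adp_requests keeps tp_allgather because it balances per DP rank, while readiness checks such as _can_queue use tp_cp_allgather because every (tp, cp) rank must report a non-empty batch.

    # tp_allgather: one entry per DP rank (balancing granularity).
    per_dp_rank_tokens = [12, 7]            # len == tp_size

    # tp_cp_allgather: one entry per (tp, cp) rank (readiness granularity).
    per_rank_batch_sizes = [3, 3, 0, 0]     # len == tp_size * cp_size

    can_queue = 0 not in per_rank_batch_sizes
    print(can_queue)  # False: some ranks have no scheduled requests.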
