Skip to content

Commit d1ab93c

Browse files
committed
revert tp_cp_allgather in multiple places
Signed-off-by: Balaram Buddharaju <[email protected]>
1 parent ed0d016 commit d1ab93c

File tree

4 files changed

+7
-19
lines changed

4 files changed

+7
-19
lines changed

cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -133,19 +133,8 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa
133133

134134
if (worldConfig.isTensorParallel())
135135
{
136-
if (worldConfig.isContextParallel())
137-
{
138-
// When CP is enabled, group ranks with same (ppRank, cpRank) to exclude both PP and CP.
139-
auto const tpGroupId = worldConfig.getContextParallelRank()
140-
+ worldConfig.getContextParallelism() * worldConfig.getPipelineParallelRank();
141-
mGroupTensorParaComm
142-
= std::make_shared<CacheTransceiverComm>(mGroupComm->split(tpGroupId, worldConfig.getRank()));
143-
}
144-
else
145-
{
146-
mGroupTensorParaComm = std::make_shared<CacheTransceiverComm>(
147-
mGroupComm->split(worldConfig.getPipelineParallelRank(), worldConfig.getTensorParallelRank()));
148-
}
136+
mGroupTensorParaComm = std::make_shared<CacheTransceiverComm>(
137+
mGroupComm->split(worldConfig.getPipelineParallelRank(), worldConfig.getTensorParallelRank()));
149138
}
150139
int kvFactor = 2;
151140
if (cacheManager->getCacheType() == kv_cache_manager::CacheType::kSELFKONLY)

tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ def maybe_get_cuda_graph(
247247
can_run_cuda_graph = batch.can_run_cuda_graph
248248
batch_size = batch.batch_size
249249
if self.enabled and self.config.enable_attention_dp and self.config.mapping.tp_size > 1:
250-
all_can_graph_batch = self.config.dist.tp_cp_allgather(
250+
all_can_graph_batch = self.config.dist.tp_allgather(
251251
[can_run_cuda_graph, batch_size])
252252
is_all_gen_only = all(all_can_graph[0]
253253
for all_can_graph in all_can_graph_batch)
@@ -409,7 +409,7 @@ def _get_padded_batch(self, batch: ScheduledRequests,
409409
new_batch_size = batch_size
410410

411411
if self.enabled and self.config.enable_attention_dp and self.config.mapping.tp_size > 1:
412-
graph_batch_size = self.config.dist.tp_cp_allgather(
412+
graph_batch_size = self.config.dist.tp_allgather(
413413
[can_run_cuda_graph, batch_size])
414414
all_can_graph = all(graph_batch[0]
415415
for graph_batch in graph_batch_size)

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1369,7 +1369,7 @@ def get_padded_piecewise_tokens(tokens):
13691369
max(attn_all_rank_num_tokens)
13701370
<= max_captured_num_tokens)
13711371
all_ranks_can_run_piecewise_cuda_graph = list(
1372-
self.dist.tp_cp_allgather(can_run_piecewise_cuda_graph))
1372+
self.dist.tp_allgather(can_run_piecewise_cuda_graph))
13731373
if all(all_ranks_can_run_piecewise_cuda_graph):
13741374
padded_num_tokens = get_padded_piecewise_tokens(
13751375
max(attn_all_rank_num_tokens))

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1227,8 +1227,7 @@ def wait_on_pp_send_handles(self, microbatch_id):
12271227
def _can_queue(self, scheduled_batch):
12281228

12291229
if self.enable_attention_dp:
1230-
tp_batch_sizes = self.dist.tp_cp_allgather(
1231-
scheduled_batch.batch_size)
1230+
tp_batch_sizes = self.dist.tp_allgather(scheduled_batch.batch_size)
12321231
can_queue = 0 not in tp_batch_sizes
12331232
else:
12341233
can_queue = scheduled_batch.batch_size > 0
@@ -1573,7 +1572,7 @@ def _executor_loop_overlap(self):
15731572
if self.enable_attention_dp:
15741573
local_can_forward = self.executor_request_queue.num_fetch_requests + \
15751574
len(scheduled_batch.generation_requests) >= self.benchmark_req_queues_size
1576-
all_can_forward = self.dist.tp_cp_allgather(
1575+
all_can_forward = self.dist.tp_allgather(
15771576
local_can_forward)
15781577
if all(all_can_forward):
15791578
can_forward = True

0 commit comments

Comments (0)