
Commit 3bf41e7

[TRTLLM-9465][fix] Swap TP-CP grouping order
Signed-off-by: Balaram Buddharaju <[email protected]>
1 parent 4a1b742 commit 3bf41e7

10 files changed: +76 / -58 lines changed

cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp

Lines changed: 2 additions & 1 deletion

```diff
@@ -154,7 +154,8 @@ bool CacheFormatter::needSendCache(
         return true;
     }
 
-    int selfTpRank = selfIdx % selfConfig.getParallelConfig().mTensorParallelism;
+    int selfCpSize = selfConfig.getParallelConfig().mContextParallelism;
+    int selfTpRank = (selfIdx % (selfConfig.getParallelConfig().mTensorParallelism * selfCpSize)) / selfCpSize;
     int selfTpRankInDpGroup = selfTpRank;
     if (selfConfig.getParallelConfig().mEnableAttentionDP)
     {
```
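
For intuition, a minimal Python sketch (not part of the commit) of the new decomposition: with cp now the innermost, fastest-varying dimension, the TP rank is recovered by stripping the PP component and dividing out `cp_size`, rather than taking `rank % tp_size`:

```python
def tp_rank_from_flat(rank: int, tp_size: int, cp_size: int) -> int:
    # Layout: rank = pp * (tp_size * cp_size) + tp * cp_size + cp,
    # so strip the pp component, then divide out the innermost cp dimension.
    return (rank % (tp_size * cp_size)) // cp_size

# tp_size=2, cp_size=2: cp peers {0,1} and {2,3} map to tp ranks 0 and 1,
# and the pattern repeats for the next pp stage.
assert [tp_rank_from_flat(r, 2, 2) for r in range(8)] == [0, 0, 1, 1, 0, 0, 1, 1]
```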

cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp

Lines changed: 2 additions & 1 deletion

```diff
@@ -60,7 +60,8 @@ std::vector<size_t> MLACacheFormatter::pickRecvConnections(
 bool MLACacheFormatter::needSendCache(
     CacheState const& selfConfig, CacheState const& destConfig, runtime::SizeType32 selfIdx)
 {
-    int selfTpRank = selfIdx % selfConfig.getParallelConfig().mTensorParallelism;
+    int selfCpSize = selfConfig.getParallelConfig().mContextParallelism;
+    int selfTpRank = (selfIdx % (selfConfig.getParallelConfig().mTensorParallelism * selfCpSize)) / selfCpSize;
 
     int destTPNumInDPGroup = destConfig.getParallelConfig().mEnableAttentionDP
         ? destConfig.getParallelConfig().mTensorParallelism / destConfig.getParallelConfig().mDPsize
```

cpp/tensorrt_llm/executor/cache_transmission/cacheSplitConcat.cu

Lines changed: 5 additions & 3 deletions

```diff
@@ -107,9 +107,9 @@ TargetRanksInfo TargetRanksInfoForDP(
     auto const peerCPNum = peerParConfig.mContextParallelism;
     auto const selfCPNum = selfParConfig.mContextParallelism;
 
-    auto const selfTPRank = selfRank % selfTPNum;
+    auto const selfCPRank = selfRank % selfCPNum;
+    auto const selfTPRank = (selfRank % (selfTPNum * selfCPNum)) / selfCPNum;
     auto const selfPPRank = selfRank / (selfTPNum * selfCPNum);
-    auto const selfCPRank = (selfRank % (selfTPNum * selfCPNum)) / selfTPNum;
 
     int peerPPRankStart = 0;
     int mDomainPPSize = 1;
@@ -211,7 +211,9 @@ TargetRanksInfo TargetRanksInfoForDP(
         {
             for (int k = peerPPRankStart; k < peerPPRankEnd; k++)
             {
-                int irank = (k * peerTPNum * peerCPNum) + (j * peerTPNum) + i;
+                // Rank formula: ppRank * (tpNum * cpNum) + tpRank * cpNum + cpRank
+                // where i=tpRank, j=cpRank, k=ppRank
+                int irank = (k * peerTPNum * peerCPNum) + (i * peerCPNum) + j;
                 retRanks.push_back(irank);
             }
         }
```
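
As a sanity check (a sketch, not from the commit), the rank formula and the per-dimension decomposition above are inverses of each other under the new layout:

```python
def flat_rank(pp: int, tp: int, cp: int, tp_num: int, cp_num: int) -> int:
    # Rank formula from the diff: ppRank * (tpNum * cpNum) + tpRank * cpNum + cpRank
    return pp * (tp_num * cp_num) + tp * cp_num + cp

def decompose(rank: int, tp_num: int, cp_num: int):
    # Inverse: cp is innermost, pp is outermost
    cp = rank % cp_num
    tp = (rank % (tp_num * cp_num)) // cp_num
    pp = rank // (tp_num * cp_num)
    return pp, tp, cp

# Round-trip over a pp=2, tp=2, cp=2 layout (8 ranks)
for r in range(8):
    assert flat_rank(*decompose(r, 2, 2), 2, 2) == r
```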

tensorrt_llm/_torch/device_mesh.py

Lines changed: 6 additions & 4 deletions

```diff
@@ -118,15 +118,17 @@ def build_mesh(self):
                 "DeviceMesh creation requested but torch.distributed process group "
                 "has not been initialised.")
 
-        dims = ["cp", "pp"]
-        shape = [self.cp_size, self.pp_size]
+        # Dimensions go from slowest-varying (outermost) to fastest-varying (innermost)
+        # Layout: pp is outermost, then tp, then cp is innermost (consecutive)
+        dims = ["pp", "tp"]
+        shape = [self.pp_size, self.tp_size]
 
         if self.moe_ep_size > 1:
             dims += ["moe_tp", "moe_ep"]
             shape += [self.moe_tp_size, self.moe_ep_size]
         else:
-            dims += ["tp"]
-            shape += [self.tp_size]
+            dims += ["cp"]
+            shape += [self.cp_size]
 
         cls.device_mesh = init_device_mesh(
             "cuda",
```

tensorrt_llm/mapping.py

Lines changed: 34 additions & 36 deletions

```diff
@@ -292,18 +292,16 @@ def has_cp(self):
         return self.cp_size > 1
 
     def prev_cp_rank(self):
-        p = self.rank - self.tp_size
-        if p // (self.tp_size * self.cp_size) < self.rank // (self.tp_size *
-                                                              self.cp_size):
-            return p + self.tp_size * self.cp_size
-        return p
+        # cp ranks are consecutive, so prev is rank - 1 with wraparound within cp group
+        if self.cp_rank == 0:
+            return self.rank + self.cp_size - 1
+        return self.rank - 1
 
     def next_cp_rank(self):
-        p = self.rank + self.tp_size
-        if p // (self.tp_size * self.cp_size) > self.rank // (self.tp_size *
-                                                              self.cp_size):
-            return p - self.tp_size * self.cp_size
-        return p
+        # cp ranks are consecutive, so next is rank + 1 with wraparound within cp group
+        if self.cp_rank == self.cp_size - 1:
+            return self.rank - self.cp_size + 1
+        return self.rank + 1
 
     def has_moe_cluster(self):
         return self.moe_cluster_size > 1
@@ -378,17 +376,17 @@ class Mapping(MappingBase):
 
     A node with 8 GPUs, tp_size = 4, cp_size = 2, pp_size = 1
 
-    2 tp groups:
+    4 cp groups:
 
-    - [0, 1, 2, 3]
-    - [4, 5, 6, 7]
+    - [0, 1]
+    - [2, 3]
+    - [4, 5]
+    - [6, 7]
 
-    4 cp groups:
+    2 tp groups:
 
-    - [0, 4]
-    - [1, 5]
-    - [2, 6]
-    - [3, 7]
+    - [0, 2, 4, 6]
+    - [1, 3, 5, 7]
 
     A node with 8 GPUs, moe_tp_size = 2, moe_ep_size = 4
 
@@ -437,23 +435,23 @@ class Mapping(MappingBase):
 
     2 nodes with 8 GPUs, tp_size 2, pp_size 2, cp_size 2
 
-    4 tp groups:
+    4 cp groups:
     - [0, 1]
     - [2, 3]
     - [4, 5]
     - [6, 7]
 
+    4 tp groups:
+    - [0, 2]
+    - [1, 3]
+    - [4, 6]
+    - [5, 7]
+
     4 pp groups:
     - [0, 4]
     - [1, 5]
     - [2, 6]
     - [3, 7]
-
-    4 cp groups:
-    - [0, 2]
-    - [1, 3]
-    - [4, 6]
-    - [5, 7]
     """
 
     def __new__(cls, *args, **kwargs):
@@ -551,23 +549,23 @@ def __init__(self, *args, **kwargs):
 
     @property
     def tp_rank(self) -> int:
-        return self.rank % self.tp_size
+        return self.rank % (self.tp_size * self.cp_size) // self.cp_size
 
     @property
     def pp_rank(self) -> int:
         return self.rank // (self.tp_size * self.cp_size)
 
     @property
    def cp_rank(self) -> int:
-        return self.rank % (self.tp_size * self.cp_size) // self.tp_size
+        return self.rank % self.cp_size
 
     @property
     def tp_group(self) -> List[int]:
         return self.tp_groups[self.pp_rank * self.cp_size + self.cp_rank]
 
     @property
     def pp_group(self) -> List[int]:
-        return self.pp_groups[self.cp_rank * self.tp_size + self.tp_rank]
+        return self.pp_groups[self.tp_rank * self.cp_size + self.cp_rank]
 
     @property
     def cp_group(self) -> List[int]:
@@ -598,20 +596,20 @@ def _init_parallel_groups(self):
             ranks = range(i, self.world_size, self.tp_size * self.cp_size)
             self.pp_groups.append(list(ranks))
 
-        # init cp group
+        # init cp group (consecutive ranks within each tp slice)
         for i in range(self.pp_size):
             for j in range(self.tp_size):
-                ranks = range(i * self.tp_size * self.cp_size + j,
-                              (i + 1) * self.tp_size * self.cp_size + j,
-                              self.tp_size)
+                ranks = range(
+                    i * self.tp_size * self.cp_size + j * self.cp_size,
+                    i * self.tp_size * self.cp_size + (j + 1) * self.cp_size)
                 self.cp_groups.append(list(ranks))
 
-        # init tp group
+        # init tp group (interleaved ranks with stride of cp_size)
         for i in range(self.pp_size):
             for j in range(self.cp_size):
-                ranks = range(
-                    i * self.tp_size * self.cp_size + j * self.tp_size,
-                    i * self.tp_size * self.cp_size + (j + 1) * self.tp_size)
+                ranks = range(i * self.tp_size * self.cp_size + j,
+                              (i + 1) * self.tp_size * self.cp_size + j,
+                              self.cp_size)
                 self.tp_groups.append(list(ranks))
 
         # init moe tp group
```
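
The docstring examples follow directly from this construction. A standalone sketch (hypothetical `build_groups` helper, not the actual Mapping API) reproduces them for tp_size=2, pp_size=2, cp_size=2:

```python
def build_groups(tp_size: int, pp_size: int, cp_size: int):
    cp_groups, tp_groups = [], []
    for i in range(pp_size):
        for j in range(tp_size):  # cp groups: consecutive ranks
            base = i * tp_size * cp_size + j * cp_size
            cp_groups.append(list(range(base, base + cp_size)))
        for j in range(cp_size):  # tp groups: stride of cp_size
            start = i * tp_size * cp_size + j
            tp_groups.append(
                list(range(start, (i + 1) * tp_size * cp_size, cp_size)))
    return cp_groups, tp_groups

cp_groups, tp_groups = build_groups(2, 2, 2)
assert cp_groups == [[0, 1], [2, 3], [4, 5], [6, 7]]
assert tp_groups == [[0, 2], [1, 3], [4, 6], [5, 7]]

# Neighbor with wraparound inside the cp group (cp ranks are consecutive):
def next_cp_rank(rank: int, cp_size: int) -> int:
    return rank - cp_size + 1 if rank % cp_size == cp_size - 1 else rank + 1

assert next_cp_rank(3, 2) == 2  # rank 3 wraps back to 2 within cp group [2, 3]
```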

tensorrt_llm/models/modeling_utils.py

Lines changed: 3 additions & 2 deletions

```diff
@@ -740,10 +740,11 @@ def from_checkpoint(
 
         rank = config.mapping.rank
         if config.mapping.cp_size > 1:
-            # tp_cp_pp rank -> tp_pp rank: because different cp ranks share the same ckpt
+            # cp_tp_pp rank -> tp_pp rank: because different cp ranks share the same ckpt
             tp_size = config.mapping.tp_size
             cp_size = config.mapping.cp_size
-            rank = rank % tp_size + rank // (tp_size * cp_size) * tp_size
+            rank = (rank % (tp_size * cp_size)) // cp_size + rank // (
+                tp_size * cp_size) * tp_size
         weights_path = os.path.join(ckpt_dir, f'rank{rank}.safetensors')
 
         assert os.path.isfile(weights_path)
```
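
A sketch of this remapping (hypothetical `ckpt_rank` helper, not from the commit): cp peers within a pp stage collapse onto the same checkpoint shard, since they load identical weights:

```python
def ckpt_rank(rank: int, tp_size: int, cp_size: int) -> int:
    # cp_tp_pp rank -> tp_pp rank: cp peers share the same checkpoint shard
    tp_rank = (rank % (tp_size * cp_size)) // cp_size
    pp_rank = rank // (tp_size * cp_size)
    return tp_rank + pp_rank * tp_size

# tp_size=2, cp_size=2: ranks 0,1 -> ckpt 0; 2,3 -> ckpt 1; 4,5 -> ckpt 2; 6,7 -> ckpt 3
assert [ckpt_rank(r, 2, 2) for r in range(8)] == [0, 0, 1, 1, 2, 2, 3, 3]
```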

tests/integration/defs/accuracy/test_disaggregated_serving.py

Lines changed: 5 additions & 2 deletions

```diff
@@ -872,8 +872,9 @@ def test_auto_dtype(self, overlap_scheduler, mtp_nextn):
         task.evaluate(llm)
 
     @pytest.mark.skip_less_device(8)
-    @pytest.mark.parametrize("gen_pp,gen_tp,gen_cp", [(1, 2, 2), (2, 1, 2)],
-                             ids=["pp1tp2cp2", "pp2tp1cp2"])
+    @pytest.mark.parametrize("gen_pp,gen_tp,gen_cp", [(1, 1, 4), (1, 2, 2),
+                                                      (2, 1, 2)],
+                             ids=["pp1tp1cp4", "pp1tp2cp2", "pp2tp1cp2"])
     @pytest.mark.parametrize("cuda_graph_config", [
         None,
         {
@@ -912,6 +913,7 @@ def test_auto_dtype_with_helix(self, comms_medium, cuda_graph_config,
                 "backend": "UCX",
                 "max_tokens_in_buffer": 8192,
             },
+            # "print_iter_log": True,
         }
         gen_server_config = {
             "tensor_parallel_size": gen_tp,
@@ -931,6 +933,7 @@ def test_auto_dtype_with_helix(self, comms_medium, cuda_graph_config,
                 "backend": "UCX",
                 "max_tokens_in_buffer": 8192,
             },
+            # "print_iter_log": True,
         }
         disaggregated_server_config = {
             "hostname": "localhost",
```

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 6 additions & 0 deletions

```diff
@@ -540,6 +540,12 @@ accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2]
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=0]
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=2]
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp1tp2cp2]
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:without_padding-pp1tp2cp2]
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp2cp2]
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1tp2cp2]
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:without_padding-pp1tp2cp2]
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp2cp2]
 accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False]
 accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True]
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp2tp1cp2]
```

tests/integration/test_lists/test-db/l0_dgx_b200.yml

Lines changed: 4 additions & 0 deletions

```diff
@@ -67,6 +67,8 @@ l0_dgx_b200:
       orchestrator: mpi
   tests:
   - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp2tp1cp2] TIMEOUT (60)
+  - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp2cp2] TIMEOUT (60)
+  - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp1cp4] TIMEOUT (60)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput] TIMEOUT (60)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] TIMEOUT (60)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_bs8_mtp] TIMEOUT (60)
@@ -94,6 +96,8 @@ l0_dgx_b200:
       orchestrator: mpi
   tests:
   - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp2tp1cp2] TIMEOUT (60)
+  - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp2cp2] TIMEOUT (60)
+  - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp1cp4] TIMEOUT (60)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_corner_case TIMEOUT (60)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_fp8kv] TIMEOUT (60)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency] TIMEOUT (60)
```

tests/unittest/others/test_mapping.py

Lines changed: 9 additions & 9 deletions

```diff
@@ -57,27 +57,27 @@ def test_mapping(self):
         self.assertEqual(len(m.tp_groups), 4)
         self.assertEqual(len(m.pp_groups), 4)
         self.assertEqual(len(m.cp_groups), 4)
-        self.assertEqual(m.tp_group, [2, 3])
+        self.assertEqual(m.tp_group, [1, 3])
         self.assertEqual(m.pp_group, [3, 7])
-        self.assertEqual(m.cp_group, [1, 3])
+        self.assertEqual(m.cp_group, [2, 3])
         self.assertTrue(m.is_first_pp_rank())
         self.assertFalse(m.is_last_pp_rank())
         self.assertFalse(m.is_first_cp_rank())
         self.assertTrue(m.is_last_cp_rank())
         self.assertEqual(m.prev_pp_rank(), 7)
         self.assertEqual(m.next_pp_rank(), 7)
-        self.assertEqual(m.prev_cp_rank(), 1)
-        self.assertEqual(m.next_cp_rank(), 1)
+        self.assertEqual(m.prev_cp_rank(), 2)
+        self.assertEqual(m.next_cp_rank(), 2)
 
         m = Mapping(world_size=16, rank=9, tp_size=2, pp_size=2, cp_size=4)
-        self.assertEqual(m.tp_group, [8, 9])
+        self.assertEqual(m.tp_group, [9, 13])
         self.assertEqual(m.pp_group, [1, 9])
-        self.assertEqual(m.cp_group, [9, 11, 13, 15])
+        self.assertEqual(m.cp_group, [8, 9, 10, 11])
         self.assertFalse(m.is_first_pp_rank())
         self.assertTrue(m.is_last_pp_rank())
-        self.assertTrue(m.is_first_cp_rank())
+        self.assertFalse(m.is_first_cp_rank())
         self.assertFalse(m.is_last_cp_rank())
         self.assertEqual(m.prev_pp_rank(), 1)
         self.assertEqual(m.next_pp_rank(), 1)
-        self.assertEqual(m.prev_cp_rank(), 15)
-        self.assertEqual(m.next_cp_rank(), 11)
+        self.assertEqual(m.prev_cp_rank(), 8)
+        self.assertEqual(m.next_cp_rank(), 10)
```
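
A quick cross-check of the updated expectations for rank 9 (plain Python mirroring the new formulas; not part of the test):

```python
rank, tp_size, pp_size, cp_size = 9, 2, 2, 4
cp_rank = rank % cp_size                         # 1
tp_rank = rank % (tp_size * cp_size) // cp_size  # 0
pp_rank = rank // (tp_size * cp_size)            # 1
cp_group = [pp_rank * tp_size * cp_size + tp_rank * cp_size + c
            for c in range(cp_size)]
tp_group = [pp_rank * tp_size * cp_size + t * cp_size + cp_rank
            for t in range(tp_size)]
assert cp_group == [8, 9, 10, 11] and tp_group == [9, 13]
```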
