cleaner order of iteration

brb-nv · brb-nv · commit c229c874197c · 2025-12-31T01:11:33.000Z
Signed-off-by: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/executor/cache_transmission/cacheSplitConcat.cu b/cpp/tensorrt_llm/executor/cache_transmission/cacheSplitConcat.cu
@@ -205,15 +205,14 @@ TargetRanksInfo TargetRanksInfoForDP(
     }
 
     std::vector<int> retRanks;
-    for (int i = peerTPRankStart; i < peerTPRankEnd; i++)
+    for (int i = peerCPRankStart; i < peerCPRankEnd; i++)
     {
-        for (int j = peerCPRankStart; j < peerCPRankEnd; j++)
+        for (int j = peerTPRankStart; j < peerTPRankEnd; j++)
         {
             for (int k = peerPPRankStart; k < peerPPRankEnd; k++)
             {
-                // Rank formula: ppRank * (tpNum * cpNum) + tpRank * cpNum + cpRank
-                // where i=tpRank, j=cpRank, k=ppRank
-                int irank = (k * peerTPNum * peerCPNum) + (i * peerCPNum) + j;
+                // Rank formula: ppRank * (tpNum * cpNum) + tpRank * cpNum + cpRank.
+                int irank = (k * peerTPNum * peerCPNum) + (j * peerCPNum) + i;
                 retRanks.push_back(irank);
             }
         }
diff --git a/tensorrt_llm/_torch/device_mesh.py b/tensorrt_llm/_torch/device_mesh.py
@@ -118,8 +118,8 @@ def build_mesh(self):
                 "DeviceMesh creation requested but torch.distributed process group "
                 "has not been initialised.")
 
-        # Dimensions go from slowest-varying (outermost) to fastest-varying (innermost)
-        # Layout: pp is outermost, then tp, then cp is innermost (consecutive)
+        # Dimensions go from slowest-varying (outermost) to fastest-varying (innermost).
+        # Layout: pp is outermost, then tp, then cp is innermost (consecutive).
         dims = ["pp", "tp"]
         shape = [self.pp_size, self.tp_size]
 
diff --git a/tensorrt_llm/mapping.py b/tensorrt_llm/mapping.py
@@ -292,13 +292,13 @@ def has_cp(self):
         return self.cp_size > 1
 
     def prev_cp_rank(self):
-        # cp ranks are consecutive, so prev is rank - 1 with wraparound within cp group
+        # cp ranks are consecutive, so prev is rank - 1 with wraparound within cp group.
         if self.cp_rank == 0:
             return self.rank + self.cp_size - 1
         return self.rank - 1
 
     def next_cp_rank(self):
-        # cp ranks are consecutive, so next is rank + 1 with wraparound within cp group
+        # cp ranks are consecutive, so next is rank + 1 with wraparound within cp group.
         if self.cp_rank == self.cp_size - 1:
             return self.rank - self.cp_size + 1
         return self.rank + 1
@@ -596,15 +596,15 @@ def _init_parallel_groups(self):
             ranks = range(i, self.world_size, self.tp_size * self.cp_size)
             self.pp_groups.append(list(ranks))
 
-        # init cp group (consecutive ranks within each tp slice)
+        # init cp group (consecutive ranks within each tp slice).
         for i in range(self.pp_size):
             for j in range(self.tp_size):
                 ranks = range(
                     i * self.tp_size * self.cp_size + j * self.cp_size,
                     i * self.tp_size * self.cp_size + (j + 1) * self.cp_size)
                 self.cp_groups.append(list(ranks))
 
-        # init tp group (interleaved ranks with stride of cp_size)
+        # init tp group (interleaved ranks with stride of cp_size).
         for i in range(self.pp_size):
             for j in range(self.cp_size):
                 ranks = range(i * self.tp_size * self.cp_size + j,
diff --git a/tensorrt_llm/models/modeling_utils.py b/tensorrt_llm/models/modeling_utils.py
@@ -740,7 +740,7 @@ def from_checkpoint(
 
         rank = config.mapping.rank
         if config.mapping.cp_size > 1:
-            # cp_tp_pp rank -> tp_pp rank: because different cp ranks share the same ckpt
+            # cp_tp_pp rank -> tp_pp rank: because different cp ranks share the same ckpt.
             tp_size = config.mapping.tp_size
             cp_size = config.mapping.cp_size
             rank = (rank % (tp_size * cp_size)) // cp_size + rank // (
diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt
@@ -541,11 +541,13 @@ accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=0]
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=2]
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp1tp2cp2]
-accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:without_padding-pp1tp2cp2]
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp2cp2]
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1tp2cp2]
-accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:without_padding-pp1tp2cp2]
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp2cp2]
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp1tp1cp4]
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp1cp4]
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1tp1cp4]
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp1cp4]
 accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False]
 accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True]
 accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True]
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -66,7 +66,8 @@ l0_dgx_b200:
       backend: pytorch
       orchestrator: mpi
   tests:
-  - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp2cp2] TIMEOUT (60)
+  - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp2cp2] TIMEOUT (60)
+  - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp1cp4] TIMEOUT (60)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput] TIMEOUT (60)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] TIMEOUT (60)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_bs8_mtp] TIMEOUT (60)
@@ -93,7 +94,8 @@ l0_dgx_b200:
       backend: pytorch
       orchestrator: mpi
   tests:
-  - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp2cp2] TIMEOUT (60)
+  - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp2cp2] TIMEOUT (60)
+  - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp1cp4] TIMEOUT (60)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_corner_case TIMEOUT (60)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_fp8kv] TIMEOUT (60)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency] TIMEOUT (60)

Original file line number	Diff line number	Diff line change
`@@ -205,15 +205,14 @@ TargetRanksInfo TargetRanksInfoForDP(`
`205`	`205`	`}`
`206`	`206`
`207`	`207`	`std::vector<int> retRanks;`
`208`		`- for (int i = peerTPRankStart; i < peerTPRankEnd; i++)`
	`208`	`+ for (int i = peerCPRankStart; i < peerCPRankEnd; i++)`
`209`	`209`	`{`
`210`		`- for (int j = peerCPRankStart; j < peerCPRankEnd; j++)`
	`210`	`+ for (int j = peerTPRankStart; j < peerTPRankEnd; j++)`
`211`	`211`	`{`
`212`	`212`	`for (int k = peerPPRankStart; k < peerPPRankEnd; k++)`
`213`	`213`	`{`
`214`		`- // Rank formula: ppRank * (tpNum * cpNum) + tpRank * cpNum + cpRank`
`215`		`- // where i=tpRank, j=cpRank, k=ppRank`
`216`		`- int irank = (k * peerTPNum * peerCPNum) + (i * peerCPNum) + j;`
	`214`	`+ // Rank formula: ppRank * (tpNum * cpNum) + tpRank * cpNum + cpRank.`
	`215`	`+ int irank = (k * peerTPNum * peerCPNum) + (j * peerCPNum) + i;`
`217`	`216`	`retRanks.push_back(irank);`
`218`	`217`	`}`
`219`	`218`	`}`