Skip to content

Commit 1375b9f

Browse files
authored
[https://nvbugs/5515753][ci] Add NCCL_DEBUG=INFO flag to collect more info on CI failures. (#8440)
Signed-off-by: Simeng Liu <simengl@nvidia.com>
1 parent 0acdecb commit 1375b9f

File tree

4 files changed

+8
-4
lines changed

4 files changed

+8
-4
lines changed

jenkins/L0_Test.groovy

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1664,6 +1664,8 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
16641664
extraInternalEnv = "__LUNOWUD=\"-thread_pool_size=${TESTER_CORES}\""
16651665
// CPP test execution is timing out easily, so we always override its internal timeout to the same value as pytest
16661666
extraInternalEnv += " CPP_TEST_TIMEOUT_OVERRIDDEN=${pytestTestTimeout}"
1667+
// Enable NCCL debug information for multi-GPU tests
1668+
extraInternalEnv += " NCCL_DEBUG=INFO"
16671669

16681670
def testDBList = renderTestDB(testList, llmSrc, stageName)
16691671
testList = "${testList}_${splitId}"

tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ def func(input, residual, norm_weight, eps, enable_fusion):
183183
)
184184
def test_row_linear_residual_norm_fusion(seq_len, hidden_size, dtype, strategy,
185185
fusion):
186-
186+
pytest.skip("https://nvbugs/5597647")
187187
torch.manual_seed(42)
188188
tensor_parallel_size = 2
189189

tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,6 @@ def test_deepseek_streaming(model_name, backend, quant, tp_size):
3333
is_fp8 = quant == "fp8"
3434
is_fp4 = quant == "fp4"
3535

36-
if tp_size == 4:
37-
pytest.skip(f"https://nvbugs/5515753")
38-
3936
if torch.cuda.device_count() < tp_size:
4037
pytest.skip(f"Not enough GPUs available, need {tp_size} "
4138
f"but only have {torch.cuda.device_count()}")

tests/unittest/_torch/thop/parallel/test_moe.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1062,6 +1062,7 @@ class TestMoeFp4:
10621062
)
10631063
def test_autotune(self, num_tokens, hidden_size, intermediate_size,
10641064
routing_info):
1065+
pytest.skip("https://nvbugs/5575841")
10651066

10661067
self.run_moe_fp4_test(num_tokens,
10671068
hidden_size,
@@ -1114,6 +1115,7 @@ def test_autotune(self, num_tokens, hidden_size, intermediate_size,
11141115
ids=["use_score_as_input", "use_topk_as_input"])
11151116
def test_no_autotune(self, num_tokens, hidden_size, intermediate_size,
11161117
routing_info, use_topk_as_input):
1118+
pytest.skip("https://nvbugs/5575841")
11171119

11181120
self.run_moe_fp4_test(num_tokens,
11191121
hidden_size,
@@ -1154,6 +1156,9 @@ def run_moe_fp4_test(self, num_tokens: int, hidden_size: int,
11541156
if padding >= 256:
11551157
pytest.skip("Routing kernel requires that padding be less than 256")
11561158

1159+
if intermediate_size == 384:
1160+
pytest.skip("https://nvbugs/5434352")
1161+
11571162
assert top_k <= num_experts
11581163
assert top_k <= 8
11591164
assert num_experts % 4 == 0

0 commit comments

Comments (0)