Skip to content

Commit 1375b9f

Browse files
authored
[https://nvbugs/5515753][ci] Add NCCL_DEBUG=INFO flag to collect more info on CI failures. (#8440)
Signed-off-by: Simeng Liu <simengl@nvidia.com>
1 parent 0acdecb commit 1375b9f

File tree

4 files changed

+8
-4
lines changed

4 files changed

+8
-4
lines changed

jenkins/L0_Test.groovy

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1664,6 +1664,8 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
16641664
extraInternalEnv = "__LUNOWUD=\"-thread_pool_size=${TESTER_CORES}\""
16651665
// CPP test execution is timing out easily, so we always override its internal timeout to the same value as pytest
16661666
extraInternalEnv += " CPP_TEST_TIMEOUT_OVERRIDDEN=${pytestTestTimeout}"
1667+
// Enable NCCL debug information for multi-GPU tests
1668+
extraInternalEnv += " NCCL_DEBUG=INFO"
16671669

16681670
def testDBList = renderTestDB(testList, llmSrc, stageName)
16691671
testList = "${testList}_${splitId}"

tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ def func(input, residual, norm_weight, eps, enable_fusion):
183183
)
184184
def test_row_linear_residual_norm_fusion(seq_len, hidden_size, dtype, strategy,
185185
fusion):
186-
186+
pytest.skip("https://nvbugs/5597647")
187187
torch.manual_seed(42)
188188
tensor_parallel_size = 2
189189

tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,6 @@ def test_deepseek_streaming(model_name, backend, quant, tp_size):
3333
is_fp8 = quant == "fp8"
3434
is_fp4 = quant == "fp4"
3535

36-
if tp_size == 4:
37-
pytest.skip(f"https://nvbugs/5515753")
38-
3936
if torch.cuda.device_count() < tp_size:
4037
pytest.skip(f"Not enough GPUs available, need {tp_size} "
4138
f"but only have {torch.cuda.device_count()}")

tests/unittest/_torch/thop/parallel/test_moe.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1062,6 +1062,7 @@ class TestMoeFp4:
10621062
)
10631063
def test_autotune(self, num_tokens, hidden_size, intermediate_size,
10641064
routing_info):
1065+
pytest.skip("https://nvbugs/5575841")
10651066

10661067
self.run_moe_fp4_test(num_tokens,
10671068
hidden_size,
@@ -1114,6 +1115,7 @@ def test_autotune(self, num_tokens, hidden_size, intermediate_size,
11141115
ids=["use_score_as_input", "use_topk_as_input"])
11151116
def test_no_autotune(self, num_tokens, hidden_size, intermediate_size,
11161117
routing_info, use_topk_as_input):
1118+
pytest.skip("https://nvbugs/5575841")
11171119

11181120
self.run_moe_fp4_test(num_tokens,
11191121
hidden_size,
@@ -1154,6 +1156,9 @@ def run_moe_fp4_test(self, num_tokens: int, hidden_size: int,
11541156
if padding >= 256:
11551157
pytest.skip("Routing kernel requires that padding be less than 256")
11561158

1159+
if intermediate_size == 384:
1160+
pytest.skip("https://nvbugs/5434352")
1161+
11571162
assert top_k <= num_experts
11581163
assert top_k <= 8
11591164
assert num_experts % 4 == 0

0 commit comments

Comments (0)