diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 6351e083f52..bf9cf18e028 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -1664,6 +1664,8 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO extraInternalEnv = "__LUNOWUD=\"-thread_pool_size=${TESTER_CORES}\"" // CPP test execution is timing out easily, so we always override its internal timeout to the same value as pytest extraInternalEnv += " CPP_TEST_TIMEOUT_OVERRIDDEN=${pytestTestTimeout}" + // Enable NCCL debug information for multi-GPU tests + extraInternalEnv += " NCCL_DEBUG=INFO" def testDBList = renderTestDB(testList, llmSrc, stageName) testList = "${testList}_${splitId}" diff --git a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py index 7ccbc50d7b6..7a3560cc14b 100644 --- a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py +++ b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py @@ -183,7 +183,7 @@ def func(input, residual, norm_weight, eps, enable_fusion): ) def test_row_linear_residual_norm_fusion(seq_len, hidden_size, dtype, strategy, fusion): - + pytest.skip("https://nvbugs/5597647") torch.manual_seed(42) tensor_parallel_size = 2 diff --git a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py index a94e89c743f..5a38f0d0788 100644 --- a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py +++ b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py @@ -33,9 +33,6 @@ def test_deepseek_streaming(model_name, backend, quant, tp_size): is_fp8 = quant == "fp8" is_fp4 = quant == "fp4" - if tp_size == 4: - pytest.skip(f"https://nvbugs/5515753") - if torch.cuda.device_count() < tp_size: pytest.skip(f"Not enough GPUs available, need {tp_size} " f"but only have {torch.cuda.device_count()}") diff --git a/tests/unittest/_torch/thop/parallel/test_moe.py b/tests/unittest/_torch/thop/parallel/test_moe.py index 559aba5eb06..3c60c57283e 100644 --- a/tests/unittest/_torch/thop/parallel/test_moe.py +++ b/tests/unittest/_torch/thop/parallel/test_moe.py @@ -1062,6 +1062,7 @@ class TestMoeFp4: ) def test_autotune(self, num_tokens, hidden_size, intermediate_size, routing_info): + pytest.skip("https://nvbugs/5575841") self.run_moe_fp4_test(num_tokens, hidden_size, @@ -1114,6 +1115,7 @@ def test_autotune(self, num_tokens, hidden_size, intermediate_size, ids=["use_score_as_input", "use_topk_as_input"]) def test_no_autotune(self, num_tokens, hidden_size, intermediate_size, routing_info, use_topk_as_input): + pytest.skip("https://nvbugs/5575841") self.run_moe_fp4_test(num_tokens, hidden_size, @@ -1154,6 +1156,9 @@ def run_moe_fp4_test(self, num_tokens: int, hidden_size: int, if padding >= 256: pytest.skip("Routing kernel requires that padding be less than 256") + if intermediate_size == 384: + pytest.skip("https://nvbugs/5434352") + assert top_k <= num_experts assert top_k <= 8 assert num_experts % 4 == 0