@@ -751,6 +751,8 @@ def getPytestBaseCommandLine(
751751 extraInternalEnv = " __LUNOWUD=\" -thread_pool_size=${ TESTER_CORES} \" "
752752 // CPP test execution is timing out easily, so we always override its internal timeout to the same value as pytest
753753 extraInternalEnv + = " CPP_TEST_TIMEOUT_OVERRIDDEN=${ pytestTestTimeout} "
754+ // Enable NCCL debug information for multi-GPU tests
755+ extraInternalEnv + = " NCCL_DEBUG=INFO"
754756
755757 def testCmdLine = [
756758 " LLM_ROOT=${ llmSrc} " ,
@@ -2252,20 +2254,6 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
22522254 def noRegularTests = false
22532255 def noIsolateTests = false
22542256 def rerunFailed = false
2255-
2256- echoNodeAndGpuInfo(pipeline, stageName)
2257- sh ' if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'
2258-
2259- def extraInternalEnv = " "
2260- def pytestTestTimeout = " 3600"
2261-
2262- // TRT uses half of the host logic cores for engine building which is bad for multi-GPU machines.
2263- extraInternalEnv = " __LUNOWUD=\" -thread_pool_size=${ TESTER_CORES} \" "
2264- // CPP test execution is timing out easily, so we always override its internal timeout to the same value as pytest
2265- extraInternalEnv + = " CPP_TEST_TIMEOUT_OVERRIDDEN=${ pytestTestTimeout} "
2266- // Enable NCCL debug information for multi-GPU tests
2267- extraInternalEnv + = " NCCL_DEBUG=INFO"
2268-
22692257 def testDBList = renderTestDB(testList, llmSrc, stageName)
22702258
22712259 // Process shard test list and create separate files for regular and isolate tests
0 commit comments