@@ -642,6 +642,11 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
                 echo "--gpus ${gpuCount}"
             fi
         """, returnStdout: true).trim()
+
+        if (cluster.host.contains("dlcluster")) {
+            dockerArgs += " " + sh(script: 'echo "-e NVIDIA_IMEX_CHANNELS=${NVIDIA_IMEX_CHANNELS:-0}"', returnStdout: true).trim()
+            dockerArgs += " --device=/dev/gdrdrv:/dev/gdrdrv"
+        }
     }
 
     dockerArgs = "${dockerArgs} " +
@@ -655,10 +660,6 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
655660 " -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
656661 " --cap-add=SYSLOG"
657662
658- if (partition. clusterName == " dlcluster" ) {
659- dockerArgs + = " -e NVIDIA_IMEX_CHANNELS=0"
660- dockerArgs + = " --device=/dev/gdrdrv:/dev/gdrdrv"
661- }
662663 echo " Final dockerArgs: ${ dockerArgs} "
663664 } else {
664665 error " The Slurm node does not come online in the waiting period. Terminating the job."
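
The relocated dlcluster block resolves the IMEX channel on the node instead of hard-coding 0: the single-quoted Groovy string keeps Jenkins from interpolating `${NVIDIA_IMEX_CHANNELS:-0}`, so the shell run by the `sh` step performs the expansion. A minimal sketch of that expansion (plain bash, values illustrative):

```bash
# If the node does not export NVIDIA_IMEX_CHANNELS, the fallback 0 is used.
unset NVIDIA_IMEX_CHANNELS
echo "-e NVIDIA_IMEX_CHANNELS=${NVIDIA_IMEX_CHANNELS:-0}"   # -> -e NVIDIA_IMEX_CHANNELS=0

# If the node exports a value, that value is passed through to the docker args.
export NVIDIA_IMEX_CHANNELS=3
echo "-e NVIDIA_IMEX_CHANNELS=${NVIDIA_IMEX_CHANNELS:-0}"   # -> -e NVIDIA_IMEX_CHANNELS=3
```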
@@ -996,8 +997,11 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 export resourcePathNode=$resourcePathNode
 export pytestCommand="$pytestCommand"
 export coverageConfigFile="$coverageConfigFile"
-export NVIDIA_IMEX_CHANNELS=0
-[ -z "\${NVIDIA_VISIBLE_DEVICES:-}" ] && export NVIDIA_VISIBLE_DEVICES=\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))
+export NVIDIA_IMEX_CHANNELS=\${NVIDIA_IMEX_CHANNELS:-0}
+export NVIDIA_VISIBLE_DEVICES=\${NVIDIA_VISIBLE_DEVICES:-\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))}
+
+echo "Env NVIDIA_IMEX_CHANNELS: \$NVIDIA_IMEX_CHANNELS"
+echo "Env NVIDIA_VISIBLE_DEVICES: \$NVIDIA_VISIBLE_DEVICES"
 
 ${srunPrologue}
 
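
The sbatch script now uses the same default-expansion pattern, so values exported by the submitting environment take precedence and the `seq`-built device list is only a fallback. A minimal sketch of that fallback behaviour (plain bash; the GPU count is a stand-in for the `nvidia-smi` query above):

```bash
gpu_count=4                                              # stand-in for the nvidia-smi GPU-count query
unset NVIDIA_VISIBLE_DEVICES
export NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-$(seq -s, 0 $((gpu_count-1)))}
echo "Env NVIDIA_VISIBLE_DEVICES: $NVIDIA_VISIBLE_DEVICES"   # -> 0,1,2,3

export NVIDIA_VISIBLE_DEVICES=0,2                        # a pre-set value is kept as-is
export NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-$(seq -s, 0 $((gpu_count-1)))}
echo "Env NVIDIA_VISIBLE_DEVICES: $NVIDIA_VISIBLE_DEVICES"   # -> 0,2
```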