Skip to content

Commit f59d64e

Browse files
authored
[None][fix] Several minor fixes to CI setting (#9765)
Signed-off-by: Yanchao Lu <[email protected]>
1 parent 7c6c493 commit f59d64e

File tree

3 files changed

+15
-9
lines changed

3 files changed

+15
-9
lines changed

jenkins/L0_Test.groovy

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1639,6 +1639,7 @@ def launchTestListCheck(pipeline)
16391639
sh "tar -zxf ${tarName}"
16401640
def llmPath = sh (script: "realpath .", returnStdout: true).trim()
16411641
def llmSrc = "${llmPath}/TensorRT-LLM/src"
1642+
trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install -r ${llmSrc}/requirements-dev.txt")
16421643
sh "NVIDIA_TRITON_SERVER_VERSION=25.10 LLM_ROOT=${llmSrc} LLM_BACKEND_ROOT=${llmSrc}/triton_backend python3 ${llmSrc}/scripts/check_test_list.py --l0 --qa --waive"
16431644
} catch (InterruptedException e) {
16441645
throw e
@@ -2903,8 +2904,10 @@ def launchTestJobs(pipeline, testFilter)
29032904
"DGX_B200-4_GPUs-PyTorch-2": ["b200-x4", "l0_dgx_b200", 2, 2, 4],
29042905
"DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4", "l0_dgx_b200", 1, 1, 4],
29052906
"DGX_B200-8_GPUs-PyTorch-1": ["b200-x8", "l0_dgx_b200", 1, 1, 8],
2906-
"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 1, 4, 1, true],
2907-
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4],
2907+
"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 2, 4, 1, true],
2908+
"DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-trtllm", "l0_dgx_b200", 2, 2, 4, 1, true],
2909+
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
2910+
"DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
29082911
// Perf sanity post merge test
29092912
// Disable perf stages due to https://nvbugs/5643646
29102913
// "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "perf_sanity_l0_dgx_b200", 1, 1, 4],
@@ -2933,7 +2936,8 @@ def launchTestJobs(pipeline, testFilter)
29332936
fullSet += SBSATestConfigs.keySet()
29342937

29352938
SBSASlurmTestConfigs = [
2936-
"GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
2939+
"GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 2, 4],
2940+
"GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4],
29372941
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
29382942
// Disable GB300 stages because the nodes will be offline temporarily.
29392943
// "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],

jenkins/scripts/slurm_run.sh

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,14 @@ set_value_in_command() {
2929
echo "$result"
3030
}
3131

32-
# Only the first process will save the job ID
32+
# Only the first process will save the job ID and set the git config
3333
if [ $SLURM_PROCID -eq 0 ]; then
3434
# Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
3535
echo $SLURM_JOB_ID > $jobWorkspace/slurm_job_id.txt
36+
# Update HOME/.gitconfig
37+
if ! git config --global --get-all safe.directory | grep -Fxq "*"; then
38+
git config --global --add safe.directory "*"
39+
fi
3640
fi
3741

3842
if [ $SLURM_LOCALID -eq 0 ]; then
@@ -47,7 +51,6 @@ if [ $SLURM_LOCALID -eq 0 ]; then
4751
fi
4852
cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt
4953
cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
50-
git config --global --add safe.directory "*"
5154
gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
5255
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
5356
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"

scripts/check_test_list.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,9 @@
2323

2424

2525
def install_python_dependencies(llm_src):
26-
subprocess.run(
27-
f"cd {llm_src} && pip3 install --retries 1 -r requirements-dev.txt",
28-
shell=True,
29-
check=True)
26+
subprocess.run(f"cd {llm_src} && pip3 install -r requirements-dev.txt",
27+
shell=True,
28+
check=True)
3029
subprocess.run(
3130
f"pip3 install --force-reinstall --no-deps {llm_src}/../tensorrt_llm-*.whl",
3231
shell=True,

0 commit comments

Comments
 (0)