
Commit 00ec189

Merge branch 'main' into gk/FP4_weight_shape_fix
2 parents e43d366 + bdf6953 commit 00ec189

140 files changed, +7122 −3304 lines changed

Note: this is a large commit; only a subset of the 140 changed files is shown below.

examples/auto_deploy/build_and_run_ad.py

Lines changed: 5 additions & 1 deletion
@@ -277,7 +277,11 @@ def main(config: Optional[ExperimentConfig] = None):
         config.prompt.queries,
         sampling_params=SamplingParams(**config.prompt.sp_kwargs),
     )
-    results = {"prompts_and_outputs": print_outputs(outs)}
+    results = {
+        "prompts_and_outputs": print_outputs(outs),
+    }
+    # Add config values so they get logged to JET extra
+    results.update(config.model_dump(mode="json"))

     # run a benchmark for the model with batch_size == config.benchmark_bs
     if config.benchmark.enabled and config.args.runtime != "trtllm":
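For context, the change above folds the experiment config into the results dict via Pydantic's model_dump(mode="json"). A minimal, self-contained sketch of that pattern; DemoConfig and its values are illustrative stand-ins, not code from the repository:

# Minimal sketch of the results-merging pattern, assuming a Pydantic v2 config model.
from pydantic import BaseModel


class DemoConfig(BaseModel):
    model: str = "example-model"   # stand-in fields for ExperimentConfig
    batch_size: int = 8


config = DemoConfig()
results = {
    "prompts_and_outputs": ["example output"],  # stand-in for print_outputs(outs)
}
# model_dump(mode="json") returns JSON-serializable primitives, so the merged
# dict can be logged directly as extra metadata.
results.update(config.model_dump(mode="json"))
print(results)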

jenkins/L0_Test.groovy

Lines changed: 58 additions & 33 deletions
@@ -697,9 +697,13 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,

     slurmRunner = null
     if (cluster.containerRuntime.toString() == "DOCKER") {
-        slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
+        echo "${stageName} partitionTimeout: ${partition.time}"
+        def partitionTimeout = partition.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
+        slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, partitionTimeout, true)
     } else if (cluster.containerRuntime.toString() == "ENROOT") {
-        slurmRunner = runInEnrootOnNode(nodeName)
+        echo "${stageName} partitionTimeout: ${partition.time}"
+        def partitionTimeout = partition.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
+        slurmRunner = runInEnrootOnNode(nodeName, partitionTimeout)
     } else {
         throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}")
     }
@@ -889,7 +893,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     // Create a unique suffix for the job name
     String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
     def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
-    def disaggMode = stageName.contains("Perf-Sanity-Disagg")
+    def perfSanityMode = stageName.contains("PerfSanity")
+    def disaggMode = stageName.contains("PerfSanity-Disagg")
     def setSegment = disaggMode

     Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
@@ -1090,7 +1095,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     // Define environment variables to export
     def envVarNames = [
         'OPEN_SEARCH_DB_BASE_URL',
-        'OPEN_SEARCH_DB_CREDENTIALS',
+        'OPEN_SEARCH_DB_CREDENTIALS_USR',
+        'OPEN_SEARCH_DB_CREDENTIALS_PSW',
         'BUILD_ID',
         'BUILD_URL',
         'JOB_NAME',
@@ -1133,6 +1139,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     #SBATCH --output=${outputPath}
     ${taskArgs.collect { "#SBATCH $it" }.join('\n')}
     #SBATCH ${partition.additionalArgs}
+    ${partition?.time ? "#SBATCH --time=${partition.time}" : "#SBATCH --time=${SlurmConfig.DEFAULT_TIMEOUT_SHORT}"}
     ${(partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""}

     # SBATCH directives must appear before any executable commands.
@@ -2780,7 +2787,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
         error "Some tests still failed after rerun attempts, please check the test report."
     }

-    if (perfMode && !stageName.contains("Perf-Sanity")) {
+    if (perfMode) {
         basePerfFilename = stageName.contains("PyTorch") ? "base_perf_pytorch.csv" : "base_perf.csv"
         basePerfPath = "${llmSrc}/tests/integration/defs/perf/${basePerfFilename}"
         stage("Check perf result") {
@@ -2806,7 +2813,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
         }
     }

-    if (perfMode && stageName.contains("Perf-Sanity")) {
+    if (stageName.contains("PerfSanity")) {
         stage ("Check perf result") {
             def perfCheckResult = sh(
                 script: """
@@ -2815,10 +2822,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
                 """,
                 returnStatus: true
             )
-            // TODO: Enable this when perf regression check is stable
-            // if (perfCheckResult != 0) {
-            //     error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
-            // }
+            if (perfCheckResult != 0) {
+                error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
+            }
         }
     }
 }
@@ -3013,7 +3019,7 @@ def ensureStageResultNotUploaded(stageName) {
 }

 // TODO: Update existing functions to use runInDockerOnNodeMultiStage and get rid of runInDockerOnNode
-def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
+def runInDockerOnNodeMultiStage(image, label, dockerArgs, partitionTimeout, needToDeleteDir=true)
 {
     return {
         runner -> node(label) {
@@ -3024,9 +3030,9 @@ def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
             stage('Pull Docker Image') {
                 docker.image(image).pull()
             }
-            // We submit the Slurm job with SlurmConfig.DEFAULT_TIMEOUT minutes (300) timeout
+            // We submit the Slurm job with the Slurm partition's time spec.
             // Minus 10 minutes to avoid the Slurm job being stopped earlier.
-            timeout(time: SlurmConfig.DEFAULT_TIMEOUT - 10, unit: 'MINUTES') {
+            timeout(time: partitionTimeout - 10, unit: 'MINUTES') {
                 docker.image(image).inside(dockerArgs) {
                     runner()
                 }
@@ -3042,13 +3048,13 @@ def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
         }
     }

-def runInEnrootOnNode(label)
+def runInEnrootOnNode(label, partitionTimeout)
 {
     return {
         runner -> node(label) {
-            // We submit the Slurm job with SlurmConfig.DEFAULT_TIMEOUT_SHORT minutes (240) timeout
+            // We submit the Slurm job with the Slurm partition's time spec.
             // Minus 10 minutes to avoid the Slurm job being stopped earlier.
-            timeout(time: SlurmConfig.DEFAULT_TIMEOUT_SHORT - 10, unit: 'MINUTES') {
+            timeout(time: partitionTimeout - 10, unit: 'MINUTES') {
                 runner()
             }
         }
@@ -3182,7 +3188,7 @@ def launchTestJobs(pipeline, testFilter)
         "RTXPro6000D-4_GPUs-PyTorch-Post-Merge-2": ["rtx-pro-6000d-x4", "l0_rtx_pro_6000", 2, 2, 4],
     ]

-    parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {
+    parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("-Perf-")), {
         def config = VANILLA_CONFIG
         if (key.contains("single-device")) {
             config = SINGLE_DEVICE_CONFIG
@@ -3193,7 +3199,7 @@ def launchTestJobs(pipeline, testFilter)
         if (key.contains("Pybind")) {
             config = PYBIND_CONFIG
         }
-        runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3])
+        runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3])
     }]]}
     fullSet = parallelJobs.keySet()

@@ -3214,9 +3220,12 @@ def launchTestJobs(pipeline, testFilter)
         "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
         "DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
         // Perf sanity post merge test
-        // "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 1, 4],
-        // "DGX_B200-8_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 1, 8],
-        // "DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "l0_dgx_b300_perf_sanity", 1, 1, 4],
+        // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 3, 4],
+        // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x4", "l0_dgx_b200_perf_sanity", 2, 3, 4],
+        // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x4", "l0_dgx_b200_perf_sanity", 3, 3, 4],
+        // "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 3, 8],
+        // "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x8", "l0_dgx_b200_perf_sanity", 2, 3, 8],
+        // "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x8", "l0_dgx_b200_perf_sanity", 3, 3, 8],
     ]
     fullSet += x86SlurmTestConfigs.keySet()

@@ -3228,7 +3237,7 @@ def launchTestJobs(pipeline, testFilter)
         if (key.contains("llvm")) {
             config = LLVM_CONFIG
         }
-        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
+        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
     }]]}

     parallelJobs += parallelSlurmJobs
@@ -3247,11 +3256,25 @@ def launchTestJobs(pipeline, testFilter)
         "GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4],
         "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
         "GB10-PyTorch-Post-Merge-1": ["gb10x-single", "l0_gb10", 1, 1],
-        // Perf sanity post merge test
-        "GB200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 1, 4],
         // Disable GB300 stages due to nodes will be offline temporarily.
         // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
         // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
+        // Perf sanity pre merge test
+        "GB200-4_GPUs-PyTorch-PerfSanity-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4],
+        // Perf sanity post merge test
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-7": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 7, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-8": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 8, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-9": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 9, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-10": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 10, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-13": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 13, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-14": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 14, 14, 4],
     ]
     fullSet += SBSASlurmTestConfigs.keySet()

@@ -3263,13 +3286,15 @@ def launchTestJobs(pipeline, testFilter)
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
-        // Perf sanity post merge aggr tests
-        "GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001", 1, 1, 8, 2],
-        // Perf sanity post merge disagg tests
-        "GB200-12_GPUs-3_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001", 1, 1, 12, 3],
-        // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001", 1, 1, 24, 6],
-        // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002", 1, 1, 24, 6],
-        // "GB200-32_GPUs-8_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001", 1, 1, 32, 8],
+        // Perf sanity pre merge tests
+        // "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
+        // Perf sanity post merge tests
+        "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 1, 2, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 2, 2, 8, 2],
+        "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
+        // "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 1, 2, 24, 6],
+        // "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 2, 2, 24, 6],
+        // "GB200-32_GPUs-8_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes", 1, 1, 32, 8],
     ]
     fullSet += multiNodesSBSAConfigs.keySet()

@@ -3287,7 +3312,7 @@ def launchTestJobs(pipeline, testFilter)
         if (key.contains("llvm")) {
             config = LLVM_CONFIG
         }
-        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
+        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
     }]]}
     parallelJobs += parallelSlurmJobs

@@ -3300,7 +3325,7 @@ def launchTestJobs(pipeline, testFilter)
         if (key.contains("llvm")) {
             config = LLVM_CONFIG
         }
-        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2, values[6] ?: false)
+        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2, values[6] ?: false)
     }]]}

     parallelJobs += parallelMultiNodesSBSAJobs
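The recurring change in this file is timeout plumbing: each stage now derives its timeout from the Slurm partition's time limit, falling back to SlurmConfig.DEFAULT_TIMEOUT_SHORT, and runs the inner work with a 10-minute margin. A rough Python sketch of that logic; the numeric values are illustrative stand-ins, and the actual implementation is the Groovy shown above:

# Sketch of the partition-timeout fallback and safety margin; values are stand-ins.
DEFAULT_TIMEOUT_SHORT = 240  # minutes, stand-in for SlurmConfig.DEFAULT_TIMEOUT_SHORT


def jenkins_timeout_minutes(partition_time: int | None) -> int:
    """Use the partition's time limit when set, else the default, minus a
    10-minute margin so the Jenkins wrapper times out before Slurm kills the job."""
    partition_timeout = partition_time if partition_time else DEFAULT_TIMEOUT_SHORT
    return partition_timeout - 10


print(jenkins_timeout_minutes(None))  # 230
print(jenkins_timeout_minutes(300))   # 290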

jenkins/scripts/open_search_db.py

Lines changed: 5 additions & 1 deletion
@@ -51,6 +51,7 @@
 JOB_MACHINE_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-job_machine_info"
 FAILED_STEP_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-failed_step_info"
 PR_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-pr_info"
+PERF_SANITY_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-perf_sanity_info"

 READ_ACCESS_PROJECT_NAME = [
     JOB_PROJECT_NAME,
@@ -59,9 +60,12 @@
     JOB_MACHINE_PROJECT_NAME,
     FAILED_STEP_PROJECT_NAME,
     PR_PROJECT_NAME,
+    PERF_SANITY_PROJECT_NAME,
 ]

-WRITE_ACCESS_PROJECT_NAME = []
+WRITE_ACCESS_PROJECT_NAME = [
+    PERF_SANITY_PROJECT_NAME,
+]

 DISABLE_OPEN_SEARCH_DB_FOR_LOCAL_TEST = False
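A hypothetical usage sketch (not code from the repository) of how a client could gate uploads against the WRITE_ACCESS_PROJECT_NAME list defined above, so only the perf_sanity index accepts writes while the other indices stay read-only; PROJECT_ROOT, MODE, and upload_records are illustrative stand-ins:

# Hypothetical gate on the write-access list; names and values are stand-ins.
PROJECT_ROOT = "example-project"
MODE = "prod"

PERF_SANITY_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-perf_sanity_info"
WRITE_ACCESS_PROJECT_NAME = [PERF_SANITY_PROJECT_NAME]


def upload_records(project_name: str, records: list[dict]) -> None:
    """Refuse to write to any index that is not explicitly write-enabled."""
    if project_name not in WRITE_ACCESS_PROJECT_NAME:
        raise PermissionError(f"{project_name} is read-only in this configuration")
    # ... the actual OpenSearch bulk-upload call would go here ...
    print(f"would upload {len(records)} record(s) to {project_name}")


upload_records(PERF_SANITY_PROJECT_NAME, [{"job": "PerfSanity", "status": "pass"}])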

jenkins/scripts/slurm_run.sh

Lines changed: 2 additions & 2 deletions
@@ -108,7 +108,7 @@ eval $pytestCommand
 pytest_exit_code=$?
 echo "Rank${SLURM_PROCID} Pytest finished execution with exit code $pytest_exit_code"

-if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Perf-Sanity* ]]; then
+if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then
     if [[ "$stageName" == *PyTorch* ]]; then
         basePerfFilename="base_perf_pytorch.csv"
     else
@@ -135,7 +135,7 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Pe
     echo "Rank${SLURM_PROCID} Perf check finished execution with exit code $perf_check_exit_code"
 fi

-if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" == *Perf-Sanity* ]]; then
+if [ $SLURM_PROCID -eq 0 ] && [[ "$stageName" == *PerfSanity* ]]; then
     echo "Check Perf-Sanity Result"
     python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \
         $jobWorkspace
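For clarity, an illustrative Python sketch of the gating this diff introduces (the real logic is the Bash above): the baseline perf check runs for any perf-mode stage on rank 0, while the perf-sanity regression check now keys solely on the stage name containing "PerfSanity", independent of perfMode:

# Illustrative sketch of the rank-0 result-check gating; not code from the repository.
def checks_to_run(rank: int, perf_mode: bool, stage_name: str) -> list[str]:
    checks: list[str] = []
    if rank != 0:
        return checks  # only rank 0 performs result checks
    if perf_mode:
        checks.append("base_perf_check")        # compares against base_perf*.csv
    if "PerfSanity" in stage_name:
        checks.append("perf_regression_check")  # perf_regression_check.py on the workspace
    return checks


print(checks_to_run(0, False, "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1"))
# ['perf_regression_check']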
