Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 64 additions & 25 deletions jenkins/L0_Test.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -887,7 +887,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
// Create a unique suffix for the job name
String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
def disaggMode = stageName.contains("Perf-Sanity-Disagg")
def perfSanityMode = stageName.contains("PerfSanity")
def disaggMode = stageName.contains("PerfSanity-Disagg")
def setSegment = disaggMode

Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
Expand Down Expand Up @@ -930,6 +931,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
def scriptExecPathNode = "${jobWorkspace}/${jobUID}-slurm_exec.sh"
def coverageConfigFile = "${jobWorkspace}/.coveragerc"
def perfCheckScriptLocal = "${llmSrcLocal}/tests/integration/defs/perf/perf_regression_check.py"
def perfCheckScriptNode = "${jobWorkspace}/${jobUID}-perf_regression_check.py"

stage("[${stageName}] Initializing Test") {
// Create Job Workspace folder in Frontend Node
Expand Down Expand Up @@ -1004,6 +1007,16 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
coverageConfigFile
)

if (perfSanityMode) {
Utils.copyFileToRemoteHost(
pipeline,
remote,
perfCheckScriptLocal,
perfCheckScriptNode,
true
)
}

// Generate Pytest command
String pytestUtil = ""
if (nodeCount > 1) {
Expand Down Expand Up @@ -1078,7 +1091,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
// Define environment variables to export
def envVarNames = [
'OPEN_SEARCH_DB_BASE_URL',
'OPEN_SEARCH_DB_CREDENTIALS',
'OPEN_SEARCH_DB_CREDENTIALS_USR',
'OPEN_SEARCH_DB_CREDENTIALS_PSW',
'BUILD_ID',
'BUILD_URL',
'JOB_NAME',
Expand Down Expand Up @@ -1245,6 +1259,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
),
numRetries: 3
)

if (perfSanityMode) {
stage("[${stageName}] Check perf result") {
def perfCheckResult = Utils.exec(
pipeline,
script: Utils.sshUserCmd(
remote,
"python3 ${perfCheckScriptNode} ${jobWorkspace}/${stageName}"
),
returnStatus: true
)
if (perfCheckResult != 0) {
error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
}
}
}
}

echo "Finished test stage execution."
Expand Down Expand Up @@ -2698,7 +2728,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
error "Some tests still failed after rerun attempts, please check the test report."
}

if (perfMode && !stageName.contains("Perf-Sanity")) {
if (perfMode) {
basePerfFilename = stageName.contains("PyTorch") ? "base_perf_pytorch.csv" : "base_perf.csv"
basePerfPath = "${llmSrc}/tests/integration/defs/perf/${basePerfFilename}"
stage("Check perf result") {
Expand All @@ -2724,7 +2754,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
}
}

if (perfMode && stageName.contains("Perf-Sanity")) {
if (stageName.contains("PerfSanity")) {
stage ("Check perf result") {
def perfCheckResult = sh(
script: """
Expand All @@ -2733,10 +2763,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
""",
returnStatus: true
)
// TODO: Enable this when perf regression check is stable
// if (perfCheckResult != 0) {
// error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
// }
if (perfCheckResult != 0) {
error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
}
}
}
}
Expand Down Expand Up @@ -3100,7 +3129,7 @@ def launchTestJobs(pipeline, testFilter)
"RTXPro6000D-4_GPUs-PyTorch-Post-Merge-2": ["rtx-pro-6000d-x4", "l0_rtx_pro_6000", 2, 2, 4],
]

parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {
parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("-Perf-")), {
def config = VANILLA_CONFIG
if (key.contains("single-device")) {
config = SINGLE_DEVICE_CONFIG
Expand All @@ -3111,7 +3140,7 @@ def launchTestJobs(pipeline, testFilter)
if (key.contains("Pybind")) {
config = PYBIND_CONFIG
}
runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3])
runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3])
}]]}
fullSet = parallelJobs.keySet()

Expand All @@ -3132,9 +3161,9 @@ def launchTestJobs(pipeline, testFilter)
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
// Perf sanity post merge test
// "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 1, 4],
// "DGX_B200-8_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 1, 8],
// "DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "l0_dgx_b300_perf_sanity", 1, 1, 4],
// "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 1, 4],
// "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 1, 8],
// "DGX_B300-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b300-x4", "l0_dgx_b300_perf_sanity", 1, 1, 4],
]
fullSet += x86SlurmTestConfigs.keySet()

Expand All @@ -3146,7 +3175,7 @@ def launchTestJobs(pipeline, testFilter)
if (key.contains("llvm")) {
config = LLVM_CONFIG
}
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
}]]}

parallelJobs += parallelSlurmJobs
Expand All @@ -3162,11 +3191,19 @@ def launchTestJobs(pipeline, testFilter)
"GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 2, 4],
"GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4],
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
// Perf sanity post merge test
"GB200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 1, 4],
// GB300 stages are disabled because the nodes will be temporarily offline.
// "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
// "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
// Perf sanity pre merge test
"GB200-4_GPUs-PyTorch-PerfSanity-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 6, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 6, 4],
// Perf sanity post merge test
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 6, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 6, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 6, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 6, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 6, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 6, 4],
]
fullSet += SBSASlurmTestConfigs.keySet()

Expand All @@ -3178,13 +3215,15 @@ def launchTestJobs(pipeline, testFilter)
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
// Perf sanity post merge aggr tests
"GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001", 1, 1, 8, 2],
// Perf sanity post merge disagg tests
"GB200-12_GPUs-3_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001", 1, 1, 12, 3],
// "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001", 1, 1, 24, 6],
// "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002", 1, 1, 24, 6],
// "GB200-32_GPUs-8_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001", 1, 1, 32, 8],
// Perf sanity pre merge tests
"GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
// Perf sanity post merge tests
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 1, 2, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 2, 2, 8, 2],
"GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
// "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 1, 2, 24, 6],
// "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 2, 2, 24, 6],
// "GB200-32_GPUs-8_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes", 1, 1, 32, 8],
]
fullSet += multiNodesSBSAConfigs.keySet()

Expand All @@ -3202,7 +3241,7 @@ def launchTestJobs(pipeline, testFilter)
if (key.contains("llvm")) {
config = LLVM_CONFIG
}
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
}]]}
parallelJobs += parallelSlurmJobs

Expand All @@ -3215,7 +3254,7 @@ def launchTestJobs(pipeline, testFilter)
if (key.contains("llvm")) {
config = LLVM_CONFIG
}
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2, values[6] ?: false)
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2, values[6] ?: false)
}]]}

parallelJobs += parallelMultiNodesSBSAJobs
Expand Down
6 changes: 5 additions & 1 deletion jenkins/scripts/open_search_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
JOB_MACHINE_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-job_machine_info"
FAILED_STEP_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-failed_step_info"
PR_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-pr_info"
PERF_SANITY_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-perf_sanity_info"

READ_ACCESS_PROJECT_NAME = [
JOB_PROJECT_NAME,
Expand All @@ -59,9 +60,12 @@
JOB_MACHINE_PROJECT_NAME,
FAILED_STEP_PROJECT_NAME,
PR_PROJECT_NAME,
PERF_SANITY_PROJECT_NAME,
]

WRITE_ACCESS_PROJECT_NAME = []
WRITE_ACCESS_PROJECT_NAME = [
PERF_SANITY_PROJECT_NAME,
]

DISABLE_OPEN_SEARCH_DB_FOR_LOCAL_TEST = False

Expand Down
8 changes: 1 addition & 7 deletions jenkins/scripts/slurm_run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ echo "Full Command: $pytestCommand"
eval $pytestCommand
echo "Rank${SLURM_PROCID} Pytest finished execution"

if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Perf-Sanity* ]]; then
if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then
if [[ "$stageName" == *PyTorch* ]]; then
basePerfFilename="base_perf_pytorch.csv"
else
Expand All @@ -117,9 +117,3 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Pe
--files $stageName/perf_script_test_results.csv \
$basePerfPath
fi

if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" == *Perf-Sanity* ]]; then
echo "Check Perf-Sanity Result"
python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \
$jobWorkspace
fi
Loading