diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 1e4c3309fdc..1e226c78d21 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -939,8 +939,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh") def scriptExecPathNode = "${jobWorkspace}/${jobUID}-slurm_exec.sh" def coverageConfigFile = "${jobWorkspace}/.coveragerc" - def perfCheckScriptLocal = "${llmSrcLocal}/tests/integration/defs/perf/perf_regression_check.py" - def perfCheckScriptNode = "${jobWorkspace}/${jobUID}-perf_regression_check.py" stage("[${stageName}] Initializing Test") { // Create Job Workspace folder in Frontend Node @@ -1023,16 +1021,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG coverageConfigFile ) - if (perfSanityMode) { - Utils.copyFileToRemoteHost( - pipeline, - remote, - perfCheckScriptLocal, - perfCheckScriptNode, - true - ) - } - // Generate Pytest command String pytestUtil = "" if (nodeCount > 1) { @@ -1314,22 +1302,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG ), numRetries: 3 ) - - if (perfSanityMode) { - stage("[${stageName}] Check perf result") { - def perfCheckResult = Utils.exec( - pipeline, - script: Utils.sshUserCmd( - remote, - "python3 ${perfCheckScriptNode} ${jobWorkspace}" - ), - returnStatus: true - ) - if (perfCheckResult != 0) { - error "Performance regression detected and failing the build (exit code: ${perfCheckResult})" - } - } - } } echo "Finished test stage execution." @@ -3297,15 +3269,10 @@ def launchTestJobs(pipeline, testFilter) "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 14, 4], - "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4], - "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4], - "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-7": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 7, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-8": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 8, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-9": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 9, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-10": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 10, 14, 4], - "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4], - "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-13": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 13, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-14": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 14, 14, 4], ] diff --git a/jenkins/scripts/slurm_run.sh b/jenkins/scripts/slurm_run.sh index e8085f0e095..584ae1e7c97 100755 --- a/jenkins/scripts/slurm_run.sh +++ b/jenkins/scripts/slurm_run.sh @@ -135,6 +135,14 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then echo "Rank${SLURM_PROCID} Perf check finished execution with exit code $perf_check_exit_code" fi +if [ $SLURM_PROCID -eq 0 ] && [[ "$stageName" == *PerfSanity* ]]; then + echo "Check Perf-Sanity Result" + python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \ + $jobWorkspace + perf_sanity_check_exit_code=$? + echo "Rank${SLURM_PROCID} Perf-Sanity check finished execution with exit code $perf_sanity_check_exit_code" +fi + if [ "$pytest_exit_code" -ne 0 ]; then final_exit_code=$pytest_exit_code elif [ "$perf_check_exit_code" -ne 0 ]; then diff --git a/tests/integration/defs/perf/open_search_db_utils.py b/tests/integration/defs/perf/open_search_db_utils.py index 28306b6fe1c..9dec673688a 100644 --- a/tests/integration/defs/perf/open_search_db_utils.py +++ b/tests/integration/defs/perf/open_search_db_utils.py @@ -62,6 +62,10 @@ "d_p99_e2el", ] +# Default threshold values for performance regression detection +POST_MERGE_THRESHOLD = 0.05 +PRE_MERGE_THRESHOLD = 0.1 + # Fields for scenario-only matching for recipe tests. # Unlike regular tests that match on all config fields, recipes match only on the benchmark # scenario, allowing the underlying config to change while still comparing against baselines diff --git a/tests/integration/defs/perf/test_perf_sanity.py b/tests/integration/defs/perf/test_perf_sanity.py index 748fdd71352..f7fb6b536a7 100644 --- a/tests/integration/defs/perf/test_perf_sanity.py +++ b/tests/integration/defs/perf/test_perf_sanity.py @@ -463,6 +463,7 @@ class DisaggConfig: def __init__( self, + name: str, disagg_serving_type: str, hostname: str, numa_bind: bool, @@ -472,6 +473,7 @@ def __init__( hardware: dict, server_env_var: str, ): + self.name = name self.disagg_serving_type = disagg_serving_type self.hostname = hostname self.numa_bind = numa_bind @@ -971,7 +973,7 @@ def _parse_disagg_config_file(self, config_file_path: str, config_file: str): # Create ctx server config ctx_server_config_data = { "concurrency": max(concurrency_values), - "name": f"ctx_{config_file_base_name}", + "name": config_file_base_name, "model_name": model_name, "gpus_per_node": gpus_per_node, **worker_config.get("ctx", {}), @@ -980,7 +982,7 @@ def _parse_disagg_config_file(self, config_file_path: str, config_file: str): # Create gen server config gen_server_config_data = { "concurrency": max(concurrency_values), - "name": f"gen_{config_file_base_name}", + "name": config_file_base_name, "model_name": model_name, "gpus_per_node": gpus_per_node, **worker_config.get("gen", {}), @@ -991,6 +993,7 @@ def _parse_disagg_config_file(self, config_file_path: str, config_file: str): # Create disagg config disagg_config = DisaggConfig( + name=config_file_base_name, disagg_serving_type=disagg_serving_type, hostname=socket.gethostname(), numa_bind=numa_bind, @@ -1249,6 +1252,8 @@ def add_dict_prefix(config_dict: dict, prefix_name: str) -> dict: new_data.update(job_config) new_data.update(server_config_dict) new_data.update(client_config_dict) + # Add test_case_name for convenient filtering on OpenSearch + new_data["s_test_case_name"] = f"{server_config.name}-{client_config.name}" for metric_name in PERF_METRIC_LOG_QUERIES: if metric_name in self._test_results[cmd_idx]: @@ -1308,12 +1313,13 @@ def add_dict_prefix(config_dict: dict, prefix_name: str) -> dict: "l_num_gen_servers": num_gen_servers, } new_data.update(job_config) - if num_ctx_servers > 0: new_data.update(ctx_server_config_dict) if num_gen_servers > 0: new_data.update(gen_server_config_dict) new_data.update(client_config_dict) + # Add test_case_name for convenient filtering on OpenSearch + new_data["s_test_case_name"] = f"{disagg_config.name}-{client_config.name}" for metric_name in PERF_METRIC_LOG_QUERIES: if metric_name in self._test_results[cmd_idx]: diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml index 4b78b74a264..5d38a2a36a5 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml @@ -14,20 +14,20 @@ l0_gb200_multi_gpus_perf_sanity: stage: pre_merge backend: pytorch tests: - - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_8k1k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (180) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (60) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (60) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_8k1k] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] TIMEOUT (60) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (60) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] TIMEOUT (60) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] TIMEOUT (60) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (90) - condition: ranges: system_gpu_count: @@ -42,17 +42,17 @@ l0_gb200_multi_gpus_perf_sanity: stage: post_merge backend: pytorch tests: - - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_8k1k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (180) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (60) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (60) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_8k1k] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] TIMEOUT (60) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (60) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] TIMEOUT (60) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] TIMEOUT (60) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (90)