From 7b8983a8cfff742a507000352aa7ccbe25f7afb8 Mon Sep 17 00:00:00 2001 From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> Date: Tue, 23 Dec 2025 11:24:08 +0000 Subject: [PATCH 01/13] fix slurm log path error Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> --- .../defs/perf/disagg/execution/executor.py | 113 ++++++------------ .../defs/perf/disagg/utils/common.py | 13 -- 2 files changed, 37 insertions(+), 89 deletions(-) diff --git a/tests/integration/defs/perf/disagg/execution/executor.py b/tests/integration/defs/perf/disagg/execution/executor.py index 8f8f82a063d..2f4c482f052 100644 --- a/tests/integration/defs/perf/disagg/execution/executor.py +++ b/tests/integration/defs/perf/disagg/execution/executor.py @@ -219,7 +219,10 @@ def submit_job(test_config) -> tuple: # Call submit.py with the temporary config file submit_script = os.path.join(EnvManager.get_script_dir(), "submit.py") - cmd = ["python3", submit_script, "-c", temp_config_path] + + case_log_dir = JobManager.get_result_dir(test_config) + + cmd = ["python3", submit_script, "-c", temp_config_path, "--log-dir", case_log_dir] logger.info(f"Command: {' '.join(cmd)}") @@ -265,55 +268,49 @@ def backup_logs( Args: job_id: SLURM job ID test_config: TestConfig object - result_dir: Result directory path + result_dir: Result directory path (already named as test_id) is_passed: Whether the job passed Returns: - backup_dir path if successful, None otherwise + Final directory path if successful, None otherwise """ if not os.path.exists(result_dir): logger.warning(f"Result directory does not exist yet: {result_dir}") return None - # Replace colons with hyphens for safe directory naming - dst_dir_name = test_config.test_id.replace(":", "-") - # Add ERROR suffix if the job failed - if not is_passed: - dst_dir_name = f"{dst_dir_name}_ERROR" - backup_dir = os.path.join(os.path.dirname(result_dir), dst_dir_name) - try: - logger.info("Copying result directory to 
backup...") - logger.info(f"Source: {result_dir}") - logger.info(f"Destination: {backup_dir}") - - # Remove old backup if it exists - if os.path.exists(backup_dir): - logger.warning("Backup directory already exists, removing old backup") - shutil.rmtree(backup_dir) - - # Copy result directory - shutil.copytree(result_dir, backup_dir) - logger.success(f"Backup created successfully: {backup_dir}") - - # Move temporary config file to backup directory (not copy) + final_dir = result_dir + + # For FAILED cases, rename directory to add _ERROR suffix + if not is_passed: + error_dir = f"{result_dir}_ERROR" + logger.info(f"Renaming failed case directory: {result_dir} -> {error_dir}") + + # Remove old error directory if exists + if os.path.exists(error_dir): + logger.warning(f"Removing existing error directory: {error_dir}") + shutil.rmtree(error_dir) + + # Rename to add _ERROR suffix + shutil.move(result_dir, error_dir) + final_dir = error_dir + logger.success(f"Directory renamed to: {final_dir}") + + # Copy temporary config file to the directory temp_config_path = test_config.temp_config_path if os.path.exists(temp_config_path): - dest_path = os.path.join(backup_dir, os.path.basename(temp_config_path)) - shutil.move(temp_config_path, dest_path) - logger.success(f"Temporary config moved to backup: {dest_path}") + dest_path = os.path.join(final_dir, os.path.basename(temp_config_path)) + shutil.copy(temp_config_path, dest_path) + logger.success(f"Temporary config copied to: {dest_path}") + # Clean up the original temp config file + os.remove(temp_config_path) + logger.info(f"Cleaned up temporary config: {temp_config_path}") else: - # Fallback: copy original config if no temp file (backward compatibility) - case_config_path = test_config.config_path - if os.path.exists(case_config_path): - shutil.copy(case_config_path, backup_dir) - logger.success(f"Case config copied successfully: {case_config_path}") - else: - logger.warning(f"Case config not found: {case_config_path}") + 
logger.warning(f"Temporary config not found: {temp_config_path}") - return backup_dir + return final_dir except Exception as e: - logger.warning(f"Failed to create backup copy: {e}") + logger.warning(f"Failed to backup logs: {e}") # Try to clean up temporary file on backup failure temp_config_path = test_config.temp_config_path if os.path.exists(temp_config_path): @@ -324,26 +321,6 @@ def backup_logs( logger.warning(f"Failed to cleanup temp config: {cleanup_error}") return None - @staticmethod - def cleanup_result_dir(result_dir: str) -> bool: - """Clean up result directory. - - Args: - result_dir: Result directory path - - Returns: - True if successful, False otherwise - """ - if os.path.exists(result_dir): - try: - shutil.rmtree(result_dir) - logger.success(f"Result directory removed: {result_dir}") - return True - except Exception as e: - logger.warning(f"Failed to remove result directory: {e}") - return False - return True - @staticmethod def get_result_dir(test_config) -> str: """Get result directory. 
@@ -354,16 +331,10 @@ def get_result_dir(test_config) -> str: Returns: Result directory path """ - config_data = test_config.config_data - fields = extract_config_fields(config_data) - - # Extract fields for logging and result directory - log_base = fields["log_base"] - context_dir = fields["context_dir"] - log_dir_name = log_base - - result_dir = os.path.join(EnvManager.get_script_dir(), log_dir_name, context_dir) - return result_dir + # Use the same path as in submit_job: {output_path}/slurm_logs/{test_id} + log_dir = os.path.join(EnvManager.get_output_path(), "slurm_logs") + case_log_dir = os.path.join(log_dir, test_config.test_id.replace(":", "-")) + return case_log_dir @staticmethod def check_result( @@ -413,16 +384,6 @@ def check_result( except Exception as e: logger.error(f"Exception during result checking: {e}") check_result["error"] = f"Exception during result checking: {str(e)}" - - # Clean up result directory - if EnvManager.get_debug_mode(): - logger.debug(f"Debug mode: Skipping result directory cleanup: {result_dir}") - else: - try: - JobManager.cleanup_result_dir(result_dir) - except Exception as e: - logger.warning(f"Failed to cleanup result directory: {e}") - return check_result @staticmethod diff --git a/tests/integration/defs/perf/disagg/utils/common.py b/tests/integration/defs/perf/disagg/utils/common.py index 622aed81ce2..6c8805e3636 100644 --- a/tests/integration/defs/perf/disagg/utils/common.py +++ b/tests/integration/defs/perf/disagg/utils/common.py @@ -1,7 +1,6 @@ """Disaggregated Benchmark Configuration.""" import os -from datetime import datetime SESSION_COLLECT_CMD_TYPE = "session_collect" @@ -191,15 +190,6 @@ def extract_config_fields(config_data: dict) -> dict: if "speculative_config" in gen_config: mtp_size = gen_config["speculative_config"].get("num_nextn_predict_layers", 0) - # Generate derived fields - dep_flag = "dep" if gen_enable_dp else "tep" - date_prefix = datetime.now().strftime("%Y%m%d") - log_base = 
f"{date_prefix}/{isl}-{osl}" - context_dir = ( - f"disagg_ctx{ctx_num}_gen{gen_num}_{dep_flag}{gen_tp_size}_" - f"batch{gen_batch_size}_eplb{eplb_slots}_mtp{mtp_size}" - ) - return { "isl": isl, "osl": osl, @@ -210,10 +200,7 @@ def extract_config_fields(config_data: dict) -> dict: "gen_enable_dp": gen_enable_dp, "eplb_slots": eplb_slots, "mtp_size": mtp_size, - "dep_flag": dep_flag, "cache_transceiver_backend": cache_transceiver_backend, - "log_base": log_base, - "context_dir": context_dir, "gen_max_tokens": gen_max_tokens, "gen_max_batch_size": gen_max_batch_size, "streaming": streaming, From db33cbce4a00261d9ee8cc81688484412cb355be Mon Sep 17 00:00:00 2001 From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> Date: Wed, 24 Dec 2025 02:45:00 +0000 Subject: [PATCH 02/13] fix default backend issue - adapt for the current logic - skip single backend result for comparison Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> --- .../defs/perf/disagg/compare_backends.py | 37 ++++++++++++++++++- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/tests/integration/defs/perf/disagg/compare_backends.py b/tests/integration/defs/perf/disagg/compare_backends.py index 1812fd36d59..8ff9c1d3631 100644 --- a/tests/integration/defs/perf/disagg/compare_backends.py +++ b/tests/integration/defs/perf/disagg/compare_backends.py @@ -12,8 +12,10 @@ def extract_backend(test_name): """Extract backend type from test_name. 
- New format: ccb-NIXL or ccb-UCX + New format: ccb-NIXL or ccb-UCX or ccb-DEFAULT Example: disagg_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL + + Note: "DEFAULT" is a special marker that represents the default backend """ match = re.search(r"ccb-(\w+)", test_name) return match.group(1) if match else None @@ -41,6 +43,7 @@ def compare_backends(csv_path, threshold=5.0, default_backend="NIXL"): csv_path: CSV file path threshold: Performance difference threshold (percentage) default_backend: DEFAULT backend name (currently NIXL, may switch in the future) + Cases marked as "ccb-DEFAULT" will be treated as this backend Returns: DataFrame: Comparison results @@ -71,20 +74,37 @@ def compare_backends(csv_path, threshold=5.0, default_backend="NIXL"): df["backend"] = df["test_name"].apply(extract_backend) df["base_case_name"] = df["test_name"].apply(extract_base_case_name) + # Normalize "DEFAULT" backend to the actual default_backend value + # This allows cases marked as "ccb-DEFAULT" to be treated as the default backend + df["backend"] = df["backend"].apply( + lambda x: default_backend if x and x.upper() == "DEFAULT" else x + ) + # Group by base_case_name and metric_type grouped = df.groupby(["base_case_name", "metric_type"]) results = [] + comparison_pairs = 0 + single_backend_skipped = 0 for (base_case, metric_type), group in grouped: # Get DEFAULT backend and UCX data default_data = group[group["backend"] == default_backend] ucx_data = group[group["backend"] == "UCX"] - # If both have no data, skip (this case may not exist) + # Skip if both have no data (this case may not exist) if len(default_data) == 0 and len(ucx_data) == 0: continue + # Skip single-backend cases (only has one backend, not a comparison pair) + # This happens when a test case only runs on one backend + if len(default_data) == 0 or len(ucx_data) == 0: + single_backend_skipped += 1 + continue + + # This is a valid comparison pair + comparison_pairs += 1 + # Extract values and 
original test names default_value = default_data["perf_metric"].values[0] if len(default_data) > 0 else None default_original_name = ( @@ -137,6 +157,19 @@ def compare_backends(csv_path, threshold=5.0, default_backend="NIXL"): } ) + # Print statistics + print(f"\n=== Backend Comparison Statistics ===") + print(f"Default backend: {default_backend}") + print(f"Comparison pairs: {comparison_pairs}") + print(f"Single-backend cases (skipped): {single_backend_skipped}") + print("=" * 37) + + # If no comparison pairs found, exit with success + if comparison_pairs == 0: + print(f"\nInfo: No backend comparison pairs found in disagg_perf tests") + print(f"All cases are single-backend only, no comparison needed") + sys.exit(0) + # Convert to DataFrame result_df = pd.DataFrame(results) From e6f88b48a1fcde69a9e05c7355805614c37dbfae Mon Sep 17 00:00:00 2001 From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> Date: Wed, 31 Dec 2025 05:32:36 +0000 Subject: [PATCH 03/13] fx Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> --- ...1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml | 5 +- ...gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml | 110 ++++++++++++++++ ...en1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml | 116 +++++++++++++++++ ...1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml | 110 ++++++++++++++++ ..._dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml | 117 ++++++++++++++++++ ...gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml | 110 ++++++++++++++++ ...gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml | 116 +++++++++++++++++ .../defs/perf/disagg/testlist/all.txt | 6 + .../defs/perf/disagg/testlist/wideep.txt | 6 + 9 files changed, 694 insertions(+), 2 deletions(-) create mode 100644 tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml create mode 100644 tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml 
create mode 100644 tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml create mode 100644 tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml create mode 100644 tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml create mode 100644 tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml index cfcbf66c8df..d2f81b865ed 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml @@ -7,8 +7,9 @@ metadata: - GB200 - GB300 script_file: disaggr_torch.slurm - benchmark_type: 8k1k - dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json + benchmark_type: 1k1k + config_index: 7 + dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json slurm: script_file: disaggr_torch.slurm partition: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml new file mode 100644 index 00000000000..451a995e303 --- /dev/null +++ 
b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml @@ -0,0 +1,110 @@ +metadata: + model_name: deepseek-v32-fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k + config_index: 1 + dataset_file: disagg_datasets/deepseek-v32-1024-1024-200000-ratio-1_for_serve.json +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1075' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 32 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + 
load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml new file mode 100644 index 00000000000..f67ff56f88a --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml @@ -0,0 +1,116 @@ +metadata: + model_name: deepseek-v32-fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k + config_index: 0 + dataset_file: disagg_datasets/deepseek-v32-1024-1024-200000-ratio-1_for_serve.json +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '2150' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 2 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + 
work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: WIDEEP + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml 
b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml new file mode 100644 index 00000000000..76f4f78276c --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml @@ -0,0 +1,110 @@ +# nvbugs: 5422621 +metadata: + model_name: deepseek-v32-fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k + config_index: 7 + dataset_file: disagg_datasets/deepseek-v32-1024-1024-200000-ratio-1_for_serve.json +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '12288' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 2 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 48 + moe_expert_parallel_size: 48 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 1024 + max_num_tokens: 1024 + max_seq_len: 2176 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 
+ - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: WIDEEP + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 8320 + backend: DEFAULT + stream_interval: 20 + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 4 + max_num_tokens: 4480 + max_seq_len: 2176 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8320 + backend: DEFAULT diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml new file mode 100644 index 00000000000..4a91160a99b --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml @@ -0,0 +1,117 @@ +metadata: + model_name: deepseek-v32-fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k + config_index: 14 + dataset_file: disagg_datasets/deepseek-v32-8192-1024-200000-ratio-1_for_serve.json +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: disaggr-test + extra_args: "--gres=gpu:4" + numa_bind: true +hardware: + gpus_per_node: 4 + num_ctx_servers: 2 + num_gen_servers: 1 +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 1 + benchmark_ratio: 0.8 + streaming: 
true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 9423 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: WIDEEP + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: DEFAULT + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 1 + max_num_tokens: 8448 + max_seq_len: 9423 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: DEFAULT + speculative_config: + decoding_type: MTP + 
num_nextn_predict_layers: 3 diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml new file mode 100644 index 00000000000..4d3a716c675 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml @@ -0,0 +1,110 @@ +metadata: + model_name: deepseek-v32-fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k + config_index: 5 + dataset_file: disagg_datasets/deepseek-v32-8192-1024-200000-ratio-1_for_serve.json +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1075' + input_length: 8192 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 6 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 
1 + max_batch_size: 64 + max_num_tokens: 64 + max_seq_len: 9419 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: WIDEEP + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 1 + max_num_tokens: 8448 + max_seq_len: 9419 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml new file mode 100644 index 00000000000..441aebf189c --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml @@ -0,0 +1,116 @@ +metadata: + model_name: deepseek-v32-fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k + config_index: 4 + dataset_file: disagg_datasets/deepseek-v32-8192-1024-200000-ratio-1_for_serve.json +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: 
gen_only + use_nv_sa_benchmark: false + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '538' + input_length: 8192 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 8 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 16 + max_num_tokens: 64 + max_seq_len: 9419 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: WIDEEP + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 1 + max_num_tokens: 8448 + max_seq_len: 9419 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 
+ dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/integration/defs/perf/disagg/testlist/all.txt b/tests/integration/defs/perf/disagg/testlist/all.txt index dd2d14b5acf..da40a0f46d8 100644 --- a/tests/integration/defs/perf/disagg/testlist/all.txt +++ b/tests/integration/defs/perf/disagg/testlist/all.txt @@ -77,6 +77,12 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_ test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL] test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL] test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT] +test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT] +test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL] +test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL] +test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL] +test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL] +test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT] test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL] test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_ccb-NIXL] # 
test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL] diff --git a/tests/integration/defs/perf/disagg/testlist/wideep.txt b/tests/integration/defs/perf/disagg/testlist/wideep.txt index baee1d5a10f..4a599a964e6 100644 --- a/tests/integration/defs/perf/disagg/testlist/wideep.txt +++ b/tests/integration/defs/perf/disagg/testlist/wideep.txt @@ -8,6 +8,12 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_ test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL] test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL] test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT] +test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT] +test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL] +test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL] +test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL] +test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL] +test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT] test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL] test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_ccb-NIXL] # 
test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL] From 414b91af7f017c70b52d1d1b50c924e420f31f56 Mon Sep 17 00:00:00 2001 From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> Date: Wed, 24 Dec 2025 09:14:31 +0000 Subject: [PATCH 04/13] add stress test cases here Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> --- .../defs/perf/disagg/execution/executor.py | 58 ++- .../test_configs/disagg/stress/README.md | 480 ++++++++++++++++++ .../defs/perf/disagg/test_disagg.py | 90 +++- .../defs/perf/disagg/utils/config_loader.py | 43 +- 4 files changed, 664 insertions(+), 7 deletions(-) create mode 100644 tests/integration/defs/perf/disagg/test_configs/disagg/stress/README.md diff --git a/tests/integration/defs/perf/disagg/execution/executor.py b/tests/integration/defs/perf/disagg/execution/executor.py index 2f4c482f052..8e717c50ce8 100644 --- a/tests/integration/defs/perf/disagg/execution/executor.py +++ b/tests/integration/defs/perf/disagg/execution/executor.py @@ -756,11 +756,11 @@ def _check_job_result( Args: job_id: SLURM job ID - test_category: Test category ("perf" or "accuracy") + test_category: Test category ("perf", "accuracy", or "stress") benchmark_type: Benchmark type (1k1k, 8k1k, etc.) 
config: Configuration dict (YAML data) metrics_config: MetricsConfig object (default or custom) - accuracy_config: AccuracyConfig object (required for accuracy tests) + accuracy_config: AccuracyConfig object (required for accuracy and stress tests) model_name: Model name result_dir: Result directory timestamps: Optional timestamps dict @@ -768,6 +768,7 @@ def _check_job_result( Returns: Dict with success status and details + For stress tests, includes both perf and accuracy results """ logger.info(f"Checking result directory: {result_dir}") @@ -776,12 +777,63 @@ def _check_job_result( # Route based on test_category if test_category == "accuracy": + # Use metrics config from accuracy_config (defaults to _COMMON_ACCURACY_METRICS) + accuracy_metrics = accuracy_config.get_metrics_config() return JobManager._check_accuracy_result( job_id=job_id, - metrics_config=metrics_config, + metrics_config=accuracy_metrics, accuracy_config=accuracy_config, result_dir=result_dir, ) + elif test_category == "stress": + # Stress tests combine both perf and accuracy validation + # First check performance and write CSV + perf_result = JobManager._check_perf_result( + job_id=job_id, + benchmark_type=benchmark_type, + config=config, + metrics_config=metrics_config, + model_name=model_name, + result_dir=result_dir, + timestamps=timestamps, + test_name=test_name, + ) + + # If perf check failed, return immediately + if not perf_result.get("success", False): + return perf_result + + # Then check accuracy if accuracy_config is provided + if accuracy_config: + # Use metrics config from accuracy_config (defaults to _COMMON_ACCURACY_METRICS) + accuracy_metrics = accuracy_config.get_metrics_config() + + accuracy_result = JobManager._check_accuracy_result( + job_id=job_id, + metrics_config=accuracy_metrics, + accuracy_config=accuracy_config, + result_dir=result_dir, + ) + + # If accuracy check failed, merge results and return + if not accuracy_result.get("success", False): + return { + 
**perf_result, + "success": False, + "accuracy_result": accuracy_result, + "error": f"Perf passed but accuracy failed: {accuracy_result.get('error', 'Unknown')}", + } + + # Both passed, merge results + return { + **perf_result, + "accuracy_result": accuracy_result, + "success": True, + } + else: + # No accuracy config, just return perf result + logger.warning("Stress test has no accuracy_config, only perf validation performed") + return perf_result else: # perf return JobManager._check_perf_result( job_id=job_id, diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/stress/README.md b/tests/integration/defs/perf/disagg/test_configs/disagg/stress/README.md new file mode 100644 index 00000000000..528c8e33e8b --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/stress/README.md @@ -0,0 +1,480 @@ +# Disaggregated Stress Tests + +## Purpose + +Stress tests combine **performance benchmarking** and **accuracy validation** in a single test run. They are designed to: + +- Validate performance under high load/stress conditions +- Ensure accuracy is maintained while pushing system limits +- Write performance metrics to CSV (same as `perf` tests) +- Validate accuracy against expected thresholds (e.g., GSM8K, MMLU) + +Test name prefix: `disagg_stress_*` + +--- + +## Quick Start + +```bash +# 1. Copy the example template +cp EXAMPLE_deepseek-r1-fp4_1k1k_stress_gsm8k.yaml \ + your_model_1k1k_stress_gsm8k.yaml + +# 2. Edit the configuration (see Field Reference below) + +# 3. 
Run the test +cd /path/to/tests/integration/defs/perf/disagg/ +poetry run pytest --disagg test_disagg.py -s -vv -m stress +``` + +--- + +## Configuration Template + +### Minimal Template + +```yaml +metadata: + model_name: your-model-name + precision: fp8 + model_dir_name: YourModelDir + supported_gpus: [GB200, GB300] + script_file: disaggr_torch.slurm + benchmark_type: 1k1k + config_index: 0 + + # Accuracy configuration (required for stress tests) + accuracy: + datasets: + - name: gsm8k + expected_value: 0.85 + threshold_type: hypothesis_test + filter_type: flexible-extract + +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 04:00:00 + job_name: stress-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true + +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: 1 2 4 8 16 32 + input_length: 1024 + output_length: 1024 + dataset_file: + +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 4 + +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO ..." + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" + +profiling: + nsys_on: false + +# Enable accuracy evaluation (required for stress tests) +accuracy: + enable_accuracy_test: true + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,timeout=1200 + +worker_config: + gen: + tensor_parallel_size: 8 + max_batch_size: 32 + max_num_tokens: 128 + # ... other gen worker configs + ctx: + tensor_parallel_size: 4 + max_batch_size: 4 + max_num_tokens: 4608 + # ... other ctx worker configs +``` + +--- + +## Field Reference + +### 1. 
`metadata` Section + +#### Required Fields + +| Field | Type | Description | Example | +|-------|------|-------------|---------| +| `model_name` | string | Model identifier | `deepseek-r1-fp4` | +| `precision` | string | Model precision | `fp8`, `fp4`, `int8` | +| `model_dir_name` | string | Model directory name | `DeepSeek-R1-0528-FP4-v2` | +| `supported_gpus` | list | GPU types supported | `[GB200, GB300]` | +| `script_file` | string | SLURM script to use | `disaggr_torch.slurm` | +| `benchmark_type` | string | Benchmark configuration | `1k1k`, `8k1k`, etc. | +| `config_index` | int | Configuration index | `0`, `1`, etc. | + +#### Accuracy Configuration (Required for Stress Tests) + +```yaml +metadata: + accuracy: + datasets: + - name: gsm8k # Dataset name + expected_value: 0.85 # Expected accuracy (0.0-1.0) + threshold_type: hypothesis_test # "hypothesis_test" or "absolute" + filter_type: flexible-extract # "flexible-extract" or "strict-match" + + # Optional: Hypothesis testing parameters + alpha: 0.05 # Type I error rate (default: 0.05) + beta: 0.20 # Type II error rate (default: 0.20) + sigma: 0.05 # Standard deviation (default: 0.05) + num_samples: 100 # Number of samples (default: 100) + higher_is_better: true # Direction (default: true) + + # Optional: Custom accuracy metrics parsing + # metrics: + # log_file: "7_accuracy_eval.log" + # extractor_pattern: '\|...\|' + # metric_names: [flexible-extract, strict-match] +``` + +**Threshold Types:** +- `hypothesis_test`: Statistical hypothesis testing (recommended) +- `absolute`: Simple threshold comparison + +**Filter Types:** +- `flexible-extract`: More lenient matching +- `strict-match`: Exact matching required + +--- + +### 2. 
`slurm` Section + +| Field | Type | Description | Recommended | +|-------|------|-------------|-------------| +| `partition` | string | SLURM partition | Your cluster partition | +| `account` | string | SLURM account | Your cluster account | +| `job_time` | string | Maximum job time | `04:00:00` (4 hours) | +| `job_name` | string | Job name | `stress-benchmark` | +| `extra_args` | string | Extra SLURM args | `"--gres=gpu:4"` | +| `numa_bind` | bool | Enable NUMA binding | `true` | + +--- + +### 3. `benchmark` Section + +| Field | Type | Description | Example | +|-------|------|-------------|---------| +| `mode` | string | Benchmark mode | `e2e` | +| `use_nv_sa_benchmark` | bool | Use NV benchmark | `true` | +| `multi_round` | int | Rounds per concurrency | `8` | +| `benchmark_ratio` | float | Benchmark ratio | `0.8` | +| `streaming` | bool | Enable streaming | `true` | +| `concurrency_list` | string | Concurrency levels | `1 2 4 8 16 32` | +| `input_length` | int | Input token length | `1024` | +| `output_length` | int | Output token length | `1024` | +| `dataset_file` | string | Dataset file path | `` | + +**Tip:** Increase `concurrency_list` for more stress (e.g., `1 2 4 8 16 32 64 128`) + +--- + +### 4. `hardware` Section + +| Field | Type | Description | Example | +|-------|------|-------------|---------| +| `gpus_per_node` | int | GPUs per node | `4` | +| `num_ctx_servers` | int | Context servers | `1` | +| `num_gen_servers` | int | Generation servers | `4` | + +--- + +### 5. `accuracy` Section (SLURM Script Config) + +**Note:** This is different from `metadata.accuracy`. This section is used by the SLURM script to run `lm-evaluation-harness`. 
+ +| Field | Type | Description | Example | +|-------|------|-------------|---------| +| `enable_accuracy_test` | bool | Enable accuracy eval | `true` (required) | +| `model` | string | Model type | `local-completions` | +| `tasks` | string | Eval tasks | `gsm8k`, `mmlu`, `humaneval` | +| `model_args_extra` | string | Extra arguments | See below | + +**Common `model_args_extra` parameters:** +``` +num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +``` + +--- + +### 6. `worker_config` Section + +Configure generation and context workers. See [main README](../../README.md) for detailed worker configuration options. + +**Key parameters:** +- `tensor_parallel_size`: TP parallelism +- `max_batch_size`: Maximum batch size +- `max_num_tokens`: Maximum tokens per batch +- `max_seq_len`: Maximum sequence length + +--- + +## Test Execution Flow + +``` +1. Configuration Validation + ↓ +2. SLURM Job Submission + ↓ +3. Performance Benchmark + - Runs benchmark with specified concurrency levels + - Generates: 6_bench.log + ↓ +4. Accuracy Evaluation + - Runs lm-evaluation-harness + - Generates: 7_accuracy_eval.log + ↓ +5. Result Validation + - Parse performance metrics → Write to CSV + - Parse accuracy results → Validate against thresholds + ↓ +6. Pass/Fail Decision + - PASS: Both performance and accuracy checks pass + - FAIL: Either performance or accuracy fails +``` + +--- + +## Output Files + +### Log Directory +``` +{OUTPUT_PATH}/slurm_logs/disagg_stress_{test_id}/ +├── config.yaml # Test configuration copy +├── 6_bench.log # Performance benchmark log +├── 7_accuracy_eval.log # Accuracy evaluation log +├── output_gen_*.log # Generation worker logs +├── output_ctx_*.log # Context worker logs +└── slurm-{job_id}.out # SLURM output +``` + +### CSV Output +``` +{OUTPUT_PATH}/perf_script_test_results.csv +``` + +Performance metrics are written to the same CSV as `perf` tests, with `test_name` prefix `disagg_stress_*`. 
+ +### Failed Test Directories +The log directory of a failed test is automatically renamed with an `_ERROR` suffix: +``` +disagg_stress_{test_id}_ERROR/ +``` + +--- + +## Supported Accuracy Datasets + +| Dataset | Task | Description | +|---------|------|-------------| +| `gsm8k` | Math reasoning | Grade school math problems | +| `mmlu` | Knowledge | Multi-domain multiple choice | +| `humaneval` | Coding | Python code generation | +| `hellaswag` | Reasoning | Commonsense reasoning | + +See [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) for the full list. + +--- + +## Common Pitfalls + +### 1. Missing Accuracy Config + +**Symptom:** The log shows the warning `Stress test has no accuracy_config, only perf validation performed`, and accuracy is not validated + +**Solution:** Ensure both accuracy sections are present: +```yaml +metadata: + accuracy: + datasets: [...] # For validation framework + +accuracy: + enable_accuracy_test: true # For SLURM script +``` + +### 2. Timeout Issues + +**Error:** Job times out before completion + +**Solution:** Increase `job_time`: +```yaml +slurm: + job_time: 06:00:00 # 6 hours for larger models +``` + +### 3. 
Accuracy Threshold Too High + +**Error:** Accuracy validation fails even though the measured accuracy is reasonable + +**Solution:** Adjust `expected_value` or use `hypothesis_test`: +```yaml +metadata: + accuracy: + datasets: + - expected_value: 0.80 # Lower threshold + threshold_type: hypothesis_test # Statistical test instead of a fixed cutoff +``` + +--- + +## Advanced Usage + +### Custom Accuracy Metrics Parsing + +Override default accuracy log parsing: + +```yaml +metadata: + accuracy: + datasets: + - name: gsm8k + expected_value: 0.85 + + metrics: + log_file: "custom_accuracy.log" + extractor_pattern: '\|custom_pattern\|' + metric_names: [custom_metric_1, custom_metric_2] +``` + +### Multiple Datasets + +Test multiple accuracy benchmarks: + +```yaml +metadata: + accuracy: + datasets: + - name: gsm8k + expected_value: 0.85 + threshold_type: hypothesis_test + - name: mmlu + expected_value: 0.75 + threshold_type: absolute +``` + +--- + +## Running Tests + +### Run All Stress Tests +```bash +poetry run pytest --disagg test_disagg.py -s -vv -m stress +``` + +### Run Specific Test +```bash +poetry run pytest --disagg test_disagg.py -s -vv -k "your_model_1k1k_stress" +``` + +### Run from Test List +```bash +echo "test_disagg.py::TestDisaggBenchmark::test_stress[disagg_stress_your_model_1k1k_stress_gsm8k]" > testlist/stress.txt +poetry run pytest --disagg test_disagg.py -s -vv --disagg-test-list=./testlist/stress.txt +``` + +--- + +## Naming Convention + +Format: `{model}_{benchmark_type}_{config_details}_stress_{dataset}.yaml` + +Examples: +- `deepseek-r1-fp4_1k1k_ctx1_gen4_stress_gsm8k.yaml` +- `llama3-8b_8k1k_ctx2_gen2_stress_mmlu.yaml` +- `qwen3-235b_1k1k_ctx1_gen1_stress_humaneval.yaml` + +--- + +## Comparison with Other Test Types + +| Feature | perf | accuracy | stress | +|---------|------|----------|--------| +| Performance Metrics | ✅ | ❌ | ✅ | +| CSV Output | ✅ | ❌ | ✅ | +| Accuracy Validation | ❌ | ✅ | ✅ | +| Default Timeout | 2h | 3h | 4h | +| Use Case | Performance only | Accuracy only | Both | + +--- + +## Troubleshooting + +### Check Test 
Status +```bash +# View SLURM jobs +squeue -u $USER + +# Check logs +tail -f {OUTPUT_PATH}/slurm_logs/disagg_stress_{test_id}/slurm-*.out +``` + +### Debug Mode +```bash +export DEBUG_MODE=1 +export DEBUG_JOB_ID=12345 + +poetry run pytest --disagg test_disagg.py -s -vv -k "your_test" +``` + +### View Results +```bash +# Performance CSV +cat {OUTPUT_PATH}/perf_script_test_results.csv + +# Accuracy log +cat {OUTPUT_PATH}/slurm_logs/disagg_stress_{test_id}/7_accuracy_eval.log +``` + +--- + +## Best Practices + +1. **Start Conservative:** Begin with lower concurrency and shorter job times +2. **Monitor Resources:** Check GPU memory and CPU usage during stress tests +3. **Baseline First:** Run `perf` and `accuracy` tests separately before `stress` +4. **Document Results:** Keep records of thresholds and performance baselines +5. **Iterate:** Gradually increase stress (concurrency, sequence length) until failure + +--- + +## Related Documentation + +- [Main README](../../README.md) - General test framework documentation +- [Example Config](EXAMPLE_deepseek-r1-fp4_1k1k_stress_gsm8k.yaml) - Full example configuration +- [Config Loader](../../utils/config_loader.py) - Configuration loading logic +- [Executor](../../execution/executor.py) - Test execution logic + +--- + +## Support + +For issues or questions: +1. Check logs in `{OUTPUT_PATH}/slurm_logs/disagg_stress_{test_id}/` +2. Review configuration against this README +3. Compare with `EXAMPLE_deepseek-r1-fp4_1k1k_stress_gsm8k.yaml` +4. 
Contact your team's test infrastructure maintainer + diff --git a/tests/integration/defs/perf/disagg/test_disagg.py b/tests/integration/defs/perf/disagg/test_disagg.py index ff71446bb1e..9c4f87679c0 100644 --- a/tests/integration/defs/perf/disagg/test_disagg.py +++ b/tests/integration/defs/perf/disagg/test_disagg.py @@ -14,13 +14,15 @@ config_loader = ConfigLoader(base_dir=CONFIG_BASE_DIR) ALL_TEST_CONFIGS = config_loader.scan_configs() -# Separate performance and accuracy test configurations +# Separate performance, accuracy, and stress test configurations PERF_TEST_CONFIGS = [c for c in ALL_TEST_CONFIGS if c.test_category == "perf"] ACCURACY_TEST_CONFIGS = [c for c in ALL_TEST_CONFIGS if c.test_category == "accuracy"] +STRESS_TEST_CONFIGS = [c for c in ALL_TEST_CONFIGS if c.test_category == "stress"] # Convert to pytest parameters PERF_TEST_CASES = [pytest.param(config, id=config.test_id) for config in PERF_TEST_CONFIGS] ACCURACY_TEST_CASES = [pytest.param(config, id=config.test_id) for config in ACCURACY_TEST_CONFIGS] +STRESS_TEST_CASES = [pytest.param(config, id=config.test_id) for config in STRESS_TEST_CONFIGS] # Flag to track if session end has been called _session_ended = False @@ -212,6 +214,92 @@ def test_accuracy(self, request, test_config: TestConfig): except Exception as backup_error: logger.error(f"Failed to backup logs: {backup_error}") + @pytest.mark.stress + @pytest.mark.parametrize("test_config", STRESS_TEST_CASES) + def test_stress(self, request, test_config: TestConfig): + """Stress test combining performance benchmarks and accuracy validation. + + This test type is designed for stress testing scenarios where both + performance metrics (CSV output) and accuracy (e.g., GSM8K) need to be validated. 
+ """ + full_test_name = request.node.name + + # Validate configuration first (before any other operations) + try: + ConfigValidator.validate_test_config(test_config) + except Exception as e: + pytest.fail(f"Configuration validation failed: {e}") + + # Create test case tracker + test_tracker = TestCaseTracker() + test_case_name = test_config.test_id + + # Start tracking test case + test_tracker.start_test_case(test_case_name) + + job_id = None + result = None + + try: + logger.info(f"\n{'=' * 60}") + logger.info(f"Stress Test (Perf + Accuracy): {test_config.display_name}") + logger.info(f"Test ID: {test_config.test_id}") + logger.info(f"Config file: {test_config.config_path}") + logger.info(f"Test type: {test_config.test_type}") + logger.info(f"Category: {test_config.test_category}") + logger.info(f"Model: {test_config.model_name}") + logger.info(f"Benchmark: {test_config.benchmark_type}") + + # Log accuracy datasets if configured + if test_config.accuracy_config: + dataset_names = test_config.accuracy_config.get_all_dataset_names() + logger.info(f"Accuracy Datasets: {', '.join(dataset_names)}") + + logger.info(f"Metrics log: {test_config.metrics_config.log_file}") + logger.info(f"Supported GPUs: {', '.join(test_config.supported_gpus)}") + logger.info(f"{'=' * 60}") + + if EnvManager.get_debug_mode(): + logger.debug( + f"Debug mode: Skipping job submission, using job_id: {EnvManager.get_debug_job_id()}" + ) + job_id = EnvManager.get_debug_job_id() + else: + # Submit job using JobManager + success, job_id = JobManager.submit_job(test_config) + + # Validate submission result + assert success, f"Job submission failed: {test_config.test_id}" + assert job_id, "Unable to get job ID" + + # Wait for completion (longer timeout for stress tests: 4 hours) + JobManager.wait_for_completion(job_id, 14400, test_config, check_early_failure=True) + + # End tracking test case + test_tracker.end_test_case() + + # Get timestamps information + timestamps = test_tracker.get_timestamps() 
+ + # Check results - this will handle both perf CSV writing AND accuracy validation + result = JobManager.check_result(job_id, test_config, timestamps, full_test_name) + assert result["success"], ( + f"Stress test failed: {result.get('error', 'Unknown error')}" + ) + + except Exception as e: + test_tracker.end_test_case() + raise e + finally: + # Always backup logs, regardless of success or failure + if job_id: + result_dir = JobManager.get_result_dir(test_config) + is_passed = result.get("success", False) if result else False + try: + JobManager.backup_logs(job_id, test_config, result_dir, is_passed) + except Exception as backup_error: + logger.error(f"Failed to backup logs: {backup_error}") + if __name__ == "__main__": """Run benchmark tests""" diff --git a/tests/integration/defs/perf/disagg/utils/config_loader.py b/tests/integration/defs/perf/disagg/utils/config_loader.py index a74ea94089b..07531a816ed 100644 --- a/tests/integration/defs/perf/disagg/utils/config_loader.py +++ b/tests/integration/defs/perf/disagg/utils/config_loader.py @@ -45,6 +45,7 @@ class AccuracyConfig: """Accuracy test configuration (supports multiple datasets).""" datasets: List[DatasetThreshold] # List of dataset threshold configurations + metrics: Optional[MetricsConfig] = None # Optional custom metrics config (defaults to _COMMON_ACCURACY_METRICS) def get_dataset_config(self, dataset_name: str) -> Optional[DatasetThreshold]: """Get configuration by dataset name. @@ -67,6 +68,16 @@ def get_all_dataset_names(self) -> List[str]: List of dataset names """ return [ds.dataset_name for ds in self.datasets] + + def get_metrics_config(self) -> MetricsConfig: + """Get metrics configuration for accuracy parsing. 
+ + Returns: + Custom metrics config if provided, otherwise _COMMON_ACCURACY_METRICS + """ + if self.metrics is not None: + return self.metrics + return _COMMON_ACCURACY_METRICS # ============================================================================ @@ -139,6 +150,19 @@ def get_all_dataset_names(self) -> List[str]: "SERVER_P99_E2EL", ], ), + # Stress test configuration (combines perf metrics + accuracy validation) + # Uses the same perf metrics pattern as disagg perf tests + ("disagg", "stress"): MetricsConfig( + log_file="6_bench.log", + extractor_pattern=r""" + ^.*?Median\ TTFT\ \(ms\):\s+([0-9.]+).*?$\n + (?:.*\n)*? + ^.*?Median\ E2EL\ \(ms\):\s+([0-9.]+).*?$\n + (?:.*\n)*? + ^.*?Benchmark\ with\ concurrency\ (\d+)\ done + """, + metric_names=["SERVER_MEDIAN_TTFT", "SERVER_MEDIAN_E2EL"], + ), # Accuracy test configuration ("disagg", "accuracy"): _COMMON_ACCURACY_METRICS, ("wideep", "accuracy"): _COMMON_ACCURACY_METRICS, @@ -336,7 +360,8 @@ def _load_config_file(self, yaml_path: Path, test_type: str, test_category: str) # Load accuracy configuration (only for accuracy tests) accuracy_config = None - if test_category == "accuracy": + # Load accuracy config for both "accuracy" and "stress" test categories + if test_category in ["accuracy", "stress"]: acc_meta = metadata.get("accuracy", {}) if acc_meta and "datasets" in acc_meta: datasets = [] @@ -373,8 +398,20 @@ def _load_config_file(self, yaml_path: Path, test_type: str, test_category: str) higher_is_better=higher_is_better, ) ) - accuracy_config = AccuracyConfig(datasets=datasets) - logger.info(f"Loaded accuracy config with {len(datasets)} dataset(s)") + + # Check if custom accuracy metrics are provided + custom_metrics = None + if "metrics" in acc_meta: + metrics_override = acc_meta["metrics"] + custom_metrics = MetricsConfig( + log_file=metrics_override.get("log_file", "7_accuracy_eval.log"), + extractor_pattern=metrics_override.get("extractor_pattern", 
r"\|([a-zA-Z0-9_-]+)\|.*?\|([\w-]+)\|.*?\|exact_match\|.*?\|([0-9.]+)\|"), + metric_names=metrics_override.get("metric_names", ["flexible-extract", "strict-match"]), + ) + logger.info(f"Using custom accuracy metrics config from YAML") + + accuracy_config = AccuracyConfig(datasets=datasets, metrics=custom_metrics) + logger.info(f"Loaded accuracy config with {len(datasets)} dataset(s) for {test_category} test") return TestConfig( config_path=str(yaml_path), From 56ad3d482d8bbf97dd2be997340f07ba4fe4a66d Mon Sep 17 00:00:00 2001 From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> Date: Wed, 31 Dec 2025 05:36:03 +0000 Subject: [PATCH 05/13] fix conflict Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> --- ...1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml | 115 ++++++++++++++++++ ...en1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml | 2 +- ...n1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml | 2 +- .../defs/perf/disagg/test_disagg.py | 2 +- .../defs/perf/disagg/testlist/all.txt | 2 + .../defs/perf/disagg/testlist/disagg.txt | 1 + 6 files changed, 121 insertions(+), 3 deletions(-) create mode 100644 tests/integration/defs/perf/disagg/test_configs/disagg/stress/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/stress/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/stress/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml new file mode 100644 index 00000000000..9d5ca29c122 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/stress/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml @@ -0,0 +1,115 @@ +# nvbugs: 5422621 +metadata: + model_name: deepseek-r1-fp4 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 
1k1k + dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json + accuracy: + datasets: + - dataset_name: gsm8k + expected_value: 0.9454 + threshold_type: hypothesis_test + filter_type: flexible-extract +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 03:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 20 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '12288' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 2 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: true + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 48 + moe_expert_parallel_size: 48 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 1024 + max_num_tokens: 1024 + max_seq_len: 2176 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: WIDEEP + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 8320 + backend: DEFAULT + stream_interval: 20 + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 
4 + max_num_tokens: 4480 + max_seq_len: 2176 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8320 + backend: DEFAULT diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml index 30eb5ef5bda..259010f2550 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml @@ -18,7 +18,7 @@ slurm: script_file: disaggr_torch.slurm partition: account: - job_time: 02:00:00 + job_time: 03:00:00 job_name: unified-benchmark extra_args: "--gres=gpu:4" numa_bind: true diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml index b17b96df7d7..73b11ea4157 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml @@ -18,7 +18,7 @@ slurm: script_file: disaggr_torch.slurm partition: account: - job_time: 00:45:00 + job_time: 03:00:00 job_name: unified-benchmark extra_args: 
"--gres=gpu:4" numa_bind: true diff --git a/tests/integration/defs/perf/disagg/test_disagg.py b/tests/integration/defs/perf/disagg/test_disagg.py index 9c4f87679c0..7fe141c5d10 100644 --- a/tests/integration/defs/perf/disagg/test_disagg.py +++ b/tests/integration/defs/perf/disagg/test_disagg.py @@ -273,7 +273,7 @@ def test_stress(self, request, test_config: TestConfig): assert job_id, "Unable to get job ID" # Wait for completion (longer timeout for stress tests: 4 hours) - JobManager.wait_for_completion(job_id, 14400, test_config, check_early_failure=True) + JobManager.wait_for_completion(job_id, 10800, test_config, check_early_failure=True) # End tracking test case test_tracker.end_test_case() diff --git a/tests/integration/defs/perf/disagg/testlist/all.txt b/tests/integration/defs/perf/disagg/testlist/all.txt index da40a0f46d8..d5a5e1d3419 100644 --- a/tests/integration/defs/perf/disagg/testlist/all.txt +++ b/tests/integration/defs/perf/disagg/testlist/all.txt @@ -64,6 +64,8 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_ test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2-Default] test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3-Default] test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0-Default] +# disagg stress test cases +test_disagg.py::TestDisaggBenchmark::test_stress[disagg_stress_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT] # WIDEEP cases diff --git a/tests/integration/defs/perf/disagg/testlist/disagg.txt b/tests/integration/defs/perf/disagg/testlist/disagg.txt index 73dda2187da..d928d36b638 100644 --- a/tests/integration/defs/perf/disagg/testlist/disagg.txt +++ b/tests/integration/defs/perf/disagg/testlist/disagg.txt @@ -63,3 +63,4 @@ 
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_ test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2-Default] test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3-Default] test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0-Default] +test_disagg.py::TestDisaggBenchmark::test_stress[disagg_stress_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT] From f26a0f57c6bf5d72f8eadafa057e7b2f46e59a9b Mon Sep 17 00:00:00 2001 From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> Date: Fri, 26 Dec 2025 10:20:33 +0000 Subject: [PATCH 06/13] fix srun logic to sbatch Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> --- .../defs/perf/disagg/execution/executor.py | 356 ++++++++++-------- .../defs/perf/disagg/simple_collect.py | 60 --- .../defs/perf/disagg/test_disagg.py | 6 +- .../defs/perf/disagg/utils/common.py | 2 - .../defs/perf/disagg/utils/trackers.py | 46 ++- 5 files changed, 241 insertions(+), 229 deletions(-) diff --git a/tests/integration/defs/perf/disagg/execution/executor.py b/tests/integration/defs/perf/disagg/execution/executor.py index 8e717c50ce8..659ccfb6881 100644 --- a/tests/integration/defs/perf/disagg/execution/executor.py +++ b/tests/integration/defs/perf/disagg/execution/executor.py @@ -4,13 +4,12 @@ import re import shutil import time -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Optional import yaml from reporting.report import LogParser, LogWriter, ResultSaver from utils.common import ( GPU_RESOURCE_CONFIG, - SESSION_COLLECT_CMD_TYPE, EnvManager, extract_config_fields, ) @@ -18,177 +17,237 @@ from execution.subprocess_utils import exec_cmd, exec_cmd_with_output + # 
============================================================================ -# SLURM Run Command Builder +# Job Manager # ============================================================================ -class SlurmRunCommandBuilder: - """SLURM Run Command Builder. +class JobManager: + """Job manager class for test jobs and session collection.""" + + # ============================================================================ + # Generic Job Submission (Direct sbatch) + # ============================================================================ + + @staticmethod + def submit_shell_job( + job_name: str, + shell_script: str, + output_log_file: str, + timeout: int = 7200 + ) -> tuple[bool, str]: + """Submit a generic shell script job using sbatch --wrap. - Build srun commands for different GPU types and command types. - Reuses GPU_RESOURCE_CONFIG for consistency with SlurmJobBuilder. - """ + This is a low-level method for submitting arbitrary shell scripts + directly to SLURM via sbatch --wrap (non-blocking). - def build_srun_prefix(self, job_name: str) -> List[str]: - """Build srun command prefix based on GPU type.""" - gpu_type = EnvManager.get_gpu_type() + Args: + job_name: SLURM job name + shell_script: Shell script content to execute + output_log_file: Full path to output log file + timeout: Job timeout in seconds (default: 7200 = 2 hours) - # Reuse the same GPU_RESOURCE_CONFIG as SlurmJobBuilder - gpu_config = GPU_RESOURCE_CONFIG.get(gpu_type) - if not gpu_config: - raise ValueError( - f"GPU resource configuration not found for {gpu_type}. " - f"Please add configuration in GPU_RESOURCE_CONFIG." 
+ Returns: + tuple: (success: bool, job_id: str) + """ + try: + # Get environment configuration + container_image = EnvManager.get_container_image() + container_mount = EnvManager.get_container_mount() + output_path = EnvManager.get_output_path() + + # Ensure output directory exists + os.makedirs(output_path, exist_ok=True) + + # Build complete srun command (runs inside sbatch) + srun_command = ( + f"srun -l " + f"--container-name={job_name} " + f"--container-image={container_image} " + f"--container-mounts={container_mount} " + f"bash -c '{shell_script}'" ) - # Common srun arguments - srun_args = [ - "srun", - "-l", - "--container-name=sysinfo-get", - f"--container-image={EnvManager.get_container_image()}", - f"--container-mounts={EnvManager.get_container_mount()}", - ] + # Build sbatch command with all parameters + gpu_type = EnvManager.get_gpu_type() + gpu_config = GPU_RESOURCE_CONFIG.get(gpu_type) + if not gpu_config: + raise ValueError(f"GPU resource configuration not found for {gpu_type}") - # Add GPU-specific gres parameter (reuse gres_gpu field) - # If gres_gpu is not None, add --gres parameter - if gpu_config["gres_gpu"] is not None: - srun_args.append(f"--gres=gpu:{gpu_config['gres_gpu']}") + # Convert timeout to HH:MM:SS format + hours = timeout // 3600 + minutes = (timeout % 3600) // 60 + seconds = timeout % 60 + time_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}" - # Add common parameters - srun_args.extend( - [ + sbatch_args = [ + "sbatch", + f"--job-name={job_name}", f"--partition={EnvManager.get_slurm_partition()}", f"--account={EnvManager.get_slurm_account()}", - f"--job-name={job_name}", - "--time=02:00:00", - "--mpi=pmix", - # Note: Removed --overlap to ensure GPU allocation for session_collect - # which runs after all test jobs have completed - "-N", - "1", - "-n", - "1", + f"--time={time_str}", + "--nodes=1", + "--ntasks=1", + f"--output={output_log_file}", + "--parsable", # Easier job ID parsing ] - ) - return srun_args + # Conditionally add 
gres parameter based on GPU configuration + if gpu_config["gres_gpu"] is not None: + sbatch_args.append(f"--gres=gpu:{gpu_config['gres_gpu']}") - def build_script_command(self, cmd_type: str) -> List[str]: - """Build script command based on command type.""" - work_dir = EnvManager.get_work_dir() - output_path = EnvManager.get_output_path() - install_mode = EnvManager.get_install_mode() - repo_dir = EnvManager.get_repo_dir() - trtllm_wheel_path = EnvManager.get_trtllm_wheel_path() - - if cmd_type == SESSION_COLLECT_CMD_TYPE: - if install_mode == "none": - return [ - "bash", - "-c", - f"cd {work_dir} && python3 {work_dir}/simple_collect.py {output_path}", - ] - elif install_mode == "wheel": - # Install TensorRT-LLM wheel first, then run simple_collect.py - # Note: Use --no-deps to avoid overwriting container's pre-installed packages (like torch) - install_cmd = f""" - cd {repo_dir} - echo 'Step 1: Installing TensorRT-LLM wheel...' - pip3 install {trtllm_wheel_path} || echo 'Wheel install failed, continuing...' - echo 'Wheel installation completed' - - echo 'Step 2: Running simple_collect.py...' - cd {work_dir} - python3 {work_dir}/simple_collect.py {output_path} - """ - return ["bash", "-c", install_cmd] - elif install_mode == "source": - install_cmd = f""" - cd {repo_dir} - pip3 install -e . || echo 'Source install failed, continuing...' - - echo 'Source installation completed' - - echo 'Step 3: Running simple_collect.py...' 
- cd {work_dir} - python3 {work_dir}/simple_collect.py {output_path} - """ - return ["bash", "-c", install_cmd] - else: - raise ValueError(f"Invalid install mode: {install_mode}") - else: - # Future command types can be added here - # elif cmd_type == "benchmark_collect": - # model_dir = EnvManager.get_model_dir() - # return [ - # "bash", "-c", - # f"cd {work_dir} && python3 {work_dir}/benchmark_collect.py " - # f"--model-dir {model_dir} --output {output_path}" - # ] - # elif cmd_type == "metrics_collect": - # return [ - # "bash", "-c", - # f"cd {work_dir} && python3 {work_dir}/metrics_collect.py --config {work_dir}/config.yaml" - # ] - raise ValueError( - f"Unsupported command type: {cmd_type}. " - f"Currently supported: {SESSION_COLLECT_CMD_TYPE}" - ) + # Add extra SLURM arguments if configured + slurm_extra_args = EnvManager.get_slurm_extra_args() + if slurm_extra_args: + sbatch_args.append(slurm_extra_args) - def run_job(self, cmd_type: str, job_name: str, log_file: str = None) -> Dict[str, Any]: - """Execute srun job. 
+ # Add --wrap with the srun command + sbatch_args.extend(["--wrap", srun_command]) - Args: - cmd_type: Type of command to execute - job_name: Name for the SLURM job - log_file: Optional path to save command output + # Submit the job + logger.info(f"Submitting job '{job_name}' (using sbatch --wrap)...") + logger.debug(f"Log file: {output_log_file}") - Returns: - Dict with status and message - """ - try: - # Build complete command - srun_prefix = self.build_srun_prefix(job_name) - script_command = self.build_script_command(cmd_type) - full_command = srun_prefix + script_command - - # Execute with optional log file - if log_file: - logger.info(f"Saving output to: {log_file}") - # Use Python file redirection to avoid shell quoting issues - import subprocess - - with open(log_file, "w") as f: - result = subprocess.run( - full_command, stdout=f, stderr=subprocess.STDOUT, timeout=7200, text=True - ) - if result.returncode != 0: - raise subprocess.CalledProcessError(result.returncode, full_command) - logger.success(f"Output saved to {log_file}") - output = "" # Output is in file - else: - output = exec_cmd_with_output(full_command, timeout=7200) + output = exec_cmd_with_output(sbatch_args, timeout=60) + job_id = output.strip() + + # Parse job ID (--parsable returns just the job ID) + if job_id.isdigit(): + logger.success(f"Job '{job_name}' submitted: {job_id}") + logger.info(f"All logs will be written to: {output_log_file}") + return True, job_id + + # Fallback: try to extract from "Submitted batch job" format + match = re.search(r"Submitted batch job (\d+)", output) + if match: + job_id = match.group(1) + logger.success(f"Job '{job_name}' submitted: {job_id}") + return True, job_id + + logger.error(f"Failed to parse job ID from output: {output}") + return False, "" - return {"status": True, "msg": "Job executed successfully", "output": output} except Exception as e: - logger.error(f"Job execution failed: {e}") - return {"status": False, "msg": str(e)} + 
logger.error(f"Failed to submit job '{job_name}': {e}") + import traceback + logger.debug(traceback.format_exc()) + return False, str(e) + # ============================================================================ + # Session Collection Job Submission + # ============================================================================ -def make_slurm_run_command(): - """Create run command function (maintain interface compatibility).""" - builder = SlurmRunCommandBuilder() - return builder.run_job + @staticmethod + def submit_session_collect_job() -> tuple[bool, str]: + """Submit session collect job using sbatch (non-blocking). + This method builds the shell script for session collection and + delegates to submit_shell_job() for actual submission. -class JobManager: - """Job manager class.""" + Key benefits: + - Non-blocking execution (pytest doesn't wait) + - Better resource scheduling (queues if resources unavailable) + - Fault tolerance (job survives parent process exit) + - Unified job management (reuses wait_for_completion) + - All logs redirected to session_collect.log + + Returns: + tuple: (success: bool, job_id: str) + """ + try: + # Get environment configuration for building the script + work_dir = EnvManager.get_work_dir() + repo_dir = EnvManager.get_repo_dir() + install_mode = EnvManager.get_install_mode() + trtllm_wheel_path = EnvManager.get_trtllm_wheel_path() + output_path = EnvManager.get_output_path() + + # Build the inner script specific to session collection + inner_script = f""" +INSTALL_MODE="{install_mode}" +REPO_DIR="{repo_dir}" +WORK_DIR="{work_dir}" +OUTPUT_PATH="{output_path}" +WHEEL_PATH="{trtllm_wheel_path}" + +echo "==========================================" +echo "Session Collect Job Started" +echo "Time: $(date)" +echo "Install Mode: $INSTALL_MODE" +echo "==========================================" + +# Handle different installation modes +if [ "$INSTALL_MODE" = "none" ]; then + echo "Using built-in TensorRT-LLM, skipping installation" 
+ +elif [ "$INSTALL_MODE" = "wheel" ]; then + echo "Installing TensorRT-LLM wheel..." + echo "Wheel path: $WHEEL_PATH" + pip3 install "$WHEEL_PATH" 2>&1 || echo "Wheel install failed, continuing..." + echo "Wheel installation completed" + +elif [ "$INSTALL_MODE" = "source" ]; then + echo "Installing TensorRT-LLM from source..." + cd "$REPO_DIR" + pip3 install -e . 2>&1 || echo "Source install failed, continuing..." + echo "Source installation completed" + +else + echo "ERROR: Invalid install mode: $INSTALL_MODE" + exit 1 +fi + +echo "" +echo "Collecting TensorRT-LLM version information..." +# Get TensorRT-LLM version and write to file +VERSION_FILE="$OUTPUT_PATH/trtllm_version.txt" +python3 -c "import tensorrt_llm; print(f'[TensorRT-LLM] TensorRT-LLM version: {{tensorrt_llm.__version__}}')" > "$VERSION_FILE" 2>&1 || {{ + echo "[TensorRT-LLM] TensorRT-LLM version: unknown" > "$VERSION_FILE" + echo "Failed to get TensorRT-LLM version, wrote 'unknown' to $VERSION_FILE" +}} +echo "TensorRT-LLM version written to: $VERSION_FILE" + +echo "" +echo "Running simple_collect.py..." 
+cd "$WORK_DIR" +python3 "$WORK_DIR/simple_collect.py" "$OUTPUT_PATH" 2>&1 + +echo "" +echo "==========================================" +echo "Session Collect Job Completed" +echo "Time: $(date)" +echo "==========================================" + +# Explicitly exit to ensure job terminates immediately +exit 0 +""" + + # Submit using the shell job method + log_file = f"{output_path}/session_collect.log" + return JobManager.submit_shell_job( + job_name="session_collect", + shell_script=inner_script, + output_log_file=log_file, + timeout=7200 # 2 hours + ) + + except Exception as e: + logger.error(f"Failed to prepare session collect job: {e}") + import traceback + logger.debug(traceback.format_exc()) + return False, str(e) + + # ============================================================================ + # Test Job Submission (Via submit.py script) + # ============================================================================ @staticmethod - def submit_job(test_config) -> tuple: - """Submit job using submit.py with YAML config. + def submit_test_job(test_config) -> tuple: + """Submit benchmark test job using submit.py script. + + This method submits test jobs by calling the submit.py script, + which handles test-specific configuration and SLURM job setup. 
Args: test_config: TestConfig object containing configuration @@ -196,7 +255,7 @@ def submit_job(test_config) -> tuple: Returns: tuple: (success: bool, job_id: str) """ - logger.info("Submitting job using submit.py...") + logger.info("Submitting test job via submit.py...") try: import re @@ -846,6 +905,3 @@ def _check_job_result( test_name=test_name, ) - -# create executor function -run_job = make_slurm_run_command() diff --git a/tests/integration/defs/perf/disagg/simple_collect.py b/tests/integration/defs/perf/disagg/simple_collect.py index 1e3e32ee1e9..5a3b017a604 100644 --- a/tests/integration/defs/perf/disagg/simple_collect.py +++ b/tests/integration/defs/perf/disagg/simple_collect.py @@ -235,63 +235,6 @@ def write_cpu_info(self, data): print(f"Generated CPU file: {cpu_file}") return cpu_model - def write_trtllm_version(self): - """Write TensorRT-LLM version information to trtllm_version.txt.""" - version_info = "[TensorRT-LLM] TensorRT-LLM version: unknown" - - try: - # Try to import tensorrt_llm and get version - result = subprocess.run( - [ - sys.executable, - "-c", - 'import tensorrt_llm; print(f"[TensorRT-LLM] TensorRT-LLM version: {tensorrt_llm.__version__}")', - ], - capture_output=True, - text=True, - timeout=30, - ) - - if result.returncode == 0 and result.stdout.strip(): - version_info = result.stdout.strip() - else: - # Print error for debugging - print(f"TensorRT-LLM import failed (returncode={result.returncode}):") - if result.stderr: - print(f" stderr:\n{result.stderr}") - - # Try one more time with a simple sleep - print("Retrying after 10 seconds...") - time.sleep(10) - result = subprocess.run( - [ - sys.executable, - "-c", - "import tensorrt_llm; " - 'print(f"[TensorRT-LLM] TensorRT-LLM version: {tensorrt_llm.__version__}")', - ], - capture_output=True, - text=True, - timeout=30, - ) - - if result.returncode == 0 and result.stdout.strip(): - version_info = result.stdout.strip() - print("TensorRT-LLM version retrieved on second attempt") - else: 
- print(f"TensorRT-LLM import failed again (returncode={result.returncode}):") - if result.stderr: - print(f" stderr:\n{result.stderr}") - - except Exception as e: - print(f"Error getting TensorRT-LLM version: {e}") # Keep default unknown version - - trtllm_file = os.path.join(self.output_dir, "trtllm_version.txt") - with open(trtllm_file, "w") as f: - f.write(version_info) - print(f"Generated TensorRT-LLM version file: {trtllm_file}") - return version_info - def write_driver_info(self, data): """Write GPU driver information to driver.txt.""" driver_version = data.get("nvidia_driver_version", "unknown") @@ -307,13 +250,11 @@ def write_all_txt_files(self, data): gpu_info = self.write_gpu_info(data) cpu_info = self.write_cpu_info(data) driver_info = self.write_driver_info(data) - trtllm_info = self.write_trtllm_version() return { "GPU": gpu_info, "CPU": cpu_info, "Driver": driver_info, - "TensorRT-LLM": trtllm_info, } @@ -347,7 +288,6 @@ def main(): print(" - gpu.txt") print(" - cpu.txt") print(" - driver.txt") - print(" - trtllm_version.txt") print("\n=== Collected Information ===") for key, value in system_data.items(): diff --git a/tests/integration/defs/perf/disagg/test_disagg.py b/tests/integration/defs/perf/disagg/test_disagg.py index 7fe141c5d10..4c6391b205f 100644 --- a/tests/integration/defs/perf/disagg/test_disagg.py +++ b/tests/integration/defs/perf/disagg/test_disagg.py @@ -102,7 +102,7 @@ def test_benchmark(self, request, test_config: TestConfig): job_id = EnvManager.get_debug_job_id() else: # Submit job using JobManager - success, job_id = JobManager.submit_job(test_config) + success, job_id = JobManager.submit_test_job(test_config) # Validate submission result assert success, f"Job submission failed: {test_config.test_id}" @@ -180,7 +180,7 @@ def test_accuracy(self, request, test_config: TestConfig): job_id = EnvManager.get_debug_job_id() else: # Submit job using JobManager - success, job_id = JobManager.submit_job(test_config) + success, job_id = 
JobManager.submit_test_job(test_config) # Validate submission result assert success, f"Job submission failed: {test_config.test_id}" @@ -266,7 +266,7 @@ def test_stress(self, request, test_config: TestConfig): job_id = EnvManager.get_debug_job_id() else: # Submit job using JobManager - success, job_id = JobManager.submit_job(test_config) + success, job_id = JobManager.submit_test_job(test_config) # Validate submission result assert success, f"Job submission failed: {test_config.test_id}" diff --git a/tests/integration/defs/perf/disagg/utils/common.py b/tests/integration/defs/perf/disagg/utils/common.py index 6c8805e3636..4ba9b812401 100644 --- a/tests/integration/defs/perf/disagg/utils/common.py +++ b/tests/integration/defs/perf/disagg/utils/common.py @@ -2,8 +2,6 @@ import os -SESSION_COLLECT_CMD_TYPE = "session_collect" - # GPU resource configuration # Simplified - only fields actually used in the codebase GPU_RESOURCE_CONFIG = { diff --git a/tests/integration/defs/perf/disagg/utils/trackers.py b/tests/integration/defs/perf/disagg/utils/trackers.py index 69c72b0cdbb..3ffcebd55fb 100644 --- a/tests/integration/defs/perf/disagg/utils/trackers.py +++ b/tests/integration/defs/perf/disagg/utils/trackers.py @@ -4,10 +4,10 @@ import pandas as pd -# Import run_job from execution (cross-package import) -from execution.executor import run_job +# Import JobManager from execution +from execution.executor import JobManager -from utils.common import SESSION_COLLECT_CMD_TYPE, EnvManager +from utils.common import EnvManager from utils.logger import logger @@ -74,25 +74,43 @@ def start(self): logger.info(f"Session started: {self.start_time}") def end_and_collect(self): - """Record end time and trigger information collection.""" + """Record end time and trigger session collection. + + Uses the new sbatch-based approach for non-blocking execution. + Submits the job and waits for completion using JobManager. 
+ """ self.end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") logger.info(f"Session ended: {self.end_time}") - # Prepare log file path + # Submit session collect job (non-blocking sbatch) + success, job_id = JobManager.submit_session_collect_job() + + if not success: + logger.error(f"Failed to submit session collect job: {job_id}") + return False + + # Wait for job completion (reuses wait_for_completion method) + logger.info(f"Waiting for session collect job {job_id} to complete...") + JobManager.wait_for_completion( + job_id=job_id, + timeout=7200, # 2 hours + test_config=None, # No test config for session collect + check_early_failure=False # Don't check early failures + ) + + # Check if log file was created (indicates success) output_path = EnvManager.get_output_path() log_file = os.path.join(output_path, "session_collect.log") - - job_name = f"{EnvManager.get_slurm_job_name()}-session-collect" - run_result = run_job(SESSION_COLLECT_CMD_TYPE, job_name, log_file=log_file) - - if run_result["status"]: - # update timestamps in CSV + + if os.path.exists(log_file): + # Update timestamps in CSV self._update_csv_timestamps() logger.success("Session properties collected successfully") + logger.info(f"Session collect log: {log_file}") + return True else: - logger.error(f"Failed to collect session properties: {run_result['msg']}") - - return run_result["status"] + logger.error(f"Session collect log not found: {log_file}") + return False def _update_csv_timestamps(self): """Update timestamps in CSV using pandas.""" From 339ecd314cac3c93c746bd439ea8da3ce59f65dc Mon Sep 17 00:00:00 2001 From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> Date: Fri, 26 Dec 2025 11:00:05 +0000 Subject: [PATCH 07/13] fx Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> --- .../defs/perf/disagg/execution/executor.py | 37 ++++++++++++------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git 
a/tests/integration/defs/perf/disagg/execution/executor.py b/tests/integration/defs/perf/disagg/execution/executor.py index 659ccfb6881..4d8712265ef 100644 --- a/tests/integration/defs/perf/disagg/execution/executor.py +++ b/tests/integration/defs/perf/disagg/execution/executor.py @@ -177,14 +177,31 @@ def submit_session_collect_job() -> tuple[bool, str]: echo "Install Mode: $INSTALL_MODE" echo "==========================================" -# Handle different installation modes +# Step 1: Collect system information (no dependencies) +echo "" +echo "Step 1: Collecting system information..." +cd "$WORK_DIR" +python3 "$WORK_DIR/simple_collect.py" "$OUTPUT_PATH" 2>&1 +echo "System information collection completed" + +# Step 2: Handle different installation modes +echo "" +echo "Step 2: Installing TensorRT-LLM..." if [ "$INSTALL_MODE" = "none" ]; then echo "Using built-in TensorRT-LLM, skipping installation" elif [ "$INSTALL_MODE" = "wheel" ]; then echo "Installing TensorRT-LLM wheel..." - echo "Wheel path: $WHEEL_PATH" - pip3 install "$WHEEL_PATH" 2>&1 || echo "Wheel install failed, continuing..." + echo "Wheel path pattern: $WHEEL_PATH" + + # Expand wildcard and install (use unquoted variable to allow glob expansion) + for wheel_file in $WHEEL_PATH; do + if [ -f "$wheel_file" ]; then + echo "Found wheel: $wheel_file" + pip3 install "$wheel_file" 2>&1 || echo "Wheel install failed, continuing..." + break + fi + done echo "Wheel installation completed" elif [ "$INSTALL_MODE" = "source" ]; then @@ -198,21 +215,13 @@ def submit_session_collect_job() -> tuple[bool, str]: exit 1 fi +# Step 3: Collect TensorRT-LLM version information echo "" -echo "Collecting TensorRT-LLM version information..." -# Get TensorRT-LLM version and write to file +echo "Step 3: Collecting TensorRT-LLM version information..." 
VERSION_FILE="$OUTPUT_PATH/trtllm_version.txt" -python3 -c "import tensorrt_llm; print(f'[TensorRT-LLM] TensorRT-LLM version: {{tensorrt_llm.__version__}}')" > "$VERSION_FILE" 2>&1 || {{ - echo "[TensorRT-LLM] TensorRT-LLM version: unknown" > "$VERSION_FILE" - echo "Failed to get TensorRT-LLM version, wrote 'unknown' to $VERSION_FILE" -}} +python3 -c 'import tensorrt_llm; print(f"[TensorRT-LLM] TensorRT-LLM version: {{tensorrt_llm.__version__}}")' > "$VERSION_FILE" 2>&1 || echo "[TensorRT-LLM] TensorRT-LLM version: unknown" > "$VERSION_FILE" echo "TensorRT-LLM version written to: $VERSION_FILE" -echo "" -echo "Running simple_collect.py..." -cd "$WORK_DIR" -python3 "$WORK_DIR/simple_collect.py" "$OUTPUT_PATH" 2>&1 - echo "" echo "==========================================" echo "Session Collect Job Completed" From 1348b073d9f02a4bbbcdfdfc815c12c6b6d1e4a6 Mon Sep 17 00:00:00 2001 From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> Date: Fri, 26 Dec 2025 12:18:50 +0000 Subject: [PATCH 08/13] try to fix sbatch isssue Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> --- .../defs/perf/disagg/execution/executor.py | 128 ++++++------------ .../defs/perf/disagg/session_collect.sh | 70 ++++++++++ 2 files changed, 112 insertions(+), 86 deletions(-) create mode 100644 tests/integration/defs/perf/disagg/session_collect.sh diff --git a/tests/integration/defs/perf/disagg/execution/executor.py b/tests/integration/defs/perf/disagg/execution/executor.py index 4d8712265ef..515dbf55eb2 100644 --- a/tests/integration/defs/perf/disagg/execution/executor.py +++ b/tests/integration/defs/perf/disagg/execution/executor.py @@ -33,20 +33,25 @@ class JobManager: @staticmethod def submit_shell_job( job_name: str, - shell_script: str, - output_log_file: str, - timeout: int = 7200 + script_path: str, + script_args: list[str] = None, + output_log_file: str = None, + timeout: int = 7200, + container_name: str = None ) -> tuple[bool, str]: 
"""Submit a generic shell script job using sbatch --wrap. - This is a low-level method for submitting arbitrary shell scripts - directly to SLURM via sbatch --wrap (non-blocking). + This is a low-level method for submitting shell scripts to SLURM + via sbatch --wrap (non-blocking). Supports executing script files + with arguments inside containers. Args: job_name: SLURM job name - shell_script: Shell script content to execute - output_log_file: Full path to output log file + script_path: Path to the shell script file to execute + script_args: List of arguments to pass to the script (optional) + output_log_file: Full path to output log file (optional, defaults to OUTPUT_PATH/{job_name}.log) timeout: Job timeout in seconds (default: 7200 = 2 hours) + container_name: Container name for srun (optional, defaults to job_name) Returns: tuple: (success: bool, job_id: str) @@ -60,13 +65,27 @@ def submit_shell_job( # Ensure output directory exists os.makedirs(output_path, exist_ok=True) + # Set defaults + if output_log_file is None: + output_log_file = f"{output_path}/{job_name}.log" + if container_name is None: + container_name = job_name + if script_args is None: + script_args = [] + + # Build the bash command with script and arguments + # Quote the script path and each argument separately + quoted_script = f'"{script_path}"' + quoted_args = ' '.join(f'"{arg}"' for arg in script_args) + bash_command = f"bash {quoted_script} {quoted_args}".strip() + # Build complete srun command (runs inside sbatch) srun_command = ( f"srun -l " - f"--container-name={job_name} " + f"--container-name={container_name} " f"--container-image={container_image} " f"--container-mounts={container_mount} " - f"bash -c '{shell_script}'" + f"{bash_command}" ) # Build sbatch command with all parameters @@ -107,6 +126,7 @@ def submit_shell_job( # Submit the job logger.info(f"Submitting job '{job_name}' (using sbatch --wrap)...") + logger.debug(f"Script: {script_path}") logger.debug(f"Log file: 
{output_log_file}") output = exec_cmd_with_output(sbatch_args, timeout=60) @@ -142,8 +162,8 @@ def submit_shell_job( def submit_session_collect_job() -> tuple[bool, str]: """Submit session collect job using sbatch (non-blocking). - This method builds the shell script for session collection and - delegates to submit_shell_job() for actual submission. + This method prepares the arguments for the session_collect.sh script + and submits it via the generic submit_shell_job() method. Key benefits: - Non-blocking execution (pytest doesn't wait) @@ -156,89 +176,25 @@ def submit_session_collect_job() -> tuple[bool, str]: tuple: (success: bool, job_id: str) """ try: - # Get environment configuration for building the script + # Get environment configuration work_dir = EnvManager.get_work_dir() repo_dir = EnvManager.get_repo_dir() install_mode = EnvManager.get_install_mode() trtllm_wheel_path = EnvManager.get_trtllm_wheel_path() output_path = EnvManager.get_output_path() - # Build the inner script specific to session collection - inner_script = f""" -INSTALL_MODE="{install_mode}" -REPO_DIR="{repo_dir}" -WORK_DIR="{work_dir}" -OUTPUT_PATH="{output_path}" -WHEEL_PATH="{trtllm_wheel_path}" - -echo "==========================================" -echo "Session Collect Job Started" -echo "Time: $(date)" -echo "Install Mode: $INSTALL_MODE" -echo "==========================================" - -# Step 1: Collect system information (no dependencies) -echo "" -echo "Step 1: Collecting system information..." -cd "$WORK_DIR" -python3 "$WORK_DIR/simple_collect.py" "$OUTPUT_PATH" 2>&1 -echo "System information collection completed" - -# Step 2: Handle different installation modes -echo "" -echo "Step 2: Installing TensorRT-LLM..." -if [ "$INSTALL_MODE" = "none" ]; then - echo "Using built-in TensorRT-LLM, skipping installation" - -elif [ "$INSTALL_MODE" = "wheel" ]; then - echo "Installing TensorRT-LLM wheel..." 
- echo "Wheel path pattern: $WHEEL_PATH" - - # Expand wildcard and install (use unquoted variable to allow glob expansion) - for wheel_file in $WHEEL_PATH; do - if [ -f "$wheel_file" ]; then - echo "Found wheel: $wheel_file" - pip3 install "$wheel_file" 2>&1 || echo "Wheel install failed, continuing..." - break - fi - done - echo "Wheel installation completed" - -elif [ "$INSTALL_MODE" = "source" ]; then - echo "Installing TensorRT-LLM from source..." - cd "$REPO_DIR" - pip3 install -e . 2>&1 || echo "Source install failed, continuing..." - echo "Source installation completed" - -else - echo "ERROR: Invalid install mode: $INSTALL_MODE" - exit 1 -fi - -# Step 3: Collect TensorRT-LLM version information -echo "" -echo "Step 3: Collecting TensorRT-LLM version information..." -VERSION_FILE="$OUTPUT_PATH/trtllm_version.txt" -python3 -c 'import tensorrt_llm; print(f"[TensorRT-LLM] TensorRT-LLM version: {{tensorrt_llm.__version__}}")' > "$VERSION_FILE" 2>&1 || echo "[TensorRT-LLM] TensorRT-LLM version: unknown" > "$VERSION_FILE" -echo "TensorRT-LLM version written to: $VERSION_FILE" - -echo "" -echo "==========================================" -echo "Session Collect Job Completed" -echo "Time: $(date)" -echo "==========================================" - -# Explicitly exit to ensure job terminates immediately -exit 0 -""" - - # Submit using the shell job method - log_file = f"{output_path}/session_collect.log" + # Prepare script path and arguments + script_path = f"{work_dir}/session_collect.sh" + script_args = [install_mode, repo_dir, work_dir, output_path, trtllm_wheel_path] + + # Submit using the generic shell job method return JobManager.submit_shell_job( job_name="session_collect", - shell_script=inner_script, - output_log_file=log_file, - timeout=7200 # 2 hours + script_path=script_path, + script_args=script_args, + output_log_file=f"{output_path}/session_collect.log", + timeout=7200, # 2 hours + container_name="session-collect" ) except Exception as e: diff --git 
a/tests/integration/defs/perf/disagg/session_collect.sh b/tests/integration/defs/perf/disagg/session_collect.sh new file mode 100644 index 00000000000..cbc7c775036 --- /dev/null +++ b/tests/integration/defs/perf/disagg/session_collect.sh @@ -0,0 +1,70 @@ +#!/bin/bash +# Session Collection Script +# Collects system information and TensorRT-LLM version + +# Read parameters from positional command-line arguments (defaults apply when omitted) +INSTALL_MODE="${1:-none}" +REPO_DIR="${2:-.}" +WORK_DIR="${3:-.}" +OUTPUT_PATH="${4:-./output}" +WHEEL_PATH="${5:-}" + +echo "==========================================" +echo "Session Collect Job Started" +echo "Time: $(date)" +echo "Install Mode: $INSTALL_MODE" +echo "==========================================" + +# Step 1: Collect system information (no dependencies) +echo "" +echo "Step 1: Collecting system information..." +cd "$WORK_DIR" +python3 "$WORK_DIR/simple_collect.py" "$OUTPUT_PATH" 2>&1 +echo "System information collection completed" + +# Step 2: Handle different installation modes +echo "" +echo "Step 2: Installing TensorRT-LLM..." +if [ "$INSTALL_MODE" = "none" ]; then + echo "Using built-in TensorRT-LLM, skipping installation" + +elif [ "$INSTALL_MODE" = "wheel" ]; then + echo "Installing TensorRT-LLM wheel..." + echo "Wheel path pattern: $WHEEL_PATH" + + # Expand wildcard and install + for wheel_file in $WHEEL_PATH; do + if [ -f "$wheel_file" ]; then + echo "Found wheel: $wheel_file" + pip3 install "$wheel_file" 2>&1 || echo "Wheel install failed, continuing..." + break + fi + done + echo "Wheel installation completed" + +elif [ "$INSTALL_MODE" = "source" ]; then + echo "Installing TensorRT-LLM from source..." + cd "$REPO_DIR" + pip3 install -e . 2>&1 || echo "Source install failed, continuing..." + echo "Source installation completed" + +else + echo "ERROR: Invalid install mode: $INSTALL_MODE" + exit 1 +fi + +# Step 3: Collect TensorRT-LLM version information +echo "" +echo "Step 3: Collecting TensorRT-LLM version information..."
+VERSION_FILE="$OUTPUT_PATH/trtllm_version.txt" +python3 -c "import tensorrt_llm; print(f'[TensorRT-LLM] TensorRT-LLM version: {tensorrt_llm.__version__}')" > "$VERSION_FILE" 2>&1 || echo "[TensorRT-LLM] TensorRT-LLM version: unknown" > "$VERSION_FILE" +echo "TensorRT-LLM version written to: $VERSION_FILE" + +echo "" +echo "==========================================" +echo "Session Collect Job Completed" +echo "Time: $(date)" +echo "==========================================" + +exit 0 + From 10d040dcee85e85012f92b57739af1c4f5464e50 Mon Sep 17 00:00:00 2001 From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> Date: Tue, 30 Dec 2025 01:54:27 +0000 Subject: [PATCH 09/13] expand the logic of slurm extra args Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> --- .../defs/perf/disagg/execution/executor.py | 13 +----- .../defs/perf/disagg/utils/common.py | 44 ++++++++++++++----- 2 files changed, 35 insertions(+), 22 deletions(-) diff --git a/tests/integration/defs/perf/disagg/execution/executor.py b/tests/integration/defs/perf/disagg/execution/executor.py index 515dbf55eb2..a3e961c9c0a 100644 --- a/tests/integration/defs/perf/disagg/execution/executor.py +++ b/tests/integration/defs/perf/disagg/execution/executor.py @@ -9,7 +9,6 @@ import yaml from reporting.report import LogParser, LogWriter, ResultSaver from utils.common import ( - GPU_RESOURCE_CONFIG, EnvManager, extract_config_fields, ) @@ -88,12 +87,6 @@ def submit_shell_job( f"{bash_command}" ) - # Build sbatch command with all parameters - gpu_type = EnvManager.get_gpu_type() - gpu_config = GPU_RESOURCE_CONFIG.get(gpu_type) - if not gpu_config: - raise ValueError(f"GPU resource configuration not found for {gpu_type}") - # Convert timeout to HH:MM:SS format hours = timeout // 3600 minutes = (timeout % 3600) // 60 @@ -112,11 +105,7 @@ def submit_shell_job( "--parsable", # Easier job ID parsing ] - # Conditionally add gres parameter based on GPU configuration - 
if gpu_config["gres_gpu"] is not None: - sbatch_args.append(f"--gres=gpu:{gpu_config['gres_gpu']}") - - # Add extra SLURM arguments if configured + # Add extra SLURM arguments (including --gres from GPU_RESOURCE_CONFIG) slurm_extra_args = EnvManager.get_slurm_extra_args() if slurm_extra_args: sbatch_args.append(slurm_extra_args) diff --git a/tests/integration/defs/perf/disagg/utils/common.py b/tests/integration/defs/perf/disagg/utils/common.py index 4ba9b812401..9bd3a14b5a7 100644 --- a/tests/integration/defs/perf/disagg/utils/common.py +++ b/tests/integration/defs/perf/disagg/utils/common.py @@ -3,35 +3,40 @@ import os # GPU resource configuration -# Simplified - only fields actually used in the codebase +# Centralized configuration for all GPU-specific parameters GPU_RESOURCE_CONFIG = { # OCI GB200 "GB200": { - "gres_gpu": 4, # srun --gres parameter (None = not required) + "slurm_extra_args": "--gres=gpu:4", # SLURM extra arguments (empty string if not required) + "set_segment": True, "lock_freq_graphics_mhz": 2062, # GPU graphics clock lock frequency (MHz) "lock_freq_memory_mhz": 3996, # GPU memory clock lock frequency (MHz) }, # OCI GB300 "GB300": { - "gres_gpu": None, # GB300 does not require gres + "slurm_extra_args": "", # GB300 does not require extra args + "set_segment": True, "lock_freq_graphics_mhz": None, # TODO: Set GB300 lock frequency "lock_freq_memory_mhz": None, }, # H100 "H100": { - "gres_gpu": None, # H100 does not require gres + "slurm_extra_args": "", # H100 does not require extra args + "set_segment": False, "lock_freq_graphics_mhz": None, # TODO: Set H100 lock frequency "lock_freq_memory_mhz": None, }, # B200 "B200": { - "gres_gpu": 4, + "slurm_extra_args": "--gres=gpu:4", + "set_segment": False, "lock_freq_graphics_mhz": None, # TODO: Set B200 lock frequency "lock_freq_memory_mhz": None, }, # B300 "B300": { - "gres_gpu": 4, + "slurm_extra_args": "--gres=gpu:4", + "set_segment": False, "lock_freq_graphics_mhz": None, # TODO: Set B300 lock 
frequency "lock_freq_memory_mhz": None, }, @@ -59,15 +64,34 @@ def get_slurm_job_name() -> str: @staticmethod def get_slurm_set_segment() -> bool: + """Get whether to use SLURM segment parameter based on GPU type. + + Returns: + bool: True if GPU type requires --segment parameter, False otherwise + """ gpu_type = EnvManager.get_gpu_type() - gpu_type_support_segment = {"GB200": True, "GB300": True} - return gpu_type_support_segment.get(gpu_type, False) + gpu_config = GPU_RESOURCE_CONFIG.get(gpu_type, {}) + return gpu_config.get("set_segment", False) @staticmethod def get_slurm_extra_args() -> str: + """Get SLURM extra arguments based on GPU configuration. + + Returns extra SLURM arguments from GPU_RESOURCE_CONFIG. + This allows flexible configuration of GPU-specific SLURM parameters + like --gres, --constraint, etc. + + Returns: + str: Extra SLURM arguments (e.g., "--gres=gpu:4" or "") + + Examples: + GB200: "--gres=gpu:4" + GB300: "" + Custom: "--gres=gpu:4 --constraint=v100" + """ gpu_type = EnvManager.get_gpu_type() - gpu_type_support_extra_args = {"GB200": "--gres=gpu:4", "GB300": ""} - return gpu_type_support_extra_args.get(gpu_type, "") + gpu_config = GPU_RESOURCE_CONFIG.get(gpu_type, {}) + return gpu_config.get("slurm_extra_args", "") @staticmethod def get_container_image() -> str: From 01617baf7fd20957a6658129fa7d486d4d29a80c Mon Sep 17 00:00:00 2001 From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> Date: Wed, 31 Dec 2025 05:36:44 +0000 Subject: [PATCH 10/13] fx Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> --- .../defs/perf/disagg/compare_backends.py | 8 ++--- .../defs/perf/disagg/execution/executor.py | 35 +++++++++---------- .../defs/perf/disagg/session_collect.sh | 1 - .../defs/perf/disagg/simple_collect.py | 1 - .../test_configs/disagg/stress/README.md | 1 - .../defs/perf/disagg/test_disagg.py | 12 +++---- .../defs/perf/disagg/utils/common.py | 8 ++--- 
.../defs/perf/disagg/utils/config_loader.py | 27 +++++++++----- .../defs/perf/disagg/utils/trackers.py | 8 ++--- 9 files changed, 51 insertions(+), 50 deletions(-) diff --git a/tests/integration/defs/perf/disagg/compare_backends.py b/tests/integration/defs/perf/disagg/compare_backends.py index 8ff9c1d3631..46c2223fa3a 100644 --- a/tests/integration/defs/perf/disagg/compare_backends.py +++ b/tests/integration/defs/perf/disagg/compare_backends.py @@ -14,7 +14,7 @@ def extract_backend(test_name): New format: ccb-NIXL or ccb-UCX or ccb-DEFAULT Example: disagg_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL - + Note: "DEFAULT" is a special marker that represents the default backend """ match = re.search(r"ccb-(\w+)", test_name) @@ -158,7 +158,7 @@ def compare_backends(csv_path, threshold=5.0, default_backend="NIXL"): ) # Print statistics - print(f"\n=== Backend Comparison Statistics ===") + print("\n=== Backend Comparison Statistics ===") print(f"Default backend: {default_backend}") print(f"Comparison pairs: {comparison_pairs}") print(f"Single-backend cases (skipped): {single_backend_skipped}") @@ -166,8 +166,8 @@ def compare_backends(csv_path, threshold=5.0, default_backend="NIXL"): # If no comparison pairs found, exit with success if comparison_pairs == 0: - print(f"\nInfo: No backend comparison pairs found in disagg_perf tests") - print(f"All cases are single-backend only, no comparison needed") + print("\nInfo: No backend comparison pairs found in disagg_perf tests") + print("All cases are single-backend only, no comparison needed") sys.exit(0) # Convert to DataFrame diff --git a/tests/integration/defs/perf/disagg/execution/executor.py b/tests/integration/defs/perf/disagg/execution/executor.py index a3e961c9c0a..d454765c536 100644 --- a/tests/integration/defs/perf/disagg/execution/executor.py +++ b/tests/integration/defs/perf/disagg/execution/executor.py @@ -8,15 +8,11 @@ import yaml from reporting.report import LogParser, LogWriter, 
ResultSaver -from utils.common import ( - EnvManager, - extract_config_fields, -) +from utils.common import EnvManager from utils.logger import logger from execution.subprocess_utils import exec_cmd, exec_cmd_with_output - # ============================================================================ # Job Manager # ============================================================================ @@ -36,7 +32,7 @@ def submit_shell_job( script_args: list[str] = None, output_log_file: str = None, timeout: int = 7200, - container_name: str = None + container_name: str = None, ) -> tuple[bool, str]: """Submit a generic shell script job using sbatch --wrap. @@ -75,7 +71,7 @@ def submit_shell_job( # Build the bash command with script and arguments # Quote the script path and each argument separately quoted_script = f'"{script_path}"' - quoted_args = ' '.join(f'"{arg}"' for arg in script_args) + quoted_args = " ".join(f'"{arg}"' for arg in script_args) bash_command = f"bash {quoted_script} {quoted_args}".strip() # Build complete srun command (runs inside sbatch) @@ -140,6 +136,7 @@ def submit_shell_job( except Exception as e: logger.error(f"Failed to submit job '{job_name}': {e}") import traceback + logger.debug(traceback.format_exc()) return False, str(e) @@ -183,12 +180,13 @@ def submit_session_collect_job() -> tuple[bool, str]: script_args=script_args, output_log_file=f"{output_path}/session_collect.log", timeout=7200, # 2 hours - container_name="session-collect" + container_name="session-collect", ) except Exception as e: logger.error(f"Failed to prepare session collect job: {e}") import traceback + logger.debug(traceback.format_exc()) return False, str(e) @@ -232,7 +230,7 @@ def submit_test_job(test_config) -> tuple: # Call submit.py with the temporary config file submit_script = os.path.join(EnvManager.get_script_dir(), "submit.py") - + case_log_dir = JobManager.get_result_dir(test_config) cmd = ["python3", submit_script, "-c", temp_config_path, "--log-dir", 
case_log_dir] @@ -292,22 +290,22 @@ def backup_logs( try: final_dir = result_dir - + # For FAILED cases, rename directory to add _ERROR suffix if not is_passed: error_dir = f"{result_dir}_ERROR" logger.info(f"Renaming failed case directory: {result_dir} -> {error_dir}") - + # Remove old error directory if exists if os.path.exists(error_dir): logger.warning(f"Removing existing error directory: {error_dir}") shutil.rmtree(error_dir) - + # Rename to add _ERROR suffix shutil.move(result_dir, error_dir) final_dir = error_dir logger.success(f"Directory renamed to: {final_dir}") - + # Copy temporary config file to the directory temp_config_path = test_config.temp_config_path if os.path.exists(temp_config_path): @@ -811,23 +809,23 @@ def _check_job_result( timestamps=timestamps, test_name=test_name, ) - + # If perf check failed, return immediately if not perf_result.get("success", False): return perf_result - + # Then check accuracy if accuracy_config is provided if accuracy_config: # Use metrics config from accuracy_config (defaults to _COMMON_ACCURACY_METRICS) accuracy_metrics = accuracy_config.get_metrics_config() - + accuracy_result = JobManager._check_accuracy_result( job_id=job_id, metrics_config=accuracy_metrics, accuracy_config=accuracy_config, result_dir=result_dir, ) - + # If accuracy check failed, merge results and return if not accuracy_result.get("success", False): return { @@ -836,7 +834,7 @@ def _check_job_result( "accuracy_result": accuracy_result, "error": f"Perf passed but accuracy failed: {accuracy_result.get('error', 'Unknown')}", } - + # Both passed, merge results return { **perf_result, @@ -858,4 +856,3 @@ def _check_job_result( timestamps=timestamps, test_name=test_name, ) - diff --git a/tests/integration/defs/perf/disagg/session_collect.sh b/tests/integration/defs/perf/disagg/session_collect.sh index cbc7c775036..30cd3c4c1d4 100644 --- a/tests/integration/defs/perf/disagg/session_collect.sh +++ b/tests/integration/defs/perf/disagg/session_collect.sh 
@@ -67,4 +67,3 @@ echo "Time: $(date)" echo "==========================================" exit 0 - diff --git a/tests/integration/defs/perf/disagg/simple_collect.py b/tests/integration/defs/perf/disagg/simple_collect.py index 5a3b017a604..118759e9b7d 100644 --- a/tests/integration/defs/perf/disagg/simple_collect.py +++ b/tests/integration/defs/perf/disagg/simple_collect.py @@ -17,7 +17,6 @@ import socket import subprocess import sys -import time from collections import OrderedDict from datetime import datetime diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/stress/README.md b/tests/integration/defs/perf/disagg/test_configs/disagg/stress/README.md index 528c8e33e8b..ed440ef0409 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/stress/README.md +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/stress/README.md @@ -477,4 +477,3 @@ For issues or questions: 2. Review configuration against this README 3. Compare with `EXAMPLE_deepseek-r1-fp4_1k1k_stress_gsm8k.yaml` 4. Contact your team's test infrastructure maintainer - diff --git a/tests/integration/defs/perf/disagg/test_disagg.py b/tests/integration/defs/perf/disagg/test_disagg.py index 4c6391b205f..39008ca11a1 100644 --- a/tests/integration/defs/perf/disagg/test_disagg.py +++ b/tests/integration/defs/perf/disagg/test_disagg.py @@ -218,8 +218,8 @@ def test_accuracy(self, request, test_config: TestConfig): @pytest.mark.parametrize("test_config", STRESS_TEST_CASES) def test_stress(self, request, test_config: TestConfig): """Stress test combining performance benchmarks and accuracy validation. - - This test type is designed for stress testing scenarios where both + + This test type is designed for stress testing scenarios where both performance metrics (CSV output) and accuracy (e.g., GSM8K) need to be validated. 
""" full_test_name = request.node.name @@ -249,12 +249,12 @@ def test_stress(self, request, test_config: TestConfig): logger.info(f"Category: {test_config.test_category}") logger.info(f"Model: {test_config.model_name}") logger.info(f"Benchmark: {test_config.benchmark_type}") - + # Log accuracy datasets if configured if test_config.accuracy_config: dataset_names = test_config.accuracy_config.get_all_dataset_names() logger.info(f"Accuracy Datasets: {', '.join(dataset_names)}") - + logger.info(f"Metrics log: {test_config.metrics_config.log_file}") logger.info(f"Supported GPUs: {', '.join(test_config.supported_gpus)}") logger.info(f"{'=' * 60}") @@ -283,9 +283,7 @@ def test_stress(self, request, test_config: TestConfig): # Check results - this will handle both perf CSV writing AND accuracy validation result = JobManager.check_result(job_id, test_config, timestamps, full_test_name) - assert result["success"], ( - f"Stress test failed: {result.get('error', 'Unknown error')}" - ) + assert result["success"], f"Stress test failed: {result.get('error', 'Unknown error')}" except Exception as e: test_tracker.end_test_case() diff --git a/tests/integration/defs/perf/disagg/utils/common.py b/tests/integration/defs/perf/disagg/utils/common.py index 9bd3a14b5a7..55d08875b52 100644 --- a/tests/integration/defs/perf/disagg/utils/common.py +++ b/tests/integration/defs/perf/disagg/utils/common.py @@ -65,7 +65,7 @@ def get_slurm_job_name() -> str: @staticmethod def get_slurm_set_segment() -> bool: """Get whether to use SLURM segment parameter based on GPU type. - + Returns: bool: True if GPU type requires --segment parameter, False otherwise """ @@ -76,14 +76,14 @@ def get_slurm_set_segment() -> bool: @staticmethod def get_slurm_extra_args() -> str: """Get SLURM extra arguments based on GPU configuration. - + Returns extra SLURM arguments from GPU_RESOURCE_CONFIG. This allows flexible configuration of GPU-specific SLURM parameters like --gres, --constraint, etc. 
- + Returns: str: Extra SLURM arguments (e.g., "--gres=gpu:4" or "") - + Examples: GB200: "--gres=gpu:4" GB300: "" diff --git a/tests/integration/defs/perf/disagg/utils/config_loader.py b/tests/integration/defs/perf/disagg/utils/config_loader.py index 07531a816ed..567834d6a77 100644 --- a/tests/integration/defs/perf/disagg/utils/config_loader.py +++ b/tests/integration/defs/perf/disagg/utils/config_loader.py @@ -45,7 +45,9 @@ class AccuracyConfig: """Accuracy test configuration (supports multiple datasets).""" datasets: List[DatasetThreshold] # List of dataset threshold configurations - metrics: Optional[MetricsConfig] = None # Optional custom metrics config (defaults to _COMMON_ACCURACY_METRICS) + metrics: Optional[MetricsConfig] = ( + None # Optional custom metrics config (defaults to _COMMON_ACCURACY_METRICS) + ) def get_dataset_config(self, dataset_name: str) -> Optional[DatasetThreshold]: """Get configuration by dataset name. @@ -68,10 +70,10 @@ def get_all_dataset_names(self) -> List[str]: List of dataset names """ return [ds.dataset_name for ds in self.datasets] - + def get_metrics_config(self) -> MetricsConfig: """Get metrics configuration for accuracy parsing. 
- + Returns: Custom metrics config if provided, otherwise _COMMON_ACCURACY_METRICS """ @@ -398,20 +400,27 @@ def _load_config_file(self, yaml_path: Path, test_type: str, test_category: str) higher_is_better=higher_is_better, ) ) - + # Check if custom accuracy metrics are provided custom_metrics = None if "metrics" in acc_meta: metrics_override = acc_meta["metrics"] custom_metrics = MetricsConfig( log_file=metrics_override.get("log_file", "7_accuracy_eval.log"), - extractor_pattern=metrics_override.get("extractor_pattern", r"\|([a-zA-Z0-9_-]+)\|.*?\|([\w-]+)\|.*?\|exact_match\|.*?\|([0-9.]+)\|"), - metric_names=metrics_override.get("metric_names", ["flexible-extract", "strict-match"]), + extractor_pattern=metrics_override.get( + "extractor_pattern", + r"\|([a-zA-Z0-9_-]+)\|.*?\|([\w-]+)\|.*?\|exact_match\|.*?\|([0-9.]+)\|", + ), + metric_names=metrics_override.get( + "metric_names", ["flexible-extract", "strict-match"] + ), ) - logger.info(f"Using custom accuracy metrics config from YAML") - + logger.info("Using custom accuracy metrics config from YAML") + accuracy_config = AccuracyConfig(datasets=datasets, metrics=custom_metrics) - logger.info(f"Loaded accuracy config with {len(datasets)} dataset(s) for {test_category} test") + logger.info( + f"Loaded accuracy config with {len(datasets)} dataset(s) for {test_category} test" + ) return TestConfig( config_path=str(yaml_path), diff --git a/tests/integration/defs/perf/disagg/utils/trackers.py b/tests/integration/defs/perf/disagg/utils/trackers.py index 3ffcebd55fb..acee8d7fd68 100644 --- a/tests/integration/defs/perf/disagg/utils/trackers.py +++ b/tests/integration/defs/perf/disagg/utils/trackers.py @@ -75,7 +75,7 @@ def start(self): def end_and_collect(self): """Record end time and trigger session collection. - + Uses the new sbatch-based approach for non-blocking execution. Submits the job and waits for completion using JobManager. 
""" @@ -84,7 +84,7 @@ def end_and_collect(self): # Submit session collect job (non-blocking sbatch) success, job_id = JobManager.submit_session_collect_job() - + if not success: logger.error(f"Failed to submit session collect job: {job_id}") return False @@ -95,13 +95,13 @@ def end_and_collect(self): job_id=job_id, timeout=7200, # 2 hours test_config=None, # No test config for session collect - check_early_failure=False # Don't check early failures + check_early_failure=False, # Don't check early failures ) # Check if log file was created (indicates success) output_path = EnvManager.get_output_path() log_file = os.path.join(output_path, "session_collect.log") - + if os.path.exists(log_file): # Update timestamps in CSV self._update_csv_timestamps() From 1596f71e5662920bbb7b37d01886550c51cb1e19 Mon Sep 17 00:00:00 2001 From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> Date: Wed, 31 Dec 2025 12:29:16 +0000 Subject: [PATCH 11/13] add batch job support batch manager Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> --- tests/integration/defs/perf/disagg/README.md | 145 +++++++++++++- .../integration/defs/perf/disagg/conftest.py | 187 +++++++++++++++++- .../defs/perf/disagg/test_disagg.py | 39 ++-- 3 files changed, 343 insertions(+), 28 deletions(-) diff --git a/tests/integration/defs/perf/disagg/README.md b/tests/integration/defs/perf/disagg/README.md index 28ba839c6e7..5921900b707 100644 --- a/tests/integration/defs/perf/disagg/README.md +++ b/tests/integration/defs/perf/disagg/README.md @@ -132,6 +132,141 @@ poetry run pytest --disagg test_disagg.py -s -vv -m accuracy poetry run pytest --disagg test_disagg.py -s -vv -k "deepseek-r1-fp4_1k1k" ``` +## Batch Job Submission + +The framework supports automatic batch job submission to maximize parallelism in SLURM cluster environments. Instead of submitting jobs one-by-one, it groups test cases into batches and submits entire batches when needed. 
+ +### Quick Start + +**Default batch size (5 jobs per batch):** +```bash +# Run all tests with default batching +poetry run pytest --disagg test_disagg.py -s -vv + +# Run with test list +poetry run pytest --disagg test_disagg.py -s -vv --disagg-test-list=./testlist/all.txt +``` + +**Custom batch size:** +```bash +# Set batch size via command line +poetry run pytest --disagg test_disagg.py -s -vv --disagg-batch-size=10 + +# Set batch size via environment variable +export DISAGG_BATCH_SIZE=20 +poetry run pytest --disagg test_disagg.py -s -vv + +# Submit all jobs at once (unlimited batch) +poetry run pytest --disagg test_disagg.py -s -vv --disagg-batch-size=0 +``` + +### How Batch Submission Works + +``` +Pytest Collection Phase: + - Collects all test cases (e.g., 100 tests) + - BatchManager splits them into batches (e.g., 20 batches of 5) + +Pytest Execution Phase: + Test 0 runs: + -> Triggers submission of Batch 0 (jobs 0-4) + -> Waits for job 0 to complete + + Test 1-4 run: + -> Batch 0 already submitted, directly wait for completion + + Test 5 runs: + -> Triggers submission of Batch 1 (jobs 5-9) + -> Waits for job 5 to complete + + ... 
and so on +``` + +### Key Benefits + +- **Parallel Execution**: All jobs in a batch are submitted together, so SLURM can run them concurrently +- **Reduced Wait Time**: Time per batch ≈ MAX(job time) instead of SUM(job times) +- **Automatic Management**: No need to manually split test lists +- **Lazy Loading**: A batch is submitted only when its first test case starts + +### Configuration Options + +**Priority**: Command line option > Environment variable > Default (5) + +**Examples:** + +```bash +# Small batch for quick testing +poetry run pytest --disagg test_disagg.py -s -vv --disagg-batch-size=3 \ + --disagg-test-list=./testlist/debug.txt + +# Large batch for production +poetry run pytest --disagg test_disagg.py -s -vv --disagg-batch-size=50 \ + --disagg-test-list=./testlist/all.txt + +# Submit all at once +poetry run pytest --disagg test_disagg.py -s -vv --disagg-batch-size=0 +``` + +### Timeout Configuration + +The default timeout for waiting for job completion is **10 hours (36000 seconds)**, which accounts for: +- SLURM queue wait time +- Job execution time +- Buffer for delays + +### Performance Comparison + +**Before (Sequential Submission):** +``` +Case 1: submit + wait (1.5h) = 1.5h +Case 2: submit + wait (1.5h) = 1.5h +Case 3: submit + wait (1.5h) = 1.5h +...
+Total: 50 × 1.5h = 75 hours +``` + +**After (Batch Submission, batch_size=50):** +``` +Batch 0 (50 jobs): submitted in parallel + Case 1: wait (1.5h) + Case 2-50: wait (0s, already done) + +Total: ~1.5 hours +``` + +**Speedup: up to ~50x** (when the cluster can run all 50 jobs concurrently) + +### Troubleshooting + +**Check BatchManager initialization:** +``` +====================================================================== +Batch Manager Initialized +Batch size: 5 jobs per batch +====================================================================== + +Total test configs: 20 +Total batches: 4 +``` + +**Monitor batch submission:** +``` +====================================================================== +Submitting Batch 0 +Range: [0:5] (5 jobs) +====================================================================== + + [ 1/5] Job 1234 <- test_config_id_1 + [ 2/5] Job 1235 <- test_config_id_2 + ... +``` + +**If jobs time out frequently:** +- Check SLURM queue status +- Consider reducing batch size to avoid resource contention +- Verify that the timeout (36000s) is sufficient for your workload + ## Test Naming Convention Tests are automatically named using the format: @@ -193,6 +328,7 @@ Test results are saved to: - `GPU_TYPE`: Current GPU type (default: GB200) - `OUTPUT_PATH`: Directory for test results and logs - `WORK_DIR`: Working directory for benchmark execution +- `DISAGG_BATCH_SIZE`: Default batch size for job submission (default: 5) - `DEBUG_MODE`: Enable debug mode (set to "1" to skip job submission) - `DEBUG_JOB_ID`: Job ID to use in debug mode @@ -212,10 +348,11 @@ The framework consists of: 1. **ConfigLoader**: Scans and loads YAML configurations 2. **ConfigValidator**: Validates configuration correctness -3. **JobManager**: Handles SLURM job submission and monitoring -4. **LogParser**: Extracts metrics from benchmark logs -5. **TestCaseTracker**: Tracks test execution timing -6. **ResultSaver**: Saves results to CSV +3. **BatchManager**: Manages batch job submission for parallel execution +4.
**JobManager**: Handles SLURM job submission and monitoring +5. **LogParser**: Extracts metrics from benchmark logs +6. **TestCaseTracker**: Tracks test execution timing +7. **ResultSaver**: Saves results to CSV ## Benefits diff --git a/tests/integration/defs/perf/disagg/conftest.py b/tests/integration/defs/perf/disagg/conftest.py index 2dabeda1cd9..1ff436e38d5 100644 --- a/tests/integration/defs/perf/disagg/conftest.py +++ b/tests/integration/defs/perf/disagg/conftest.py @@ -1,9 +1,10 @@ """Pytest configuration for disagg tests. Only collects tests in this directory when --disagg parameter is provided. -Can share options like --disagg-test-list defined in this conftest.py. +Provides batch job submission capability to improve parallelism. """ +import os import pytest from utils.logger import logger @@ -23,6 +24,15 @@ def pytest_addoption(parser): help="Path to a file containing test IDs (one per line) to run. " "Example: pytest --disagg --disagg-test-list=testlist/testlist_gb200.txt", ) + parser.addoption( + "--disagg-batch-size", + action="store", + type=int, + default=None, + help="Number of jobs to submit per batch. Default: from env DISAGG_BATCH_SIZE or 5. " + "Set to 0 for unlimited (submit all at once). 
" + "Example: pytest --disagg --disagg-batch-size=10", + ) def pytest_collect_directory(path, parent): @@ -45,7 +55,6 @@ def pytest_collect_directory(path, parent): return True # With --disagg parameter, proceed with normal collection - # Can subsequently use --disagg-test-list and other options from main conftest.py for filtering return None @@ -88,7 +97,7 @@ def pytest_collection_modifyitems(config, items): for item in items: # item.nodeid is the full test identifier like: - # "test_disagg_simple.py::TestDisaggBenchmark::test_benchmark[deepseek-r1-fp4:1k1k:...]" + # "test_disagg.py::TestDisaggBenchmark::test_benchmark[deepseek-r1-fp4:1k1k:...]" if item.nodeid in wanted_tests: selected.append(item) else: @@ -112,3 +121,175 @@ def pytest_collection_modifyitems(config, items): logger.warning(f"Please check that the test IDs in {test_list_file} are correct.") logger.info(f"{'=' * 70}\n") + + +class BatchManager: + """Batch job submission manager for disagg tests. + + Automatically splits test cases into batches and submits them on-demand + to maximize parallelism in SLURM cluster environments. + + Key features: + - Lazy batch submission: only submits when needed + - Configurable batch size via CLI or environment variable + - Maintains job_id mapping for all submitted jobs + """ + + def __init__(self, batch_size=5): + """Initialize batch manager. + + Args: + batch_size: Number of jobs per batch. None or 0 means unlimited (submit all at once). + Default is 5 if not specified. 
+ """ + # Normalize batch_size: None, 0, or negative means unlimited + if batch_size is None or batch_size <= 0: + self.batch_size = None + else: + self.batch_size = batch_size + + self.submitted_batches = set() # Track which batch numbers have been submitted + self.job_mapping = {} # Map test_id -> SLURM job_id + self.all_configs = [] # Ordered list of all test configs + + logger.info(f"\n{'=' * 70}") + logger.info("Batch Manager Initialized") + if self.batch_size: + logger.info(f"Batch size: {self.batch_size} jobs per batch") + else: + logger.info("Batch size: unlimited (submit all at once)") + logger.info(f"{'=' * 70}\n") + + def add_config(self, test_config): + """Add a test configuration to the manager. + + Called during initialization to build the ordered list of configs. + + Args: + test_config: TestConfig object to add + """ + self.all_configs.append(test_config) + + def get_job_id(self, test_config): + """Get SLURM job ID for a test config, submitting batch if needed. + + This is the main entry point. It: + 1. Determines which batch the test belongs to + 2. Submits the entire batch if not already submitted + 3. 
Returns the job_id for this specific test + + Args: + test_config: TestConfig object to get job_id for + + Returns: + str: SLURM job ID, or None if submission failed + """ + # Find the index of this config in the ordered list + try: + idx = next(i for i, c in enumerate(self.all_configs) + if c.test_id == test_config.test_id) + except StopIteration: + logger.error(f"Config not found in manager: {test_config.test_id}") + return None + + # Calculate which batch this test belongs to + if self.batch_size: + batch_num = idx // self.batch_size + else: + batch_num = 0 # All tests in one batch + + # Submit the batch if not already submitted + if batch_num not in self.submitted_batches: + self._submit_batch(batch_num) + + # Return the cached job_id + return self.job_mapping.get(test_config.test_id) + + def _submit_batch(self, batch_num): + """Submit all jobs in a specific batch. + + Args: + batch_num: Batch number to submit (0-indexed) + """ + from execution.executor import JobManager + + # Calculate batch range + if self.batch_size: + start_idx = batch_num * self.batch_size + end_idx = min(start_idx + self.batch_size, len(self.all_configs)) + else: + start_idx = 0 + end_idx = len(self.all_configs) + + batch_configs = self.all_configs[start_idx:end_idx] + + logger.info(f"\n{'=' * 70}") + logger.info(f"Submitting Batch {batch_num}") + logger.info(f"Range: [{start_idx}:{end_idx}] ({len(batch_configs)} jobs)") + logger.info(f"{'=' * 70}\n") + + # Submit all jobs in this batch + success_count = 0 + for i, config in enumerate(batch_configs, 1): + try: + success, job_id = JobManager.submit_test_job(config) + if success and job_id: + self.job_mapping[config.test_id] = job_id + success_count += 1 + # Truncate test_id for display + display_id = config.test_id[:60] + "..." 
if len(config.test_id) > 60 else config.test_id + logger.success(f" [{i:3d}/{len(batch_configs)}] Job {job_id} <- {display_id}") + else: + self.job_mapping[config.test_id] = None + logger.error(f" [{i:3d}/{len(batch_configs)}] Failed: {config.test_id[:50]}") + except Exception as e: + self.job_mapping[config.test_id] = None + logger.error(f" [{i:3d}/{len(batch_configs)}] Error: {e}") + + # Mark batch as submitted + self.submitted_batches.add(batch_num) + + logger.info(f"\n{'=' * 70}") + logger.success(f"Batch {batch_num} Complete: {success_count}/{len(batch_configs)} succeeded") + logger.info(f"{'=' * 70}\n") + + +@pytest.fixture(scope="session") +def batch_manager(request): + """Provide batch manager fixture for test methods. + + This session-scoped fixture creates and initializes the BatchManager + with all collected test configs. + + Returns: + BatchManager: Initialized batch manager instance + """ + # Get batch size from CLI option or environment variable + batch_size = request.config.getoption("--disagg-batch-size") + if batch_size is None: + env_batch_size = os.getenv("DISAGG_BATCH_SIZE") + if env_batch_size: + try: + batch_size = int(env_batch_size) + except ValueError: + logger.warning(f"Invalid DISAGG_BATCH_SIZE: {env_batch_size}, using default 5") + batch_size = 5 + else: + batch_size = 5 # Default batch size + + # Create batch manager + manager = BatchManager(batch_size=batch_size) + + # Extract all test configs from collected items + for item in request.session.items: + if hasattr(item, 'callspec') and 'test_config' in item.callspec.params: + manager.add_config(item.callspec.params['test_config']) + + # Log statistics + logger.info(f"Total test configs: {len(manager.all_configs)}") + if manager.batch_size: + total_batches = (len(manager.all_configs) + manager.batch_size - 1) // manager.batch_size + logger.info(f"Total batches: {total_batches}") + logger.info("") + + return manager diff --git a/tests/integration/defs/perf/disagg/test_disagg.py 
b/tests/integration/defs/perf/disagg/test_disagg.py index 39008ca11a1..b60ba851967 100644 --- a/tests/integration/defs/perf/disagg/test_disagg.py +++ b/tests/integration/defs/perf/disagg/test_disagg.py @@ -62,7 +62,7 @@ class TestDisaggBenchmark: @pytest.mark.perf @pytest.mark.parametrize("test_config", PERF_TEST_CASES) - def test_benchmark(self, request, test_config: TestConfig): + def test_benchmark(self, request, batch_manager, test_config: TestConfig): """Performance benchmark test for YAML configurations.""" full_test_name = request.node.name @@ -101,15 +101,14 @@ def test_benchmark(self, request, test_config: TestConfig): ) job_id = EnvManager.get_debug_job_id() else: - # Submit job using JobManager - success, job_id = JobManager.submit_test_job(test_config) + # Get job_id from batch manager (auto-submits batch if needed) + job_id = batch_manager.get_job_id(test_config) # Validate submission result - assert success, f"Job submission failed: {test_config.test_id}" - assert job_id, "Unable to get job ID" + assert job_id, f"Failed to get job_id for {test_config.test_id}" - # Wait for completion (timeout/early failure handled inside) - JobManager.wait_for_completion(job_id, 7200, test_config, check_early_failure=True) + # Wait for completion (timeout: 10 hours = 36000 seconds) + JobManager.wait_for_completion(job_id, 36000, test_config, check_early_failure=True) # End tracking test case test_tracker.end_test_case() @@ -136,7 +135,7 @@ def test_benchmark(self, request, test_config: TestConfig): @pytest.mark.accuracy @pytest.mark.parametrize("test_config", ACCURACY_TEST_CASES) - def test_accuracy(self, request, test_config: TestConfig): + def test_accuracy(self, request, batch_manager, test_config: TestConfig): """Accuracy test for YAML configurations.""" full_test_name = request.node.name @@ -179,15 +178,14 @@ def test_accuracy(self, request, test_config: TestConfig): ) job_id = EnvManager.get_debug_job_id() else: - # Submit job using JobManager - success, job_id 
= JobManager.submit_test_job(test_config) + # Get job_id from batch manager (auto-submits batch if needed) + job_id = batch_manager.get_job_id(test_config) # Validate submission result - assert success, f"Job submission failed: {test_config.test_id}" - assert job_id, "Unable to get job ID" + assert job_id, f"Failed to get job_id for {test_config.test_id}" - # Wait for completion (timeout/early failure handled inside) - JobManager.wait_for_completion(job_id, 10800, test_config, check_early_failure=True) + # Wait for completion (timeout: 10 hours = 36000 seconds) + JobManager.wait_for_completion(job_id, 36000, test_config, check_early_failure=True) # End tracking test case test_tracker.end_test_case() @@ -216,7 +214,7 @@ def test_accuracy(self, request, test_config: TestConfig): @pytest.mark.stress @pytest.mark.parametrize("test_config", STRESS_TEST_CASES) - def test_stress(self, request, test_config: TestConfig): + def test_stress(self, request, batch_manager, test_config: TestConfig): """Stress test combining performance benchmarks and accuracy validation. 
This test type is designed for stress testing scenarios where both @@ -265,15 +263,14 @@ def test_stress(self, request, test_config: TestConfig): ) job_id = EnvManager.get_debug_job_id() else: - # Submit job using JobManager - success, job_id = JobManager.submit_test_job(test_config) + # Get job_id from batch manager (auto-submits batch if needed) + job_id = batch_manager.get_job_id(test_config) # Validate submission result - assert success, f"Job submission failed: {test_config.test_id}" - assert job_id, "Unable to get job ID" + assert job_id, f"Failed to get job_id for {test_config.test_id}" - # Wait for completion (longer timeout for stress tests: 4 hours) - JobManager.wait_for_completion(job_id, 10800, test_config, check_early_failure=True) + # Wait for completion (timeout: 10 hours = 36000 seconds) + JobManager.wait_for_completion(job_id, 36000, test_config, check_early_failure=True) # End tracking test case test_tracker.end_test_case() From b5406a9d7b290cc01462a15cdb76781074da4488 Mon Sep 17 00:00:00 2001 From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> Date: Wed, 31 Dec 2025 12:55:16 +0000 Subject: [PATCH 12/13] allow sbatch submission despite Kerberos warnings Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> --- .../defs/perf/disagg/execution/executor.py | 4 +++- .../perf/disagg/execution/subprocess_utils.py | 17 +++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/tests/integration/defs/perf/disagg/execution/executor.py b/tests/integration/defs/perf/disagg/execution/executor.py index d454765c536..547b63aa8c4 100644 --- a/tests/integration/defs/perf/disagg/execution/executor.py +++ b/tests/integration/defs/perf/disagg/execution/executor.py @@ -114,7 +114,9 @@ def submit_shell_job( logger.debug(f"Script: {script_path}") logger.debug(f"Log file: {output_log_file}") - output = exec_cmd_with_output(sbatch_args, timeout=60) + # Use check=False to allow submission even with Kerberos warnings + # (mimics submit.py
behavior) + output = exec_cmd_with_output(sbatch_args, timeout=60, check=False) job_id = output.strip() # Parse job ID (--parsable returns just the job ID) diff --git a/tests/integration/defs/perf/disagg/execution/subprocess_utils.py b/tests/integration/defs/perf/disagg/execution/subprocess_utils.py index 9ab77714267..27df7f829d1 100644 --- a/tests/integration/defs/perf/disagg/execution/subprocess_utils.py +++ b/tests/integration/defs/perf/disagg/execution/subprocess_utils.py @@ -33,19 +33,20 @@ def exec_cmd(*popenargs, timeout: Optional[float] = None, **kwargs) -> int: return result.returncode -def exec_cmd_with_output(*popenargs, timeout: Optional[float] = None, **kwargs) -> str: +def exec_cmd_with_output(*popenargs, timeout: Optional[float] = None, check: bool = True, **kwargs) -> str: """Execute command and return output as string. Args: *popenargs: Command and arguments timeout: Timeout in seconds + check: If True, raise CalledProcessError on non-zero exit code (default: True) **kwargs: Additional subprocess arguments Returns: stdout as string (decoded from bytes) Raises: - subprocess.CalledProcessError: If command returns non-zero exit code + subprocess.CalledProcessError: If check=True and command returns non-zero exit code subprocess.TimeoutExpired: If timeout is reached """ result = subprocess.run( @@ -53,11 +54,15 @@ def exec_cmd_with_output(*popenargs, timeout: Optional[float] = None, **kwargs) stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=timeout, - check=True, + check=check, **kwargs, ) - # Log stderr if it exists + # Log stderr if it exists (as warning if check=False, as error if check=True) if result.stderr: - stderr_output = result.stderr.decode() - logger.error(f"Command stderr: {stderr_output}") + stderr_output = result.stderr.decode().strip() + if stderr_output: + if check: + logger.error(f"Command stderr: {stderr_output}") + else: + logger.warning(f"Command stderr: {stderr_output}") return result.stdout.decode() From 
2259c581f9f9b397bbcac1ed7075cdea51caa847 Mon Sep 17 00:00:00 2001 From: yingguo-trt <244492186+yingguo-trt@users.noreply.github.com> Date: Wed, 7 Jan 2026 17:15:13 +0800 Subject: [PATCH 13/13] fix pre-commit failed Signed-off-by: yingguo-trt <244492186+yingguo-trt@users.noreply.github.com> --- .../integration/defs/perf/disagg/conftest.py | 78 ++++++++++--------- .../perf/disagg/execution/subprocess_utils.py | 4 +- 2 files changed, 45 insertions(+), 37 deletions(-) diff --git a/tests/integration/defs/perf/disagg/conftest.py b/tests/integration/defs/perf/disagg/conftest.py index 1ff436e38d5..a4b88542dfd 100644 --- a/tests/integration/defs/perf/disagg/conftest.py +++ b/tests/integration/defs/perf/disagg/conftest.py @@ -5,6 +5,7 @@ """ import os + import pytest from utils.logger import logger @@ -125,19 +126,19 @@ def pytest_collection_modifyitems(config, items): class BatchManager: """Batch job submission manager for disagg tests. - + Automatically splits test cases into batches and submits them on-demand to maximize parallelism in SLURM cluster environments. - + Key features: - Lazy batch submission: only submits when needed - Configurable batch size via CLI or environment variable - Maintains job_id mapping for all submitted jobs """ - + def __init__(self, batch_size=5): """Initialize batch manager. - + Args: batch_size: Number of jobs per batch. None or 0 means unlimited (submit all at once). Default is 5 if not specified. 
@@ -147,11 +148,11 @@ def __init__(self, batch_size=5): self.batch_size = None else: self.batch_size = batch_size - + self.submitted_batches = set() # Track which batch numbers have been submitted self.job_mapping = {} # Map test_id -> SLURM job_id self.all_configs = [] # Ordered list of all test configs - + logger.info(f"\n{'=' * 70}") logger.info("Batch Manager Initialized") if self.batch_size: @@ -159,60 +160,61 @@ def __init__(self, batch_size=5): else: logger.info("Batch size: unlimited (submit all at once)") logger.info(f"{'=' * 70}\n") - + def add_config(self, test_config): """Add a test configuration to the manager. - + Called during initialization to build the ordered list of configs. - + Args: test_config: TestConfig object to add """ self.all_configs.append(test_config) - + def get_job_id(self, test_config): """Get SLURM job ID for a test config, submitting batch if needed. - + This is the main entry point. It: 1. Determines which batch the test belongs to 2. Submits the entire batch if not already submitted 3. Returns the job_id for this specific test - + Args: test_config: TestConfig object to get job_id for - + Returns: str: SLURM job ID, or None if submission failed """ # Find the index of this config in the ordered list try: - idx = next(i for i, c in enumerate(self.all_configs) - if c.test_id == test_config.test_id) + idx = next( + i for i, c in enumerate(self.all_configs) if c.test_id == test_config.test_id + ) except StopIteration: logger.error(f"Config not found in manager: {test_config.test_id}") return None - + # Calculate which batch this test belongs to if self.batch_size: batch_num = idx // self.batch_size else: batch_num = 0 # All tests in one batch - + # Submit the batch if not already submitted if batch_num not in self.submitted_batches: self._submit_batch(batch_num) - + # Return the cached job_id return self.job_mapping.get(test_config.test_id) - + def _submit_batch(self, batch_num): """Submit all jobs in a specific batch. 
- + Args: batch_num: Batch number to submit (0-indexed) """ from execution.executor import JobManager - + # Calculate batch range if self.batch_size: start_idx = batch_num * self.batch_size @@ -220,14 +222,14 @@ def _submit_batch(self, batch_num): else: start_idx = 0 end_idx = len(self.all_configs) - + batch_configs = self.all_configs[start_idx:end_idx] - + logger.info(f"\n{'=' * 70}") logger.info(f"Submitting Batch {batch_num}") logger.info(f"Range: [{start_idx}:{end_idx}] ({len(batch_configs)} jobs)") logger.info(f"{'=' * 70}\n") - + # Submit all jobs in this batch success_count = 0 for i, config in enumerate(batch_configs, 1): @@ -237,7 +239,9 @@ def _submit_batch(self, batch_num): self.job_mapping[config.test_id] = job_id success_count += 1 # Truncate test_id for display - display_id = config.test_id[:60] + "..." if len(config.test_id) > 60 else config.test_id + display_id = ( + config.test_id[:60] + "..." if len(config.test_id) > 60 else config.test_id + ) logger.success(f" [{i:3d}/{len(batch_configs)}] Job {job_id} <- {display_id}") else: self.job_mapping[config.test_id] = None @@ -245,22 +249,24 @@ def _submit_batch(self, batch_num): except Exception as e: self.job_mapping[config.test_id] = None logger.error(f" [{i:3d}/{len(batch_configs)}] Error: {e}") - + # Mark batch as submitted self.submitted_batches.add(batch_num) - + logger.info(f"\n{'=' * 70}") - logger.success(f"Batch {batch_num} Complete: {success_count}/{len(batch_configs)} succeeded") + logger.success( + f"Batch {batch_num} Complete: {success_count}/{len(batch_configs)} succeeded" + ) logger.info(f"{'=' * 70}\n") @pytest.fixture(scope="session") def batch_manager(request): """Provide batch manager fixture for test methods. - + This session-scoped fixture creates and initializes the BatchManager with all collected test configs. 
- + Returns: BatchManager: Initialized batch manager instance """ @@ -276,20 +282,20 @@ def batch_manager(request): batch_size = 5 else: batch_size = 5 # Default batch size - + # Create batch manager manager = BatchManager(batch_size=batch_size) - + # Extract all test configs from collected items for item in request.session.items: - if hasattr(item, 'callspec') and 'test_config' in item.callspec.params: - manager.add_config(item.callspec.params['test_config']) - + if hasattr(item, "callspec") and "test_config" in item.callspec.params: + manager.add_config(item.callspec.params["test_config"]) + # Log statistics logger.info(f"Total test configs: {len(manager.all_configs)}") if manager.batch_size: total_batches = (len(manager.all_configs) + manager.batch_size - 1) // manager.batch_size logger.info(f"Total batches: {total_batches}") logger.info("") - + return manager diff --git a/tests/integration/defs/perf/disagg/execution/subprocess_utils.py b/tests/integration/defs/perf/disagg/execution/subprocess_utils.py index 27df7f829d1..39a3f0ac4b9 100644 --- a/tests/integration/defs/perf/disagg/execution/subprocess_utils.py +++ b/tests/integration/defs/perf/disagg/execution/subprocess_utils.py @@ -33,7 +33,9 @@ def exec_cmd(*popenargs, timeout: Optional[float] = None, **kwargs) -> int: return result.returncode -def exec_cmd_with_output(*popenargs, timeout: Optional[float] = None, check: bool = True, **kwargs) -> str: +def exec_cmd_with_output( + *popenargs, timeout: Optional[float] = None, check: bool = True, **kwargs +) -> str: """Execute command and return output as string. Args: