From 7b8983a8cfff742a507000352aa7ccbe25f7afb8 Mon Sep 17 00:00:00 2001
From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
Date: Tue, 23 Dec 2025 11:24:08 +0000
Subject: [PATCH 01/13] fix slurm log path error

Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
---
 .../defs/perf/disagg/execution/executor.py    | 113 ++++++------------
 .../defs/perf/disagg/utils/common.py          |  13 --
 2 files changed, 37 insertions(+), 89 deletions(-)

diff --git a/tests/integration/defs/perf/disagg/execution/executor.py b/tests/integration/defs/perf/disagg/execution/executor.py
index 8f8f82a063d..2f4c482f052 100644
--- a/tests/integration/defs/perf/disagg/execution/executor.py
+++ b/tests/integration/defs/perf/disagg/execution/executor.py
@@ -219,7 +219,10 @@ def submit_job(test_config) -> tuple:
 
             # Call submit.py with the temporary config file
             submit_script = os.path.join(EnvManager.get_script_dir(), "submit.py")
-            cmd = ["python3", submit_script, "-c", temp_config_path]
+           
+            case_log_dir = JobManager.get_result_dir(test_config)
+
+            cmd = ["python3", submit_script, "-c", temp_config_path, "--log-dir", case_log_dir]
 
             logger.info(f"Command: {' '.join(cmd)}")
 
@@ -265,55 +268,49 @@ def backup_logs(
         Args:
             job_id: SLURM job ID
             test_config: TestConfig object
-            result_dir: Result directory path
+            result_dir: Result directory path (already named as test_id)
             is_passed: Whether the job passed
         Returns:
-            backup_dir path if successful, None otherwise
+            Final directory path if successful, None otherwise
         """
         if not os.path.exists(result_dir):
             logger.warning(f"Result directory does not exist yet: {result_dir}")
             return None
 
-        # Replace colons with hyphens for safe directory naming
-        dst_dir_name = test_config.test_id.replace(":", "-")
-        # Add ERROR suffix if the job failed
-        if not is_passed:
-            dst_dir_name = f"{dst_dir_name}_ERROR"
-        backup_dir = os.path.join(os.path.dirname(result_dir), dst_dir_name)
-
         try:
-            logger.info("Copying result directory to backup...")
-            logger.info(f"Source: {result_dir}")
-            logger.info(f"Destination: {backup_dir}")
-
-            # Remove old backup if it exists
-            if os.path.exists(backup_dir):
-                logger.warning("Backup directory already exists, removing old backup")
-                shutil.rmtree(backup_dir)
-
-            # Copy result directory
-            shutil.copytree(result_dir, backup_dir)
-            logger.success(f"Backup created successfully: {backup_dir}")
-
-            # Move temporary config file to backup directory (not copy)
+            final_dir = result_dir
+            
+            # For FAILED cases, rename directory to add _ERROR suffix
+            if not is_passed:
+                error_dir = f"{result_dir}_ERROR"
+                logger.info(f"Renaming failed case directory: {result_dir} -> {error_dir}")
+                
+                # Remove old error directory if exists
+                if os.path.exists(error_dir):
+                    logger.warning(f"Removing existing error directory: {error_dir}")
+                    shutil.rmtree(error_dir)
+                
+                # Rename to add _ERROR suffix
+                shutil.move(result_dir, error_dir)
+                final_dir = error_dir
+                logger.success(f"Directory renamed to: {final_dir}")
+            
+            # Move the temporary config file into final_dir (copy, then remove the original)
             temp_config_path = test_config.temp_config_path
             if os.path.exists(temp_config_path):
-                dest_path = os.path.join(backup_dir, os.path.basename(temp_config_path))
-                shutil.move(temp_config_path, dest_path)
-                logger.success(f"Temporary config moved to backup: {dest_path}")
+                dest_path = os.path.join(final_dir, os.path.basename(temp_config_path))
+                shutil.copy(temp_config_path, dest_path)
+                logger.success(f"Temporary config copied to: {dest_path}")
+                # Clean up the original temp config file
+                os.remove(temp_config_path)
+                logger.info(f"Cleaned up temporary config: {temp_config_path}")
             else:
-                # Fallback: copy original config if no temp file (backward compatibility)
-                case_config_path = test_config.config_path
-                if os.path.exists(case_config_path):
-                    shutil.copy(case_config_path, backup_dir)
-                    logger.success(f"Case config copied successfully: {case_config_path}")
-                else:
-                    logger.warning(f"Case config not found: {case_config_path}")
+                logger.warning(f"Temporary config not found: {temp_config_path}")
 
-            return backup_dir
+            return final_dir
 
         except Exception as e:
-            logger.warning(f"Failed to create backup copy: {e}")
+            logger.warning(f"Failed to backup logs: {e}")
             # Try to clean up temporary file on backup failure
             temp_config_path = test_config.temp_config_path
             if os.path.exists(temp_config_path):
@@ -324,26 +321,6 @@ def backup_logs(
                     logger.warning(f"Failed to cleanup temp config: {cleanup_error}")
             return None
 
-    @staticmethod
-    def cleanup_result_dir(result_dir: str) -> bool:
-        """Clean up result directory.
-
-        Args:
-            result_dir: Result directory path
-
-        Returns:
-            True if successful, False otherwise
-        """
-        if os.path.exists(result_dir):
-            try:
-                shutil.rmtree(result_dir)
-                logger.success(f"Result directory removed: {result_dir}")
-                return True
-            except Exception as e:
-                logger.warning(f"Failed to remove result directory: {e}")
-                return False
-        return True
-
     @staticmethod
     def get_result_dir(test_config) -> str:
         """Get result directory.
@@ -354,16 +331,10 @@ def get_result_dir(test_config) -> str:
         Returns:
             Result directory path
         """
-        config_data = test_config.config_data
-        fields = extract_config_fields(config_data)
-
-        # Extract fields for logging and result directory
-        log_base = fields["log_base"]
-        context_dir = fields["context_dir"]
-        log_dir_name = log_base
-
-        result_dir = os.path.join(EnvManager.get_script_dir(), log_dir_name, context_dir)
-        return result_dir
+        # Path submit_job passes as --log-dir: {output_path}/slurm_logs/{test_id with ":" -> "-"}
+        log_dir = os.path.join(EnvManager.get_output_path(), "slurm_logs")
+        case_log_dir = os.path.join(log_dir, test_config.test_id.replace(":", "-"))
+        return case_log_dir
 
     @staticmethod
     def check_result(
@@ -413,16 +384,6 @@ def check_result(
         except Exception as e:
             logger.error(f"Exception during result checking: {e}")
             check_result["error"] = f"Exception during result checking: {str(e)}"
-
-        # Clean up result directory
-        if EnvManager.get_debug_mode():
-            logger.debug(f"Debug mode: Skipping result directory cleanup: {result_dir}")
-        else:
-            try:
-                JobManager.cleanup_result_dir(result_dir)
-            except Exception as e:
-                logger.warning(f"Failed to cleanup result directory: {e}")
-
         return check_result
 
     @staticmethod
diff --git a/tests/integration/defs/perf/disagg/utils/common.py b/tests/integration/defs/perf/disagg/utils/common.py
index 622aed81ce2..6c8805e3636 100644
--- a/tests/integration/defs/perf/disagg/utils/common.py
+++ b/tests/integration/defs/perf/disagg/utils/common.py
@@ -1,7 +1,6 @@
 """Disaggregated Benchmark Configuration."""
 
 import os
-from datetime import datetime
 
 SESSION_COLLECT_CMD_TYPE = "session_collect"
 
@@ -191,15 +190,6 @@ def extract_config_fields(config_data: dict) -> dict:
     if "speculative_config" in gen_config:
         mtp_size = gen_config["speculative_config"].get("num_nextn_predict_layers", 0)
 
-    # Generate derived fields
-    dep_flag = "dep" if gen_enable_dp else "tep"
-    date_prefix = datetime.now().strftime("%Y%m%d")
-    log_base = f"{date_prefix}/{isl}-{osl}"
-    context_dir = (
-        f"disagg_ctx{ctx_num}_gen{gen_num}_{dep_flag}{gen_tp_size}_"
-        f"batch{gen_batch_size}_eplb{eplb_slots}_mtp{mtp_size}"
-    )
-
     return {
         "isl": isl,
         "osl": osl,
@@ -210,10 +200,7 @@ def extract_config_fields(config_data: dict) -> dict:
         "gen_enable_dp": gen_enable_dp,
         "eplb_slots": eplb_slots,
         "mtp_size": mtp_size,
-        "dep_flag": dep_flag,
         "cache_transceiver_backend": cache_transceiver_backend,
-        "log_base": log_base,
-        "context_dir": context_dir,
         "gen_max_tokens": gen_max_tokens,
         "gen_max_batch_size": gen_max_batch_size,
         "streaming": streaming,

From db33cbce4a00261d9ee8cc81688484412cb355be Mon Sep 17 00:00:00 2001
From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
Date: Wed, 24 Dec 2025 02:45:00 +0000
Subject: [PATCH 02/13] fix default backend issue - adapt for the current logic
 - skip single backend result for comparison

Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
---
 .../defs/perf/disagg/compare_backends.py      | 37 ++++++++++++++++++-
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/tests/integration/defs/perf/disagg/compare_backends.py b/tests/integration/defs/perf/disagg/compare_backends.py
index 1812fd36d59..8ff9c1d3631 100644
--- a/tests/integration/defs/perf/disagg/compare_backends.py
+++ b/tests/integration/defs/perf/disagg/compare_backends.py
@@ -12,8 +12,10 @@
 def extract_backend(test_name):
     """Extract backend type from test_name.
 
-    New format: ccb-NIXL or ccb-UCX
+    New format: ccb-NIXL or ccb-UCX or ccb-DEFAULT
     Example: disagg_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL
+    
+    Note: "DEFAULT" is a placeholder; compare_backends() maps it to its default_backend argument
     """
     match = re.search(r"ccb-(\w+)", test_name)
     return match.group(1) if match else None
@@ -41,6 +43,7 @@ def compare_backends(csv_path, threshold=5.0, default_backend="NIXL"):
         csv_path: CSV file path
         threshold: Performance difference threshold (percentage)
         default_backend: DEFAULT backend name (currently NIXL, may switch in the future)
+                        Cases marked as "ccb-DEFAULT" will be treated as this backend
 
     Returns:
         DataFrame: Comparison results
@@ -71,20 +74,37 @@ def compare_backends(csv_path, threshold=5.0, default_backend="NIXL"):
     df["backend"] = df["test_name"].apply(extract_backend)
     df["base_case_name"] = df["test_name"].apply(extract_base_case_name)
 
+    # Normalize "DEFAULT" backend to the actual default_backend value
+    # This allows cases marked as "ccb-DEFAULT" to be treated as the default backend
+    df["backend"] = df["backend"].apply(
+        lambda x: default_backend if x and x.upper() == "DEFAULT" else x
+    )
+
     # Group by base_case_name and metric_type
     grouped = df.groupby(["base_case_name", "metric_type"])
 
     results = []
+    comparison_pairs = 0
+    single_backend_skipped = 0
 
     for (base_case, metric_type), group in grouped:
         # Get DEFAULT backend and UCX data
         default_data = group[group["backend"] == default_backend]
         ucx_data = group[group["backend"] == "UCX"]
 
-        # If both have no data, skip (this case may not exist)
+        # Skip if both have no data (this case may not exist)
         if len(default_data) == 0 and len(ucx_data) == 0:
             continue
 
+        # Skip single-backend cases (only has one backend, not a comparison pair)
+        # This happens when a test case only runs on one backend
+        if len(default_data) == 0 or len(ucx_data) == 0:
+            single_backend_skipped += 1
+            continue
+
+        # This is a valid comparison pair
+        comparison_pairs += 1
+
         # Extract values and original test names
         default_value = default_data["perf_metric"].values[0] if len(default_data) > 0 else None
         default_original_name = (
@@ -137,6 +157,19 @@ def compare_backends(csv_path, threshold=5.0, default_backend="NIXL"):
             }
         )
 
+    # Print statistics
+    print(f"\n=== Backend Comparison Statistics ===")
+    print(f"Default backend: {default_backend}")
+    print(f"Comparison pairs: {comparison_pairs}")
+    print(f"Single-backend cases (skipped): {single_backend_skipped}")
+    print("=" * 37)
+
+    # If no comparison pairs found, exit with success
+    if comparison_pairs == 0:
+        print(f"\nInfo: No backend comparison pairs found in disagg_perf tests")
+        print(f"All cases are single-backend only, no comparison needed")
+        sys.exit(0)
+
     # Convert to DataFrame
     result_df = pd.DataFrame(results)
 

From e6f88b48a1fcde69a9e05c7355805614c37dbfae Mon Sep 17 00:00:00 2001
From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
Date: Wed, 31 Dec 2025 05:32:36 +0000
Subject: [PATCH 03/13] add deepseek-v32-fp4 wideep perf configs; fix r1 1k1k dep48 metadata

Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
---
 ...1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml |   5 +-
 ...gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml | 110 ++++++++++++++++
 ...en1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml | 116 +++++++++++++++++
 ...1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml | 110 ++++++++++++++++
 ..._dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml | 117 ++++++++++++++++++
 ...gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml | 110 ++++++++++++++++
 ...gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml | 116 +++++++++++++++++
 .../defs/perf/disagg/testlist/all.txt         |   6 +
 .../defs/perf/disagg/testlist/wideep.txt      |   6 +
 9 files changed, 694 insertions(+), 2 deletions(-)
 create mode 100644 tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml
 create mode 100644 tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml
 create mode 100644 tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml
 create mode 100644 tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml
 create mode 100644 tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml
 create mode 100644 tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml

diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml
index cfcbf66c8df..d2f81b865ed 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml
@@ -7,8 +7,9 @@ metadata:
   - GB200
   - GB300
   script_file: disaggr_torch.slurm
-  benchmark_type: 8k1k
-  dataset_file: disagg_datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
+  benchmark_type: 1k1k
+  config_index: 7
+  dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
 slurm:
   script_file: disaggr_torch.slurm
   partition: <partition>
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml
new file mode 100644
index 00000000000..451a995e303
--- /dev/null
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml
@@ -0,0 +1,110 @@
+metadata:
+  model_name: deepseek-v32-fp4
+  precision: fp4
+  model_dir_name: DeepSeek-V3.2-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+  config_index: 1
+  dataset_file: disagg_datasets/deepseek-v32-1024-1024-200000-ratio-1_for_serve.json
+slurm:
+  script_file: disaggr_torch.slurm
+  partition: <partition>
+  account: <account>
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: "--gres=gpu:4"
+  numa_bind: true
+benchmark:
+  mode: gen_only
+  use_nv_sa_benchmark: false
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '1075'
+  input_length: 1024
+  output_length: 1024
+  dataset_file: <dataset_file>
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 1
+environment:
+  container_mount: <container_mount>
+  container_image: <container_image>
+  model_path: <model_path>
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir: <full_path_to_work_dir>
+  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
+  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+  model: local-completions
+  tasks: gsm8k
+  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
+worker_config:
+  gen:
+    enable_layerwise_nvtx_marker: true
+    tensor_parallel_size: 32
+    moe_expert_parallel_size: 32
+    enable_attention_dp: true
+    enable_lm_head_tp_in_adp: true
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 32
+    max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.9
+      dtype: fp8
+    moe_config:
+      backend: WIDEEP
+      load_balancer:
+        num_slots: 288
+        layer_updates_per_iter: 1
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+  ctx:
+    enable_layerwise_nvtx_marker: true
+    max_batch_size: 4
+    max_num_tokens: 4608
+    max_seq_len: 2251
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml
new file mode 100644
index 00000000000..f67ff56f88a
--- /dev/null
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml
@@ -0,0 +1,116 @@
+metadata:
+  model_name: deepseek-v32-fp4
+  precision: fp4
+  model_dir_name: DeepSeek-V3.2-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+  config_index: 0
+  dataset_file: disagg_datasets/deepseek-v32-1024-1024-200000-ratio-1_for_serve.json
+slurm:
+  script_file: disaggr_torch.slurm
+  partition: <partition>
+  account: <account>
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: "--gres=gpu:4"
+  numa_bind: true
+benchmark:
+  mode: gen_only
+  use_nv_sa_benchmark: false
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '2150'
+  input_length: 1024
+  output_length: 1024
+  dataset_file: <dataset_file>
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 2
+  num_gen_servers: 1
+environment:
+  container_mount: <container_mount>
+  container_image: <container_image>
+  model_path: <model_path>
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir: <full_path_to_work_dir>
+  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
+  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+  model: local-completions
+  tasks: gsm8k
+  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
+worker_config:
+  gen:
+    enable_layerwise_nvtx_marker: true
+    tensor_parallel_size: 16
+    moe_expert_parallel_size: 16
+    enable_attention_dp: true
+    enable_lm_head_tp_in_adp: true
+    pipeline_parallel_size: 1
+    max_batch_size: 128
+    max_num_tokens: 512
+    max_seq_len: 2251
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.9
+      dtype: fp8
+    moe_config:
+      backend: WIDEEP
+      load_balancer:
+        num_slots: 288
+        layer_updates_per_iter: 1
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
+  ctx:
+    enable_layerwise_nvtx_marker: true
+    max_batch_size: 4
+    max_num_tokens: 4608
+    max_seq_len: 2251
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 4608
+      backend: NIXL
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml
new file mode 100644
index 00000000000..76f4f78276c
--- /dev/null
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml
@@ -0,0 +1,110 @@
+# nvbugs: 5422621
+metadata:
+  model_name: deepseek-v32-fp4
+  precision: fp4
+  model_dir_name: DeepSeek-V3.2-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+  config_index: 7
+  dataset_file: disagg_datasets/deepseek-v32-1024-1024-200000-ratio-1_for_serve.json
+slurm:
+  script_file: disaggr_torch.slurm
+  partition: <partition>
+  account: <account>
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: "--gres=gpu:4"
+  numa_bind: true
+benchmark:
+  mode: gen_only
+  use_nv_sa_benchmark: false
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '12288'
+  input_length: 1024
+  output_length: 1024
+  dataset_file: <dataset_file>
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 2
+  num_gen_servers: 1
+environment:
+  container_mount: <container_mount>
+  container_image: <container_image>
+  model_path: <model_path>
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir: <full_path_to_work_dir>
+  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
+  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+  model: local-completions
+  tasks: gsm8k
+  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
+worker_config:
+  gen:
+    enable_layerwise_nvtx_marker: true
+    tensor_parallel_size: 48
+    moe_expert_parallel_size: 48
+    enable_attention_dp: true
+    enable_lm_head_tp_in_adp: true
+    pipeline_parallel_size: 1
+    max_batch_size: 1024
+    max_num_tokens: 1024
+    max_seq_len: 2176
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: WIDEEP
+      load_balancer:
+        num_slots: 288
+        layer_updates_per_iter: 1
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8320
+      backend: DEFAULT
+    stream_interval: 20
+  ctx:
+    enable_layerwise_nvtx_marker: true
+    max_batch_size: 4
+    max_num_tokens: 4480
+    max_seq_len: 2176
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8320
+      backend: DEFAULT
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml
new file mode 100644
index 00000000000..4a91160a99b
--- /dev/null
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml
@@ -0,0 +1,117 @@
+metadata:
+  model_name: deepseek-v32-fp4
+  precision: fp4
+  model_dir_name: DeepSeek-V3.2-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+  config_index: 14
+  dataset_file: disagg_datasets/deepseek-v32-8192-1024-200000-ratio-1_for_serve.json
+slurm:
+  script_file: disaggr_torch.slurm
+  partition: <partition>
+  account: <account>
+  job_time: 02:00:00
+  job_name: disaggr-test
+  extra_args: "--gres=gpu:4"
+  numa_bind: true
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 2
+  num_gen_servers: 1
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: false
+  multi_round: 1
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '1024'
+  input_length: 8192
+  output_length: 1024
+  dataset_file: <dataset_file>
+environment:
+  container_mount: <container_mount>
+  container_image: <container_image>
+  model_path: <model_path>
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir: <full_path_to_work_dir>
+  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
+  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+  model: local-completions
+  tasks: gsm8k
+  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
+worker_config:
+  gen:
+    enable_layerwise_nvtx_marker: true
+    tensor_parallel_size: 32
+    moe_expert_parallel_size: 32
+    enable_attention_dp: true
+    enable_lm_head_tp_in_adp: true
+    pipeline_parallel_size: 1
+    max_batch_size: 128
+    max_num_tokens: 512
+    max_seq_len: 9423
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+      - 128
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.6
+      dtype: fp8
+    moe_config:
+      backend: WIDEEP
+      load_balancer:
+        num_slots: 288
+        layer_updates_per_iter: 1
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: DEFAULT
+    stream_interval: 20
+    num_postprocess_workers: 4
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
+  ctx:
+    enable_layerwise_nvtx_marker: true
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9423
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: DEFAULT
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml
new file mode 100644
index 00000000000..4d3a716c675
--- /dev/null
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml
@@ -0,0 +1,110 @@
+metadata:
+  model_name: deepseek-v32-fp4
+  precision: fp4
+  model_dir_name: DeepSeek-V3.2-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+  config_index: 5
+  dataset_file: disagg_datasets/deepseek-v32-8192-1024-200000-ratio-1_for_serve.json
+slurm:
+  script_file: disaggr_torch.slurm
+  partition: <partition>
+  account: <account>
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: "--gres=gpu:4"
+  numa_bind: true
+benchmark:
+  mode: gen_only
+  use_nv_sa_benchmark: false
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '1075'
+  input_length: 8192
+  output_length: 1024
+  dataset_file: <dataset_file>
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 6
+  num_gen_servers: 1
+environment:
+  container_mount: <container_mount>
+  container_image: <container_image>
+  model_path: <model_path>
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir: <full_path_to_work_dir>
+  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
+  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+  model: local-completions
+  tasks: gsm8k
+  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
+worker_config:
+  gen:
+    enable_layerwise_nvtx_marker: true
+    tensor_parallel_size: 16
+    moe_expert_parallel_size: 16
+    enable_attention_dp: true
+    enable_lm_head_tp_in_adp: true
+    pipeline_parallel_size: 1
+    max_batch_size: 64
+    max_num_tokens: 64
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: WIDEEP
+      load_balancer:
+        num_slots: 288
+        layer_updates_per_iter: 1
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+  ctx:
+    enable_layerwise_nvtx_marker: true
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml
new file mode 100644
index 00000000000..441aebf189c
--- /dev/null
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml
@@ -0,0 +1,116 @@
+metadata:
+  model_name: deepseek-v32-fp4
+  precision: fp4
+  model_dir_name: DeepSeek-V3.2-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 8k1k
+  config_index: 4
+  dataset_file: disagg_datasets/deepseek-v32-8192-1024-200000-ratio-1_for_serve.json
+slurm:
+  script_file: disaggr_torch.slurm
+  partition: <partition>
+  account: <account>
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: "--gres=gpu:4"
+  numa_bind: true
+benchmark:
+  mode: gen_only
+  use_nv_sa_benchmark: false
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '538'
+  input_length: 8192
+  output_length: 1024
+  dataset_file: <dataset_file>
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 8
+  num_gen_servers: 1
+environment:
+  container_mount: <container_mount>
+  container_image: <container_image>
+  model_path: <model_path>
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir: <full_path_to_work_dir>
+  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
+  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+  model: local-completions
+  tasks: gsm8k
+  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
+worker_config:
+  gen:
+    enable_layerwise_nvtx_marker: true
+    tensor_parallel_size: 32
+    moe_expert_parallel_size: 32
+    enable_attention_dp: true
+    enable_lm_head_tp_in_adp: true
+    pipeline_parallel_size: 1
+    max_batch_size: 16
+    max_num_tokens: 64
+    max_seq_len: 9419
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: WIDEEP
+      load_balancer:
+        num_slots: 288
+        layer_updates_per_iter: 1
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
+    stream_interval: 20
+    num_postprocess_workers: 4
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
+  ctx:
+    enable_layerwise_nvtx_marker: true
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 9419
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: NIXL
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
diff --git a/tests/integration/defs/perf/disagg/testlist/all.txt b/tests/integration/defs/perf/disagg/testlist/all.txt
index dd2d14b5acf..da40a0f46d8 100644
--- a/tests/integration/defs/perf/disagg/testlist/all.txt
+++ b/tests/integration/defs/perf/disagg/testlist/all.txt
@@ -77,6 +77,12 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_
 test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT]
+test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT]
+test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL]
+test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL]
+test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL]
+test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL]
+test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_ccb-NIXL]
 # test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL]
diff --git a/tests/integration/defs/perf/disagg/testlist/wideep.txt b/tests/integration/defs/perf/disagg/testlist/wideep.txt
index baee1d5a10f..4a599a964e6 100644
--- a/tests/integration/defs/perf/disagg/testlist/wideep.txt
+++ b/tests/integration/defs/perf/disagg/testlist/wideep.txt
@@ -8,6 +8,12 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_
 test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT]
+test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT]
+test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL]
+test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL]
+test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL]
+test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL]
+test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_ccb-NIXL]
 # test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL]

From 414b91af7f017c70b52d1d1b50c924e420f31f56 Mon Sep 17 00:00:00 2001
From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
Date: Wed, 24 Dec 2025 09:14:31 +0000
Subject: [PATCH 04/13] add stress test cases here

Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
---
 .../defs/perf/disagg/execution/executor.py    |  58 ++-
 .../test_configs/disagg/stress/README.md      | 480 ++++++++++++++++++
 .../defs/perf/disagg/test_disagg.py           |  90 +++-
 .../defs/perf/disagg/utils/config_loader.py   |  43 +-
 4 files changed, 664 insertions(+), 7 deletions(-)
 create mode 100644 tests/integration/defs/perf/disagg/test_configs/disagg/stress/README.md

diff --git a/tests/integration/defs/perf/disagg/execution/executor.py b/tests/integration/defs/perf/disagg/execution/executor.py
index 2f4c482f052..8e717c50ce8 100644
--- a/tests/integration/defs/perf/disagg/execution/executor.py
+++ b/tests/integration/defs/perf/disagg/execution/executor.py
@@ -756,11 +756,11 @@ def _check_job_result(
 
         Args:
             job_id: SLURM job ID
-            test_category: Test category ("perf" or "accuracy")
+            test_category: Test category ("perf", "accuracy", or "stress")
             benchmark_type: Benchmark type (1k1k, 8k1k, etc.)
             config: Configuration dict (YAML data)
             metrics_config: MetricsConfig object (default or custom)
-            accuracy_config: AccuracyConfig object (required for accuracy tests)
+            accuracy_config: AccuracyConfig object (required for accuracy and stress tests)
             model_name: Model name
             result_dir: Result directory
             timestamps: Optional timestamps dict
@@ -768,6 +768,7 @@ def _check_job_result(
 
         Returns:
             Dict with success status and details
+            For stress tests, includes both perf and accuracy results
         """
         logger.info(f"Checking result directory: {result_dir}")
 
@@ -776,12 +777,63 @@ def _check_job_result(
 
         # Route based on test_category
         if test_category == "accuracy":
+            # Use metrics config from accuracy_config (defaults to _COMMON_ACCURACY_METRICS)
+            accuracy_metrics = accuracy_config.get_metrics_config()
             return JobManager._check_accuracy_result(
                 job_id=job_id,
-                metrics_config=metrics_config,
+                metrics_config=accuracy_metrics,
                 accuracy_config=accuracy_config,
                 result_dir=result_dir,
             )
+        elif test_category == "stress":
+            # Stress tests combine both perf and accuracy validation
+            # First check performance and write CSV
+            perf_result = JobManager._check_perf_result(
+                job_id=job_id,
+                benchmark_type=benchmark_type,
+                config=config,
+                metrics_config=metrics_config,
+                model_name=model_name,
+                result_dir=result_dir,
+                timestamps=timestamps,
+                test_name=test_name,
+            )
+            
+            # If perf check failed, return immediately
+            if not perf_result.get("success", False):
+                return perf_result
+            
+            # Then check accuracy if accuracy_config is provided
+            if accuracy_config:
+                # Use metrics config from accuracy_config (defaults to _COMMON_ACCURACY_METRICS)
+                accuracy_metrics = accuracy_config.get_metrics_config()
+                
+                accuracy_result = JobManager._check_accuracy_result(
+                    job_id=job_id,
+                    metrics_config=accuracy_metrics,
+                    accuracy_config=accuracy_config,
+                    result_dir=result_dir,
+                )
+                
+                # If accuracy check failed, merge results and return
+                if not accuracy_result.get("success", False):
+                    return {
+                        **perf_result,
+                        "success": False,
+                        "accuracy_result": accuracy_result,
+                        "error": f"Perf passed but accuracy failed: {accuracy_result.get('error', 'Unknown')}",
+                    }
+                
+                # Both passed, merge results
+                return {
+                    **perf_result,
+                    "accuracy_result": accuracy_result,
+                    "success": True,
+                }
+            else:
+                # No accuracy config, just return perf result
+                logger.warning("Stress test has no accuracy_config, only perf validation performed")
+                return perf_result
         else:  # perf
             return JobManager._check_perf_result(
                 job_id=job_id,
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/stress/README.md b/tests/integration/defs/perf/disagg/test_configs/disagg/stress/README.md
new file mode 100644
index 00000000000..528c8e33e8b
--- /dev/null
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/stress/README.md
@@ -0,0 +1,480 @@
+# Disaggregated Stress Tests
+
+## Purpose
+
+Stress tests combine **performance benchmarking** and **accuracy validation** in a single test run. They are designed to:
+
+- Validate performance under high load/stress conditions
+- Ensure accuracy is maintained while pushing system limits
+- Write performance metrics to CSV (same as `perf` tests)
+- Validate accuracy against expected thresholds (e.g., GSM8K, MMLU)
+
+Test name prefix: `disagg_stress_*`
+
+---
+
+## Quick Start
+
+```bash
+# 1. Copy the example template
+cp EXAMPLE_deepseek-r1-fp4_1k1k_stress_gsm8k.yaml \
+   your_model_1k1k_stress_gsm8k.yaml
+
+# 2. Edit the configuration (see Field Reference below)
+
+# 3. Run the test
+cd /path/to/tests/integration/defs/perf/disagg/
+poetry run pytest --disagg test_disagg.py -s -vv -m stress
+```
+
+---
+
+## Configuration Template
+
+### Minimal Template
+
+```yaml
+metadata:
+  model_name: your-model-name
+  precision: fp8
+  model_dir_name: YourModelDir
+  supported_gpus: [GB200, GB300]
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+  config_index: 0
+  
+  # Accuracy configuration (required for stress tests)
+  accuracy:
+    datasets:
+    - name: gsm8k
+      expected_value: 0.85
+      threshold_type: hypothesis_test
+      filter_type: flexible-extract
+
+slurm:
+  script_file: disaggr_torch.slurm
+  partition: <partition>
+  account: <account>
+  job_time: 04:00:00
+  job_name: stress-benchmark
+  extra_args: "--gres=gpu:4"
+  numa_bind: true
+
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: 1 2 4 8 16 32
+  input_length: 1024
+  output_length: 1024
+  dataset_file: <dataset_file>
+
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 4
+
+environment:
+  container_mount: <container_mount>
+  container_image: <container_image>
+  model_path: <model_path>
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir: <full_path_to_work_dir>
+  worker_env_var: "TLLM_LOG_LEVEL=INFO ..."
+  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
+
+profiling:
+  nsys_on: false
+
+# Enable accuracy evaluation (required for stress tests)
+accuracy:
+  enable_accuracy_test: true
+  model: local-completions
+  tasks: gsm8k
+  model_args_extra: num_concurrent=512,max_retries=3,timeout=1200
+
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    max_batch_size: 32
+    max_num_tokens: 128
+    # ... other gen worker configs
+  ctx:
+    tensor_parallel_size: 4
+    max_batch_size: 4
+    max_num_tokens: 4608
+    # ... other ctx worker configs
+```
+
+---
+
+## Field Reference
+
+### 1. `metadata` Section
+
+#### Required Fields
+
+| Field | Type | Description | Example |
+|-------|------|-------------|---------|
+| `model_name` | string | Model identifier | `deepseek-r1-fp4` |
+| `precision` | string | Model precision | `fp8`, `fp4`, `int8` |
+| `model_dir_name` | string | Model directory name | `DeepSeek-R1-0528-FP4-v2` |
+| `supported_gpus` | list | GPU types supported | `[GB200, GB300]` |
+| `script_file` | string | SLURM script to use | `disaggr_torch.slurm` |
+| `benchmark_type` | string | Benchmark configuration | `1k1k`, `8k1k`, etc. |
+| `config_index` | int | Configuration index | `0`, `1`, etc. |
+
+#### Accuracy Configuration (Required for Stress Tests)
+
+```yaml
+metadata:
+  accuracy:
+    datasets:
+    - name: gsm8k                          # Dataset name
+      expected_value: 0.85                 # Expected accuracy (0.0-1.0)
+      threshold_type: hypothesis_test      # "hypothesis_test" or "absolute"
+      filter_type: flexible-extract        # "flexible-extract" or "strict-match"
+      
+      # Optional: Hypothesis testing parameters
+      alpha: 0.05                          # Type I error rate (default: 0.05)
+      beta: 0.20                           # Type II error rate (default: 0.20)
+      sigma: 0.05                          # Standard deviation (default: 0.05)
+      num_samples: 100                     # Number of samples (default: 100)
+      higher_is_better: true               # Direction (default: true)
+    
+    # Optional: Custom accuracy metrics parsing
+    # metrics:
+    #   log_file: "7_accuracy_eval.log"
+    #   extractor_pattern: '\|...\|'
+    #   metric_names: [flexible-extract, strict-match]
+```
+
+**Threshold Types:**
+- `hypothesis_test`: Statistical hypothesis testing (recommended)
+- `absolute`: Simple threshold comparison
+
+**Filter Types:**
+- `flexible-extract`: More lenient matching
+- `strict-match`: Exact matching required
+
+---
+
+### 2. `slurm` Section
+
+| Field | Type | Description | Recommended |
+|-------|------|-------------|-------------|
+| `partition` | string | SLURM partition | Your cluster partition |
+| `account` | string | SLURM account | Your cluster account |
+| `job_time` | string | Maximum job time | `04:00:00` (4 hours) |
+| `job_name` | string | Job name | `stress-benchmark` |
+| `extra_args` | string | Extra SLURM args | `"--gres=gpu:4"` |
+| `numa_bind` | bool | Enable NUMA binding | `true` |
+
+---
+
+### 3. `benchmark` Section
+
+| Field | Type | Description | Example |
+|-------|------|-------------|---------|
+| `mode` | string | Benchmark mode | `e2e` |
+| `use_nv_sa_benchmark` | bool | Use NV benchmark | `true` |
+| `multi_round` | int | Rounds per concurrency | `8` |
+| `benchmark_ratio` | float | Benchmark ratio | `0.8` |
+| `streaming` | bool | Enable streaming | `true` |
+| `concurrency_list` | string | Concurrency levels | `1 2 4 8 16 32` |
+| `input_length` | int | Input token length | `1024` |
+| `output_length` | int | Output token length | `1024` |
+| `dataset_file` | string | Dataset file path | `<dataset_file>` |
+
+**Tip:** Add higher values to `concurrency_list` for more stress (e.g., `1 2 4 8 16 32 64 128`)
+
+---
+
+### 4. `hardware` Section
+
+| Field | Type | Description | Example |
+|-------|------|-------------|---------|
+| `gpus_per_node` | int | GPUs per node | `4` |
+| `num_ctx_servers` | int | Context servers | `1` |
+| `num_gen_servers` | int | Generation servers | `4` |
+
+---
+
+### 5. `accuracy` Section (SLURM Script Config)
+
+**Note:** This is different from `metadata.accuracy`. This section is used by the SLURM script to run `lm-evaluation-harness`.
+
+| Field | Type | Description | Example |
+|-------|------|-------------|---------|
+| `enable_accuracy_test` | bool | Enable accuracy eval | `true` (required) |
+| `model` | string | Model type | `local-completions` |
+| `tasks` | string | Eval tasks | `gsm8k`, `mmlu`, `humaneval` |
+| `model_args_extra` | string | Extra arguments | See below |
+
+**Common `model_args_extra` parameters:**
+```
+num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
+```
+
+---
+
+### 6. `worker_config` Section
+
+Configure generation and context workers. See [main README](../../README.md) for detailed worker configuration options.
+
+**Key parameters:**
+- `tensor_parallel_size`: TP parallelism
+- `max_batch_size`: Maximum batch size
+- `max_num_tokens`: Maximum tokens per batch
+- `max_seq_len`: Maximum sequence length
+
+---
+
+## Test Execution Flow
+
+```
+1. Configuration Validation
+   ↓
+2. SLURM Job Submission
+   ↓
+3. Performance Benchmark
+   - Runs benchmark with specified concurrency levels
+   - Generates: 6_bench.log
+   ↓
+4. Accuracy Evaluation
+   - Runs lm-evaluation-harness
+   - Generates: 7_accuracy_eval.log
+   ↓
+5. Result Validation
+   - Parse performance metrics → Write to CSV
+   - Parse accuracy results → Validate against thresholds
+   ↓
+6. Pass/Fail Decision
+   - PASS: Both performance and accuracy checks pass
+   - FAIL: Either performance or accuracy fails
+```
+
+---
+
+## Output Files
+
+### Log Directory
+```
+{OUTPUT_PATH}/slurm_logs/disagg_stress_{test_id}/
+├── config.yaml                 # Test configuration copy
+├── 6_bench.log                 # Performance benchmark log
+├── 7_accuracy_eval.log         # Accuracy evaluation log
+├── output_gen_*.log            # Generation worker logs
+├── output_ctx_*.log            # Context worker logs
+└── slurm-{job_id}.out          # SLURM output
+```
+
+### CSV Output
+```
+{OUTPUT_PATH}/perf_script_test_results.csv
+```
+
+Performance metrics are written to the same CSV as `perf` tests, with `test_name` prefix `disagg_stress_*`.
+
+### Failed Test Directories
+Directories of failed tests are automatically renamed with an `_ERROR` suffix:
+```
+disagg_stress_{test_id}_ERROR/
+```
+
+---
+
+## Supported Accuracy Datasets
+
+| Dataset | Task | Description |
+|---------|------|-------------|
+| `gsm8k` | Math reasoning | Grade school math problems |
+| `mmlu` | Knowledge | Multi-domain multiple choice |
+| `humaneval` | Coding | Python code generation |
+| `hellaswag` | Reasoning | Commonsense reasoning |
+
+See [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) for full list.
+
+---
+
+## Common Pitfalls
+
+### 1. Missing Accuracy Config
+
+**Symptom:** The warning `Stress test has no accuracy_config, only perf validation performed` is logged and accuracy is not validated
+
+**Solution:** Ensure both accuracy sections are present:
+```yaml
+metadata:
+  accuracy:
+    datasets: [...]  # For validation framework
+
+accuracy:
+  enable_accuracy_test: true  # For SLURM script
+```
+
+### 2. Timeout Issues
+
+**Error:** Job times out before completion
+
+**Solution:** Increase `job_time`:
+```yaml
+slurm:
+  job_time: 06:00:00  # 6 hours for larger models
+```
+
+### 3. Accuracy Threshold Too High
+
+**Error:** Accuracy test fails but model performance is reasonable
+
+**Solution:** Adjust `expected_value` or use `hypothesis_test`:
+```yaml
+metadata:
+  accuracy:
+    datasets:
+    - expected_value: 0.80  # Lower threshold
+      threshold_type: hypothesis_test  # Statistical test instead of a fixed cutoff
+```
+
+---
+
+## Advanced Usage
+
+### Custom Accuracy Metrics Parsing
+
+Override default accuracy log parsing:
+
+```yaml
+metadata:
+  accuracy:
+    datasets:
+    - name: gsm8k
+      expected_value: 0.85
+    
+    metrics:
+      log_file: "custom_accuracy.log"
+      extractor_pattern: '\|custom_pattern\|'
+      metric_names: [custom_metric_1, custom_metric_2]
+```
+
+### Multiple Datasets
+
+Test multiple accuracy benchmarks:
+
+```yaml
+metadata:
+  accuracy:
+    datasets:
+    - name: gsm8k
+      expected_value: 0.85
+      threshold_type: hypothesis_test
+    - name: mmlu
+      expected_value: 0.75
+      threshold_type: absolute
+```
+
+---
+
+## Running Tests
+
+### Run All Stress Tests
+```bash
+poetry run pytest --disagg test_disagg.py -s -vv -m stress
+```
+
+### Run Specific Test
+```bash
+poetry run pytest --disagg test_disagg.py -s -vv -k "your_model_1k1k_stress"
+```
+
+### Run from Test List
+```bash
+echo "disagg_stress_your_model_1k1k_stress_gsm8k" > testlist/stress.txt
+poetry run pytest --disagg test_disagg.py -s -vv --disagg-test-list=./testlist/stress.txt
+```
+
+---
+
+## Naming Convention
+
+Format: `{model}_{benchmark_type}_{config_details}_stress_{dataset}.yaml`
+
+Examples:
+- `deepseek-r1-fp4_1k1k_ctx1_gen4_stress_gsm8k.yaml`
+- `llama3-8b_8k1k_ctx2_gen2_stress_mmlu.yaml`
+- `qwen3-235b_1k1k_ctx1_gen1_stress_humaneval.yaml`
+
+---
+
+## Comparison with Other Test Types
+
+| Feature | perf | accuracy | stress |
+|---------|------|----------|--------|
+| Performance Metrics | ✅ | ❌ | ✅ |
+| CSV Output | ✅ | ❌ | ✅ |
+| Accuracy Validation | ❌ | ✅ | ✅ |
+| Default Timeout | 2h | 3h | 4h |
+| Use Case | Performance only | Accuracy only | Both |
+
+---
+
+## Troubleshooting
+
+### Check Test Status
+```bash
+# View SLURM jobs
+squeue -u $USER
+
+# Check logs
+tail -f {OUTPUT_PATH}/slurm_logs/disagg_stress_{test_id}/slurm-*.out
+```
+
+### Debug Mode
+```bash
+export DEBUG_MODE=1
+export DEBUG_JOB_ID=12345
+
+poetry run pytest --disagg test_disagg.py -s -vv -k "your_test"
+```
+
+### View Results
+```bash
+# Performance CSV
+cat {OUTPUT_PATH}/perf_script_test_results.csv
+
+# Accuracy log
+cat {OUTPUT_PATH}/slurm_logs/disagg_stress_{test_id}/7_accuracy_eval.log
+```
+
+---
+
+## Best Practices
+
+1. **Start Conservative:** Begin with lower concurrency and shorter job times
+2. **Monitor Resources:** Check GPU memory and CPU usage during stress tests
+3. **Baseline First:** Run `perf` and `accuracy` tests separately before `stress`
+4. **Document Results:** Keep records of thresholds and performance baselines
+5. **Iterate:** Gradually increase stress (concurrency, sequence length) until failure
+
+---
+
+## Related Documentation
+
+- [Main README](../../README.md) - General test framework documentation
+- [Example Config](EXAMPLE_deepseek-r1-fp4_1k1k_stress_gsm8k.yaml) - Full example configuration
+- [Config Loader](../../utils/config_loader.py) - Configuration loading logic
+- [Executor](../../execution/executor.py) - Test execution logic
+
+---
+
+## Support
+
+For issues or questions:
+1. Check logs in `{OUTPUT_PATH}/slurm_logs/disagg_stress_{test_id}/`
+2. Review configuration against this README
+3. Compare with `EXAMPLE_deepseek-r1-fp4_1k1k_stress_gsm8k.yaml`
+4. Contact your team's test infrastructure maintainer
+
diff --git a/tests/integration/defs/perf/disagg/test_disagg.py b/tests/integration/defs/perf/disagg/test_disagg.py
index ff71446bb1e..9c4f87679c0 100644
--- a/tests/integration/defs/perf/disagg/test_disagg.py
+++ b/tests/integration/defs/perf/disagg/test_disagg.py
@@ -14,13 +14,15 @@
 config_loader = ConfigLoader(base_dir=CONFIG_BASE_DIR)
 ALL_TEST_CONFIGS = config_loader.scan_configs()
 
-# Separate performance and accuracy test configurations
+# Separate performance, accuracy, and stress test configurations
 PERF_TEST_CONFIGS = [c for c in ALL_TEST_CONFIGS if c.test_category == "perf"]
 ACCURACY_TEST_CONFIGS = [c for c in ALL_TEST_CONFIGS if c.test_category == "accuracy"]
+STRESS_TEST_CONFIGS = [c for c in ALL_TEST_CONFIGS if c.test_category == "stress"]
 
 # Convert to pytest parameters
 PERF_TEST_CASES = [pytest.param(config, id=config.test_id) for config in PERF_TEST_CONFIGS]
 ACCURACY_TEST_CASES = [pytest.param(config, id=config.test_id) for config in ACCURACY_TEST_CONFIGS]
+STRESS_TEST_CASES = [pytest.param(config, id=config.test_id) for config in STRESS_TEST_CONFIGS]
 
 # Flag to track if session end has been called
 _session_ended = False
@@ -212,6 +214,92 @@ def test_accuracy(self, request, test_config: TestConfig):
                 except Exception as backup_error:
                     logger.error(f"Failed to backup logs: {backup_error}")
 
+    @pytest.mark.stress
+    @pytest.mark.parametrize("test_config", STRESS_TEST_CASES)
+    def test_stress(self, request, test_config: TestConfig):
+        """Stress test combining performance benchmarks and accuracy validation.
+        
+        This test type is designed for stress testing scenarios where both 
+        performance metrics (CSV output) and accuracy (e.g., GSM8K) need to be validated.
+        """
+        full_test_name = request.node.name
+
+        # Validate configuration first (before any other operations)
+        try:
+            ConfigValidator.validate_test_config(test_config)
+        except Exception as e:
+            pytest.fail(f"Configuration validation failed: {e}")
+
+        # Create test case tracker
+        test_tracker = TestCaseTracker()
+        test_case_name = test_config.test_id
+
+        # Start tracking test case
+        test_tracker.start_test_case(test_case_name)
+
+        job_id = None
+        result = None
+
+        try:
+            logger.info(f"\n{'=' * 60}")
+            logger.info(f"Stress Test (Perf + Accuracy): {test_config.display_name}")
+            logger.info(f"Test ID: {test_config.test_id}")
+            logger.info(f"Config file: {test_config.config_path}")
+            logger.info(f"Test type: {test_config.test_type}")
+            logger.info(f"Category: {test_config.test_category}")
+            logger.info(f"Model: {test_config.model_name}")
+            logger.info(f"Benchmark: {test_config.benchmark_type}")
+            
+            # Log accuracy datasets if configured
+            if test_config.accuracy_config:
+                dataset_names = test_config.accuracy_config.get_all_dataset_names()
+                logger.info(f"Accuracy Datasets: {', '.join(dataset_names)}")
+            
+            logger.info(f"Metrics log: {test_config.metrics_config.log_file}")
+            logger.info(f"Supported GPUs: {', '.join(test_config.supported_gpus)}")
+            logger.info(f"{'=' * 60}")
+
+            if EnvManager.get_debug_mode():
+                logger.debug(
+                    f"Debug mode: Skipping job submission, using job_id: {EnvManager.get_debug_job_id()}"
+                )
+                job_id = EnvManager.get_debug_job_id()
+            else:
+                # Submit job using JobManager
+                success, job_id = JobManager.submit_job(test_config)
+
+                # Validate submission result
+                assert success, f"Job submission failed: {test_config.test_id}"
+                assert job_id, "Unable to get job ID"
+
+                # Wait for completion (longer timeout for stress tests: 4 hours)
+                JobManager.wait_for_completion(job_id, 14400, test_config, check_early_failure=True)
+
+            # End tracking test case
+            test_tracker.end_test_case()
+
+            # Get timestamps information
+            timestamps = test_tracker.get_timestamps()
+
+            # Check results - this will handle both perf CSV writing AND accuracy validation
+            result = JobManager.check_result(job_id, test_config, timestamps, full_test_name)
+            assert result["success"], (
+                f"Stress test failed: {result.get('error', 'Unknown error')}"
+            )
+
+        except Exception as e:
+            test_tracker.end_test_case()
+            raise e
+        finally:
+            # Always backup logs, regardless of success or failure
+            if job_id:
+                result_dir = JobManager.get_result_dir(test_config)
+                is_passed = result.get("success", False) if result else False
+                try:
+                    JobManager.backup_logs(job_id, test_config, result_dir, is_passed)
+                except Exception as backup_error:
+                    logger.error(f"Failed to backup logs: {backup_error}")
+
 
 if __name__ == "__main__":
     """Run benchmark tests"""
diff --git a/tests/integration/defs/perf/disagg/utils/config_loader.py b/tests/integration/defs/perf/disagg/utils/config_loader.py
index a74ea94089b..07531a816ed 100644
--- a/tests/integration/defs/perf/disagg/utils/config_loader.py
+++ b/tests/integration/defs/perf/disagg/utils/config_loader.py
@@ -45,6 +45,7 @@ class AccuracyConfig:
     """Accuracy test configuration (supports multiple datasets)."""
 
     datasets: List[DatasetThreshold]  # List of dataset threshold configurations
+    metrics: Optional[MetricsConfig] = None  # Optional custom metrics config (defaults to _COMMON_ACCURACY_METRICS)
 
     def get_dataset_config(self, dataset_name: str) -> Optional[DatasetThreshold]:
         """Get configuration by dataset name.
@@ -67,6 +68,16 @@ def get_all_dataset_names(self) -> List[str]:
             List of dataset names
         """
         return [ds.dataset_name for ds in self.datasets]
+    
+    def get_metrics_config(self) -> MetricsConfig:
+        """Get metrics configuration for accuracy parsing.
+        
+        Returns:
+            Custom metrics config if provided, otherwise _COMMON_ACCURACY_METRICS
+        """
+        if self.metrics is not None:
+            return self.metrics
+        return _COMMON_ACCURACY_METRICS
 
 
 # ============================================================================
@@ -139,6 +150,19 @@ def get_all_dataset_names(self) -> List[str]:
             "SERVER_P99_E2EL",
         ],
     ),
+    # Stress test configuration (combines perf metrics + accuracy validation)
+    # Uses the same perf metrics pattern as disagg perf tests
+    ("disagg", "stress"): MetricsConfig(
+        log_file="6_bench.log",
+        extractor_pattern=r"""
+            ^.*?Median\ TTFT\ \(ms\):\s+([0-9.]+).*?$\n
+            (?:.*\n)*?
+            ^.*?Median\ E2EL\ \(ms\):\s+([0-9.]+).*?$\n
+            (?:.*\n)*?
+            ^.*?Benchmark\ with\ concurrency\ (\d+)\ done
+        """,
+        metric_names=["SERVER_MEDIAN_TTFT", "SERVER_MEDIAN_E2EL"],
+    ),
     # Accuracy test configuration
     ("disagg", "accuracy"): _COMMON_ACCURACY_METRICS,
     ("wideep", "accuracy"): _COMMON_ACCURACY_METRICS,
@@ -336,7 +360,8 @@ def _load_config_file(self, yaml_path: Path, test_type: str, test_category: str)
 
-        # Load accuracy configuration (only for accuracy tests)
+        # Load accuracy configuration (accuracy and stress tests only)
         accuracy_config = None
-        if test_category == "accuracy":
+        # Load accuracy config for both "accuracy" and "stress" test categories
+        if test_category in ["accuracy", "stress"]:
             acc_meta = metadata.get("accuracy", {})
             if acc_meta and "datasets" in acc_meta:
                 datasets = []
@@ -373,8 +398,20 @@ def _load_config_file(self, yaml_path: Path, test_type: str, test_category: str)
                             higher_is_better=higher_is_better,
                         )
                     )
-                accuracy_config = AccuracyConfig(datasets=datasets)
-                logger.info(f"Loaded accuracy config with {len(datasets)} dataset(s)")
+                
+                # Check if custom accuracy metrics are provided
+                custom_metrics = None
+                if "metrics" in acc_meta:
+                    metrics_override = acc_meta["metrics"]
+                    custom_metrics = MetricsConfig(
+                        log_file=metrics_override.get("log_file", "7_accuracy_eval.log"),
+                        extractor_pattern=metrics_override.get("extractor_pattern", r"\|([a-zA-Z0-9_-]+)\|.*?\|([\w-]+)\|.*?\|exact_match\|.*?\|([0-9.]+)\|"),
+                        metric_names=metrics_override.get("metric_names", ["flexible-extract", "strict-match"]),
+                    )
+                    logger.info(f"Using custom accuracy metrics config from YAML")
+                
+                accuracy_config = AccuracyConfig(datasets=datasets, metrics=custom_metrics)
+                logger.info(f"Loaded accuracy config with {len(datasets)} dataset(s) for {test_category} test")
 
         return TestConfig(
             config_path=str(yaml_path),

From 56ad3d482d8bbf97dd2be997340f07ba4fe4a66d Mon Sep 17 00:00:00 2001
From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
Date: Wed, 31 Dec 2025 05:36:03 +0000
Subject: [PATCH 05/13] fix conflict

Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
---
 ...1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml | 115 ++++++++++++++++++
 ...en1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml |   2 +-
 ...n1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml |   2 +-
 .../defs/perf/disagg/test_disagg.py           |   2 +-
 .../defs/perf/disagg/testlist/all.txt         |   2 +
 .../defs/perf/disagg/testlist/disagg.txt      |   1 +
 6 files changed, 121 insertions(+), 3 deletions(-)
 create mode 100644 tests/integration/defs/perf/disagg/test_configs/disagg/stress/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml

diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/stress/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/stress/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml
new file mode 100644
index 00000000000..9d5ca29c122
--- /dev/null
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/stress/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml
@@ -0,0 +1,115 @@
+# nvbugs: 5422621
+metadata:
+  model_name: deepseek-r1-fp4
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+  - GB200
+  - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+  dataset_file: disagg_datasets/deepseek-r1-1024-1024-100000-ratio-1_for_serve.json
+  accuracy:
+    datasets:
+    - dataset_name: gsm8k
+      expected_value: 0.9454
+      threshold_type: hypothesis_test
+      filter_type: flexible-extract
+slurm:
+  script_file: disaggr_torch.slurm
+  partition: <partition>
+  account: <account>
+  job_time: 03:00:00
+  job_name: unified-benchmark
+  extra_args: "--gres=gpu:4"
+  numa_bind: true
+benchmark:
+  mode: gen_only
+  use_nv_sa_benchmark: false
+  multi_round: 20
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '12288'
+  input_length: 1024
+  output_length: 1024
+  dataset_file: <dataset_file>
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 2
+  num_gen_servers: 1
+environment:
+  container_mount: <container_mount>
+  container_image: <container_image>
+  model_path: <model_path>
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir: <full_path_to_work_dir>
+  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
+  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: true
+  model: local-completions
+  tasks: gsm8k
+  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
+worker_config:
+  gen:
+    enable_layerwise_nvtx_marker: true
+    tensor_parallel_size: 48
+    moe_expert_parallel_size: 48
+    enable_attention_dp: true
+    enable_lm_head_tp_in_adp: true
+    pipeline_parallel_size: 1
+    max_batch_size: 1024
+    max_num_tokens: 1024
+    max_seq_len: 2176
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+      - 1
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+      - 768
+      - 1024
+      - 2048
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.7
+      dtype: fp8
+    moe_config:
+      backend: WIDEEP
+      load_balancer:
+        num_slots: 288
+        layer_updates_per_iter: 1
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8320
+      backend: DEFAULT
+    stream_interval: 20
+  ctx:
+    enable_layerwise_nvtx_marker: true
+    max_batch_size: 4
+    max_num_tokens: 4480
+    max_seq_len: 2176
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.85
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8320
+      backend: DEFAULT
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml
index 30eb5ef5bda..259010f2550 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml
@@ -18,7 +18,7 @@ slurm:
   script_file: disaggr_torch.slurm
   partition: <partition>
   account: <account>
-  job_time: 02:00:00
+  job_time: 03:00:00
   job_name: unified-benchmark
   extra_args: "--gres=gpu:4"
   numa_bind: true
diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml
index b17b96df7d7..73b11ea4157 100644
--- a/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml
@@ -18,7 +18,7 @@ slurm:
   script_file: disaggr_torch.slurm
   partition: <partition>
   account: <account>
-  job_time: 00:45:00
+  job_time: 03:00:00
   job_name: unified-benchmark
   extra_args: "--gres=gpu:4"
   numa_bind: true
diff --git a/tests/integration/defs/perf/disagg/test_disagg.py b/tests/integration/defs/perf/disagg/test_disagg.py
index 9c4f87679c0..7fe141c5d10 100644
--- a/tests/integration/defs/perf/disagg/test_disagg.py
+++ b/tests/integration/defs/perf/disagg/test_disagg.py
@@ -273,7 +273,7 @@ def test_stress(self, request, test_config: TestConfig):
                 assert job_id, "Unable to get job ID"
 
-                # Wait for completion (longer timeout for stress tests: 4 hours)
-                JobManager.wait_for_completion(job_id, 14400, test_config, check_early_failure=True)
+                # Wait for completion (longer timeout for stress tests: 3 hours)
+                JobManager.wait_for_completion(job_id, 10800, test_config, check_early_failure=True)
 
             # End tracking test case
             test_tracker.end_test_case()
diff --git a/tests/integration/defs/perf/disagg/testlist/all.txt b/tests/integration/defs/perf/disagg/testlist/all.txt
index da40a0f46d8..d5a5e1d3419 100644
--- a/tests/integration/defs/perf/disagg/testlist/all.txt
+++ b/tests/integration/defs/perf/disagg/testlist/all.txt
@@ -64,6 +64,8 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0-Default]
+# disagg stress test cases
+test_disagg.py::TestDisaggBenchmark::test_stress[disagg_stress_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT]
 
 
 # WIDEEP cases
diff --git a/tests/integration/defs/perf/disagg/testlist/disagg.txt b/tests/integration/defs/perf/disagg/testlist/disagg.txt
index 73dda2187da..d928d36b638 100644
--- a/tests/integration/defs/perf/disagg/testlist/disagg.txt
+++ b/tests/integration/defs/perf/disagg/testlist/disagg.txt
@@ -63,3 +63,4 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3-Default]
 test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0-Default]
+test_disagg.py::TestDisaggBenchmark::test_stress[disagg_stress_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT]

From f26a0f57c6bf5d72f8eadafa057e7b2f46e59a9b Mon Sep 17 00:00:00 2001
From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
Date: Fri, 26 Dec 2025 10:20:33 +0000
Subject: [PATCH 06/13] fix srun logic to sbatch

Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
---
 .../defs/perf/disagg/execution/executor.py    | 356 ++++++++++--------
 .../defs/perf/disagg/simple_collect.py        |  60 ---
 .../defs/perf/disagg/test_disagg.py           |   6 +-
 .../defs/perf/disagg/utils/common.py          |   2 -
 .../defs/perf/disagg/utils/trackers.py        |  46 ++-
 5 files changed, 241 insertions(+), 229 deletions(-)

diff --git a/tests/integration/defs/perf/disagg/execution/executor.py b/tests/integration/defs/perf/disagg/execution/executor.py
index 8e717c50ce8..659ccfb6881 100644
--- a/tests/integration/defs/perf/disagg/execution/executor.py
+++ b/tests/integration/defs/perf/disagg/execution/executor.py
@@ -4,13 +4,12 @@
 import re
 import shutil
 import time
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Optional
 
 import yaml
 from reporting.report import LogParser, LogWriter, ResultSaver
 from utils.common import (
     GPU_RESOURCE_CONFIG,
-    SESSION_COLLECT_CMD_TYPE,
     EnvManager,
     extract_config_fields,
 )
@@ -18,177 +17,237 @@
 
 from execution.subprocess_utils import exec_cmd, exec_cmd_with_output
 
+
 # ============================================================================
-# SLURM Run Command Builder
+# Job Manager
 # ============================================================================
 
 
-class SlurmRunCommandBuilder:
-    """SLURM Run Command Builder.
+class JobManager:
+    """Job manager class for test jobs and session collection."""
+
+    # ============================================================================
+    # Generic Job Submission (Direct sbatch)
+    # ============================================================================
+
+    @staticmethod
+    def submit_shell_job(
+        job_name: str,
+        shell_script: str,
+        output_log_file: str,
+        timeout: int = 7200
+    ) -> tuple[bool, str]:
+        """Submit a generic shell script job using sbatch --wrap.
 
-    Build srun commands for different GPU types and command types.
-    Reuses GPU_RESOURCE_CONFIG for consistency with SlurmJobBuilder.
-    """
+        This is a low-level method for submitting arbitrary shell scripts
+        directly to SLURM via sbatch --wrap (non-blocking).
 
-    def build_srun_prefix(self, job_name: str) -> List[str]:
-        """Build srun command prefix based on GPU type."""
-        gpu_type = EnvManager.get_gpu_type()
+        Args:
+            job_name: SLURM job name
+            shell_script: Shell script content to execute
+            output_log_file: Full path to output log file
+            timeout: Job timeout in seconds (default: 7200 = 2 hours)
 
-        # Reuse the same GPU_RESOURCE_CONFIG as SlurmJobBuilder
-        gpu_config = GPU_RESOURCE_CONFIG.get(gpu_type)
-        if not gpu_config:
-            raise ValueError(
-                f"GPU resource configuration not found for {gpu_type}. "
-                f"Please add configuration in GPU_RESOURCE_CONFIG."
+        Returns:
+            tuple: (success: bool, job_id: str)
+        """
+        try:
+            # Get environment configuration
+            container_image = EnvManager.get_container_image()
+            container_mount = EnvManager.get_container_mount()
+            output_path = EnvManager.get_output_path()
+
+            # Ensure output directory exists
+            os.makedirs(output_path, exist_ok=True)
+
+            # Build complete srun command (runs inside sbatch)
+            srun_command = (
+                f"srun -l "
+                f"--container-name={job_name} "
+                f"--container-image={container_image} "
+                f"--container-mounts={container_mount} "
+                f"bash -c '{shell_script}'"
             )
 
-        # Common srun arguments
-        srun_args = [
-            "srun",
-            "-l",
-            "--container-name=sysinfo-get",
-            f"--container-image={EnvManager.get_container_image()}",
-            f"--container-mounts={EnvManager.get_container_mount()}",
-        ]
+            # Build sbatch command with all parameters
+            gpu_type = EnvManager.get_gpu_type()
+            gpu_config = GPU_RESOURCE_CONFIG.get(gpu_type)
+            if not gpu_config:
+                raise ValueError(f"GPU resource configuration not found for {gpu_type}")
 
-        # Add GPU-specific gres parameter (reuse gres_gpu field)
-        # If gres_gpu is not None, add --gres parameter
-        if gpu_config["gres_gpu"] is not None:
-            srun_args.append(f"--gres=gpu:{gpu_config['gres_gpu']}")
+            # Convert timeout to HH:MM:SS format
+            hours = timeout // 3600
+            minutes = (timeout % 3600) // 60
+            seconds = timeout % 60
+            time_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
 
-        # Add common parameters
-        srun_args.extend(
-            [
+            sbatch_args = [
+                "sbatch",
+                f"--job-name={job_name}",
                 f"--partition={EnvManager.get_slurm_partition()}",
                 f"--account={EnvManager.get_slurm_account()}",
-                f"--job-name={job_name}",
-                "--time=02:00:00",
-                "--mpi=pmix",
-                # Note: Removed --overlap to ensure GPU allocation for session_collect
-                # which runs after all test jobs have completed
-                "-N",
-                "1",
-                "-n",
-                "1",
+                f"--time={time_str}",
+                "--nodes=1",
+                "--ntasks=1",
+                f"--output={output_log_file}",
+                "--parsable",  # Easier job ID parsing
             ]
-        )
 
-        return srun_args
+            # Conditionally add gres parameter based on GPU configuration
+            if gpu_config["gres_gpu"] is not None:
+                sbatch_args.append(f"--gres=gpu:{gpu_config['gres_gpu']}")
 
-    def build_script_command(self, cmd_type: str) -> List[str]:
-        """Build script command based on command type."""
-        work_dir = EnvManager.get_work_dir()
-        output_path = EnvManager.get_output_path()
-        install_mode = EnvManager.get_install_mode()
-        repo_dir = EnvManager.get_repo_dir()
-        trtllm_wheel_path = EnvManager.get_trtllm_wheel_path()
-
-        if cmd_type == SESSION_COLLECT_CMD_TYPE:
-            if install_mode == "none":
-                return [
-                    "bash",
-                    "-c",
-                    f"cd {work_dir} && python3 {work_dir}/simple_collect.py {output_path}",
-                ]
-            elif install_mode == "wheel":
-                # Install TensorRT-LLM wheel first, then run simple_collect.py
-                # Note: Use --no-deps to avoid overwriting container's pre-installed packages (like torch)
-                install_cmd = f"""
-                    cd {repo_dir}
-                    echo 'Step 1: Installing TensorRT-LLM wheel...'
-                    pip3 install {trtllm_wheel_path} || echo 'Wheel install failed, continuing...'
-                    echo 'Wheel installation completed'
-
-                    echo 'Step 2: Running simple_collect.py...'
-                    cd {work_dir}
-                    python3 {work_dir}/simple_collect.py {output_path}
-                """
-                return ["bash", "-c", install_cmd]
-            elif install_mode == "source":
-                install_cmd = f"""
-                cd {repo_dir}
-                pip3 install -e . || echo 'Source install failed, continuing...'
-
-                echo 'Source installation completed'
-
-                echo 'Step 3: Running simple_collect.py...'
-                cd {work_dir}
-                python3 {work_dir}/simple_collect.py {output_path}
-                """
-                return ["bash", "-c", install_cmd]
-            else:
-                raise ValueError(f"Invalid install mode: {install_mode}")
-        else:
-            # Future command types can be added here
-            # elif cmd_type == "benchmark_collect":
-            #     model_dir = EnvManager.get_model_dir()
-            #     return [
-            #         "bash", "-c",
-            #         f"cd {work_dir} && python3 {work_dir}/benchmark_collect.py "
-            #         f"--model-dir {model_dir} --output {output_path}"
-            #     ]
-            # elif cmd_type == "metrics_collect":
-            #     return [
-            #         "bash", "-c",
-            #         f"cd {work_dir} && python3 {work_dir}/metrics_collect.py --config {work_dir}/config.yaml"
-            #     ]
-            raise ValueError(
-                f"Unsupported command type: {cmd_type}. "
-                f"Currently supported: {SESSION_COLLECT_CMD_TYPE}"
-            )
+            # Add extra SLURM arguments if configured
+            slurm_extra_args = EnvManager.get_slurm_extra_args()
+            if slurm_extra_args:
+                sbatch_args.append(slurm_extra_args)
 
-    def run_job(self, cmd_type: str, job_name: str, log_file: str = None) -> Dict[str, Any]:
-        """Execute srun job.
+            # Add --wrap with the srun command
+            sbatch_args.extend(["--wrap", srun_command])
 
-        Args:
-            cmd_type: Type of command to execute
-            job_name: Name for the SLURM job
-            log_file: Optional path to save command output
+            # Submit the job
+            logger.info(f"Submitting job '{job_name}' (using sbatch --wrap)...")
+            logger.debug(f"Log file: {output_log_file}")
 
-        Returns:
-            Dict with status and message
-        """
-        try:
-            # Build complete command
-            srun_prefix = self.build_srun_prefix(job_name)
-            script_command = self.build_script_command(cmd_type)
-            full_command = srun_prefix + script_command
-
-            # Execute with optional log file
-            if log_file:
-                logger.info(f"Saving output to: {log_file}")
-                # Use Python file redirection to avoid shell quoting issues
-                import subprocess
-
-                with open(log_file, "w") as f:
-                    result = subprocess.run(
-                        full_command, stdout=f, stderr=subprocess.STDOUT, timeout=7200, text=True
-                    )
-                    if result.returncode != 0:
-                        raise subprocess.CalledProcessError(result.returncode, full_command)
-                logger.success(f"Output saved to {log_file}")
-                output = ""  # Output is in file
-            else:
-                output = exec_cmd_with_output(full_command, timeout=7200)
+            output = exec_cmd_with_output(sbatch_args, timeout=60)
+            job_id = output.strip()
+
+            # Parse job ID (--parsable returns just the job ID)
+            if job_id.isdigit():
+                logger.success(f"Job '{job_name}' submitted: {job_id}")
+                logger.info(f"All logs will be written to: {output_log_file}")
+                return True, job_id
+
+            # Fallback: try to extract from "Submitted batch job" format
+            match = re.search(r"Submitted batch job (\d+)", output)
+            if match:
+                job_id = match.group(1)
+                logger.success(f"Job '{job_name}' submitted: {job_id}")
+                return True, job_id
+
+            logger.error(f"Failed to parse job ID from output: {output}")
+            return False, ""
 
-            return {"status": True, "msg": "Job executed successfully", "output": output}
         except Exception as e:
-            logger.error(f"Job execution failed: {e}")
-            return {"status": False, "msg": str(e)}
+            logger.error(f"Failed to submit job '{job_name}': {e}")
+            import traceback
+            logger.debug(traceback.format_exc())
+            return False, str(e)
 
+    # ============================================================================
+    # Session Collection Job Submission
+    # ============================================================================
 
-def make_slurm_run_command():
-    """Create run command function (maintain interface compatibility)."""
-    builder = SlurmRunCommandBuilder()
-    return builder.run_job
+    @staticmethod
+    def submit_session_collect_job() -> tuple[bool, str]:
+        """Submit session collect job using sbatch (non-blocking).
 
+        This method builds the shell script for session collection and
+        delegates to submit_shell_job() for actual submission.
 
-class JobManager:
-    """Job manager class."""
+        Key benefits:
+        - Non-blocking execution (pytest doesn't wait)
+        - Better resource scheduling (queues if resources unavailable)
+        - Fault tolerance (job survives parent process exit)
+        - Unified job management (reuses wait_for_completion)
+        - All logs redirected to session_collect.log
+
+        Returns:
+            tuple: (success: bool, job_id: str)
+        """
+        try:
+            # Get environment configuration for building the script
+            work_dir = EnvManager.get_work_dir()
+            repo_dir = EnvManager.get_repo_dir()
+            install_mode = EnvManager.get_install_mode()
+            trtllm_wheel_path = EnvManager.get_trtllm_wheel_path()
+            output_path = EnvManager.get_output_path()
+
+            # Build the inner script specific to session collection
+            inner_script = f"""
+INSTALL_MODE="{install_mode}"
+REPO_DIR="{repo_dir}"
+WORK_DIR="{work_dir}"
+OUTPUT_PATH="{output_path}"
+WHEEL_PATH="{trtllm_wheel_path}"
+
+echo "=========================================="
+echo "Session Collect Job Started"
+echo "Time: $(date)"
+echo "Install Mode: $INSTALL_MODE"
+echo "=========================================="
+
+# Handle different installation modes
+if [ "$INSTALL_MODE" = "none" ]; then
+    echo "Using built-in TensorRT-LLM, skipping installation"
+    
+elif [ "$INSTALL_MODE" = "wheel" ]; then
+    echo "Installing TensorRT-LLM wheel..."
+    echo "Wheel path: $WHEEL_PATH"
+    pip3 install "$WHEEL_PATH" 2>&1 || echo "Wheel install failed, continuing..."
+    echo "Wheel installation completed"
+    
+elif [ "$INSTALL_MODE" = "source" ]; then
+    echo "Installing TensorRT-LLM from source..."
+    cd "$REPO_DIR"
+    pip3 install -e . 2>&1 || echo "Source install failed, continuing..."
+    echo "Source installation completed"
+    
+else
+    echo "ERROR: Invalid install mode: $INSTALL_MODE"
+    exit 1
+fi
+
+echo ""
+echo "Collecting TensorRT-LLM version information..."
+# Get TensorRT-LLM version and write to file
+VERSION_FILE="$OUTPUT_PATH/trtllm_version.txt"
+python3 -c "import tensorrt_llm; print(f'[TensorRT-LLM] TensorRT-LLM version: {{tensorrt_llm.__version__}}')" > "$VERSION_FILE" 2>&1 || {{
+    echo "[TensorRT-LLM] TensorRT-LLM version: unknown" > "$VERSION_FILE"
+    echo "Failed to get TensorRT-LLM version, wrote 'unknown' to $VERSION_FILE"
+}}
+echo "TensorRT-LLM version written to: $VERSION_FILE"
+
+echo ""
+echo "Running simple_collect.py..."
+cd "$WORK_DIR"
+python3 "$WORK_DIR/simple_collect.py" "$OUTPUT_PATH" 2>&1
+
+echo ""
+echo "=========================================="
+echo "Session Collect Job Completed"
+echo "Time: $(date)"
+echo "=========================================="
+
+# Explicitly exit to ensure job terminates immediately
+exit 0
+"""
+
+            # Submit using the shell job method
+            log_file = f"{output_path}/session_collect.log"
+            return JobManager.submit_shell_job(
+                job_name="session_collect",
+                shell_script=inner_script,
+                output_log_file=log_file,
+                timeout=7200  # 2 hours
+            )
+
+        except Exception as e:
+            logger.error(f"Failed to prepare session collect job: {e}")
+            import traceback
+            logger.debug(traceback.format_exc())
+            return False, str(e)
+
+    # ============================================================================
+    # Test Job Submission (Via submit.py script)
+    # ============================================================================
 
     @staticmethod
-    def submit_job(test_config) -> tuple:
-        """Submit job using submit.py with YAML config.
+    def submit_test_job(test_config) -> tuple:
+        """Submit benchmark test job using submit.py script.
+
+        This method submits test jobs by calling the submit.py script,
+        which handles test-specific configuration and SLURM job setup.
 
         Args:
             test_config: TestConfig object containing configuration
@@ -196,7 +255,7 @@ def submit_job(test_config) -> tuple:
         Returns:
             tuple: (success: bool, job_id: str)
         """
-        logger.info("Submitting job using submit.py...")
+        logger.info("Submitting test job via submit.py...")
 
         try:
             import re
@@ -846,6 +905,3 @@ def _check_job_result(
                 test_name=test_name,
             )
 
-
-# create executor function
-run_job = make_slurm_run_command()
diff --git a/tests/integration/defs/perf/disagg/simple_collect.py b/tests/integration/defs/perf/disagg/simple_collect.py
index 1e3e32ee1e9..5a3b017a604 100644
--- a/tests/integration/defs/perf/disagg/simple_collect.py
+++ b/tests/integration/defs/perf/disagg/simple_collect.py
@@ -235,63 +235,6 @@ def write_cpu_info(self, data):
         print(f"Generated CPU file: {cpu_file}")
         return cpu_model
 
-    def write_trtllm_version(self):
-        """Write TensorRT-LLM version information to trtllm_version.txt."""
-        version_info = "[TensorRT-LLM] TensorRT-LLM version: unknown"
-
-        try:
-            # Try to import tensorrt_llm and get version
-            result = subprocess.run(
-                [
-                    sys.executable,
-                    "-c",
-                    'import tensorrt_llm; print(f"[TensorRT-LLM] TensorRT-LLM version: {tensorrt_llm.__version__}")',
-                ],
-                capture_output=True,
-                text=True,
-                timeout=30,
-            )
-
-            if result.returncode == 0 and result.stdout.strip():
-                version_info = result.stdout.strip()
-            else:
-                # Print error for debugging
-                print(f"TensorRT-LLM import failed (returncode={result.returncode}):")
-                if result.stderr:
-                    print(f"  stderr:\n{result.stderr}")
-
-                # Try one more time with a simple sleep
-                print("Retrying after 10 seconds...")
-                time.sleep(10)
-                result = subprocess.run(
-                    [
-                        sys.executable,
-                        "-c",
-                        "import tensorrt_llm; "
-                        'print(f"[TensorRT-LLM] TensorRT-LLM version: {tensorrt_llm.__version__}")',
-                    ],
-                    capture_output=True,
-                    text=True,
-                    timeout=30,
-                )
-
-                if result.returncode == 0 and result.stdout.strip():
-                    version_info = result.stdout.strip()
-                    print("TensorRT-LLM version retrieved on second attempt")
-                else:
-                    print(f"TensorRT-LLM import failed again (returncode={result.returncode}):")
-                    if result.stderr:
-                        print(f"  stderr:\n{result.stderr}")
-
-        except Exception as e:
-            print(f"Error getting TensorRT-LLM version: {e}")  # Keep default unknown version
-
-        trtllm_file = os.path.join(self.output_dir, "trtllm_version.txt")
-        with open(trtllm_file, "w") as f:
-            f.write(version_info)
-        print(f"Generated TensorRT-LLM version file: {trtllm_file}")
-        return version_info
-
     def write_driver_info(self, data):
         """Write GPU driver information to driver.txt."""
         driver_version = data.get("nvidia_driver_version", "unknown")
@@ -307,13 +250,11 @@ def write_all_txt_files(self, data):
         gpu_info = self.write_gpu_info(data)
         cpu_info = self.write_cpu_info(data)
         driver_info = self.write_driver_info(data)
-        trtllm_info = self.write_trtllm_version()
 
         return {
             "GPU": gpu_info,
             "CPU": cpu_info,
             "Driver": driver_info,
-            "TensorRT-LLM": trtllm_info,
         }
 
 
@@ -347,7 +288,6 @@ def main():
     print("  - gpu.txt")
     print("  - cpu.txt")
     print("  - driver.txt")
-    print("  - trtllm_version.txt")
 
     print("\n=== Collected Information ===")
     for key, value in system_data.items():
diff --git a/tests/integration/defs/perf/disagg/test_disagg.py b/tests/integration/defs/perf/disagg/test_disagg.py
index 7fe141c5d10..4c6391b205f 100644
--- a/tests/integration/defs/perf/disagg/test_disagg.py
+++ b/tests/integration/defs/perf/disagg/test_disagg.py
@@ -102,7 +102,7 @@ def test_benchmark(self, request, test_config: TestConfig):
                 job_id = EnvManager.get_debug_job_id()
             else:
                 # Submit job using JobManager
-                success, job_id = JobManager.submit_job(test_config)
+                success, job_id = JobManager.submit_test_job(test_config)
 
                 # Validate submission result
                 assert success, f"Job submission failed: {test_config.test_id}"
@@ -180,7 +180,7 @@ def test_accuracy(self, request, test_config: TestConfig):
                 job_id = EnvManager.get_debug_job_id()
             else:
                 # Submit job using JobManager
-                success, job_id = JobManager.submit_job(test_config)
+                success, job_id = JobManager.submit_test_job(test_config)
 
                 # Validate submission result
                 assert success, f"Job submission failed: {test_config.test_id}"
@@ -266,7 +266,7 @@ def test_stress(self, request, test_config: TestConfig):
                 job_id = EnvManager.get_debug_job_id()
             else:
                 # Submit job using JobManager
-                success, job_id = JobManager.submit_job(test_config)
+                success, job_id = JobManager.submit_test_job(test_config)
 
                 # Validate submission result
                 assert success, f"Job submission failed: {test_config.test_id}"
diff --git a/tests/integration/defs/perf/disagg/utils/common.py b/tests/integration/defs/perf/disagg/utils/common.py
index 6c8805e3636..4ba9b812401 100644
--- a/tests/integration/defs/perf/disagg/utils/common.py
+++ b/tests/integration/defs/perf/disagg/utils/common.py
@@ -2,8 +2,6 @@
 
 import os
 
-SESSION_COLLECT_CMD_TYPE = "session_collect"
-
 # GPU resource configuration
 # Simplified - only fields actually used in the codebase
 GPU_RESOURCE_CONFIG = {
diff --git a/tests/integration/defs/perf/disagg/utils/trackers.py b/tests/integration/defs/perf/disagg/utils/trackers.py
index 69c72b0cdbb..3ffcebd55fb 100644
--- a/tests/integration/defs/perf/disagg/utils/trackers.py
+++ b/tests/integration/defs/perf/disagg/utils/trackers.py
@@ -4,10 +4,10 @@
 
 import pandas as pd
 
-# Import run_job from execution (cross-package import)
-from execution.executor import run_job
+# Import JobManager from execution
+from execution.executor import JobManager
 
-from utils.common import SESSION_COLLECT_CMD_TYPE, EnvManager
+from utils.common import EnvManager
 from utils.logger import logger
 
 
@@ -74,25 +74,43 @@ def start(self):
         logger.info(f"Session started: {self.start_time}")
 
     def end_and_collect(self):
-        """Record end time and trigger information collection."""
+        """Record end time and trigger session collection.
+        
+        Uses the new sbatch-based approach for non-blocking execution.
+        Submits the job and waits for completion using JobManager.
+        """
         self.end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
         logger.info(f"Session ended: {self.end_time}")
 
-        # Prepare log file path
+        # Submit session collect job (non-blocking sbatch)
+        success, job_id = JobManager.submit_session_collect_job()
+        
+        if not success:
+            logger.error(f"Failed to submit session collect job: {job_id}")
+            return False
+
+        # Wait for job completion (reuses wait_for_completion method)
+        logger.info(f"Waiting for session collect job {job_id} to complete...")
+        JobManager.wait_for_completion(
+            job_id=job_id,
+            timeout=7200,  # 2 hours
+            test_config=None,  # No test config for session collect
+            check_early_failure=False  # Don't check early failures
+        )
+
+        # Check if log file was created (indicates success)
         output_path = EnvManager.get_output_path()
         log_file = os.path.join(output_path, "session_collect.log")
-
-        job_name = f"{EnvManager.get_slurm_job_name()}-session-collect"
-        run_result = run_job(SESSION_COLLECT_CMD_TYPE, job_name, log_file=log_file)
-
-        if run_result["status"]:
-            # update timestamps in CSV
+        
+        if os.path.exists(log_file):
+            # Update timestamps in CSV
             self._update_csv_timestamps()
             logger.success("Session properties collected successfully")
+            logger.info(f"Session collect log: {log_file}")
+            return True
         else:
-            logger.error(f"Failed to collect session properties: {run_result['msg']}")
-
-        return run_result["status"]
+            logger.error(f"Session collect log not found: {log_file}")
+            return False
 
     def _update_csv_timestamps(self):
         """Update timestamps in CSV using pandas."""

From 339ecd314cac3c93c746bd439ea8da3ce59f65dc Mon Sep 17 00:00:00 2001
From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
Date: Fri, 26 Dec 2025 11:00:05 +0000
Subject: [PATCH 07/13] reorder session collect steps and expand wheel path glob

Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
---
 .../defs/perf/disagg/execution/executor.py    | 37 ++++++++++++-------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/tests/integration/defs/perf/disagg/execution/executor.py b/tests/integration/defs/perf/disagg/execution/executor.py
index 659ccfb6881..4d8712265ef 100644
--- a/tests/integration/defs/perf/disagg/execution/executor.py
+++ b/tests/integration/defs/perf/disagg/execution/executor.py
@@ -177,14 +177,31 @@ def submit_session_collect_job() -> tuple[bool, str]:
 echo "Install Mode: $INSTALL_MODE"
 echo "=========================================="
 
-# Handle different installation modes
+# Step 1: Collect system information (no dependencies)
+echo ""
+echo "Step 1: Collecting system information..."
+cd "$WORK_DIR"
+python3 "$WORK_DIR/simple_collect.py" "$OUTPUT_PATH" 2>&1
+echo "System information collection completed"
+
+# Step 2: Handle different installation modes
+echo ""
+echo "Step 2: Installing TensorRT-LLM..."
 if [ "$INSTALL_MODE" = "none" ]; then
     echo "Using built-in TensorRT-LLM, skipping installation"
     
 elif [ "$INSTALL_MODE" = "wheel" ]; then
     echo "Installing TensorRT-LLM wheel..."
-    echo "Wheel path: $WHEEL_PATH"
-    pip3 install "$WHEEL_PATH" 2>&1 || echo "Wheel install failed, continuing..."
+    echo "Wheel path pattern: $WHEEL_PATH"
+    
+    # Expand wildcard and install (use unquoted variable to allow glob expansion)
+    for wheel_file in $WHEEL_PATH; do
+        if [ -f "$wheel_file" ]; then
+            echo "Found wheel: $wheel_file"
+            pip3 install "$wheel_file" 2>&1 || echo "Wheel install failed, continuing..."
+            break
+        fi
+    done
     echo "Wheel installation completed"
     
 elif [ "$INSTALL_MODE" = "source" ]; then
@@ -198,21 +215,13 @@ def submit_session_collect_job() -> tuple[bool, str]:
     exit 1
 fi
 
+# Step 3: Collect TensorRT-LLM version information
 echo ""
-echo "Collecting TensorRT-LLM version information..."
-# Get TensorRT-LLM version and write to file
+echo "Step 3: Collecting TensorRT-LLM version information..."
 VERSION_FILE="$OUTPUT_PATH/trtllm_version.txt"
-python3 -c "import tensorrt_llm; print(f'[TensorRT-LLM] TensorRT-LLM version: {{tensorrt_llm.__version__}}')" > "$VERSION_FILE" 2>&1 || {{
-    echo "[TensorRT-LLM] TensorRT-LLM version: unknown" > "$VERSION_FILE"
-    echo "Failed to get TensorRT-LLM version, wrote 'unknown' to $VERSION_FILE"
-}}
+python3 -c 'import tensorrt_llm; print(f"[TensorRT-LLM] TensorRT-LLM version: {{tensorrt_llm.__version__}}")' > "$VERSION_FILE" 2>&1 || echo "[TensorRT-LLM] TensorRT-LLM version: unknown" > "$VERSION_FILE"
 echo "TensorRT-LLM version written to: $VERSION_FILE"
 
-echo ""
-echo "Running simple_collect.py..."
-cd "$WORK_DIR"
-python3 "$WORK_DIR/simple_collect.py" "$OUTPUT_PATH" 2>&1
-
 echo ""
 echo "=========================================="
 echo "Session Collect Job Completed"

From 1348b073d9f02a4bbbcdfdfc815c12c6b6d1e4a6 Mon Sep 17 00:00:00 2001
From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
Date: Fri, 26 Dec 2025 12:18:50 +0000
Subject: [PATCH 08/13] try to fix sbatch issue

Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
---
 .../defs/perf/disagg/execution/executor.py    | 128 ++++++------------
 .../defs/perf/disagg/session_collect.sh       |  70 ++++++++++
 2 files changed, 112 insertions(+), 86 deletions(-)
 create mode 100644 tests/integration/defs/perf/disagg/session_collect.sh

diff --git a/tests/integration/defs/perf/disagg/execution/executor.py b/tests/integration/defs/perf/disagg/execution/executor.py
index 4d8712265ef..515dbf55eb2 100644
--- a/tests/integration/defs/perf/disagg/execution/executor.py
+++ b/tests/integration/defs/perf/disagg/execution/executor.py
@@ -33,20 +33,25 @@ class JobManager:
     @staticmethod
     def submit_shell_job(
         job_name: str,
-        shell_script: str,
-        output_log_file: str,
-        timeout: int = 7200
+        script_path: str,
+        script_args: list[str] = None,
+        output_log_file: str = None,
+        timeout: int = 7200,
+        container_name: str = None
     ) -> tuple[bool, str]:
         """Submit a generic shell script job using sbatch --wrap.
 
-        This is a low-level method for submitting arbitrary shell scripts
-        directly to SLURM via sbatch --wrap (non-blocking).
+        This is a low-level method for submitting shell scripts to SLURM
+        via sbatch --wrap (non-blocking). Supports executing script files
+        with arguments inside containers.
 
         Args:
             job_name: SLURM job name
-            shell_script: Shell script content to execute
-            output_log_file: Full path to output log file
+            script_path: Path to the shell script file to execute
+            script_args: List of arguments to pass to the script (optional)
+            output_log_file: Full path to output log file (optional, defaults to OUTPUT_PATH/{job_name}.log)
             timeout: Job timeout in seconds (default: 7200 = 2 hours)
+            container_name: Container name for srun (optional, defaults to job_name)
 
         Returns:
             tuple: (success: bool, job_id: str)
@@ -60,13 +65,27 @@ def submit_shell_job(
             # Ensure output directory exists
             os.makedirs(output_path, exist_ok=True)
 
+            # Set defaults
+            if output_log_file is None:
+                output_log_file = f"{output_path}/{job_name}.log"
+            if container_name is None:
+                container_name = job_name
+            if script_args is None:
+                script_args = []
+
+            # Build the bash command with script and arguments
+            # Quote the script path and each argument separately
+            quoted_script = f'"{script_path}"'
+            quoted_args = ' '.join(f'"{arg}"' for arg in script_args)
+            bash_command = f"bash {quoted_script} {quoted_args}".strip()
+
             # Build complete srun command (runs inside sbatch)
             srun_command = (
                 f"srun -l "
-                f"--container-name={job_name} "
+                f"--container-name={container_name} "
                 f"--container-image={container_image} "
                 f"--container-mounts={container_mount} "
-                f"bash -c '{shell_script}'"
+                f"{bash_command}"
             )
 
             # Build sbatch command with all parameters
@@ -107,6 +126,7 @@ def submit_shell_job(
 
             # Submit the job
             logger.info(f"Submitting job '{job_name}' (using sbatch --wrap)...")
+            logger.debug(f"Script: {script_path}")
             logger.debug(f"Log file: {output_log_file}")
 
             output = exec_cmd_with_output(sbatch_args, timeout=60)
@@ -142,8 +162,8 @@ def submit_shell_job(
     def submit_session_collect_job() -> tuple[bool, str]:
         """Submit session collect job using sbatch (non-blocking).
 
-        This method builds the shell script for session collection and
-        delegates to submit_shell_job() for actual submission.
+        This method prepares the arguments for the session_collect.sh script
+        and submits it via the generic submit_shell_job() method.
 
         Key benefits:
         - Non-blocking execution (pytest doesn't wait)
@@ -156,89 +176,25 @@ def submit_session_collect_job() -> tuple[bool, str]:
             tuple: (success: bool, job_id: str)
         """
         try:
-            # Get environment configuration for building the script
+            # Get environment configuration
             work_dir = EnvManager.get_work_dir()
             repo_dir = EnvManager.get_repo_dir()
             install_mode = EnvManager.get_install_mode()
             trtllm_wheel_path = EnvManager.get_trtllm_wheel_path()
             output_path = EnvManager.get_output_path()
 
-            # Build the inner script specific to session collection
-            inner_script = f"""
-INSTALL_MODE="{install_mode}"
-REPO_DIR="{repo_dir}"
-WORK_DIR="{work_dir}"
-OUTPUT_PATH="{output_path}"
-WHEEL_PATH="{trtllm_wheel_path}"
-
-echo "=========================================="
-echo "Session Collect Job Started"
-echo "Time: $(date)"
-echo "Install Mode: $INSTALL_MODE"
-echo "=========================================="
-
-# Step 1: Collect system information (no dependencies)
-echo ""
-echo "Step 1: Collecting system information..."
-cd "$WORK_DIR"
-python3 "$WORK_DIR/simple_collect.py" "$OUTPUT_PATH" 2>&1
-echo "System information collection completed"
-
-# Step 2: Handle different installation modes
-echo ""
-echo "Step 2: Installing TensorRT-LLM..."
-if [ "$INSTALL_MODE" = "none" ]; then
-    echo "Using built-in TensorRT-LLM, skipping installation"
-    
-elif [ "$INSTALL_MODE" = "wheel" ]; then
-    echo "Installing TensorRT-LLM wheel..."
-    echo "Wheel path pattern: $WHEEL_PATH"
-    
-    # Expand wildcard and install (use unquoted variable to allow glob expansion)
-    for wheel_file in $WHEEL_PATH; do
-        if [ -f "$wheel_file" ]; then
-            echo "Found wheel: $wheel_file"
-            pip3 install "$wheel_file" 2>&1 || echo "Wheel install failed, continuing..."
-            break
-        fi
-    done
-    echo "Wheel installation completed"
-    
-elif [ "$INSTALL_MODE" = "source" ]; then
-    echo "Installing TensorRT-LLM from source..."
-    cd "$REPO_DIR"
-    pip3 install -e . 2>&1 || echo "Source install failed, continuing..."
-    echo "Source installation completed"
-    
-else
-    echo "ERROR: Invalid install mode: $INSTALL_MODE"
-    exit 1
-fi
-
-# Step 3: Collect TensorRT-LLM version information
-echo ""
-echo "Step 3: Collecting TensorRT-LLM version information..."
-VERSION_FILE="$OUTPUT_PATH/trtllm_version.txt"
-python3 -c 'import tensorrt_llm; print(f"[TensorRT-LLM] TensorRT-LLM version: {{tensorrt_llm.__version__}}")' > "$VERSION_FILE" 2>&1 || echo "[TensorRT-LLM] TensorRT-LLM version: unknown" > "$VERSION_FILE"
-echo "TensorRT-LLM version written to: $VERSION_FILE"
-
-echo ""
-echo "=========================================="
-echo "Session Collect Job Completed"
-echo "Time: $(date)"
-echo "=========================================="
-
-# Explicitly exit to ensure job terminates immediately
-exit 0
-"""
-
-            # Submit using the shell job method
-            log_file = f"{output_path}/session_collect.log"
+            # Prepare script path and arguments
+            script_path = f"{work_dir}/session_collect.sh"
+            script_args = [install_mode, repo_dir, work_dir, output_path, trtllm_wheel_path]
+
+            # Submit using the generic shell job method
             return JobManager.submit_shell_job(
                 job_name="session_collect",
-                shell_script=inner_script,
-                output_log_file=log_file,
-                timeout=7200  # 2 hours
+                script_path=script_path,
+                script_args=script_args,
+                output_log_file=f"{output_path}/session_collect.log",
+                timeout=7200,  # 2 hours
+                container_name="session-collect"
             )
 
         except Exception as e:
diff --git a/tests/integration/defs/perf/disagg/session_collect.sh b/tests/integration/defs/perf/disagg/session_collect.sh
new file mode 100644
index 00000000000..cbc7c775036
--- /dev/null
+++ b/tests/integration/defs/perf/disagg/session_collect.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+# Session Collection Script
+# Collects system information and TensorRT-LLM version
+
+# Get parameters from positional command-line arguments (defaults used when omitted)
+INSTALL_MODE="${1:-none}"
+REPO_DIR="${2:-.}"
+WORK_DIR="${3:-.}"
+OUTPUT_PATH="${4:-./output}"
+WHEEL_PATH="${5:-}"
+
+echo "=========================================="
+echo "Session Collect Job Started"
+echo "Time: $(date)"
+echo "Install Mode: $INSTALL_MODE"
+echo "=========================================="
+
+# Step 1: Collect system information (no dependencies)
+echo ""
+echo "Step 1: Collecting system information..."
+cd "$WORK_DIR"
+python3 "$WORK_DIR/simple_collect.py" "$OUTPUT_PATH" 2>&1
+echo "System information collection completed"
+
+# Step 2: Handle different installation modes
+echo ""
+echo "Step 2: Installing TensorRT-LLM..."
+if [ "$INSTALL_MODE" = "none" ]; then
+    echo "Using built-in TensorRT-LLM, skipping installation"
+
+elif [ "$INSTALL_MODE" = "wheel" ]; then
+    echo "Installing TensorRT-LLM wheel..."
+    echo "Wheel path pattern: $WHEEL_PATH"
+
+    # Expand wildcard and install
+    for wheel_file in $WHEEL_PATH; do
+        if [ -f "$wheel_file" ]; then
+            echo "Found wheel: $wheel_file"
+            pip3 install "$wheel_file" 2>&1 || echo "Wheel install failed, continuing..."
+            break
+        fi
+    done
+    echo "Wheel installation completed"
+
+elif [ "$INSTALL_MODE" = "source" ]; then
+    echo "Installing TensorRT-LLM from source..."
+    cd "$REPO_DIR"
+    pip3 install -e . 2>&1 || echo "Source install failed, continuing..."
+    echo "Source installation completed"
+
+else
+    echo "ERROR: Invalid install mode: $INSTALL_MODE"
+    exit 1
+fi
+
+# Step 3: Collect TensorRT-LLM version information
+echo ""
+echo "Step 3: Collecting TensorRT-LLM version information..."
+VERSION_FILE="$OUTPUT_PATH/trtllm_version.txt"
+python3 -c "import tensorrt_llm; print(f'[TensorRT-LLM] TensorRT-LLM version: {tensorrt_llm.__version__}')" > "$VERSION_FILE" 2>&1 || echo "[TensorRT-LLM] TensorRT-LLM version: unknown" > "$VERSION_FILE"
+echo "TensorRT-LLM version written to: $VERSION_FILE"
+
+echo ""
+echo "=========================================="
+echo "Session Collect Job Completed"
+echo "Time: $(date)"
+echo "=========================================="
+
+exit 0
+

From 10d040dcee85e85012f92b57739af1c4f5464e50 Mon Sep 17 00:00:00 2001
From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
Date: Tue, 30 Dec 2025 01:54:27 +0000
Subject: [PATCH 09/13] expand the logic of slurm extra args

Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
---
 .../defs/perf/disagg/execution/executor.py    | 13 +-----
 .../defs/perf/disagg/utils/common.py          | 44 ++++++++++++++-----
 2 files changed, 35 insertions(+), 22 deletions(-)

diff --git a/tests/integration/defs/perf/disagg/execution/executor.py b/tests/integration/defs/perf/disagg/execution/executor.py
index 515dbf55eb2..a3e961c9c0a 100644
--- a/tests/integration/defs/perf/disagg/execution/executor.py
+++ b/tests/integration/defs/perf/disagg/execution/executor.py
@@ -9,7 +9,6 @@
 import yaml
 from reporting.report import LogParser, LogWriter, ResultSaver
 from utils.common import (
-    GPU_RESOURCE_CONFIG,
     EnvManager,
     extract_config_fields,
 )
@@ -88,12 +87,6 @@ def submit_shell_job(
                 f"{bash_command}"
             )
 
-            # Build sbatch command with all parameters
-            gpu_type = EnvManager.get_gpu_type()
-            gpu_config = GPU_RESOURCE_CONFIG.get(gpu_type)
-            if not gpu_config:
-                raise ValueError(f"GPU resource configuration not found for {gpu_type}")
-
             # Convert timeout to HH:MM:SS format
             hours = timeout // 3600
             minutes = (timeout % 3600) // 60
@@ -112,11 +105,7 @@ def submit_shell_job(
                 "--parsable",  # Easier job ID parsing
             ]
 
-            # Conditionally add gres parameter based on GPU configuration
-            if gpu_config["gres_gpu"] is not None:
-                sbatch_args.append(f"--gres=gpu:{gpu_config['gres_gpu']}")
-
-            # Add extra SLURM arguments if configured
+            # Add extra SLURM arguments (including --gres from GPU_RESOURCE_CONFIG)
             slurm_extra_args = EnvManager.get_slurm_extra_args()
             if slurm_extra_args:
                 sbatch_args.append(slurm_extra_args)
diff --git a/tests/integration/defs/perf/disagg/utils/common.py b/tests/integration/defs/perf/disagg/utils/common.py
index 4ba9b812401..9bd3a14b5a7 100644
--- a/tests/integration/defs/perf/disagg/utils/common.py
+++ b/tests/integration/defs/perf/disagg/utils/common.py
@@ -3,35 +3,40 @@
 import os
 
 # GPU resource configuration
-# Simplified - only fields actually used in the codebase
+# Centralized configuration for all GPU-specific parameters
 GPU_RESOURCE_CONFIG = {
     # OCI GB200
     "GB200": {
-        "gres_gpu": 4,  # srun --gres parameter (None = not required)
+        "slurm_extra_args": "--gres=gpu:4",  # SLURM extra arguments (empty string if not required)
+        "set_segment": True,
         "lock_freq_graphics_mhz": 2062,  # GPU graphics clock lock frequency (MHz)
         "lock_freq_memory_mhz": 3996,  # GPU memory clock lock frequency (MHz)
     },
     # OCI GB300
     "GB300": {
-        "gres_gpu": None,  # GB300 does not require gres
+        "slurm_extra_args": "",  # GB300 does not require extra args
+        "set_segment": True,
         "lock_freq_graphics_mhz": None,  # TODO: Set GB300 lock frequency
         "lock_freq_memory_mhz": None,
     },
     # H100
     "H100": {
-        "gres_gpu": None,  # H100 does not require gres
+        "slurm_extra_args": "",  # H100 does not require extra args
+        "set_segment": False,
         "lock_freq_graphics_mhz": None,  # TODO: Set H100 lock frequency
         "lock_freq_memory_mhz": None,
     },
     # B200
     "B200": {
-        "gres_gpu": 4,
+        "slurm_extra_args": "--gres=gpu:4",
+        "set_segment": False,
         "lock_freq_graphics_mhz": None,  # TODO: Set B200 lock frequency
         "lock_freq_memory_mhz": None,
     },
     # B300
     "B300": {
-        "gres_gpu": 4,
+        "slurm_extra_args": "--gres=gpu:4",
+        "set_segment": False,
         "lock_freq_graphics_mhz": None,  # TODO: Set B300 lock frequency
         "lock_freq_memory_mhz": None,
     },
@@ -59,15 +64,34 @@ def get_slurm_job_name() -> str:
 
     @staticmethod
     def get_slurm_set_segment() -> bool:
+        """Get whether to use SLURM segment parameter based on GPU type.
+        
+        Returns:
+            bool: True if GPU type requires --segment parameter, False otherwise
+        """
         gpu_type = EnvManager.get_gpu_type()
-        gpu_type_support_segment = {"GB200": True, "GB300": True}
-        return gpu_type_support_segment.get(gpu_type, False)
+        gpu_config = GPU_RESOURCE_CONFIG.get(gpu_type, {})
+        return gpu_config.get("set_segment", False)
 
     @staticmethod
     def get_slurm_extra_args() -> str:
+        """Get SLURM extra arguments based on GPU configuration.
+        
+        Returns extra SLURM arguments from GPU_RESOURCE_CONFIG.
+        This allows flexible configuration of GPU-specific SLURM parameters
+        like --gres, --constraint, etc.
+        
+        Returns:
+            str: Extra SLURM arguments (e.g., "--gres=gpu:4" or "")
+        
+        Examples:
+            GB200: "--gres=gpu:4"
+            GB300: ""
+            Custom: "--gres=gpu:4 --constraint=v100"
+        """
         gpu_type = EnvManager.get_gpu_type()
-        gpu_type_support_extra_args = {"GB200": "--gres=gpu:4", "GB300": ""}
-        return gpu_type_support_extra_args.get(gpu_type, "")
+        gpu_config = GPU_RESOURCE_CONFIG.get(gpu_type, {})
+        return gpu_config.get("slurm_extra_args", "")
 
     @staticmethod
     def get_container_image() -> str:

From 01617baf7fd20957a6658129fa7d486d4d29a80c Mon Sep 17 00:00:00 2001
From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
Date: Wed, 31 Dec 2025 05:36:44 +0000
Subject: [PATCH 10/13] fx

Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
---
 .../defs/perf/disagg/compare_backends.py      |  8 ++---
 .../defs/perf/disagg/execution/executor.py    | 35 +++++++++----------
 .../defs/perf/disagg/session_collect.sh       |  1 -
 .../defs/perf/disagg/simple_collect.py        |  1 -
 .../test_configs/disagg/stress/README.md      |  1 -
 .../defs/perf/disagg/test_disagg.py           | 12 +++----
 .../defs/perf/disagg/utils/common.py          |  8 ++---
 .../defs/perf/disagg/utils/config_loader.py   | 27 +++++++++-----
 .../defs/perf/disagg/utils/trackers.py        |  8 ++---
 9 files changed, 51 insertions(+), 50 deletions(-)

diff --git a/tests/integration/defs/perf/disagg/compare_backends.py b/tests/integration/defs/perf/disagg/compare_backends.py
index 8ff9c1d3631..46c2223fa3a 100644
--- a/tests/integration/defs/perf/disagg/compare_backends.py
+++ b/tests/integration/defs/perf/disagg/compare_backends.py
@@ -14,7 +14,7 @@ def extract_backend(test_name):
 
     New format: ccb-NIXL or ccb-UCX or ccb-DEFAULT
     Example: disagg_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL
-    
+
     Note: "DEFAULT" is a special marker that represents the default backend
     """
     match = re.search(r"ccb-(\w+)", test_name)
@@ -158,7 +158,7 @@ def compare_backends(csv_path, threshold=5.0, default_backend="NIXL"):
         )
 
     # Print statistics
-    print(f"\n=== Backend Comparison Statistics ===")
+    print("\n=== Backend Comparison Statistics ===")
     print(f"Default backend: {default_backend}")
     print(f"Comparison pairs: {comparison_pairs}")
     print(f"Single-backend cases (skipped): {single_backend_skipped}")
@@ -166,8 +166,8 @@ def compare_backends(csv_path, threshold=5.0, default_backend="NIXL"):
 
     # If no comparison pairs found, exit with success
     if comparison_pairs == 0:
-        print(f"\nInfo: No backend comparison pairs found in disagg_perf tests")
-        print(f"All cases are single-backend only, no comparison needed")
+        print("\nInfo: No backend comparison pairs found in disagg_perf tests")
+        print("All cases are single-backend only, no comparison needed")
         sys.exit(0)
 
     # Convert to DataFrame
diff --git a/tests/integration/defs/perf/disagg/execution/executor.py b/tests/integration/defs/perf/disagg/execution/executor.py
index a3e961c9c0a..d454765c536 100644
--- a/tests/integration/defs/perf/disagg/execution/executor.py
+++ b/tests/integration/defs/perf/disagg/execution/executor.py
@@ -8,15 +8,11 @@
 
 import yaml
 from reporting.report import LogParser, LogWriter, ResultSaver
-from utils.common import (
-    EnvManager,
-    extract_config_fields,
-)
+from utils.common import EnvManager
 from utils.logger import logger
 
 from execution.subprocess_utils import exec_cmd, exec_cmd_with_output
 
-
 # ============================================================================
 # Job Manager
 # ============================================================================
@@ -36,7 +32,7 @@ def submit_shell_job(
         script_args: list[str] = None,
         output_log_file: str = None,
         timeout: int = 7200,
-        container_name: str = None
+        container_name: str = None,
     ) -> tuple[bool, str]:
         """Submit a generic shell script job using sbatch --wrap.
 
@@ -75,7 +71,7 @@ def submit_shell_job(
             # Build the bash command with script and arguments
             # Quote the script path and each argument separately
             quoted_script = f'"{script_path}"'
-            quoted_args = ' '.join(f'"{arg}"' for arg in script_args)
+            quoted_args = " ".join(f'"{arg}"' for arg in script_args)
             bash_command = f"bash {quoted_script} {quoted_args}".strip()
 
             # Build complete srun command (runs inside sbatch)
@@ -140,6 +136,7 @@ def submit_shell_job(
         except Exception as e:
             logger.error(f"Failed to submit job '{job_name}': {e}")
             import traceback
+
             logger.debug(traceback.format_exc())
             return False, str(e)
 
@@ -183,12 +180,13 @@ def submit_session_collect_job() -> tuple[bool, str]:
                 script_args=script_args,
                 output_log_file=f"{output_path}/session_collect.log",
                 timeout=7200,  # 2 hours
-                container_name="session-collect"
+                container_name="session-collect",
             )
 
         except Exception as e:
             logger.error(f"Failed to prepare session collect job: {e}")
             import traceback
+
             logger.debug(traceback.format_exc())
             return False, str(e)
 
@@ -232,7 +230,7 @@ def submit_test_job(test_config) -> tuple:
 
             # Call submit.py with the temporary config file
             submit_script = os.path.join(EnvManager.get_script_dir(), "submit.py")
-           
+
             case_log_dir = JobManager.get_result_dir(test_config)
 
             cmd = ["python3", submit_script, "-c", temp_config_path, "--log-dir", case_log_dir]
@@ -292,22 +290,22 @@ def backup_logs(
 
         try:
             final_dir = result_dir
-            
+
             # For FAILED cases, rename directory to add _ERROR suffix
             if not is_passed:
                 error_dir = f"{result_dir}_ERROR"
                 logger.info(f"Renaming failed case directory: {result_dir} -> {error_dir}")
-                
+
                 # Remove old error directory if exists
                 if os.path.exists(error_dir):
                     logger.warning(f"Removing existing error directory: {error_dir}")
                     shutil.rmtree(error_dir)
-                
+
                 # Rename to add _ERROR suffix
                 shutil.move(result_dir, error_dir)
                 final_dir = error_dir
                 logger.success(f"Directory renamed to: {final_dir}")
-            
+
             # Copy temporary config file to the directory
             temp_config_path = test_config.temp_config_path
             if os.path.exists(temp_config_path):
@@ -811,23 +809,23 @@ def _check_job_result(
                 timestamps=timestamps,
                 test_name=test_name,
             )
-            
+
             # If perf check failed, return immediately
             if not perf_result.get("success", False):
                 return perf_result
-            
+
             # Then check accuracy if accuracy_config is provided
             if accuracy_config:
                 # Use metrics config from accuracy_config (defaults to _COMMON_ACCURACY_METRICS)
                 accuracy_metrics = accuracy_config.get_metrics_config()
-                
+
                 accuracy_result = JobManager._check_accuracy_result(
                     job_id=job_id,
                     metrics_config=accuracy_metrics,
                     accuracy_config=accuracy_config,
                     result_dir=result_dir,
                 )
-                
+
                 # If accuracy check failed, merge results and return
                 if not accuracy_result.get("success", False):
                     return {
@@ -836,7 +834,7 @@ def _check_job_result(
                         "accuracy_result": accuracy_result,
                         "error": f"Perf passed but accuracy failed: {accuracy_result.get('error', 'Unknown')}",
                     }
-                
+
                 # Both passed, merge results
                 return {
                     **perf_result,
@@ -858,4 +856,3 @@ def _check_job_result(
                 timestamps=timestamps,
                 test_name=test_name,
             )
-
diff --git a/tests/integration/defs/perf/disagg/session_collect.sh b/tests/integration/defs/perf/disagg/session_collect.sh
index cbc7c775036..30cd3c4c1d4 100644
--- a/tests/integration/defs/perf/disagg/session_collect.sh
+++ b/tests/integration/defs/perf/disagg/session_collect.sh
@@ -67,4 +67,3 @@ echo "Time: $(date)"
 echo "=========================================="
 
 exit 0
-
diff --git a/tests/integration/defs/perf/disagg/simple_collect.py b/tests/integration/defs/perf/disagg/simple_collect.py
index 5a3b017a604..118759e9b7d 100644
--- a/tests/integration/defs/perf/disagg/simple_collect.py
+++ b/tests/integration/defs/perf/disagg/simple_collect.py
@@ -17,7 +17,6 @@
 import socket
 import subprocess
 import sys
-import time
 from collections import OrderedDict
 from datetime import datetime
 
diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/stress/README.md b/tests/integration/defs/perf/disagg/test_configs/disagg/stress/README.md
index 528c8e33e8b..ed440ef0409 100644
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/stress/README.md
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/stress/README.md
@@ -477,4 +477,3 @@ For issues or questions:
 2. Review configuration against this README
 3. Compare with `EXAMPLE_deepseek-r1-fp4_1k1k_stress_gsm8k.yaml`
 4. Contact your team's test infrastructure maintainer
-
diff --git a/tests/integration/defs/perf/disagg/test_disagg.py b/tests/integration/defs/perf/disagg/test_disagg.py
index 4c6391b205f..39008ca11a1 100644
--- a/tests/integration/defs/perf/disagg/test_disagg.py
+++ b/tests/integration/defs/perf/disagg/test_disagg.py
@@ -218,8 +218,8 @@ def test_accuracy(self, request, test_config: TestConfig):
     @pytest.mark.parametrize("test_config", STRESS_TEST_CASES)
     def test_stress(self, request, test_config: TestConfig):
         """Stress test combining performance benchmarks and accuracy validation.
-        
-        This test type is designed for stress testing scenarios where both 
+
+        This test type is designed for stress testing scenarios where both
         performance metrics (CSV output) and accuracy (e.g., GSM8K) need to be validated.
         """
         full_test_name = request.node.name
@@ -249,12 +249,12 @@ def test_stress(self, request, test_config: TestConfig):
             logger.info(f"Category: {test_config.test_category}")
             logger.info(f"Model: {test_config.model_name}")
             logger.info(f"Benchmark: {test_config.benchmark_type}")
-            
+
             # Log accuracy datasets if configured
             if test_config.accuracy_config:
                 dataset_names = test_config.accuracy_config.get_all_dataset_names()
                 logger.info(f"Accuracy Datasets: {', '.join(dataset_names)}")
-            
+
             logger.info(f"Metrics log: {test_config.metrics_config.log_file}")
             logger.info(f"Supported GPUs: {', '.join(test_config.supported_gpus)}")
             logger.info(f"{'=' * 60}")
@@ -283,9 +283,7 @@ def test_stress(self, request, test_config: TestConfig):
 
             # Check results - this will handle both perf CSV writing AND accuracy validation
             result = JobManager.check_result(job_id, test_config, timestamps, full_test_name)
-            assert result["success"], (
-                f"Stress test failed: {result.get('error', 'Unknown error')}"
-            )
+            assert result["success"], f"Stress test failed: {result.get('error', 'Unknown error')}"
 
         except Exception as e:
             test_tracker.end_test_case()
diff --git a/tests/integration/defs/perf/disagg/utils/common.py b/tests/integration/defs/perf/disagg/utils/common.py
index 9bd3a14b5a7..55d08875b52 100644
--- a/tests/integration/defs/perf/disagg/utils/common.py
+++ b/tests/integration/defs/perf/disagg/utils/common.py
@@ -65,7 +65,7 @@ def get_slurm_job_name() -> str:
     @staticmethod
     def get_slurm_set_segment() -> bool:
         """Get whether to use SLURM segment parameter based on GPU type.
-        
+
         Returns:
             bool: True if GPU type requires --segment parameter, False otherwise
         """
@@ -76,14 +76,14 @@ def get_slurm_set_segment() -> bool:
     @staticmethod
     def get_slurm_extra_args() -> str:
         """Get SLURM extra arguments based on GPU configuration.
-        
+
         Returns extra SLURM arguments from GPU_RESOURCE_CONFIG.
         This allows flexible configuration of GPU-specific SLURM parameters
         like --gres, --constraint, etc.
-        
+
         Returns:
             str: Extra SLURM arguments (e.g., "--gres=gpu:4" or "")
-        
+
         Examples:
             GB200: "--gres=gpu:4"
             GB300: ""
diff --git a/tests/integration/defs/perf/disagg/utils/config_loader.py b/tests/integration/defs/perf/disagg/utils/config_loader.py
index 07531a816ed..567834d6a77 100644
--- a/tests/integration/defs/perf/disagg/utils/config_loader.py
+++ b/tests/integration/defs/perf/disagg/utils/config_loader.py
@@ -45,7 +45,9 @@ class AccuracyConfig:
     """Accuracy test configuration (supports multiple datasets)."""
 
     datasets: List[DatasetThreshold]  # List of dataset threshold configurations
-    metrics: Optional[MetricsConfig] = None  # Optional custom metrics config (defaults to _COMMON_ACCURACY_METRICS)
+    metrics: Optional[MetricsConfig] = (
+        None  # Optional custom metrics config (defaults to _COMMON_ACCURACY_METRICS)
+    )
 
     def get_dataset_config(self, dataset_name: str) -> Optional[DatasetThreshold]:
         """Get configuration by dataset name.
@@ -68,10 +70,10 @@ def get_all_dataset_names(self) -> List[str]:
             List of dataset names
         """
         return [ds.dataset_name for ds in self.datasets]
-    
+
     def get_metrics_config(self) -> MetricsConfig:
         """Get metrics configuration for accuracy parsing.
-        
+
         Returns:
             Custom metrics config if provided, otherwise _COMMON_ACCURACY_METRICS
         """
@@ -398,20 +400,27 @@ def _load_config_file(self, yaml_path: Path, test_type: str, test_category: str)
                             higher_is_better=higher_is_better,
                         )
                     )
-                
+
                 # Check if custom accuracy metrics are provided
                 custom_metrics = None
                 if "metrics" in acc_meta:
                     metrics_override = acc_meta["metrics"]
                     custom_metrics = MetricsConfig(
                         log_file=metrics_override.get("log_file", "7_accuracy_eval.log"),
-                        extractor_pattern=metrics_override.get("extractor_pattern", r"\|([a-zA-Z0-9_-]+)\|.*?\|([\w-]+)\|.*?\|exact_match\|.*?\|([0-9.]+)\|"),
-                        metric_names=metrics_override.get("metric_names", ["flexible-extract", "strict-match"]),
+                        extractor_pattern=metrics_override.get(
+                            "extractor_pattern",
+                            r"\|([a-zA-Z0-9_-]+)\|.*?\|([\w-]+)\|.*?\|exact_match\|.*?\|([0-9.]+)\|",
+                        ),
+                        metric_names=metrics_override.get(
+                            "metric_names", ["flexible-extract", "strict-match"]
+                        ),
                     )
-                    logger.info(f"Using custom accuracy metrics config from YAML")
-                
+                    logger.info("Using custom accuracy metrics config from YAML")
+
                 accuracy_config = AccuracyConfig(datasets=datasets, metrics=custom_metrics)
-                logger.info(f"Loaded accuracy config with {len(datasets)} dataset(s) for {test_category} test")
+                logger.info(
+                    f"Loaded accuracy config with {len(datasets)} dataset(s) for {test_category} test"
+                )
 
         return TestConfig(
             config_path=str(yaml_path),
diff --git a/tests/integration/defs/perf/disagg/utils/trackers.py b/tests/integration/defs/perf/disagg/utils/trackers.py
index 3ffcebd55fb..acee8d7fd68 100644
--- a/tests/integration/defs/perf/disagg/utils/trackers.py
+++ b/tests/integration/defs/perf/disagg/utils/trackers.py
@@ -75,7 +75,7 @@ def start(self):
 
     def end_and_collect(self):
         """Record end time and trigger session collection.
-        
+
         Uses the new sbatch-based approach for non-blocking execution.
         Submits the job and waits for completion using JobManager.
         """
@@ -84,7 +84,7 @@ def end_and_collect(self):
 
         # Submit session collect job (non-blocking sbatch)
         success, job_id = JobManager.submit_session_collect_job()
-        
+
         if not success:
             logger.error(f"Failed to submit session collect job: {job_id}")
             return False
@@ -95,13 +95,13 @@ def end_and_collect(self):
             job_id=job_id,
             timeout=7200,  # 2 hours
             test_config=None,  # No test config for session collect
-            check_early_failure=False  # Don't check early failures
+            check_early_failure=False,  # Don't check early failures
         )
 
         # Check if log file was created (indicates success)
         output_path = EnvManager.get_output_path()
         log_file = os.path.join(output_path, "session_collect.log")
-        
+
         if os.path.exists(log_file):
             # Update timestamps in CSV
             self._update_csv_timestamps()

From 1596f71e5662920bbb7b37d01886550c51cb1e19 Mon Sep 17 00:00:00 2001
From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
Date: Wed, 31 Dec 2025 12:29:16 +0000
Subject: [PATCH 11/13] add batch job support batch manager

Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
---
 tests/integration/defs/perf/disagg/README.md  | 145 +++++++++++++-
 .../integration/defs/perf/disagg/conftest.py  | 187 +++++++++++++++++-
 .../defs/perf/disagg/test_disagg.py           |  39 ++--
 3 files changed, 343 insertions(+), 28 deletions(-)

diff --git a/tests/integration/defs/perf/disagg/README.md b/tests/integration/defs/perf/disagg/README.md
index 28ba839c6e7..5921900b707 100644
--- a/tests/integration/defs/perf/disagg/README.md
+++ b/tests/integration/defs/perf/disagg/README.md
@@ -132,6 +132,141 @@ poetry run pytest --disagg test_disagg.py -s -vv -m accuracy
 poetry run pytest --disagg test_disagg.py -s -vv -k "deepseek-r1-fp4_1k1k"
 ```
 
+## Batch Job Submission
+
+The framework supports automatic batch job submission to maximize parallelism in SLURM cluster environments. Instead of submitting jobs one-by-one, it groups test cases into batches and submits entire batches when needed.
+
+### Quick Start
+
+**Default batch size (5 jobs per batch):**
+```bash
+# Run all tests with default batching
+poetry run pytest --disagg test_disagg.py -s -vv
+
+# Run with test list
+poetry run pytest --disagg test_disagg.py -s -vv --disagg-test-list=./testlist/all.txt
+```
+
+**Custom batch size:**
+```bash
+# Set batch size via command line
+poetry run pytest --disagg test_disagg.py -s -vv --disagg-batch-size=10
+
+# Set batch size via environment variable
+export DISAGG_BATCH_SIZE=20
+poetry run pytest --disagg test_disagg.py -s -vv
+
+# Submit all jobs at once (unlimited batch)
+poetry run pytest --disagg test_disagg.py -s -vv --disagg-batch-size=0
+```
+
+### How Batch Submission Works
+
+```
+Pytest Collection Phase:
+  - Collects all test cases (e.g., 100 tests)
+  - BatchManager splits them into batches (e.g., 20 batches of 5)
+
+Pytest Execution Phase:
+  Test 0 runs:
+    -> Triggers submission of Batch 0 (jobs 0-4)
+    -> Waits for job 0 to complete
+  
+  Tests 1-4 run:
+    -> Batch 0 already submitted; each test waits only for its own job to complete
+  
+  Test 5 runs:
+    -> Triggers submission of Batch 1 (jobs 5-9)
+    -> Waits for job 5 to complete
+  
+  ... and so on
+```
+
+### Key Benefits
+
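+- **Parallel Execution**: All jobs in a batch are submitted together and can run simultaneously on the SLURM cluster (subject to available resources)
+- **Reduced Wait Time**: Time per batch ≈ MAX(job time) instead of SUM(job times); batches themselves are submitted one after another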
+- **Automatic Management**: No need to manually split test lists
+- **Lazy Loading**: Only submits batches when needed
+
+### Configuration Options
+
+**Priority**: Command line option > Environment variable > Default (5)
+
+**Examples:**
+
+```bash
+# Small batch for quick testing
+poetry run pytest --disagg test_disagg.py -s -vv --disagg-batch-size=3 \
+  --disagg-test-list=./testlist/debug.txt
+
+# Large batch for production
+poetry run pytest --disagg test_disagg.py -s -vv --disagg-batch-size=50 \
+  --disagg-test-list=./testlist/all.txt
+
+# Submit all at once
+poetry run pytest --disagg test_disagg.py -s -vv --disagg-batch-size=0
+```
+
+### Timeout Configuration
+
+The timeout for waiting for each job to complete is **10 hours (36000 seconds)**, currently hard-coded in `test_disagg.py`. It accounts for:
+- SLURM queue wait time
+- Job execution time
+- Buffer for delays
+
+### Performance Comparison
+
+**Before (Sequential Submission):**
+```
+Case 1: submit + wait (1.5h) = 1.5h
+Case 2: submit + wait (1.5h) = 1.5h
+Case 3: submit + wait (1.5h) = 1.5h
+...
+Total: 50 × 1.5h = 75 hours
+```
+
+**After (Batch Submission, batch_size=50):**
+```
+Batch 0 (50 jobs): submitted in parallel
+  Case 1: wait (1.5h)
+  Case 2-50: wait (0s, already done)
+
+Total: ~1.5 hours
+```
+
+**Speedup: up to 50x** (best case, assuming the cluster can run all 50 jobs concurrently)
+
+### Troubleshooting
+
+**Check BatchManager initialization:**
+```
+======================================================================
+Batch Manager Initialized
+Batch size: 5 jobs per batch
+======================================================================
+
+Total test configs: 20
+Total batches: 4
+```
+
+**Monitor batch submission:**
+```
+======================================================================
+Submitting Batch 0
+Range: [0:5] (5 jobs)
+======================================================================
+
+  [  1/5] Job 1234 <- test_config_id_1
+  [  2/5] Job 1235 <- test_config_id_2
+  ...
+```
+
+**If jobs time out frequently:**
+- Check SLURM queue status
+- Consider reducing batch size to avoid resource contention
+- Verify that timeout (36000s) is sufficient for your workload
+
 ## Test Naming Convention
 
 Tests are automatically named using the format:
@@ -193,6 +328,7 @@ Test results are saved to:
 - `GPU_TYPE`: Current GPU type (default: GB200)
 - `OUTPUT_PATH`: Directory for test results and logs
 - `WORK_DIR`: Working directory for benchmark execution
+- `DISAGG_BATCH_SIZE`: Batch size for job submission (default: 5; 0 submits all jobs at once; overridden by `--disagg-batch-size`)
 - `DEBUG_MODE`: Enable debug mode (set to "1" to skip job submission)
 - `DEBUG_JOB_ID`: Job ID to use in debug mode
 
@@ -212,10 +348,11 @@ The framework consists of:
 
 1. **ConfigLoader**: Scans and loads YAML configurations
 2. **ConfigValidator**: Validates configuration correctness
-3. **JobManager**: Handles SLURM job submission and monitoring
-4. **LogParser**: Extracts metrics from benchmark logs
-5. **TestCaseTracker**: Tracks test execution timing
-6. **ResultSaver**: Saves results to CSV
+3. **BatchManager**: Manages batch job submission for parallel execution
+4. **JobManager**: Handles SLURM job submission and monitoring
+5. **LogParser**: Extracts metrics from benchmark logs
+6. **TestCaseTracker**: Tracks test execution timing
+7. **ResultSaver**: Saves results to CSV
 
 ## Benefits
 
diff --git a/tests/integration/defs/perf/disagg/conftest.py b/tests/integration/defs/perf/disagg/conftest.py
index 2dabeda1cd9..1ff436e38d5 100644
--- a/tests/integration/defs/perf/disagg/conftest.py
+++ b/tests/integration/defs/perf/disagg/conftest.py
@@ -1,9 +1,10 @@
 """Pytest configuration for disagg tests.
 
 Only collects tests in this directory when --disagg parameter is provided.
-Can share options like --disagg-test-list defined in this conftest.py.
+Provides batch job submission capability to improve parallelism.
 """
 
+import os
 import pytest
 from utils.logger import logger
 
@@ -23,6 +24,15 @@ def pytest_addoption(parser):
         help="Path to a file containing test IDs (one per line) to run. "
         "Example: pytest --disagg --disagg-test-list=testlist/testlist_gb200.txt",
     )
+    parser.addoption(
+        "--disagg-batch-size",
+        action="store",
+        type=int,
+        default=None,
+        help="Number of jobs to submit per batch. Default: from env DISAGG_BATCH_SIZE or 5. "
+        "Set to 0 for unlimited (submit all at once). "
+        "Example: pytest --disagg --disagg-batch-size=10",
+    )
 
 
 def pytest_collect_directory(path, parent):
@@ -45,7 +55,6 @@ def pytest_collect_directory(path, parent):
         return True
 
     # With --disagg parameter, proceed with normal collection
-    # Can subsequently use --disagg-test-list and other options from main conftest.py for filtering
     return None
 
 
@@ -88,7 +97,7 @@ def pytest_collection_modifyitems(config, items):
 
     for item in items:
         # item.nodeid is the full test identifier like:
-        # "test_disagg_simple.py::TestDisaggBenchmark::test_benchmark[deepseek-r1-fp4:1k1k:...]"
+        # "test_disagg.py::TestDisaggBenchmark::test_benchmark[deepseek-r1-fp4:1k1k:...]"
         if item.nodeid in wanted_tests:
             selected.append(item)
         else:
@@ -112,3 +121,175 @@ def pytest_collection_modifyitems(config, items):
         logger.warning(f"Please check that the test IDs in {test_list_file} are correct.")
 
     logger.info(f"{'=' * 70}\n")
+
+
+class BatchManager:
+    """Batch job submission manager for disagg tests.
+    
+    Automatically splits test cases into batches and submits them on-demand
+    to maximize parallelism in SLURM cluster environments.
+    
+    Key features:
+    - Lazy batch submission: only submits when needed
+    - Configurable batch size via CLI or environment variable
+    - Maintains job_id mapping for all submitted jobs
+    """
+    
+    def __init__(self, batch_size=5):
+        """Initialize batch manager.
+        
+        Args:
+            batch_size: Number of jobs per batch. None, 0, or a negative value means
+                       unlimited (submit all at once). Default is 5 if not specified.
+        """
+        # Normalize batch_size: None, 0, or negative means unlimited
+        if batch_size is None or batch_size <= 0:
+            self.batch_size = None
+        else:
+            self.batch_size = batch_size
+        
+        self.submitted_batches = set()  # Track which batch numbers have been submitted
+        self.job_mapping = {}  # Map test_id -> SLURM job_id
+        self.all_configs = []  # Ordered list of all test configs
+        
+        logger.info(f"\n{'=' * 70}")
+        logger.info("Batch Manager Initialized")
+        if self.batch_size:
+            logger.info(f"Batch size: {self.batch_size} jobs per batch")
+        else:
+            logger.info("Batch size: unlimited (submit all at once)")
+        logger.info(f"{'=' * 70}\n")
+    
+    def add_config(self, test_config):
+        """Add a test configuration to the manager.
+        
+        Called during initialization to build the ordered list of configs.
+        
+        Args:
+            test_config: TestConfig object to add
+        """
+        self.all_configs.append(test_config)
+    
+    def get_job_id(self, test_config):
+        """Get SLURM job ID for a test config, submitting batch if needed.
+        
+        This is the main entry point. It:
+        1. Determines which batch the test belongs to
+        2. Submits the entire batch if not already submitted
+        3. Returns the job_id for this specific test
+        
+        Args:
+            test_config: TestConfig object to get job_id for
+            
+        Returns:
+            str: SLURM job ID, or None if submission failed
+        """
+        # Find the index of this config in the ordered list
+        try:
+            idx = next(i for i, c in enumerate(self.all_configs) 
+                      if c.test_id == test_config.test_id)
+        except StopIteration:
+            logger.error(f"Config not found in manager: {test_config.test_id}")
+            return None
+        
+        # Calculate which batch this test belongs to
+        if self.batch_size:
+            batch_num = idx // self.batch_size
+        else:
+            batch_num = 0  # All tests in one batch
+        
+        # Submit the batch if not already submitted
+        if batch_num not in self.submitted_batches:
+            self._submit_batch(batch_num)
+        
+        # Return the cached job_id
+        return self.job_mapping.get(test_config.test_id)
+    
+    def _submit_batch(self, batch_num):
+        """Submit all jobs in a specific batch.
+        
+        Args:
+            batch_num: Batch number to submit (0-indexed)
+        """
+        from execution.executor import JobManager
+        
+        # Calculate batch range
+        if self.batch_size:
+            start_idx = batch_num * self.batch_size
+            end_idx = min(start_idx + self.batch_size, len(self.all_configs))
+        else:
+            start_idx = 0
+            end_idx = len(self.all_configs)
+        
+        batch_configs = self.all_configs[start_idx:end_idx]
+        
+        logger.info(f"\n{'=' * 70}")
+        logger.info(f"Submitting Batch {batch_num}")
+        logger.info(f"Range: [{start_idx}:{end_idx}] ({len(batch_configs)} jobs)")
+        logger.info(f"{'=' * 70}\n")
+        
+        # Submit all jobs in this batch
+        success_count = 0
+        for i, config in enumerate(batch_configs, 1):
+            try:
+                success, job_id = JobManager.submit_test_job(config)
+                if success and job_id:
+                    self.job_mapping[config.test_id] = job_id
+                    success_count += 1
+                    # Truncate test_id for display
+                    display_id = config.test_id[:60] + "..." if len(config.test_id) > 60 else config.test_id
+                    logger.success(f"  [{i:3d}/{len(batch_configs)}] Job {job_id} <- {display_id}")
+                else:
+                    self.job_mapping[config.test_id] = None
+                    logger.error(f"  [{i:3d}/{len(batch_configs)}] Failed: {config.test_id[:50]}")
+            except Exception as e:
+                self.job_mapping[config.test_id] = None
+                logger.error(f"  [{i:3d}/{len(batch_configs)}] Error: {e}")
+        
+        # Mark batch as submitted
+        self.submitted_batches.add(batch_num)
+        
+        logger.info(f"\n{'=' * 70}")
+        logger.success(f"Batch {batch_num} Complete: {success_count}/{len(batch_configs)} succeeded")
+        logger.info(f"{'=' * 70}\n")
+
+
+@pytest.fixture(scope="session")
+def batch_manager(request):
+    """Provide batch manager fixture for test methods.
+    
+    This session-scoped fixture creates and initializes the BatchManager
+    with all collected test configs.
+    
+    Returns:
+        BatchManager: Initialized batch manager instance
+    """
+    # Get batch size from CLI option or environment variable
+    batch_size = request.config.getoption("--disagg-batch-size")
+    if batch_size is None:
+        env_batch_size = os.getenv("DISAGG_BATCH_SIZE")
+        if env_batch_size:
+            try:
+                batch_size = int(env_batch_size)
+            except ValueError:
+                logger.warning(f"Invalid DISAGG_BATCH_SIZE: {env_batch_size}, using default 5")
+                batch_size = 5
+        else:
+            batch_size = 5  # Default batch size
+    
+    # Create batch manager
+    manager = BatchManager(batch_size=batch_size)
+    
+    # Extract all test configs from collected items
+    for item in request.session.items:
+        if hasattr(item, 'callspec') and 'test_config' in item.callspec.params:
+            manager.add_config(item.callspec.params['test_config'])
+    
+    # Log statistics
+    logger.info(f"Total test configs: {len(manager.all_configs)}")
+    if manager.batch_size:
+        total_batches = (len(manager.all_configs) + manager.batch_size - 1) // manager.batch_size
+        logger.info(f"Total batches: {total_batches}")
+    logger.info("")
+    
+    return manager
diff --git a/tests/integration/defs/perf/disagg/test_disagg.py b/tests/integration/defs/perf/disagg/test_disagg.py
index 39008ca11a1..b60ba851967 100644
--- a/tests/integration/defs/perf/disagg/test_disagg.py
+++ b/tests/integration/defs/perf/disagg/test_disagg.py
@@ -62,7 +62,7 @@ class TestDisaggBenchmark:
 
     @pytest.mark.perf
     @pytest.mark.parametrize("test_config", PERF_TEST_CASES)
-    def test_benchmark(self, request, test_config: TestConfig):
+    def test_benchmark(self, request, batch_manager, test_config: TestConfig):
         """Performance benchmark test for YAML configurations."""
         full_test_name = request.node.name
 
@@ -101,15 +101,14 @@ def test_benchmark(self, request, test_config: TestConfig):
                 )
                 job_id = EnvManager.get_debug_job_id()
             else:
-                # Submit job using JobManager
-                success, job_id = JobManager.submit_test_job(test_config)
+                # Get job_id from batch manager (auto-submits batch if needed)
+                job_id = batch_manager.get_job_id(test_config)
 
                 # Validate submission result
-                assert success, f"Job submission failed: {test_config.test_id}"
-                assert job_id, "Unable to get job ID"
+                assert job_id, f"Failed to get job_id for {test_config.test_id}"
 
-                # Wait for completion (timeout/early failure handled inside)
-                JobManager.wait_for_completion(job_id, 7200, test_config, check_early_failure=True)
+                # Wait for completion (timeout: 10 hours = 36000 seconds)
+                JobManager.wait_for_completion(job_id, 36000, test_config, check_early_failure=True)
 
             # End tracking test case
             test_tracker.end_test_case()
@@ -136,7 +135,7 @@ def test_benchmark(self, request, test_config: TestConfig):
 
     @pytest.mark.accuracy
     @pytest.mark.parametrize("test_config", ACCURACY_TEST_CASES)
-    def test_accuracy(self, request, test_config: TestConfig):
+    def test_accuracy(self, request, batch_manager, test_config: TestConfig):
         """Accuracy test for YAML configurations."""
         full_test_name = request.node.name
 
@@ -179,15 +178,14 @@ def test_accuracy(self, request, test_config: TestConfig):
                 )
                 job_id = EnvManager.get_debug_job_id()
             else:
-                # Submit job using JobManager
-                success, job_id = JobManager.submit_test_job(test_config)
+                # Get job_id from batch manager (auto-submits batch if needed)
+                job_id = batch_manager.get_job_id(test_config)
 
                 # Validate submission result
-                assert success, f"Job submission failed: {test_config.test_id}"
-                assert job_id, "Unable to get job ID"
+                assert job_id, f"Failed to get job_id for {test_config.test_id}"
 
-                # Wait for completion (timeout/early failure handled inside)
-                JobManager.wait_for_completion(job_id, 10800, test_config, check_early_failure=True)
+                # Wait for completion (timeout: 10 hours = 36000 seconds)
+                JobManager.wait_for_completion(job_id, 36000, test_config, check_early_failure=True)
 
             # End tracking test case
             test_tracker.end_test_case()
@@ -216,7 +214,7 @@ def test_accuracy(self, request, test_config: TestConfig):
 
     @pytest.mark.stress
     @pytest.mark.parametrize("test_config", STRESS_TEST_CASES)
-    def test_stress(self, request, test_config: TestConfig):
+    def test_stress(self, request, batch_manager, test_config: TestConfig):
         """Stress test combining performance benchmarks and accuracy validation.
 
         This test type is designed for stress testing scenarios where both
@@ -265,15 +263,14 @@ def test_stress(self, request, test_config: TestConfig):
                 )
                 job_id = EnvManager.get_debug_job_id()
             else:
-                # Submit job using JobManager
-                success, job_id = JobManager.submit_test_job(test_config)
+                # Get job_id from batch manager (auto-submits batch if needed)
+                job_id = batch_manager.get_job_id(test_config)
 
                 # Validate submission result
-                assert success, f"Job submission failed: {test_config.test_id}"
-                assert job_id, "Unable to get job ID"
+                assert job_id, f"Failed to get job_id for {test_config.test_id}"
 
-                # Wait for completion (longer timeout for stress tests: 4 hours)
-                JobManager.wait_for_completion(job_id, 10800, test_config, check_early_failure=True)
+                # Wait for completion (timeout: 10 hours = 36000 seconds)
+                JobManager.wait_for_completion(job_id, 36000, test_config, check_early_failure=True)
 
             # End tracking test case
             test_tracker.end_test_case()

From b5406a9d7b290cc01462a15cdb76781074da4488 Mon Sep 17 00:00:00 2001
From: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
Date: Wed, 31 Dec 2025 12:55:16 +0000
Subject: [PATCH 12/13] fix: add check flag to exec_cmd_with_output and submit sbatch with check=False

Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
---
 .../defs/perf/disagg/execution/executor.py      |  4 +++-
 .../perf/disagg/execution/subprocess_utils.py   | 17 +++++++++++------
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/tests/integration/defs/perf/disagg/execution/executor.py b/tests/integration/defs/perf/disagg/execution/executor.py
index d454765c536..547b63aa8c4 100644
--- a/tests/integration/defs/perf/disagg/execution/executor.py
+++ b/tests/integration/defs/perf/disagg/execution/executor.py
@@ -114,7 +114,9 @@ def submit_shell_job(
             logger.debug(f"Script: {script_path}")
             logger.debug(f"Log file: {output_log_file}")
 
-            output = exec_cmd_with_output(sbatch_args, timeout=60)
+            # Use check=False to allow submission even with Kerberos warnings
+            # (mimics submit.py behavior)
+            output = exec_cmd_with_output(sbatch_args, timeout=60, check=False)
             job_id = output.strip()
 
             # Parse job ID (--parsable returns just the job ID)
diff --git a/tests/integration/defs/perf/disagg/execution/subprocess_utils.py b/tests/integration/defs/perf/disagg/execution/subprocess_utils.py
index 9ab77714267..27df7f829d1 100644
--- a/tests/integration/defs/perf/disagg/execution/subprocess_utils.py
+++ b/tests/integration/defs/perf/disagg/execution/subprocess_utils.py
@@ -33,19 +33,20 @@ def exec_cmd(*popenargs, timeout: Optional[float] = None, **kwargs) -> int:
     return result.returncode
 
 
-def exec_cmd_with_output(*popenargs, timeout: Optional[float] = None, **kwargs) -> str:
+def exec_cmd_with_output(*popenargs, timeout: Optional[float] = None, check: bool = True, **kwargs) -> str:
     """Execute command and return output as string.
 
     Args:
         *popenargs: Command and arguments
         timeout: Timeout in seconds
+        check: If True, raise CalledProcessError on non-zero exit code (default: True)
         **kwargs: Additional subprocess arguments
 
     Returns:
         stdout as string (decoded from bytes)
 
     Raises:
-        subprocess.CalledProcessError: If command returns non-zero exit code
+        subprocess.CalledProcessError: If check=True and command returns non-zero exit code
         subprocess.TimeoutExpired: If timeout is reached
     """
     result = subprocess.run(
@@ -53,11 +54,15 @@ def exec_cmd_with_output(*popenargs, timeout: Optional[float] = None, **kwargs)
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
         timeout=timeout,
-        check=True,
+        check=check,
         **kwargs,
     )
-    # Log stderr if it exists
+    # Log stderr if it exists (as warning if check=False, as error if check=True)
     if result.stderr:
-        stderr_output = result.stderr.decode()
-        logger.error(f"Command stderr: {stderr_output}")
+        stderr_output = result.stderr.decode().strip()
+        if stderr_output:
+            if check:
+                logger.error(f"Command stderr: {stderr_output}")
+            else:
+                logger.warning(f"Command stderr: {stderr_output}")
     return result.stdout.decode()

From 2259c581f9f9b397bbcac1ed7075cdea51caa847 Mon Sep 17 00:00:00 2001
From: yingguo-trt <244492186+yingguo-trt@users.noreply.github.com>
Date: Wed, 7 Jan 2026 17:15:13 +0800
Subject: [PATCH 13/13] fix pre-commit failures

Signed-off-by: yingguo-trt <244492186+yingguo-trt@users.noreply.github.com>
---
 .../integration/defs/perf/disagg/conftest.py  | 78 ++++++++++---------
 .../perf/disagg/execution/subprocess_utils.py |  4 +-
 2 files changed, 45 insertions(+), 37 deletions(-)

diff --git a/tests/integration/defs/perf/disagg/conftest.py b/tests/integration/defs/perf/disagg/conftest.py
index 1ff436e38d5..a4b88542dfd 100644
--- a/tests/integration/defs/perf/disagg/conftest.py
+++ b/tests/integration/defs/perf/disagg/conftest.py
@@ -5,6 +5,7 @@
 """
 
 import os
+
 import pytest
 from utils.logger import logger
 
@@ -125,19 +126,19 @@ def pytest_collection_modifyitems(config, items):
 
 class BatchManager:
     """Batch job submission manager for disagg tests.
-    
+
     Automatically splits test cases into batches and submits them on-demand
     to maximize parallelism in SLURM cluster environments.
-    
+
     Key features:
     - Lazy batch submission: only submits when needed
     - Configurable batch size via CLI or environment variable
     - Maintains job_id mapping for all submitted jobs
     """
-    
+
     def __init__(self, batch_size=5):
         """Initialize batch manager.
-        
+
         Args:
             batch_size: Number of jobs per batch. None or 0 means unlimited (submit all at once).
                        Default is 5 if not specified.
@@ -147,11 +148,11 @@ def __init__(self, batch_size=5):
             self.batch_size = None
         else:
             self.batch_size = batch_size
-        
+
         self.submitted_batches = set()  # Track which batch numbers have been submitted
         self.job_mapping = {}  # Map test_id -> SLURM job_id
         self.all_configs = []  # Ordered list of all test configs
-        
+
         logger.info(f"\n{'=' * 70}")
         logger.info("Batch Manager Initialized")
         if self.batch_size:
@@ -159,60 +160,61 @@ def __init__(self, batch_size=5):
         else:
             logger.info("Batch size: unlimited (submit all at once)")
         logger.info(f"{'=' * 70}\n")
-    
+
     def add_config(self, test_config):
         """Add a test configuration to the manager.
-        
+
         Called during initialization to build the ordered list of configs.
-        
+
         Args:
             test_config: TestConfig object to add
         """
         self.all_configs.append(test_config)
-    
+
     def get_job_id(self, test_config):
         """Get SLURM job ID for a test config, submitting batch if needed.
-        
+
         This is the main entry point. It:
         1. Determines which batch the test belongs to
         2. Submits the entire batch if not already submitted
         3. Returns the job_id for this specific test
-        
+
         Args:
             test_config: TestConfig object to get job_id for
-            
+
         Returns:
             str: SLURM job ID, or None if submission failed
         """
         # Find the index of this config in the ordered list
         try:
-            idx = next(i for i, c in enumerate(self.all_configs) 
-                      if c.test_id == test_config.test_id)
+            idx = next(
+                i for i, c in enumerate(self.all_configs) if c.test_id == test_config.test_id
+            )
         except StopIteration:
             logger.error(f"Config not found in manager: {test_config.test_id}")
             return None
-        
+
         # Calculate which batch this test belongs to
         if self.batch_size:
             batch_num = idx // self.batch_size
         else:
             batch_num = 0  # All tests in one batch
-        
+
         # Submit the batch if not already submitted
         if batch_num not in self.submitted_batches:
             self._submit_batch(batch_num)
-        
+
         # Return the cached job_id
         return self.job_mapping.get(test_config.test_id)
-    
+
     def _submit_batch(self, batch_num):
         """Submit all jobs in a specific batch.
-        
+
         Args:
             batch_num: Batch number to submit (0-indexed)
         """
         from execution.executor import JobManager
-        
+
         # Calculate batch range
         if self.batch_size:
             start_idx = batch_num * self.batch_size
@@ -220,14 +222,14 @@ def _submit_batch(self, batch_num):
         else:
             start_idx = 0
             end_idx = len(self.all_configs)
-        
+
         batch_configs = self.all_configs[start_idx:end_idx]
-        
+
         logger.info(f"\n{'=' * 70}")
         logger.info(f"Submitting Batch {batch_num}")
         logger.info(f"Range: [{start_idx}:{end_idx}] ({len(batch_configs)} jobs)")
         logger.info(f"{'=' * 70}\n")
-        
+
         # Submit all jobs in this batch
         success_count = 0
         for i, config in enumerate(batch_configs, 1):
@@ -237,7 +239,9 @@ def _submit_batch(self, batch_num):
                     self.job_mapping[config.test_id] = job_id
                     success_count += 1
                     # Truncate test_id for display
-                    display_id = config.test_id[:60] + "..." if len(config.test_id) > 60 else config.test_id
+                    display_id = (
+                        config.test_id[:60] + "..." if len(config.test_id) > 60 else config.test_id
+                    )
                     logger.success(f"  [{i:3d}/{len(batch_configs)}] Job {job_id} <- {display_id}")
                 else:
                     self.job_mapping[config.test_id] = None
@@ -245,22 +249,24 @@ def _submit_batch(self, batch_num):
             except Exception as e:
                 self.job_mapping[config.test_id] = None
                 logger.error(f"  [{i:3d}/{len(batch_configs)}] Error: {e}")
-        
+
         # Mark batch as submitted
         self.submitted_batches.add(batch_num)
-        
+
         logger.info(f"\n{'=' * 70}")
-        logger.success(f"Batch {batch_num} Complete: {success_count}/{len(batch_configs)} succeeded")
+        logger.success(
+            f"Batch {batch_num} Complete: {success_count}/{len(batch_configs)} succeeded"
+        )
         logger.info(f"{'=' * 70}\n")
 
 
 @pytest.fixture(scope="session")
 def batch_manager(request):
     """Provide batch manager fixture for test methods.
-    
+
     This session-scoped fixture creates and initializes the BatchManager
     with all collected test configs.
-    
+
     Returns:
         BatchManager: Initialized batch manager instance
     """
@@ -276,20 +282,20 @@ def batch_manager(request):
                 batch_size = 5
         else:
             batch_size = 5  # Default batch size
-    
+
     # Create batch manager
     manager = BatchManager(batch_size=batch_size)
-    
+
     # Extract all test configs from collected items
     for item in request.session.items:
-        if hasattr(item, 'callspec') and 'test_config' in item.callspec.params:
-            manager.add_config(item.callspec.params['test_config'])
-    
+        if hasattr(item, "callspec") and "test_config" in item.callspec.params:
+            manager.add_config(item.callspec.params["test_config"])
+
     # Log statistics
     logger.info(f"Total test configs: {len(manager.all_configs)}")
     if manager.batch_size:
         total_batches = (len(manager.all_configs) + manager.batch_size - 1) // manager.batch_size
         logger.info(f"Total batches: {total_batches}")
     logger.info("")
-    
+
     return manager
diff --git a/tests/integration/defs/perf/disagg/execution/subprocess_utils.py b/tests/integration/defs/perf/disagg/execution/subprocess_utils.py
index 27df7f829d1..39a3f0ac4b9 100644
--- a/tests/integration/defs/perf/disagg/execution/subprocess_utils.py
+++ b/tests/integration/defs/perf/disagg/execution/subprocess_utils.py
@@ -33,7 +33,9 @@ def exec_cmd(*popenargs, timeout: Optional[float] = None, **kwargs) -> int:
     return result.returncode
 
 
-def exec_cmd_with_output(*popenargs, timeout: Optional[float] = None, check: bool = True, **kwargs) -> str:
+def exec_cmd_with_output(
+    *popenargs, timeout: Optional[float] = None, check: bool = True, **kwargs
+) -> str:
     """Execute command and return output as string.
 
     Args: