
Commit 6cf1c3f

[TRTLLM-8260][feat] Add Server-Client Perf Test in pytest for B200 and B300 (NVIDIA#7985)
Signed-off-by: Chenfei Zhang <chenfeiz@nvidia.com>
1 parent 50149ac commit 6cf1c3f

File tree: 12 files changed (+1786, -1606 lines)


jenkins/L0_Test.groovy

Lines changed: 3 additions & 0 deletions
@@ -2655,6 +2655,9 @@ def launchTestJobs(pipeline, testFilter)
         "DGX_B200-8_GPUs-PyTorch-1": ["b200-x8", "l0_dgx_b200", 1, 1, 8],
         "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 1, 4, 1, true],
         "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4],
+        // Perf sanity post merge test
+        "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "perf_sanity_l0_dgx_b200", 1, 1, 4],
+        "DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "perf_sanity_l0_dgx_b300", 1, 1, 4],
     ]
     fullSet += x86SlurmTestConfigs.keySet()
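Note: the five-element entries above seem to follow the same shape as the neighboring x86SlurmTestConfigs entries. A hedged reading in Python (field meanings inferred from context, not stated in this hunk; the authoritative interpretation is whatever L0_Test.groovy does with them):

```python
from typing import NamedTuple

class StageConfig(NamedTuple):
    platform: str     # Jenkins agent label, e.g. "b200-x4"
    test_list: str    # test-db list name, e.g. "perf_sanity_l0_dgx_b200"
    split_id: int     # which shard of the list this stage runs
    split_count: int  # total number of shards
    gpu_count: int    # GPUs requested for the stage

cfg = StageConfig("b200-x4", "perf_sanity_l0_dgx_b200", 1, 1, 4)
```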

tests/integration/defs/perf/test_perf.py

Lines changed: 501 additions & 15 deletions
Large diffs are not rendered by default.

tests/integration/defs/perf/utils.py

Lines changed: 58 additions & 0 deletions
@@ -90,9 +90,20 @@ class PerfMetricType(str, Enum):
     set up special threshold criteria for each type of metrics (like >50MB for engine size increase, etc.).
     """
     INFERENCE_TIME = "INFERENCE_TIME"
+    MEDIAN_INFERENCE_TIME = "MEDIAN_INFERENCE_TIME"
+    P99_INFERENCE_TIME = "P99_INFERENCE_TIME"
+    INTER_TOKEN_TIME = "INTER_TOKEN_TIME"
+    MEDIAN_INTER_TOKEN_TIME = "MEDIAN_INTER_TOKEN_TIME"
+    P99_INTER_TOKEN_TIME = "P99_INTER_TOKEN_TIME"
     FIRST_TOKEN_TIME = "FIRST_TOKEN_TIME"
+    MEDIAN_FIRST_TOKEN_TIME = "MEDIAN_FIRST_TOKEN_TIME"
+    P99_FIRST_TOKEN_TIME = "P99_FIRST_TOKEN_TIME"
     OUTPUT_TOKEN_TIME = "OUTPUT_TOKEN_TIME"
+    MEDIAN_OUTPUT_TOKEN_TIME = "MEDIAN_OUTPUT_TOKEN_TIME"
+    P99_OUTPUT_TOKEN_TIME = "P99_OUTPUT_TOKEN_TIME"
     TOKEN_THROUGHPUT = "TOKEN_THROUGHPUT"
+    TOTAL_TOKEN_THROUGHPUT = "TOTAL_TOKEN_THROUGHPUT"
+    USER_THROUGHPUT = "USER_THROUGHPUT"
     BUILD_TIME = "BUILD_TIME"
     BUILD_PEAK_CPU_MEMORY = "BUILD_PEAK_CPU_MEMORY"
     BUILD_PEAK_GPU_MEMORY = "BUILD_PEAK_GPU_MEMORY"
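The new MEDIAN_*/P99_* members name percentile reductions over per-request measurements. A rough illustration with invented numbers, assuming numpy is available (the real aggregation lives in test_perf.py, whose diff is not rendered above):

```python
import numpy as np

# Invented per-request first-token latencies, in milliseconds.
ttft_ms = np.array([12.1, 11.8, 13.4, 55.2, 12.0, 12.3])

median_ttft = np.median(ttft_ms)       # would feed MEDIAN_FIRST_TOKEN_TIME
p99_ttft = np.percentile(ttft_ms, 99)  # would feed P99_FIRST_TOKEN_TIME
```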
@@ -311,6 +322,53 @@ def get_cmd_str(self, cmd_idx) -> List[str]:
         return cmd_str


+class PerfServerClientBenchmarkCmds(NamedTuple):
+    server_cmds: List[str]
+    client_cmds: List[List[str]]
+    names: List[str]
+    working_dir: str
+
+    def wait_for_endpoint_ready(self, url: str, timeout: int = 5400):
+        start = time.monotonic()
+        while time.monotonic() - start < timeout:
+            try:
+                time.sleep(10)
+                if requests.get(url).status_code == 200:
+                    print(f"endpoint {url} is ready")
+                    return
+            except Exception as err:
+                print(f"endpoint {url} is not ready, with exception: {err}")
+        print_error(
+            f"Endpoint {url} did not become ready within {timeout} seconds")
+
+    def run_cmd(self, cmd_idx: int, venv) -> str:
+        output = ""
+        server_file_path = os.path.join(
+            self.working_dir, f"trtllm-serve.{self.names[cmd_idx]}.log")
+        client_file_path = os.path.join(
+            self.working_dir, f"trtllm-benchmark.{self.names[cmd_idx]}.log")
+        try:
+            with (  # Start server process
+                    open(server_file_path, 'w') as server_ctx,
+                    popen(self.server_cmds[cmd_idx],
+                          stdout=server_ctx,
+                          stderr=subprocess.STDOUT,
+                          env=venv._new_env,
+                          shell=True) as server_proc):
+                self.wait_for_endpoint_ready(
+                    "http://localhost:8000/v1/models",
+                    timeout=5400)  # 90 minutes for large models
+                output += subprocess.check_output(self.client_cmds[cmd_idx],
+                                                  env=venv._new_env).decode()
+        finally:
+            server_proc.terminate()
+            server_proc.wait()
+        return output
+
+    def get_cmd_str(self, cmd_idx) -> List[str]:
+        return ["server-benchmark tests, please check config files"]
+
+
 class PerfDisaggScriptTestCmds(NamedTuple):
     ctx_cmd: str
     gen_cmd: str
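A minimal usage sketch for the new NamedTuple; the server command, client argv, names, paths, and the _FakeVenv stand-in are all hypothetical illustrations, not taken from the commit:

```python
import os

class _FakeVenv:
    # run_cmd only reads venv._new_env, so a plain environment dict suffices.
    _new_env = dict(os.environ)

cmds = PerfServerClientBenchmarkCmds(
    # One shell string per test case, launched with shell=True.
    server_cmds=["trtllm-serve /models/deepseek_r1_0528_fp4 --tp_size 4"],
    # One argv list per test case, run via subprocess.check_output.
    client_cmds=[["python", "benchmark_client.py", "--concurrency", "1"]],
    names=["r1_fp4_dep4.con1_iter1_1024_1024"],  # embedded in the log file names
    working_dir="/tmp/perf_logs",
)

# Starts the server, polls http://localhost:8000/v1/models until it returns
# HTTP 200 (or the timeout lapses), runs the client, then tears the server down.
print(cmds.run_cmd(0, _FakeVenv()))
```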
(New file; path not rendered in this capture. The YAML below defines the perf_sanity_l0_dgx_b200 test list.)

Lines changed: 37 additions & 0 deletions

@@ -0,0 +1,37 @@
+version: 0.0.1
+perf_sanity_l0_dgx_b200:
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*b200*'
+      linux_distribution_name: ubuntu*
+      cpu: x86_64
+    terms:
+      stage: pre_merge
+      backend: pytorch
+      orchestrator: mpi
+  tests:
+  - perf/test_perf.py::test_perf[perf_sanity-l0_dgx_b200-r1_fp4_dep4:con1_iter1_1024_1024]
+  - perf/test_perf.py::test_perf[perf_sanity-l0_dgx_b300-r1_fp4_dep4:con1_iter1_1024_1024]
+
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*b200*'
+      linux_distribution_name: ubuntu*
+      cpu: x86_64
+    terms:
+      stage: post_merge
+      backend: pytorch
+      orchestrator: mpi
+  tests:
+  - perf/test_perf.py::test_perf[perf_sanity-l0_dgx_b200-r1_fp4_dep4:con1_iter1_1024_1024]
+  - perf/test_perf.py::test_perf[perf_sanity-l0_dgx_b300-r1_fp4_dep4:con1_iter1_1024_1024]
(New file; path not rendered in this capture. The YAML below defines the perf_sanity_l0_dgx_b300 test list.)

Lines changed: 37 additions & 0 deletions

@@ -0,0 +1,37 @@
+version: 0.0.1
+perf_sanity_l0_dgx_b300:
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*gb110*'
+      linux_distribution_name: ubuntu*
+      cpu: x86_64
+    terms:
+      stage: pre_merge
+      backend: pytorch
+      orchestrator: mpi
+  tests:
+  - perf/test_perf.py::test_perf[perf_sanity-l0_dgx_b200-r1_fp4_dep4:con1_iter1_1024_1024]
+  - perf/test_perf.py::test_perf[perf_sanity-l0_dgx_b300-r1_fp4_dep4:con1_iter1_1024_1024]
+
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*gb110*'
+      linux_distribution_name: ubuntu*
+      cpu: x86_64
+    terms:
+      stage: post_merge
+      backend: pytorch
+      orchestrator: mpi
+  tests:
+  - perf/test_perf.py::test_perf[perf_sanity-l0_dgx_b200-r1_fp4_dep4:con1_iter1_1024_1024]
+  - perf/test_perf.py::test_perf[perf_sanity-l0_dgx_b300-r1_fp4_dep4:con1_iter1_1024_1024]
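The bracketed test ids in both lists appear to encode `perf_sanity-<config>-<server config>:<client config>`, matching the server and client names documented in the README below. A hedged parsing sketch (the authoritative parsing lives in test_perf.py):

```python
# Inferred id layout, illustrative only.
test_id = "perf_sanity-l0_dgx_b200-r1_fp4_dep4:con1_iter1_1024_1024"
prefix, pair = test_id.rsplit("-", 1)       # "perf_sanity-l0_dgx_b200", "r1_fp4_dep4:con1_iter1_1024_1024"
server_name, client_name = pair.split(":")  # "r1_fp4_dep4", "con1_iter1_1024_1024"
```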

tests/scripts/perf-sanity/README.md

Lines changed: 76 additions & 80 deletions
@@ -6,7 +6,6 @@ Benchmarking scripts for TensorRT-LLM serving performance tests with configurati
 
 - Run performance benchmarks across multiple model configurations
 - Manage test cases through YAML configuration files
-- Generate comprehensive CSV reports with complete test case coverage
 - Support selective execution of specific test cases
 
 ## Scripts Overview
@@ -16,123 +15,120 @@ Benchmarking scripts for TensorRT-LLM serving performance tests with configurati
 
 **Structure**:
 ```yaml
-test_cases:
-  - id: 1
-    model: "70B-FP8"
-    gpus: 1
-    tp: 1
-    ep: 1
-    attn_backend: "TRTLLM"
-    moe_backend: ""
-    enable_attention_dp: false
-    free_gpu_mem_fraction: 0.9
-    max_batch_size: 512
-    isl: 1024
-    osl: 1024
-    max_num_tokens: 16384
+server_configs:
+  - name: "r1_fp4_dep4"
+    model_name: "deepseek_r1_0528_fp4"
+    tp: 4
+    ep: 4
+    pp: 1
+    attention_backend: "TRTLLM"
+    moe_backend: "CUTLASS"
+    moe_max_num_tokens: ""
+    enable_attention_dp: true
+    enable_chunked_prefill: false
+    max_num_tokens: 2176
+    disable_overlap_scheduler: false
+    kv_cache_dtype: "fp8"
+    enable_block_reuse: false
+    free_gpu_memory_fraction: 0.8
+    max_batch_size: 256
+    enable_padding: true
+    client_configs:
+      - name: "con1_iter1_1024_1024"
+        concurrency: 1
+        iterations: 1
+        isl: 1024
+        osl: 1024
+        random_range_ratio: 0.0
+      - name: "con8_iter1_1024_1024"
+        concurrency: 8
+        iterations: 1
+        isl: 1024
+        osl: 1024
+        random_range_ratio: 0.0
+
+  - name: "r1_fp4_tep4"
+    model_name: "deepseek_r1_0528_fp4"
+    tp: 4
+    ep: 4
+    pp: 1
+    attention_backend: "TRTLLM"
+    moe_backend: "CUTLASS"
     moe_max_num_tokens: ""
-  concurrency_iterations:
-    - [1, 10]
-    - [8, 10]
-    - [64, 5]
-    - [512, 2]
+    enable_attention_dp: false
+    enable_chunked_prefill: false
+    max_num_tokens: 2176
+    disable_overlap_scheduler: false
+    kv_cache_dtype: "fp8"
+    enable_block_reuse: false
+    free_gpu_memory_fraction: 0.8
+    max_batch_size: 256
+    enable_padding: true
+    client_configs:
+      - name: "con1_iter1_1024_1024"
+        concurrency: 1
+        iterations: 1
+        isl: 1024
+        osl: 1024
+        random_range_ratio: 0.0
+      - name: "con8_iter1_1024_1024"
+        concurrency: 8
+        iterations: 1
+        isl: 1024
+        osl: 1024
+        random_range_ratio: 0.0
 ```
 
-**Configuration Fields**:
-- `id`: Unique identifier for the test case
-- `model`: Model name (e.g., "70B-FP8", "Scout-FP4")
-- `gpus`: Number of GPUs to use
-- `tp`: Tensor parallelism size
-- `ep`: Expert parallelism size
-- `attn_backend`: Attention backend ("TRTLLM", "FLASHINFER")
-- `moe_backend`: MoE backend ("DEEPGEMM", "TRTLLM", "CUTLASS", "")
-- `enable_attention_dp`: Enable attention data parallelism
-- `free_gpu_mem_fraction`: GPU memory fraction to reserve
-- `max_batch_size`: Maximum batch size
-- `isl`: Input sequence length
-- `osl`: Output sequence length
-- `max_num_tokens`: Maximum number of tokens
-- `moe_max_num_tokens`: Maximum number of tokens for MoE
-- `concurrency_iterations`: List of [concurrency, iteration] pairs
-
-
 ### 2. `run_benchmark_serve.py` - Main Benchmark Runner
 **Purpose**: Executes performance benchmarks based on YAML configuration files.
 
 **Usage**:
 ```bash
-python run_benchmark_serve.py --output_folder <output_folder> --config_file <config_file> [--skip <skip_pattern>] [--select <select_pattern>]
+python run_benchmark_serve.py --log_folder <log_folder> --config_file <config_file> [--select <select_pattern>] [--timeout 5400]
 ```
 
 **Arguments**:
-- `--output_folder`: Directory to store benchmark results (required)
+- `--log_folder`: Directory to store benchmark logs (required)
 - `--config_file`: Path to YAML configuration file (required)
-- `--skip`: Skip pattern for specific test cases/concurrencies (optional, default: no skipping)
-- `--select`: Select pattern for specific test cases/concurrencies (optional, default: all test cases)
+- `--select`: Select pattern naming specific server and client configs (optional, default: all test cases)
+- `--timeout`: Timeout for server setup (optional, default: 3600 seconds)
 
 **Examples**:
 ```bash
-# Run all test cases
-python run_benchmark_serve.py --output_folder results --config_file benchmark_config.yaml --skip default --select default
-
-# Skip specific test cases
-python run_benchmark_serve.py --output_folder results --config_file benchmark_config.yaml --skip "2-1,4"
-
-# Run specific concurrencies from specific test cases
-python run_benchmark_serve.py --output_folder results --config_file benchmark_config.yaml --select "1,2-3"
+# Select specific server:client combinations
+python run_benchmark_serve.py --log_folder ./results --config_file benchmark_config.yaml --select "r1_fp4_dep4:con8_iter1_1024_1024,r1_fp4_tep4:con1_iter1_1024_1024"
 
 ```
 
-**Skip Pattern**:
-Format: `"test_case1,test_case2,test_case3"` or `"test_case1-concurrency1,test_case2-concurrency3"`
-- `"2,4"`: Skip test cases 2 and 4 entirely
-- `"2-1,4-2"`: Skip test case 2's 1st concurrency and test case 4's 2nd concurrency
-- `"default"` or empty: No skipping (default)
-
-**Select Pattern**:
-Format: `"test_case1,test_case2,test_case3"` or `"test_case1-concurrency1,test_case2-concurrency3"`
-- `"1,3,5"`: Run only test cases 1, 3, and 5 (all concurrencies)
-- `"1-1,2-3"`: Run test case 1's 1st concurrency and test case 2's 3rd concurrency
-- `"default"` or empty: Run all test cases (default)
-
-
 ### 3. `parse_benchmark_results.py` - Results Parser
-**Purpose**: Parses benchmark log files and generates comprehensive CSV reports with all test cases from the configuration file.
-
-**Usage**:
-```bash
-python parse_benchmark_results.py --input_folder <input_folder> --output_csv <output_csv> --config_file <config_file>
-```
+**Purpose**: Parses benchmark logs and prints the measured performance.
 
 **Arguments**:
-- `input_folder`: Folder containing benchmark log files (serve.*.log) (required)
-- `output_csv`: Output CSV filename for the results table (required)
-- `config_file`: Path to benchmark_config.yaml file (required)
+- `--log_folder`: Directory containing benchmark logs (required)
 
-**Examples**:
+**Usage**:
 ```bash
-python parse_benchmark_results.py --config_file ./benchmark_logs --output_csv results.csv --input_folder ./benchmark_config.yaml
-
+python parse_benchmark_results.py --log_folder <log_folder>
 ```
 
+
 ### 4. `benchmark-serve.sh` - SLURM Job Script
 **Usage**:
 ```bash
-sbatch benchmark-serve.sh [IMAGE] [bench_dir] [output_dir] [select_pattern] [skip_pattern]
+sbatch benchmark-serve.sh [IMAGE] [bench_dir] [log_folder] [select_pattern]
 ```
 
 **Parameters**:
 - `IMAGE`: Docker image (default: tensorrt-llm-staging/release:main-x86_64)
 - `bench_dir`: Directory containing config file and benchmark scripts (default: current directory)
-- `output_dir`: Directory containing output logs and csv. (default: current directory)
+- `log_folder`: Directory for output logs and CSV files (default: current directory)
 - `select_pattern`: Select pattern (default: default - all test cases)
-- `skip_pattern`: Skip pattern (default: default - no skipping)
 
 **Examples**:
 ```bash
 
 bench_dir="/path/to/benchmark/scripts"
-output_dir="/path/to/store/output/files"
-sbatch --reservation=RES--COM-3970 --qos=reservation -D ${output_dir} ${bench_dir}/benchmark-serve.sh urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release:main-x86_64 ${bench_dir} ${output_dir} "1-1" ""
+log_folder="/path/to/store/output/files"
+sbatch --reservation=RES--COM-3970 --qos=reservation -D ${log_folder} ${bench_dir}/benchmark-serve.sh urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release:main-x86_64 ${bench_dir} ${log_folder} "r1_fp4_dep4:con8_iter1_1024_1024,r1_fp4_tep4:con1_iter1_1024_1024"
 
 ```
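To make the new layout concrete, here is a minimal sketch of pairing `server_configs`/`client_configs` with a `--select` value. This is not the real run_benchmark_serve.py; it assumes PyYAML and a hypothetical benchmark_config.yaml in the working directory:

```python
import yaml  # PyYAML, assumed available

with open("benchmark_config.yaml") as f:
    config = yaml.safe_load(f)

# --select pairs "server:client" names, comma separated.
select = "r1_fp4_dep4:con8_iter1_1024_1024,r1_fp4_tep4:con1_iter1_1024_1024"
wanted = {tuple(item.split(":")) for item in select.split(",")}

for server in config["server_configs"]:
    for client in server["client_configs"]:
        if (server["name"], client["name"]) in wanted:
            print(f"run {server['name']} with {client['name']} "
                  f"(concurrency={client['concurrency']}, "
                  f"isl={client['isl']}, osl={client['osl']})")
```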
