diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 20b9a2a6ab1..e15777c5fef 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -2689,6 +2689,7 @@ def launchTestJobs(pipeline, testFilter)
         "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4],
         // Perf sanity post merge test
         "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "perf_sanity_l0_dgx_b200", 1, 1, 4],
+        "DGX_B200-8_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x8", "perf_sanity_l0_dgx_b200", 1, 1, 8],
         "DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "perf_sanity_l0_dgx_b300", 1, 1, 4],
     ]
     fullSet += x86SlurmTestConfigs.keySet()
diff --git a/tests/integration/defs/perf/open_search_db_utils.py b/tests/integration/defs/perf/open_search_db_utils.py
index 9f9ebda1693..552e55b8f47 100644
--- a/tests/integration/defs/perf/open_search_db_utils.py
+++ b/tests/integration/defs/perf/open_search_db_utils.py
@@ -20,6 +20,7 @@
 import re
 import sys
 import time
+from datetime import datetime

 from defs.trt_test_alternative import print_info

@@ -40,11 +41,13 @@
     "l_ep",
     "l_pp",
     "l_max_num_tokens",
+    "l_cuda_graph_max_batch_size",
     "b_enable_chunked_prefill",
     "b_disable_overlap_scheduler",
     "s_attention_backend",
     "s_moe_backend",
     "l_moe_max_num_tokens",
+    "l_num_postprocess_workers",
     "l_stream_interval",
     "b_enable_attention_dp",
     "b_attention_dp_balance",
@@ -55,6 +58,11 @@
     "d_free_gpu_memory_fraction",
     "l_max_batch_size",
     "b_enable_padding",
+    "s_spec_decoding_type",
+    "l_num_nextn_predict_layers",
+    "l_eagle3_layers_to_capture",
+    "l_max_draft_len",
+    "s_speculative_model_dir",
 ]

 # Client config fields to compare
@@ -64,6 +72,8 @@
     "l_isl",
     "l_osl",
     "d_random_range_ratio",
+    "s_backend",
+    "b_use_chat_template",
 ]

 # Metrics where larger is better
@@ -189,7 +199,7 @@ def get_job_info():
     }


-def query_history_data():
+def query_history_data(gpu_type):
     """
     Query post-merge data with specific gpu type and model name
     """
@@ -209,6 +219,16 @@ def query_history_data():
                     "b_is_post_merge": True
                 }
             },
+            {
+                "term": {
+                    "b_is_regression": False
+                }
+            },
+            {
+                "term": {
+                    "s_gpu_type": gpu_type
+                }
+            },
             {
                 "range": {
                     "ts_created": {
@@ -339,27 +359,44 @@ def calculate_best_perf_result(history_data_list, new_data):
     return best_metrics


-def get_history_data(new_data_dict):
+def get_history_data(new_data_dict, gpu_type):
     """
     Query history post-merge data for each cmd_idx
     """
+
+    def get_latest_data(data_list):
+        if not data_list:
+            return None
+        time_format = "%b %d, %Y @ %H:%M:%S.%f"
+        # Find the item with the maximum ts_created value
+        latest_data = max(
+            data_list,
+            key=lambda x: datetime.strptime(x["ts_created"], time_format))
+        return latest_data
+
     history_baseline_dict = {}
     history_data_dict = {}
     cmd_idxs = new_data_dict.keys()
     for cmd_idx in cmd_idxs:
         history_data_dict[cmd_idx] = []
-        history_baseline_dict[cmd_idx] = None
-    history_data_list = query_history_data()
+        history_baseline_dict[cmd_idx] = []
+    history_data_list = []
+    if cmd_idxs:
+        history_data_list = query_history_data(gpu_type)
     if history_data_list:
         for history_data in history_data_list:
             for cmd_idx in cmd_idxs:
                 if match(history_data, new_data_dict[cmd_idx]):
                     if history_data.get("b_is_baseline") and history_data.get(
                             "b_is_baseline") == True:
-                        history_baseline_dict[cmd_idx] = history_data
+                        history_baseline_dict[cmd_idx].append(history_data)
                     else:
                         history_data_dict[cmd_idx].append(history_data)
                     break
+    # Sometimes the database has several baselines; only use the latest one
+    for cmd_idx, baseline_list in history_baseline_dict.items():
+        latest_baseline = get_latest_data(baseline_list)
+        history_baseline_dict[cmd_idx] = latest_baseline
     return history_baseline_dict, history_data_dict


@@ -477,6 +514,8 @@ def post_new_perf_data(new_baseline_data_dict, new_data_dict,
     # Only post regressive test cases when post-merge.
     if new_baseline_data_dict:
         data_list.extend(regressive_data_list)
+    if not data_list:
+        return
     try:
         print_info(
             f"Ready to post {len(data_list)} data to {TEST_INFO_PROJECT_NAME}")
diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py
index 081b9fb6b67..57274c9fd73 100644
--- a/tests/integration/defs/perf/test_perf.py
+++ b/tests/integration/defs/perf/test_perf.py
@@ -103,6 +103,7 @@
     "deepseek_r1_nvfp4": "DeepSeek-R1/DeepSeek-R1-FP4",
     "deepseek_r1_0528_fp8": "DeepSeek-R1/DeepSeek-R1-0528/",
     "deepseek_r1_0528_fp4": "DeepSeek-R1/DeepSeek-R1-0528-FP4/",
+    "deepseek_r1_0528_fp4_v2": "DeepSeek-R1/DeepSeek-R1-0528-FP4-v2/",
     "deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8",
     "deepseek_v3_lite_nvfp4": "DeepSeek-V3-Lite/nvfp4_moe_only",
     "qwen2_7b_instruct": "Qwen2-7B-Instruct",
@@ -513,11 +514,13 @@ def __init__(
         max_num_tokens: int,
         attention_backend: str,
         max_batch_size: int,
+        cuda_graph_max_batch_size: int = 0,
         pp: int = 1,
         enable_chunked_prefill: bool = False,
         disable_overlap_scheduler: bool = False,
         moe_backend: str = "",
         moe_max_num_tokens: int = 0,
+        num_postprocess_workers: int = 0,
         stream_interval: int = 10,
         enable_attention_dp: bool = False,
         attention_dp_balance: bool = False,
@@ -527,6 +530,11 @@ def __init__(
         enable_block_reuse: bool = False,
         free_gpu_memory_fraction: float = 0.8,
         enable_padding: bool = True,
+        spec_decoding_type: str = "",
+        num_nextn_predict_layers: int = 0,
+        eagle3_layers_to_capture: int = 0,
+        max_draft_len: int = 0,
+        speculative_model_dir: str = "",
     ):
         self.name = name
         self.model_name = model_name
@@ -540,6 +548,7 @@ def __init__(
         self.attention_backend = attention_backend
         self.moe_backend = moe_backend
         self.moe_max_num_tokens = moe_max_num_tokens
+        self.num_postprocess_workers = num_postprocess_workers
         self.stream_interval = stream_interval
         self.enable_attention_dp = enable_attention_dp
         self.attention_dp_balance = attention_dp_balance
@@ -549,7 +558,13 @@ def __init__(
         self.enable_block_reuse = enable_block_reuse
         self.free_gpu_memory_fraction = free_gpu_memory_fraction
         self.max_batch_size = max_batch_size
+        self.cuda_graph_max_batch_size = max_batch_size if cuda_graph_max_batch_size == 0 else cuda_graph_max_batch_size
         self.enable_padding = enable_padding
+        self.spec_decoding_type = spec_decoding_type
+        self.num_nextn_predict_layers = num_nextn_predict_layers
+        self.eagle3_layers_to_capture = eagle3_layers_to_capture
+        self.max_draft_len = max_draft_len
+        self.speculative_model_dir = speculative_model_dir

         self.model_path = ""

@@ -567,7 +582,7 @@ def to_cmd(self, working_dir: str) -> List[str]:

     def to_db_data(self) -> dict:
         """Convert ServerConfig to Database data"""
-        return {
+        db_data = {
             "s_model_name": self.model_name.lower(),
             "l_gpus": self.gpus,
             "l_tp": self.tp,
@@ -588,9 +603,30 @@ def to_db_data(self) -> dict:
             "b_enable_block_reuse": self.enable_block_reuse,
             "d_free_gpu_memory_fraction": self.free_gpu_memory_fraction,
             "l_max_batch_size": self.max_batch_size,
+            "l_cuda_graph_max_batch_size": self.cuda_graph_max_batch_size,
             "b_enable_padding": self.enable_padding,
+            "s_spec_decoding_type": self.spec_decoding_type,
+            "l_num_nextn_predict_layers": self.num_nextn_predict_layers,
+            "l_eagle3_layers_to_capture": self.eagle3_layers_to_capture,
+            "l_max_draft_len": self.max_draft_len,
+            "s_speculative_model_dir": self.speculative_model_dir,
             "s_server_log_link": "",
         }
+        if self.num_postprocess_workers > 0:
+            db_data["l_num_postprocess_workers"] = self.num_postprocess_workers
+        if self.spec_decoding_type:
+            db_data["s_spec_decoding_type"] = self.spec_decoding_type
+        if self.num_nextn_predict_layers > 0:
+            db_data[
+                "l_num_nextn_predict_layers"] = self.num_nextn_predict_layers
+        if self.eagle3_layers_to_capture > 0:
+            db_data[
+                "l_eagle3_layers_to_capture"] = self.eagle3_layers_to_capture
+        if self.max_draft_len > 0:
+            db_data["l_max_draft_len"] = self.max_draft_len
+        if self.speculative_model_dir:
+            db_data["s_speculative_model_dir"] = self.speculative_model_dir
+        return db_data

     def generate_extra_llm_api_config(self) -> str:
         """Generate extra-llm-api-config.yml content"""
@@ -599,14 +635,14 @@ def generate_extra_llm_api_config(self) -> str:
             f"moe_expert_parallel_size: {self.ep}",
             f"pipeline_parallel_size: {self.pp}",
             f"max_num_tokens: {self.max_num_tokens}",
+            f"max_batch_size: {self.max_batch_size}",
             f"enable_attention_dp: {str(self.enable_attention_dp).lower()}",
             f"disable_overlap_scheduler: {str(self.disable_overlap_scheduler).lower()}",
-            f"stream_interval: {self.stream_interval}",
             f"attn_backend: {self.attention_backend}",
             f"enable_chunked_prefill: {str(self.enable_chunked_prefill).lower()}",
             "cuda_graph_config:",
             f"  enable_padding: {str(self.enable_padding).lower()}",
-            f"  max_batch_size: {self.max_batch_size}",
+            f"  max_batch_size: {self.cuda_graph_max_batch_size}",
             "kv_cache_config:",
             f"  dtype: {self.kv_cache_dtype}",
             f"  free_gpu_memory_fraction: {self.free_gpu_memory_fraction}",
@@ -614,6 +650,13 @@ def generate_extra_llm_api_config(self) -> str:
             "print_iter_log: false",
         ]

+        if self.stream_interval > 0:
+            config_lines.append(f"stream_interval: {self.stream_interval}")
+
+        if self.num_postprocess_workers > 0:
+            config_lines.append(
+                f"num_postprocess_workers: {self.num_postprocess_workers}")
+
         # Add moe_config if moe_backend is specified
         if self.moe_backend:
             config_lines.append("moe_config:")
@@ -629,6 +672,25 @@ def generate_extra_llm_api_config(self) -> str:
                 f"  batching_wait_iters: {self.batching_wait_iters}")
             config_lines.append(f"  timeout_iters: {self.timeout_iters}")

+        if self.spec_decoding_type:
+            config_lines.append("speculative_config:")
+            config_lines.append(f"  decoding_type: {self.spec_decoding_type}")
+            if self.num_nextn_predict_layers > 0:
+                config_lines.append(
+                    f"  num_nextn_predict_layers: {self.num_nextn_predict_layers}"
+                )
+            if self.eagle3_layers_to_capture > 0:
+                config_lines.append(
+                    f"  eagle3_layers_to_capture: {self.eagle3_layers_to_capture}"
+                )
+            if self.max_draft_len > 0:
+                config_lines.append(f"  max_draft_len: {self.max_draft_len}")
+            if self.speculative_model_dir:
+                spec_model_dir = os.path.join(llm_models_root(),
+                                              self.speculative_model_dir)
+                config_lines.append(
+                    f"  speculative_model_dir: {spec_model_dir}")
+
         return "\n".join(config_lines)


@@ -644,7 +706,9 @@ def __init__(self,
                  iterations: int,
                  isl: int,
                  osl: int,
-                 random_range_ratio: float = 0.0):
+                 random_range_ratio: float = 0.0,
+                 backend: str = "",
+                 use_chat_template: bool = False):
         self.name = name
         self.model_name = model_name
         self.concurrency = concurrency
@@ -652,6 +716,8 @@ def __init__(self,
         self.isl = isl
         self.osl = osl
         self.random_range_ratio = random_range_ratio
+        self.backend = backend
+        self.use_chat_template = use_chat_template

         self.model_path = ""

@@ -659,7 +725,7 @@ def to_cmd(self, working_dir: str) -> List[str]:
         model_dir = get_model_dir(self.model_name)
         self.model_path = model_dir if os.path.exists(
             model_dir) else self.model_name
-        return [
+        benchmark_cmd = [
             "python", "-m", "tensorrt_llm.serve.scripts.benchmark_serving",
             "--model", self.model_path, "--dataset-name", "random",
             "--random-ids", "--num-prompts",
@@ -670,17 +736,30 @@ def to_cmd(self, working_dir: str) -> List[str]:
             "--percentile-metrics", "ttft,tpot,itl,e2el",
             "--max-concurrency", str(self.concurrency)
         ]
+        if self.backend:
+            benchmark_cmd.append("--backend")
+            benchmark_cmd.append(self.backend)
+        if self.use_chat_template:
+            benchmark_cmd.append("--use-chat-template")
+        return benchmark_cmd

     def to_db_data(self) -> dict:
         """Convert ClientConfig to Database data"""
-        return {
+        db_data = {
             "l_concurrency": self.concurrency,
             "l_iterations": self.iterations,
             "l_isl": self.isl,
             "l_osl": self.osl,
             "d_random_range_ratio": self.random_range_ratio,
+            "s_backend": self.backend,
+            "b_use_chat_template": self.use_chat_template,
             "s_client_log_link": "",
         }
+        if self.backend:
+            db_data["s_backend"] = self.backend
+        if self.use_chat_template:
+            db_data["b_use_chat_template"] = self.use_chat_template
+        return db_data


 def parse_select_pattern(select_pattern: str):
@@ -785,7 +864,18 @@ def parse_config_file(config_file_path: str, select_pattern: str = None):
             free_gpu_memory_fraction=server_config_data.get(
                 'free_gpu_memory_fraction', 0.8),
             max_batch_size=server_config_data.get('max_batch_size', 256),
-            enable_padding=server_config_data.get('enable_padding', True))
+            cuda_graph_max_batch_size=server_config_data.get(
+                'cuda_graph_max_batch_size', 0),
+            enable_padding=server_config_data.get('enable_padding', True),
+            spec_decoding_type=server_config_data.get('spec_decoding_type', ""),
+            num_nextn_predict_layers=server_config_data.get(
+                'num_nextn_predict_layers', 0),
+            eagle3_layers_to_capture=server_config_data.get(
+                'eagle3_layers_to_capture', 0),
+            max_draft_len=server_config_data.get('max_draft_len', 0),
+            speculative_model_dir=server_config_data.get(
+                'speculative_model_dir', ""),
+        )
         server_id = len(server_configs)
         server_configs.append(server_config)

@@ -812,7 +902,11 @@ def parse_config_file(config_file_path: str, select_pattern: str = None):
                 isl=client_config_data.get('isl', 1024),
                 osl=client_config_data.get('osl', 1024),
                 random_range_ratio=client_config_data.get(
-                    'random_range_ratio', 0.0))
+                    'random_range_ratio', 0.0),
+                backend=client_config_data.get('backend', ""),
+                use_chat_template=client_config_data.get(
+                    'use_chat_template', False),
+            )
             client_configs.append(client_config)

         server_client_configs[server_id] = client_configs
@@ -2114,7 +2208,7 @@ def upload_test_results_to_database(self):

         # Get history data for each cmd_idx
         history_baseline_dict, history_data_dict = get_history_data(
-            new_data_dict)
+            new_data_dict, self._config.gpu_type)
         # Prepare regressive test cases
         regressive_data_list = prepare_regressive_test_cases(
             history_baseline_dict, new_data_dict)
diff --git a/tests/integration/test_lists/test-db/perf_sanity_l0_dgx_b200.yml b/tests/integration/test_lists/test-db/perf_sanity_l0_dgx_b200.yml
index 3fdd60670f9..b3bf3662f2c 100644
--- a/tests/integration/test_lists/test-db/perf_sanity_l0_dgx_b200.yml
+++ b/tests/integration/test_lists/test-db/perf_sanity_l0_dgx_b200.yml
@@ -1,5 +1,39 @@
 version: 0.0.1
 perf_sanity_l0_dgx_b200:
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 8
+        lte: 8
+    wildcards:
+      gpu:
+      - '*b200*'
+      linux_distribution_name: ubuntu*
+      cpu: x86_64
+  terms:
+    stage: pre_merge
+    backend: pytorch
+    orchestrator: mpi
+  tests:
+  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_dep8_mtp1,r1_fp8_tep8_mtp3,gpt_oss_fp4_tp8_eagle3]
+
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 8
+        lte: 8
+    wildcards:
+      gpu:
+      - '*b200*'
+      linux_distribution_name: ubuntu*
+      cpu: x86_64
+  terms:
+    stage: post_merge
+    backend: pytorch
+    orchestrator: mpi
+  tests:
+  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_dep8_mtp1,r1_fp8_tep8_mtp3,gpt_oss_fp4_tp8_eagle3]
+
 - condition:
     ranges:
       system_gpu_count:
@@ -15,7 +49,7 @@ perf_sanity_l0_dgx_b200:
     backend: pytorch
     orchestrator: mpi
   tests:
-  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200]
+  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_dep4_mtp1,r1_fp4_v2_tep4_mtp3,gpt_oss_fp4_dep2,gpt_oss_fp4_dep4]
 - condition:
     ranges:
       system_gpu_count:
@@ -32,4 +66,4 @@ perf_sanity_l0_dgx_b200:
     backend: pytorch
     orchestrator: mpi
   tests:
-  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200]
+  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_dep4_mtp1,r1_fp4_v2_tep4_mtp3,gpt_oss_fp4_dep2,gpt_oss_fp4_dep4]
diff --git a/tests/scripts/perf-sanity/l0_dgx_b200.yaml b/tests/scripts/perf-sanity/l0_dgx_b200.yaml
index d8fccb78ef1..4e75f2f769a 100644
--- a/tests/scripts/perf-sanity/l0_dgx_b200.yaml
+++ b/tests/scripts/perf-sanity/l0_dgx_b200.yaml
@@ -1,6 +1,61 @@
 server_configs:
-  - name: "r1_fp4_dep4"
-    model_name: "deepseek_r1_0528_fp4"
+  - name: "r1_fp8_dep8_mtp1"
+    model_name: "deepseek_r1_0528_fp8"
+    gpus: 8
+    tp: 8
+    ep: 8
+    pp: 1
+    attention_backend: "TRTLLM"
+    moe_backend: "DEEPGEMM"
+    enable_attention_dp: true
+    batching_wait_iter: 0
+    enable_balance: true
+    timeout_iters: 60
+    max_batch_size: 512
+    max_num_tokens: 2112
+    kv_cache_dtype: "fp8"
+    free_gpu_memory_fraction: 0.8
+    cuda_graph_max_batch_size: 512
+    enable_padding: true
+    spec_decoding_type: "MTP"
+    num_nextn_predict_layers: 1
+    client_configs:
+      - name: "con1024_iter10_1k1k"
+        concurrency: 1024
+        iterations: 10
+        isl: 1024
+        osl: 1024
+        random_range_ratio: 0.8
+        backend: "openai"
+
+  - name: "r1_fp8_tep8_mtp3"
+    model_name: "deepseek_r1_0528_fp8"
+    gpus: 8
+    tp: 8
+    ep: 8
+    pp: 1
+    attention_backend: "TRTLLM"
+    moe_backend: "TRTLLM"
+    enable_attention_dp: false
+    max_batch_size: 32
+    max_num_tokens: 3136
+    kv_cache_dtype: "fp8"
+    free_gpu_memory_fraction: 0.8
+    cuda_graph_max_batch_size: 32
+    enable_padding: true
+    spec_decoding_type: "MTP"
+    num_nextn_predict_layers: 3
+    client_configs:
+      - name: "con32_iter10_1k1k"
+        concurrency: 32
+        iterations: 10
+        isl: 1024
+        osl: 1024
+        random_range_ratio: 0.8
+        backend: "openai"
+
+  - name: "r1_fp4_v2_dep4_mtp1"
+    model_name: "deepseek_r1_0528_fp4_v2"
     gpus: 4
     tp: 4
     ep: 4
@@ -8,51 +63,132 @@ server_configs:
     attention_backend: "TRTLLM"
     moe_backend: "CUTLASS"
     enable_attention_dp: true
-    enable_chunked_prefill: false
-    max_num_tokens: 2176
+    batching_wait_iter: 0
+    enable_balance: true
+    timeout_iters: 60
+    max_batch_size: 256
+    max_num_tokens: 3072
     kv_cache_dtype: "fp8"
     free_gpu_memory_fraction: 0.8
-    max_batch_size: 256
+    cuda_graph_max_batch_size: 256
     enable_padding: true
+    spec_decoding_type: "MTP"
+    num_nextn_predict_layers: 1
     client_configs:
-      - name: "con1_iter1_1024_1024"
-        concurrency: 1
-        iterations: 1
-        isl: 1024
-        osl: 1024
-        random_range_ratio: 0.0
-      - name: "con8_iter1_1024_1024"
-        concurrency: 8
-        iterations: 1
+      - name: "con1024_iter10_1k1k"
+        concurrency: 1024
+        iterations: 10
         isl: 1024
         osl: 1024
-        random_range_ratio: 0.0
+        random_range_ratio: 0.8
+        backend: "openai"

-  - name: "r1_fp4_tep4"
-    model_name: "deepseek_r1_0528_fp4"
+  - name: "r1_fp4_v2_tep4_mtp3"
+    model_name: "deepseek_r1_0528_fp4_v2"
     gpus: 4
     tp: 4
     ep: 4
     pp: 1
     attention_backend: "TRTLLM"
-    moe_backend: "CUTLASS"
+    moe_backend: "TRTLLM"
     enable_attention_dp: false
-    enable_chunked_prefill: false
-    max_num_tokens: 2176
+    max_batch_size: 32
+    max_num_tokens: 5248
     kv_cache_dtype: "fp8"
     free_gpu_memory_fraction: 0.8
-    max_batch_size: 256
+    cuda_graph_max_batch_size: 32
+    enable_padding: true
+    spec_decoding_type: "MTP"
+    num_nextn_predict_layers: 3
+    client_configs:
+      - name: "con32_iter10_1k1k"
+        concurrency: 32
+        iterations: 10
+        isl: 1024
+        osl: 1024
+        random_range_ratio: 0.8
+        backend: "openai"
+
+  - name: "gpt_oss_fp4_tp8_eagle3"
+    model_name: "gpt_oss_120b_fp4"
+    gpus: 8
+    tp: 8
+    ep: 1
+    pp: 1
+    attention_backend: "TRTLLM"
+    moe_backend: "TRTLLM"
+    enable_attention_dp: false
+    max_batch_size: 1
+    max_num_tokens: 20000
+    kv_cache_dtype: "fp8"
+    free_gpu_memory_fraction: 0.8
+    cuda_graph_max_batch_size: 1
     enable_padding: true
+    num_postprocess_workers: 4
+    stream_interval: 20
+    spec_decoding_type: "Eagle"
+    eagle3_layers_to_capture: 1
+    max_draft_len: 3
+    speculative_model_dir: "gpt_oss/gpt-oss-120b-Eagle3"
     client_configs:
-      - name: "con1_iter1_1024_1024"
+      - name: "con1_iter32_1k1k"
         concurrency: 1
-        iterations: 1
+        iterations: 32
         isl: 1024
         osl: 1024
-        random_range_ratio: 0.0
-      - name: "con8_iter1_1024_1024"
-        concurrency: 8
-        iterations: 1
+        random_range_ratio: 0.8
+        backend: "openai"
+
+  - name: "gpt_oss_fp4_dep2"
+    model_name: "gpt_oss_120b_fp4"
+    gpus: 2
+    tp: 2
+    ep: 2
+    pp: 1
+    attention_backend: "TRTLLM"
+    moe_backend: "TRTLLM"
+    enable_attention_dp: true
+    enable_balance: true
+    max_batch_size: 1024
+    max_num_tokens: 20000
+    kv_cache_dtype: "fp8"
+    free_gpu_memory_fraction: 0.8
+    cuda_graph_max_batch_size: 1024
+    enable_padding: true
+    num_postprocess_workers: 4
+    stream_interval: 20
+    client_configs:
+      - name: "con2048_iter5_1k1k"
+        concurrency: 2048
+        iterations: 5
+        isl: 1024
+        osl: 1024
+        random_range_ratio: 0.8
+        backend: "openai"
+
+  - name: "gpt_oss_fp4_dep4"
+    model_name: "gpt_oss_120b_fp4"
+    gpus: 4
+    tp: 4
+    ep: 4
+    pp: 1
+    attention_backend: "TRTLLM"
+    moe_backend: "TRTLLM"
+    enable_attention_dp: true
+    enable_balance: true
+    max_batch_size: 512
+    max_num_tokens: 20000
+    kv_cache_dtype: "fp8"
+    free_gpu_memory_fraction: 0.8
+    cuda_graph_max_batch_size: 512
+    enable_padding: true
+    num_postprocess_workers: 4
+    stream_interval: 20
+    client_configs:
+      - name: "con2048_iter5_1k1k"
+        concurrency: 2048
+        iterations: 5
         isl: 1024
         osl: 1024
-        random_range_ratio: 0.0
+        random_range_ratio: 0.8
+        backend: "openai"
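
For reviewers, a minimal standalone sketch (not part of the patch) of the latest-baseline selection that the new get_latest_data helper in open_search_db_utils.py performs: when several baseline documents match one cmd_idx, only the one with the newest ts_created is kept, parsed with the "%b %d, %Y @ %H:%M:%S.%f" format shown in the diff. The helper name pick_latest_baseline and the document values below are hypothetical illustration only; the field names ts_created and s_gpu_type come from the diff above.

    from datetime import datetime

    # Display-style timestamp format used by the ts_created field in the diff,
    # e.g. "Jan 02, 2025 @ 13:45:07.123".
    TIME_FORMAT = "%b %d, %Y @ %H:%M:%S.%f"

    def pick_latest_baseline(baselines):
        """Return the baseline document with the newest ts_created, or None."""
        if not baselines:
            return None
        return max(
            baselines,
            key=lambda doc: datetime.strptime(doc["ts_created"], TIME_FORMAT))

    # Hypothetical baseline documents collected for a single cmd_idx.
    docs = [
        {"ts_created": "Jan 02, 2025 @ 13:45:07.123", "s_gpu_type": "b200"},
        {"ts_created": "Feb 10, 2025 @ 08:00:00.000", "s_gpu_type": "b200"},
    ]
    print(pick_latest_baseline(docs)["ts_created"])  # Feb 10, 2025 @ 08:00:00.000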