103103 "deepseek_r1_nvfp4" : "DeepSeek-R1/DeepSeek-R1-FP4" ,
104104 "deepseek_r1_0528_fp8" : "DeepSeek-R1/DeepSeek-R1-0528/" ,
105105 "deepseek_r1_0528_fp4" : "DeepSeek-R1/DeepSeek-R1-0528-FP4/" ,
106+ "deepseek_r1_0528_fp4_v2" : "DeepSeek-R1/DeepSeek-R1-0528-FP4-v2/" ,
106107 "deepseek_v3_lite_fp8" : "DeepSeek-V3-Lite/fp8" ,
107108 "deepseek_v3_lite_nvfp4" : "DeepSeek-V3-Lite/nvfp4_moe_only" ,
108109 "qwen2_7b_instruct" : "Qwen2-7B-Instruct" ,
@@ -513,11 +514,13 @@ def __init__(
         max_num_tokens: int,
         attention_backend: str,
         max_batch_size: int,
+        cuda_graph_max_batch_size: int = 0,
         pp: int = 1,
         enable_chunked_prefill: bool = False,
         disable_overlap_scheduler: bool = False,
         moe_backend: str = "",
         moe_max_num_tokens: int = 0,
+        num_postprocess_workers: int = 0,
         stream_interval: int = 10,
         enable_attention_dp: bool = False,
         attention_dp_balance: bool = False,
@@ -527,6 +530,11 @@ def __init__(
         enable_block_reuse: bool = False,
         free_gpu_memory_fraction: float = 0.8,
         enable_padding: bool = True,
+        spec_decoding_type: str = "",
+        num_nextn_predict_layers: int = 0,
+        eagle3_layers_to_capture: int = 0,
+        max_draft_len: int = 0,
+        speculative_model_dir: str = "",
     ):
         self.name = name
         self.model_name = model_name
@@ -540,6 +548,7 @@ def __init__(
         self.attention_backend = attention_backend
         self.moe_backend = moe_backend
         self.moe_max_num_tokens = moe_max_num_tokens
+        self.num_postprocess_workers = num_postprocess_workers
         self.stream_interval = stream_interval
         self.enable_attention_dp = enable_attention_dp
         self.attention_dp_balance = attention_dp_balance
@@ -549,7 +558,13 @@ def __init__(
         self.enable_block_reuse = enable_block_reuse
         self.free_gpu_memory_fraction = free_gpu_memory_fraction
         self.max_batch_size = max_batch_size
+        self.cuda_graph_max_batch_size = max_batch_size if cuda_graph_max_batch_size == 0 else cuda_graph_max_batch_size
         self.enable_padding = enable_padding
+        self.spec_decoding_type = spec_decoding_type
+        self.num_nextn_predict_layers = num_nextn_predict_layers
+        self.eagle3_layers_to_capture = eagle3_layers_to_capture
+        self.max_draft_len = max_draft_len
+        self.speculative_model_dir = speculative_model_dir

         self.model_path = ""

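Note the defaulting rule introduced for `cuda_graph_max_batch_size`: 0 acts as a sentinel for "not set", in which case the CUDA graph batch-size cap follows `max_batch_size`. A minimal sketch of that rule in isolation:

```python
def resolve_cuda_graph_max_batch_size(max_batch_size: int,
                                      cuda_graph_max_batch_size: int = 0) -> int:
    # 0 means "not set": capture CUDA graphs up to the server's max batch size.
    # Any positive value caps graph capture independently of scheduling.
    return (max_batch_size
            if cuda_graph_max_batch_size == 0 else cuda_graph_max_batch_size)


assert resolve_cuda_graph_max_batch_size(256) == 256      # inherit the default
assert resolve_cuda_graph_max_batch_size(256, 64) == 64   # explicit cap
```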
@@ -567,7 +582,7 @@ def to_cmd(self, working_dir: str) -> List[str]:

     def to_db_data(self) -> dict:
         """Convert ServerConfig to Database data"""
-        return {
+        db_data = {
             "s_model_name": self.model_name.lower(),
             "l_gpus": self.gpus,
             "l_tp": self.tp,
@@ -588,9 +603,25 @@ def to_db_data(self) -> dict:
             "b_enable_block_reuse": self.enable_block_reuse,
             "d_free_gpu_memory_fraction": self.free_gpu_memory_fraction,
             "l_max_batch_size": self.max_batch_size,
+            "l_cuda_graph_max_batch_size": self.cuda_graph_max_batch_size,
             "b_enable_padding": self.enable_padding,
             "s_server_log_link": "",
         }
+        if self.num_postprocess_workers > 0:
+            db_data["l_num_postprocess_workers"] = self.num_postprocess_workers
+        if self.spec_decoding_type:
+            db_data["s_spec_decoding_type"] = self.spec_decoding_type
+        if self.num_nextn_predict_layers > 0:
+            db_data[
+                "l_num_nextn_predict_layers"] = self.num_nextn_predict_layers
+        if self.eagle3_layers_to_capture > 0:
+            db_data[
+                "l_eagle3_layers_to_capture"] = self.eagle3_layers_to_capture
+        if self.max_draft_len > 0:
+            db_data["l_max_draft_len"] = self.max_draft_len
+        if self.speculative_model_dir:
+            db_data["s_speculative_model_dir"] = self.speculative_model_dir
+        return db_data

     def generate_extra_llm_api_config(self) -> str:
         """Generate extra-llm-api-config.yml content"""
@@ -599,21 +630,28 @@ def generate_extra_llm_api_config(self) -> str:
             f"moe_expert_parallel_size: {self.ep}",
             f"pipeline_parallel_size: {self.pp}",
             f"max_num_tokens: {self.max_num_tokens}",
+            f"max_batch_size: {self.max_batch_size}",
             f"enable_attention_dp: {str(self.enable_attention_dp).lower()}",
             f"disable_overlap_scheduler: {str(self.disable_overlap_scheduler).lower()}",
-            f"stream_interval: {self.stream_interval}",
             f"attn_backend: {self.attention_backend}",
             f"enable_chunked_prefill: {str(self.enable_chunked_prefill).lower()}",
             "cuda_graph_config:",
             f"  enable_padding: {str(self.enable_padding).lower()}",
-            f"  max_batch_size: {self.max_batch_size}",
+            f"  max_batch_size: {self.cuda_graph_max_batch_size}",
             "kv_cache_config:",
             f"  dtype: {self.kv_cache_dtype}",
             f"  free_gpu_memory_fraction: {self.free_gpu_memory_fraction}",
             f"  enable_block_reuse: {str(self.enable_block_reuse).lower()}",
             "print_iter_log: false",
         ]

+        if self.stream_interval > 0:
+            config_lines.append(f"stream_interval: {self.stream_interval}")
+
+        if self.num_postprocess_workers > 0:
+            config_lines.append(
+                f"num_postprocess_workers: {self.num_postprocess_workers}")
+
         # Add moe_config if moe_backend is specified
         if self.moe_backend:
             config_lines.append("moe_config:")
@@ -629,6 +667,25 @@ def generate_extra_llm_api_config(self) -> str:
                 f"  batching_wait_iters: {self.batching_wait_iters}")
             config_lines.append(f"  timeout_iters: {self.timeout_iters}")

+        if self.spec_decoding_type:
+            config_lines.append("speculative_config:")
+            config_lines.append(f"  decoding_type: {self.spec_decoding_type}")
+            if self.num_nextn_predict_layers > 0:
+                config_lines.append(
+                    f"  num_nextn_predict_layers: {self.num_nextn_predict_layers}"
+                )
+            if self.eagle3_layers_to_capture > 0:
+                config_lines.append(
+                    f"  eagle3_layers_to_capture: {self.eagle3_layers_to_capture}"
+                )
+            if self.max_draft_len > 0:
+                config_lines.append(f"  max_draft_len: {self.max_draft_len}")
+            if self.speculative_model_dir:
+                spec_model_dir = os.path.join(llm_models_root(),
+                                              self.speculative_model_dir)
+                config_lines.append(
+                    f"  speculative_model_dir: {spec_model_dir}")
+
         return "\n".join(config_lines)

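For a concrete sense of the output, this is roughly the `speculative_config` fragment the method would emit for a hypothetical MTP-style run (`spec_decoding_type="MTP"`, `num_nextn_predict_layers=3`, `max_draft_len=3`, no separate draft model directory); the values are illustrative, not defaults:

```python
# Expected tail of the generated extra-llm-api-config.yml for the
# hypothetical settings above:
expected_fragment = "\n".join([
    "speculative_config:",
    "  decoding_type: MTP",
    "  num_nextn_predict_layers: 3",
    "  max_draft_len: 3",
])
print(expected_fragment)
```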
@@ -644,22 +701,26 @@ def __init__(self,
                  iterations: int,
                  isl: int,
                  osl: int,
-                 random_range_ratio: float = 0.0):
+                 random_range_ratio: float = 0.0,
+                 backend: str = "",
+                 use_chat_template: bool = False):
         self.name = name
         self.model_name = model_name
         self.concurrency = concurrency
         self.iterations = iterations
         self.isl = isl
         self.osl = osl
         self.random_range_ratio = random_range_ratio
+        self.backend = backend
+        self.use_chat_template = use_chat_template

         self.model_path = ""

     def to_cmd(self, working_dir: str) -> List[str]:
         model_dir = get_model_dir(self.model_name)
         self.model_path = model_dir if os.path.exists(
             model_dir) else self.model_name
-        return [
+        benchmark_cmd = [
             "python", "-m", "tensorrt_llm.serve.scripts.benchmark_serving",
             "--model", self.model_path, "--dataset-name", "random",
             "--random-ids", "--num-prompts",
@@ -670,17 +731,28 @@ def to_cmd(self, working_dir: str) -> List[str]:
             "--percentile-metrics", "ttft,tpot,itl,e2el", "--max-concurrency",
             str(self.concurrency)
         ]
+        if self.backend:
+            benchmark_cmd.append("--backend")
+            benchmark_cmd.append(self.backend)
+        if self.use_chat_template:
+            benchmark_cmd.append("--use-chat-template")
+        return benchmark_cmd

     def to_db_data(self) -> dict:
         """Convert ClientConfig to Database data"""
-        return {
+        db_data = {
             "l_concurrency": self.concurrency,
             "l_iterations": self.iterations,
             "l_isl": self.isl,
             "l_osl": self.osl,
             "d_random_range_ratio": self.random_range_ratio,
             "s_client_log_link": "",
         }
+        if self.backend:
+            db_data["s_backend"] = self.backend
+        if self.use_chat_template:
+            db_data["b_use_chat_template"] = self.use_chat_template
+        return db_data


 def parse_select_pattern(select_pattern: str):
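(As in `ServerConfig.to_db_data`, the `s_backend` and `b_use_chat_template` keys were originally added both in the dict literal and under the `if` guards; the literal duplicates are dropped so the columns appear only when set.) With the two new client options, the generated benchmark invocation simply grows a `--backend` value pair and, when requested, a bare `--use-chat-template` flag. A sketch of the resulting command tail for a hypothetical client with `backend="openai-chat"` and `use_chat_template=True`; the backend name and model path are assumptions for illustration:

```python
benchmark_cmd = [
    "python", "-m", "tensorrt_llm.serve.scripts.benchmark_serving",
    "--model", "/models/Qwen2-7B-Instruct",  # illustrative resolved path
    # ... dataset, ISL/OSL, and concurrency arguments as built above ...
]
backend = "openai-chat"        # hypothetical value
use_chat_template = True

if backend:
    benchmark_cmd += ["--backend", backend]
if use_chat_template:
    benchmark_cmd.append("--use-chat-template")
```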
@@ -2115,7 +2187,7 @@ def upload_test_results_to_database(self):

         # Get history data for each cmd_idx
         history_baseline_dict, history_data_dict = get_history_data(
-            new_data_dict)
+            new_data_dict, self._config.gpu_type)
         # Prepare regressive test cases
         regressive_data_list = prepare_regressive_test_cases(
             history_baseline_dict, new_data_dict)