|
57 | 57 | "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8", |
58 | 58 | "llama_v3.3_70b_instruct_fp4": |
59 | 59 | "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4", |
60 | | - "llama_v3.3_70b_instruct": "llama-3.3-models/Llama-3.3-70B-Instruct", |
61 | 60 | "llama_v3.1_405b_instruct_fp8": |
62 | 61 | "llama-3.1-model/Llama-3.1-405B-Instruct-FP8", |
63 | 62 | "llama_v3.1_405b_instruct_fp4": |
@@ -783,6 +782,8 @@ def __init__( |
783 | 782 | tp_size: int = 1, |
784 | 783 | pp_size: int = 1, |
785 | 784 | num_gpus: int = 1, |
| 785 | + # Extra flag to enable pytorch_model_config reading for the TRT backend
| 786 | + extra: bool = False, |
786 | 787 | # _autodeploy backend specific parameters |
787 | 788 | ad_compile_backend: str = "torch-opt", |
788 | 789 | free_mem_ratio: float = 0.9, |
@@ -841,6 +842,8 @@ def __init__( |
841 | 842 | self.pp_size = pp_size |
842 | 843 | # Number of GPUs. |
843 | 844 | self.num_gpus = num_gpus |
| 845 | + # Extra flag to enable pytorch_model_config reading for TRT backend |
| 846 | + self.extra = extra |
844 | 847 | # _autodeploy backend specific parameters |
845 | 848 | self.ad_compile_backend = ad_compile_backend |
846 | 849 | self.free_mem_ratio = free_mem_ratio |
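
For orientation, a minimal sketch of how the new keyword is meant to be used; the class and field names below are stand-ins, since only a slice of __init__ appears in this diff. The key point is that extra defaults to False, so existing call sites and test names keep their current behaviour unless the flag is opted into.

    from dataclasses import dataclass

    @dataclass
    class _PerfConfigSketch:        # stand-in for the config class patched above
        tp_size: int = 1
        pp_size: int = 1
        num_gpus: int = 1
        extra: bool = False         # new flag; the default keeps existing tests unchanged

    cfg = _PerfConfigSketch(num_gpus=8, extra=True)
    assert cfg.extra and cfg.num_gpus == 8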
@@ -1016,6 +1019,10 @@ def to_string(self, |
1016 | 1019 | if self.num_gpus > 1: |
1017 | 1020 | entries.append(f"gpus:{self.num_gpus}") |
1018 | 1021 |
|
| 1022 | + # Add the extra flag for extra-llm-api-config.yml.
| 1023 | + if self.extra: |
| 1024 | + entries.append("extra") |
| 1025 | + |
1019 | 1026 | # Concatenate labels with "-". |
1020 | 1027 | return "-".join(entries) |
1021 | 1028 |
|
@@ -1180,6 +1187,11 @@ def load_from_str(self, test_param_labels) -> None: |
1180 | 1187 | self.num_gpus = 1 if not labels[0].startswith("gpus:") else int( |
1181 | 1188 | labels.pop(0).replace("gpus:", "")) |
1182 | 1189 |
|
| 1190 | + if len(labels) > 0: |
| 1191 | + self.extra = labels[0] == "extra"
| 1192 | + if self.extra: |
| 1193 | + labels.pop(0) |
| 1194 | + |
1183 | 1195 | assert len( |
1184 | 1196 | labels |
1185 | 1197 | ) == 0, f"Invalid test name! Some labels cannot be parsed: {labels}" |
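
Taken together, the to_string() and load_from_str() hunks form a small round-trip for the new label: "extra" is always emitted after the optional "gpus:N" entry, and is popped off in the same position when a test name is parsed back. A standalone sketch of that handshake, with the rest of the label grammar simplified to a couple of hypothetical entries:

    def build_tail(entries, num_gpus=1, extra=False):
        # Mirrors to_string(): "gpus:N" first (only when N > 1), then "extra".
        if num_gpus > 1:
            entries.append(f"gpus:{num_gpus}")
        if extra:
            entries.append("extra")
        return "-".join(entries)

    def parse_tail(labels):
        # Mirrors load_from_str(): consume "gpus:N" if present, then "extra".
        num_gpus = 1
        if labels and labels[0].startswith("gpus:"):
            num_gpus = int(labels.pop(0).replace("gpus:", ""))
        extra = bool(labels) and labels[0] == "extra"
        if extra:
            labels.pop(0)
        assert not labels, f"Invalid test name! Some labels cannot be parsed: {labels}"
        return num_gpus, extra

    name = build_tail(["llama_v3.3_70b_instruct_fp4", "bench"], num_gpus=8, extra=True)
    # -> "llama_v3.3_70b_instruct_fp4-bench-gpus:8-extra"
    assert parse_tail(name.split("-")[2:]) == (8, True)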
@@ -1644,18 +1656,26 @@ def get_trtllm_bench_command(self, engine_dir): |
1644 | 1656 | benchmark_cmd += [f"--pp={self._config.pp_size}"] |
1645 | 1657 | if self._config.streaming == "streaming": |
1646 | 1658 | benchmark_cmd += [f"--streaming"] |
1647 | | - #use default yaml config |
1648 | | - if self._config.backend == "pytorch": |
| 1659 | + |
| 1660 | + # Add extra-llm-api-config.yml for the pytorch backend, and for the tensorrt backend when the extra flag is set
| 1661 | + if self._config.backend == "pytorch" or (self._config.backend == "" |
| 1662 | + and self._config.extra): |
1649 | 1663 | pytorch_config_path = os.path.join(engine_dir, |
1650 | 1664 | "extra-llm-api-config.yml") |
1651 | 1665 | if not os.path.exists(pytorch_config_path): |
1652 | 1666 | os.makedirs(os.path.dirname(pytorch_config_path), exist_ok=True) |
1653 | 1667 | config = get_model_yaml_config(self._config.to_string(), |
1654 | 1668 | lora_dirs=self.lora_dirs) |
1655 | | - print_info(f"pytorch model config: {config}") |
1656 | | - with open(pytorch_config_path, 'w') as f: |
1657 | | - yaml.dump(config, f, default_flow_style=False) |
1658 | | - benchmark_cmd += [f"--extra_llm_api_options={pytorch_config_path}"] |
| 1669 | + if config: |
| 1670 | + print_info(f"pytorch/TRT model config: {config}") |
| 1671 | + with open(pytorch_config_path, 'w') as f: |
| 1672 | + yaml.dump(config, f, default_flow_style=False) |
| 1673 | + benchmark_cmd += [ |
| 1674 | + f"--extra_llm_api_options={pytorch_config_path}" |
| 1675 | + ] |
| 1676 | + # If guided_decoding_backend is set, the tokenizer must be initialized
| 1677 | + if config.get('guided_decoding_backend') is not None: |
| 1678 | + benchmark_cmd += ["--no_skip_tokenizer_init"] |
1659 | 1679 | elif self._config.backend == "_autodeploy": |
1660 | 1680 | autodeploy_config_path = os.path.join(engine_dir, |
1661 | 1681 | "extra_llm_api_options.yaml") |
|
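
For reference, a sketch of what the reworked branch produces when the extra flag is set for the TRT backend (backend == ""). The config keys below are illustrative only; the real contents come from get_model_yaml_config(), which is outside this diff. The one key that matters for the command line is guided_decoding_backend, which additionally appends --no_skip_tokenizer_init.

    import os
    import tempfile
    import yaml

    engine_dir = tempfile.mkdtemp()              # stand-in for the real engine dir
    config = {
        "guided_decoding_backend": "xgrammar",   # hypothetical value from get_model_yaml_config()
        "enable_chunked_prefill": True,          # hypothetical extra option
    }

    benchmark_cmd = []
    config_path = os.path.join(engine_dir, "extra-llm-api-config.yml")
    os.makedirs(os.path.dirname(config_path), exist_ok=True)
    if config:
        with open(config_path, "w") as f:
            yaml.dump(config, f, default_flow_style=False)
        benchmark_cmd += [f"--extra_llm_api_options={config_path}"]
        if config.get("guided_decoding_backend") is not None:
            benchmark_cmd += ["--no_skip_tokenizer_init"]
    print(benchmark_cmd)
    # e.g. ['--extra_llm_api_options=/tmp/.../extra-llm-api-config.yml', '--no_skip_tokenizer_init']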