@@ -1,87 +1,10 @@
 import ast
 
 import vllm.envs as envs
-from transformers import PretrainedConfig
-from vllm.config import ModelConfig
 from vllm.config.speculative import SpeculativeConfig
 from vllm.logger import logger
 
 
-# mypy: ignore-errors
-@property
-def is_deepseek_mla(self: ModelConfig):
-    if not hasattr(self.hf_text_config, "model_type"):
-        return False
-    elif self.hf_text_config.model_type in \
-            ('deepseek_v2', 'deepseek_v3', 'deepseek_mtp',
-             'kimi_k2', 'longcat_flash', 'deepseek_v32'):
-        return self.hf_text_config.kv_lora_rank is not None
-    elif self.hf_text_config.model_type == 'eagle':
-        # if the model is an EAGLE module, check for the
-        # underlying architecture
-        return self.hf_text_config.model.model_type in \
-            ('deepseek_v2', 'deepseek_v3', 'deepseek_v32') \
-            and self.hf_text_config.kv_lora_rank is not None
-    return False
-
-
-@staticmethod
-def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
-    if hf_config.model_type in ("deepseek_v3", "deepseek_v32"):
-        hf_config.model_type = "deepseek_mtp"
-    if hf_config.model_type == "deepseek_mtp":
-        n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
-        hf_config.update({
-            "n_predict": n_predict,
-            "architectures": ["DeepSeekMTPModel"]
-        })
-
-    if hf_config.architectures[0] == "MiMoForCausalLM":
-        hf_config.model_type = "mimo_mtp"
-        n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
-        hf_config.update({
-            "num_hidden_layers": 0,
-            "n_predict": n_predict,
-            "architectures": ["MiMoMTPModel"]
-        })
-
-    if hf_config.architectures[0] == "Glm4MoeForCausalLM":
-        hf_config.model_type = "glm4_moe_mtp"
-        n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
-        hf_config.update({
-            "num_hidden_layers": 0,
-            "n_predict": n_predict,
-            "architectures": ["Glm4MoeMTPModel"]
-        })
-
-    if hf_config.model_type == "ernie4_5_moe":
-        hf_config.model_type = "ernie_mtp"
-    if hf_config.model_type == "ernie_mtp":
-        n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
-        hf_config.update({
-            "n_predict": n_predict,
-            "architectures": ["ErnieMTPModel"]
-        })
-
-    if hf_config.model_type == "qwen3_next":
-        hf_config.model_type = "qwen3_next_mtp"
-    if hf_config.model_type == "qwen3_next_mtp":
-        n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
-        hf_config.update({
-            "n_predict": n_predict,
-            "architectures": ["Qwen3NextMTP"]
-        })
-    if hf_config.model_type == "longcat_flash":
-        hf_config.model_type = "longcat_flash_mtp"
-        n_predict = getattr(hf_config, "num_nextn_predict_layers", 1)
-        hf_config.update({
-            "n_predict": n_predict,
-            "architectures": ["LongCatFlashMTPModel"]
-        })
-
-    return hf_config
-
-
 def __post_init__(self):
 
     # Note: "method" is a new parameter that helps to extend the
@@ -308,6 +231,4 @@ def __post_init__(self):
                    self.draft_tensor_parallel_size))
 
 
-ModelConfig.is_deepseek_mla = is_deepseek_mla
 SpeculativeConfig.__post_init__ = __post_init__
-SpeculativeConfig.hf_config_override = hf_config_override
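For reference, the assignments at the bottom of this diff follow the monkey-patching pattern this patch module relies on: a locally defined function is assigned onto the upstream class, replacing the original method at import time. Below is a minimal sketch of that pattern only; "Upstream" and "patched_post_init" are illustrative names, not vLLM APIs.

# Minimal sketch of class-level monkey-patching (illustrative names only).
class Upstream:
    def __post_init__(self):
        print("original behaviour")


def patched_post_init(self):
    # The replacement runs in place of the original method for every instance.
    print("patched behaviour")


# Rebinding the attribute on the class is what the retained line
# "SpeculativeConfig.__post_init__ = __post_init__" does above.
Upstream.__post_init__ = patched_post_init

Upstream().__post_init__()  # prints "patched behaviour"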