import torch

from rtp_llm.config.model_config import ModelConfig
-from rtp_llm.ops import MlaOpsType
from rtp_llm.model_factory_register import register_model
from rtp_llm.model_loader.attn_weight import MlaAttnAtomicWeight, MlaConfig
from rtp_llm.model_loader.ffn_weight import (
)
from rtp_llm.models_py.model_desc.generic_moe import GenericMoeModel
from rtp_llm.models_py.model_desc.module_base import GptModelBase
+from rtp_llm.ops import MlaOpsType
from rtp_llm.utils.model_weight import (
    CkptWeightInfo,
    W,
@@ -73,7 +73,8 @@ def _get_hf_layer_weight_info(self, layer_id: int):
            kv_lora_rank=self.kv_lora_rank,
            nope_head_dim=self.nope_head_dim,
            v_head_dim=self.v_head_dim,
-            use_mla=self.model_config.attn_config.use_mla and self.model_config.mla_ops_type != MlaOpsType.MHA,
+            use_mla=self.model_config.attn_config.use_mla
+            and self.model_config.mla_ops_type != MlaOpsType.MHA,
            q_use_lora=self.q_use_lora,
        )
        layer_weights = [
@@ -225,7 +226,10 @@ def _get_hf_layer_weight_info(self, layer_id: int):
            )
        )

-        if self.model_config.attn_config.use_mla and self.model_config.mla_ops_type != MlaOpsType.MHA:
+        if (
+            self.model_config.attn_config.use_mla
+            and self.model_config.mla_ops_type != MlaOpsType.MHA
+        ):
            mla_layer_weights.append(
                MlaAttnAtomicWeight(
                    W.mla_kc,
@@ -522,7 +526,7 @@ def _create_python_model(self) -> Optional[GptModelBase]:
        py_hw_kernel_config = self.hw_kernel_config
        moe_config = self.moe_config
        max_generate_batch_size = self.max_generate_batch_size
-
+
        # Use GenericMoeModel with new config architecture
        # attention_type is determined from model_config.attn_config.use_mla
        self.py_model = GenericMoeModel(
@@ -546,11 +550,13 @@ def _from_hf(config: ModelConfig, ckpt_path: str):
        config_json = json.loads(content)
        config.inter_size = config_json["intermediate_size"]
        config.attn_config.head_num = config_json["num_attention_heads"]
-        config.attn_config.kv_head_num = config_json.get("num_key_value_heads", config.attn_config.head_num)
+        config.attn_config.kv_head_num = config_json.get(
+            "num_key_value_heads", config.attn_config.head_num
+        )
        config.num_layers = config_json["num_hidden_layers"]
-        config.attn_config.rope_config.base = int(config_json.get(
-            "rope_theta", config.attn_config.rope_config.base
-        ))
+        config.attn_config.rope_config.base = int(
+            config_json.get("rope_theta", config.attn_config.rope_config.base)
+        )
        config.vocab_size = config_json["vocab_size"]
        config.layernorm_eps = config_json.get("rms_norm_eps", 1e-06)
        config.tie_word_embeddings = config_json.get("tie_word_embeddings", False)
@@ -559,13 +565,19 @@ def _from_hf(config: ModelConfig, ckpt_path: str):
        # MLA config
        config.attn_config.use_mla = True
        q_lora_rank = config_json.get("q_lora_rank")
-        config.attn_config.q_lora_rank = int(q_lora_rank) if q_lora_rank is not None else 0
+        config.attn_config.q_lora_rank = (
+            int(q_lora_rank) if q_lora_rank is not None else 0
+        )
        kv_lora_rank = config_json.get("kv_lora_rank")
-        config.attn_config.kv_lora_rank = int(kv_lora_rank) if kv_lora_rank is not None else 0
+        config.attn_config.kv_lora_rank = (
+            int(kv_lora_rank) if kv_lora_rank is not None else 0
+        )
        config.attn_config.nope_head_dim = config_json["qk_nope_head_dim"]
        config.attn_config.rope_head_dim = config_json["qk_rope_head_dim"]
        config.attn_config.v_head_dim = config_json["v_head_dim"]
-        config.attn_config.size_per_head = config.attn_config.nope_head_dim + config.attn_config.rope_head_dim
+        config.attn_config.size_per_head = (
+            config.attn_config.nope_head_dim + config.attn_config.rope_head_dim
+        )
        config.attn_config.rope_config.dim = config.attn_config.rope_head_dim

        # yarn rotary config
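For reference, the MLA fields parsed above boil down to a little arithmetic on the HuggingFace config.json: the lora ranks default to 0 when absent, and the per-head size is the sum of the nope and rope head dims. A minimal standalone sketch (plain Python; the mla_dims helper and the sample numbers are illustrative assumptions, not part of this commit):

def mla_dims(config_json: dict) -> dict:
    # Mirrors the parsing above: lora ranks default to 0 when the key is
    # missing or null; size_per_head = qk_nope_head_dim + qk_rope_head_dim.
    q_lora_rank = config_json.get("q_lora_rank")
    kv_lora_rank = config_json.get("kv_lora_rank")
    return {
        "q_lora_rank": int(q_lora_rank) if q_lora_rank is not None else 0,
        "kv_lora_rank": int(kv_lora_rank) if kv_lora_rank is not None else 0,
        "size_per_head": (
            config_json["qk_nope_head_dim"] + config_json["qk_rope_head_dim"]
        ),
        "v_head_dim": config_json["v_head_dim"],
    }

# Illustrative numbers in the shape of public DeepSeek-V2/V3 configs:
# qk_nope_head_dim=128, qk_rope_head_dim=64 -> size_per_head = 192
print(mla_dims({"q_lora_rank": 1536, "kv_lora_rank": 512,
                "qk_nope_head_dim": 128, "qk_rope_head_dim": 64,
                "v_head_dim": 128}))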
@@ -575,8 +587,12 @@ def _from_hf(config: ModelConfig, ckpt_path: str):
            config.attn_config.rope_config.style = 5
            rope_scaling = config_json.get("rope_scaling")
            config.attn_config.rope_config.scale = rope_scaling["factor"]
-            config.attn_config.rope_config.factor1 = float(rope_scaling.get("beta_slow", 1))
-            config.attn_config.rope_config.factor2 = float(rope_scaling.get("beta_fast", 32))
+            config.attn_config.rope_config.factor1 = float(
+                rope_scaling.get("beta_slow", 1)
+            )
+            config.attn_config.rope_config.factor2 = float(
+                rope_scaling.get("beta_fast", 32)
+            )
            config.attn_config.rope_config.max_pos = rope_scaling[
                "original_max_position_embeddings"
            ]
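The yarn branch above copies the checkpoint's rope_scaling block onto the rope config: style 5 marks yarn, scale comes from factor, factor1/factor2 from beta_slow/beta_fast (defaulting to 1 and 32), and max_pos from original_max_position_embeddings. A standalone sketch of that mapping; the RopeConfig dataclass here is an illustrative stand-in, not the rtp_llm class:

from dataclasses import dataclass

@dataclass
class RopeConfig:  # stand-in for illustration only, not rtp_llm's rope config
    style: int = 0
    scale: float = 1.0
    factor1: float = 1.0
    factor2: float = 32.0
    max_pos: int = 0

def apply_yarn(rope_config: RopeConfig, rope_scaling: dict) -> RopeConfig:
    # Same field mapping and .get() fallbacks as the hunk above.
    rope_config.style = 5  # yarn
    rope_config.scale = rope_scaling["factor"]
    rope_config.factor1 = float(rope_scaling.get("beta_slow", 1))
    rope_config.factor2 = float(rope_scaling.get("beta_fast", 32))
    rope_config.max_pos = rope_scaling["original_max_position_embeddings"]
    return rope_config

# Example with illustrative numbers; missing beta_* keys fall back to 1 and 32.
print(apply_yarn(RopeConfig(), {"factor": 40,
                                "original_max_position_embeddings": 4096}))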
@@ -636,8 +652,25 @@ def get_weight_cls():

class DeepSeekV3MtpWeight(DeepSeekV2Weight):

-    def __init__(self, model_config: ModelConfig, parallelism_config, hw_kernel_config, kv_cache_config, merge_lora: bool = False, vit_config=None, **kwargs):
-        super().__init__(model_config=model_config, parallelism_config=parallelism_config, hw_kernel_config=hw_kernel_config, kv_cache_config=kv_cache_config, merge_lora=merge_lora, vit_config=vit_config, **kwargs)
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        parallelism_config,
+        hw_kernel_config,
+        kv_cache_config,
+        merge_lora: bool = False,
+        vit_config=None,
+        **kwargs,
+    ):
+        super().__init__(
+            model_config=model_config,
+            parallelism_config=parallelism_config,
+            hw_kernel_config=hw_kernel_config,
+            kv_cache_config=kv_cache_config,
+            merge_lora=merge_lora,
+            vit_config=vit_config,
+            **kwargs,
+        )

    def _get_weight_info(self):
        layer_weights: List[List[WeightModule]] = []