
Commit 46c70ff

Improve internvl for turbomind engine (#3769)
* support internvl using moe model as LLM part
* improvement
* update
* update
* add interns1 template
* update
* support pt interns1 hf
* support interns1 in turbomind
* fix embedding data type mapping
* add escaping for regex
* add escaping for regex
* fix linting

---------

Co-authored-by: RunningLeon <[email protected]>
1 parent a731a7f commit 46c70ff
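
With this change, InternS1-style checkpoints (InternVL with a Qwen3-MoE language model) can be loaded by the turbomind backend. A rough usage sketch, assuming the standard lmdeploy pipeline API; the model id and prompt below are placeholders, not part of this commit:

```python
# Hedged sketch: the repo id and prompt are placeholders, not taken from this commit.
from lmdeploy import TurbomindEngineConfig, pipeline

if __name__ == '__main__':
    pipe = pipeline(
        'internlm/Intern-S1',                       # placeholder HF repo id
        backend_config=TurbomindEngineConfig(tp=1),  # select the turbomind engine
    )
    print(pipe('Describe what an expert router does in a MoE layer.').text)
```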

13 files changed (+129, -151 lines)


lmdeploy/turbomind/deploy/source_model/deepseek_vl.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ class DeepSeekVLReader(LlamaReader):
     """DeepSeekVL model reader."""
 
     attn_layer_prefix = 'language_model.model.layers'
-    attn_layer_patten = r'language_model.model.layers.([0-9]+).'
+    attn_layer_patten = r'language_model\.model\.layers\.([0-9]+).'
     tok_embeddings_key = 'language_model.model.embed_tokens.weight'
     norm_weight_key = 'language_model.model.norm.weight'
     output_weight_key = 'language_model.lm_head.weight'
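
Most of the one-line changes in this commit escape the literal dots in `attn_layer_patten`. In a regex an unescaped `.` matches any character, so the old patterns could also match keys that merely resemble the layer prefix. A small illustration (the key below is contrived, not from a real checkpoint):

```python
import re

old = re.compile(r'language_model.model.layers.([0-9]+).')
new = re.compile(r'language_model\.model\.layers\.([0-9]+).')

key = 'language_modelXmodelXlayersX0X'   # contrived key; dots should not match 'X'
print(bool(old.match(key)))  # True  -- '.' matches any character
print(bool(new.match(key)))  # False -- '\.' requires a literal dot
```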

lmdeploy/turbomind/deploy/source_model/glm4.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@
 class Glm4Reader(LlamaReader):
     """Glm4Reader."""
 
-    attn_layer_patten = r'transformer.encoder.layers.([0-9]+).'
+    attn_layer_patten = r'transformer\.encoder\.layers\.([0-9]+).'
     tok_embeddings_key = 'transformer.embedding.word_embeddings.weight'
     norm_weight_key = 'transformer.encoder.final_layernorm.weight'
     output_weight_key = 'transformer.output_layer.weight'

lmdeploy/turbomind/deploy/source_model/internlm2.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ class InternLM2Reader(LlamaReader):
     """InternLM2 model reader."""
 
     attn_layer_prefix = 'model.layers'
-    attn_layer_patten = r'model.layers.([0-9]+).'
+    attn_layer_patten = r'model\.layers\.([0-9]+).'
     tok_embeddings_key = 'model.tok_embeddings.weight'
     norm_weight_key = 'model.norm.weight'
     output_weight_key = 'output.weight'
lmdeploy/turbomind/deploy/source_model/internvl.py

Lines changed: 31 additions & 53 deletions
@@ -1,18 +1,15 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-import json
-import os.path as osp
-
-from ..config import RopeParam
 from .base import INPUT_MODELS
 from .internlm2 import InternLM2Reader
 from .llama import LlamaModel, LlamaReader
+from .qwen import Qwen3MoeReader
 
 
 class InternVLReader(LlamaReader):
     """InternVLReader for llama model."""
 
     attn_layer_prefix = 'language_model.model.layers'
-    attn_layer_patten = r'language_model.model.layers.([0-9]+).'
+    attn_layer_patten = r'language_model\.model\.layers\.([0-9]+).'
     tok_embeddings_key = 'language_model.model.embed_tokens.weight'
     norm_weight_key = 'language_model.model.norm.weight'
     output_weight_key = 'language_model.lm_head.weight'
@@ -27,7 +24,7 @@ class InternVL2Reader(InternLM2Reader):
     """InternVLReader for InternLM2 model."""
 
     attn_layer_prefix = 'language_model.model.layers'
-    attn_layer_patten = r'language_model.model.layers.([0-9]+).'
+    attn_layer_patten = r'language_model\.model\.layers\.([0-9]+).'
     tok_embeddings_key = 'language_model.model.tok_embeddings.weight'
     norm_weight_key = 'language_model.model.norm.weight'
     output_weight_key = 'language_model.output.weight'
@@ -37,6 +34,22 @@ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, model_
         super().__init__(new_params, unused_params, last_bin, model_cfg, **kwargs)
 
 
+class InternS1Reader(Qwen3MoeReader):
+    """InternVL3Reader for InternVL+Qwen3MoE model."""
+
+    attn_layer_prefix = 'model.language_model.layers'
+    attn_layer_patten = r'model\.language_model\.layers\.([0-9]+).'
+    tok_embeddings_key = 'model.language_model.embed_tokens.weight'
+    norm_weight_key = 'model.language_model.norm.weight'
+    output_weight_key = 'lm_head.weight'
+
+    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, model_cfg: dict, **kwargs):
+        model_cfg = model_cfg.get('text_config')
+        if model_cfg is None:
+            raise ValueError(f'Miss "text_config" in model config: {model_cfg}')
+        super().__init__(new_params, unused_params, last_bin, model_cfg, **kwargs)
+
+
 @INPUT_MODELS.register_module(name='internvl')
 class InternVLModel(LlamaModel):
     """InternVL model in hf format."""
@@ -45,53 +58,18 @@ def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
         super().__init__(model_path, tokenizer_path, **kwargs)
         from transformers import AutoConfig
         config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
-        llm_config = getattr(config, 'llm_config', None) or getattr(config, 'text_config', None)
-        arch = llm_config.architectures[0]
-        _readers = dict(InternLM2ForCausalLM=InternVL2Reader,
-                        LlamaForCausalLM=InternVLReader,
-                        Qwen2ForCausalLM=InternVLReader)
-        self.Reader = _readers[arch]
+        self.llm_config = getattr(config, 'llm_config', None) or getattr(config, 'text_config', None)
+        arch = self.llm_config.architectures[0]
+        relations = dict(
+            InternLM2ForCausalLM=('internlm2', InternVL2Reader),
+            LlamaForCausalLM=('llama', InternVLReader),
+            Qwen2ForCausalLM=('qwen2', InternVLReader),
+            Qwen3MoeForCausalLM=('qwen3-moe', InternS1Reader),
+        )
+        llm_model, self.Reader = relations[arch]
+        self.llm_model = INPUT_MODELS.get(llm_model)(model_path=model_path, tokenizer_path=tokenizer_path, **kwargs)
 
     def model_info(self):
         """Read model info."""
-        params_path = osp.join(self.model_path, 'config.json')
-        with open(params_path) as f:
-            file_content = json.load(f)
-        model_arg = file_content.get('llm_config') or file_content.get('text_config')
-        num_layer = model_arg['num_hidden_layers']
-        norm_eps = model_arg['rms_norm_eps']
-        hidden_units = model_arg['hidden_size']
-        attn_head_num = model_arg['num_attention_heads']
-        vocab_size = model_arg['vocab_size']
-        inter_size = model_arg['intermediate_size']
-        if 'num_key_value_heads' in model_arg:
-            kv_head_num = model_arg['num_key_value_heads']
-        else:
-            kv_head_num = model_arg['num_attention_heads']
-        rope_theta = float(model_arg.get('rope_theta', 10000.0))
-        max_position_embeddings = int(model_arg.get('max_position_embeddings', 0))
-        rope_scaling = model_arg.get('rope_scaling', None)
-        scaling_factor = 0.0
-        scaling_type = 'default'
-        if isinstance(rope_scaling, dict):
-            scaling_type = model_arg['rope_scaling'].get('type', 'default')
-            scaling_factor = model_arg['rope_scaling'].get('factor', '')
-        attn_bias = 1 if model_arg['architectures'][0] == 'Qwen2ForCausalLM' else 0
-        rotary_embedding = hidden_units // attn_head_num
-        rope_param = RopeParam(type=scaling_type,
-                               base=rope_theta,
-                               dim=rotary_embedding,
-                               max_position_embeddings=max_position_embeddings,
-                               factor=scaling_factor)
-
-        return dict(num_layer=num_layer,
-                    size_per_head=hidden_units // attn_head_num,
-                    attn_bias=attn_bias,
-                    norm_eps=norm_eps,
-                    hidden_units=hidden_units,
-                    inter_size=inter_size,
-                    vocab_size=vocab_size,
-                    head_num=attn_head_num,
-                    kv_head_num=kv_head_num,
-                    max_position_embeddings=max_position_embeddings,
-                    rope_param=rope_param)
+        self.llm_model.model_config = self.llm_config.to_dict()
+        return self.llm_model.model_info()
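
After this rework, `InternVLModel` no longer re-parses `config.json`: it instantiates the registered input model for its LLM part (picked from `relations` by architecture name) and forwards `model_info()` to it after handing over the `llm_config`/`text_config`. A stripped-down sketch of that delegation, using hypothetical stand-in classes:

```python
# Sketch only: the Dummy* classes are hypothetical stand-ins, not lmdeploy code.
class DummyLLMModel:
    """Plays the role of the registered LLM input model (e.g. 'qwen3-moe')."""

    def __init__(self):
        self.model_config = {}

    def model_info(self):
        # The real readers derive num_layer, head counts, rope params, etc.
        return {'num_layer': self.model_config.get('num_hidden_layers')}


class DummyVLModel:
    """Plays the role of InternVLModel: keep only the LLM sub-config and delegate."""

    def __init__(self, full_config: dict):
        # newer HF layouts use 'text_config'; older InternVL configs use 'llm_config'
        self.llm_config = full_config.get('llm_config') or full_config.get('text_config')
        self.llm_model = DummyLLMModel()

    def model_info(self):
        self.llm_model.model_config = self.llm_config
        return self.llm_model.model_info()


print(DummyVLModel({'text_config': {'num_hidden_layers': 48}}).model_info())
# {'num_layer': 48}
```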

lmdeploy/turbomind/deploy/source_model/llama.py

Lines changed: 70 additions & 75 deletions
@@ -1,7 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-import json
 import math
-import os.path as osp
 import re
 
 import torch
@@ -17,7 +15,7 @@ class LlamaReader(BaseReader):
     """LlamaReader."""
 
     attn_layer_prefix = 'model.layers'
-    attn_layer_patten = r'model.layers.([0-9]+).'
+    attn_layer_patten = r'model\.layers\.([0-9]+).'
     tok_embeddings_key = 'model.embed_tokens.weight'
     norm_weight_key = 'model.norm.weight'
     output_weight_key = 'lm_head.weight'
@@ -118,79 +116,76 @@ def readers(self):
 
     def model_info(self):
         """Read model info."""
-        params_path = osp.join(self.model_path, 'config.json')
-        with open(params_path) as f:
-            model_arg = json.load(f)
-        num_layer = model_arg['num_hidden_layers']
-        norm_eps = model_arg['rms_norm_eps']
-        attn_head_num = model_arg['num_attention_heads']
-        vocab_size = model_arg['vocab_size']
-        inter_size = model_arg['intermediate_size']
-        if 'num_key_value_heads' in model_arg:
-            kv_head_num = model_arg['num_key_value_heads']
-        else:
-            kv_head_num = model_arg['num_attention_heads']
-        hidden_units = model_arg['hidden_size']
-        head_dim = model_arg.get('head_dim', hidden_units // attn_head_num)
-        # compute rope param
-        rope_theta = float(model_arg.get('rope_theta', 10000.0))
-        max_position_embeddings = int(model_arg.get('max_position_embeddings', 0))
-        rope_param = RopeParam(type='default', base=rope_theta, dim=head_dim)
-        rope_scaling = model_arg.get('rope_scaling', None)
-        if isinstance(rope_scaling, dict):
-            llama2_scaling_type = rope_scaling.get('type', '')
-            llama3_scaling_type = rope_scaling.get('rope_type', '')
-            if llama2_scaling_type and llama3_scaling_type \
-                    and llama2_scaling_type != llama3_scaling_type:
-                raise ValueError(f'Ambiguous rope_scaling in config: {model_arg}')
-            scaling_type = llama2_scaling_type if llama2_scaling_type \
-                else llama3_scaling_type
-            if rope_scaling.get('mrope_section') is not None:
-                # TODO: treat mrope as an option to the common rope functions
-                scaling_type = 'mrope'
-            scaling_factor = rope_scaling.get('factor', 0.0)
-            if scaling_type == 'default':
-                pass
-            elif scaling_type == 'dynamic':
-                rope_param.type = 'dynamic'
-                rope_param.factor = scaling_factor
-                rope_param.max_position_embeddings = max_position_embeddings
-            elif scaling_type == 'linear':
-                rope_param.type = 'linear'
-                rope_param.factor = scaling_factor
-            elif scaling_type == 'llama3':
-                low_freq_factor = rope_scaling.get('low_freq_factor', 1.0)
-                high_freq_factor = rope_scaling.get('high_freq_factor', 1.0)
-                original_max_position_embeddings = model_arg['rope_scaling'].get(
-                    'original_max_position_embeddings', 0)
-                rope_param.type = 'llama3'
-                rope_param.factor = scaling_factor
-                rope_param.low_freq_factor = low_freq_factor
-                rope_param.high_freq_factor = high_freq_factor
-                rope_param.original_max_position_embeddings = original_max_position_embeddings
-            elif scaling_type == 'yarn':
-                attention_factor = rope_scaling.get('attention_factor', None)
-                if attention_factor is None:
-                    attention_factor = 0.1 * math.log(scaling_factor) + 1.0
-                beta_fast = rope_scaling.get('beta_fast', 32.0)
-                beta_slow = rope_scaling.get('beta_slow', 1.0)
-                rope_param.type = 'yarn'
-                if 'original_max_position_embeddings' in rope_scaling:
-                    original_max_position_embeddings = rope_scaling['original_max_position_embeddings']
-                    scaling_factor = max_position_embeddings / original_max_position_embeddings
-                else:
-                    original_max_position_embeddings = max_position_embeddings
-                rope_param.factor = scaling_factor
-                rope_param.max_position_embeddings = original_max_position_embeddings
-                rope_param.attention_factor = attention_factor
-                rope_param.beta_fast = beta_fast
-                rope_param.beta_slow = beta_slow
-            elif scaling_type == 'mrope':
-                mrope_section = rope_scaling.get('mrope_section')
-                rope_param.type = 'mrope'
-                rope_param.mrope_section = mrope_section
+        model_arg = self.model_config
+        num_layer = model_arg['num_hidden_layers']
+        norm_eps = model_arg['rms_norm_eps']
+        attn_head_num = model_arg['num_attention_heads']
+        vocab_size = model_arg['vocab_size']
+        inter_size = model_arg['intermediate_size']
+        if 'num_key_value_heads' in model_arg:
+            kv_head_num = model_arg['num_key_value_heads']
+        else:
+            kv_head_num = model_arg['num_attention_heads']
+        hidden_units = model_arg['hidden_size']
+        head_dim = model_arg.get('head_dim', hidden_units // attn_head_num)
+        # compute rope param
+        rope_theta = float(model_arg.get('rope_theta', 10000.0))
+        max_position_embeddings = int(model_arg.get('max_position_embeddings', 0))
+        rope_param = RopeParam(type='default', base=rope_theta, dim=head_dim)
+        rope_scaling = model_arg.get('rope_scaling', None)
+        if isinstance(rope_scaling, dict):
+            llama2_scaling_type = rope_scaling.get('type', '')
+            llama3_scaling_type = rope_scaling.get('rope_type', '')
+            if llama2_scaling_type and llama3_scaling_type \
+                    and llama2_scaling_type != llama3_scaling_type:
+                raise ValueError(f'Ambiguous rope_scaling in config: {model_arg}')
+            scaling_type = llama2_scaling_type if llama2_scaling_type \
+                else llama3_scaling_type
+            if rope_scaling.get('mrope_section') is not None:
+                # TODO: treat mrope as an option to the common rope functions
+                scaling_type = 'mrope'
+            scaling_factor = rope_scaling.get('factor', 0.0)
+            if scaling_type == 'default':
+                pass
+            elif scaling_type == 'dynamic':
+                rope_param.type = 'dynamic'
+                rope_param.factor = scaling_factor
+                rope_param.max_position_embeddings = max_position_embeddings
+            elif scaling_type == 'linear':
+                rope_param.type = 'linear'
+                rope_param.factor = scaling_factor
+            elif scaling_type == 'llama3':
+                low_freq_factor = rope_scaling.get('low_freq_factor', 1.0)
+                high_freq_factor = rope_scaling.get('high_freq_factor', 1.0)
+                original_max_position_embeddings = model_arg['rope_scaling'].get('original_max_position_embeddings', 0)
+                rope_param.type = 'llama3'
+                rope_param.factor = scaling_factor
+                rope_param.low_freq_factor = low_freq_factor
+                rope_param.high_freq_factor = high_freq_factor
+                rope_param.original_max_position_embeddings = original_max_position_embeddings
+            elif scaling_type == 'yarn':
+                attention_factor = rope_scaling.get('attention_factor', None)
+                if attention_factor is None:
+                    attention_factor = 0.1 * math.log(scaling_factor) + 1.0
+                beta_fast = rope_scaling.get('beta_fast', 32.0)
+                beta_slow = rope_scaling.get('beta_slow', 1.0)
+                rope_param.type = 'yarn'
+                if 'original_max_position_embeddings' in rope_scaling:
+                    original_max_position_embeddings = rope_scaling['original_max_position_embeddings']
+                    scaling_factor = max_position_embeddings / original_max_position_embeddings
                 else:
-                raise RuntimeError(f'Unsupported rope type: {scaling_type}')
+                    original_max_position_embeddings = max_position_embeddings
+                rope_param.factor = scaling_factor
+                rope_param.max_position_embeddings = original_max_position_embeddings
+                rope_param.attention_factor = attention_factor
+                rope_param.beta_fast = beta_fast
+                rope_param.beta_slow = beta_slow
+            elif scaling_type == 'mrope':
+                mrope_section = rope_scaling.get('mrope_section')
+                rope_param.type = 'mrope'
+                rope_param.mrope_section = mrope_section
+            else:
+                raise RuntimeError(f'Unsupported rope type: {scaling_type}')
 
         return dict(size_per_head=head_dim,
                     num_layer=num_layer,
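
The `model_info()` body is unchanged apart from reading `self.model_config` (now populated by the caller, e.g. the InternVL wrapper above) instead of opening `config.json` itself. As a quick worked example of the yarn branch, using a hypothetical config:

```python
import math

# Hypothetical HF-style text config, trimmed to the rope-related fields read above.
model_arg = {
    'max_position_embeddings': 262144,
    'rope_theta': 1000000.0,
    'rope_scaling': {
        'rope_type': 'yarn',
        'factor': 4.0,
        'original_max_position_embeddings': 65536,
    },
}

rope_scaling = model_arg['rope_scaling']
max_position_embeddings = int(model_arg.get('max_position_embeddings', 0))
scaling_factor = rope_scaling.get('factor', 0.0)          # 4.0, taken from the config
attention_factor = rope_scaling.get('attention_factor', None)
if attention_factor is None:
    # default used by the yarn branch when the config omits attention_factor
    attention_factor = 0.1 * math.log(scaling_factor) + 1.0
if 'original_max_position_embeddings' in rope_scaling:
    # the factor is recomputed from the position-embedding ratio: 262144 / 65536 = 4.0
    scaling_factor = max_position_embeddings / rope_scaling['original_max_position_embeddings']

print(scaling_factor, round(attention_factor, 4))  # 4.0 1.1386
```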

lmdeploy/turbomind/deploy/source_model/llava.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ class LlavaReader(LlamaReader):
     """LlavaReader for llama model."""
 
     attn_layer_prefix = 'language_model.model.layers'
-    attn_layer_patten = r'language_model.model.layers.([0-9]+).'
+    attn_layer_patten = r'language_model\.model\.layers\.([0-9]+).'
     tok_embeddings_key = 'language_model.model.embed_tokens.weight'
     norm_weight_key = 'language_model.model.norm.weight'
     output_weight_key = 'language_model.lm_head.weight'

lmdeploy/turbomind/deploy/source_model/minicpmv.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ class MiniCPMVReader(LlamaReader):
     """MiniCPMVReader for llama model."""
 
     attn_layer_prefix = 'llm.model.layers'
-    attn_layer_patten = r'llm.model.layers.([0-9]+).'
+    attn_layer_patten = r'llm\.model\.layers\.([0-9]+).'
     tok_embeddings_key = 'llm.model.embed_tokens.weight'
     norm_weight_key = 'llm.model.norm.weight'
     output_weight_key = 'llm.lm_head.weight'

lmdeploy/turbomind/deploy/source_model/molmo.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
 
 class MolmoReader(LlamaReader):
     attn_layer_prefix = 'model.transformer.blocks'
-    attn_layer_patten = r'model.transformer.blocks.([0-9]+).'
+    attn_layer_patten = r'model\.transformer\.blocks\.([0-9]+).'
     norm_weight_key = 'model.transformer.ln_f.weight'
     output_weight_key = 'model.transformer.ff_out.weight'
 
lmdeploy/turbomind/deploy/source_model/qwen.py

Lines changed: 5 additions & 5 deletions
@@ -12,7 +12,7 @@
 class QwenReader(LlamaReader):
     """QwenReader."""
 
-    attn_layer_patten = r'transformer.h.([0-9]+).'
+    attn_layer_patten = r'transformer\.h\.([0-9]+).'
     tok_embeddings_key = 'transformer.wte.weight'
     norm_weight_key = 'transformer.ln_f.weight'
     output_weight_key = 'lm_head.weight'
@@ -124,28 +124,28 @@ def moe_ffn_expert(self, e=None, i=None, kind=None):
             return self.filter(r'experts')
         result = []
         for key in ['gate', 'down', 'up']:
-            name = f'model.layers.{i}.mlp.experts.{e}.{key}_proj.{kind}'
+            name = f'{self.attn_layer_prefix}.{i}.mlp.experts.{e}.{key}_proj.{kind}'
             tensor = self.params.get(name)
             tensor = self.transform(tensor, kind)
             result.append(tensor)
         return (*result, )
 
     def moe_ffn_gate(self, i):
-        return self.transform(self.params.get(f'model.layers.{i}.mlp.gate.weight'), 'weight')
+        return self.transform(self.params.get(f'{self.attn_layer_prefix}.{i}.mlp.gate.weight'), 'weight')
 
     def _ffn(self, i: int, kind: str):
         """Get ffn kind for layer i."""
         if not kind:
             return self.filter(self.ffn_pattern)
         result = []
         for key in ['gate', 'down', 'up']:
-            tensor = self.params[f'model.layers.{i}.mlp.shared_expert.{key}_proj.{kind}']
+            tensor = self.params[f'{self.attn_layer_prefix}.{i}.mlp.shared_expert.{key}_proj.{kind}']
             tensor = self.transform(tensor, kind)
             result.append(tensor)
         return (*result, )
 
     def moe_ffn_shared_gate(self, i):
-        return self.params.get(f'model.layers.{i}.mlp.shared_expert_gate.weight')
+        return self.params.get(f'{self.attn_layer_prefix}.{i}.mlp.shared_expert_gate.weight')
 
 
 @INPUT_MODELS.register_module(name='qwen2-moe')
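
Switching the hard-coded `model.layers` prefix to `self.attn_layer_prefix` lets subclasses such as the new `InternS1Reader` reuse these MoE weight lookups against a different checkpoint layout. A quick sketch of the resulting key names (the layer and expert indices are arbitrary examples):

```python
# Sketch of how the prefixed key names are built; indices are arbitrary examples.
def expert_key(attn_layer_prefix: str, i: int, e: int, key: str, kind: str) -> str:
    return f'{attn_layer_prefix}.{i}.mlp.experts.{e}.{key}_proj.{kind}'

print(expert_key('model.layers', 0, 3, 'gate', 'weight'))
# model.layers.0.mlp.experts.3.gate_proj.weight                  (Qwen3-MoE layout)
print(expert_key('model.language_model.layers', 0, 3, 'gate', 'weight'))
# model.language_model.layers.0.mlp.experts.3.gate_proj.weight   (InternS1 layout)
```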

lmdeploy/turbomind/supported_models.py

Lines changed: 1 addition & 2 deletions
@@ -42,13 +42,12 @@
     InternVLChatModel='internvl',
     # internvl3
     InternVLForConditionalGeneration='internvl',
+    InternS1ForConditionalGeneration='internvl',
     # deepseek-vl
     MultiModalityCausalLM='deepseekvl',
     DeepseekV2ForCausalLM='deepseek2',
     # MiniCPMV
     MiniCPMV='minicpmv',
-    # mini gemini
-    MGMLlamaForCausalLM='llama',
     # chatglm2/3, glm4
     ChatGLMModel='glm4',
     ChatGLMForConditionalGeneration='glm4',
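
`InternS1ForConditionalGeneration` is now routed to the `internvl` input model, and the stale mini-gemini entry is removed. A minimal sketch of how such an architecture map is typically consulted (the helper below is illustrative, not the actual `supported_models.py` API):

```python
# Illustrative only: a trimmed copy of the mapping and a hypothetical lookup helper.
SUPPORTED_ARCHS = dict(
    InternVLChatModel='internvl',
    InternVLForConditionalGeneration='internvl',
    InternS1ForConditionalGeneration='internvl',
)

def turbomind_model_name(architecture: str) -> str:
    name = SUPPORTED_ARCHS.get(architecture)
    if name is None:
        raise ValueError(f'{architecture} is not supported by the turbomind engine')
    return name

print(turbomind_model_name('InternS1ForConditionalGeneration'))  # internvl
```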
