
Commit d573a17

support internvl3 (#3842)
* update
* tested
* revert device

Co-authored-by: hjh <[email protected]>
1 parent 0f07783


5 files changed: 56 additions & 1 deletion


docs/source/Instruction/支持的模型和数据集.md

Lines changed: 7 additions & 0 deletions
@@ -620,6 +620,13 @@
 |[OpenGVLab/InternVL2_5-26B-MPO](https://modelscope.cn/models/OpenGVLab/InternVL2_5-26B-MPO)|internvl2_5|internvl2_5|transformers>=4.36, timm|&#x2718;|vision, video|[OpenGVLab/InternVL2_5-26B-MPO](https://huggingface.co/OpenGVLab/InternVL2_5-26B-MPO)|
 |[OpenGVLab/InternVL2_5-38B-MPO](https://modelscope.cn/models/OpenGVLab/InternVL2_5-38B-MPO)|internvl2_5|internvl2_5|transformers>=4.36, timm|&#x2718;|vision, video|[OpenGVLab/InternVL2_5-38B-MPO](https://huggingface.co/OpenGVLab/InternVL2_5-38B-MPO)|
 |[OpenGVLab/InternVL2_5-78B-MPO](https://modelscope.cn/models/OpenGVLab/InternVL2_5-78B-MPO)|internvl2_5|internvl2_5|transformers>=4.36, timm|&#x2718;|vision, video|[OpenGVLab/InternVL2_5-78B-MPO](https://huggingface.co/OpenGVLab/InternVL2_5-78B-MPO)|
+|[OpenGVLab/InternVL3-1B](https://modelscope.cn/models/OpenGVLab/InternVL3-1B)|internvl3|internvl2_5|transformers>=4.37.2, timm|&#x2718;|vision, video|[OpenGVLab/InternVL3-1B](https://huggingface.co/OpenGVLab/InternVL3-1B)|
+|[OpenGVLab/InternVL3-2B](https://modelscope.cn/models/OpenGVLab/InternVL3-2B)|internvl3|internvl2_5|transformers>=4.37.2, timm|&#x2718;|vision, video|[OpenGVLab/InternVL3-2B](https://huggingface.co/OpenGVLab/InternVL3-2B)|
+|[OpenGVLab/InternVL3-8B](https://modelscope.cn/models/OpenGVLab/InternVL3-8B)|internvl3|internvl2_5|transformers>=4.37.2, timm|&#x2718;|vision, video|[OpenGVLab/InternVL3-8B](https://huggingface.co/OpenGVLab/InternVL3-8B)|
+|[OpenGVLab/InternVL3-9B](https://modelscope.cn/models/OpenGVLab/InternVL3-9B)|internvl3|internvl2_5|transformers>=4.37.2, timm|&#x2718;|vision, video|[OpenGVLab/InternVL3-9B](https://huggingface.co/OpenGVLab/InternVL3-9B)|
+|[OpenGVLab/InternVL3-14B](https://modelscope.cn/models/OpenGVLab/InternVL3-14B)|internvl3|internvl2_5|transformers>=4.37.2, timm|&#x2718;|vision, video|[OpenGVLab/InternVL3-14B](https://huggingface.co/OpenGVLab/InternVL3-14B)|
+|[OpenGVLab/InternVL3-38B](https://modelscope.cn/models/OpenGVLab/InternVL3-38B)|internvl3|internvl2_5|transformers>=4.37.2, timm|&#x2718;|vision, video|[OpenGVLab/InternVL3-38B](https://huggingface.co/OpenGVLab/InternVL3-38B)|
+|[OpenGVLab/InternVL3-78B](https://modelscope.cn/models/OpenGVLab/InternVL3-78B)|internvl3|internvl2_5|transformers>=4.37.2, timm|&#x2718;|vision, video|[OpenGVLab/InternVL3-78B](https://huggingface.co/OpenGVLab/InternVL3-78B)|
 |[Shanghai_AI_Laboratory/internlm-xcomposer2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-7b)|xcomposer2|ixcomposer2|-|&#x2718;|vision|[internlm/internlm-xcomposer2-7b](https://huggingface.co/internlm/internlm-xcomposer2-7b)|
 |[Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b)|xcomposer2_4khd|ixcomposer2|-|&#x2718;|vision|[internlm/internlm-xcomposer2-4khd-7b](https://huggingface.co/internlm/internlm-xcomposer2-4khd-7b)|
 |[Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b)|xcomposer2_5|xcomposer2_5|decord|&#x2718;|vision|[internlm/internlm-xcomposer2d5-7b](https://huggingface.co/internlm/internlm-xcomposer2d5-7b)|

docs/source_en/Instruction/Supported-models-and-datasets.md

Lines changed: 7 additions & 0 deletions
@@ -620,6 +620,13 @@ The table below introduces the models integrated with ms-swift:
 |[OpenGVLab/InternVL2_5-26B-MPO](https://modelscope.cn/models/OpenGVLab/InternVL2_5-26B-MPO)|internvl2_5|internvl2_5|transformers>=4.36, timm|&#x2718;|vision, video|[OpenGVLab/InternVL2_5-26B-MPO](https://huggingface.co/OpenGVLab/InternVL2_5-26B-MPO)|
 |[OpenGVLab/InternVL2_5-38B-MPO](https://modelscope.cn/models/OpenGVLab/InternVL2_5-38B-MPO)|internvl2_5|internvl2_5|transformers>=4.36, timm|&#x2718;|vision, video|[OpenGVLab/InternVL2_5-38B-MPO](https://huggingface.co/OpenGVLab/InternVL2_5-38B-MPO)|
 |[OpenGVLab/InternVL2_5-78B-MPO](https://modelscope.cn/models/OpenGVLab/InternVL2_5-78B-MPO)|internvl2_5|internvl2_5|transformers>=4.36, timm|&#x2718;|vision, video|[OpenGVLab/InternVL2_5-78B-MPO](https://huggingface.co/OpenGVLab/InternVL2_5-78B-MPO)|
+|[OpenGVLab/InternVL3-1B](https://modelscope.cn/models/OpenGVLab/InternVL3-1B)|internvl3|internvl2_5|transformers>=4.37.2, timm|&#x2718;|vision, video|[OpenGVLab/InternVL3-1B](https://huggingface.co/OpenGVLab/InternVL3-1B)|
+|[OpenGVLab/InternVL3-2B](https://modelscope.cn/models/OpenGVLab/InternVL3-2B)|internvl3|internvl2_5|transformers>=4.37.2, timm|&#x2718;|vision, video|[OpenGVLab/InternVL3-2B](https://huggingface.co/OpenGVLab/InternVL3-2B)|
+|[OpenGVLab/InternVL3-8B](https://modelscope.cn/models/OpenGVLab/InternVL3-8B)|internvl3|internvl2_5|transformers>=4.37.2, timm|&#x2718;|vision, video|[OpenGVLab/InternVL3-8B](https://huggingface.co/OpenGVLab/InternVL3-8B)|
+|[OpenGVLab/InternVL3-9B](https://modelscope.cn/models/OpenGVLab/InternVL3-9B)|internvl3|internvl2_5|transformers>=4.37.2, timm|&#x2718;|vision, video|[OpenGVLab/InternVL3-9B](https://huggingface.co/OpenGVLab/InternVL3-9B)|
+|[OpenGVLab/InternVL3-14B](https://modelscope.cn/models/OpenGVLab/InternVL3-14B)|internvl3|internvl2_5|transformers>=4.37.2, timm|&#x2718;|vision, video|[OpenGVLab/InternVL3-14B](https://huggingface.co/OpenGVLab/InternVL3-14B)|
+|[OpenGVLab/InternVL3-38B](https://modelscope.cn/models/OpenGVLab/InternVL3-38B)|internvl3|internvl2_5|transformers>=4.37.2, timm|&#x2718;|vision, video|[OpenGVLab/InternVL3-38B](https://huggingface.co/OpenGVLab/InternVL3-38B)|
+|[OpenGVLab/InternVL3-78B](https://modelscope.cn/models/OpenGVLab/InternVL3-78B)|internvl3|internvl2_5|transformers>=4.37.2, timm|&#x2718;|vision, video|[OpenGVLab/InternVL3-78B](https://huggingface.co/OpenGVLab/InternVL3-78B)|
 |[Shanghai_AI_Laboratory/internlm-xcomposer2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-7b)|xcomposer2|ixcomposer2|-|&#x2718;|vision|[internlm/internlm-xcomposer2-7b](https://huggingface.co/internlm/internlm-xcomposer2-7b)|
 |[Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-4khd-7b)|xcomposer2_4khd|ixcomposer2|-|&#x2718;|vision|[internlm/internlm-xcomposer2-4khd-7b](https://huggingface.co/internlm/internlm-xcomposer2-4khd-7b)|
 |[Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2d5-7b)|xcomposer2_5|xcomposer2_5|decord|&#x2718;|vision|[internlm/internlm-xcomposer2d5-7b](https://huggingface.co/internlm/internlm-xcomposer2d5-7b)|

swift/llm/model/constant.py

Lines changed: 1 addition & 0 deletions
@@ -151,6 +151,7 @@ class MLLMModelType:
     internvl2 = 'internvl2'
     internvl2_phi3 = 'internvl2_phi3'
     internvl2_5 = 'internvl2_5'
+    internvl3 = 'internvl3'
     xcomposer2 = 'xcomposer2'
     xcomposer2_4khd = 'xcomposer2_4khd'
     xcomposer2_5 = 'xcomposer2_5'
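The new attribute is an ordinary string constant, so the model type can be referenced by attribute or by its literal value. A one-line sketch, assuming the module path mirrors the file path swift/llm/model/constant.py:

from swift.llm.model.constant import MLLMModelType

# The registration below keys the InternVL3 model group on this string.
assert MLLMModelType.internvl3 == 'internvl3'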

swift/llm/model/model/internlm.py

Lines changed: 22 additions & 0 deletions
@@ -276,6 +276,28 @@ def get_model_tokenizer_internvl(model_dir: str,
         tags=['vision', 'video'],
     ))
 
+register_model(
+    ModelMeta(
+        MLLMModelType.internvl3,
+        [
+            ModelGroup([
+                Model('OpenGVLab/InternVL3-1B', 'OpenGVLab/InternVL3-1B'),
+                Model('OpenGVLab/InternVL3-2B', 'OpenGVLab/InternVL3-2B'),
+                Model('OpenGVLab/InternVL3-8B', 'OpenGVLab/InternVL3-8B'),
+                Model('OpenGVLab/InternVL3-9B', 'OpenGVLab/InternVL3-9B'),
+                Model('OpenGVLab/InternVL3-14B', 'OpenGVLab/InternVL3-14B'),
+                Model('OpenGVLab/InternVL3-38B', 'OpenGVLab/InternVL3-38B'),
+                Model('OpenGVLab/InternVL3-78B', 'OpenGVLab/InternVL3-78B'),
+            ]),
+        ],
+        TemplateType.internvl2_5,
+        get_model_tokenizer_internvl,
+        architectures=['InternVLChatModel'],
+        model_arch=ModelArch.internvl,
+        requires=['transformers>=4.37.2', 'timm'],
+        tags=['vision', 'video'],
+    ))
+
 register_model(
     ModelMeta(
         MLLMModelType.xcomposer2_5,
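With this registration in place, the seven checkpoints resolve to the existing internvl2_5 template and get_model_tokenizer_internvl loader. A minimal inference sketch using the same PtEngine entry point the new tests exercise; the InferRequest/RequestConfig usage follows ms-swift's documented API, and the image URL is a placeholder, not taken from this commit:

from swift.llm import InferRequest, PtEngine, RequestConfig

# Weights are fetched from ModelScope (or the Hugging Face mirror) on first use.
engine = PtEngine('OpenGVLab/InternVL3-8B')

# '<image>' marks where the image is spliced into the prompt.
request = InferRequest(
    messages=[{'role': 'user', 'content': '<image>Describe the image.'}],
    images=['https://example.com/cat.png'],  # placeholder URL; substitute a real image
)
response = engine.infer([request], RequestConfig(max_tokens=128))
print(response[0].choices[0].message.content)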

tests/test_align/test_template/test_vision.py

Lines changed: 19 additions & 1 deletion
@@ -81,6 +81,22 @@ def test_internvl2_phi3():
     _infer_model(pt_engine, system='')
 
 
+def test_internvl3_8b():
+    pt_engine = PtEngine('OpenGVLab/InternVL3-8B')
+    response = _infer_model(pt_engine)
+    pt_engine.default_template.template_backend = 'jinja'
+    response2 = _infer_model(pt_engine, system='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。')
+    assert response == response2
+
+
+def test_internvl3_9b():
+    pt_engine = PtEngine('OpenGVLab/InternVL3-9B')
+    response = _infer_model(pt_engine)
+    pt_engine.default_template.template_backend = 'jinja'
+    response2 = _infer_model(pt_engine, system='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。')
+    assert response == response2
+
+
 def test_llava():
     pt_engine = PtEngine('AI-ModelScope/llava-v1.6-mistral-7b')
     _infer_model(pt_engine)
@@ -570,4 +586,6 @@ def test_llama4():
     # test_ui_tars()
     # test_gemma3_vision()
     # test_mistral_2503()
-    test_llama4()
+    # test_llama4()
+    test_internvl3_8b()
+    test_internvl3_9b()
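The tail of the file doubles as an ad-hoc driver: test_llama4() is commented out and the two new tests run when the module is executed directly. A hedged alternative that selects only the InternVL3 tests through pytest, assuming pytest is available and the checkpoints can be downloaded in the test environment:

import pytest

# -k filters by substring, so only the two new test functions are collected.
pytest.main(['-k', 'internvl3', 'tests/test_align/test_template/test_vision.py'])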
