Commit d2b45ec

support glm-edge & glm-edge-v (#2526)

1 parent 896aaac commit d2b45ec

7 files changed: +90 -2 lines changed

README.md

Lines changed: 2 additions & 1 deletion
@@ -55,7 +55,8 @@ You can contact us and communicate with us by adding our group:
 <img src="asset/discord_qr.jpg" width="200" height="200"> | <img src="asset/wechat.png" width="200" height="200">
 
 ## 🎉 News
-- 2024.11.28: Supports the model qwq-32b-preview, macro-o1, and the dataset open-o1. Use `swift infer --model_type qwq-32b-preview` for the experience.
+- 2024.11.29: Support for glm-edge and glm-edge-v series models. Use `swift infer --model_type glm-edge-v-2b` for the experience.
+- 2024.11.28: Supports the model qwq-32b-preview, marco-o1, and the dataset open-o1. Use `swift infer --model_type qwq-32b-preview` for the experience.
 - 2024.11.12: Supports training and deployment of the qwen2.5-coder series models: 0.5b, 3b, 14b, and 32b. Use `swift infer --model_type qwen2_5-coder-3b-instruct` to experience it.
 - 2024.10.26: Support for training and deploying aya-expanse series models. Experience it using `swift infer --model_type aya-expanse-32b`.
 - 2024.10.23: Support for training and deploying emu3-chat. Experience it using `swift infer --model_type emu3-chat`.
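For readers who want to try the new entry from Python rather than the shell, below is a rough equivalent of the `swift infer` command in the news item; it is a sketch assuming the `infer_main`/`InferArguments` entry points documented for ms-swift 2.x, not part of this commit.

```python
# Rough Python equivalent of `swift infer --model_type glm-edge-v-2b`.
# Assumption: ms-swift 2.x exposes infer_main and InferArguments from swift.llm.
from swift.llm import InferArguments, infer_main

infer_main(InferArguments(model_type='glm-edge-v-2b'))
```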

README_CN.md

Lines changed: 2 additions & 1 deletion
@@ -56,7 +56,8 @@ SWIFT has rich and comprehensive documentation; please see our documentation site:
 
 
 ## 🎉 News
-- 2024.11.28: Supports the models qwq-32b-preview and macro-o1, and the dataset open-o1. Use `swift infer --model_type qwq-32b-preview` to try it out.
+- 2024.11.29: Supports the glm-edge and glm-edge-v series models. Use `swift infer --model_type glm-edge-v-2b` to try it out.
+- 2024.11.28: Supports the models qwq-32b-preview and marco-o1, and the dataset open-o1. Use `swift infer --model_type qwq-32b-preview` to try it out.
 - 2024.11.12: Supports training through deployment of the qwen2.5-coder series models (0.5b, 3b, 14b, 32b). Use `swift infer --model_type qwen2_5-coder-3b-instruct` to try it out.
 - 2024.10.26: Supports training through deployment of the aya-expanse series models. Use `swift infer --model_type aya-expanse-32b` to try it out.
 - 2024.10.23: Supports training through deployment of emu3-chat. Use `swift infer --model_type emu3-chat` to try it out.

docs/source/Instruction/支持的模型和数据集.md

Lines changed: 4 additions & 0 deletions
@@ -193,6 +193,8 @@
 |glm4-9b-chat|[ZhipuAI/glm-4-9b-chat](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat/summary)|query_key_value|chatglm4|&#x2714;|&#x2714;|&#x2714;|&#x2718;|transformers>=4.42|-|[THUDM/glm-4-9b-chat](https://huggingface.co/THUDM/glm-4-9b-chat)|
 |glm4-9b-chat-1m|[ZhipuAI/glm-4-9b-chat-1m](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat-1m/summary)|query_key_value|chatglm4|&#x2714;|&#x2714;|&#x2714;|&#x2718;|transformers>=4.42|-|[THUDM/glm-4-9b-chat-1m](https://huggingface.co/THUDM/glm-4-9b-chat-1m)|
 |codegeex4-9b-chat|[ZhipuAI/codegeex4-all-9b](https://modelscope.cn/models/ZhipuAI/codegeex4-all-9b/summary)|query_key_value|codegeex4|&#x2714;|&#x2714;|&#x2714;|&#x2718;|transformers<4.42|coding|[THUDM/codegeex4-all-9b](https://huggingface.co/THUDM/codegeex4-all-9b)|
+|glm-edge-1_5b-chat|[ZhipuAI/glm-edge-1.5b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-1.5b-chat/summary)|q_proj, k_proj, v_proj|chatglm4|&#x2714;|&#x2718;|&#x2718;|&#x2718;|transformers>=4.46|-|[THUDM/glm-edge-1.5b-chat](https://huggingface.co/THUDM/glm-edge-1.5b-chat)|
+|glm-edge-4b-chat|[ZhipuAI/glm-edge-4b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-4b-chat/summary)|q_proj, k_proj, v_proj|chatglm4|&#x2714;|&#x2718;|&#x2718;|&#x2718;|transformers>=4.46|-|[THUDM/glm-edge-4b-chat](https://huggingface.co/THUDM/glm-edge-4b-chat)|
 |llama2-7b|[modelscope/Llama-2-7b-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|&#x2714;|&#x2718;||-|[meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf)|
 |llama2-7b-chat|[modelscope/Llama-2-7b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;|&#x2714;|&#x2718;||-|[meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)|
 |llama2-13b|[modelscope/Llama-2-13b-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|&#x2714;|&#x2718;||-|[meta-llama/Llama-2-13b-hf](https://huggingface.co/meta-llama/Llama-2-13b-hf)|
@@ -475,6 +477,8 @@
 |qwen2-vl-72b-instruct-gptq-int8|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|&#x2714;|&#x2714;|&#x2718;|&#x2718;|transformers>=4.45.dev.0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)|
 |qwen2-vl-72b-instruct-awq|[qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|&#x2714;|&#x2714;|&#x2718;|&#x2718;|transformers>=4.45.dev.0, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)|
 |glm4v-9b-chat|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b/summary)|^(transformer.encoder)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|glm4v|&#x2718;|&#x2718;|&#x2718;|&#x2718;|transformers>=4.42|vision|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)|
+|glm-edge-v-2b|[ZhipuAI/glm-edge-v-2b](https://modelscope.cn/models/ZhipuAI/glm-edge-v-2b/summary)|^(model.layers)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|glm-edge-v|&#x2714;|&#x2718;|&#x2718;|&#x2718;|transformers>=4.46|vision|[THUDM/glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b)|
+|glm-edge-v-5b|[ZhipuAI/glm-edge-v-5b](https://modelscope.cn/models/ZhipuAI/glm-edge-v-5b/summary)|^(model.layers)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|glm-edge-v|&#x2714;|&#x2718;|&#x2718;|&#x2718;|transformers>=4.46|vision|[THUDM/glm-edge-v-5b](https://huggingface.co/THUDM/glm-edge-v-5b)|
 |llama3_2-11b-vision|[LLM-Research/Llama-3.2-11B-Vision](https://modelscope.cn/models/LLM-Research/Llama-3.2-11B-Vision/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision-generation|&#x2714;|&#x2714;|&#x2718;|&#x2718;|transformers>=4.45|vision|[meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)|
 |llama3_2-11b-vision-instruct|[LLM-Research/Llama-3.2-11B-Vision-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-11B-Vision-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision|&#x2714;|&#x2714;|&#x2718;|&#x2718;|transformers>=4.45|vision|[meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)|
 |llama3_2-90b-vision|[LLM-Research/Llama-3.2-90B-Vision](https://modelscope.cn/models/LLM-Research/Llama-3.2-90B-Vision/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision-generation|&#x2714;|&#x2714;|&#x2718;|&#x2718;|transformers>=4.45|vision|[meta-llama/Llama-3.2-90B-Vision](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision)|

docs/source_en/Instruction/Supported-models-datasets.md

Lines changed: 4 additions & 0 deletions
@@ -193,6 +193,8 @@ The table below introcudes all models supported by SWIFT:
 |glm4-9b-chat|[ZhipuAI/glm-4-9b-chat](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat/summary)|query_key_value|chatglm4|&#x2714;|&#x2714;|&#x2714;|&#x2718;|transformers>=4.42|-|[THUDM/glm-4-9b-chat](https://huggingface.co/THUDM/glm-4-9b-chat)|
 |glm4-9b-chat-1m|[ZhipuAI/glm-4-9b-chat-1m](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat-1m/summary)|query_key_value|chatglm4|&#x2714;|&#x2714;|&#x2714;|&#x2718;|transformers>=4.42|-|[THUDM/glm-4-9b-chat-1m](https://huggingface.co/THUDM/glm-4-9b-chat-1m)|
 |codegeex4-9b-chat|[ZhipuAI/codegeex4-all-9b](https://modelscope.cn/models/ZhipuAI/codegeex4-all-9b/summary)|query_key_value|codegeex4|&#x2714;|&#x2714;|&#x2714;|&#x2718;|transformers<4.42|coding|[THUDM/codegeex4-all-9b](https://huggingface.co/THUDM/codegeex4-all-9b)|
+|glm-edge-1_5b-chat|[ZhipuAI/glm-edge-1.5b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-1.5b-chat/summary)|q_proj, k_proj, v_proj|chatglm4|&#x2714;|&#x2718;|&#x2718;|&#x2718;|transformers>=4.46|-|[THUDM/glm-edge-1.5b-chat](https://huggingface.co/THUDM/glm-edge-1.5b-chat)|
+|glm-edge-4b-chat|[ZhipuAI/glm-edge-4b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-4b-chat/summary)|q_proj, k_proj, v_proj|chatglm4|&#x2714;|&#x2718;|&#x2718;|&#x2718;|transformers>=4.46|-|[THUDM/glm-edge-4b-chat](https://huggingface.co/THUDM/glm-edge-4b-chat)|
 |llama2-7b|[modelscope/Llama-2-7b-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|&#x2714;|&#x2718;||-|[meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf)|
 |llama2-7b-chat|[modelscope/Llama-2-7b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|&#x2714;|&#x2714;|&#x2718;||-|[meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)|
 |llama2-13b|[modelscope/Llama-2-13b-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;|&#x2714;|&#x2718;||-|[meta-llama/Llama-2-13b-hf](https://huggingface.co/meta-llama/Llama-2-13b-hf)|
@@ -475,6 +477,8 @@ The table below introcudes all models supported by SWIFT:
 |qwen2-vl-72b-instruct-gptq-int8|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|&#x2714;|&#x2714;|&#x2718;|&#x2718;|transformers>=4.45.dev.0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)|
 |qwen2-vl-72b-instruct-awq|[qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|&#x2714;|&#x2714;|&#x2718;|&#x2718;|transformers>=4.45.dev.0, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)|
 |glm4v-9b-chat|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b/summary)|^(transformer.encoder)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|glm4v|&#x2718;|&#x2718;|&#x2718;|&#x2718;|transformers>=4.42|vision|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)|
+|glm-edge-v-2b|[ZhipuAI/glm-edge-v-2b](https://modelscope.cn/models/ZhipuAI/glm-edge-v-2b/summary)|^(model.layers)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|glm-edge-v|&#x2714;|&#x2718;|&#x2718;|&#x2718;|transformers>=4.46|vision|[THUDM/glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b)|
+|glm-edge-v-5b|[ZhipuAI/glm-edge-v-5b](https://modelscope.cn/models/ZhipuAI/glm-edge-v-5b/summary)|^(model.layers)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|glm-edge-v|&#x2714;|&#x2718;|&#x2718;|&#x2718;|transformers>=4.46|vision|[THUDM/glm-edge-v-5b](https://huggingface.co/THUDM/glm-edge-v-5b)|
 |llama3_2-11b-vision|[LLM-Research/Llama-3.2-11B-Vision](https://modelscope.cn/models/LLM-Research/Llama-3.2-11B-Vision/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision-generation|&#x2714;|&#x2714;|&#x2718;|&#x2718;|transformers>=4.45|vision|[meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)|
 |llama3_2-11b-vision-instruct|[LLM-Research/Llama-3.2-11B-Vision-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-11B-Vision-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision|&#x2714;|&#x2714;|&#x2718;|&#x2718;|transformers>=4.45|vision|[meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)|
 |llama3_2-90b-vision|[LLM-Research/Llama-3.2-90B-Vision](https://modelscope.cn/models/LLM-Research/Llama-3.2-90B-Vision/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision-generation|&#x2714;|&#x2714;|&#x2718;|&#x2718;|transformers>=4.45|vision|[meta-llama/Llama-3.2-90B-Vision](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision)|

swift/llm/utils/model.py

Lines changed: 50 additions & 0 deletions
@@ -259,6 +259,11 @@ class ModelType:
     glm4_9b_chat = 'glm4-9b-chat'
     glm4_9b_chat_1m = 'glm4-9b-chat-1m'
     codegeex4_9b_chat = 'codegeex4-9b-chat'
+
+    glm_edge_1_5b_chat = 'glm-edge-1_5b-chat'
+    glm_edge_4b_chat = 'glm-edge-4b-chat'
+    glm_edge_v_2b = 'glm-edge-v-2b'
+    glm_edge_v_5b = 'glm-edge-v-5b'
     # llama2
     llama2_7b = 'llama2-7b'
     llama2_7b_chat = 'llama2-7b-chat'
@@ -711,6 +716,7 @@ class LoRATM(NamedTuple):
     molmo = 'molmo'
     deepseek_janus = 'deepseek_janus'
     emu3_chat = 'emu3_chat'
+    glm_edge_v = 'glm_edge_v'
     # default lora target modules for nlp llms.
     minicpm3 = ['q_a_proj', 'q_b_proj', 'kv_a_proj_with_mqa', 'kv_b_proj']
     baichuan = ['W_pack']
@@ -5158,6 +5164,22 @@ def get_model_tokenizer_deepseek_vl(model_dir: str,
     return model, tokenizer
 
 
+@register_model(
+    ModelType.glm_edge_1_5b_chat,
+    'ZhipuAI/glm-edge-1.5b-chat',
+    LoRATM.llama,
+    TemplateType.chatglm4,
+    support_flash_attn=True,
+    requires=['transformers>=4.46'],
+    hf_model_id='THUDM/glm-edge-1.5b-chat')
+@register_model(
+    ModelType.glm_edge_4b_chat,
+    'ZhipuAI/glm-edge-4b-chat',
+    LoRATM.llama,
+    TemplateType.chatglm4,
+    support_flash_attn=True,
+    requires=['transformers>=4.46'],
+    hf_model_id='THUDM/glm-edge-4b-chat')
 @register_model(
     ModelType.llama3_1_nemotron_70B_instruct_hf,
     'AI-ModelScope/Llama-3.1-Nemotron-70B-Instruct-HF',
@@ -6652,6 +6674,34 @@ def get_model_tokenizer_llava_hf(model_dir: str, *args, **kwargs):
     return model, tokenizer
 
 
+@register_model(
+    ModelType.glm_edge_v_2b,
+    'ZhipuAI/glm-edge-v-2b',
+    LoRATM.glm_edge_v,
+    TemplateType.glm_edge_v,
+    support_flash_attn=True,
+    placeholder_tokens=['<|begin_of_image|>'],
+    requires=['transformers>=4.46'],
+    tags=['multi-modal', 'vision'],
+    hf_model_id='THUDM/glm-edge-v-2b')
+@register_model(
+    ModelType.glm_edge_v_5b,
+    'ZhipuAI/glm-edge-v-5b',
+    LoRATM.glm_edge_v,
+    TemplateType.glm_edge_v,
+    support_flash_attn=True,
+    requires=['transformers>=4.46'],
+    placeholder_tokens=['<|begin_of_image|>'],
+    tags=['multi-modal', 'vision'],
+    hf_model_id='THUDM/glm-edge-v-5b')
+def get_model_tokenizer_glm_edge_v(model_dir: str, *args, **kwargs):
+    from transformers import AutoImageProcessor
+    processor = AutoImageProcessor.from_pretrained(model_dir)
+    model, tokenizer = get_model_tokenizer_with_flash_attn(model_dir, *args, **kwargs)
+    tokenizer.processor = processor
+    return model, tokenizer
+
+
 @register_model(
     ModelType.llama3_2_11b_vision,
     'LLM-Research/Llama-3.2-11B-Vision',
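Once registered, these model types resolve through SWIFT's regular loading path. A minimal sketch of loading one of the new chat models from Python, assuming the `swift.llm` helpers documented for ms-swift 2.x (`get_model_tokenizer`, `get_default_template_type`, `get_template`, `inference`); this is illustration, not part of the commit:

```python
# Minimal sketch, assuming the swift.llm helpers documented for ms-swift 2.x.
from swift.llm import (ModelType, get_default_template_type,
                       get_model_tokenizer, get_template, inference)

model_type = ModelType.glm_edge_1_5b_chat              # 'glm-edge-1_5b-chat', registered above
template_type = get_default_template_type(model_type)  # should resolve to chatglm4 per the registration

# get_model_tokenizer dispatches to the registered loader; for the glm-edge-v
# models that loader is get_model_tokenizer_glm_edge_v, which also attaches
# the image processor to the tokenizer.
model, tokenizer = get_model_tokenizer(model_type, model_kwargs={'device_map': 'auto'})
template = get_template(template_type, tokenizer)

response, history = inference(model, template, 'Who are you?')
print(response)
```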

swift/llm/utils/template.py

Lines changed: 25 additions & 0 deletions
@@ -62,6 +62,7 @@ class TemplateType:
     chatglm2 = 'chatglm2'
     chatglm3 = 'chatglm3'
     chatglm4 = 'chatglm4'
+    glm_edge_v = 'glm-edge-v'
     codegeex4 = 'codegeex4'
     llama = 'llama'  # llama2
     llama3 = 'llama3'
@@ -1920,6 +1921,30 @@ def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] =
 
 register_template(TemplateType.glm4v, GLM4VTemplate(), infer_media_type='dialogue', lazy_tokenize=True, use_model=True)
 
+
+class GLMEdgeVTemplate(GLMTemplate):
+
+    def __init__(self):
+        super().__init__([], ['<|user|>\n{{QUERY}}\n<|assistant|>\n'], ['\n'], ['<|endoftext|>'], None,
+                         ['<|system|>\n{{SYSTEM}}\n'])
+
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]:
+        assert media_type == 'image'
+        return ['<|begin_of_image|>' * 578]
+
+    def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        inputs, _ = super()._encode(example)
+        if len(inputs) == 0:
+            return inputs, {}
+        processor = self.tokenizer.processor
+        images = example['images']
+        if images:
+            inputs['pixel_values'] = torch.tensor(processor(images).pixel_values)
+        return inputs, {}
+
+
+register_template(TemplateType.glm_edge_v, GLMEdgeVTemplate(), lazy_tokenize=True, use_model=True)
+
 register_template(
     TemplateType.yi_vl,
     YiVLTemplate([], [[8308], 'Human: {{QUERY}}\n', [8308], 'Assistant:'], ['\n'], ['\n', [8308]], yi_vl_default_system,
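To make the constructor arguments concrete: the template has an empty prefix, a `<|user|>`/`<|assistant|>` prompt, `\n` as the turn separator, `<|endoftext|>` as the suffix, and an optional `<|system|>` prefix, while `replace_tag` expands each image into 578 `<|begin_of_image|>` placeholders. A standalone sketch (not SWIFT code) of the resulting prompt layout, under the assumption that the image tag sits at the start of the user query:

```python
# Standalone sketch of the prompt layout GLMEdgeVTemplate encodes; the token
# strings and the 578-placeholder image expansion come from the diff above.
from typing import Optional

NUM_IMAGE_PLACEHOLDERS = 578

def render_glm_edge_v_prompt(query: str, system: Optional[str] = None, num_images: int = 0) -> str:
    parts = []
    if system is not None:
        parts.append(f'<|system|>\n{system}\n')
    # Assumption for illustration: images precede the query text; in SWIFT,
    # replace_tag expands each <image> tag wherever it appears in the query.
    image_tokens = '<|begin_of_image|>' * (NUM_IMAGE_PLACEHOLDERS * num_images)
    parts.append(f'<|user|>\n{image_tokens}{query}\n<|assistant|>\n')
    return ''.join(parts)

print(render_glm_edge_v_prompt('Describe this image.', num_images=1)[:60])
```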

swift/utils/module_mapping.py

Lines changed: 3 additions & 0 deletions
@@ -319,13 +319,16 @@ def __post_init__(self):
 
 EMU3_CHAT_KEYS = MultiModelKeys(language_model='model', )
 
+GLM_EDGE_V = MultiModelKeys(language_model='model.layers', vision_tower='model.vision')
+
 MODEL_KEYS_MAPPING = OrderedDict([
     # MLLM here
     ('qwen_audio', QWEN_AUDIO_KEYS),
     ('qwen_vl', QWEN_VL_KEYS),
     ('qwen2_audio', QWEN2_AUDIO_KEYS),
     ('qwen2_vl', QWEN2_VL_KEYS),
     ('glm4v', GLM4V_KEYS),
+    ('glm_edge_v', GLM_EDGE_V),
     ('llava_next_video', LLAVA_NEXT_VIDEO_KEYS),
     ('llava_llama', LLAVA_LLAMA_KEYS),
     ('llava', LLAVA_KEYS),
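The new mapping records which submodule prefixes hold the language model (`model.layers`) and the vision tower (`model.vision`). A minimal sketch, as an assumption about how such a split is typically exploited rather than SWIFT's actual training code, of what the prefixes enable:

```python
# Minimal sketch (assumption, not SWIFT's training loop): use the GLM_EDGE_V
# prefixes to freeze the vision tower while leaving language layers trainable.
import torch.nn as nn

def freeze_by_prefix(model: nn.Module, prefix: str = 'model.vision') -> None:
    # Parameter names follow module paths, so a MultiModelKeys prefix like
    # 'model.vision' selects exactly the vision-tower weights.
    for name, param in model.named_parameters():
        if name.startswith(prefix):
            param.requires_grad = False
```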
