Commit 36955c3
initial commit for branch glm45v
1 parent bc07349 commit 36955c3

File tree

2 files changed, +30 -1 lines changed

convert_hf_to_gguf.py

Lines changed: 29 additions & 0 deletions

@@ -9219,6 +9219,35 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
         return []  # skip other tensors
 
+
+@ModelBase.register("Glm4vMoeForConditionalGeneration")
+class GLM4V_MoE(MmprojModel):
+    #
+    # the HF model's type is `glm4v_moe`. internally, it consists of two models:
+    # - `glm4v_moe_text`
+    #   + main text model
+    #   + tensor names start with "model.language_model."
+    #   + "2D-RoPE" (a.k.a. RoFormer) w/ embeddings dynamically adapted via bicubic interpolation
+    # - `glm4v_moe`
+    #   + vision adapter (ViT)
+    #   + tensor names start with "model.visual."
+    #   + "3D-RoPE" (without the interpolation mentioned above)
+    #
+    # other notable quirks include:
+    # - has an MTP layer (need to keep these tensors - same as GLM-4.5-Air)
+    # - RoPE theta value (θ): uses 10k, rather than 100k as in GLM-4.5-Air
+    # - the model's vision supports video input, but this is not implemented here
+    #
+    # for more info, refer to:
+    # - reference impl          : https://github.com/huggingface/transformers/tree/main/src/transformers/models/glm4v_moe
+    # - HF model card           : https://huggingface.co/zai-org/GLM-4.5V
+    # - arXiv paper (model)     : https://arxiv.org/abs/2507.01006
+    # - arXiv paper (orig. ViT) : https://arxiv.org/abs/2411.14402
+    #
+    # TODO: the model's tokenizer has video-related special tokens - deal with these (??)
+    #
+    pass
+
+
 ###### CONVERSION LOGIC ######
src/llama-arch.h

Lines changed: 1 addition & 1 deletion

@@ -69,6 +69,7 @@ enum llm_arch {
     LLM_ARCH_CHATGLM,
     LLM_ARCH_GLM4,
     LLM_ARCH_GLM4_MOE,
+    LLM_ARCH_GLM4V_MOE,
     LLM_ARCH_BITNET,
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
@@ -122,7 +123,6 @@ enum llm_kv {
     LLM_KV_GENERAL_LICENSE,
     LLM_KV_GENERAL_SOURCE_URL,
     LLM_KV_GENERAL_SOURCE_HF_REPO,
-
     LLM_KV_VOCAB_SIZE,
     LLM_KV_CONTEXT_LENGTH,
     LLM_KV_EMBEDDING_LENGTH,
