@@ -9219,33 +9219,55 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
92199219
92209220 return [] # skip other tensors
92219221
9222+
@ModelBase.register("Glm4vMoeForConditionalGeneration")
class GLM4V_Text_MoE(Glm4MoeModel):
    """Text model from [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V)

    ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)"""
    model_arch = gguf.MODEL_ARCH.GLM4_MOE

    def set_gguf_parameters(self):
        # parameters specific to GLM-4.5V like rope_theta=10000 and context_length=65536
        # should be correctly picked up from the text_config by the base classes
        super().set_gguf_parameters()

    def modify_tensors(
        self, data_torch: Tensor, name: str, bid: int | None
    ) -> Iterable[tuple[str, Tensor]]:
        # skip vision tensors for the text model; they are handled by the mmproj class
        if name.startswith("model.visual."):
            return []

        # the Glm4MoeModel class expects tensor names to start with 'model.',
        # so we strip the 'language_model.' part
        if name.startswith("model.language_model."):
            name = name.replace("model.language_model.", "model.", 1)

        # let the parent class handle the MoE logic and tensor mapping
        yield from super().modify_tensors(data_torch, name, bid)
9250+
@ModelBase.register("Glm4vMoeForConditionalGeneration")
class GLM4V_MoE(MmprojModel):
    """Multimodal projector from [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V).

    ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)"""
    #
    # TODO: this is not complete yet! need to handle custom RoPE nonsense.
    #
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V)
        self.gguf_writer.add_vision_use_gelu(True)
        # layer_norm_eps may be absent from the vision config; only write it when found
        if (ln_eps := self.find_vparam(["layer_norm_eps"], optional=True)) is not None:
            self.gguf_writer.add_vision_attention_layernorm_eps(ln_eps)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # keep only vision tensors ("model.visual.*"); all text-model tensors
        # are converted by the companion text-model class
        if name.startswith("model.visual."):
            yield self.map_tensor_name(name), data_torch
        else:
            return
92499271
92509272
92519273###### CONVERSION LOGIC ######
0 commit comments