Skip to content

Commit 01d085d

Browse files
committed
WIP conversion logic
1 parent 0a72591 commit 01d085d

File tree

2 files changed

+47
-23
lines changed

2 files changed

+47
-23
lines changed

convert_hf_to_gguf.py

Lines changed: 45 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -9219,33 +9219,55 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
92199219

92209220
return [] # skip other tensors
92219221

9222+
9223+
@ModelBase.register("Glm4vMoeForConditionalGeneration")
9224+
class GLM4V_Text_MoE(Glm4MoeModel):
9225+
"""Text model from [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V)
9226+
9227+
ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)"""
9228+
model_arch = gguf.MODEL_ARCH.GLM4_MOE
9229+
9230+
def set_gguf_parameters(self):
9231+
# parameters specific to GLM-4.5V like rope_theta=10000 and context_length=65536
9232+
# should be correctly picked up from the text_config by the base classes
9233+
super().set_gguf_parameters()
9234+
9235+
def modify_tensors(
9236+
self, data_torch: Tensor, name: str, bid: int | None
9237+
) -> Iterable[tuple[str, Tensor]]:
9238+
# skip vision tensors for the text model
9239+
if name.startswith("model.visual."):
9240+
return []
9241+
9242+
# the Glm4MoeModel class expects tensor names to start with 'model.',
9243+
# so we strip the 'language_model.' part
9244+
if name.startswith("model.language_model."):
9245+
name = name.replace("model.language_model.", "model.", 1)
9246+
9247+
# let the parent class handle the MoE logic and tensor mapping
9248+
yield from super().modify_tensors(data_torch, name, bid)
9249+
9250+
92229251
@ModelBase.register("Glm4vMoeForConditionalGeneration")
92239252
class GLM4V_MoE(MmprojModel):
9253+
"""Multimodal projector from [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V).
9254+
9255+
ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)"""
92249256
#
9225-
# the HF model's type is `glm4v_moe`. internally, it consists of two models:
9226-
# - `glm4v_moe_text`
9227-
# + main text model
9228-
# + tensor names start with "model.language_model."
9229-
# + "2D-RoPE" (aKa Roformer) w/ embeddings dynamically adapted via bicubic interpolation
9230-
# - `glm4v_moe`
9231-
# + vision adapter (ViT)
9232-
# + tensor names start with "model.visual."
9233-
# + "3D-RoPE" (without the interpolation mentioned above)
9234-
#
9235-
# other notable quirks include:
9236-
# - has MTP layer (need to keep these tensors - same as GLM-4.5-Air)
9237-
# - RoPE theta value (θ): use 10k rather than 100k for GLM-4.5-Air
9238-
# - the model's vision supports video input, but this is not implemented here
9239-
#
9240-
# for more info, refer to:
9241-
# - reference impl : https://github.com/huggingface/transformers/tree/main/src/transformers/models/glm4v_moe
9242-
# - HF model card : https://huggingface.co/zai-org/GLM-4.5V
9243-
# - arXiv paper (model) : https://arxiv.org/abs/2507.01006
9244-
# - arXiv paper (orig. ViT) : https://arxiv.org/abs/2411.14402
9245-
#
9246-
# TODO: the model's tokenizer has video-related special tokens - deal with these (??)
9257+
# TODO: this is not complete yet! still need to handle the model's custom RoPE.
92479258
#
9248-
pass
9259+
def set_gguf_parameters(self):
9260+
super().set_gguf_parameters()
9261+
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V)
9262+
self.gguf_writer.add_vision_use_gelu(True)
9263+
if (ln_eps := self.find_vparam(["layer_norm_eps"], optional=True)) is not None:
9264+
self.gguf_writer.add_vision_attention_layernorm_eps(ln_eps)
9265+
9266+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
9267+
if name.startswith("model.visual."):
9268+
yield self.map_tensor_name(name), data_torch
9269+
else:
9270+
return
92499271

92509272

92519273
###### CONVERSION LOGIC ######

gguf-py/gguf/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,7 @@ class VISION_PROJECTOR_TYPE(IntEnum):
428428
GLM_EDGE = auto()
429429
MERGER = auto()
430430
GEMMA3 = auto()
431+
GLM4V = auto()
431432

432433

433434
class MODEL_TENSOR(IntEnum):
@@ -3055,6 +3056,7 @@ class VisionProjectorType:
30553056
VOXTRAL = "voxtral"
30563057
LFM2 = "lfm2"
30573058
KIMIVL = "kimivl"
3059+
GLM4V = "glm4v_moe"
30583060

30593061

30603062
# Items here are (block size, type size)

0 commit comments

Comments
 (0)