From 8448b23afdef5e9940ef8eac32bb62702ea0defa Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 14 Oct 2025 02:13:16 -0500 Subject: [PATCH 01/10] initial commit for branch glm45v --- convert_hf_to_gguf.py | 29 +++++++++++++++++++++++++++++ src/llama-arch.h | 2 +- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 8c5132193e0e0..36278866da005 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -9219,6 +9219,35 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [] # skip other tensors +@ModelBase.register("Glm4vMoeForConditionalGeneration") +class GLM4V_MoE(MmprojModel): + # + # the HF model's type is `glm4v_moe`. internally, it consists of two models: + # - `glm4v_moe_text` + # + main text model + # + tensor names start with "model.language_model." + # + "2D-RoPE" (aKa Roformer) w/ embeddings dynamically adapted via bicubic interpolation + # - `glm4v_moe` + # + vision adapter (ViT) + # + tensor names start with "model.visual." + # + "3D-RoPE" (without the interpolation mentioned above) + # + # other notable quirks include: + # - has MTP layer (need to keep these tensors - same as GLM-4.5-Air) + # - RoPE theta value (θ): use 10k rather than 100k for GLM-4.5-Air + # - the model's vision supports video input, but this is not implemented here + # + # for more info, refer to: + # - reference impl : https://github.com/huggingface/transformers/tree/main/src/transformers/models/glm4v_moe + # - HF model card : https://huggingface.co/zai-org/GLM-4.5V + # - arXiv paper (model) : https://arxiv.org/abs/2507.01006 + # - arXiv paper (orig. ViT) : https://arxiv.org/abs/2411.14402 + # + # TODO: the model's tokenizer has video-related special tokens - deal with these (??) 
+ # + pass + + ###### CONVERSION LOGIC ###### diff --git a/src/llama-arch.h b/src/llama-arch.h index c41de89859d5c..831ec378ef332 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -70,6 +70,7 @@ enum llm_arch { LLM_ARCH_CHATGLM, LLM_ARCH_GLM4, LLM_ARCH_GLM4_MOE, + LLM_ARCH_GLM4V_MOE, LLM_ARCH_BITNET, LLM_ARCH_T5, LLM_ARCH_T5ENCODER, @@ -123,7 +124,6 @@ enum llm_kv { LLM_KV_GENERAL_LICENSE, LLM_KV_GENERAL_SOURCE_URL, LLM_KV_GENERAL_SOURCE_HF_REPO, - LLM_KV_VOCAB_SIZE, LLM_KV_CONTEXT_LENGTH, LLM_KV_EMBEDDING_LENGTH, From 70c86861a4a0a4620f5591062240df0bb802aded Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 14 Oct 2025 16:47:12 -0500 Subject: [PATCH 02/10] use F32 accumulators for GLM4V_MOE --- src/llama-graph.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index f29a1e98c9103..ffc2187a1b107 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -817,7 +817,7 @@ ggml_tensor * llm_graph_context::build_ffn( if (down) { cur = build_lora_mm(down, cur); - if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) { + if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_GLM4V_MOE) { // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators ggml_mul_mat_set_prec(cur, GGML_PREC_F32); } @@ -1583,7 +1583,7 @@ ggml_tensor * llm_graph_context::build_attn( if (wo) { cur = build_lora_mm(wo, cur); - if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) { + if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_GLM4V_MOE) { // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators ggml_mul_mat_set_prec(cur, GGML_PREC_F32); } From 631d4fa8693b7617d8c50e8824b54f9f3580ad5e Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 14 Oct 2025 16:48:21 -0500 Subject: [PATCH 03/10] add arch --- gguf-py/gguf/constants.py | 34 ++++++++++++++++++++++++++++++---- src/llama-arch.cpp | 28 ++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index f5e5fba8008bd..0afc58331b565 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -385,6 +385,7 @@ class MODEL_ARCH(IntEnum): CHATGLM = auto() GLM4 = auto() GLM4_MOE = auto() + GLM4V_MOE = auto() BITNET = auto() T5 = auto() T5ENCODER = auto() @@ -656,10 +657,10 @@ class MODEL_TENSOR(IntEnum): A_MM_NORM_PRE = auto() A_MM_NORM_MID = auto() # nextn/mtp - NEXTN_EH_PROJ = auto() - NEXTN_EMBED_TOKENS = auto() - NEXTN_ENORM = auto() - NEXTN_HNORM = auto() + NEXTN_EH_PROJ = auto() + NEXTN_EMBED_TOKENS = auto() + NEXTN_ENORM = auto() + NEXTN_HNORM = auto() NEXTN_SHARED_HEAD_HEAD = auto() NEXTN_SHARED_HEAD_NORM = auto() @@ -729,6 +730,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.CHATGLM: "chatglm", MODEL_ARCH.GLM4: "glm4", MODEL_ARCH.GLM4_MOE: "glm4moe", + MODEL_ARCH.GLM4V_MOE: "glm4v_moe", MODEL_ARCH.BITNET: "bitnet", MODEL_ARCH.T5: "t5", MODEL_ARCH.T5ENCODER: "t5encoder", @@ -2273,6 +2275,30 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD, MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM, ], + MODEL_ARCH.GLM4V_MOE: [ # same as GLM4_MOE without MTP tensors + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, 
+ MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + MODEL_TENSOR.FFN_EXP_PROBS_B, + ], MODEL_ARCH.BITNET: [ MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index b7e00b275b6f7..f2a8cbdf99a2e 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -66,6 +66,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_CHATGLM, "chatglm" }, { LLM_ARCH_GLM4, "glm4" }, { LLM_ARCH_GLM4_MOE, "glm4moe" }, + { LLM_ARCH_GLM4V_MOE, "glm4v_moe" }, { LLM_ARCH_BITNET, "bitnet" }, { LLM_ARCH_T5, "t5" }, { LLM_ARCH_T5ENCODER, "t5encoder" }, @@ -1507,6 +1508,33 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" }, }, }, + { + LLM_ARCH_GLMV4_MOE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, + { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, + { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, + { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" }, + }, + }, { LLM_ARCH_BITNET, { From 2aa698558b0bfa58f8c759b81b34868cb8947086 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 14 Oct 2025 17:10:55 -0500 Subject: [PATCH 04/10] llama-model : add placeholders --- src/llama-model.cpp | 20 ++++++++++++++++++++ src/llama-model.h | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 5002bd42ff04e..dc0ab0bf6a6f7 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1611,6 +1611,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_GLM4V_MOE: + { + // TODO + } break; case LLM_ARCH_BITNET: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -4892,6 +4896,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } break; + case LLM_ARCH_GLM4V_MOE: + { + // TODO + } + break; case LLM_ARCH_NEMOTRON: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -14683,6 +14692,12 @@ struct llm_build_glm4_moe : public llm_graph_context { } }; +struct llm_build_glm4v_moe : public llm_graph_context { + llm_build_glm4v_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + // TODO + } +}; + struct llm_build_nemotron : public llm_graph_context { llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -19750,6 +19765,10 @@ ggml_cgraph * 
llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_glm4_moe>(*this, params);
             } break;
+        case LLM_ARCH_GLM4V_MOE:
+            {
+                llm = std::make_unique<llm_build_glm4v_moe>(*this, params);
+            } break;
         case LLM_ARCH_BITNET:
             {
                 llm = std::make_unique<llm_build_bitnet>(*this, params);
@@ -20119,6 +20138,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
             return LLAMA_ROPE_TYPE_NEOX;
 
         case LLM_ARCH_QWEN2VL:
+        case LLM_ARCH_GLM4V_MOE:
             return LLAMA_ROPE_TYPE_MROPE;
 
         // all model arches should be listed explicitly here
diff --git a/src/llama-model.h b/src/llama-model.h
index 7f48662f2807a..2c9b05fbc790f 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -110,7 +110,7 @@ enum llm_type {
     LLM_TYPE_8B_A1B,    // lfm2moe
     LLM_TYPE_21B_A3B,   // Ernie MoE small
     LLM_TYPE_30B_A3B,
-    LLM_TYPE_106B_A12B, // GLM-4.5-Air
+    LLM_TYPE_106B_A12B, // GLM-4.5-Air (and GLM-4.5V text model)
     LLM_TYPE_235B_A22B,
     LLM_TYPE_300B_A47B, // Ernie MoE big
     LLM_TYPE_355B_A32B, // GLM-4.5

From d0e9dce27d92a4be7e901ed9cec92484dd1f78a0 Mon Sep 17 00:00:00 2001
From: ddh0
Date: Tue, 14 Oct 2025 17:13:29 -0500
Subject: [PATCH 05/10] fix arch name for tensor names

---
 src/llama-arch.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index f2a8cbdf99a2e..6964c75ac6268 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -1509,7 +1509,7 @@ static const std::map> LLM_TENSOR_N
         },
     },
     {
-        LLM_ARCH_GLMV4_MOE,
+        LLM_ARCH_GLM4V_MOE,
         {
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },

From 01d085dd4ac152ded1db173551fb322010e4d056 Mon Sep 17 00:00:00 2001
From: ddh0
Date: Wed, 15 Oct 2025 17:44:50 -0500
Subject: [PATCH 06/10] WIP conversion logic

---
 convert_hf_to_gguf.py     | 68 ++++++++++++++++++++++++++------------
 gguf-py/gguf/constants.py |  2 ++
 2 files changed, 47 insertions(+), 23 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 36278866da005..ee80dd9a568f5 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -9219,33 +9219,55 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [] # skip other tensors
+
+@ModelBase.register("Glm4vMoeForConditionalGeneration")
+class GLM4V_Text_MoE(Glm4MoeModel):
+    """Text model from [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V)
+
+    ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)"""
+    model_arch = gguf.MODEL_ARCH.GLM4_MOE
+
+    def set_gguf_parameters(self):
+        # parameters specific to GLM-4.5V like rope_theta=10000 and context_length=65536
+        # should be correctly picked up from the text_config by the base classes
+        super().set_gguf_parameters()
+
+    def modify_tensors(
+        self, data_torch: Tensor, name: str, bid: int | None
+    ) -> Iterable[tuple[str, Tensor]]:
+        # skip vision tensors for the text model
+        if name.startswith("model.visual."):
+            return []
+
+        # the Glm4MoeModel class expects tensor names to start with 'model.',
+        # so we strip the 'language_model.' part
+        if name.startswith("model.language_model."):
+            name = name.replace("model.language_model.", "model.", 1)
+
+        # let the parent class handle the MoE logic and tensor mapping
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("Glm4vMoeForConditionalGeneration")
 class GLM4V_MoE(MmprojModel):
+    """Multimodal projector from [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V).
+
+    ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)"""
     #
-    # the HF model's type is `glm4v_moe`. 
internally, it consists of two models: - # - `glm4v_moe_text` - # + main text model - # + tensor names start with "model.language_model." - # + "2D-RoPE" (aKa Roformer) w/ embeddings dynamically adapted via bicubic interpolation - # - `glm4v_moe` - # + vision adapter (ViT) - # + tensor names start with "model.visual." - # + "3D-RoPE" (without the interpolation mentioned above) - # - # other notable quirks include: - # - has MTP layer (need to keep these tensors - same as GLM-4.5-Air) - # - RoPE theta value (θ): use 10k rather than 100k for GLM-4.5-Air - # - the model's vision supports video input, but this is not implemented here - # - # for more info, refer to: - # - reference impl : https://github.com/huggingface/transformers/tree/main/src/transformers/models/glm4v_moe - # - HF model card : https://huggingface.co/zai-org/GLM-4.5V - # - arXiv paper (model) : https://arxiv.org/abs/2507.01006 - # - arXiv paper (orig. ViT) : https://arxiv.org/abs/2411.14402 - # - # TODO: the model's tokenizer has video-related special tokens - deal with these (??) + # TODO: this is not complete yet! need to handle custom RoPE nonsense. # - pass + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V) + self.gguf_writer.add_vision_use_gelu(True) + if (ln_eps := self.find_vparam(["layer_norm_eps"], optional=True)) is not None: + self.gguf_writer.add_vision_attention_layernorm_eps(ln_eps) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith("model.visual."): + yield self.map_tensor_name(name), data_torch + else: + return ###### CONVERSION LOGIC ###### diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 0afc58331b565..c9708253163c8 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -428,6 +428,7 @@ class VISION_PROJECTOR_TYPE(IntEnum): GLM_EDGE = auto() MERGER = auto() GEMMA3 = auto() + GLM4V = auto() class MODEL_TENSOR(IntEnum): @@ -3055,6 +3056,7 @@ class VisionProjectorType: VOXTRAL = "voxtral" LFM2 = "lfm2" KIMIVL = "kimivl" + GLM4V = "glm4v_moe" # Items here are (block size, type size) From 14cee9c9d748ae384f3ba00dc078ba66b648a649 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 15 Oct 2025 18:03:11 -0500 Subject: [PATCH 07/10] better class names --- convert_hf_to_gguf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ee80dd9a568f5..664050f60c13a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -9221,7 +9221,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @ModelBase.register("Glm4vMoeForConditionalGeneration") -class GLM4V_Text_MoE(Glm4MoeModel): +class GLM4VMoEModel(Glm4MoeModel): """Text model from [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V) ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)""" @@ -9249,7 +9249,7 @@ def modify_tensors( @ModelBase.register("Glm4vMoeForConditionalGeneration") -class GLM4V_MoE(MmprojModel): +class GLM4VMoEVisionModel(MmprojModel): """Multimodal projector from [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V). 
ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)""" From e0b6064d90a48f9a3fa9b359c310f2bc032f1cff Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 15 Oct 2025 19:32:15 -0500 Subject: [PATCH 08/10] add `clip.vision.rope.*` to GGUF constants need `clip.vision.rope.freq_base` for GLM-4.5V --- gguf-py/gguf/constants.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index c9708253163c8..935a005930fc3 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -276,6 +276,21 @@ class ClipVision: USE_SILU = "clip.use_silu" N_WA_PATTERN = "clip.vision.n_wa_pattern" # used by qwen2.5vl + class Rope: + DIMENSION_COUNT = "clip.vision.rope.dimension_count" + DIMENSION_SECTIONS = "clip.vision.rope.dimension_sections" + FREQ_BASE = "clip.vision.rope.freq_base" + SCALING_TYPE = "clip.vision.rope.scaling.type" + SCALING_FACTOR = "clip.vision.rope.scaling.factor" + SCALING_ATTN_FACTOR = "clip.vision.rope.scaling.attn_factor" + SCALING_ORIG_CTX_LEN = "clip.vision.rope.scaling.original_context_length" + SCALING_FINETUNED = "clip.vision.rope.scaling.finetuned" + SCALING_YARN_LOG_MUL = "clip.vision.rope.scaling.yarn_log_multiplier" + SCALING_YARN_EXT_FACTOR = "clip.vision.rope.scaling.yarn_ext_factor" + SCALING_YARN_ATTN_FACTOR = "clip.vision.rope.scaling.yarn_attn_factor" + SCALING_YARN_BETA_FAST = "clip.vision.rope.scaling.yarn_beta_fast" + SCALING_YARN_BETA_SLOW = "clip.vision.rope.scaling.yarn_beta_slow" + class Attention: HEAD_COUNT = "clip.vision.attention.head_count" LAYERNORM_EPS = "clip.vision.attention.layer_norm_epsilon" From 7bdc330708df45b6ec8b28d51ddb38236593155f Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 15 Oct 2025 19:32:39 -0500 Subject: [PATCH 09/10] add `add_vision_rope_freq_base` for GGUF metadata --- gguf-py/gguf/gguf_writer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 306679e21834b..5076b44866715 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1038,6 +1038,9 @@ def add_vision_head_count(self, value: int) -> None: def add_vision_attention_layernorm_eps(self, value: float) -> None: self.add_float32(Keys.ClipVision.Attention.LAYERNORM_EPS, value) + def add_vision_rope_freq_base(self, value: float) -> None: + self.add_float32(Keys.ClipVision.Rope.FREQ_BASE, value) + def add_vision_image_size(self, value: int) -> None: self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value) From ed7c271047edf9b2dc98df8e747f9a35d745a688 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 15 Oct 2025 19:33:17 -0500 Subject: [PATCH 10/10] set `clip.vision.rope.freq_base` during conversion --- convert_hf_to_gguf.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 664050f60c13a..b4e309c353030 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -9254,16 +9254,27 @@ class GLM4VMoEVisionModel(MmprojModel): ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)""" # - # TODO: this is not complete yet! need to handle custom RoPE nonsense. + # TODO: this is not complete yet! 
# def set_gguf_parameters(self): super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V) + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V) self.gguf_writer.add_vision_use_gelu(True) + if (ln_eps := self.find_vparam(["layer_norm_eps"], optional=True)) is not None: self.gguf_writer.add_vision_attention_layernorm_eps(ln_eps) + # the ViT in GLM-4.5V applies its own RoPE inside its attention blocks + if (rope_theta := self.find_vparam(["rope_theta"], optional=True)) is not None: + self.gguf_writer.add_vision_rope_freq_base(rope_theta) + logger.info(f"gguf: vision rope theta = {rope_theta}") + else: + logger.warning('gguf: -------------------------------------------------------------') + logger.warning('gguf: missing vision rope theta! the conversion might be incorrect!') + logger.warning('gguf: -------------------------------------------------------------') + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused if name.startswith("model.visual."): yield self.map_tensor_name(name), data_torch else: