Commit dbfadb6

feat: support GLM 4.5 family of models
1 parent 3d15c4a commit dbfadb6

5 files changed: +36 -8 lines changed


convert_hf_to_gguf.py

Lines changed: 22 additions & 4 deletions
@@ -6605,9 +6605,9 @@ def set_vocab(self):
         self.gguf_writer.add_token_types(toktypes)
 
         # Special tokens
-        # BOS should be [gMASK] (151331), EOS should be <|endoftext|> (151329) as per official config
+        # BOS should be [gMASK] (151331), EOS should be <|endoftext|> (151329) as per tokenizer analysis
         special_vocab._set_special_token(
-            "eos", tokenizer.get_added_vocab()["<|endoftext|>"]  # 151329 - official EOS token
+            "eos", tokenizer.get_added_vocab()["<|endoftext|>"]  # 151329 - correct EOS token
         )
         special_vocab._set_special_token(
             "eot", tokenizer.get_added_vocab()["<|endoftext|>"]  # 151329 - same as EOS
@@ -6620,9 +6620,25 @@ def set_vocab(self):
         )
         special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"])  # 151338
 
-        if "/nothink" in tokenizer.get_added_vocab():
-            special_vocab._set_special_token("nothink", tokenizer.get_added_vocab()["/nothink"])  # 151360
+        if "<sop>" in tokenizer.get_added_vocab():
+            special_vocab._set_special_token("sop", tokenizer.get_added_vocab()["<sop>"])  # 151333
+        if "<eop>" in tokenizer.get_added_vocab():
+            special_vocab._set_special_token("eop", tokenizer.get_added_vocab()["<eop>"])  # 151334
+        if "[sMASK]" in tokenizer.get_added_vocab():
+            special_vocab._set_special_token("smask", tokenizer.get_added_vocab()["[sMASK]"])  # 151332
+
+        # TODO: clean up once decided on an approach to think and /nothink
+        #
+        # Previously:
+        # if "/nothink" in tokenizer.get_added_vocab():
+        #     special_vocab._set_special_token("nothink", tokenizer.get_added_vocab()["/nothink"])  # 151360
         # Note: <think> and </think> are regular tokens (special=false in official config), not special tokens
+        #
+        # Latest thinking is:
+        # NOTE: /nothink token exists but causes generation issues as mentioned in
+        # https://huggingface.co/zai-org/GLM-4.5/discussions/9
+        # "it is a very special token. Even as input, it will be encoded into a special token, causing generation issues."
+        # Therefore we do NOT add it to avoid generation problems
 
         special_vocab.add_to_gguf(self.gguf_writer)
 
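The token IDs cited in the comments above can be double-checked against the tokenizer's added vocabulary using the same get_added_vocab() lookup the converter relies on. A minimal sketch, assuming transformers (>= 4.54.1, as pinned later in this commit) and access to zai-org/GLM-4.5:

# Sketch only (not part of this commit): print the GLM-4.5 special-token IDs
# referenced in the diff comments above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("zai-org/GLM-4.5")
added = tokenizer.get_added_vocab()

# Expected per the comments: [gMASK] -> 151331, <|endoftext|> -> 151329,
# [sMASK] -> 151332, <sop> -> 151333, <eop> -> 151334, <|observation|> -> 151338,
# /nothink -> 151360 (present in the vocab but deliberately not registered here).
for tok in ("[gMASK]", "<|endoftext|>", "[sMASK]", "<sop>", "<eop>", "<|observation|>", "/nothink"):
    print(tok, added.get(tok))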

@@ -6639,6 +6655,8 @@ def set_gguf_parameters(self):
         # MoE parameters - Use only routed expert count (shared experts handled separately)
         if (n_routed_experts := self.hparams.get("n_routed_experts")) is not None:
             self.gguf_writer.add_expert_count(n_routed_experts)
+        if (num_experts_per_tok := self.hparams.get("num_experts_per_tok")) is not None:
+            self.gguf_writer.add_expert_used_count(num_experts_per_tok)
         if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
             self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
         if (n_shared_experts := self.hparams.get("n_shared_experts")) is not None:
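As a quick post-conversion check, the expert metadata written by set_gguf_parameters() can be read back from the output file. A minimal sketch, assuming the gguf package from this repo's gguf-py is installed and "glm-4.5.gguf" is a hypothetical path to a converted model:

# Sketch only: list the expert-related metadata keys written above
# (expert_count, expert_used_count, expert_feed_forward_length, ...).
from gguf import GGUFReader

reader = GGUFReader("glm-4.5.gguf")  # hypothetical output path
for name in reader.fields:
    if "expert" in name:
        print(name)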

models/templates/README.md

Lines changed: 2 additions & 1 deletion
@@ -21,4 +21,5 @@ These templates can be updated with the following commands:
 ./scripts/get_chat_template.py Qwen/Qwen2.5-7B-Instruct > models/templates/Qwen-Qwen2.5-7B-Instruct.jinja
 ./scripts/get_chat_template.py Qwen/QwQ-32B > models/templates/Qwen-QwQ-32B.jinja
 ./scripts/get_chat_template.py Qwen/Qwen3-0.6B > models/templates/Qwen-Qwen3-0.6B.jinja
-```
+./scripts/get_chat_template.py zai-org/GLM-4.5 > models/templates/zai-org-GLM-4.5.jinja
+```
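For a quick look at what the exported GLM-4.5 template produces, the same chat template can be exercised through transformers before regenerating the .jinja file. A minimal sketch (the message is illustrative):

# Sketch only: render the GLM-4.5 chat template for a single user turn.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("zai-org/GLM-4.5")
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello"}],
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)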
Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 numpy~=1.26.4
 sentencepiece~=0.2.0
-transformers>=4.45.1,<5.0.0
+transformers>=4.54.1,<5.0.0
 gguf>=0.1.0
 protobuf>=4.21.0,<5.0.0

src/llama-kv-cache-unified.cpp

Lines changed: 4 additions & 0 deletions
@@ -39,6 +39,10 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     if (model.arch == LLM_ARCH_GEMMA3N) {
         n_layer_cache = 20;
     }
+    if (model.arch == LLM_ARCH_GLM4_MOE) {
+        // GLM4_MOE: Only process first 46 transformer layers, skip NextN layer
+        n_layer_cache = hparams.n_layer - 1;
+    }
 
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
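One practical effect of this change is KV-cache sizing: for GLM4_MOE the unified cache allocates buffers for n_layer - 1 layers, so the NextN layer adds no KV memory. A rough sizing sketch; every dimension below is a hypothetical placeholder, not an official GLM-4.5 value:

# Sketch only: rough f16 KV-cache footprint for a full context window.
n_layer       = 47           # hypothetical block count, including the NextN layer
n_layer_cache = n_layer - 1  # GLM4_MOE skips the NextN layer, as in this commit
n_embd_k_gqa  = 1024         # hypothetical per-layer K width (n_head_kv * head dim)
n_embd_v_gqa  = 1024         # hypothetical per-layer V width
n_ctx         = 4096         # hypothetical context length
bytes_per_elt = 2            # f16

kv_bytes = n_layer_cache * n_ctx * (n_embd_k_gqa + n_embd_v_gqa) * bytes_per_elt
print(f"approx KV cache size: {kv_bytes / 1024**2:.1f} MiB")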

src/llama-model.cpp

Lines changed: 7 additions & 2 deletions
@@ -4397,6 +4397,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, final_layer), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
                     create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, final_layer), { n_embd }, TENSOR_NOT_REQUIRED);
 
+                    // Load ALL tensors including NextN layer to satisfy tensor count (803)
+                    // but only PROCESS first 46 transformer layers in forward pass
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
 
@@ -13492,7 +13494,10 @@ struct llm_build_glm4_moe : public llm_graph_context {
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
-        for (int il = 0; il < n_layer; ++il) {
+        // Only process first 46 transformer layers (skip NextN layer 46)
+        // Layer 46 tensors are loaded but not processed in forward pass
+        const int n_transformer_layers = n_layer - 1;
+        for (int il = 0; il < n_transformer_layers; ++il) {
             ggml_tensor * inpSA = inpL;
 
             // Pre-attention norm
@@ -13554,7 +13559,7 @@ struct llm_build_glm4_moe : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1 && inp_out_ids) {
+            if (il == n_transformer_layers - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
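To confirm that the NextN tensors are present in a converted file even though llm_build_glm4_moe never evaluates them, the tensor list can be inspected with gguf-py. A minimal sketch; the path is hypothetical, and the assumption that the NextN tensor names contain "nextn" is based on the LLM_TENSOR_NEXTN_* identifiers above:

# Sketch only: count tensors and list the NextN-layer ones.
from gguf import GGUFReader

reader = GGUFReader("glm-4.5.gguf")  # hypothetical path
names = [t.name for t in reader.tensors]
print(len(names))  # total tensor count (the commit comment mentions 803)
print([n for n in names if "nextn" in n.lower()])  # name pattern is an assumption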
