
Commit 6b88093

fixed according to reviewer's comments
1 parent 276df3f commit 6b88093

File tree

9 files changed: 144 additions & 191 deletions


convert_hf_to_gguf.py

Lines changed: 9 additions & 38 deletions
@@ -1270,28 +1270,6 @@ def _set_vocab_llama_hf(self):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-    def _set_vocab_pangu_embedded(self):
-        tokens, scores, toktypes = self._create_vocab_sentencepiece()
-
-        self.gguf_writer.add_tokenizer_model("pangu_embedded")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        tokenizer_config_file = self.dir_model / "tokenizer_config.json"
-        if tokenizer_config_file.is_file():
-            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
-                tokenizer_config_json = json.load(f)
-            if "chat_template" in tokenizer_config_json:
-                self.gguf_writer.add_chat_template(tokenizer_config_json["chat_template"])
-            if "add_prefix_space" in tokenizer_config_json:
-                self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-
     def _set_vocab_rwkv_world(self):
         assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
         vocab_size = self.hparams.get("vocab_size", 65536)
@@ -7212,12 +7190,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
 class PanguEmbeddedModel(TextModel):
     model_arch = gguf.MODEL_ARCH.PANGU_EMBED
 
-    def set_vocab(self):
-        try:
-            self._set_vocab_pangu_embedded()
-        except FileNotFoundError:
-            print("pangu vocab set fail, fallback to sentencepiece!")
-            self._set_vocab_sentencepiece()
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
 
         tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
         if tokenizer_config_file.is_file():
@@ -7236,18 +7210,15 @@ def set_gguf_parameters(self):
         rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(rope_dim)
 
-        if (head_dim := hparams.get("head_dim")) is None:
-            if "hidden_size" in hparams and "num_attention_heads" in hparams:
-                head_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
-
-        if head_dim is not None:
-            self.gguf_writer.add_key_length(head_dim)
-            self.gguf_writer.add_value_length(head_dim)
+        if hparams.get("head_dim") is None:
+            self.gguf_writer.add_key_length(rope_dim)
+            self.gguf_writer.add_value_length(rope_dim)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        del bid
-        n_head = self.find_hparam(["n_heads", "num_attention_heads"])
-        n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"])
+        if name == "lm_head.weight":
+            if self.hparams.get("tie_word_embeddings", False):
+                logger.info("Skipping tied output layer 'lm_head.weight'")
+                return []
         return [(self.map_tensor_name(name), data_torch)]
 
 
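With the custom vocab path removed, the converter now relies on the generic SentencePiece and chat-template handling. A minimal C++ sketch (not part of the commit; "pangu-embedded.gguf" is a placeholder path and the llama.h calls reflect the current public API as I understand it) for checking that a converted GGUF still carries the chat template:

// Minimal sketch: load only the metadata/vocab of a converted GGUF and print the
// chat template stored by convert_hf_to_gguf.py.
#include "llama.h"

#include <cstdio>

int main(int argc, char ** argv) {
    const char * path = argc > 1 ? argv[1] : "pangu-embedded.gguf"; // placeholder path

    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.vocab_only = true; // metadata and vocab are enough for this check

    llama_model * model = llama_model_load_from_file(path, mparams);
    if (model == nullptr) {
        fprintf(stderr, "failed to load %s\n", path);
        llama_backend_free();
        return 1;
    }

    const char * tmpl = llama_model_chat_template(model, /*name=*/ nullptr);
    printf("chat template: %s\n", tmpl != nullptr ? tmpl : "(none)");

    llama_model_free(model);
    llama_backend_free();
    return 0;
}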

src/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -89,7 +89,6 @@ add_library(llama
             models/mamba.cpp
             models/minicpm3.cpp
             models/minimax-m2.cpp
-            models/pangu_embedded.cpp
             models/mpt.cpp
             models/nemotron-h.cpp
             models/nemotron.cpp
@@ -100,6 +99,7 @@ add_library(llama
             models/openai-moe-iswa.cpp
             models/openelm.cpp
             models/orion.cpp
+            models/pangu-embedded.cpp
             models/phi2.cpp
             models/phi3.cpp
             models/plamo.cpp

src/llama-arch.cpp

Lines changed: 1 addition & 1 deletion
@@ -107,7 +107,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_APERTUS, "apertus" },
     { LLM_ARCH_MINIMAX_M2, "minimax-m2" },
     { LLM_ARCH_COGVLM, "cogvlm" },
-    {LLM_ARCH_PANGU_EMBED, "pangu_embedded" },
+    { LLM_ARCH_PANGU_EMBED, "pangu_embedded" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 

src/llama-chat.cpp

Lines changed: 1 addition & 4 deletions
@@ -214,7 +214,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_SEED_OSS;
     } else if (tmpl_contains("'Assistant: ' + message['content'] + '<|separator|>")) {
         return LLM_CHAT_TEMPLATE_GROK_2;
-    } else if (tmpl_contains("[unused9]") && tmpl_contains("[unused10]")) {
+    } else if (tmpl_contains("[unused9]") && tmpl_contains("message['content'] + '[unused10]'")) {
         return LLM_CHAT_TEMPLATE_PANGU_EMBED;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
@@ -840,9 +840,6 @@ int32_t llm_chat_apply_template(
             ss << "[unused9]工具:" << content << "[unused10]";
         } else if (role == "function") {
             ss << "[unused9]方法:" << content << "[unused10]";
-        } else {
-            // unknown role
-            ss << "[unused9]" << role << "" << content << "[unused10]";
         }
     }
     if (add_ass) {
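The tightened check means a chat template must contain both "[unused9]" and the literal "message['content'] + '[unused10]'" before it is treated as the Pangu Embedded format. A rough sketch of exercising the detection through the public llama_chat_apply_template API; the Jinja fragment below is hypothetical and exists only to satisfy the two substring checks:

// Sketch only: feed a hypothetical Jinja fragment containing the two required
// substrings, then let the built-in PANGU_EMBED writer format a short conversation.
#include "llama.h"

#include <cstdio>
#include <vector>

int main() {
    const char * tmpl =
        "{% for message in messages %}"
        "{{ '[unused9]' + message['role'] + '：' + message['content'] + '[unused10]' }}"
        "{% endfor %}";

    llama_chat_message msgs[] = {
        { "system", "你是一个乐于助人的助手" },
        { "user",   "你好" },
    };

    std::vector<char> buf(4096);
    const int32_t n = llama_chat_apply_template(tmpl, msgs, 2, /*add_ass=*/ true,
                                                buf.data(), (int32_t) buf.size());
    if (n < 0) {
        fprintf(stderr, "template was not detected as a known format\n");
        return 1;
    }

    printf("%.*s\n", n, buf.data());
    return 0;
}

Note also that the unknown-role fallback was dropped from the writer, so roles other than the handled ones are now silently skipped rather than emitted with a raw role name.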

src/llama-model.cpp

Lines changed: 7 additions & 7 deletions
@@ -6275,11 +6275,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             case LLM_ARCH_PANGU_EMBED:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                     output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
 
-                    // openPanguEmbedded-1B model's lm_head/output is 'tie_word_embeddings', the 7B model is not
-                    if(type == LLM_TYPE_1B){
+                    // if output is NULL, init from the input tok embed
+                    if(output == NULL){
                         output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                     }
 
@@ -6295,26 +6297,24 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
 
                         // bias tensors
-                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd_head_k * n_head}, 0);
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd_head_k * n_head}, 0);
                         layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
                         layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
-                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
 
                         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
                         if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
                             layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                             layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                        }
-                        else {
+                        } else {
                             layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                         }
 
                         layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                     }
-
                 } break;
             default:
                 throw std::runtime_error("unknown architecture");

src/llama-vocab.cpp

Lines changed: 0 additions & 14 deletions
@@ -1805,20 +1805,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         special_sep_id = LLAMA_TOKEN_NULL;
         special_pad_id = 3; // <|plamo:pad|>
         special_mask_id = LLAMA_TOKEN_NULL;
-    } else if (tokenizer_model == "pangu_embedded") {
-        type = LLAMA_VOCAB_TYPE_SPM;
-
-        // default special tokens
-        special_bos_id = 1;
-        special_eos_id = 45892;
-        special_unk_id = 0;
-        special_sep_id = LLAMA_TOKEN_NULL;
-        special_pad_id = 0;
-        special_mask_id = LLAMA_TOKEN_NULL;
-
-        add_space_prefix = true;
-        add_bos = true;
-        add_eos = false;
     } else {
         throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
     }
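With the special-cased branch gone, a converted Pangu Embedded GGUF is expected to declare the standard SentencePiece tokenizer (the generic converter path writes tokenizer.ggml.model as "llama"), with special-token ids taken from GGUF metadata rather than hard-coded defaults. A small sketch (not from the commit; the path is a placeholder) that inspects this key with the ggml gguf API:

// Sketch: read tokenizer.ggml.model straight from a converted GGUF file.
#include "gguf.h"

#include <cstdio>

int main(int argc, char ** argv) {
    const char * path = argc > 1 ? argv[1] : "pangu-embedded.gguf"; // placeholder path

    struct gguf_init_params params = { /*no_alloc=*/ true, /*ctx=*/ nullptr };
    struct gguf_context * ctx = gguf_init_from_file(path, params);
    if (ctx == nullptr) {
        fprintf(stderr, "failed to open %s\n", path);
        return 1;
    }

    const int64_t kid = gguf_find_key(ctx, "tokenizer.ggml.model");
    if (kid >= 0) {
        // expected to print "llama" for a model converted via _set_vocab_sentencepiece
        printf("tokenizer.ggml.model = %s\n", gguf_get_val_str(ctx, kid));
    }

    gguf_free(ctx);
    return 0;
}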

src/models/models.h

Lines changed: 4 additions & 4 deletions
@@ -317,10 +317,6 @@ struct llm_build_minimax_m2 : public llm_graph_context {
     llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params);
 };
 
-struct llm_build_pangu_embedded : public llm_graph_context {
-    llm_build_pangu_embedded(const llama_model & model, const llm_graph_params & params);
-};
-
 struct llm_build_mpt : public llm_graph_context {
     llm_build_mpt(const llama_model & model, const llm_graph_params & params);
 };
@@ -365,6 +361,10 @@ struct llm_build_orion : public llm_graph_context {
     llm_build_orion(const llama_model & model, const llm_graph_params & params);
 };
 
+struct llm_build_pangu_embedded : public llm_graph_context {
+    llm_build_pangu_embedded(const llama_model & model, const llm_graph_params & params);
+};
+
 struct llm_build_phi2 : public llm_graph_context {
     llm_build_phi2(const llama_model & model, const llm_graph_params & params);
 };

src/models/pangu-embedded.cpp

Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
+#include "models.h"
+
+
+llm_build_pangu_embedded::llm_build_pangu_embedded(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // norm
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self attention
+        {
+            // compute Q and K and RoPE them
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+            cb(Qcur, "Qcur", il);
+
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+            cb(Kcur, "Kcur", il);
+
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+            cb(Vcur, "Vcur", il);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = ggml_rope_ext(
+                ctx0, Qcur, inp_pos, nullptr,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
+            );
+
+            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
+            );
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn,
+                model.layers[il].wo, model.layers[il].bo,
+                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+        }
+
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network
+        cur = build_norm(ffn_inp,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                NULL,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    if (model.output_b != nullptr) {
+        cur = ggml_add(ctx0, cur, model.output_b);
+    }
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
