
Commit 73a261f
Cast embedding weights to fp32
Parent: 47a779b

File tree: 3 files changed, +12 -4 lines

convert_hf_to_gguf.py (2 additions & 0 deletions)

@@ -3554,6 +3554,8 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
         self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
         self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
+        self.gguf_writer.add_group_norm_eps(hparams.get("rms_norm_eps", 1e-06))
+        self.gguf_writer.add_layer_norm_eps(hparams.get("rms_norm_eps", 1e-06))
         self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1000000.0))
 
         # Mamba parameters
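The converter now writes the model's rms_norm_eps under the group-norm and plain layer-norm epsilon keys as well, so whichever epsilon key the loader queries resolves to the same value. As a sketch of the consuming side (an assumption about usage, not part of this commit), llama.cpp's model loader would read such keys roughly like this:

// Sketch only: reading the epsilon keys written above. llama_model_loader::get_key
// and the LLM_KV_* enums are assumed from the existing llama.cpp loader code; they
// are not touched by this commit.
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, /*required =*/ false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,     hparams.f_norm_eps,     /*required =*/ false);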

src/llama-context.cpp (3 additions & 0 deletions)

@@ -1132,6 +1132,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
         }
 
         // Debug: Dump tensor values after computation (for PLaMo-2 only)
+#define PLAMO2_DEBUG
+#ifdef PLAMO2_DEBUG
         if (model.arch == LLM_ARCH_PLAMO2) { // Only for small inputs
             // Create debug directory if it doesn't exist
 #ifdef _WIN32
@@ -1230,6 +1232,7 @@
                 }
             }
         }
+#endif // PLAMO2_DEBUG
 
         n_outputs_prev += n_outputs;
     } while (mctx->next());
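Both hunks fence the existing PLaMo-2 tensor dump behind a PLAMO2_DEBUG guard. Since the #define sits immediately above the #ifdef, the dump is still always compiled in as committed; a sketch of the more usual arrangement (an assumption about intent, not what this diff does) is to supply the macro from the build system so the block vanishes from normal builds:

// Same guard, but with PLAMO2_DEBUG supplied externally, e.g. -DPLAMO2_DEBUG
// on the compiler command line, instead of a hard-coded #define above the check.
#ifdef PLAMO2_DEBUG
    if (model.arch == LLM_ARCH_PLAMO2) {
        // ... dump tensor values for debugging, as in the guarded block above ...
    }
#endif // PLAMO2_DEBUG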

src/llama-model.cpp (7 additions & 4 deletions)

@@ -22,6 +22,7 @@
 #include <functional>
 #include <map>
 #include <regex>
+#include <sstream>
 #include <stdexcept>
 
 const char * llm_type_name(llm_type type) {
@@ -8215,8 +8216,11 @@ struct llm_build_plamo2 : public llm_graph_context {
         // const int64_t n_embd_head = hparams.n_embd_head_v;
         // ggml_tensor * inp_pos = build_inp_pos();
 
+        // TODO: Cast to f32 is currently required for ggml_get_rows in build_inp_embd
+        ggml_tensor * embed_tokens = ggml_cast(ctx0, model.tok_embd, GGML_TYPE_F32);
+
         // {n_embd, n_tokens}
-        ggml_tensor * inpL = build_inp_embd(model.tok_embd);
+        ggml_tensor * inpL = build_inp_embd(embed_tokens);
         cb(inpL, "embedding_output", -1);
 
         // ensure the memory context is hybrid
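This hunk is the change named in the commit message. Restated with comments, reusing the identifiers from the hunk (ctx0, model.tok_embd, and build_inp_embd come from the surrounding llm_build_plamo2 context):

// The token-embedding weights may be stored in a non-f32 type (e.g. f16);
// per the TODO above, ggml_get_rows inside build_inp_embd currently needs an
// f32 source here, so the weights are cast before the row lookup.
ggml_tensor * embed_tokens = ggml_cast(ctx0, model.tok_embd, GGML_TYPE_F32);
ggml_tensor * inpL         = build_inp_embd(embed_tokens); // {n_embd, n_tokens}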
@@ -8230,9 +8234,8 @@ struct llm_build_plamo2 : public llm_graph_context {
             // cb(model.layers[il].attn_norm, "attn_norm", il);
 
             // pre_mixer_norm
-            // cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
-            cur = ggml_rms_norm(ctx0, inpL, hparams.f_norm_rms_eps);
-            cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
+            cb(inpL, "attn_pre_norm_input", il);
+            cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
             cb(cur, "attn_pre_norm", il);
 
             // check if this layer is Mamba or Attention
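The pre-mixer normalization now goes through the shared build_norm helper instead of an inline RMS norm, and gains an attn_pre_norm_input debug callback. Based on the two lines removed above, the helper with LLM_NORM_RMS amounts to roughly the following (a sketch, not the helper's literal source):

// Approximately what build_norm(inpL, model.layers[il].attn_norm, NULL,
// LLM_NORM_RMS, il) produces, minus the helper's own callback bookkeeping:
cur = ggml_rms_norm(ctx0, inpL, hparams.f_norm_rms_eps);  // normalize by RMS
cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);    // scale by learned weight

Routing through the helper keeps the epsilon handling consistent with the other architectures.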
