
Commit 73a261f
Cast embedding weights to fp32
Parent: 47a779b

File tree: 3 files changed, +12 -4 lines

convert_hf_to_gguf.py (2 additions & 0 deletions)

@@ -3554,6 +3554,8 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
         self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
         self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
+        self.gguf_writer.add_group_norm_eps(hparams.get("rms_norm_eps", 1e-06))
+        self.gguf_writer.add_layer_norm_eps(hparams.get("rms_norm_eps", 1e-06))
         self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1000000.0))
 
         # Mamba parameters
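The converter now writes the model's rms_norm_eps under the group-norm and plain layer-norm epsilon keys as well, so whichever epsilon key the loader queries resolves to the same value. As a sketch of the consuming side (an assumption about usage, not part of this commit), llama.cpp's model loader would read such keys roughly like this:

// Sketch only: reading the epsilon keys written above. llama_model_loader::get_key
// and the LLM_KV_* enums are assumed from the existing llama.cpp loader code; they
// are not touched by this commit.
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, /*required =*/ false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,     hparams.f_norm_eps,     /*required =*/ false);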

src/llama-context.cpp (3 additions & 0 deletions)

@@ -1132,6 +1132,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
         }
 
         // Debug: Dump tensor values after computation (for PLaMo-2 only)
+#define PLAMO2_DEBUG
+#ifdef PLAMO2_DEBUG
         if (model.arch == LLM_ARCH_PLAMO2) { // Only for small inputs
             // Create debug directory if it doesn't exist
 #ifdef _WIN32
@@ -1230,6 +1232,7 @@
                 }
             }
         }
+#endif // PLAMO2_DEBUG
 
         n_outputs_prev += n_outputs;
     } while (mctx->next());
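Both hunks fence the existing PLaMo-2 tensor dump behind a PLAMO2_DEBUG guard. Since the #define sits immediately above the #ifdef, the dump is still always compiled in as committed; a sketch of the more usual arrangement (an assumption about intent, not what this diff does) is to supply the macro from the build system so the block vanishes from normal builds:

// Same guard, but with PLAMO2_DEBUG supplied externally, e.g. -DPLAMO2_DEBUG
// on the compiler command line, instead of a hard-coded #define above the check.
#ifdef PLAMO2_DEBUG
    if (model.arch == LLM_ARCH_PLAMO2) {
        // ... dump tensor values for debugging, as in the guarded block above ...
    }
#endif // PLAMO2_DEBUG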

src/llama-model.cpp (7 additions & 4 deletions)

@@ -22,6 +22,7 @@
 #include <functional>
 #include <map>
 #include <regex>
+#include <sstream>
 #include <stdexcept>
 
 const char * llm_type_name(llm_type type) {
@@ -8215,8 +8216,11 @@ struct llm_build_plamo2 : public llm_graph_context {
         // const int64_t n_embd_head = hparams.n_embd_head_v;
         // ggml_tensor * inp_pos = build_inp_pos();
 
+        // TODO: Cast to f32 is currently required for ggml_get_rows in build_inp_embd
+        ggml_tensor * embed_tokens = ggml_cast(ctx0, model.tok_embd, GGML_TYPE_F32);
+
         // {n_embd, n_tokens}
-        ggml_tensor * inpL = build_inp_embd(model.tok_embd);
+        ggml_tensor * inpL = build_inp_embd(embed_tokens);
         cb(inpL, "embedding_output", -1);
 
         // ensure the memory context is hybrid
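This hunk is the change named in the commit message. Restated with comments, reusing the identifiers from the hunk (ctx0, model.tok_embd, and build_inp_embd come from the surrounding llm_build_plamo2 context):

// The token-embedding weights may be stored in a non-f32 type (e.g. f16);
// per the TODO above, ggml_get_rows inside build_inp_embd currently needs an
// f32 source here, so the weights are cast before the row lookup.
ggml_tensor * embed_tokens = ggml_cast(ctx0, model.tok_embd, GGML_TYPE_F32);
ggml_tensor * inpL         = build_inp_embd(embed_tokens); // {n_embd, n_tokens}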
@@ -8230,9 +8234,8 @@ struct llm_build_plamo2 : public llm_graph_context {
             // cb(model.layers[il].attn_norm, "attn_norm", il);
 
             // pre_mixer_norm
-            // cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
-            cur = ggml_rms_norm(ctx0, inpL, hparams.f_norm_rms_eps);
-            cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
+            cb(inpL, "attn_pre_norm_input", il);
+            cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
             cb(cur, "attn_pre_norm", il);
 
             // check if this layer is Mamba or Attention
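The pre-mixer normalization now goes through the shared build_norm helper instead of an inline RMS norm, and gains an attn_pre_norm_input debug callback. Based on the two lines removed above, the helper with LLM_NORM_RMS amounts to roughly the following (a sketch, not the helper's literal source):

// Approximately what build_norm(inpL, model.layers[il].attn_norm, NULL,
// LLM_NORM_RMS, il) produces, minus the helper's own callback bookkeeping:
cur = ggml_rms_norm(ctx0, inpL, hparams.f_norm_rms_eps);  // normalize by RMS
cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);    // scale by learned weight

Routing through the helper keeps the epsilon handling consistent with the other architectures.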
