Commit 6c7b2cc

Cast embedding weights to fp32
1 parent 222c5ad commit 6c7b2cc

3 files changed (+12, -4 lines)

convert_hf_to_gguf.py

Lines changed: 2 additions & 0 deletions

@@ -3495,6 +3495,8 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
         self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
         self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
+        self.gguf_writer.add_group_norm_eps(hparams.get("rms_norm_eps", 1e-06))
+        self.gguf_writer.add_layer_norm_eps(hparams.get("rms_norm_eps", 1e-06))
         self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1000000.0))

         # Mamba parameters
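
For context, the two added calls write the same `rms_norm_eps` value from `config.json` into the group-norm and layer-norm epsilon metadata keys, alongside the existing RMS-norm epsilon. The sketch below is a minimal standalone way to inspect those keys with ggml's GGUF reader; it assumes the `plamo2` architecture prefix, the standard `{arch}.attention.*_epsilon` key names, and a recent ggml tree where the reader API is declared in `gguf.h` — it is not part of this commit.

```cpp
// Hypothetical checker, not commit code: print the three epsilon keys that
// convert_hf_to_gguf.py now writes (all sourced from the same rms_norm_eps).
#include "gguf.h"

#include <cstdio>

static void print_f32_key(const gguf_context * ctx, const char * key) {
    const int64_t id = gguf_find_key(ctx, key);
    if (id < 0) {
        printf("%-45s (missing)\n", key);
        return;
    }
    // The converter stores these epsilons as float32 values.
    printf("%-45s %g\n", key, gguf_get_val_f32(ctx, id));
}

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
        return 1;
    }

    gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
    gguf_context * ctx = gguf_init_from_file(argv[1], params);
    if (ctx == nullptr) {
        fprintf(stderr, "failed to open %s\n", argv[1]);
        return 1;
    }

    // Key names assume the "plamo2" architecture prefix.
    print_f32_key(ctx, "plamo2.attention.layer_norm_rms_epsilon");
    print_f32_key(ctx, "plamo2.attention.group_norm_epsilon");
    print_f32_key(ctx, "plamo2.attention.layer_norm_epsilon");

    gguf_free(ctx);
    return 0;
}
```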

src/llama-context.cpp

Lines changed: 3 additions & 0 deletions

@@ -1132,6 +1132,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
         }

         // Debug: Dump tensor values after computation (for PLaMo-2 only)
+#define PLAMO2_DEBUG
+#ifdef PLAMO2_DEBUG
         if (model.arch == LLM_ARCH_PLAMO2) { // Only for small inputs
             // Create debug directory if it doesn't exist
 #ifdef _WIN32
@@ -1230,6 +1232,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
                 }
             }
         }
+#endif // PLAMO2_DEBUG

         n_outputs_prev += n_outputs;
     } while (mctx->next());
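
Since `PLAMO2_DEBUG` is defined unconditionally right before the `#ifdef`, the dump block is still compiled in; the guard just makes it easy to disable by removing the `#define` or moving it to a compiler flag. Below is a minimal, self-contained sketch of that compile-time guard pattern, with illustrative names that are not from this commit:

```cpp
// Minimal sketch of the compile-time guard pattern used above.
// Removing the #define (or passing it via -DPLAMO2_DEBUG instead)
// compiles the debug-only block away entirely.
#include <cstdio>

#define PLAMO2_DEBUG

static void decode_step(int token) {
#ifdef PLAMO2_DEBUG
    // Debug-only work: present in the binary only while the macro is defined.
    printf("debug: decoding token %d\n", token);
#endif // PLAMO2_DEBUG
    // ... regular decode work would go here ...
}

int main() {
    decode_step(42);
    return 0;
}
```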

src/llama-model.cpp

Lines changed: 7 additions & 4 deletions

@@ -22,6 +22,7 @@
 #include <functional>
 #include <map>
 #include <regex>
+#include <sstream>
 #include <stdexcept>

 const char * llm_type_name(llm_type type) {
@@ -8008,8 +8009,11 @@ struct llm_build_plamo2 : public llm_graph_context {
         // const int64_t n_embd_head = hparams.n_embd_head_v;
         // ggml_tensor * inp_pos = build_inp_pos();

+        // TODO: Cast to f32 is currently required for ggml_get_rows in build_inp_embd
+        ggml_tensor * embed_tokens = ggml_cast(ctx0, model.tok_embd, GGML_TYPE_F32);
+
         // {n_embd, n_tokens}
-        ggml_tensor * inpL = build_inp_embd(model.tok_embd);
+        ggml_tensor * inpL = build_inp_embd(embed_tokens);
         cb(inpL, "embedding_output", -1);

         // ensure the memory context is hybrid
@@ -8023,9 +8027,8 @@ struct llm_build_plamo2 : public llm_graph_context {
             // cb(model.layers[il].attn_norm, "attn_norm", il);

             // pre_mixer_norm
-            // cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
-            cur = ggml_rms_norm(ctx0, inpL, hparams.f_norm_rms_eps);
-            cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
+            cb(inpL, "attn_pre_norm_input", il);
+            cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
             cb(cur, "attn_pre_norm", il);

             // check if this layer is Mamba or Attention
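
The main change in this file is casting the token-embedding table to F32 before it is passed to `build_inp_embd`, per the TODO about `ggml_get_rows`. A minimal standalone sketch of the same cast-then-gather pattern at the ggml level follows; the toy tensor shapes and the `ggml-cpu.h` include (present in recent ggml trees) are assumptions for illustration, not code from this commit.

```cpp
// A minimal sketch (not llama.cpp code): cast an F16 embedding table to F32
// before ggml_get_rows, mirroring the pattern used for model.tok_embd above.
#include "ggml.h"
#include "ggml-cpu.h"

#include <cstring>

int main() {
    ggml_init_params params = {
        /*.mem_size   =*/ 16u*1024*1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    ggml_context * ctx = ggml_init(params);

    // Toy embedding table (n_embd = 8, n_vocab = 16) stored in half precision,
    // plus a batch of 4 token ids.
    ggml_tensor * tok_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 8, 16);
    ggml_tensor * tokens   = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);

    memset(tok_embd->data, 0, ggml_nbytes(tok_embd));
    for (int i = 0; i < 4; ++i) {
        ((int32_t *) tokens->data)[i] = i;  // look up rows 0..3
    }

    // Cast the whole table to F32 first, then gather the selected rows.
    ggml_tensor * tok_embd_f32 = ggml_cast(ctx, tok_embd, GGML_TYPE_F32);
    ggml_tensor * inp_embd     = ggml_get_rows(ctx, tok_embd_f32, tokens);

    ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, inp_embd);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

    ggml_free(ctx);
    return 0;
}
```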
