Commit 86cfc18

No permute during convert (fixes qk tensors), proper norm application.
1 parent 64add82 commit 86cfc18

2 files changed (+10, -109 lines)

convert_hf_to_gguf.py

Lines changed: 2 additions & 0 deletions
@@ -8569,6 +8569,7 @@ def prepare_tensors(self):
 @ModelBase.register("ApertusForCausalLM")
 class ApertusModel(LlamaModel):
     model_arch = gguf.MODEL_ARCH.APERTUS
+    undo_permute = False
 
     def modify_tensors(self, data_torch, name, bid):
         # Handle xIELU activation parameters
@@ -8577,6 +8578,7 @@ def modify_tensors(self, data_torch, name, bid):
 
         return super().modify_tensors(data_torch, name, bid)
 
+
 class MistralModel(LlamaModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
     model_name = "Mistral"
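
Note (illustration only, not part of the commit): undo_permute = False makes the Apertus converter skip the head-wise row permutation that the LlamaModel base class otherwise applies to the attention q_proj/k_proj weights during conversion, which the commit message identifies as what was breaking the q/k tensors for this model. Below is a minimal NumPy sketch of that permutation, assuming LlamaModel.permute still uses its familiar reshape/swapaxes form; permute_rows and the toy shapes are hypothetical names used only for illustration.

```python
# Hypothetical sketch: what a LlamaModel-style head permute does to q/k projection
# rows, and what setting undo_permute = False leaves untouched.
import numpy as np

def permute_rows(weights: np.ndarray, n_head: int) -> np.ndarray:
    # Mirror of the usual reshape/swapaxes trick: within each head, the two
    # halves of that head's rows are interleaved.
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))

n_head, head_dim, n_embd = 2, 4, 8
w_q = np.arange(n_head * head_dim * n_embd, dtype=np.float32).reshape(n_head * head_dim, n_embd)

print(permute_rows(w_q, n_head)[:4, 0])  # head 0 rows come out reordered as 0, 2, 1, 3
print(w_q[:4, 0])                        # with undo_permute = False the HF row order is kept
```

One plausible reading of the fix: with per-head q/k RMS norms applied in the graph, keeping the original row order keeps those norm weights aligned with the components they scale in the Hugging Face checkpoint.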

src/llama-model.cpp

Lines changed: 8 additions & 109 deletions
@@ -18690,78 +18690,6 @@ static float get_scalar_f32_val(const ggml_tensor *t) {
     return onef;
 }
 
-static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
-    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
-    float v;
-    if (type == GGML_TYPE_F16) {
-        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
-    } else if (type == GGML_TYPE_F32) {
-        v = *(float *) &data[i];
-    } else if (type == GGML_TYPE_I64) {
-        v = (float) *(int64_t *) &data[i];
-    } else if (type == GGML_TYPE_I32) {
-        v = (float) *(int32_t *) &data[i];
-    } else if (type == GGML_TYPE_I16) {
-        v = (float) *(int16_t *) &data[i];
-    } else if (type == GGML_TYPE_I8) {
-        v = (float) *(int8_t *) &data[i];
-    } else {
-        GGML_ABORT("fatal error");
-    }
-    return v;
-}
-
-static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
-    GGML_ASSERT(n > 0);
-    float sum = 0;
-    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
-        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
-            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
-                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
-                    const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
-                    sum += v;
-                }
-            }
-        }
-    }
-    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
-        LLAMA_LOG_DEBUG(" [\n");
-        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
-            if (i2 == n && ne[2] > 2*n) {
-                LLAMA_LOG_DEBUG(" ..., \n");
-                i2 = ne[2] - n;
-            }
-            LLAMA_LOG_DEBUG(" [\n");
-            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
-                if (i1 == n && ne[1] > 2*n) {
-                    LLAMA_LOG_DEBUG(" ..., \n");
-                    i1 = ne[1] - n;
-                }
-                LLAMA_LOG_DEBUG(" [");
-                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
-                    if (i0 == n && ne[0] > 2*n) {
-                        LLAMA_LOG_DEBUG("..., ");
-                        i0 = ne[0] - n;
-                    }
-                    const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
-                    LLAMA_LOG_DEBUG("%12.4f", v);
-                    if (i0 < ne[0] - 1) LLAMA_LOG_DEBUG(", ");
-                }
-                LLAMA_LOG_DEBUG("],\n");
-            }
-            LLAMA_LOG_DEBUG(" ],\n");
-        }
-        LLAMA_LOG_DEBUG(" ]\n");
-        LLAMA_LOG_DEBUG(" sum = %f\n", sum);
-    }
-
-    // TODO: make this abort configurable/optional?
-    if (std::isnan(sum)) {
-        LLAMA_LOG_ERROR("encountered NaN - aborting\n");
-        exit(0);
-    }
-}
-
 // Apertus model graph builder with xIELU activation
 struct llm_build_apertus : public llm_graph_context {
     llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
@@ -18785,9 +18713,8 @@ struct llm_build_apertus : public llm_graph_context {
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
-            // norm
             cur = build_norm(inpL,
-                    model.layers[il].attn_norm, NULL,
+                    model.layers[il].attn_norm, nullptr,
                     LLM_NORM_RMS, il);
             cb(cur, "attn_norm", il);
 
@@ -18806,42 +18733,14 @@
                 cb(Vcur, "Vcur", il);
 
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
-                ggml_tensor * Q2d = ggml_reshape_2d(ctx0, Qcur, n_embd_head, n_head * n_tokens);
-                ggml_tensor * K2d = ggml_reshape_2d(ctx0, Kcur, n_embd_head, n_head_kv * n_tokens);
-
-                cb(Q2d, "Q2D", il);
-                cb(K2d, "K2D", il);
-
-                // apply existing rms-norm which was originally written for 2D
-                Q2d = build_norm(Q2d, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
-                K2d = build_norm(K2d, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
-
-                cb(Q2d, "Q2D_normed", il);
-                cb(K2d, "K2D_normed", il);
-
-                // reshape back to 3D
-                Qcur = ggml_reshape_3d(ctx0, Q2d, n_embd_head, n_head, n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, K2d, n_embd_head, n_head_kv, n_tokens);
-
+                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
                 cb(Qcur, "Qcur_normed", il);
+
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
                 cb(Kcur, "Kcur_normed", il);
-
-                // // copy the data from the GPU memory if needed
-                // const bool is_host = ggml_backend_buffer_is_host(rope_factors->buffer);
-
-                // auto n_bytes = ggml_nbytes(rope_factors);
-                // uint8_t loaded_data[n_bytes];
-                // if (!is_host) {
-                //     ggml_backend_tensor_get(rope_factors, &loaded_data, 0, n_bytes);
-                // }
 
-                // if (!ggml_is_quantized(rope_factors->type)) {
-                //     uint8_t * data = is_host ? (uint8_t *) rope_factors->data : &loaded_data[0];
-                //     ggml_print_tensor(data, rope_factors->type, rope_factors->ne, rope_factors->nb, 64);
-                // }
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
                 Qcur = ggml_rope_ext(
                         ctx0, Qcur, inp_pos, rope_factors,
@@ -18876,7 +18775,7 @@ struct llm_build_apertus : public llm_graph_context {
             // feed-forward network with xIELU activation
            {
                cur = build_norm(ffn_inp,
-                        model.layers[il].ffn_norm, NULL,
+                        model.layers[il].ffn_norm, nullptr,
                         LLM_NORM_RMS, il);
                cb(cur, "ffn_norm", il);
 
@@ -18918,7 +18817,7 @@ struct llm_build_apertus : public llm_graph_context {
         cur = inpL;
 
         cur = build_norm(cur,
-                model.output_norm, NULL,
+                model.output_norm, nullptr,
                 LLM_NORM_RMS, -1);
 
         cb(cur, "result_norm", -1);
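
On the "proper norm application" half of the commit message (again an illustration, not code from this commit): ggml's RMS norm normalizes along the first tensor dimension, so applying attn_q_norm/attn_k_norm directly to the 3D (n_embd_head, n_head, n_tokens) view already normalizes each head vector; the removed reshape-to-2D round trip computed the same values, just with extra graph nodes. A NumPy sketch of that equivalence follows, with rms_norm and the toy shapes assumed purely for illustration.

```python
# Hypothetical sketch: per-head RMS norm on the 3D view equals RMS norm on the
# 2D (head_dim, n_head * n_tokens) flattening that the removed code built.
import numpy as np

def rms_norm(x: np.ndarray, weight: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    # Normalize each vector along axis 0 (the head_dim axis), then scale
    # elementwise by the norm weight, as build_norm does for LLM_NORM_RMS.
    rms = np.sqrt(np.mean(x * x, axis=0, keepdims=True) + eps)
    return x / rms * weight.reshape(-1, *([1] * (x.ndim - 1)))

head_dim, n_head, n_tokens = 4, 3, 5
rng = np.random.default_rng(0)
q3d    = rng.standard_normal((head_dim, n_head, n_tokens)).astype(np.float32)
w_norm = rng.standard_normal(head_dim).astype(np.float32)

direct = rms_norm(q3d, w_norm)                                            # new path: norm on the 3D view
via_2d = rms_norm(q3d.reshape(head_dim, -1), w_norm).reshape(q3d.shape)   # removed 2D round trip
print(np.allclose(direct, via_2d))  # True
```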
