@@ -18690,78 +18690,6 @@ static float get_scalar_f32_val(const ggml_tensor *t) {
     return onef;
 }
 
-static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
-    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
-    float v;
-    if (type == GGML_TYPE_F16) {
-        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
-    } else if (type == GGML_TYPE_F32) {
-        v = *(float *) &data[i];
-    } else if (type == GGML_TYPE_I64) {
-        v = (float) *(int64_t *) &data[i];
-    } else if (type == GGML_TYPE_I32) {
-        v = (float) *(int32_t *) &data[i];
-    } else if (type == GGML_TYPE_I16) {
-        v = (float) *(int16_t *) &data[i];
-    } else if (type == GGML_TYPE_I8) {
-        v = (float) *(int8_t *) &data[i];
-    } else {
-        GGML_ABORT("fatal error");
-    }
-    return v;
-}
-
-static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
-    GGML_ASSERT(n > 0);
-    float sum = 0;
-    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
-        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
-            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
-                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
-                    const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
-                    sum += v;
-                }
-            }
-        }
-    }
-    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
-        LLAMA_LOG_DEBUG(" [\n");
-        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
-            if (i2 == n && ne[2] > 2*n) {
-                LLAMA_LOG_DEBUG(" ..., \n");
-                i2 = ne[2] - n;
-            }
-            LLAMA_LOG_DEBUG(" [\n");
-            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
-                if (i1 == n && ne[1] > 2*n) {
-                    LLAMA_LOG_DEBUG(" ..., \n");
-                    i1 = ne[1] - n;
-                }
-                LLAMA_LOG_DEBUG(" [");
-                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
-                    if (i0 == n && ne[0] > 2*n) {
-                        LLAMA_LOG_DEBUG("..., ");
-                        i0 = ne[0] - n;
-                    }
-                    const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
-                    LLAMA_LOG_DEBUG("%12.4f", v);
-                    if (i0 < ne[0] - 1) LLAMA_LOG_DEBUG(", ");
-                }
-                LLAMA_LOG_DEBUG("],\n");
-            }
-            LLAMA_LOG_DEBUG(" ],\n");
-        }
-        LLAMA_LOG_DEBUG(" ]\n");
-        LLAMA_LOG_DEBUG(" sum = %f\n", sum);
-    }
-
-    // TODO: make this abort configurable/optional?
-    if (std::isnan(sum)) {
-        LLAMA_LOG_ERROR("encountered NaN - aborting\n");
-        exit(0);
-    }
-}
-
 // Apertus model graph builder with xIELU activation
 struct llm_build_apertus : public llm_graph_context {
     llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
@@ -18785,9 +18713,8 @@ struct llm_build_apertus : public llm_graph_context {
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
-            // norm
             cur = build_norm(inpL,
-                    model.layers[il].attn_norm, NULL ,
+                    model.layers[il].attn_norm, nullptr ,
                     LLM_NORM_RMS, il);
             cb(cur, "attn_norm", il);
 
@@ -18806,42 +18733,14 @@ struct llm_build_apertus : public llm_graph_context {
             cb(Vcur, "Vcur", il);
 
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
-            ggml_tensor * Q2d = ggml_reshape_2d(ctx0, Qcur, n_embd_head, n_head * n_tokens);
-            ggml_tensor * K2d = ggml_reshape_2d(ctx0, Kcur, n_embd_head, n_head_kv * n_tokens);
-
-            cb(Q2d, "Q2D", il);
-            cb(K2d, "K2D", il);
-
-            // apply existing rms-norm which was originally written for 2D
-            Q2d = build_norm(Q2d, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
-            K2d = build_norm(K2d, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
-
-            cb(Q2d, "Q2D_normed", il);
-            cb(K2d, "K2D_normed", il);
-
-            // reshape back to 3D
-            Qcur = ggml_reshape_3d(ctx0, Q2d, n_embd_head, n_head, n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, K2d, n_embd_head, n_head_kv, n_tokens);
-
+            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
             cb(Qcur, "Qcur_normed", il);
+
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
             cb(Kcur, "Kcur_normed", il);
-
-            // // copy the data from the GPU memory if needed
-            // const bool is_host = ggml_backend_buffer_is_host(rope_factors->buffer);
-
-            // auto n_bytes = ggml_nbytes(rope_factors);
-            // uint8_t loaded_data[n_bytes];
-            // if (!is_host) {
-            //     ggml_backend_tensor_get(rope_factors, &loaded_data, 0, n_bytes);
-            // }
 
-            // if (!ggml_is_quantized(rope_factors->type)) {
-            //     uint8_t * data = is_host ? (uint8_t *) rope_factors->data : &loaded_data[0];
-            //     ggml_print_tensor(data, rope_factors->type, rope_factors->ne, rope_factors->nb, 64);
-            // }
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
             Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, rope_factors,
@@ -18876,7 +18775,7 @@ struct llm_build_apertus : public llm_graph_context {
             // feed-forward network with xIELU activation
             {
                 cur = build_norm(ffn_inp,
-                        model.layers[il].ffn_norm, NULL ,
+                        model.layers[il].ffn_norm, nullptr ,
                         LLM_NORM_RMS, il);
                 cb(cur, "ffn_norm", il);
 
@@ -18918,7 +18817,7 @@ struct llm_build_apertus : public llm_graph_context {
         cur = inpL;
 
         cur = build_norm(cur,
-                model.output_norm, NULL ,
+                model.output_norm, nullptr ,
                 LLM_NORM_RMS, -1);
 
         cb(cur, "result_norm", -1);