@@ -508,9 +508,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         llm_arch_is_recurrent(ml.get_arch()));
 
     std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
-
     std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
 
+    std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0);
+    std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0);
+    std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0);
+    std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0);
+
     ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
     ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
 
@@ -1948,7 +1952,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         case LLM_ARCH_APERTUS:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                hparams.n_ctx_orig_yarn = 8192;
+                ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer);
+                ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer);
+                ml.get_key_or_arr(LLM_KV_XIELU_BETA, hparams.xielu_beta, hparams.n_layer);
+                ml.get_key_or_arr(LLM_KV_XIELU_EPS, hparams.xielu_eps, hparams.n_layer);
+
                 switch (hparams.n_layer) {
                     case 32: type = LLM_TYPE_8B; break;
                     default: type = LLM_TYPE_UNKNOWN;
@@ -5769,12 +5777,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
                     layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
                     layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED);
-
-                    // xIELU parameters for Apertus
-                    layer.ffn_act_alpha_n = create_tensor(tn(LLM_TENSOR_FFN_ACT_ALPHA_N, i), { 1 }, 0);
-                    layer.ffn_act_alpha_p = create_tensor(tn(LLM_TENSOR_FFN_ACT_ALPHA_P, i), { 1 }, 0);
-                    layer.ffn_act_beta = create_tensor(tn(LLM_TENSOR_FFN_ACT_BETA, i), { 1 }, 0);
-                    layer.ffn_act_eps = create_tensor(tn(LLM_TENSOR_FFN_ACT_EPS, i), { 1 }, 0);
                 }
             } break;
         default:
@@ -18727,17 +18729,10 @@ struct llm_build_apertus : public llm_graph_context {
             ggml_tensor * up = build_lora_mm(model.layers[il].ffn_up, cur);
             cb(up, "ffn_up", il);
 
-            // xIELU activation
-            // Get the xIELU parameters from the model layers
-            ggml_tensor * alpha_n = model.layers[il].ffn_act_alpha_n;
-            ggml_tensor * alpha_p = model.layers[il].ffn_act_alpha_p;
-            ggml_tensor * beta = model.layers[il].ffn_act_beta;
-            ggml_tensor * eps = model.layers[il].ffn_act_eps;
-
-            float alpha_n_val = get_scalar_f32_val(alpha_n);
-            float alpha_p_val = get_scalar_f32_val(alpha_p);
-            float beta_val = get_scalar_f32_val(beta);
-            float eps_val = get_scalar_f32_val(eps);
+            float alpha_n_val = hparams.xielu_alpha_n[il];
+            float alpha_p_val = hparams.xielu_alpha_p[il];
+            float beta_val = hparams.xielu_beta[il];
+            float eps_val = hparams.xielu_eps[il];
 
             // Apply xIELU activation
             ggml_tensor * activated = ggml_xielu(ctx0, up, alpha_n_val, alpha_p_val, beta_val, eps_val);
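
Taken together, the hunks move the per-layer xIELU parameters out of 1-element model tensors and into hparams: the arrays are zero-initialised with std::fill, optionally overridden from GGUF metadata via ml.get_key_or_arr, and indexed by layer index when the graph is built. The following is a minimal, self-contained C++17 sketch of that "scalar-or-array" loading pattern for readers unfamiliar with the loader; MetaValue, load_key_or_arr and MAX_LAYERS are illustrative names, not the llama.cpp API, and the broadcast-a-scalar behaviour is an assumption about how get_key_or_arr treats a single value, not a statement of its exact semantics.

// Sketch only: illustrates the fill-defaults / scalar-or-array-override pattern
// used in the diff above. Names and types are hypothetical, not llama.cpp's.
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdio>
#include <optional>
#include <variant>
#include <vector>

constexpr std::size_t MAX_LAYERS = 512;

struct MetaValue {
    // a metadata key may carry a single float or one float per layer
    std::variant<float, std::vector<float>> v;
};

// Broadcast a scalar, or copy an array, into the first n_layer slots of dst.
static bool load_key_or_arr(const std::optional<MetaValue> & kv,
                            std::array<float, MAX_LAYERS> & dst,
                            std::size_t n_layer) {
    if (!kv) {
        return false; // key absent: keep the zero defaults set by std::fill
    }
    if (const auto * s = std::get_if<float>(&kv->v)) {
        std::fill(dst.begin(), dst.begin() + n_layer, *s);
    } else {
        const auto & arr = std::get<std::vector<float>>(kv->v);
        std::copy(arr.begin(), arr.begin() + std::min(arr.size(), n_layer), dst.begin());
    }
    return true;
}

int main() {
    std::array<float, MAX_LAYERS> xielu_alpha_n{}; // zero defaults, as in the diff
    const std::size_t n_layer = 4;

    // pretend the file stored one alpha_n value per layer
    MetaValue kv{std::vector<float>{0.80f, 0.81f, 0.79f, 0.80f}};
    load_key_or_arr(kv, xielu_alpha_n, n_layer);

    for (std::size_t il = 0; il < n_layer; ++il) {
        std::printf("layer %zu: alpha_n = %.2f\n", il, xielu_alpha_n[il]);
    }
    return 0;
}

The appeal of the metadata route, as the diff suggests, is that scalar hyperparameters no longer have to be materialised as tiny per-layer tensors and read back with a helper such as get_scalar_f32_val; the graph builder can take plain floats straight from hparams.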