@@ -663,6 +663,29 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_AFMOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
+                ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+
+                // default to sigmoid gating if the key is not set
+                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+                }
+
+                switch (hparams.n_layer) {
+                    case 56: type = LLM_TYPE_1B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_DECI:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
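For context on what these keys control at inference time: `n_expert_used` experts are selected per token out of `n_expert` router scores, `expert_gating_func` picks the activation applied to the router logits (sigmoid by default here), `expert_weights_norm` decides whether the selected weights are renormalized to sum to one, and `expert_weights_scale` is a final multiplier. Below is a minimal standalone sketch of that routing step; it is illustrative only and not the code path llama.cpp actually runs for this architecture.

```cpp
#include <algorithm>
#include <cmath>
#include <numeric>
#include <utility>
#include <vector>

// pick the top-k experts for one token from raw router logits
// (sigmoid gating, optional weight renormalization, final scale)
static std::vector<std::pair<int, float>> route_token(
        const std::vector<float> & logits,  // one score per expert, size n_expert
        int   n_expert_used,                // assumed <= logits.size()
        bool  norm_weights,                 // hparams.expert_weights_norm
        float weights_scale) {              // hparams.expert_weights_scale
    std::vector<float> probs(logits.size());
    for (size_t i = 0; i < logits.size(); ++i) {
        probs[i] = 1.0f / (1.0f + std::exp(-logits[i]));  // sigmoid gating
    }

    // indices of the n_expert_used highest-probability experts
    std::vector<int> idx(logits.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
        [&](int a, int b) { return probs[a] > probs[b]; });

    std::vector<std::pair<int, float>> selected;
    float sum = 0.0f;
    for (int k = 0; k < n_expert_used; ++k) {
        selected.emplace_back(idx[k], probs[idx[k]]);
        sum += probs[idx[k]];
    }
    for (auto & ew : selected) {
        if (norm_weights && sum > 0.0f) {
            ew.second /= sum;   // renormalize selected weights to sum to 1
        }
        ew.second *= weights_scale;
    }
    return selected;
}
```

The sigmoid fallback in the sketch mirrors the default applied above when `expert_gating_func` is absent from the GGUF metadata.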
@@ -5519,6 +5542,70 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                 }
             } break;
+        case LLM_ARCH_AFMOE:
+            {
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                // output
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                // if output is NULL, init from the input tok embed
+                if (output == NULL) {
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                }
+
+                const int64_t n_ff_exp = hparams.n_ff_exp;
+                const int64_t n_expert_shared = hparams.n_expert_shared;
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = layers[i];
+
+                    // dual attention normalization
+                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                    layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
+
+                    // attention projections
+                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                    // Q/K normalization
+                    layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+                    layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+
+                    // attention gating
+                    layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+
+                    // dual ffn normalization
+                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                    layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+
+                    if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) {
+                        // MoE layers
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+                        // grouped expert weights
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+
+                        // shared expert
+                        if (n_expert_shared > 0) {
+                            const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
+                            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+                        }
+                    } else {
+                        // dense layers
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                    }
+                }
+            } break;
         case LLM_ARCH_ERNIE4_5:
         case LLM_ARCH_ERNIE4_5_MOE:
             {
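The routed experts are stored as 3D tensors with the expert index as the last dimension, and the shared expert reuses `n_ff_exp * n_expert_shared` as its width. As a rough sanity check of what those shapes cost, here is a small sketch of the per-layer weight counts they imply; the struct and helper names are made up for illustration, and norm weights plus the token embedding are ignored.

```cpp
#include <cstdint>

// illustrative dimensions; real values come from the GGUF metadata
struct afmoe_dims {
    int64_t n_embd, n_head, n_head_kv, n_embd_head;  // attention shape
    int64_t n_ff;                                    // dense FFN width
    int64_t n_ff_exp, n_expert, n_expert_shared;     // MoE widths
};

// wq + attn_gate (same shape), wk + wv (GQA), wo
static int64_t attn_weights(const afmoe_dims & d) {
    const int64_t q  = d.n_embd * d.n_embd_head * d.n_head;
    const int64_t kv = 2 * d.n_embd * d.n_embd_head * d.n_head_kv;
    const int64_t o  = d.n_embd_head * d.n_head * d.n_embd;
    return 2 * q + kv + o;
}

// ffn_gate + ffn_up + ffn_down of a leading dense layer
static int64_t dense_ffn_weights(const afmoe_dims & d) {
    return 3 * d.n_embd * d.n_ff;
}

// router + routed experts + shared expert of an MoE layer
static int64_t moe_ffn_weights(const afmoe_dims & d) {
    const int64_t router  = d.n_embd * d.n_expert;                          // ffn_gate_inp
    const int64_t experts = 3 * d.n_embd * d.n_ff_exp * d.n_expert;         // *_exps
    const int64_t shared  = 3 * d.n_embd * d.n_ff_exp * d.n_expert_shared;  // *_shexp
    return router + experts + shared;
}
```

Only `n_expert_used` of the routed experts are touched per token, which is why the active parameter count per forward pass is much smaller than `moe_ffn_weights` suggests.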
@@ -19578,6 +19665,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_arcee>(*this, params);
             } break;
+        case LLM_ARCH_AFMOE:
+            {
+                llm = std::make_unique<llm_build_afmoe>(*this, params);
+            } break;
         case LLM_ARCH_ERNIE4_5:
             {
                 llm = std::make_unique<llm_build_ernie4_5>(*this, params);
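For intuition about what a dedicated AFMoE graph builder has to express beyond the dense Arcee one: the tensor names loaded earlier (`attn_norm_2`, `ffn_post_norm`, `ATTN_GATE`) suggest sandwich-style normalization around each sublayer plus a sigmoid gate on the attention output. The sketch below is an assumption drawn from those names only, not the dataflow of the actual builder, and uses plain vectors rather than ggml graphs.

```cpp
#include <cmath>
#include <functional>
#include <vector>

using vec = std::vector<float>;

// RMS norm over one vector (learned scale omitted for brevity)
static vec rms_norm(const vec & x, float eps = 1e-6f) {
    double ss = 0.0;
    for (float v : x) ss += (double) v * v;
    const float inv = 1.0f / std::sqrt((float)(ss / x.size()) + eps);
    vec y(x.size());
    for (size_t i = 0; i < x.size(); ++i) y[i] = x[i] * inv;
    return y;
}

// sandwich-norm residual: x + post_norm(f(pre_norm(x)))
// (assumed pairing of attn_norm/attn_norm_2 and ffn_norm/ffn_post_norm)
static vec residual_block(const vec & x, const std::function<vec(const vec &)> & f) {
    vec y = rms_norm(f(rms_norm(x)));
    for (size_t i = 0; i < x.size(); ++i) y[i] += x[i];
    return y;
}

// elementwise sigmoid gate on the attention output
// (assumed role of the ATTN_GATE projection: gate_logits = Wg * x)
static vec gate_attn_output(const vec & attn_out, const vec & gate_logits) {
    vec y(attn_out.size());
    for (size_t i = 0; i < y.size(); ++i) {
        y[i] = attn_out[i] / (1.0f + std::exp(-gate_logits[i]));
    }
    return y;
}
```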
@@ -19776,6 +19867,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_NEO_BERT:
         case LLM_ARCH_SMOLLM3:
         case LLM_ARCH_ARCEE:
+        case LLM_ARCH_AFMOE:
         case LLM_ARCH_ERNIE4_5:
         case LLM_ARCH_ERNIE4_5_MOE:
             return LLAMA_ROPE_TYPE_NORM;
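`LLAMA_ROPE_TYPE_NORM` is the original RoPE layout, which rotates adjacent dimension pairs of each head vector (as opposed to the NEOX-style split across the two halves). A minimal sketch of that rotation for one head at one position is below; `theta_base` is an illustrative default, not a value read from this model.

```cpp
#include <cmath>
#include <vector>

// rotate consecutive dimension pairs (x[2i], x[2i+1]) of a single head vector
static void rope_norm(std::vector<float> & x, int pos, float theta_base = 10000.0f) {
    const int n_dims = (int) x.size();  // assumed even
    for (int i = 0; i < n_dims; i += 2) {
        const float theta = pos * std::pow(theta_base, -(float) i / n_dims);
        const float c = std::cos(theta);
        const float s = std::sin(theta);
        const float x0 = x[i];
        const float x1 = x[i + 1];
        x[i]     = x0 * c - x1 * s;
        x[i + 1] = x0 * s + x1 * c;
    }
}
```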