@@ -1436,6 +1436,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
             } break;
         case LLM_ARCH_BAMBA:
+        case LLM_ARCH_GRANITE_MOE_HYBRID:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /* required */ false);
@@ -1477,6 +1478,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     // TODO: Add llm type label (not sure this is useful)
                     default: type = LLM_TYPE_UNKNOWN;
                 }
+
+                // For Granite MoE Shared
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
             } break;
         case LLM_ARCH_CHAMELEON:
             {
@@ -3093,6 +3097,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 }
             } break;
         case LLM_ARCH_BAMBA:
+        case LLM_ARCH_GRANITE_MOE_HYBRID:
             {
                 // mamba2 Mixer SSM params
                 // NOTE: int64_t for tensor dimensions
@@ -3159,14 +3164,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     }
 
                     // feed forward (w/ optional biases)
-                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                    layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-                    layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                    layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                    layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    if (n_expert > 0) {
+                        // MoE FFN
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+                        // For Granite MoE Shared
+                        if (hparams.n_ff_shexp > 0) {
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+                        }
+                    } else {
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    }
                 }
             } break;
         case LLM_ARCH_XVERSE:
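
Note (not part of the diff): the *_exps tensors created above hold the routed experts, while the *_shexp tensors hold an always-on shared expert whose intermediate width is hparams.n_ff_shexp. Below is a minimal standalone sketch of how such tensors combine at inference time; the helper names, storage layout, and SwiGLU/top-k details are illustrative assumptions, not llama.cpp's actual graph-building code.

// Illustrative sketch only (assumed names and layout, not llama.cpp code):
// combine the top-k routed experts with the always-on shared expert.
#include <cmath>
#include <utility>
#include <vector>

struct ExpertWeights {
    std::vector<float> gate, up;   // row-major [n_ff x n_embd]
    std::vector<float> down;       // row-major [n_embd x n_ff]
};

// SwiGLU feed-forward: down( SiLU(gate*x) * (up*x) )
static std::vector<float> swiglu_ffn(const std::vector<float> & x,
                                     const ExpertWeights & w,
                                     int n_embd, int n_ff) {
    std::vector<float> h(n_ff), y(n_embd, 0.0f);
    for (int j = 0; j < n_ff; ++j) {
        float g = 0.0f, u = 0.0f;
        for (int k = 0; k < n_embd; ++k) {
            g += w.gate[j*n_embd + k] * x[k];
            u += w.up  [j*n_embd + k] * x[k];
        }
        h[j] = (g / (1.0f + std::exp(-g))) * u;  // SiLU(g) * u
    }
    for (int k = 0; k < n_embd; ++k) {
        for (int j = 0; j < n_ff; ++j) {
            y[k] += w.down[k*n_ff + j] * h[j];
        }
    }
    return y;
}

// topk holds (expert id, normalized router weight) pairs; the shared expert
// (width n_ff_shexp) is applied unconditionally and added to the routed sum.
std::vector<float> moe_shared_ffn(const std::vector<float> & x,
                                  const std::vector<ExpertWeights> & experts,
                                  const ExpertWeights & shexp,
                                  const std::vector<std::pair<int, float>> & topk,
                                  int n_embd, int n_ff, int n_ff_shexp) {
    std::vector<float> out = swiglu_ffn(x, shexp, n_embd, n_ff_shexp);
    for (const auto & [id, weight] : topk) {
        const std::vector<float> ye = swiglu_ffn(x, experts[id], n_embd, n_ff);
        for (int k = 0; k < n_embd; ++k) {
            out[k] += weight * ye[k];
        }
    }
    return out;
}

The router projection itself corresponds to ffn_gate_inp ({n_embd, n_expert}) above; producing the top-k (id, weight) pairs from its scores is left out to keep the sketch short.
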
@@ -4615,7 +4637,9 @@ void llama_model::print_info() const {
 
     if (arch == LLM_ARCH_MINICPM ||
         arch == LLM_ARCH_GRANITE ||
-        arch == LLM_ARCH_GRANITE_MOE) {
+        arch == LLM_ARCH_GRANITE_MOE ||
+        arch == LLM_ARCH_GRANITE_MOE_HYBRID ||
+        arch == LLM_ARCH_BAMBA) {
         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
         LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -14046,6 +14070,12 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_granite>(*this, params, gf);
             } break;
+        case LLM_ARCH_GRANITE_MOE_HYBRID:
+            {
+                llm = std::make_unique<llm_build_hybrid_mamba>(*this, params, gf,
+                    /* use_mamba2 */ true,
+                    /* use_rope */ false);
+            } break;
         case LLM_ARCH_BAMBA:
             {
                 llm = std::make_unique<llm_build_hybrid_mamba>(
@@ -14201,6 +14231,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_GLM4:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_GRANITE_MOE_HYBRID:
         case LLM_ARCH_BAMBA:
         case LLM_ARCH_CHAMELEON:
         case LLM_ARCH_BAILINGMOE: