@@ -1447,6 +1447,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
             } break;
         case LLM_ARCH_BAMBA:
+        case LLM_ARCH_GRANITE_MOE_HYBRID:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /* required */ false);
@@ -1488,6 +1489,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     // TODO: Add llm type label (not sure this is useful)
                     default: type = LLM_TYPE_UNKNOWN;
                 }
+
+                // For Granite MoE Shared
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
             } break;
         case LLM_ARCH_CHAMELEON:
             {
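
Note on the optional key above: the shared-expert width is read with required = false, so when the GGUF metadata lacks it, hparams.n_ff_shexp keeps its zero default and later acts as a "no shared expert" sentinel during tensor creation. A minimal standalone sketch of that pattern, assuming a std::map stands in for the metadata and get_key_optional is a hypothetical helper rather than llama_model_loader's real API:

#include <cstdint>
#include <map>
#include <string>

struct hparams_sketch {
    uint32_t n_ff_shexp = 0; // 0 means "model has no shared expert FFN"
};

// Hypothetical stand-in for ml.get_key(key, dst, /* required */ false):
// an absent optional key leaves dst untouched instead of failing the load.
static bool get_key_optional(const std::map<std::string, uint32_t> & meta,
                             const std::string & key, uint32_t & dst) {
    const auto it = meta.find(key);
    if (it == meta.end()) {
        return false;
    }
    dst = it->second;
    return true;
}

int main() {
    // Key name is illustrative; the loader refers to it via the LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH enum.
    const std::map<std::string, uint32_t> meta; // metadata without the shared-expert key

    hparams_sketch hp;
    get_key_optional(meta, "expert_shared_feed_forward_length", hp.n_ff_shexp);

    const bool has_shared_expert = hp.n_ff_shexp > 0; // gates creation of the ffn_*_shexp tensors
    return has_shared_expert ? 1 : 0;                 // 0 here: a plain MoE without a shared expert
}
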
@@ -3101,6 +3105,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 }
             } break;
         case LLM_ARCH_BAMBA:
+        case LLM_ARCH_GRANITE_MOE_HYBRID:
             {
                 // mamba2 Mixer SSM params
                 // NOTE: int64_t for tensor dimensions
@@ -3167,14 +3172,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     }
 
                     // feed forward (w/ optional biases)
-                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                    layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-                    layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                    layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                    layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    if (n_expert > 0) {
+                        // MoE FFN
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+                        // For Granite MoE Shared
+                        if (hparams.n_ff_shexp > 0) {
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+                        }
+                    } else {
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    }
                 }
             } break;
         case LLM_ARCH_XVERSE:
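
For reference, a minimal standalone sketch (plain C++ with std::vector, not the actual ggml graph code) of what the ffn_gate_shexp / ffn_up_shexp / ffn_down_shexp tensors created above compute: a SwiGLU feed-forward applied to every token, whose result is added to the routed-experts output. The shapes mirror the dimensions above: gate/up are {n_embd, n_ff_shexp} and down is {n_ff_shexp, n_embd}.

#include <cmath>
#include <cstddef>
#include <vector>

using vec = std::vector<float>;
using mat = std::vector<vec>; // mat[row][col]; col indexes the input dimension

static vec matvec(const mat & W, const vec & x) {
    vec y(W.size(), 0.0f);
    for (size_t r = 0; r < W.size(); ++r) {
        for (size_t c = 0; c < x.size(); ++c) {
            y[r] += W[r][c] * x[c];
        }
    }
    return y;
}

static float silu(float v) { return v / (1.0f + std::exp(-v)); }

// Shared expert: down(silu(gate(x)) * up(x)), the same SwiGLU shape as a dense FFN.
static vec shared_expert_ffn(const mat & W_gate, const mat & W_up, const mat & W_down, const vec & x) {
    vec g = matvec(W_gate, x); // [n_ff_shexp]
    vec u = matvec(W_up,   x); // [n_ff_shexp]
    for (size_t i = 0; i < g.size(); ++i) {
        g[i] = silu(g[i]) * u[i];
    }
    return matvec(W_down, g);  // [n_embd]
}

int main() {
    const size_t n_embd = 4, n_ff_shexp = 8;
    const mat W_gate(n_ff_shexp, vec(n_embd, 0.01f));
    const mat W_up  (n_ff_shexp, vec(n_embd, 0.02f));
    const mat W_down(n_embd,     vec(n_ff_shexp, 0.03f));
    const vec x(n_embd, 1.0f);

    // In a Granite MoE Shared block the layer output is then (roughly):
    //   y = routed_moe_output(x) + shared_out
    // where the routed part uses ffn_gate_inp and the ffn_*_exps tensors.
    const vec shared_out = shared_expert_ffn(W_gate, W_up, W_down, x);
    return shared_out.empty() ? 1 : 0;
}
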
@@ -4623,7 +4645,9 @@ void llama_model::print_info() const {
 
     if (arch == LLM_ARCH_MINICPM ||
         arch == LLM_ARCH_GRANITE ||
-        arch == LLM_ARCH_GRANITE_MOE) {
+        arch == LLM_ARCH_GRANITE_MOE ||
+        arch == LLM_ARCH_GRANITE_MOE_HYBRID ||
+        arch == LLM_ARCH_BAMBA) {
         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
         LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -14010,6 +14034,12 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_granite>(*this, params, gf);
             } break;
+        case LLM_ARCH_GRANITE_MOE_HYBRID:
+            {
+                llm = std::make_unique<llm_build_hybrid_mamba>(*this, params, gf,
+                    /* use_mamba2 */ true,
+                    /* use_rope */ false);
+            } break;
         case LLM_ARCH_BAMBA:
             {
                 llm = std::make_unique<llm_build_hybrid_mamba>(
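
Granite MoE Hybrid reuses the same llm_build_hybrid_mamba graph builder as Bamba; only the constructor flags differ, and use_rope = false means its attention layers run without rotary position embeddings. A small sketch of that flag-driven selection (the names are illustrative, and the Bamba values are an assumption since its arguments are truncated in the hunk above):

enum class arch_sketch { bamba, granite_moe_hybrid };

struct hybrid_cfg {
    bool use_mamba2; // which SSM variant the recurrent layers use
    bool use_rope;   // whether attention layers apply rotary position embeddings
};

// Illustrative dispatch only: the Granite MoE Hybrid values match the case added
// above, while the Bamba values are assumed here.
static hybrid_cfg select_hybrid_cfg(arch_sketch arch) {
    switch (arch) {
        case arch_sketch::bamba:              return {/* use_mamba2 */ true, /* use_rope */ true};
        case arch_sketch::granite_moe_hybrid: return {/* use_mamba2 */ true, /* use_rope */ false};
    }
    return {true, false};
}

int main() {
    const hybrid_cfg cfg = select_hybrid_cfg(arch_sketch::granite_moe_hybrid);
    return cfg.use_rope ? 1 : 0; // returns 0: NoPE attention, as selected for this arch
}
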
@@ -14169,6 +14199,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_GLM4:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_GRANITE_MOE_HYBRID:
         case LLM_ARCH_BAMBA:
         case LLM_ARCH_CHAMELEON:
         case LLM_ARCH_BAILINGMOE: