@@ -1460,6 +1460,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
             } break;
         case LLM_ARCH_BAMBA:
+        case LLM_ARCH_GRANITE_MOE_HYBRID:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /* required */ false);
@@ -1501,6 +1502,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     // TODO: Add llm type label (not sure this is useful)
                     default: type = LLM_TYPE_UNKNOWN;
                 }
+
+                // For Granite MoE Shared
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
             } break;
         case LLM_ARCH_CHAMELEON:
             {
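
Note: the shared-expert key is read as optional, so `hparams.n_ff_shexp` stays 0 for checkpoints that do not define it, and the tensor loader below keys off that value to decide whether shared-expert weights are expected. As a rough, standalone illustration of where this metadata lives, the sketch below reads the same value through the public gguf API; the `granitemoehybrid` prefix and the exact key string are assumptions for the example, not taken from this diff.

```cpp
// Standalone sketch (not part of the PR): inspect the shared-expert FFN width that the
// loader above consumes via LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH.
#include "gguf.h"

#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
        return 1;
    }

    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ nullptr };
    struct gguf_context * ctx = gguf_init_from_file(argv[1], params);
    if (!ctx) {
        fprintf(stderr, "failed to open %s\n", argv[1]);
        return 1;
    }

    // assumed key layout: "<arch>.expert_shared_feed_forward_length"
    const char * key = "granitemoehybrid.expert_shared_feed_forward_length";
    const int64_t id = gguf_find_key(ctx, key);
    if (id >= 0) {
        printf("n_ff_shexp = %u (shared-expert tensors expected)\n", gguf_get_val_u32(ctx, id));
    } else {
        printf("%s not set -> hparams.n_ff_shexp stays 0, shared-expert tensors are skipped\n", key);
    }

    gguf_free(ctx);
    return 0;
}
```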
@@ -3113,6 +3117,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 }
             } break;
         case LLM_ARCH_BAMBA:
+        case LLM_ARCH_GRANITE_MOE_HYBRID:
             {
                 // mamba2 Mixer SSM params
                 // NOTE: int64_t for tensor dimensions
@@ -3179,14 +3184,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     }

                     // feed forward (w/ optional biases)
-                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                    layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-                    layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                    layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                    layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    if (n_expert > 0) {
+                        // MoE FFN
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+                        // For Granite MoE Shared
+                        if (hparams.n_ff_shexp > 0) {
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+                        }
+                    } else {
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    }
                 }
             } break;
         case LLM_ARCH_XVERSE:
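
The loader changes above only bring in the extra weights; the natural follow-up is how the shared expert joins the forward pass. Below is a minimal, self-contained ggml sketch of the usual pattern, a SwiGLU shared branch whose output is summed with the routed-expert result. It mirrors the tensor names from the loader, but it is an assumed illustration, not the graph-builder code from this PR.

```cpp
// Standalone sketch (assumed behavior): shared-expert SwiGLU branch built from the
// three "shexp" tensors created above, added on top of the routed-expert output.
#include "ggml.h"

#include <cstdio>

int main() {
    const int64_t n_embd = 8, n_ff_shexp = 16, n_tokens = 4;

    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16u*1024*1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ true, // only build the op graph; no data is needed for a shape check
    };
    struct ggml_context * ctx = ggml_init(ip);

    // stand-ins for the loaded weights and the layer input
    struct ggml_tensor * cur            = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,     n_tokens);
    struct ggml_tensor * moe_out        = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,     n_tokens); // routed-expert result
    struct ggml_tensor * ffn_gate_shexp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,     n_ff_shexp);
    struct ggml_tensor * ffn_up_shexp   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,     n_ff_shexp);
    struct ggml_tensor * ffn_down_shexp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff_shexp, n_embd);

    // SwiGLU shared branch: down( silu(gate(x)) * up(x) )
    struct ggml_tensor * gate  = ggml_silu(ctx, ggml_mul_mat(ctx, ffn_gate_shexp, cur));     // [n_ff_shexp, n_tokens]
    struct ggml_tensor * up    = ggml_mul_mat(ctx, ffn_up_shexp, cur);                       // [n_ff_shexp, n_tokens]
    struct ggml_tensor * shexp = ggml_mul_mat(ctx, ffn_down_shexp, ggml_mul(ctx, gate, up)); // [n_embd, n_tokens]

    // the shared branch is summed with the routed-expert output
    struct ggml_tensor * out = ggml_add(ctx, moe_out, shexp);

    printf("ffn output: [%lld, %lld]\n", (long long) out->ne[0], (long long) out->ne[1]);

    ggml_free(ctx);
    return 0;
}
```

Keeping the shared branch outside the top-k routing means its cost is fixed per token, independent of how many routed experts are active.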
@@ -4635,7 +4657,9 @@ void llama_model::print_info() const {

     if (arch == LLM_ARCH_MINICPM ||
         arch == LLM_ARCH_GRANITE ||
-        arch == LLM_ARCH_GRANITE_MOE) {
+        arch == LLM_ARCH_GRANITE_MOE ||
+        arch == LLM_ARCH_GRANITE_MOE_HYBRID ||
+        arch == LLM_ARCH_BAMBA) {
         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
         LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
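
For context, these are the Granite-family multipliers: embeddings are scaled once on input, each residual branch is scaled before being added back, the attention scale replaces the usual 1/sqrt(head_dim) factor, and logits are divided by the logit scale. The toy example below walks those four applications through scalar stand-ins; the numeric values are invented, and the hybrid models are assumed to follow the same convention as Granite/Granite MoE.

```cpp
// Toy, scalar walk-through of the four scale factors printed above.
// All numeric values are invented; real models carry them in their GGUF metadata.
#include <cstdio>

int main() {
    const float f_embedding_scale = 12.0f;
    const float f_residual_scale  = 0.22f;
    const float f_attention_scale = 0.0078125f;
    const float f_logit_scale     = 8.0f;

    float x = 1.0f * f_embedding_scale;                 // token embedding scaled once on input

    const float q_dot_k = 4.0f;
    const float score   = q_dot_k * f_attention_scale;  // used as the KQ scale instead of 1/sqrt(head_dim)

    const float attn_out = 0.5f, ffn_out = 0.5f;
    x += attn_out * f_residual_scale;                   // each residual branch scaled before the add
    x += ffn_out  * f_residual_scale;

    const float logit = x / f_logit_scale;              // final logits divided by the logit scale

    printf("score = %.6f, hidden = %.4f, logit = %.4f\n", score, x, logit);
    return 0;
}
```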
@@ -14035,6 +14059,12 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_granite>(*this, params, gf);
             } break;
+        case LLM_ARCH_GRANITE_MOE_HYBRID:
+            {
+                llm = std::make_unique<llm_build_hybrid_mamba>(*this, params, gf,
+                    /* use_mamba2 */ true,
+                    /* use_rope */ false);
+            } break;
         case LLM_ARCH_BAMBA:
             {
                 llm = std::make_unique<llm_build_hybrid_mamba>(
@@ -14190,6 +14220,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_GLM4:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_GRANITE_MOE_HYBRID:
         case LLM_ARCH_BAMBA:
         case LLM_ARCH_CHAMELEON:
         case LLM_ARCH_BAILINGMOE: