@@ -1434,6 +1434,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
         } break;
     case LLM_ARCH_BAMBA:
+    case LLM_ARCH_GRANITE_MOE_HYBRID:
         {
             ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
             ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /* required */ false);
@@ -1475,6 +1476,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 // TODO: Add llm type label (not sure this is useful)
                 default: type = LLM_TYPE_UNKNOWN;
             }
+
+            // For Granite MoE Shared
+            ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
         } break;
     case LLM_ARCH_CHAMELEON:
         {
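Note on the optional key above: `llama_model_loader::get_key` with `required = false` leaves `hparams.n_ff_shexp` untouched when `LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH` is absent, so later code can treat `n_ff_shexp > 0` as the "shared expert present" signal. A minimal sketch of that pattern, assuming the field is zero-initialized in `llama_hparams` (which the gating in the `load_tensors` hunk below relies on):

```cpp
// Sketch only: reading the optional shared-expert width and gating on it.
// Assumes n_ff_shexp defaults to 0, so a missing GGUF key simply leaves the
// shared-expert path disabled.
uint32_t n_ff_shexp = 0;
const bool found = ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
                              n_ff_shexp, /* required */ false);
const bool has_shared_expert = found && n_ff_shexp > 0;
```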
@@ -3087,6 +3091,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 }
             } break;
         case LLM_ARCH_BAMBA:
+        case LLM_ARCH_GRANITE_MOE_HYBRID:
             {
                 // mamba2 Mixer SSM params
                 // NOTE: int64_t for tensor dimensions
@@ -3153,14 +3158,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     }
 
                     // feed forward (w/ optional biases)
-                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-                    layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-                    layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                    layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-                    layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    if (n_expert > 0) {
+                        // MoE FFN
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+                        // For Granite MoE Shared
+                        if (hparams.n_ff_shexp > 0) {
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+                        }
+                    } else {
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    }
                 }
             } break;
         case LLM_ARCH_XVERSE:
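For readers unfamiliar with the shared-expert layout: the `*_shexp` tensors created above typically back a dense gated FFN whose output is summed with the routed experts' output. Below is a minimal ggml-level sketch of that combination, under the assumption that GraniteMoeHybrid uses the same SiLU-gated (SwiGLU) formulation as the dense path; it is an illustration, not the exact code of the graph builder used for this architecture.

```cpp
#include "ggml.h"

// Sketch: dense shared-expert FFN added to the routed MoE output.
// `cur` is the post-ffn_norm layer input [n_embd, n_tokens]; `moe_out` is the
// routed experts' output; the three weights correspond to ffn_up_shexp,
// ffn_gate_shexp and ffn_down_shexp created in the hunk above.
static ggml_tensor * add_shared_expert(
        ggml_context * ctx,
        ggml_tensor  * cur,
        ggml_tensor  * moe_out,
        ggml_tensor  * up_shexp,    // [n_embd, n_ff_shexp]
        ggml_tensor  * gate_shexp,  // [n_embd, n_ff_shexp]
        ggml_tensor  * down_shexp)  // [n_ff_shexp, n_embd]
{
    ggml_tensor * up   = ggml_mul_mat(ctx, up_shexp, cur);                       // [n_ff_shexp, n_tokens]
    ggml_tensor * gate = ggml_silu(ctx, ggml_mul_mat(ctx, gate_shexp, cur));     // SiLU gate
    ggml_tensor * ffn  = ggml_mul_mat(ctx, down_shexp, ggml_mul(ctx, gate, up)); // [n_embd, n_tokens]
    return ggml_add(ctx, moe_out, ffn);                                          // shared + routed experts
}
```

The routed side itself would come from the usual MoE helper in the graph builder; the point here is only that the shared branch is a plain dense FFN added on top of it.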
@@ -4609,7 +4631,9 @@ void llama_model::print_info() const {
 
     if (arch == LLM_ARCH_MINICPM ||
         arch == LLM_ARCH_GRANITE ||
-        arch == LLM_ARCH_GRANITE_MOE) {
+        arch == LLM_ARCH_GRANITE_MOE ||
+        arch == LLM_ARCH_GRANITE_MOE_HYBRID ||
+        arch == LLM_ARCH_BAMBA) {
         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
         LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -13544,6 +13568,7 @@ llama_memory_i * llama_model::create_memory(
                         std::max((uint32_t) 1, cparams.n_seq_max));
             } break;
         case LLM_ARCH_BAMBA:
+        case LLM_ARCH_GRANITE_MOE_HYBRID:
             {
                 // make vectors of recurrent and non-recurrent layer indices
                 std::vector<size_t> recurrent_layers;
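The hunk above cuts off right after the `recurrent_layers` declaration. For orientation, the partitioning it begins looks roughly like the sketch below; the per-layer predicate name (`recurrent_layer`) is a placeholder and may differ in the actual patch.

```cpp
// Sketch: split layer indices into recurrent (mamba2) and attention groups
// before constructing the hybrid cache. The hparams helper name is hypothetical.
std::vector<size_t> recurrent_layers;
std::vector<size_t> non_recurrent_layers;
for (size_t il = 0; il < hparams.n_layer; ++il) {
    if (hparams.recurrent_layer(il)) {   // hypothetical per-layer predicate
        recurrent_layers.push_back(il);
    } else {
        non_recurrent_layers.push_back(il);
    }
}
```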
@@ -13861,6 +13886,12 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_granite>(*this, params, gf);
             } break;
+        case LLM_ARCH_GRANITE_MOE_HYBRID:
+            {
+                llm = std::make_unique<llm_build_hybrid_mamba>(*this, params, gf,
+                    /* use_mamba2 */ true,
+                    /* use_rope */ false);
+            } break;
         case LLM_ARCH_BAMBA:
             {
                 llm = std::make_unique<llm_build_hybrid_mamba>(
@@ -14016,6 +14047,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_GLM4:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_GRANITE_MOE_HYBRID:
         case LLM_ARCH_BAMBA:
         case LLM_ARCH_CHAMELEON:
         case LLM_ARCH_BAILINGMOE: