@@ -165,6 +165,7 @@ enum llm_arch {
     LLM_ARCH_QWEN2MOE,
     LLM_ARCH_PHI2,
     LLM_ARCH_PHI3,
+    LLM_ARCH_PHIMOE,
     LLM_ARCH_PLAMO,
     LLM_ARCH_CODESHELL,
     LLM_ARCH_ORION,
@@ -219,6 +220,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN2MOE, "qwen2moe" },
     { LLM_ARCH_PHI2, "phi2" },
     { LLM_ARCH_PHI3, "phi3" },
+    { LLM_ARCH_PHIMOE, "phimoe" },
     { LLM_ARCH_PLAMO, "plamo" },
     { LLM_ARCH_CODESHELL, "codeshell" },
     { LLM_ARCH_ORION, "orion" },
@@ -955,6 +957,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PHIMOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+            { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_PLAMO,
         {
@@ -2428,6 +2451,7 @@ enum e_model {
     MODEL_8x7B,
     MODEL_8x22B,
     MODEL_16x12B,
+    MODEL_16x3_8B,
     MODEL_10B_128x3_66B,
     MODEL_57B_A14B,
     MODEL_27B,
@@ -5412,6 +5436,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_8x7B: return "8x7B";
         case MODEL_8x22B: return "8x22B";
         case MODEL_16x12B: return "16x12B";
+        case MODEL_16x3_8B: return "16x3.8B";
         case MODEL_10B_128x3_66B: return "10B+128x3.66B";
         case MODEL_57B_A14B: return "57B.A14B";
         case MODEL_27B: return "27B";
@@ -5817,6 +5842,15 @@ static void llm_load_hparams(
                     throw std::runtime_error("invalid value for sliding_window");
                 }
             } break;
+        case LLM_ARCH_PHIMOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_16x3_8B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_PLAMO:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -8325,6 +8359,50 @@ static bool llm_load_tensors(
                         layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
                     }
                 } break;
+            case LLM_ARCH_PHIMOE:
+                {
+                    const int64_t n_embd_head = n_embd / n_head;
+
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                    // output
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                    model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
+                    model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), { n_vocab }, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0);
+
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        if (layer.wqkv == nullptr) {
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
+
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
+
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
+                        }
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0);
+
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
+                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+                        layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                    }
+                } break;
             case LLM_ARCH_PLAMO:
                 {
                     model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -16680,6 +16758,7 @@ static struct ggml_cgraph * llama_build_graph(
                 result = llm.build_phi2();
             } break;
         case LLM_ARCH_PHI3:
+        case LLM_ARCH_PHIMOE:
             {
                 result = llm.build_phi3();
             } break;
@@ -20012,6 +20091,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_OLMOE:
         case LLM_ARCH_PHI2:
         case LLM_ARCH_PHI3:
+        case LLM_ARCH_PHIMOE:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_GEMMA2:
         case LLM_ARCH_STARCODER2: