@@ -5364,6 +5364,10 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "exaone") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
+            } else if (
+                tokenizer_pre == "gpt-4o") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT4O;
+                vocab.tokenizer_clean_spaces = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
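For the `gpt-4o` branch to compile, a matching constant has to exist in the `llama_vocab_pre_type` enum on the header side, which this hunk does not show. A minimal sketch of that companion change, with the numbering left implicit since it depends on the entries already present in this tree:

```cpp
// llama.h (sketch): the new pre-tokenizer type appended to the enum.
// Placement right after EXAONE is an assumption; the real header fixes
// every numeric value explicitly.
enum llama_vocab_pre_type {
    LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
    // ... existing entries ...
    LLAMA_VOCAB_PRE_TYPE_EXAONE,
    LLAMA_VOCAB_PRE_TYPE_GPT4O,
};
```

Setting `vocab.tokenizer_clean_spaces = false` opts this tokenizer out of detokenization-time space cleanup, corresponding to `clean_up_tokenization_spaces = false` in the upstream Hugging Face tokenizer config.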
@@ -5946,6 +5950,7 @@ static bool llm_load_tensors(
     const int64_t n_embd_gqa    = n_embd_v_gqa;
     const int64_t n_vocab       = hparams.n_vocab;
     const int64_t n_vocab_type  = hparams.n_vocab_type;
+    const int64_t n_rot         = hparams.n_rot;
     const int64_t n_expert      = hparams.n_expert;
     const int64_t n_expert_used = hparams.n_expert_used;
     const int64_t n_ctx_train   = hparams.n_ctx_train;
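`n_rot` is hoisted into a local here alongside the other `hparams` aliases so that the PHI3 rope-factor hunk further down can size its tensors from it.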
@@ -6661,7 +6666,12 @@ static bool llm_load_tensors(
                     // output
                     {
                         model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                        // if output is NULL, init from the input tok embed
+                        if (model.output == NULL) {
+                            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                        }
                     }

                     for (int i = 0; i < n_layer; ++i) {
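The `TENSOR_NOT_REQUIRED` / `TENSOR_DUPLICATED` pair is the loader's standard tied-embeddings fallback, already used by several other architectures: with `TENSOR_NOT_REQUIRED`, `create_tensor()` returns NULL instead of failing when `output.weight` is absent from the GGUF, and the `TENSOR_DUPLICATED` re-load of `token_embd.weight` marks the LM head as an alias of the input embedding so its memory is not accounted for twice. This is what lets Phi-4, whose checkpoint ties the output head to the input embedding, load through this arch; Phi-3 files that ship a separate output matrix take the first path unchanged.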
@@ -6680,8 +6690,8 @@ static bool llm_load_tensors(
                         layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
                         layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, 2 * n_ff});

-                        layer.rope_long  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight"), {n_embd_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-                        layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), {n_embd_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.rope_long  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
                     }
                 } break;
             case LLM_ARCH_PLAMO:
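The switch from `n_embd_head/2` to `n_rot/2` matters for partial-rotary models: RoPE rotates dimensions in pairs, so the LongRoPE long/short factor tensors hold one scale per rotated pair. When the full head is rotated (`n_rot == n_embd_head`, as in Phi-3) the two expressions coincide; when only a fraction of each head's dimensions is rotated, as with Phi-4, the converted GGUF carries `n_rot/2` factors and the old sizing would trip the shape check inside `create_tensor()`.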
@@ -10869,7 +10879,13 @@ struct llm_build_context {
         struct ggml_tensor * inp_pos = build_inp_pos();

         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
+        struct ggml_tensor * KQ_mask = nullptr;
+        if (hparams.n_swa == 0) {
+            // Phi-4 doesn't use sliding window attention
+            KQ_mask = build_inp_KQ_mask();
+        } else {
+            KQ_mask = build_inp_KQ_mask_swa();
+        }

         for (int il = 0; il < n_layer; ++il) {
             auto residual = inpL;
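The branch keys off `hparams.n_swa`: Phi-3 checkpoints carry a sliding-window size, while Phi-4 sets it to zero, so the graph has to fall back to the ordinary causal mask. A self-contained sketch, not llama.cpp's actual mask construction, of what the two masks admit:

```cpp
// Sketch: which KV positions a query at q_pos may attend to, for a plain
// causal mask vs. a sliding-window (SWA) mask of width n_swa. n_swa == 0
// is treated as "no window", matching the branch above.
#include <cstdio>

static bool allowed(int q_pos, int kv_pos, int n_swa) {
    if (kv_pos > q_pos) return false;                        // causal: no future
    if (n_swa > 0 && q_pos - kv_pos >= n_swa) return false;  // outside window
    return true;
}

int main() {
    // query at position 8: causal sees 0..8, a width-4 window sees 5..8
    for (int kv = 0; kv <= 8; ++kv) {
        printf("kv=%d causal=%d swa4=%d\n", kv, allowed(8, kv, 0), allowed(8, kv, 4));
    }
    return 0;
}
```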
@@ -10927,7 +10943,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }

             if (il == n_layer - 1) {
@@ -19325,6 +19341,14 @@ static int32_t llama_chat_apply_template_internal(
             if (add_ass) {
                 ss << "<|assistant|>\n";
             }
+        } else if (tmpl == "phi4") {
+            // chatml template
+            for (auto message : chat) {
+                ss << "<|im_start|>" << message->role << "<|im_sep|>" << message->content << "<|im_end|>";
+            }
+            if (add_ass) {
+                ss << "<|im_start|>assistant<|im_sep|>";
+            }
         } else if (tmpl == "zephyr" || tmpl_contains("<|user|>")) {
             // zephyr template
             for (auto message : chat) {
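How the new branch renders end to end, via the public API. A hedged usage sketch: the `llama_chat_apply_template` signature shown matches this era of the tree (later revisions drop the `model` parameter, so check llama.h in your checkout), and passing a non-NULL `tmpl` means the model is never consulted:

```cpp
#include "llama.h"
#include <cstdio>

int main() {
    llama_chat_message chat[] = {
        {"system", "You are a helpful assistant."},
        {"user",   "Hello!"},
    };
    char buf[512];
    // model may be NULL because the template name is passed explicitly
    const int32_t n = llama_chat_apply_template(NULL, "phi4", chat, 2,
                                                /*add_ass=*/true, buf, sizeof(buf));
    if (n > 0 && n < (int32_t) sizeof(buf)) {
        printf("%.*s\n", n, buf);
    }
    return 0;
}
```

With `add_ass == true` this produces a single line with no separators between turns:

`<|im_start|>system<|im_sep|>You are a helpful assistant.<|im_end|><|im_start|>user<|im_sep|>Hello!<|im_end|><|im_start|>assistant<|im_sep|>`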