Commit bd1bbe9

Merge pull request #744 from cjpais/phi4-support
Add phi4 support
2 parents e6daab0 + a4ece76, commit bd1bbe9

File tree: 3 files changed (+37, -5 lines)


llama.cpp/llama-vocab.cpp

Lines changed: 7 additions & 0 deletions
@@ -438,6 +438,13 @@ struct llm_tokenizer_bpe {
                     "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_GPT4O:
+                // original regex from tokenizer.json
+                // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                regex_exprs = {
+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
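
The new case registers the split pattern used by the GPT-4o/Phi-4 tokenizer. The original tokenizer.json regex is kept in the comment; the string actually installed appears to be an adaptation with lookaheads and expanded contraction alternatives so it runs on regex engines without Unicode-property classes or `(?i:...)` groups, the same approach used for the llama-3 pattern just above. For intuition only, here is a toy, ASCII-only stand-in run through std::regex; the simplified pattern, the sample text, and the program are illustrative and are not part of this commit.

// Toy illustration (not part of this commit): ASCII-only stand-in for the
// GPT-4o split pattern. It shows the intended chunking: words with optional
// contraction suffixes, digit runs capped at 3, punctuation runs, whitespace.
#include <iostream>
#include <regex>
#include <string>

int main() {
    const std::regex re(R"( ?[A-Za-z]+(?:'(?:s|t|re|ve|m|ll|d))?|[0-9]{1,3}| ?[^A-Za-z0-9\s]+|\s+)");
    const std::string text = "Phi-4 doesn't use SWA, 128000 tokens!";
    for (auto it = std::sregex_iterator(text.begin(), text.end(), re); it != std::sregex_iterator(); ++it) {
        std::cout << "[" << it->str() << "]";
    }
    std::cout << "\n";
    // prints: [Phi][-][4][ doesn't][ use][ SWA][,][ ][128][000][ tokens][!]
    return 0;
}

Note how number runs are cut into at most three digits, matching the \p{N}{1,3} piece of the real pattern.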

llama.cpp/llama.cpp

Lines changed: 29 additions & 5 deletions
@@ -5364,6 +5364,10 @@ static void llm_load_vocab(
         } else if (
             tokenizer_pre == "exaone") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
+        } else if (
+            tokenizer_pre == "gpt-4o") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT4O;
+            vocab.tokenizer_clean_spaces = false;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
@@ -5946,6 +5950,7 @@ static bool llm_load_tensors(
     const int64_t n_embd_gqa    = n_embd_v_gqa;
     const int64_t n_vocab       = hparams.n_vocab;
     const int64_t n_vocab_type  = hparams.n_vocab_type;
+    const int64_t n_rot         = hparams.n_rot;
     const int64_t n_expert      = hparams.n_expert;
     const int64_t n_expert_used = hparams.n_expert_used;
     const int64_t n_ctx_train   = hparams.n_ctx_train;
@@ -6661,7 +6666,12 @@
                 // output
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
-                    model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab });
+                    model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, llama_model_loader::TENSOR_DUPLICATED);
+                    }
                 }

                 for (int i = 0; i < n_layer; ++i) {
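
The output head is now optional: if the GGUF carries no separate output (lm_head) tensor, the loader falls back to the token embedding matrix, i.e. tied embeddings. Below is a minimal numeric sketch of what tying means; the sizes and values are made up for illustration and do not come from any Phi-4 checkpoint.

// Minimal sketch of tied output embeddings (illustrative values): with no
// separate output matrix, logits are dot products of the final hidden state
// against the same rows used to embed input tokens.
#include <cstdio>
#include <vector>

int main() {
    const int n_vocab = 4, n_embd = 3;
    const std::vector<float> tok_embd = {      // [n_vocab x n_embd], shared both ways
        0.1f,  0.2f, 0.3f,
       -0.4f,  0.5f, 0.6f,
        0.7f, -0.8f, 0.9f,
        0.2f,  0.2f, 0.2f,
    };
    const std::vector<float> hidden = { 0.5f, -1.0f, 0.25f };  // last token's hidden state
    for (int v = 0; v < n_vocab; ++v) {
        float logit = 0.0f;
        for (int e = 0; e < n_embd; ++e) {
            logit += tok_embd[v * n_embd + e] * hidden[e];
        }
        std::printf("token %d: logit %.3f\n", v, logit);
    }
    return 0;
}
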
@@ -6680,8 +6690,8 @@
                     layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
                     layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), { n_embd, 2 * n_ff });

-                    layer.rope_long  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-                    layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                    layer.rope_long  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight"), { n_rot/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                    layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_rot/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
                 }
             } break;
         case LLM_ARCH_PLAMO:
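
Sizing the rope-factor tensors by n_rot/2 rather than n_embd_head/2 matters when only part of each head is rotated (partial RoPE): RoPE applies one frequency, and hence one scaling factor, per rotated dimension pair, so the long/short factor tensors hold n_rot/2 entries. A tiny arithmetic sketch with made-up numbers (not Phi-4's real hyperparameters):

// Illustrative only: why the rope factor tensors hold n_rot/2 entries.
#include <cstdio>

int main() {
    const int n_embd_head = 128; // per-head dimension (made-up value)
    const int n_rot       = 64;  // rotated dimensions; < n_embd_head means partial RoPE (made-up value)
    std::printf("rotated pairs, i.e. factors needed: %d\n", n_rot / 2);
    std::printf("old sizing from n_embd_head/2:      %d\n", n_embd_head / 2);
    return 0;
}
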
@@ -10869,7 +10879,13 @@
         struct ggml_tensor * inp_pos = build_inp_pos();

         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
+        struct ggml_tensor * KQ_mask = nullptr;
+        if (hparams.n_swa == 0) {
+            // Phi-4 doesn't use sliding window attention
+            KQ_mask = build_inp_KQ_mask();
+        } else {
+            KQ_mask = build_inp_KQ_mask_swa();
+        }

         for (int il = 0; il < n_layer; ++il) {
             auto residual = inpL;
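
The attention mask is now chosen from hparams: Phi-3-style models keep the sliding-window mask, while Phi-4 sets n_swa == 0 and takes the plain causal path. The standalone sketch below prints the two mask shapes side by side for a made-up sequence length and window size, just to show the difference the branch selects between; it is not llama.cpp code.

// Illustrative comparison: full causal mask vs. sliding-window mask with
// window n_swa. '1' = key column visible to the query row.
#include <cstdio>

int main() {
    const int n_tokens = 6;
    const int n_swa    = 3; // made-up window size for the SWA column
    std::printf("causal    swa(%d)\n", n_swa);
    for (int q = 0; q < n_tokens; ++q) {
        for (int k = 0; k < n_tokens; ++k) {
            std::printf("%c", k <= q ? '1' : '.');
        }
        std::printf("    ");
        for (int k = 0; k < n_tokens; ++k) {
            std::printf("%c", (k <= q && q - k < n_swa) ? '1' : '.');
        }
        std::printf("\n");
    }
    return 0;
}
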
@@ -10927,7 +10943,7 @@

                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }

             if (il == n_layer - 1) {
@@ -19325,6 +19341,14 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
+    } else if (tmpl == "phi4") {
+        // chatml template
+        for (auto message : chat) {
+            ss << "<|im_start|>" << message->role << "<|im_sep|>" << message->content << "<|im_end|>";
+        }
+        if (add_ass) {
+            ss << "<|im_start|>assistant<|im_sep|>";
+        }
     } else if (tmpl == "zephyr" || tmpl_contains("<|user|>")) {
         // zephyr template
         for (auto message : chat) {
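
For reference, the new "phi4" branch builds a ChatML-style prompt with <|im_sep|> between role and content, and appends an assistant header when add_ass is set. The standalone sketch below mirrors that branch with a local Message struct (a stand-in for llama_chat_message) so the resulting prompt string can be inspected directly; it is a sketch, not the library's API.

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Local stand-in for llama_chat_message, used only in this sketch.
struct Message {
    std::string role;
    std::string content;
};

// Mirrors the new "phi4" branch of llama_chat_apply_template_internal.
static std::string format_phi4(const std::vector<Message> & chat, bool add_ass) {
    std::ostringstream ss;
    for (const auto & m : chat) {
        ss << "<|im_start|>" << m.role << "<|im_sep|>" << m.content << "<|im_end|>";
    }
    if (add_ass) {
        ss << "<|im_start|>assistant<|im_sep|>";
    }
    return ss.str();
}

int main() {
    const std::vector<Message> chat = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!" },
    };
    std::cout << format_phi4(chat, /*add_ass=*/true) << "\n";
    // <|im_start|>system<|im_sep|>You are a helpful assistant.<|im_end|><|im_start|>user<|im_sep|>Hello!<|im_end|><|im_start|>assistant<|im_sep|>
    return 0;
}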

llama.cpp/llama.h

Lines changed: 1 addition & 0 deletions
@@ -98,6 +98,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_BLOOM        = 23,
         LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
         LLAMA_VOCAB_PRE_TYPE_EXAONE       = 25,
+        LLAMA_VOCAB_PRE_TYPE_GPT4O        = 26,
     };

     enum llama_rope_type {
