Commit ead7370

Author: Judd
support phi-3.5 mini & moe

1 parent c45c7e1 commit ead7370
7 files changed: +293, -38 lines

README.md

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ pure C++ implementation based on [@ggerganov](https://github.com/ggerganov)'s [g
 
 **What's New:**
 
+* 2024-07-24: Phi-3.5 Mini & MoE
 * 2024-07-24: Llama 3.1
 * 2024-07-23: Llama-3-Groq with tool calling
 * 2024-07-17: Mistral Nemo

convert.py

Lines changed: 92 additions & 1 deletion
@@ -81,6 +81,8 @@ class ModelType(Enum):
     Phi3 = 0x520
     Phi3_ScalingSU = 0x521
     Phi3_ScalingSU2 = 0x522
+    Phi3_ScalingSU3 = 0x523
+    Phi3MoE_ScalingSU = 0x530
 
     Mistral = 0x600
     Mixtral = 0x601
@@ -2345,6 +2347,90 @@ def dump_config(f, config, ggml_type):
     def get_weight_names(config):
         return Phi3Converter.get_weight_names(config)
 
+class Phi3SU3Converter(BaseConverter):
+    MODEL_TYPE = ModelType.Phi3_ScalingSU3
+
+    @classmethod
+    def state_dict_pp(cls, config, state_dict):
+        return Phi3SUConverter.state_dict_pp(config, state_dict)
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        Phi3SUConverter.dump_config(f, config, ggml_type)
+
+        config_values = [
+            config.rope_scaling['short_mscale'],
+            config.rope_scaling['long_mscale'],
+        ]
+        f.write(struct.pack('<' + "f" * len(config_values), *config_values))
+
+    @staticmethod
+    def get_weight_names(config):
+        return Phi3SUConverter.get_weight_names(config)
+
+class Phi3MoESUConverter(BaseConverter):
+    MODEL_TYPE = ModelType.Phi3MoE_ScalingSU
+
+    @classmethod
+    def pp(cls, config, name: str, tensor):
+        if name.endswith('k_proj.weight'):
+            return permute(tensor, config.num_key_value_heads)
+        elif name.endswith('q_proj.weight'):
+            return permute(tensor, config.num_attention_heads)
+        else:
+            return tensor
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        assert config.lm_head_bias, 'lm_head_bias must be True'
+
+        Phi3SU3Converter.dump_config(f, config, ggml_type)
+
+        config_values = [
+            config.num_experts_per_tok,
+            config.num_local_experts,
+        ]
+        f.write(struct.pack('<' + "i" * len(config_values), *config_values))
+
+    @staticmethod
+    def get_weight_names(config):
+        weight_names = ["model.embed_tokens.weight"]
+        for i in range(config.num_hidden_layers):
+            weight_names += [
+                f"model.layers.{i}.input_layernorm.weight",
+                f"model.layers.{i}.input_layernorm.bias",
+            ]
+
+            for j in range(config.num_local_experts):
+                weight_names += [
+                    f"model.layers.{i}.block_sparse_moe.experts.{j}.w1.weight",
+                    f"model.layers.{i}.block_sparse_moe.experts.{j}.w2.weight",
+                    f"model.layers.{i}.block_sparse_moe.experts.{j}.w3.weight",
+                ]
+
+            weight_names += [
+                f"model.layers.{i}.block_sparse_moe.gate.weight",
+                f"model.layers.{i}.post_attention_layernorm.weight",
+                f"model.layers.{i}.post_attention_layernorm.bias",
+                f"model.layers.{i}.self_attn.k_proj.weight",
+                f"model.layers.{i}.self_attn.k_proj.bias",
+                f"model.layers.{i}.self_attn.o_proj.weight",
+                f"model.layers.{i}.self_attn.o_proj.bias",
+                f"model.layers.{i}.self_attn.q_proj.weight",
+                f"model.layers.{i}.self_attn.q_proj.bias",
+                f"model.layers.{i}.self_attn.v_proj.weight",
+                f"model.layers.{i}.self_attn.v_proj.bias",
+            ]
+
+        weight_names += [
+            "model.norm.weight",
+            "model.norm.bias",
+            "lm_head.weight",
+            "lm_head.bias"
+        ]
+
+        return weight_names
+
 class QWenConverter(BaseConverter):
     MODEL_TYPE = ModelType.QWen
     FILE_VERSION = 2
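Note: the two new converters extend the existing Phi-3 SU config block with plain little-endian fields. Phi3SU3Converter appends two float32 values (short_mscale, long_mscale), and Phi3MoESUConverter appends two int32 values (num_experts_per_tok, num_local_experts) on top of that. A minimal sketch of reading those fields back (the reader function and file positioning are illustrative only, not part of this commit):

```python
import struct

def read_phi35_extra_config(f):
    """Hypothetical reader for the fields appended by this commit.

    Assumes `f` is already positioned right after the base Phi3SU config
    block, whose layout is not shown in this diff.
    """
    short_mscale, long_mscale = struct.unpack('<2f', f.read(8))               # Phi3SU3Converter
    num_experts_per_tok, num_local_experts = struct.unpack('<2i', f.read(8))  # Phi3MoESUConverter
    return {
        'short_mscale': short_mscale,
        'long_mscale': long_mscale,
        'num_experts_per_tok': num_experts_per_tok,
        'num_local_experts': num_local_experts,
    }
```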
@@ -3677,9 +3763,14 @@ def main():
             config.rope_scaling['type'] = 'longrope'
 
         if config.rope_scaling['type'] == 'longrope':
-            Phi3SUConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
+            if 'long_mscale' in config.rope_scaling:
+                Phi3SU3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
+            else:
+                Phi3SUConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
         else:
             raise Exception(config.rope_scaling['type'])
+    elif arch == 'PhiMoEForCausalLM':
+        Phi3MoESUConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'dolphinphi2':
         Phi2Converter.MODEL_TYPE = ModelType.DolphinPhi2_v2 if config.hidden_act is not None else ModelType.DolphinPhi2
         Phi2Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
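Note: the dispatch added to main() boils down to the following selection rule. This is a simplified sketch (the helper function itself is illustrative; only the converter classes, the architecture name, and the config keys come from this commit):

```python
def pick_phi3_converter(arch: str, rope_scaling: dict):
    # Phi-3.5 MoE reports its own architecture name and gets its own converter.
    if arch == 'PhiMoEForCausalLM':
        return Phi3MoESUConverter
    # 'longrope' models that ship an explicit long_mscale take the new SU3 path;
    # plain longrope models keep using the existing SU converter.
    if rope_scaling.get('type') == 'longrope':
        return Phi3SU3Converter if 'long_mscale' in rope_scaling else Phi3SUConverter
    raise Exception(rope_scaling.get('type'))
```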

docs/models.md

Lines changed: 2 additions & 2 deletions
@@ -91,9 +91,9 @@
 
 * [x] [Dolphin Phi-2](https://huggingface.co/cognitivecomputations/dolphin-2_6-phi-2/tree/a084bb141f99f67e8ff56a654e29ddd53a0b4d7a) (`-a DolphinPhi2`) 🐬
 
-* [x] Phi-3 Mini: [Instruct-4k](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct), [Instruct-128k](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct)
+* [x] Phi-3: [Mini-Instruct-4k](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct), [Mini-Instruct-128k](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct), [Medium-Instruct-4k](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct), [Medium-Instruct-128k](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct)
 
-* [x] Phi-3 Medium: [Instruct-4k](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct), [Instruct-128k](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct)
+* [x] Phi-3.5: [Mini-Instruct](https://huggingface.co/microsoft/Phi-3.5-mini-instruct), [MoE-Instruct](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct)
 
 * QWen (`QWenLMHeadModel`, `Qwen2ForCausalLM`, `Qwen2MoeForCausalLM`)
   * [x] v1: [Chat-7B](https://huggingface.co/Qwen/Qwen-7B-Chat), [Chat-14B](https://huggingface.co/Qwen/Qwen-14B-Chat), [QAnything-7B](https://huggingface.co/netease-youdao/Qwen-7B-QAnything)

models/phi.cpp

Lines changed: 164 additions & 30 deletions
@@ -386,6 +386,8 @@ namespace v3
         void append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const override;
         void append_user(int round_idx, const std::string &user, std::vector<int> &ids) const override;
         void append_ai_opening(int round_idx, std::vector<int> &ids) const override;
+    public:
+        bool add_bos = true;
     };
 
     static ChatHistoryEncoder _chat_encoder;
@@ -415,6 +417,15 @@
         end_token_id = tp->PieceToId("<|end|>");
         nl_token_id = tp->PieceToId("\n");
 
+        if (-1 == system_token_id)
+        {
+            CHATLLM_CHECK(tp->GetPieceSize() == 32000) << " unsupported tokenizer";
+            system_token_id    = 32006;
+            user_token_id      = 32010;
+            assistant_token_id = 32001;
+            end_token_id       = 32007;
+        }
+
         pad_token_id = eos_token_id;
 
         terminate_ids.insert(end_token_id);
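Note: the fallback above covers checkpoints whose sentencepiece model contains only the 32000 base pieces, so the chat control tokens are not found by PieceToId and come back as -1. The hard-coded ids correspond to the usual Phi-3 added-token table; a sketch of that mapping, taken directly from the values in the diff (illustrative only):

```python
# Added-token ids assumed by the fallback when the sentencepiece model
# has exactly 32000 pieces and the control tokens resolve to -1.
PHI3_FALLBACK_TOKEN_IDS = {
    '<|assistant|>': 32001,
    '<|system|>': 32006,
    '<|end|>': 32007,
    '<|user|>': 32010,
}
```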
@@ -512,7 +523,9 @@
     {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
 
-        ids.push_back(tok->bos_token_id);
+        if (add_bos)
+            ids.push_back(tok->bos_token_id);
+
        if (tok->get_system_prompt().size() > 0)
            tok->encode(tok->get_system_prompt(), ids, tok->system_token_id, tok->end_token_id);
    }
@@ -577,6 +590,7 @@ namespace v3_su
            {
                auto &attention = get_typed_transformer<ModelClass>()->layers[i].attention;
                attention.config(config.original_max_position_embeddings, config.rope_theta,
+                                 scaling_factor,
                                 scaling_factor,
                                 config.hidden_size / config.num_attention_heads / 2,
                                 config.short_factor,
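Note: attention.config now takes separate short and long attention-scaling factors. The existing SU path simply passes the same derived scaling_factor twice, while the v3_su3 and v3_moe paths below pass config.short_mscale / config.long_mscale read from the checkpoint. For models that do not ship explicit mscale values, the factor is commonly derived as in the Hugging Face Phi-3 longrope implementation; the sketch below shows that formula as an assumption about the upstream convention, not code from this repository:

```python
import math

def longrope_default_mscale(max_position_embeddings: int,
                            original_max_position_embeddings: int) -> float:
    """Sketch of the commonly used longrope attention-scaling factor."""
    scale = max_position_embeddings / original_max_position_embeddings
    if scale <= 1.0:
        return 1.0
    return math.sqrt(1.0 + math.log(scale) / math.log(original_max_position_embeddings))
```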
@@ -590,54 +604,174 @@ namespace v3_su2
 {
     typedef v3_su::Config Config;
 
-    class ChatHistoryEncoder : public BaseHistoryEncoder
+    class Tokenizer : public v3::Tokenizer
     {
     public:
-        void append_sys_prompt(std::vector<int> &ids) const override;
-        void append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const override;
-        void append_user(int round_idx, const std::string &user, std::vector<int> &ids) const override;
-        void append_ai_opening(int round_idx, std::vector<int> &ids) const override;
+        Tokenizer(const BaseConfig &config) : v3::Tokenizer(config, &v3::_chat_encoder)
+        {
+            append_nl_after_end_tok = true;
+            v3::_chat_encoder.add_bos = false;
+        }
     };
 
-    static ChatHistoryEncoder _chat_encoder;
+    typedef v3_su::ConditionalGeneration ConditionalGeneration;
+}
 
-    class Tokenizer : public v3::Tokenizer
+namespace v3_su3
+{
+    struct Config : public v3_su2::Config
+    {
+        float short_mscale;
+        float long_mscale;
+    };
+
+    typedef v3_su2::Tokenizer Tokenizer;
+
+    class ConditionalGeneration : public v3_su2::ConditionalGeneration
     {
     public:
-        Tokenizer(const BaseConfig &config) : v3::Tokenizer(config, &_chat_encoder)
+        ConditionalGeneration() = default;
+        ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config, ModelType type = ModelType::MODEL_TYPE_PHI3_SU3)
+            : v3_su2::ConditionalGeneration(config, runtime_config, type, config.num_key_value_heads, config.max_length)
         {
-            append_nl_after_end_tok = true;
+            for (int i = 0; i < config.num_hidden_layers; i++)
+            {
+                auto &attention = get_typed_transformer<ModelClass>()->layers[i].attention;
+                attention.config(config.original_max_position_embeddings, config.rope_theta,
+                                 config.short_mscale,
+                                 config.long_mscale,
+                                 config.hidden_size / config.num_attention_heads / 2,
+                                 config.short_factor,
+                                 config.long_factor);
+            }
         }
     };
+}
 
-    typedef v3_su::ConditionalGeneration ConditionalGeneration;
+namespace v3_moe
+{
+    struct Config : public v3_su3::Config
+    {
+        int num_experts_per_tok;
+        int num_local_experts;
+    };
 
-    void ChatHistoryEncoder::append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const
+    typedef v3_su3::Tokenizer Tokenizer;
+
+    template <int NUM_EXPERTS, int EXPERTS_PER_TOK> class Phi3SparseMoE : public BaseSparseMLP
     {
-        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
-        append_ai_opening(round_idx, ids);
-        tok->encode(ai, ids, -1, tok->end_token_id);
-    }
+    public:
+        Phi3SparseMoE(InitContext *ctx, int hidden_size, int intermediate_size)
+            : BaseSparseMLP(ctx, hidden_size, intermediate_size, NUM_EXPERTS, EXPERTS_PER_TOK, ActFunc::SILU, false)
+        {
+        }
+    };
 
-    void ChatHistoryEncoder::append_sys_prompt(std::vector<int> &ids) const
+    class Phi3SUSelfAttentionBiased : public Phi3SUSelfAttention
     {
-        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
+    public:
+        Phi3SUSelfAttentionBiased(InitContext *ctx, int hidden_size, int num_attention_heads, int num_kv_heads, int max_length)
+            : Phi3SUSelfAttention(ctx, hidden_size, num_attention_heads, num_kv_heads, max_length, true, true)
+        {}
+    };
 
-        if (tok->get_system_prompt().size() > 0)
-            tok->encode(tok->get_system_prompt(), ids, tok->system_token_id, tok->end_token_id);
-    }
+    template<int num_local_experts, int num_experts_per_tok> class Phi3MoEBlock : public LMBlock1<LayerNorm, Phi3SUSelfAttentionBiased, LayerNorm,
+                                                                                                  Phi3SparseMoE<num_local_experts, num_experts_per_tok>>
+    {
+    public:
+        Phi3MoEBlock(InitContext *ctx, int hidden_size, int num_attention_heads, int intermediate_size, int num_kv_heads, int max_length)
+            : LMBlock1<LayerNorm, Phi3SUSelfAttentionBiased, LayerNorm,
+                       Phi3SparseMoE<num_local_experts, num_experts_per_tok>>(ctx, hidden_size, num_attention_heads, intermediate_size, num_kv_heads, max_length)
+        {}
+    };
 
-    void ChatHistoryEncoder::append_user(int round_idx, const std::string &user, std::vector<int> &ids) const
+    template<int _NUM_EXPERTS, int _EXPERTS_PER_TOK, ModelType type> class _ConditionalGeneration : public BaseModelForConditionalGeneration
     {
-        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
+    public:
+        typedef BaseModelForConditionalGeneration Base;
+        typedef Model<Config, Embedding, LayerNorm, Phi3MoEBlock<_NUM_EXPERTS, _EXPERTS_PER_TOK>, int, int, int, int, int> ModelClass;
+    public:
+        _ConditionalGeneration() = default;
 
-        tok->encode(user, ids, tok->user_token_id, tok->end_token_id);
-    }
+        _ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config)
+            : Base(type, config, runtime_config), config(config)
+        {
+            constexpr size_t tensor_ovhd = GGML_TENSOR_SIZE + GGML_OBJECT_SIZE;
+            const size_t num_tensors = 3 + 2 + config.num_hidden_layers * (11 + 3 + 5);
+            const size_t ctx_size = num_tensors * tensor_ovhd;
+            w_ctx_.gctx = GGMLContext({.mem_size = ctx_size, .mem_buffer = nullptr, .no_alloc = true});
+            w_ctx_.dtype = config.dtype;
 
-    void ChatHistoryEncoder::append_ai_opening(int round_idx, std::vector<int> &ids) const
-    {
-        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
+            CHATLLM_CHECK((_NUM_EXPERTS == config.num_local_experts) && (_EXPERTS_PER_TOK == config.num_experts_per_tok))
+                << "unsupported MoE param";
 
-        tok->encode("", ids, tok->assistant_token_id, -1);
-    }
+            Base::GRAPH_SIZE = 4096 * 2;
+
+            Base::transformer = new ModelClass(
+                &w_ctx_, config, true,
+                config.hidden_size, config.num_attention_heads,
+                config.intermediate_size, config.num_key_value_heads, config.max_length);
+
+            for (int i = 0; i < config.num_hidden_layers; i++)
+            {
+                auto &attention = Base::get_typed_transformer<ModelClass>()->layers[i].attention;
+                attention.config(config.original_max_position_embeddings, config.rope_theta,
+                                 config.short_mscale,
+                                 config.long_mscale,
+                                 config.hidden_size / config.num_attention_heads / 2,
+                                 config.short_factor,
+                                 config.long_factor);
+            }
+
+            CHATLLM_CHECK(w_ctx_.get_used_mem() == w_ctx_.get_mem_size()) << "corrupted model weights";
+        }
+
+        void load(ModelLoader &loader) override
+        {
+            auto transformer = get_typed_transformer<ModelClass>();
+            loader.read_tensor("model.embed_tokens.weight", transformer->word_embeddings.weight);
+            for (int i = 0; i < config.num_hidden_layers; i++)
+            {
+                std::string layer_prefix = "model.layers." + std::to_string(Base::layer_ids[i]) + '.';
+
+                loader.read_tensor(layer_prefix + "mlp.experts_down.weight", layer_prefix + "block_sparse_moe.experts.", _NUM_EXPERTS, ".w2.weight", transformer->layers[i].mlp.experts_down.weight);
+                loader.read_tensor(layer_prefix + "mlp.experts_gate.weight", layer_prefix + "block_sparse_moe.experts.", _NUM_EXPERTS, ".w1.weight", transformer->layers[i].mlp.experts_gate.weight);
+                loader.read_tensor(layer_prefix + "mlp.experts_up.weight", layer_prefix + "block_sparse_moe.experts.", _NUM_EXPERTS, ".w3.weight", transformer->layers[i].mlp.experts_up.weight);
+
+                loader.read_tensor(layer_prefix + "block_sparse_moe.gate.weight",
+                                   transformer->layers[i].mlp.gate.weight);
+
+                loader.read_tensor(layer_prefix + "input_layernorm.weight",
+                                   transformer->layers[i].input_layernorm.weight);
+                loader.read_tensor(layer_prefix + "input_layernorm.bias",
+                                   transformer->layers[i].input_layernorm.bias);
+
+                loader.read_tensor(layer_prefix + "post_attention_layernorm.weight",
+                                   transformer->layers[i].post_attention_layernorm.weight);
+                loader.read_tensor(layer_prefix + "post_attention_layernorm.bias",
+                                   transformer->layers[i].post_attention_layernorm.bias);
+
+                loader.read_tensor(layer_prefix + "self_attn.k_proj.weight", transformer->layers[i].attention.k_proj.weight);
+                loader.read_tensor(layer_prefix + "self_attn.k_proj.bias", transformer->layers[i].attention.k_proj.bias);
+                loader.read_tensor(layer_prefix + "self_attn.o_proj.weight", transformer->layers[i].attention.o_proj.weight);
+                loader.read_tensor(layer_prefix + "self_attn.o_proj.bias", transformer->layers[i].attention.o_proj.bias);
+                loader.read_tensor(layer_prefix + "self_attn.q_proj.weight", transformer->layers[i].attention.q_proj.weight);
+                loader.read_tensor(layer_prefix + "self_attn.q_proj.bias", transformer->layers[i].attention.q_proj.bias);
+                loader.read_tensor(layer_prefix + "self_attn.v_proj.weight", transformer->layers[i].attention.v_proj.weight);
+                loader.read_tensor(layer_prefix + "self_attn.v_proj.bias", transformer->layers[i].attention.v_proj.bias);
+            }
+            loader.read_tensor("model.norm.weight", transformer->final_layernorm.weight);
+            loader.read_tensor("model.norm.bias", transformer->final_layernorm.bias);
+            loader.read_tensor("lm_head.weight", dynamic_cast<Linear *>(transformer->lm_head)->weight);
+            loader.read_tensor("lm_head.bias", dynamic_cast<Linear *>(transformer->lm_head)->bias);
+        }
+
+    public:
+        Config config;
+    };
+
+    const int NUM_EXPERTS = 16;
+    const int EXPERTS_PER_TOK = 2;
+
+    typedef _ConditionalGeneration<NUM_EXPERTS, EXPERTS_PER_TOK, MODEL_TYPE_PHI3_MOE> ConditionalGeneration;
 }
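Note: the MoE path instantiates Phi3SparseMoE with NUM_EXPERTS = 16 and EXPERTS_PER_TOK = 2, matching Phi-3.5 MoE's 16-expert, top-2 routing. BaseSparseMLP's internals are not part of this diff, so the following is only a sketch of what a top-k sparse MoE forward pass with gated-SiLU experts typically looks like; the names and shapes are illustrative, with w1/w3/w2 following the gate/up/down mapping used by the converter above:

```python
import numpy as np

def top2_moe_forward(x, gate_w, w1, w2, w3, k=2):
    """Illustrative top-k MoE forward pass for a single token.

    x: (hidden,), gate_w: (n_experts, hidden),
    w1/w3: (n_experts, inter, hidden), w2: (n_experts, hidden, inter).
    """
    silu = lambda v: v / (1.0 + np.exp(-v))
    logits = gate_w @ x                      # one router score per expert
    top = np.argsort(logits)[-k:]            # pick the k highest-scoring experts
    weights = np.exp(logits[top] - logits[top].max())
    weights /= weights.sum()                 # softmax over the selected experts
    out = np.zeros_like(x)
    for w, e in zip(weights, top):
        h = silu(w1[e] @ x) * (w3[e] @ x)    # w1 = gate proj, w3 = up proj
        out += w * (w2[e] @ h)               # w2 = down proj
    return out
```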
