@@ -386,6 +386,8 @@ namespace v3
         void append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const override;
         void append_user(int round_idx, const std::string &user, std::vector<int> &ids) const override;
         void append_ai_opening(int round_idx, std::vector<int> &ids) const override;
+    public:
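+        // cleared by v3_su2's tokenizer, whose prompt template does not prepend BOS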
+        bool add_bos = true;
     };
 
     static ChatHistoryEncoder _chat_encoder;
@@ -415,6 +417,15 @@ namespace v3
         end_token_id = tp->PieceToId("<|end|>");
         nl_token_id = tp->PieceToId("\n");
 
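+        // fallback: some tokenizer models do not register the chat special tokens
+        // as pieces; assume the stock 32000-piece vocab and use the known fixed IDs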
+        if (-1 == system_token_id)
+        {
+            CHATLLM_CHECK(tp->GetPieceSize() == 32000) << "unsupported tokenizer";
+            system_token_id    = 32006;
+            user_token_id      = 32010;
+            assistant_token_id = 32001;
+            end_token_id       = 32007;
+        }
+
         pad_token_id = eos_token_id;
 
         terminate_ids.insert(end_token_id);
@@ -512,7 +523,9 @@ namespace v3
     {
         Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
 
-        ids.push_back(tok->bos_token_id);
+        if (add_bos)
+            ids.push_back(tok->bos_token_id);
+
         if (tok->get_system_prompt().size() > 0)
             tok->encode(tok->get_system_prompt(), ids, tok->system_token_id, tok->end_token_id);
     }
@@ -577,6 +590,7 @@ namespace v3_su
             {
                 auto &attention = get_typed_transformer<ModelClass>()->layers[i].attention;
                 attention.config(config.original_max_position_embeddings, config.rope_theta,
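+                                 // config() now takes short and long mscale separately;
+                                 // the SU rope here uses the same computed factor for both: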
+                                 scaling_factor,
                                  scaling_factor,
                                  config.hidden_size / config.num_attention_heads / 2,
                                  config.short_factor,
@@ -590,54 +604,174 @@ namespace v3_su2
 {
     typedef v3_su::Config Config;
 
-    class ChatHistoryEncoder : public BaseHistoryEncoder
+    class Tokenizer : public v3::Tokenizer
     {
     public:
-        void append_sys_prompt(std::vector<int> &ids) const override;
-        void append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const override;
-        void append_user(int round_idx, const std::string &user, std::vector<int> &ids) const override;
-        void append_ai_opening(int round_idx, std::vector<int> &ids) const override;
+        Tokenizer(const BaseConfig &config) : v3::Tokenizer(config, &v3::_chat_encoder)
+        {
+            append_nl_after_end_tok = true;
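+            // reuse v3's chat encoder, but without the leading BOS token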
+            v3::_chat_encoder.add_bos = false;
+        }
     };
 
-    static ChatHistoryEncoder _chat_encoder;
+    typedef v3_su::ConditionalGeneration ConditionalGeneration;
+}
 
-    class Tokenizer : public v3::Tokenizer
+namespace v3_su3
+{
+    struct Config : public v3_su2::Config
+    {
+        float short_mscale;
+        float long_mscale;
+    };
+
+    typedef v3_su2::Tokenizer Tokenizer;
+
+    class ConditionalGeneration : public v3_su2::ConditionalGeneration
     {
     public:
-        Tokenizer(const BaseConfig &config) : v3::Tokenizer(config, &_chat_encoder)
+        ConditionalGeneration() = default;
+        ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config, ModelType type = ModelType::MODEL_TYPE_PHI3_SU3)
+            : v3_su2::ConditionalGeneration(config, runtime_config, type, config.num_key_value_heads, config.max_length)
         {
-            append_nl_after_end_tok = true;
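+            // reconfigure rope with the model-supplied short/long mscales
+            // (the base constructor used a single computed scaling factor for both)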
+            for (int i = 0; i < config.num_hidden_layers; i++)
+            {
+                auto &attention = get_typed_transformer<ModelClass>()->layers[i].attention;
+                attention.config(config.original_max_position_embeddings, config.rope_theta,
+                                 config.short_mscale,
+                                 config.long_mscale,
+                                 config.hidden_size / config.num_attention_heads / 2,
+                                 config.short_factor,
+                                 config.long_factor);
+            }
         }
     };
+}
 
-    typedef v3_su::ConditionalGeneration ConditionalGeneration;
+namespace v3_moe
+{
+    struct Config : public v3_su3::Config
+    {
+        int num_experts_per_tok;
+        int num_local_experts;
+    };
 
-    void ChatHistoryEncoder::append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const
+    typedef v3_su3::Tokenizer Tokenizer;
+
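+    // sparse MoE FFN: NUM_EXPERTS experts, EXPERTS_PER_TOK routed per token, SiLU activation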
+    template <int NUM_EXPERTS, int EXPERTS_PER_TOK> class Phi3SparseMoE : public BaseSparseMLP
     {
-        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
-        append_ai_opening(round_idx, ids);
-        tok->encode(ai, ids, -1, tok->end_token_id);
-    }
+    public:
+        Phi3SparseMoE(InitContext *ctx, int hidden_size, int intermediate_size)
+            : BaseSparseMLP(ctx, hidden_size, intermediate_size, NUM_EXPERTS, EXPERTS_PER_TOK, ActFunc::SILU, false)
+        {
+        }
+    };
 
-    void ChatHistoryEncoder::append_sys_prompt(std::vector<int> &ids) const
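+    // same SU-rope attention, but with bias enabled on the Q/K/V and output projections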
+    class Phi3SUSelfAttentionBiased : public Phi3SUSelfAttention
     {
-        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
+    public:
+        Phi3SUSelfAttentionBiased(InitContext *ctx, int hidden_size, int num_attention_heads, int num_kv_heads, int max_length)
+            : Phi3SUSelfAttention(ctx, hidden_size, num_attention_heads, num_kv_heads, max_length, true, true)
+        {}
+    };
 
-        if (tok->get_system_prompt().size() > 0)
-            tok->encode(tok->get_system_prompt(), ids, tok->system_token_id, tok->end_token_id);
-    }
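+    // transformer block: LayerNorm -> biased SU attention -> LayerNorm -> sparse MoE FFN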
+    template <int num_local_experts, int num_experts_per_tok> class Phi3MoEBlock : public LMBlock1<LayerNorm, Phi3SUSelfAttentionBiased, LayerNorm,
+                Phi3SparseMoE<num_local_experts, num_experts_per_tok>>
+    {
+    public:
+        Phi3MoEBlock(InitContext *ctx, int hidden_size, int num_attention_heads, int intermediate_size, int num_kv_heads, int max_length)
+            : LMBlock1<LayerNorm, Phi3SUSelfAttentionBiased, LayerNorm,
+                       Phi3SparseMoE<num_local_experts, num_experts_per_tok>>(ctx, hidden_size, num_attention_heads, intermediate_size, num_kv_heads, max_length)
+        {}
+    };
 
-    void ChatHistoryEncoder::append_user(int round_idx, const std::string &user, std::vector<int> &ids) const
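+    // full MoE model; expert counts are compile-time parameters, checked against the config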
+    template <int _NUM_EXPERTS, int _EXPERTS_PER_TOK, ModelType type> class _ConditionalGeneration : public BaseModelForConditionalGeneration
     {
-        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
+    public:
+        typedef BaseModelForConditionalGeneration Base;
+        typedef Model<Config, Embedding, LayerNorm, Phi3MoEBlock<_NUM_EXPERTS, _EXPERTS_PER_TOK>, int, int, int, int, int> ModelClass;
+    public:
+        _ConditionalGeneration() = default;
 
-        tok->encode(user, ids, tok->user_token_id, tok->end_token_id);
-    }
+        _ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config)
+            : Base(type, config, runtime_config), config(config)
+        {
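+            // reserve ggml metadata for every weight tensor up front; with no_alloc set,
+            // tensor data is only mapped in when load() runs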
+            constexpr size_t tensor_ovhd = GGML_TENSOR_SIZE + GGML_OBJECT_SIZE;
+            const size_t num_tensors = 3 + 2 + config.num_hidden_layers * (11 + 3 + 5);
+            const size_t ctx_size = num_tensors * tensor_ovhd;
+            w_ctx_.gctx = GGMLContext({.mem_size = ctx_size, .mem_buffer = nullptr, .no_alloc = true});
+            w_ctx_.dtype = config.dtype;
 
-    void ChatHistoryEncoder::append_ai_opening(int round_idx, std::vector<int> &ids) const
-    {
-        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
+            CHATLLM_CHECK((_NUM_EXPERTS == config.num_local_experts) && (_EXPERTS_PER_TOK == config.num_experts_per_tok))
+                << "unsupported MoE param";
 
-        tok->encode("", ids, tok->assistant_token_id, -1);
-    }
+            Base::GRAPH_SIZE = 4096 * 2;
+
+            Base::transformer = new ModelClass(
+                                    &w_ctx_, config, true,
+                                    config.hidden_size, config.num_attention_heads,
+                                    config.intermediate_size, config.num_key_value_heads, config.max_length);
+
+            for (int i = 0; i < config.num_hidden_layers; i++)
+            {
+                auto &attention = Base::get_typed_transformer<ModelClass>()->layers[i].attention;
+                attention.config(config.original_max_position_embeddings, config.rope_theta,
+                                 config.short_mscale,
+                                 config.long_mscale,
+                                 config.hidden_size / config.num_attention_heads / 2,
+                                 config.short_factor,
+                                 config.long_factor);
+            }
+
+            CHATLLM_CHECK(w_ctx_.get_used_mem() == w_ctx_.get_mem_size()) << "corrupted model weights";
+        }
+
+        void load(ModelLoader &loader) override
+        {
+            auto transformer = get_typed_transformer<ModelClass>();
+            loader.read_tensor("model.embed_tokens.weight", transformer->word_embeddings.weight);
+            for (int i = 0; i < config.num_hidden_layers; i++)
+            {
+                std::string layer_prefix = "model.layers." + std::to_string(Base::layer_ids[i]) + '.';
+
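+                // HF checkpoint naming: experts.*.w1 = gate, w2 = down, w3 = up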
+                loader.read_tensor(layer_prefix + "mlp.experts_down.weight", layer_prefix + "block_sparse_moe.experts.", _NUM_EXPERTS, ".w2.weight", transformer->layers[i].mlp.experts_down.weight);
+                loader.read_tensor(layer_prefix + "mlp.experts_gate.weight", layer_prefix + "block_sparse_moe.experts.", _NUM_EXPERTS, ".w1.weight", transformer->layers[i].mlp.experts_gate.weight);
+                loader.read_tensor(layer_prefix + "mlp.experts_up.weight", layer_prefix + "block_sparse_moe.experts.", _NUM_EXPERTS, ".w3.weight", transformer->layers[i].mlp.experts_up.weight);
+
+                loader.read_tensor(layer_prefix + "block_sparse_moe.gate.weight",
+                                   transformer->layers[i].mlp.gate.weight);
+
+                loader.read_tensor(layer_prefix + "input_layernorm.weight",
+                                   transformer->layers[i].input_layernorm.weight);
+                loader.read_tensor(layer_prefix + "input_layernorm.bias",
+                                   transformer->layers[i].input_layernorm.bias);
+
+                loader.read_tensor(layer_prefix + "post_attention_layernorm.weight",
+                                   transformer->layers[i].post_attention_layernorm.weight);
+                loader.read_tensor(layer_prefix + "post_attention_layernorm.bias",
+                                   transformer->layers[i].post_attention_layernorm.bias);
+
+                loader.read_tensor(layer_prefix + "self_attn.k_proj.weight", transformer->layers[i].attention.k_proj.weight);
+                loader.read_tensor(layer_prefix + "self_attn.k_proj.bias", transformer->layers[i].attention.k_proj.bias);
+                loader.read_tensor(layer_prefix + "self_attn.o_proj.weight", transformer->layers[i].attention.o_proj.weight);
+                loader.read_tensor(layer_prefix + "self_attn.o_proj.bias", transformer->layers[i].attention.o_proj.bias);
+                loader.read_tensor(layer_prefix + "self_attn.q_proj.weight", transformer->layers[i].attention.q_proj.weight);
+                loader.read_tensor(layer_prefix + "self_attn.q_proj.bias", transformer->layers[i].attention.q_proj.bias);
+                loader.read_tensor(layer_prefix + "self_attn.v_proj.weight", transformer->layers[i].attention.v_proj.weight);
+                loader.read_tensor(layer_prefix + "self_attn.v_proj.bias", transformer->layers[i].attention.v_proj.bias);
+            }
+            loader.read_tensor("model.norm.weight", transformer->final_layernorm.weight);
+            loader.read_tensor("model.norm.bias", transformer->final_layernorm.bias);
+            loader.read_tensor("lm_head.weight", dynamic_cast<Linear *>(transformer->lm_head)->weight);
+            loader.read_tensor("lm_head.bias", dynamic_cast<Linear *>(transformer->lm_head)->bias);
+        }
+
+    public:
+        Config config;
+    };
+
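+    // Phi-3 MoE routes 2 of 16 experts per token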
+    const int NUM_EXPERTS = 16;
+    const int EXPERTS_PER_TOK = 2;
+
+    typedef _ConditionalGeneration<NUM_EXPERTS, EXPERTS_PER_TOK, MODEL_TYPE_PHI3_MOE> ConditionalGeneration;
 }