#include "pangu.h"

namespace chatllm::pangu::moe
{
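    // Pangu chat template, as implemented by the encoder below: every message
    // is framed as [unused9]<role>:<content>[unused10], with the roles 系统
    // (system), 用户 (user) and 助手 (assistant); the conversation starts with
    // the BOS token.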
    class ChatHistoryEncoder : public BaseHistoryEncoder
    {
    public:
        void append_sys_prompt(std::vector<int> &ids) const override;
        void append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const override;
        void append_user(int round_idx, const std::string &user, std::vector<int> &ids) const override;
        void append_user_opening(int round_idx, std::vector<int> &ids) const override;
        void append_ai_opening(int round_idx, std::vector<int> &ids) const override;
    };

    static ChatHistoryEncoder _chat_encoder;

    Tokenizer::Tokenizer(const Config &config)
        : Tokenizer(config, &_chat_encoder)
    {}

    Tokenizer::Tokenizer(const BaseConfig &config, BaseHistoryEncoder *encoder)
        : BaseTokenizer::BaseTokenizer(config, encoder)
    {
        // Default system prompt. Translation: "You must strictly comply with
        // laws, regulations and social ethics. When generating any content,
        // avoid violence, pornography, terrorism, racial discrimination,
        // gender discrimination and other inappropriate content. Once such a
        // tendency is detected in the input or output, refuse to answer and
        // issue a warning. For example, if the input contains violent threats
        // or pornographic descriptions, return the error message: 'Your input
        // contains inappropriate content and cannot be processed.'"
        sys_prompt = R"""(你必须严格遵守法律法规和社会道德规范。生成任何内容时,都应避免涉及暴力、色情、恐怖主义、种族歧视、性别歧视等不当内容。一旦检测到输入或输出有此类倾向,应拒绝回答并发出警告。例如,如果输入内容包含暴力威胁或色情描述,应返回错误信息:“您的输入包含不当内容,无法处理。”)""";
    }

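    // Load the BPE vocabulary and resolve the special tokens. The reserved
    // pieces [unused16]/[unused17] are decoded as <think>/</think>, so the
    // model's thinking segments show up as tagged text in the output.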
    size_t Tokenizer::load(tokenizer::DataReader *buffer, int n_vocab)
    {
        tp = new tokenizer::BPEProcessor1();
        size_t size = tp->Load(buffer, n_vocab);

        pad_token_id      = tp->PieceToId("<pad>");
        unused9_token_id  = tp->PieceToId("[unused9]");
        unused10_token_id = tp->PieceToId("[unused10]");
        tp->OverrideTokenDecoding(tp->PieceToId("[unused16]"), "<think>");
        tp->OverrideTokenDecoding(tp->PieceToId("[unused17]"), "</think>");
        return size;
    }

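    // Encode one chat item. The two-argument overload opens an item
    // ([unused9] + "tag:"), used when prompting the model to continue as a
    // given role; the three-argument overload emits a complete, closed item
    // ([unused9] + "tag:" + content + [unused10]).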
    void Tokenizer::encode_item(const char *tag, std::vector<int> &ids)
    {
        ids.push_back(unused9_token_id);
        encode(std::string(tag) + ":", ids);
    }

    void Tokenizer::encode_item(const char *tag, const std::string &content, std::vector<int> &ids)
    {
        ids.push_back(unused9_token_id);
        encode(std::string(tag) + ":" + content, ids);
        ids.push_back(unused10_token_id);
    }

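    // Role tags used by the template: "系统" = system, "用户" = user,
    // "助手" = assistant.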
    void ChatHistoryEncoder::append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        tok->encode_item("助手", ai, ids);
    }

    void ChatHistoryEncoder::append_sys_prompt(std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        ids.push_back(tok->bos_token_id);
        tok->encode_item("系统", tok->get_system_prompt(), ids);
    }

    void ChatHistoryEncoder::append_user(int round_idx, const std::string &user, std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        tok->encode_item("用户", user, ids);
    }

    void ChatHistoryEncoder::append_user_opening(int round_idx, std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        tok->encode_item("用户", ids);
    }

    void ChatHistoryEncoder::append_ai_opening(int round_idx, std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        tok->encode_item("助手", ids);
    }

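    // One PanGu MoE transformer block: RMSNorm -> full-biased self-attention
    // -> RMSNorm -> MoE FFN.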
    template <class PanguMoEMLP> class PanguMoEBlock : public LMBlock1<RMSNorm, FullBiasedSelfAttention, RMSNorm, PanguMoEMLP>
    {
    public:
        PanguMoEBlock(InitContext *ctx, int hidden_size, int num_attention_heads, int intermediate_size,
                      int mlp_intermediate_size1, int mlp_intermediate_size2,
                      int num_kv_heads, int head_dim, int max_length)
            : LMBlock1<RMSNorm, FullBiasedSelfAttention, RMSNorm, PanguMoEMLP>(ctx, hidden_size, num_attention_heads, intermediate_size,
                                                                               mlp_intermediate_size1, mlp_intermediate_size2,
                                                                               num_kv_heads, head_dim, max_length)
        {}
    };

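    // Sparse MoE FFN with NUM_EXPERTS experts, EXPERTS_PER_TOK of them routed
    // per token (SiLU activation). norm_topk_prob = false presumably means the
    // selected experts' router weights are used as-is, without renormalizing
    // them to sum to 1.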
    template <int NUM_EXPERTS, int EXPERTS_PER_TOK> class PanguSparseMoE : public BaseSparseMLP
    {
    public:
        PanguSparseMoE(InitContext *ctx, int hidden_size, int intermediate_size)
            : BaseSparseMLP(ctx, hidden_size, intermediate_size, NUM_EXPERTS, EXPERTS_PER_TOK, ActFunc::SILU, false, true, true)
        {
            norm_topk_prob = false;
        }
    };

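    // The full model, parameterized by the expert configuration. Each layer's
    // FFN combines the sparse MoE path with a dense SiLU MLP (CombinedMLP);
    // the dense branch corresponds to the checkpoint's shared expert (see the
    // tensor name translations in ConditionalGeneration::load below).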
    template <const int NUM_EXPERTS, const int EXPERTS_PER_TOK, const int EFFECTIVE_EXPERTS_PER_TOK> class GenericConditionalGeneration : public BaseModelForConditionalGeneration
    {
    public:
        typedef CombinedMLP<PanguSparseMoE<NUM_EXPERTS, EXPERTS_PER_TOK>, SiLUMLP> PanguMoEMLP;
        typedef PanguMoEBlock<PanguMoEMLP> MoEBlock;
        typedef BaseModelForConditionalGeneration Base;
        typedef Model<Config, Embedding, RMSNorm, MoEBlock, int, int, int, int, int, int, int, int> ModelClass;
    public:
        GenericConditionalGeneration() = default;

        GenericConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config)
            : BaseModelForConditionalGeneration(MODEL_TYPE_PANGU_MOE, config, runtime_config, 4096 * 4),
              config(config)
        {
            // Size the ggml context: 3 non-layer tensors plus 22 weight
            // tensors per transformer layer.
            const size_t tensor_ovhd = ggml_tensor_overhead();
            const size_t num_tensors = 3 + config.num_hidden_layers * 22;
            const size_t ctx_size = num_tensors * tensor_ovhd;
            w_ctx_.gctx = GGMLContext({.mem_size = ctx_size, .mem_buffer = nullptr, .no_alloc = true});
            w_ctx_.dtype = config.dtype;

            Base::transformer = new ModelClass(
                &w_ctx_, config, false,
                config.hidden_size, config.num_attention_heads,
                config.intermediate_size, config.moe_intermediate_size, config.intermediate_size,
                config.num_key_value_heads, config.hidden_size / config.num_attention_heads,
                config.max_length);

            // Propagate the RoPE base frequency to every attention layer.
            for (int i = 0; i < config.num_hidden_layers; i++)
            {
                auto &layer = Base::get_typed_transformer<ModelClass>()->layers[i];
                layer.attention.freq_base = config.rope_theta;
            }

            w_ctx_.check_used_mem_size(true);
        }

    public:
        Config config;
    };

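    // Instantiation for configurations with 64 experts, 8 of which are routed
    // per token.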
    namespace experts_64
    {
        const int NUM_EXPERTS = 64;
        const int EXPERTS_PER_TOK = 8;

        typedef GenericConditionalGeneration<NUM_EXPERTS, EXPERTS_PER_TOK, EXPERTS_PER_TOK> ConditionalGeneration;
    }

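    // ConditionalGeneration is a thin proxy: it picks the expert-count-specific
    // instantiation at run time based on the loaded config and forwards to it.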
    ConditionalGeneration::ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config)
    {
        switch (config.num_experts)
        {
        case experts_64::NUM_EXPERTS:
            set_proxy_model(new experts_64::ConditionalGeneration(config, runtime_config));
            break;
        default:
            CHATLLM_CHECK(false) << "unsupported MoE param: num_experts = " << config.num_experts;
            break;
        }
    }

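    // Translate between the internal module names (mlp1 = sparse expert
    // branch, mlp2 = dense branch) and the tensor names used in the model
    // file (shared_expert, gate, router_scale, experts).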
    void ConditionalGeneration::load(ModelLoader &loader)
    {
        loader.add_tensor_name_translations({
            {".mlp2.",              ".shared_expert."},
            {".mlp1.gate.",         ".gate."},
            {".mlp1.router_scale",  ".router_scale"},
            {".mlp1.experts.",      ".experts."},
        });

        ModelProxy::load(loader);
    }
}