Commit 12471b9

support Pangu-Pro-MoE
1 parent 67e3746 commit 12471b9

11 files changed (+361 -20 lines)

CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -59,6 +59,7 @@ set(core_files src/backend.cpp
 models/hunyuan.cpp
 models/llama.cpp
 models/qwen.cpp
+models/pangu.cpp
 )

 add_library(libchatllm SHARED EXCLUDE_FROM_ALL src/main.cpp ${core_files})

README.md

Lines changed: 1 addition & 0 deletions

@@ -13,6 +13,7 @@ pure C++ implementation based on [@ggerganov](https://github.com/ggerganov)'s [g

 **What's New:**

+* 2025-07-05: Pangu-Pro-MoE
 * 2025-07-04: ERNIE-MoE
 * 2025-06-30: Hunyuan-A13B, ERNIE-Dense
 * 2025-06-21: [I can hear](./docs/multimodal.md): Qwen2-Audio

convert.py

Lines changed: 65 additions & 0 deletions

@@ -199,6 +199,8 @@ class ModelType(Enum):

     ERNIE_MoE = 0x2500

+    PenguMoE = 0x2600
+
     BCE_Embedding = 0x10000100
     BCE_ReRanker = 0x10000101
     BGE_M3 = 0x10000102

@@ -4765,6 +4767,67 @@ def get_weight_names(config):

         return weight_names

+class PanguMoEConverter(BaseConverter):
+    MODEL_TYPE = ModelType.PenguMoE
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        assert not config.tie_word_embeddings
+        assert config.intermediate_size is None
+        config.intermediate_size = config.shared_expert_intermediate_size
+
+        dump_llama_like_config(f, config, ggml_type)
+
+        config_values = [
+            config.num_key_value_heads,
+            config.moe_intermediate_size,
+            config.num_experts_per_tok,
+            config.num_experts,
+        ]
+        f.write(struct.pack("i" * len(config_values), *config_values))
+        f.write(struct.pack("<f", config.rope_theta))
+
+    @staticmethod
+    def get_weight_names(config):
+        weight_names = ["model.embed_tokens.weight"]
+        for i in range(config.num_hidden_layers):
+
+            weight_names += [
+                f"model.layers.{i}.input_layernorm.weight",
+            ]
+
+            for j in range(config.num_experts):
+                weight_names += [
+                    f"model.layers.{i}.mlp.experts.{j}.down_proj.weight",
+                    f"model.layers.{i}.mlp.experts.{j}.gate_proj.weight",
+                    f"model.layers.{i}.mlp.experts.{j}.up_proj.weight",
+                ]
+
+            weight_names += [
+                f"model.layers.{i}.mlp.gate.weight",
+                f"model.layers.{i}.mlp.shared_expert.down_proj.weight",
+                f"model.layers.{i}.mlp.shared_expert.gate_proj.weight",
+                f"model.layers.{i}.mlp.shared_expert.up_proj.weight",
+                f"model.layers.{i}.mlp.router_scale",
+
+                f"model.layers.{i}.post_attention_layernorm.weight",
+                f"model.layers.{i}.self_attn.k_proj.weight",
+                f"model.layers.{i}.self_attn.k_proj.bias",
+                f"model.layers.{i}.self_attn.q_proj.weight",
+                f"model.layers.{i}.self_attn.q_proj.bias",
+                f"model.layers.{i}.self_attn.v_proj.weight",
+                f"model.layers.{i}.self_attn.v_proj.bias",
+                f"model.layers.{i}.self_attn.o_proj.weight",
+                f"model.layers.{i}.self_attn.o_proj.bias",
+            ]
+
+        weight_names += [
+            "model.norm.weight",
+            "lm_head.weight"
+        ]
+
+        return weight_names
+
 class QWen3Converter(BaseConverter):
     MODEL_TYPE = ModelType.QWen3

@@ -7598,6 +7661,8 @@ def main():
         ERNIEDenseConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'Ernie4_5_MoeForCausalLM':
         ERNIEMoEConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
+    elif arch == 'PanguProMoEForCausalLM':
+        PanguMoEConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'deepseek-r1-distill-qwen3':
         QWen3Converter.MODEL_TYPE = ModelType.DeepSeek_R1_Distill_QWen3
         QWen3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
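
For reference, dump_config above writes the Pangu-specific fields right after the llama-like header, in the same order as the Config struct added in models/pangu.h: four int32 values followed by one float32 rope_theta. A minimal read-back sketch (read_pangu_extra_config is a hypothetical helper, not part of this commit; it assumes a little-endian host, matching what struct.pack("i" * n) produces on typical platforms):

import struct

def read_pangu_extra_config(f):
    # four int32 fields written by struct.pack("i" * 4, ...)
    (num_key_value_heads, moe_intermediate_size,
     num_experts_per_tok, num_experts) = struct.unpack("4i", f.read(16))
    # one little-endian float32 written by struct.pack("<f", ...)
    (rope_theta,) = struct.unpack("<f", f.read(4))
    return {
        "num_key_value_heads": num_key_value_heads,
        "moe_intermediate_size": moe_intermediate_size,
        "num_experts_per_tok": num_experts_per_tok,
        "num_experts": num_experts,
        "rope_theta": rope_theta,
    }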

docs/models.md

Lines changed: 3 additions & 0 deletions

@@ -193,6 +193,9 @@
 * Orion (`OrionForCausalLM`)
     * [x] [Chat-14B](https://huggingface.co/OrionStarAI/Orion-14B-Chat)

+* Pangu (`PanguProMoEForCausalLM`)
+    * [x] [Pro-MoE](https://gitcode.com/ascend-tribe/pangu-pro-moe-model/tree/15e45a97fa314d86804f93f7faba107b43f8d25c)
+
 * Phi (`PhiForCausalLM`, `Phi3ForCausalLM`)
     * [x] [Phi-2](https://huggingface.co/microsoft/phi-2/tree/eb8bbd1d37d258ea74fb082c53346d33056a83d4)

models/pangu.cpp

Lines changed: 176 additions & 0 deletions

@@ -0,0 +1,176 @@
#include "pangu.h"

namespace chatllm::pangu::moe
{
    class ChatHistoryEncoder : public BaseHistoryEncoder
    {
    public:
        void append_sys_prompt(std::vector<int> &ids) const override;
        void append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const override;
        void append_user(int round_idx, const std::string &user, std::vector<int> &ids) const override;
        void append_user_opening(int round_idx, std::vector<int> &ids) const override;
        void append_ai_opening(int round_idx, std::vector<int> &ids) const override;
    };
    static ChatHistoryEncoder _chat_encoder;

    Tokenizer::Tokenizer(const Config &config)
        : Tokenizer(config, &_chat_encoder)
    {}

    Tokenizer::Tokenizer(const BaseConfig &config, BaseHistoryEncoder *encoder)
        : BaseTokenizer::BaseTokenizer(config, encoder)
    {
        sys_prompt = R"""(你必须严格遵守法律法规和社会道德规范。生成任何内容时,都应避免涉及暴力、色情、恐怖主义、种族歧视、性别歧视等不当内容。一旦检测到输入或输出有此类倾向,应拒绝回答并发出警告。例如,如果输入内容包含暴力威胁或色情描述,应返回错误信息:“您的输入包含不当内容,无法处理。)""";
    }

    size_t Tokenizer::load(tokenizer::DataReader *buffer, int n_vocab)
    {
        tp = new tokenizer::BPEProcessor1();
        size_t size = tp->Load(buffer, n_vocab);

        pad_token_id = tp->PieceToId("<pad>");
        unused9_token_id = tp->PieceToId("[unused9]");
        unused10_token_id = tp->PieceToId("[unused10]");
        tp->OverrideTokenDecoding(tp->PieceToId("[unused16]"), "<think>");
        tp->OverrideTokenDecoding(tp->PieceToId("[unused17]"), "</think>");
        return size;
    }

    void Tokenizer::encode_item(const char *tag, std::vector<int> &ids)
    {
        ids.push_back(unused9_token_id);
        encode(std::string(tag) + "", ids);
    }

    void Tokenizer::encode_item(const char *tag, const std::string &content, std::vector<int> &ids)
    {
        ids.push_back(unused9_token_id);
        encode(std::string(tag) + "" + content, ids);
        ids.push_back(unused10_token_id);
    }

    void ChatHistoryEncoder::append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        tok->encode_item("助手", ai, ids);
    }

    void ChatHistoryEncoder::append_sys_prompt(std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        ids.push_back(tok->bos_token_id);
        tok->encode_item("系统", tok->get_system_prompt(), ids);
    }

    void ChatHistoryEncoder::append_user(int round_idx, const std::string &user, std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        tok->encode_item("用户", user, ids);
    }

    void ChatHistoryEncoder::append_user_opening(int round_idx, std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        tok->encode_item("用户", ids);
    }

    void ChatHistoryEncoder::append_ai_opening(int round_idx, std::vector<int> &ids) const
    {
        Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
        tok->encode_item("助手", ids);
    }

    template <class PanguMoEMLP> class PanguMoEBlock : public LMBlock1<RMSNorm, FullBiasedSelfAttention, RMSNorm, PanguMoEMLP>
    {
    public:
        PanguMoEBlock(InitContext *ctx, int hidden_size, int num_attention_heads, int intermediate_size,
                      int mlp_intermediate_size1, int mlp_intermediate_size2,
                      int num_kv_heads, int head_dim, int max_length)
            : LMBlock1<RMSNorm, FullBiasedSelfAttention, RMSNorm, PanguMoEMLP>(ctx, hidden_size, num_attention_heads, intermediate_size, mlp_intermediate_size1, mlp_intermediate_size2,
                      num_kv_heads, head_dim, max_length)
        {}
    };

    template <int NUM_EXPERTS, int EXPERTS_PER_TOK> class PanguSparseMoE : public BaseSparseMLP
    {
    public:
        PanguSparseMoE(InitContext *ctx, int hidden_size, int intermediate_size)
            : BaseSparseMLP(ctx, hidden_size, intermediate_size, NUM_EXPERTS, EXPERTS_PER_TOK, ActFunc::SILU, false, true, true)
        {
            norm_topk_prob = false;
        }
    };

    template <const int NUM_EXPERTS, const int EXPERTS_PER_TOK, const int EFFECTIVE_EXPERTS_PER_TOK> class GenericConditionalGeneration : public BaseModelForConditionalGeneration
    {
    public:
        typedef CombinedMLP<PanguSparseMoE<NUM_EXPERTS, EXPERTS_PER_TOK>, SiLUMLP> PanguMoEMLP;
        typedef PanguMoEBlock<PanguMoEMLP> MoEBlock;
        typedef BaseModelForConditionalGeneration Base;
        typedef Model<Config, Embedding, RMSNorm, MoEBlock, int, int, int, int, int, int, int, int> ModelClass;
    public:
        GenericConditionalGeneration() = default;

        GenericConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config)
            : BaseModelForConditionalGeneration(MODEL_TYPE_PANGU_MOE, config, runtime_config, 4096 * 4),
              config(config)
        {
            const size_t tensor_ovhd = ggml_tensor_overhead();
            const size_t num_tensors = 3 + config.num_hidden_layers * 22;
            const size_t ctx_size = num_tensors * tensor_ovhd;
            w_ctx_.gctx = GGMLContext({.mem_size = ctx_size, .mem_buffer = nullptr, .no_alloc = true});
            w_ctx_.dtype = config.dtype;

            Base::transformer = new ModelClass(
                                &w_ctx_, config, false,
                                config.hidden_size, config.num_attention_heads,
                                config.intermediate_size, config.moe_intermediate_size, config.intermediate_size,
                                config.num_key_value_heads, config.hidden_size / config.num_attention_heads,
                                config.max_length);

            for (int i = 0; i < config.num_hidden_layers; i++)
            {
                auto &layer = Base::get_typed_transformer<ModelClass>()->layers[i];
                layer.attention.freq_base = config.rope_theta;
            }

            w_ctx_.check_used_mem_size(true);
        }

    public:
        Config config;
    };

    namespace experts_64
    {
        const int NUM_EXPERTS = 64;
        const int EXPERTS_PER_TOK = 8;

        typedef GenericConditionalGeneration<NUM_EXPERTS, EXPERTS_PER_TOK, EXPERTS_PER_TOK> ConditionalGeneration;
    }

    ConditionalGeneration::ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config)
    {
        switch (config.num_experts)
        {
        case experts_64::NUM_EXPERTS:
            set_proxy_model(new experts_64::ConditionalGeneration(config, runtime_config));
            break;
        default:
            CHATLLM_CHECK(false) << "unsupported MoE param: num_experts = " << config.num_experts;
            break;
        }
    }

    void ConditionalGeneration::load(ModelLoader &loader)
    {
        loader.add_tensor_name_translations({
            {".mlp2.", ".shared_expert."},
            {".mlp1.gate.", ".gate."},
            {".mlp1.router_scale", ".router_scale"},
            {".mlp1.experts.", ".experts."},
        });

        ModelProxy::load(loader);
    }
}
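
As a rough illustration (not code from this commit): the ChatHistoryEncoder above frames every message as an item opened by [unused9] and closed by [unused10], with role tags 系统 (system), 用户 (user) and 助手 (assistant); the default sys_prompt set in the Tokenizer constructor is a Chinese content-safety instruction, and [unused16]/[unused17] decode to <think>/</think>. A string-level sketch of the resulting layout, assuming "<s>" stands in for the BOS token (the real code emits token ids, not text):

def build_pangu_prompt(system: str, history: list[tuple[str, str]], user: str) -> str:
    def item(tag: str, content: str) -> str:
        # mirrors Tokenizer::encode_item(tag, content, ids)
        return "[unused9]" + tag + content + "[unused10]"

    text = "<s>" + item("系统", system)      # append_sys_prompt: BOS + system item
    for u, a in history:                      # earlier rounds
        text += item("用户", u) + item("助手", a)
    text += item("用户", user)                # current user turn
    text += "[unused9]" + "助手"              # append_ai_opening: left unclosed for generation
    return text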

models/pangu.h

Lines changed: 38 additions & 0 deletions

@@ -0,0 +1,38 @@
#pragma once

#include "../src/models.h"
#include "../src/models_priv.h"

namespace chatllm::pangu::moe
{
    struct Config : public BaseConfig
    {
        int num_key_value_heads;
        int moe_intermediate_size;
        int num_experts_per_tok;
        int num_experts;

        float rope_theta;
    };

    class Tokenizer : public BaseTokenizer
    {
    public:
        Tokenizer(const Config &config);
        Tokenizer(const BaseConfig &config, BaseHistoryEncoder *encoder);

        size_t load(tokenizer::DataReader *buffer, int n_vocab) override;
        void encode_item(const char *tag, std::vector<int> &ids);
        void encode_item(const char *tag, const std::string &content, std::vector<int> &ids);
    public:
        int unused9_token_id;
        int unused10_token_id;
    };

    class ConditionalGeneration : public ModelProxy
    {
    public:
        ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config);
        void load(ModelLoader &loader);
    };
}

scripts/models.json

Lines changed: 16 additions & 0 deletions

@@ -2973,5 +2973,21 @@
                 }
             }
         }
+    },
+    "pangu": {
+        "brief": "Pangu Pro MoE models are released by Huawei.",
+        "default": "a16b",
+        "license": "Pangu Model License Agreement",
+        "variants": {
+            "a16b": {
+                "default": "q4_0",
+                "quantized": {
+                    "q4_0": {
+                        "size": 40500401920,
+                        "url": "chatllm_quantized_pangu/pangu-pro-moe-a16b-q4_0.bin"
+                    }
+                }
+            }
+        }
     }
 }
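
Illustrative only, assuming scripts/models.json keeps its existing layout (a flat top-level object of model entries), the new "pangu" entry resolves like any other:

import json

with open("scripts/models.json", encoding="utf-8") as f:
    models = json.load(f)

entry = models["pangu"]
variant = entry["variants"][entry["default"]]      # default variant: "a16b"
quant = variant["quantized"][variant["default"]]   # default quantization: "q4_0"
print(quant["size"], quant["url"])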
