Commit 980c7db

support GroveMoE
1 parent b777a1c commit 980c7db

20 files changed, +426 -71 lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -85,6 +85,7 @@ set(core_files src/backend.cpp
                 models/granite.cpp
                 models/groq.cpp
                 models/grok.cpp
+                models/grove.cpp
                 models/hermes.cpp
                 models/hunyuan.cpp
                 models/index.cpp

README.md

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ LittleAcademia[<a href="https://github.com/foldl/little-academia" style="text-
 
 **What's New:**
 
+* 2025-09-08: GroveMoE
 * 2025-09-03: Apertus
 * 2025-08-22: Seed-OSS
 * 2025-08-11: GPT-OSS

convert.py

Lines changed: 55 additions & 0 deletions
@@ -215,6 +215,8 @@ class ModelType(Enum):
 
     Apertus = 0x2C00
 
+    GroveMoE = 0x2D00
+
     BCE_Embedding = 0x10000100
     BCE_ReRanker = 0x10000101
     BGE_M3 = 0x10000102
@@ -7527,6 +7529,57 @@ def get_weight_names(config):
 
         return weight_names
 
+class GroveMoEConverter(BaseConverter):
+    MODEL_TYPE = ModelType.GroveMoE
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        assert config.use_sliding_window == False, "use_sliding_window must be False"
+        assert not config.attention_bias
+        assert (config.output_router_logits is None) or (not config.output_router_logits)
+        assert config.rope_scaling is None
+        assert config.norm_topk_prob
+        assert not config.tie_word_embeddings
+        assert config.mlp_only_layers == []
+
+        dump_llama_like_config(f, config, ggml_type)
+
+        config.num_experts_per_group = 2
+        config.parallel_expert_intermediate_size = 128
+        config.small_experts_weight = 0.05
+
+        config_values = [
+            config.num_key_value_heads,
+            config.head_dim,
+            config.rope_theta,
+            config.moe_intermediate_size,
+            config.num_experts_per_tok,
+            config.num_experts,
+            config.num_experts_per_group,
+            config.parallel_expert_intermediate_size,
+            config.small_experts_weight,
+        ]
+        f.write(struct.pack("<iifiiiiif", *config_values))
+
+    @staticmethod
+    def get_weight_names(config):
+        QWen3Converter.layer_is_sparse = [True] * config.num_hidden_layers
+        weight_names = QWen3Converter.get_weight_names(config)
+
+        # Note: `expert_bias` is not used
+        # https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/main/modeling_grove_moe.py#L303
+
+        for i in range(config.num_hidden_layers):
+            for j in range(config.num_experts // config.num_experts_per_group):
+                weight_names += [
+                    f"model.layers.{i}.mlp.chunk_experts.{j}.down_proj.weight",
+                    f"model.layers.{i}.mlp.chunk_experts.{j}.gate_proj.weight",
+                    f"model.layers.{i}.mlp.chunk_experts.{j}.up_proj.weight",
+                ]
+
+        weight_names.sort()
+        return weight_names
+
 def convert_grok_1_base(args, vocab, ggml_type):
     def ffn_size(emb_size, widening_factor):
         _ffn_size = int(widening_factor * emb_size) * 2 // 3
@@ -8128,6 +8181,8 @@ def main():
         SeedOSSConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'ApertusForCausalLM':
        ApertusConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
+    elif arch.endswith('GroveMoeForCausalLM'):
+        GroveMoEConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'deepseek-r1-distill-qwen3':
         QWen3Converter.MODEL_TYPE = ModelType.DeepSeek_R1_Distill_QWen3
         QWen3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
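
The nine values packed by GroveMoEConverter.dump_config land, in the same order, in the fields of struct Config in models/grove.cpp (the converter's parallel_expert_intermediate_size surfaces there as small_experts_intermediate_size). As a rough orientation sketch, not part of this commit, the 36-byte tail of the header can be round-tripped as below; every concrete number is a made-up placeholder except the three the converter hard-codes (group size 2, small-expert intermediate size 128, weight 0.05) and the 128-expert / top-8 shape implied by GroveMoEBlock128_8:

    import struct

    # Placeholder numbers for illustration; only the last three values and the
    # 128-expert / top-8 shape are fixed by this commit.
    config_values = [
        8,        # num_key_value_heads                (int32, placeholder)
        128,      # head_dim                           (int32, placeholder)
        1.0e6,    # rope_theta                         (float32, placeholder)
        768,      # moe_intermediate_size              (int32, placeholder)
        8,        # num_experts_per_tok                (int32)
        128,      # num_experts                        (int32)
        2,        # num_experts_per_group              (int32)
        128,      # parallel_expert_intermediate_size  (int32)
        0.05,     # small_experts_weight               (float32)
    ]

    blob = struct.pack("<iifiiiiif", *config_values)
    assert len(blob) == 9 * 4   # nine little-endian 4-byte fields
    print(struct.unpack("<iifiiiiif", blob))

The "<" in the format string pins little-endian 4-byte fields, so the extra header is exactly 36 bytes.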

docs/models.md

Lines changed: 3 additions & 0 deletions
@@ -89,6 +89,9 @@
     * [x] v3.1: [Instruct-1B-A400M](https://huggingface.co/ibm-granite/granite-3.1-1b-a400m-instruct), [Instruct-3B-A800M](https://huggingface.co/ibm-granite/granite-3.1-3b-a800m-instruct), [Instruct-2B](https://huggingface.co/ibm-granite/granite-3.1-2b-instruct), [Instruct-8B](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct)
     * [x] v3.2: [Instruct-2B](https://huggingface.co/ibm-granite/granite-3.2-2b-instruct), [Instruct-2B](https://huggingface.co/ibm-granite/granite-3.2-8b-instruct), [Instruct-8B](https://huggingface.co/ibm-granite/granite-3.2-8b-instruct/tree/0276d996f60d5eb0b376b6d06622042d4ef3eb4b)
 
+* GroveMoE (`GroveMoeForCausalLM`)
+    * [x] [Inst](https://huggingface.co/inclusionAI/GroveMoE-Inst/tree/b3441abf1f3ed166e58c005ef2c528c584b55764)
+
 * HunYuan (`HunYuanForCausalLM`)
     * [x] ~~Dense: [Instruct-7B](https://huggingface.co/tencent/Hunyuan-7B-Instruct)~~ (lost)
     * [x] Dense: [0.5B-Instruct](https://huggingface.co/tencent/Hunyuan-0.5B-Instruct/tree/9ec1774c379d7dde3f2d7ddd3286cde88949e181),

models/allenai.h

Lines changed: 3 additions & 3 deletions
@@ -91,9 +91,9 @@ namespace chatllm::allenai::moe
         {
             std::string layer_prefix = "model.layers." + std::to_string(Base::layer_ids[i]) + '.';
 
-            loader.read_tensor(layer_prefix + "mlp.experts_down.weight", layer_prefix + "mlp.experts.", _NUM_EXPERTS, ".down_proj.weight", transformer->layers[i].mlp.experts_down.weight);
-            loader.read_tensor(layer_prefix + "mlp.experts_gate.weight", layer_prefix + "mlp.experts.", _NUM_EXPERTS, ".gate_proj.weight", transformer->layers[i].mlp.experts_gate.weight);
-            loader.read_tensor(layer_prefix + "mlp.experts_up.weight", layer_prefix + "mlp.experts.", _NUM_EXPERTS, ".up_proj.weight", transformer->layers[i].mlp.experts_up.weight);
+            loader.read_tensor(layer_prefix + "mlp.experts_down.weight", layer_prefix + "mlp.experts.", _NUM_EXPERTS, ".down_proj.weight", transformer->layers[i].mlp.experts.down.weight);
+            loader.read_tensor(layer_prefix + "mlp.experts_gate.weight", layer_prefix + "mlp.experts.", _NUM_EXPERTS, ".gate_proj.weight", transformer->layers[i].mlp.experts.gate.weight);
+            loader.read_tensor(layer_prefix + "mlp.experts_up.weight", layer_prefix + "mlp.experts.", _NUM_EXPERTS, ".up_proj.weight", transformer->layers[i].mlp.experts.up.weight);
 
             loader.read_tensor(layer_prefix + "mlp.gate.weight", transformer->layers[i].mlp.gate.weight);
 

models/gpt.cpp

Lines changed: 4 additions & 4 deletions
@@ -193,13 +193,13 @@ Reasoning: medium
     ggml::tensor *calc_experts_outputs(ComputeContext *ctx, ggml::tensor *hidden_states,
                                        ggml::tensor *selected_experts) override
     {
-        ggml::tensor *gated = experts_gate.forward(ctx, hidden_states, selected_experts); // [n_ff, num_experts_per_tok, qlen]
-        ggml::tensor *up = experts_up.forward(ctx, hidden_states, selected_experts); // [n_ff, num_experts_per_tok, qlen]
+        ggml::tensor *gated = experts.gate.forward(ctx, hidden_states, selected_experts); // [n_ff, num_experts_per_tok, qlen]
+        ggml::tensor *up = experts.up.forward(ctx, hidden_states, selected_experts); // [n_ff, num_experts_per_tok, qlen]
 
         ggml::tensor *par = ggml::swiglu_oai(ctx, gated, up, alpha, limit); // [n_ff, num_experts_per_tok, qlen]
 
-        ggml::tensor * experts = experts_down.forward(ctx, par, selected_experts); // [hidden_size, num_experts_per_tok, qlen]
-        return experts;
+        ggml::tensor * experts_out = experts.down.forward(ctx, par, selected_experts); // [hidden_size, num_experts_per_tok, qlen]
+        return experts_out;
     }
 private:
     const float limit;

models/granite.cpp

Lines changed: 3 additions & 3 deletions
@@ -200,9 +200,9 @@ namespace chatllm::granite::moe
         {
             std::string layer_prefix = "model.layers." + std::to_string(Base::layer_ids[i]) + '.';
 
-            loader.read_tensor(layer_prefix + "mlp.experts_down.weight", layer_prefix + "block_sparse_moe.experts.", _NUM_EXPERTS, ".down_proj.weight", transformer->layers[i].mlp.experts_down.weight);
-            loader.read_tensor(layer_prefix + "mlp.experts_gate.weight", layer_prefix + "block_sparse_moe.experts.", _NUM_EXPERTS, ".gate_proj.weight", transformer->layers[i].mlp.experts_gate.weight);
-            loader.read_tensor(layer_prefix + "mlp.experts_up.weight", layer_prefix + "block_sparse_moe.experts.", _NUM_EXPERTS, ".up_proj.weight", transformer->layers[i].mlp.experts_up.weight);
+            loader.read_tensor(layer_prefix + "mlp.experts_down.weight", layer_prefix + "block_sparse_moe.experts.", _NUM_EXPERTS, ".down_proj.weight", transformer->layers[i].mlp.experts.down.weight);
+            loader.read_tensor(layer_prefix + "mlp.experts_gate.weight", layer_prefix + "block_sparse_moe.experts.", _NUM_EXPERTS, ".gate_proj.weight", transformer->layers[i].mlp.experts.gate.weight);
+            loader.read_tensor(layer_prefix + "mlp.experts_up.weight", layer_prefix + "block_sparse_moe.experts.", _NUM_EXPERTS, ".up_proj.weight", transformer->layers[i].mlp.experts.up.weight);
 
             loader.read_tensor(layer_prefix + "block_sparse_moe.router.layer.weight",
                                transformer->layers[i].mlp.gate.weight);

models/grok.cpp

Lines changed: 3 additions & 3 deletions
@@ -142,9 +142,9 @@ namespace chatllm::grok::v1
         {
             std::string layer_prefix = "model.layers." + std::to_string(layer_ids[i]) + '.';
 
-            loader.read_tensor(layer_prefix + "mlp.experts_down.weight", layer_prefix + "experts.", config.num_experts, ".w2.weight", transformer->layers[i].mlp.experts_down.weight);
-            loader.read_tensor(layer_prefix + "mlp.experts_gate.weight", layer_prefix + "experts.", config.num_experts, ".w1.weight", transformer->layers[i].mlp.experts_gate.weight);
-            loader.read_tensor(layer_prefix + "mlp.experts_up.weight", layer_prefix + "experts.", config.num_experts, ".w3.weight", transformer->layers[i].mlp.experts_up.weight);
+            loader.read_tensor(layer_prefix + "mlp.experts_down.weight", layer_prefix + "experts.", config.num_experts, ".w2.weight", transformer->layers[i].mlp.experts.down.weight);
+            loader.read_tensor(layer_prefix + "mlp.experts_gate.weight", layer_prefix + "experts.", config.num_experts, ".w1.weight", transformer->layers[i].mlp.experts.gate.weight);
+            loader.read_tensor(layer_prefix + "mlp.experts_up.weight", layer_prefix + "experts.", config.num_experts, ".w3.weight", transformer->layers[i].mlp.experts.up.weight);
 
             loader.read_tensor(layer_prefix + "self_attn.k_proj.weight", transformer->layers[i].attention.k_proj.weight);
             loader.read_tensor(layer_prefix + "self_attn.o_proj.weight", transformer->layers[i].attention.o_proj.weight);

models/grove.cpp

Lines changed: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
+#include "qwen.h"
+
+namespace chatllm::grove::moe
+{
+    struct Config : BaseConfig
+    {
+        int num_key_value_heads;
+        int head_dim;
+        float rope_theta;
+        int moe_intermediate_size;
+        int num_experts_per_tok;
+        int num_experts;
+        int num_experts_per_group;
+        int small_experts_intermediate_size;
+        float small_experts_weight;
+    };
+
+    typedef qwen::v3::Tokenizer Tokenizer;
+
+    // TODO: optimization: same small expert might be calculated twice.
+    class BigLittleGroupedSparseMoE : public BaseSparseMLP
+    {
+    public:
+        BigLittleGroupedSparseMoE(InitContext *ctx, int hidden_size, int intermediate_size, int num_local_experts, int num_experts_per_tok,
+                                  int group_size, int small_experts_intermediate_size);
+        int64_t get_param_num(bool effective_only) const override;
+        void load(const std::string &path, TensorLoader *loader) override;
+
+    protected:
+        ggml::tensor *forward_with_experts(ComputeContext *ctx, ggml::tensor *hidden_states,
+                                           ggml::tensor *selected_experts,
+                                           ggml::tensor *weights) override;
+    public:
+        MultiMLP small_experts;
+        const int group_size;
+        const int small_experts_intermediate_size;
+        float small_experts_weight;
+    };
+
+    BigLittleGroupedSparseMoE::BigLittleGroupedSparseMoE(InitContext *ctx, int hidden_size, int intermediate_size, int num_local_experts, int num_experts_per_tok,
+                                                         int group_size, int small_experts_intermediate_size)
+        : BaseSparseMLP(ctx, hidden_size, intermediate_size, num_local_experts, num_experts_per_tok, ActFunc::SILU, false),
+          small_experts(ctx, hidden_size, small_experts_intermediate_size, num_local_experts / group_size, num_experts_per_tok, ActFunc::SILU, false, group_size),
+          group_size(group_size), small_experts_intermediate_size(small_experts_intermediate_size),
+          small_experts_weight(0.5f)
+    {
+    }
+
+    int64_t BigLittleGroupedSparseMoE::get_param_num(bool effective_only) const
+    {
+        int64_t r = 0;
+        r += small_experts.get_param_num(effective_only);
+        r += BaseSparseMLP::get_param_num(effective_only);
+        return r;
+    }
+
+    void BigLittleGroupedSparseMoE::load(const std::string &path, TensorLoader *loader)
+    {
+        BaseSparseMLP::load(path, loader);
+
+        small_experts.load(path + "chunk_experts.", loader);
+    }
+
+    // selected_experts: [qlen, num_experts_per_tok]
+    // weights: [1, num_experts_per_tok, qlen]
+    ggml::tensor *BigLittleGroupedSparseMoE::forward_with_experts(ComputeContext *ctx, ggml::tensor *hidden_states,
+                                                                  ggml::tensor *selected_experts,
+                                                                  ggml::tensor *weights)
+    {
+        ggml::tensor * large_out = BaseSparseMLP::forward_with_experts(ctx, hidden_states, selected_experts, weights);
+        ggml::tensor * small_out = BaseSparseMLP::forward_with_experts(ctx, hidden_states, selected_experts, weights,
+            [this](ComputeContext *ctx, ggml::tensor *hidden_states, ggml::tensor *selected_experts)
+            {
+                return small_experts.forward(ctx, hidden_states, selected_experts);
+            });
+
+        ggml::tensor * r = ggml::add(ctx, large_out, small_out);
+
+        return r;
+    }
+
+    #define SMALL_EXPERTS_GROUP_SIZE 2
+    #define SMALL_EXPERTS_INTERMEDIATE_SIZE 128
+
+    template <int NUM_EXPERTS, int EXPERTS_PER_TOK> class GroveSparseMoE : public BigLittleGroupedSparseMoE
+    {
+    public:
+        GroveSparseMoE(InitContext *ctx, int hidden_size, int intermediate_size)
+            : BigLittleGroupedSparseMoE(ctx, hidden_size, intermediate_size, NUM_EXPERTS, EXPERTS_PER_TOK, SMALL_EXPERTS_GROUP_SIZE, SMALL_EXPERTS_INTERMEDIATE_SIZE)
+        {}
+    };
+
+    template <int NUM_EXPERTS, int EXPERTS_PER_TOK> class GroveMoEBlock : public
+        LMBlock1<RMSNorm, qwen::v3::QWen3SelfAttention, RMSNorm, GroveSparseMoE<NUM_EXPERTS, EXPERTS_PER_TOK>>
+    {
+    public:
+        typedef GroveSparseMoE<NUM_EXPERTS, EXPERTS_PER_TOK> MoEMLP;
+    public:
+        GroveMoEBlock(InitContext *ctx, int hidden_size, int num_attention_heads, int intermediate_size,
+                      int mlp_intermediate_size,
+                      int num_kv_heads,
+                      int head_dim, int max_length)
+            : LMBlock1<RMSNorm, qwen::v3::QWen3SelfAttention, RMSNorm, MoEMLP>(ctx, hidden_size, num_attention_heads, intermediate_size, mlp_intermediate_size,
+                                                                               num_kv_heads, head_dim, max_length)
+        {}
+    };
+
+    typedef GroveMoEBlock<128, 8> GroveMoEBlock128_8;
+
+    class ConditionalGeneration : public BaseModelForConditionalGeneration
+    {
+    public:
+        typedef Model<Config, Embedding, RMSNorm, GroveMoEBlock128_8, int, int, int, int, int, int, int> ModelClass;
+    public:
+        ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config,
+                              ModelType type = ModelType::MODEL_TYPE_GROVE_MOE);
+    };
+
+
+    ConditionalGeneration::ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config, ModelType type)
+        : BaseModelForConditionalGeneration(type, config, runtime_config, 4096 * 4)
+    {
+        const size_t tensor_ovhd = ggml_tensor_overhead();
+        const size_t num_tensors = 3 + config.num_hidden_layers * (14 + 1 + 3);
+        const size_t ctx_size = num_tensors * tensor_ovhd;
+        w_ctx_.gctx = GGMLContext({.mem_size = ctx_size, .mem_buffer = nullptr, .no_alloc = true});
+        w_ctx_.dtype = config.dtype;
+
+        CHATLLM_CHECK(config.num_experts_per_group == SMALL_EXPERTS_GROUP_SIZE);
+        CHATLLM_CHECK(config.small_experts_intermediate_size == SMALL_EXPERTS_INTERMEDIATE_SIZE);
+
+        transformer = new ModelClass(&w_ctx_, config, false, config.hidden_size, config.num_attention_heads, config.intermediate_size,
+                                     config.moe_intermediate_size, config.num_key_value_heads, config.head_dim, config.max_length);
+
+        for (int i = 0; i < config.num_hidden_layers; i++)
+        {
+            auto &layer = get_typed_transformer<ModelClass>()->layers[i];
+            layer.attention.freq_base = config.rope_theta;
+            layer.mlp.small_experts_weight = config.small_experts_weight;
+        }
+
+        w_ctx_.check_used_mem_size(true);
+    }
+
+    REGISTER_MODEL_LOADER(GROVE_MOE, moe, 1);
+}
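
For orientation only, not part of the commit: GroveMoE pairs every group of num_experts_per_group routed experts with one small "chunk" (adjugate) expert, so a selected expert e reuses small expert e // group_size, and the layer output is the routed sum of the large-expert output plus a scaled small-expert output. The NumPy sketch below spells out that per-token arithmetic under stated assumptions: router_w, the softmax-over-top-k renormalization (norm_topk_prob is asserted by the converter), and applying small_experts_weight as a plain scale are illustrative choices, and where exactly the 0.05 scale enters the ggml graph is not visible in this file.

    import numpy as np

    def silu(x):
        return x / (1.0 + np.exp(-x))

    def swiglu_mlp(x, w_gate, w_up, w_down):
        # Gated MLP: down( silu(gate(x)) * up(x) )
        return w_down @ (silu(w_gate @ x) * (w_up @ x))

    def grove_moe_token(x, big, small, router_w, top_k=8, group_size=2, small_weight=0.05):
        # big[e] / small[g] are (w_gate, w_up, w_down) triples; small experts are
        # shared per group, so routed expert e uses small expert g = e // group_size.
        logits = router_w @ x
        top = np.argsort(logits)[-top_k:]
        probs = np.exp(logits[top] - logits[top].max())
        probs /= probs.sum()                      # renormalized top-k routing weights
        out = np.zeros_like(x)
        for p, e in zip(probs, top):
            g = e // group_size
            out += p * (swiglu_mlp(x, *big[e]) + small_weight * swiglu_mlp(x, *small[g]))
        return out

This also illustrates the TODO in the file above: when two selected experts fall into the same group, the shared small expert is evaluated once per selection rather than once per group.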

models/llama.h

Lines changed: 3 additions & 3 deletions
@@ -520,9 +520,9 @@ namespace chatllm::llama::v4
             auto *layer = dynamic_cast<LlamaMoEBlock *>(transformer->get_layer(i));
             attention = &layer->attention;
 
-            loader.read_tensor(layer_prefix + "mlp.mlp1.experts_down.weight", layer_prefix + "mlp.experts.", config.n_routed_experts, ".down_proj.weight", layer->mlp.mlp1.experts_down.weight);
-            loader.read_tensor(layer_prefix + "mlp.mlp1.experts_gate.weight", layer_prefix + "mlp.experts.", config.n_routed_experts, ".gate_proj.weight", layer->mlp.mlp1.experts_gate.weight);
-            loader.read_tensor(layer_prefix + "mlp.mlp1.experts_up.weight", layer_prefix + "mlp.experts.", config.n_routed_experts, ".up_proj.weight", layer->mlp.mlp1.experts_up.weight);
+            loader.read_tensor(layer_prefix + "mlp.mlp1.experts_down.weight", layer_prefix + "mlp.experts.", config.n_routed_experts, ".down_proj.weight", layer->mlp.mlp1.experts.down.weight);
+            loader.read_tensor(layer_prefix + "mlp.mlp1.experts_gate.weight", layer_prefix + "mlp.experts.", config.n_routed_experts, ".gate_proj.weight", layer->mlp.mlp1.experts.gate.weight);
+            loader.read_tensor(layer_prefix + "mlp.mlp1.experts_up.weight", layer_prefix + "mlp.experts.", config.n_routed_experts, ".up_proj.weight", layer->mlp.mlp1.experts.up.weight);
 
             loader.read_tensor(layer_prefix + "mlp.gate.weight", layer->mlp.mlp1.gate.weight);
 
