Commit 8844e33

committed

fix code

Signed-off-by: qingjun <[email protected]>
fix code
Signed-off-by: qingjun <[email protected]>
fix code
Signed-off-by: qingjun <[email protected]>
rm files
Signed-off-by: qingjun <[email protected]>
1 parent 6385b84 commit 8844e33

15 files changed: +895 / -98 lines

convert_hf_to_gguf.py

Lines changed: 65 additions & 0 deletions

@@ -782,6 +782,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
             # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
             res = "deepseek-r1-qwen"
+        if chkhsh == "f4f37b6c8eb9ea29b3eac6bb8c8487c5ab7885f8d8022e67edc1c68ce8403e95":
+            # ref: https://huggingface.co/MiniMaxAI/MiniMax-Text-01
+            res = "minimax-01"
         if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e":
             # ref: https://huggingface.co/Xenova/gpt-4o
             res = "gpt-4o"

@@ -6161,6 +6164,68 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter

         return [(self.map_tensor_name(name), data_torch)]

+@ModelBase.register("MiniMaxText01ForCausalLM")
+class MiniMaxText01Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.MINIMAX01
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        layernorm_full_attention_alpha = self.hparams["layernorm_full_attention_alpha"]
+        layernorm_full_attention_beta = self.hparams["layernorm_full_attention_beta"]
+        layernorm_linear_attention_alpha = self.hparams["layernorm_linear_attention_alpha"]
+        layernorm_linear_attention_beta = self.hparams["layernorm_linear_attention_beta"]
+        layernorm_mlp_alpha = self.hparams["layernorm_mlp_alpha"]
+        layernorm_mlp_beta = self.hparams["layernorm_mlp_beta"]
+        assert layernorm_full_attention_alpha == layernorm_linear_attention_alpha == layernorm_mlp_alpha
+        assert layernorm_full_attention_beta == layernorm_linear_attention_beta == layernorm_mlp_beta == 1.0
+        # we do not store the layernorm betas as they are all 1.0
+        # layernorm alphas are stored as single residual_scale hparam
+        self.gguf_writer.add_residual_scale(layernorm_full_attention_alpha)
+
+        self.gguf_writer.add_rope_dimension_count(self.hparams["rotary_dim"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+
+        # process the experts separately
+        if name.find("block_sparse_moe.experts") != -1:
+            n_experts = self.hparams["num_local_experts"]
+
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for wid in ["w1", "w2", "w3"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]

 @ModelBase.register("UltravoxModel")
 class UltravoxWhisperEncoderModel(WhisperEncoderModel):
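Note on the expert merge above: modify_tensors() buffers the per-expert w1/w2/w3 matrices for a layer and, once all n_experts * 3 tensors have been seen, stacks each projection into a single 3-D tensor. A minimal standalone sketch of that stacking step, with made-up toy dimensions (the sizes below are illustrative, not MiniMax-Text-01 values):

import torch

n_experts, n_ff, n_embd = 4, 8, 6  # toy sizes, not taken from the model config

# one w1 matrix per expert, as the tensors arrive from the HF checkpoint
w1_per_expert = [torch.randn(n_ff, n_embd) for _ in range(n_experts)]

# stack along a new leading dimension, mirroring torch.stack(datas, dim=0) above
w1_merged = torch.stack(w1_per_expert, dim=0)

assert w1_merged.shape == (n_experts, n_ff, n_embd)  # -> (4, 8, 6)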

gguf-py/gguf/constants.py

Lines changed: 22 additions & 0 deletions

@@ -341,6 +341,7 @@ class MODEL_ARCH(IntEnum):
     GRANITE_MOE = auto()
     CHAMELEON = auto()
     WAVTOKENIZER_DEC = auto()
+    MINIMAX01 = auto()
     PLM = auto()
     BAILINGMOE = auto()

@@ -374,6 +375,7 @@ class MODEL_TENSOR(IntEnum):
     ATTN_NORM_2 = auto()
     ATTN_OUT_NORM = auto()
     ATTN_POST_NORM = auto()
+    ATTN_GATE = auto()
     ATTN_ROT_EMBD = auto()
     FFN_GATE_INP = auto()
     FFN_GATE_INP_SHEXP = auto()

@@ -621,6 +623,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.GRANITE_MOE: "granitemoe",
     MODEL_ARCH.CHAMELEON: "chameleon",
     MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
+    MODEL_ARCH.MINIMAX01: "minimax01",
     MODEL_ARCH.PLM: "plm",
     MODEL_ARCH.BAILINGMOE: "bailingmoe",
 }

@@ -657,6 +660,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
     MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
     MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm",
+    MODEL_TENSOR.ATTN_GATE: "blk.{bid}.attn_gate",
     MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
     MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
     MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",

@@ -2024,6 +2028,24 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.POSNET_ATTN_V,
         MODEL_TENSOR.POSNET_ATTN_OUT,
     ],
+    MODEL_ARCH.MINIMAX01: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_NORM_2,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_GATE,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
     MODEL_ARCH.BAILINGMOE: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,

gguf-py/gguf/tensor_mapping.py

Lines changed: 6 additions & 0 deletions

@@ -142,6 +142,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.layer_norm_1", # jina-v2-code
             "rwkv.blocks.{bid}.ln2", # rwkv6
             "model.layers.{bid}.ln2", # rwkv7
+            "model.layers.{bid}.self_attn.norm", # minimax_text-01
         ),

         # Attention query-key-value

@@ -229,6 +230,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.self_attention.dense", # chatglm
             "transformer.layers.{bid}.attn.out_proj", # openelm
             "transformer.h.{bid}.attn.attention.out_proj", # exaone
+            "model.layers.{bid}.self_attn.out_proj", # minimax_text-01
             "model.layers.{bid}.self_attn.o_proj", # llama4
         ),

@@ -253,6 +255,10 @@ class TensorNameMap:
             "transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell
         ),

+        MODEL_TENSOR.ATTN_GATE: (
+            "model.layers.{bid}.self_attn.output_gate", # minimax-text-01
+        ),
+
         # Feed-forward norm
         MODEL_TENSOR.FFN_NORM: (
             "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox

include/llama.h

Lines changed: 1 addition & 0 deletions

@@ -114,6 +114,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_LLAMA4     = 33,
         LLAMA_VOCAB_PRE_TYPE_PIXTRAL    = 34,
         LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
+        LLAMA_VOCAB_PRE_TYPE_MINIMAX    = 36,
     };

     enum llama_rope_type {

minimax_implementation.patch

Whitespace-only changes.

src/llama-arch.cpp

Lines changed: 23 additions & 0 deletions

@@ -70,6 +70,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GRANITE_MOE,      "granitemoe"       },
     { LLM_ARCH_CHAMELEON,        "chameleon"        },
     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
+    { LLM_ARCH_MINIMAX01,        "minimax01"        },
     { LLM_ARCH_PLM,              "plm"              },
     { LLM_ARCH_BAILINGMOE,       "bailingmoe"       },
     { LLM_ARCH_UNKNOWN,          "(unknown)"        },

@@ -1532,6 +1533,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
         },
     },
+    {
+        LLM_ARCH_MINIMAX01,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,    "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,   "output_norm" },
+            { LLM_TENSOR_OUTPUT,        "output" },
+            { LLM_TENSOR_ATTN_NORM,     "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_NORM_2,   "blk.%d.attn_norm_2" },
+            { LLM_TENSOR_ATTN_QKV,      "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q,        "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,        "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_GATE,     "blk.%d.attn_gate" },
+            { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,  "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,   "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_BAILINGMOE,
         {

@@ -1582,6 +1604,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_ATTN_V,    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_QKV,  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_OUT,  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_FFN_GATE,  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_FFN_DOWN,  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_FFN_UP,    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},

src/llama-arch.h

Lines changed: 2 additions & 0 deletions

@@ -74,6 +74,7 @@ enum llm_arch {
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_CHAMELEON,
     LLM_ARCH_WAVTOKENIZER_DEC,
+    LLM_ARCH_MINIMAX01,
     LLM_ARCH_PLM,
     LLM_ARCH_BAILINGMOE,
     LLM_ARCH_UNKNOWN,

@@ -236,6 +237,7 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_V,
     LLM_TENSOR_ATTN_QKV,
     LLM_TENSOR_ATTN_OUT,
+    LLM_TENSOR_ATTN_GATE,
     LLM_TENSOR_ATTN_NORM,
     LLM_TENSOR_ATTN_NORM_2,
     LLM_TENSOR_ATTN_OUT_NORM,
