
Commit bf6f2a2

Noeda authored and qnixsynapse committed
model : add dots.llm1 architecture support (ggml-org#14044) (ggml-org#14118)
Adds:
* Dots1Model to convert_hf_to_gguf.py
* Computation graph code to llama-model.cpp
* Chat template to llama-chat.cpp to detect this model's template.

---

The model architecture is called "dots.llm1" (I decided to shorten it to dots1 or DOTS1 in the code generally). The only models that exist as of the writing of this commit that follow this architecture are "dots.llm1.inst" and "dots.llm1.base" from here:

* https://huggingface.co/rednote-hilab/dots.llm1.inst
* https://huggingface.co/rednote-hilab/dots.llm1.base

The model architecture is a combination of Qwen and Deepseek parts, as seen here:

https://github.com/huggingface/transformers/blob/ffe12627b4e84489d2ab91dd0ec00614855edc79/src/transformers/models/dots1/modular_dots1.py
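For reference, a minimal sketch of driving the conversion this commit enables follows; it assumes a llama.cpp checkout containing this commit and a locally downloaded copy of one of the checkpoints above. The snapshot directory name, output filename, and f16 output type are illustrative choices, not part of the commit.

# Sketch: convert a local dots.llm1 snapshot to GGUF by invoking the script
# this commit extends. "dots.llm1.inst" and the outfile name are placeholders.
import subprocess

subprocess.run(
    [
        "python", "convert_hf_to_gguf.py",
        "dots.llm1.inst",
        "--outfile", "dots1-inst-f16.gguf",
        "--outtype", "f16",
    ],
    check=True,
)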
1 parent 335d1fd commit bf6f2a2

5 files changed: +251 −300 lines changed


convert_hf_to_gguf.py

Lines changed: 28 additions & 0 deletions
@@ -5262,6 +5262,34 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("Dots1ForCausalLM")
+class Dots1Model(Qwen2MoeModel):
+    model_arch = gguf.MODEL_ARCH.DOTS1
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.hparams["num_experts"] = self.hparams["n_routed_experts"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
+        self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
+        self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
+        self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
+
+        if self.hparams["scoring_func"] == "noaux_tc":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        else:
+            raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+        if "shared_experts" in name:
+            return [(self.map_tensor_name(name), data_torch)]
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("PLMForCausalLM")
 class PLMModel(TextModel):
     model_arch = gguf.MODEL_ARCH.PLM
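To make the new set_gguf_parameters hook above more concrete, the standalone sketch below writes the same expert-related metadata through the gguf-py writer API that Dots1Model calls. The hparams values are illustrative placeholders rather than the actual config.json of the released checkpoints, and "dots1-sketch.gguf" is a hypothetical output path.

# Illustration of the MoE metadata emitted by Dots1Model.set_gguf_parameters.
import gguf

hparams = {  # placeholder values, not the actual dots.llm1 config
    "n_routed_experts": 128,
    "first_k_dense_replace": 1,
    "n_shared_experts": 2,
    "routed_scaling_factor": 2.5,
    "norm_topk_prob": True,
    "scoring_func": "noaux_tc",
}

writer = gguf.GGUFWriter("dots1-sketch.gguf", gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.DOTS1])
# In the real converter the total expert count is written by the Qwen2MoeModel base class,
# using the num_experts value remapped from n_routed_experts in __init__ above.
writer.add_expert_count(hparams["n_routed_experts"])
writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
writer.add_expert_shared_count(hparams["n_shared_experts"])
writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
writer.add_expert_weights_norm(hparams["norm_topk_prob"])
if hparams["scoring_func"] == "noaux_tc":
    # The converter maps the "noaux_tc" scoring function to sigmoid gating (see diff above).
    writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
else:
    raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")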

gguf-py/gguf/constants.py

Lines changed: 1 addition & 124 deletions
@@ -118,10 +118,6 @@ class LLM:
         EMBEDDING_SCALE = "{arch}.embedding_scale"
         TOKEN_SHIFT_COUNT = "{arch}.token_shift_count"
         INTERLEAVE_MOE_LAYER_STEP = "{arch}.interleave_moe_layer_step"
-        ACTIVATION_SPARSITY_SCALE = "{arch}.activation_sparsity_scale"
-        ALTUP_ACTIVE_IDX = "{arch}.altup.active_idx"
-        ALTUP_NUM_INPUTS = "{arch}.altup.num_inputs"
-        EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input"
 
     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"
@@ -146,8 +142,6 @@ class Attention:
         SCALE = "{arch}.attention.scale"
         KEY_LENGTH_MLA = "{arch}.attention.key_length_mla"
         VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla"
-        SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers"
-        SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
 
     class Rope:
         DIMENSION_COUNT = "{arch}.rope.dimension_count"
@@ -205,7 +199,6 @@ class Tokenizer:
         MASK_ID = "tokenizer.ggml.mask_token_id"
         ADD_BOS = "tokenizer.ggml.add_bos_token"
         ADD_EOS = "tokenizer.ggml.add_eos_token"
-        ADD_SEP = "tokenizer.ggml.add_sep_token"
         ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
         REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces"
         PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
@@ -299,7 +292,6 @@ class MODEL_ARCH(IntEnum):
     BERT = auto()
     NOMIC_BERT = auto()
     NOMIC_BERT_MOE = auto()
-    NEO_BERT = auto()
     JINA_BERT_V2 = auto()
     BLOOM = auto()
     STABLELM = auto()
@@ -321,7 +313,6 @@ class MODEL_ARCH(IntEnum):
     GEMMA = auto()
     GEMMA2 = auto()
     GEMMA3 = auto()
-    GEMMA3N = auto()
     STARCODER2 = auto()
     RWKV6 = auto()
     RWKV6QWEN2 = auto()
@@ -355,8 +346,6 @@ class MODEL_ARCH(IntEnum):
     PLM = auto()
     BAILINGMOE = auto()
     DOTS1 = auto()
-    ARCEE = auto()
-    ERNIE4_5 = auto()
 
 
 class VISION_PROJECTOR_TYPE(IntEnum):
@@ -409,22 +398,6 @@ class MODEL_TENSOR(IntEnum):
     ATTN_Q_NORM = auto()
     ATTN_K_NORM = auto()
     LAYER_OUT_NORM = auto()
-    PER_LAYER_TOKEN_EMBD = auto()  # gemma3n
-    PER_LAYER_MODEL_PROJ = auto()  # gemma3n
-    PER_LAYER_INP_GATE = auto()  # gemma3n
-    PER_LAYER_PROJ = auto()  # gemma3n
-    PER_LAYER_PROJ_NORM = auto()  # gemma3n
-    PER_LAYER_POST_NORM = auto()  # gemma3n
-    ALTUP_PROJ = auto()  # gemma3n
-    ALTUP_UNEMBD_PROJ = auto()  # gemma3n
-    ALTUP_CORRECT_COEF = auto()  # gemma3n
-    ALTUP_CORRECT_SCALE = auto()  # gemma3n
-    ALTUP_PREDICT_COEF = auto()  # gemma3n
-    ALTUP_ROUTER = auto()  # gemma3n
-    ALTUP_ROUTER_NORM = auto()  # gemma3n
-    LAUREL_L = auto()  # gemma3n
-    LAUREL_R = auto()  # gemma3n
-    LAUREL_POST_NORM = auto()  # gemma3n
     SSM_IN = auto()
     SSM_CONV1D = auto()
     SSM_X = auto()
@@ -602,7 +575,6 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.BERT: "bert",
     MODEL_ARCH.NOMIC_BERT: "nomic-bert",
     MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe",
-    MODEL_ARCH.NEO_BERT: "neo-bert",
     MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
     MODEL_ARCH.BLOOM: "bloom",
     MODEL_ARCH.STABLELM: "stablelm",
@@ -624,7 +596,6 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.GEMMA: "gemma",
     MODEL_ARCH.GEMMA2: "gemma2",
     MODEL_ARCH.GEMMA3: "gemma3",
-    MODEL_ARCH.GEMMA3N: "gemma3n",
     MODEL_ARCH.STARCODER2: "starcoder2",
     MODEL_ARCH.RWKV6: "rwkv6",
     MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2",
@@ -657,9 +628,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
     MODEL_ARCH.PLM: "plm",
     MODEL_ARCH.BAILINGMOE: "bailingmoe",
-    MODEL_ARCH.DOTS1: "dots1",
-    MODEL_ARCH.ARCEE: "arcee",
-    MODEL_ARCH.ERNIE4_5: "ernie4_5",
+    MODEL_ARCH.DOTS1: "dots1"
 }
 
 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -712,22 +681,6 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
     MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b",
     MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
-    MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd",  # gemma3n
-    MODEL_TENSOR.PER_LAYER_MODEL_PROJ: "per_layer_model_proj",  # gemma3n
-    MODEL_TENSOR.PER_LAYER_PROJ_NORM: "per_layer_proj_norm",  # gemma3n
-    MODEL_TENSOR.ALTUP_UNEMBD_PROJ: "altup_unembd_proj",  # gemma3n
-    MODEL_TENSOR.ALTUP_PROJ: "altup_proj",  # gemma3n
-    MODEL_TENSOR.PER_LAYER_INP_GATE: "blk.{bid}.inp_gate",  # gemma3n
-    MODEL_TENSOR.PER_LAYER_PROJ: "blk.{bid}.proj",  # gemma3n
-    MODEL_TENSOR.PER_LAYER_POST_NORM: "blk.{bid}.post_norm",  # gemma3n
-    MODEL_TENSOR.ALTUP_CORRECT_COEF: "blk.{bid}.altup_correct_coef",  # gemma3n
-    MODEL_TENSOR.ALTUP_CORRECT_SCALE: "blk.{bid}.altup_correct_scale",  # gemma3n
-    MODEL_TENSOR.ALTUP_PREDICT_COEF: "blk.{bid}.altup_predict_coef",  # gemma3n
-    MODEL_TENSOR.ALTUP_ROUTER: "blk.{bid}.altup_router",  # gemma3n
-    MODEL_TENSOR.ALTUP_ROUTER_NORM: "blk.{bid}.altup_router_norm",  # gemma3n
-    MODEL_TENSOR.LAUREL_L: "blk.{bid}.laurel_l",  # gemma3n
-    MODEL_TENSOR.LAUREL_R: "blk.{bid}.laurel_r",  # gemma3n
-    MODEL_TENSOR.LAUREL_POST_NORM: "blk.{bid}.laurel_post_norm",  # gemma3n
     MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
     MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
     MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
@@ -1131,18 +1084,6 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_UP_EXP,
         MODEL_TENSOR.LAYER_OUT_NORM,
     ],
-    MODEL_ARCH.NEO_BERT: [
-        MODEL_TENSOR.TOKEN_EMBD,
-        MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.ATTN_QKV,
-        MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.FFN_NORM,
-        MODEL_TENSOR.FFN_DOWN,
-        MODEL_TENSOR.FFN_UP,
-        MODEL_TENSOR.ENC_OUTPUT_NORM,
-        MODEL_TENSOR.CLS,
-        MODEL_TENSOR.CLS_OUT,
-    ],
     MODEL_ARCH.JINA_BERT_V2: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.TOKEN_EMBD_NORM,
@@ -1533,41 +1474,6 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_PRE_NORM,
         MODEL_TENSOR.FFN_POST_NORM,
     ],
-    MODEL_ARCH.GEMMA3N: [
-        MODEL_TENSOR.TOKEN_EMBD,
-        MODEL_TENSOR.OUTPUT,
-        MODEL_TENSOR.OUTPUT_NORM,
-        MODEL_TENSOR.ATTN_Q,
-        MODEL_TENSOR.ATTN_Q_NORM,
-        MODEL_TENSOR.ATTN_K,
-        MODEL_TENSOR.ATTN_K_NORM,
-        MODEL_TENSOR.ATTN_V,
-        MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.FFN_GATE,
-        MODEL_TENSOR.FFN_DOWN,
-        MODEL_TENSOR.FFN_UP,
-        MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.ATTN_POST_NORM,
-        MODEL_TENSOR.FFN_PRE_NORM,
-        MODEL_TENSOR.FFN_POST_NORM,
-        # altup / laurel
-        MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
-        MODEL_TENSOR.PER_LAYER_MODEL_PROJ,
-        MODEL_TENSOR.PER_LAYER_INP_GATE,
-        MODEL_TENSOR.PER_LAYER_PROJ,
-        MODEL_TENSOR.PER_LAYER_PROJ_NORM,
-        MODEL_TENSOR.PER_LAYER_POST_NORM,
-        MODEL_TENSOR.ALTUP_PROJ,
-        MODEL_TENSOR.ALTUP_UNEMBD_PROJ,
-        MODEL_TENSOR.ALTUP_CORRECT_COEF,
-        MODEL_TENSOR.ALTUP_CORRECT_SCALE,
-        MODEL_TENSOR.ALTUP_PREDICT_COEF,
-        MODEL_TENSOR.ALTUP_ROUTER,
-        MODEL_TENSOR.ALTUP_ROUTER_NORM,
-        MODEL_TENSOR.LAUREL_L,
-        MODEL_TENSOR.LAUREL_R,
-        MODEL_TENSOR.LAUREL_POST_NORM,
-    ],
     MODEL_ARCH.STARCODER2: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -2182,35 +2088,6 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_UP_EXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
     ],
-    MODEL_ARCH.ARCEE: [
-        MODEL_TENSOR.TOKEN_EMBD,
-        MODEL_TENSOR.OUTPUT_NORM,
-        MODEL_TENSOR.OUTPUT,
-        MODEL_TENSOR.ROPE_FREQS,
-        MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.ATTN_Q,
-        MODEL_TENSOR.ATTN_K,
-        MODEL_TENSOR.ATTN_V,
-        MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.ATTN_ROT_EMBD,
-        MODEL_TENSOR.FFN_NORM,
-        MODEL_TENSOR.FFN_DOWN,
-        MODEL_TENSOR.FFN_UP,
-    ],
-    MODEL_ARCH.ERNIE4_5: [
-        MODEL_TENSOR.TOKEN_EMBD,
-        MODEL_TENSOR.OUTPUT_NORM,
-        MODEL_TENSOR.OUTPUT,
-        MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.ATTN_Q,
-        MODEL_TENSOR.ATTN_K,
-        MODEL_TENSOR.ATTN_V,
-        MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.FFN_NORM,
-        MODEL_TENSOR.FFN_GATE,
-        MODEL_TENSOR.FFN_DOWN,
-        MODEL_TENSOR.FFN_UP,
-    ],
     # TODO
 }
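As a quick sanity check of the mapping touched above, the following sketch (assuming the gguf-py package from this tree is importable) looks up the registered architecture entry; expected output is shown in the trailing comments.

# Look up the dots1 entry in MODEL_ARCH / MODEL_ARCH_NAMES as edited above.
import gguf

print(gguf.MODEL_ARCH.DOTS1.name)                    # DOTS1
print(gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.DOTS1])  # dots1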

0 commit comments