
Commit 49209a7

Noeda authored and qnixsynapse committed
model : add dots.llm1 architecture support (ggml-org#14044) (ggml-org#14118)
Adds:

* Dots1Model to convert_hf_to_gguf.py
* Computation graph code to llama-model.cpp
* Chat template detection for this model's template to llama-chat.cpp

---

The model architecture is called "dots.llm1" (I decided to shorten it to dots1 or DOTS1 in the code). As of this commit, the only released models that follow this architecture are "dots.llm1.inst" and "dots.llm1.base":

* https://huggingface.co/rednote-hilab/dots.llm1.inst
* https://huggingface.co/rednote-hilab/dots.llm1.base

The model architecture is a combination of Qwen and DeepSeek parts, as seen here: https://github.com/huggingface/transformers/blob/ffe12627b4e84489d2ab91dd0ec00614855edc79/src/transformers/models/dots1/modular_dots1.py
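With Dots1Model registered, converting one of those checkpoints should go through the stock converter. A minimal usage sketch, assuming a local snapshot of the HF repo and the converter's usual --outfile/--outtype options; the paths and filenames below are placeholders, not values from this commit:

```python
# Hypothetical conversion run after this commit; checkpoint directory and
# output filename are illustrative placeholders.
import subprocess

subprocess.run(
    [
        "python", "convert_hf_to_gguf.py",
        "/path/to/dots.llm1.inst",           # local snapshot of rednote-hilab/dots.llm1.inst
        "--outfile", "dots1-inst-bf16.gguf",
        "--outtype", "bf16",
    ],
    check=True,
)
```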
1 parent 3c07909 · commit 49209a7

File tree: 5 files changed (+251, -280 lines)

convert_hf_to_gguf.py

Lines changed: 28 additions & 0 deletions
@@ -5262,6 +5262,34 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("Dots1ForCausalLM")
+class Dots1Model(Qwen2MoeModel):
+    model_arch = gguf.MODEL_ARCH.DOTS1
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.hparams["num_experts"] = self.hparams["n_routed_experts"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
+        self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
+        self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
+        self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
+
+        if self.hparams["scoring_func"] == "noaux_tc":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        else:
+            raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+        if "shared_experts" in name:
+            return [(self.map_tensor_name(name), data_torch)]
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("PLMForCausalLM")
 class PLMModel(TextModel):
     model_arch = gguf.MODEL_ARCH.PLM
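For context, here is a hedged sketch (not code from this commit) of what the "noaux_tc" scoring function recorded above implies at inference time, following the DeepSeek-style routing that dots.llm1 borrows: experts are selected from sigmoid scores plus the per-expert e_score_correction.bias, while the routing weights come from the uncorrected scores, optionally normalized (norm_topk_prob) and scaled by routed_scaling_factor. Shapes and the scale value below are illustrative assumptions:

```python
import torch

def noaux_tc_routing(logits: torch.Tensor, bias: torch.Tensor,
                     top_k: int, scale: float, norm_topk_prob: bool):
    """Illustrative sigmoid top-k routing with selection-only bias correction."""
    scores = torch.sigmoid(logits)                     # [n_tokens, n_experts]
    _, idx = torch.topk(scores + bias, top_k, dim=-1)  # bias influences selection only
    weights = torch.gather(scores, -1, idx)            # weights come from raw scores
    if norm_topk_prob:
        weights = weights / weights.sum(dim=-1, keepdim=True)
    return idx, weights * scale

# Toy example: 2 tokens, 8 experts, 2 experts per token (values are arbitrary).
idx, w = noaux_tc_routing(torch.randn(2, 8), torch.zeros(8),
                          top_k=2, scale=2.5, norm_topk_prob=True)
```

This is presumably why the converter maps the gating function to ExpertGatingFuncType.SIGMOID and renames e_score_correction_bias, so the bias can be loaded as a separate tensor on the llama.cpp side.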

gguf-py/gguf/constants.py

Lines changed: 1 addition & 124 deletions
@@ -118,10 +118,6 @@ class LLM:
         EMBEDDING_SCALE = "{arch}.embedding_scale"
         TOKEN_SHIFT_COUNT = "{arch}.token_shift_count"
         INTERLEAVE_MOE_LAYER_STEP = "{arch}.interleave_moe_layer_step"
-        ACTIVATION_SPARSITY_SCALE = "{arch}.activation_sparsity_scale"
-        ALTUP_ACTIVE_IDX = "{arch}.altup.active_idx"
-        ALTUP_NUM_INPUTS = "{arch}.altup.num_inputs"
-        EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input"
 
     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"
@@ -146,8 +142,6 @@ class Attention:
         SCALE = "{arch}.attention.scale"
         KEY_LENGTH_MLA = "{arch}.attention.key_length_mla"
         VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla"
-        SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers"
-        SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
 
     class Rope:
         DIMENSION_COUNT = "{arch}.rope.dimension_count"
@@ -204,7 +198,6 @@ class Tokenizer:
         MASK_ID = "tokenizer.ggml.mask_token_id"
         ADD_BOS = "tokenizer.ggml.add_bos_token"
         ADD_EOS = "tokenizer.ggml.add_eos_token"
-        ADD_SEP = "tokenizer.ggml.add_sep_token"
         ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
         REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces"
         PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
@@ -298,7 +291,6 @@ class MODEL_ARCH(IntEnum):
     BERT = auto()
     NOMIC_BERT = auto()
     NOMIC_BERT_MOE = auto()
-    NEO_BERT = auto()
     JINA_BERT_V2 = auto()
     BLOOM = auto()
     STABLELM = auto()
@@ -320,7 +312,6 @@ class MODEL_ARCH(IntEnum):
     GEMMA = auto()
     GEMMA2 = auto()
     GEMMA3 = auto()
-    GEMMA3N = auto()
     STARCODER2 = auto()
     RWKV6 = auto()
     RWKV6QWEN2 = auto()
@@ -353,8 +344,6 @@ class MODEL_ARCH(IntEnum):
     PLM = auto()
     BAILINGMOE = auto()
    DOTS1 = auto()
-    ARCEE = auto()
-    ERNIE4_5 = auto()
 
 
 class VISION_PROJECTOR_TYPE(IntEnum):
@@ -407,22 +396,6 @@ class MODEL_TENSOR(IntEnum):
     ATTN_Q_NORM = auto()
     ATTN_K_NORM = auto()
     LAYER_OUT_NORM = auto()
-    PER_LAYER_TOKEN_EMBD = auto()  # gemma3n
-    PER_LAYER_MODEL_PROJ = auto()  # gemma3n
-    PER_LAYER_INP_GATE = auto()  # gemma3n
-    PER_LAYER_PROJ = auto()  # gemma3n
-    PER_LAYER_PROJ_NORM = auto()  # gemma3n
-    PER_LAYER_POST_NORM = auto()  # gemma3n
-    ALTUP_PROJ = auto()  # gemma3n
-    ALTUP_UNEMBD_PROJ = auto()  # gemma3n
-    ALTUP_CORRECT_COEF = auto()  # gemma3n
-    ALTUP_CORRECT_SCALE = auto()  # gemma3n
-    ALTUP_PREDICT_COEF = auto()  # gemma3n
-    ALTUP_ROUTER = auto()  # gemma3n
-    ALTUP_ROUTER_NORM = auto()  # gemma3n
-    LAUREL_L = auto()  # gemma3n
-    LAUREL_R = auto()  # gemma3n
-    LAUREL_POST_NORM = auto()  # gemma3n
     SSM_IN = auto()
     SSM_CONV1D = auto()
     SSM_X = auto()
@@ -599,7 +572,6 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.BERT: "bert",
     MODEL_ARCH.NOMIC_BERT: "nomic-bert",
     MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe",
-    MODEL_ARCH.NEO_BERT: "neo-bert",
     MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
     MODEL_ARCH.BLOOM: "bloom",
     MODEL_ARCH.STABLELM: "stablelm",
@@ -621,7 +593,6 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.GEMMA: "gemma",
     MODEL_ARCH.GEMMA2: "gemma2",
     MODEL_ARCH.GEMMA3: "gemma3",
-    MODEL_ARCH.GEMMA3N: "gemma3n",
     MODEL_ARCH.STARCODER2: "starcoder2",
     MODEL_ARCH.RWKV6: "rwkv6",
     MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2",
@@ -653,9 +624,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
     MODEL_ARCH.PLM: "plm",
     MODEL_ARCH.BAILINGMOE: "bailingmoe",
-    MODEL_ARCH.DOTS1: "dots1",
-    MODEL_ARCH.ARCEE: "arcee",
-    MODEL_ARCH.ERNIE4_5: "ernie4_5",
+    MODEL_ARCH.DOTS1: "dots1"
 }
 
 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -708,22 +677,6 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
     MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b",
     MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
-    MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd",  # gemma3n
-    MODEL_TENSOR.PER_LAYER_MODEL_PROJ: "per_layer_model_proj",  # gemma3n
-    MODEL_TENSOR.PER_LAYER_PROJ_NORM: "per_layer_proj_norm",  # gemma3n
-    MODEL_TENSOR.ALTUP_UNEMBD_PROJ: "altup_unembd_proj",  # gemma3n
-    MODEL_TENSOR.ALTUP_PROJ: "altup_proj",  # gemma3n
-    MODEL_TENSOR.PER_LAYER_INP_GATE: "blk.{bid}.inp_gate",  # gemma3n
-    MODEL_TENSOR.PER_LAYER_PROJ: "blk.{bid}.proj",  # gemma3n
-    MODEL_TENSOR.PER_LAYER_POST_NORM: "blk.{bid}.post_norm",  # gemma3n
-    MODEL_TENSOR.ALTUP_CORRECT_COEF: "blk.{bid}.altup_correct_coef",  # gemma3n
-    MODEL_TENSOR.ALTUP_CORRECT_SCALE: "blk.{bid}.altup_correct_scale",  # gemma3n
-    MODEL_TENSOR.ALTUP_PREDICT_COEF: "blk.{bid}.altup_predict_coef",  # gemma3n
-    MODEL_TENSOR.ALTUP_ROUTER: "blk.{bid}.altup_router",  # gemma3n
-    MODEL_TENSOR.ALTUP_ROUTER_NORM: "blk.{bid}.altup_router_norm",  # gemma3n
-    MODEL_TENSOR.LAUREL_L: "blk.{bid}.laurel_l",  # gemma3n
-    MODEL_TENSOR.LAUREL_R: "blk.{bid}.laurel_r",  # gemma3n
-    MODEL_TENSOR.LAUREL_POST_NORM: "blk.{bid}.laurel_post_norm",  # gemma3n
     MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
     MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
     MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
@@ -1126,18 +1079,6 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_UP_EXP,
         MODEL_TENSOR.LAYER_OUT_NORM,
     ],
-    MODEL_ARCH.NEO_BERT: [
-        MODEL_TENSOR.TOKEN_EMBD,
-        MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.ATTN_QKV,
-        MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.FFN_NORM,
-        MODEL_TENSOR.FFN_DOWN,
-        MODEL_TENSOR.FFN_UP,
-        MODEL_TENSOR.ENC_OUTPUT_NORM,
-        MODEL_TENSOR.CLS,
-        MODEL_TENSOR.CLS_OUT,
-    ],
     MODEL_ARCH.JINA_BERT_V2: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.TOKEN_EMBD_NORM,
@@ -1528,41 +1469,6 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_PRE_NORM,
         MODEL_TENSOR.FFN_POST_NORM,
     ],
-    MODEL_ARCH.GEMMA3N: [
-        MODEL_TENSOR.TOKEN_EMBD,
-        MODEL_TENSOR.OUTPUT,
-        MODEL_TENSOR.OUTPUT_NORM,
-        MODEL_TENSOR.ATTN_Q,
-        MODEL_TENSOR.ATTN_Q_NORM,
-        MODEL_TENSOR.ATTN_K,
-        MODEL_TENSOR.ATTN_K_NORM,
-        MODEL_TENSOR.ATTN_V,
-        MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.FFN_GATE,
-        MODEL_TENSOR.FFN_DOWN,
-        MODEL_TENSOR.FFN_UP,
-        MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.ATTN_POST_NORM,
-        MODEL_TENSOR.FFN_PRE_NORM,
-        MODEL_TENSOR.FFN_POST_NORM,
-        # altup / laurel
-        MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
-        MODEL_TENSOR.PER_LAYER_MODEL_PROJ,
-        MODEL_TENSOR.PER_LAYER_INP_GATE,
-        MODEL_TENSOR.PER_LAYER_PROJ,
-        MODEL_TENSOR.PER_LAYER_PROJ_NORM,
-        MODEL_TENSOR.PER_LAYER_POST_NORM,
-        MODEL_TENSOR.ALTUP_PROJ,
-        MODEL_TENSOR.ALTUP_UNEMBD_PROJ,
-        MODEL_TENSOR.ALTUP_CORRECT_COEF,
-        MODEL_TENSOR.ALTUP_CORRECT_SCALE,
-        MODEL_TENSOR.ALTUP_PREDICT_COEF,
-        MODEL_TENSOR.ALTUP_ROUTER,
-        MODEL_TENSOR.ALTUP_ROUTER_NORM,
-        MODEL_TENSOR.LAUREL_L,
-        MODEL_TENSOR.LAUREL_R,
-        MODEL_TENSOR.LAUREL_POST_NORM,
-    ],
     MODEL_ARCH.STARCODER2: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -2164,35 +2070,6 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_UP_EXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
     ],
-    MODEL_ARCH.ARCEE: [
-        MODEL_TENSOR.TOKEN_EMBD,
-        MODEL_TENSOR.OUTPUT_NORM,
-        MODEL_TENSOR.OUTPUT,
-        MODEL_TENSOR.ROPE_FREQS,
-        MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.ATTN_Q,
-        MODEL_TENSOR.ATTN_K,
-        MODEL_TENSOR.ATTN_V,
-        MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.ATTN_ROT_EMBD,
-        MODEL_TENSOR.FFN_NORM,
-        MODEL_TENSOR.FFN_DOWN,
-        MODEL_TENSOR.FFN_UP,
-    ],
-    MODEL_ARCH.ERNIE4_5: [
-        MODEL_TENSOR.TOKEN_EMBD,
-        MODEL_TENSOR.OUTPUT_NORM,
-        MODEL_TENSOR.OUTPUT,
-        MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.ATTN_Q,
-        MODEL_TENSOR.ATTN_K,
-        MODEL_TENSOR.ATTN_V,
-        MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.FFN_NORM,
-        MODEL_TENSOR.FFN_GATE,
-        MODEL_TENSOR.FFN_DOWN,
-        MODEL_TENSOR.FFN_UP,
-    ],
     # TODO
 }
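As a quick sanity check of the constants.py change, here is a small sketch, assuming the gguf-py package from this repo is importable, showing what the new MODEL_ARCH_NAMES entry resolves to:

```python
# Minimal sketch: the DOTS1 enum value now maps to the "dots1" architecture
# string, which a converted model stores under the general.architecture key.
import gguf

print(gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.DOTS1])  # -> "dots1"
print(gguf.Keys.General.ARCHITECTURE)                # -> "general.architecture"
```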