
Commit 045b1ac

llama: attempt to add modern-bert

1 parent 663445b · 16 files changed: +590 −3 lines

convert_hf_to_gguf.py (25 additions, 0 deletions)
@@ -809,6 +809,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
             # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
             res = "minerva-7b"
+        if chkhsh == "a0b64b4385f123663873756336c085744376d015ff328bb1d901598f63c44152":
+            # ref: https://huggingface.co/answerdotai/ModernBERT-base
+            res = "modern-bert"

         if res is None:
             logger.warning("\n")
@@ -3932,6 +3935,28 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:

         return super().modify_tensors(data_torch, name, bid)

+@ModelBase.register("ModernBert", "ModernBertForMaskedLM", "ModernBertForSequenceClassification")
+class ModernBertModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.MODERN_BERT
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_sliding_window(self.hparams["local_attention"])
+        self.gguf_writer.add_rope_freq_base(self.hparams["global_rope_theta"])
+        self.gguf_writer.add_rope_freq_base_swa(self.hparams["local_rope_theta"])
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+
+        super().set_gguf_parameters()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # these layers act as the MLM head, so we don't need them
+        if name.startswith("decoder."):
+            return []
+
+        if name.startswith("model."):
+            name = name[6:]
+
+        return super().modify_tensors(data_torch, name, bid)

 @ModelBase.register("RobertaModel", "RobertaForSequenceClassification")
 class RobertaModel(BertModel):
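
The tensor-name handling above is the only ModernBERT-specific conversion logic; everything else is inherited from BertModel. A minimal sketch of what modify_tensors does to incoming HF tensor names (the sample names are illustrative, taken from the tensor mappings added later in this commit):

    # Sketch of ModernBertModel.modify_tensors' name handling (illustrative only).
    def rename(name: str) -> str | None:
        if name.startswith("decoder."):  # MLM head tensors are dropped
            return None
        if name.startswith("model."):    # strip the 6-character "model." prefix
            name = name[6:]
        return name

    for n in ("model.layers.0.attn.Wqkv.weight",
              "model.embeddings.tok_embeddings.weight",
              "decoder.weight"):
        print(n, "->", rename(n))
    # model.layers.0.attn.Wqkv.weight -> layers.0.attn.Wqkv.weight
    # model.embeddings.tok_embeddings.weight -> embeddings.tok_embeddings.weight
    # decoder.weight -> None (dropped)

With this in place, conversion should follow the script's usual route, e.g. `python convert_hf_to_gguf.py <path-to-ModernBERT-base> --outfile modernbert-base-f16.gguf --outtype f16` (flags as in the existing CLI).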

convert_hf_to_gguf_update.py (1 addition, 0 deletions)

@@ -128,6 +128,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
     {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
     {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
+    {"name": "modern-bert", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/answerdotai/ModernBERT-base", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
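
For context, the chkhsh registered in convert_hf_to_gguf.py above is derived by tokenizing a fixed probe string and hashing the resulting token IDs. Roughly (a sketch; the exact probe text lives in the scripts and is elided here):

    from hashlib import sha256
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
    chktxt = ...  # the fixed probe string used by the update script (elided)
    chkhsh = sha256(str(tokenizer.encode(chktxt)).encode()).hexdigest()
    # should reproduce the "a0b64b4385f12366..." hash matched in get_vocab_base_pre()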

gguf-py/gguf/constants.py (19 additions, 0 deletions)

@@ -147,6 +147,7 @@ class Rope:
         DIMENSION_COUNT     = "{arch}.rope.dimension_count"
         DIMENSION_SECTIONS  = "{arch}.rope.dimension_sections"
         FREQ_BASE           = "{arch}.rope.freq_base"
+        FREQ_BASE_SWA       = "{arch}.rope.freq_base_swa"
         SCALING_TYPE        = "{arch}.rope.scaling.type"
         SCALING_FACTOR      = "{arch}.rope.scaling.factor"
         SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor"

@@ -289,6 +290,7 @@ class MODEL_ARCH(IntEnum):
     STARCODER      = auto()
     REFACT         = auto()
     BERT           = auto()
+    MODERN_BERT    = auto()
     NOMIC_BERT     = auto()
     NOMIC_BERT_MOE = auto()
     JINA_BERT_V2   = auto()

@@ -477,6 +479,7 @@ class MODEL_TENSOR(IntEnum):
     ENC_FFN_UP      = auto()
     ENC_OUTPUT_NORM = auto()
     CLS             = auto()  # classifier
+    CLS_NORM        = auto()  # classifier normalization
     CLS_OUT         = auto()  # classifier output projection
     CONV1D          = auto()
     CONVNEXT_DW     = auto()

@@ -569,6 +572,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.STARCODER:      "starcoder",
     MODEL_ARCH.REFACT:         "refact",
     MODEL_ARCH.BERT:           "bert",
+    MODEL_ARCH.MODERN_BERT:    "modern-bert",
     MODEL_ARCH.NOMIC_BERT:     "nomic-bert",
     MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe",
     MODEL_ARCH.JINA_BERT_V2:   "jina-bert-v2",

@@ -757,6 +761,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.ENC_FFN_UP:      "enc.blk.{bid}.ffn_up",
     MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
     MODEL_TENSOR.CLS:             "cls",
+    MODEL_TENSOR.CLS_NORM:        "cls.norm",
     MODEL_TENSOR.CLS_OUT:         "cls.output",
     MODEL_TENSOR.CONV1D:          "conv1d",
     MODEL_TENSOR.CONVNEXT_DW:     "convnext.{bid}.dw",

@@ -1047,6 +1052,20 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.CLS,
         MODEL_TENSOR.CLS_OUT,
     ],
+    MODEL_ARCH.MODERN_BERT: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_OUT_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ENC_OUTPUT_NORM,
+        MODEL_TENSOR.CLS,
+        MODEL_TENSOR.CLS_NORM,
+        MODEL_TENSOR.CLS_OUT,
+    ],
     MODEL_ARCH.NOMIC_BERT: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.TOKEN_EMBD_NORM,
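
The new key follows the existing per-architecture format-string convention; a quick illustration of how it resolves for the new arch:

    from gguf.constants import Keys
    print(Keys.Rope.FREQ_BASE_SWA.format(arch="modern-bert"))
    # -> modern-bert.rope.freq_base_swa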

gguf-py/gguf/gguf_writer.py (3 additions, 0 deletions)

@@ -812,6 +812,9 @@ def add_rope_dimension_sections(self, dims: Sequence[int]) -> None:

     def add_rope_freq_base(self, value: float) -> None:
         self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value)
+
+    def add_rope_freq_base_swa(self, value: float) -> None:
+        self.add_float32(Keys.Rope.FREQ_BASE_SWA.format(arch=self.arch), value)

     def add_rope_scaling_type(self, value: RopeScalingType) -> None:
         self.add_string(Keys.Rope.SCALING_TYPE.format(arch=self.arch), value.value)
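
A hedged usage sketch of the new writer method, mirroring what ModernBertModel.set_gguf_parameters() does during conversion (the theta values are quoted from ModernBERT-base's config.json for illustration; treat them as assumptions):

    from gguf import GGUFWriter

    writer = GGUFWriter("modernbert.gguf", arch="modern-bert")
    writer.add_rope_freq_base(160000.0)     # global_rope_theta (assumed value)
    writer.add_rope_freq_base_swa(10000.0)  # local_rope_theta, for SWA layers (assumed value)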

gguf-py/gguf/tensor_mapping.py (14 additions, 0 deletions)

@@ -16,6 +16,7 @@ class TensorNameMap:
             "model.embed_tokens",                        # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414
             "tok_embeddings",                            # llama-pth
             "embeddings.word_embeddings",                # bert nomic-bert
+            "embeddings.tok_embeddings",                 # modern-bert
             "language_model.embedding.word_embeddings",  # persimmon
             "wte",                                       # gpt2
             "transformer.embd.wte",                      # phi2

@@ -42,6 +43,7 @@ class TensorNameMap:
         MODEL_TENSOR.TOKEN_EMBD_NORM: (
             "word_embeddings_layernorm",  # bloom
             "embeddings.LayerNorm",       # bert
+            "embeddings.norm",            # modern-bert
             "emb_ln",                     # nomic-bert
             "transformer.norm",           # openelm
             "rwkv.blocks.0.pre_ln",       # rwkv

@@ -134,6 +136,7 @@ class TensorNameMap:
             "rwkv.blocks.{bid}.ln1",               # rwkv6
             "model.layers.{bid}.ln1",              # rwkv7
             "model.layers.{bid}.input_layernorm",  # llama4
+            "layers.{bid}.attn_norm",              # modern-bert
         ),

         # Attention norm 2

@@ -161,6 +164,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.qkv_proj",                # phi3
             "encoder.layers.{bid}.self_attention.query_key_value",  # chatglm
             "transformer.layers.{bid}.attn.qkv_proj",               # openelm
+            "layers.{bid}.attn.Wqkv",                               # modern-bert
         ),

         # Attention query

@@ -236,6 +240,7 @@ class TensorNameMap:
             "transformer.layers.{bid}.attn.out_proj",       # openelm
             "transformer.h.{bid}.attn.attention.out_proj",  # exaone
             "model.layers.{bid}.self_attn.o_proj",          # llama4
+            "layers.{bid}.attn.Wo",                         # modern-bert
         ),

         # Attention output norm

@@ -245,6 +250,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.norm1",                      # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_1",      # Grok
             "transformer.blocks.{bid}.norm_attn_norm.norm_2",  # dbrx
+            "layers.{bid}.mlp_norm",                           # modern-bert
         ),

         MODEL_TENSOR.ATTN_POST_NORM: (

@@ -338,6 +344,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.mlp.dense_h_to_4h",   # chatglm
             "transformer.h.{bid}.mlp.c_fc_1",           # exaone
             "model.layers.{bid}.feed_forward.up_proj",  # llama4
+            "layers.{bid}.mlp.Wi",                      # modern-bert
         ),

         MODEL_TENSOR.FFN_UP_EXP: (

@@ -420,6 +427,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.mlp.dense_4h_to_h",     # chatglm
             "model.layers.h.{bid}.mlp.c_proj",            # exaone
             "model.layers.{bid}.feed_forward.down_proj",  # llama4
+            "layers.{bid}.mlp.Wo",                        # modern-bert
         ),

         MODEL_TENSOR.FFN_DOWN_EXP: (

@@ -830,12 +838,18 @@ class TensorNameMap:
         # TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg
         MODEL_TENSOR.ENC_OUTPUT_NORM: (
             "encoder.final_layer_norm",  # t5
+            "final_norm",                # modern-bert
         ),

         MODEL_TENSOR.CLS: (
             "classifier",        # jina
             "classifier.dense",  # roberta
             "pre_classifier",    # distillbert
+            "head.dense",        # modern-bert
+        ),
+
+        MODEL_TENSOR.CLS_NORM: (
+            "head.norm",  # modern-bert
         ),

         MODEL_TENSOR.CLS_OUT: (
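
A quick way to sanity-check the new mappings (a hedged sketch; n_blocks=22 assumes ModernBERT-base's layer count, adjust as needed):

    import gguf

    tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MODERN_BERT, n_blocks=22)
    name = tmap.get_name("layers.0.attn.Wqkv.weight", try_suffixes=(".weight", ".bias"))
    print(name)  # expected: blk.0.attn_qkv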

models/ggml-vocab-modern-bert.gguf (1.06 MB, binary file not shown)
models/ggml-vocab-modern-bert.gguf.inp (112 additions, 0 deletions)

@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Äpfel
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+ discards
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
models/ggml-vocab-modern-bert.gguf.out (46 additions, 0 deletions)

@@ -0,0 +1,46 @@
+ 728 577 24142 2607
+ 37515 18569 293
+
+ 209
+ 50276
+ 50275
+ 186
+ 187
+ 535
+ 2756
+ 186 187
+ 12092 1533
+ 24387 1533
+ 12092 3645
+ 24387 3645
+ 24387 3645 2
+ 12092 13 1533 2
+ 24387 13 1533 2
+ 436 310 22692 101 236 15 14161
+ 88 27244 818 16853 16392 20505 4989 11917
+ 32520 11514 1068 8713 38177 13396 3415 9925 12559 10453 1389
+ 18081 211 18081 116 18081 230 39936 222 18081 226 39936 213 18081 233 18081 117 18081 242 39936 212 18081 242 18081 97 18081 116 18081 216 18081 212 18081 238 18081 216 39936 212 18081 220
+ 14931 237 211 313 6320 10 49042 116 325 224 14931 223 106 171 118 226 313 34263 802 13511 261 32147 456 10 3384 239 216 313 7483 802 80 8020 326 556 697 1211 10669 10
+ 12092
+ 24387
+ 50276 12092
+ 50275 12092
+ 50274 12092
+ 50274 12092 187 50274 12092
+ 313
+ 187 426
+ 8 8685
+ 12092 13 340 8 455 2 1359 403 368 49042 212 3736 15367 41197 13610 19934 41869 21275 1012 1047 18795 40120 20422 241
+ 18963 4672
+ 20
+ 1610
+ 20084
+ 26409
+ 1610 20084
+ 26409 1610
+ 26409 20084
+ 26409 26409
+ 26409 1610 20084
+ 36 6829 244 66 17721 35177 85
+ 1262 2196
+ 586 1744 33525 186 209 623 28910 187 50276 187 50275 187 50274 187 50273 187 14931 237 211 313 6320 10 49042 116 325 224 14931 223 106 171 118 226 313 34263 802 13511 261 32147 456 10 3384 239 216 22692 101 236 14931 101 236 495 5922 30057 495 20084 495 26409 30057 20084 495 26409 1610 495 26409 20084 495 15 20 495 537 20 495 1051 20 209 18081 211 18081 116 18081 230 39936 222 18081 226 39936 213 18081 233 18081 117 18081 242 39936 212 18081 242 18081 97 18081 116 18081 216 14931 235 212 3736 15367 41197 13610 19934 41869 21275 1012 1047 18795 40120 20422 241 16081 6877 12880 11514 1068 8713 38177 13396 3415 9925 12559 10453 1389 42011 35033 34842 11202 9739 9739 33021 18963 4672 25561 8220 309 1849 644 686 42618 344 434 627 13 686 1848 368 2119 32 686 46 417 2119 309 1833 1056 352 13 686 37 368 751 690 10331 32 844 8 31516 247 8 77 45
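
These two fixtures drive llama.cpp's tokenizer tests (e.g. test-tokenizer-0): inputs are separated by __ggml_vocab_test__ lines in the .inp file, and the Nth line of the .out file carries the expected token IDs for the Nth input. A hedged sketch of a reader for this format (an illustrative helper, not part of the repo):

    def read_vocab_tests(base: str) -> list[tuple[str, list[int]]]:
        with open(base + ".inp", encoding="utf-8") as f:
            inputs = f.read().split("__ggml_vocab_test__\n")
        with open(base + ".out", encoding="utf-8") as f:
            expected = [[int(t) for t in line.split()] for line in f]
        # each input chunk carries a trailing newline from the separator;
        # zip also drops the empty tail chunk after the final separator
        return [(s.removesuffix("\n"), ids) for s, ids in zip(inputs, expected)]

    cases = read_vocab_tests("models/ggml-vocab-modern-bert.gguf")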
