@@ -523,15 +523,15 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_context_length(n_ctx)
         logger.info(f"gguf: context length = {n_ctx}")

-        if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
+        if (n_embd := self.find_hparam(["hidden_size", "n_embd", "dim"], optional=True)) is not None:
             self.gguf_writer.add_embedding_length(n_embd)
             logger.info(f"gguf: embedding length = {n_embd}")

-        if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
+        if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
             self.gguf_writer.add_feed_forward_length(n_ff)
             logger.info(f"gguf: feed forward length = {n_ff}")

-        if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
+        if (n_head := self.find_hparam(["num_attention_heads", "n_head", "n_heads"], optional=True)) is not None:
             self.gguf_writer.add_head_count(n_head)
             logger.info(f"gguf: head count = {n_head}")

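For context, the hunk above works because find_hparam tries each candidate key in order against the model's config.json, so adding "dim", "hidden_dim", and "n_heads" lets configs that use DistilBERT-style key names resolve to the same GGUF fields. Below is a minimal sketch of that fallback lookup, assuming hparams is a plain dict loaded from config.json; the real find_hparam in convert_hf_to_gguf.py is a method on the model class and may differ in detail.

from typing import Any, Iterable

def find_hparam(hparams: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any:
    # Return the first hyperparameter that is present under any of the candidate names.
    for key in keys:
        if key in hparams:
            return hparams[key]
    if optional:
        return None
    raise KeyError(f"could not find any of: {list(keys)}")

# DistilBERT-style configs name these fields differently from BERT/LLaMA-style ones:
hparams = {"dim": 768, "hidden_dim": 3072, "n_heads": 12}
assert find_hparam(hparams, ["hidden_size", "n_embd", "dim"], optional=True) == 768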
@@ -674,12 +674,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
             # ref: https://huggingface.co/tiiuae/falcon-7b
             res = "falcon"
-        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
-            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
-            res = "falcon3"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+            res = "falcon3"
         if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
             # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
             res = "bert-bge-large"
@@ -731,9 +731,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
-        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
-            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
-            res = "chatglm-bpe"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
             # ref: https://huggingface.co/LumiOpen/Viking-7B
             res = "viking"
@@ -764,9 +761,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
             # ref: https://huggingface.co/facebook/chameleon-7b
             res = "chameleon"
-        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
-            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
-            res = "minerva-7b"
         if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
             # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
             res = "roberta-bpe"
@@ -797,15 +791,24 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406":
             # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
             res = "llama4"
-        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
-            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
-            res = "glm4"
         if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3":
             # ref: https://huggingface.co/mistral-community/pixtral-12b
             res = "pixtral"
         if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
             # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
             res = "seed-coder"
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
+            res = "glm4"
+        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
+            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
+            res = "minerva-7b"

         if res is None:
             logger.warning("\n")
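The chkhsh checks reordered above all follow one pattern: encode a fixed probe string with the upstream tokenizer, hash the resulting token ids, and map the digest to a known pre-tokenizer name. Here is a simplified, hypothetical sketch of that idea; the probe text and exact hashing in get_vocab_base_pre may differ, and the two digests below are taken from the hunks above.

import hashlib

# Hypothetical, simplified version of the checksum-to-name lookup shown above.
# The digests come from this diff; the probe text here is only a placeholder.
KNOWN_PRE_TOKENIZERS = {
    "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e": "falcon3",
    "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2": "glm4",
}

def detect_pre_tokenizer(tokenizer, probe_text: str) -> str | None:
    # Hash the token ids produced for a fixed probe string; a byte-identical
    # pre-tokenizer yields an identical digest.
    chkhsh = hashlib.sha256(str(tokenizer.encode(probe_text)).encode()).hexdigest()
    return KNOWN_PRE_TOKENIZERS.get(chkhsh)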
@@ -1044,6 +1047,10 @@ def _set_vocab_rwkv_world(self):
         special_vocab.chat_template = "rwkv-world"
         # hack: Add '\n\n' as the EOT token to make it chat normally
         special_vocab._set_special_token("eot", 261)
+        # hack: Override these as they have already been set (incorrectly)
+        special_vocab.special_token_ids["bos"] = 0
+        special_vocab.special_token_ids["eos"] = 0
+
         special_vocab.add_to_gguf(self.gguf_writer)

     def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
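The key point in the hunk above is ordering: the BOS/EOS ids must be overridden before add_to_gguf() serializes them. A hedged sketch of that pattern, assuming (as the hunk implies) that gguf.SpecialVocab keeps its resolved ids in the special_token_ids dict and only writes them out in add_to_gguf; the constructor arguments here are illustrative, not the converter's actual call.

import gguf

def write_special_tokens(writer: gguf.GGUFWriter, model_dir, overrides: dict[str, int]) -> None:
    special_vocab = gguf.SpecialVocab(model_dir, load_merges=False)
    for token_type, token_id in overrides.items():
        special_vocab.special_token_ids[token_type] = token_id  # e.g. {"bos": 0, "eos": 0} for RWKV world
    special_vocab.add_to_gguf(writer)                           # ids are only written to the GGUF here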
@@ -3907,6 +3914,26 @@ def _xlmroberta_set_vocab(self) -> None:
         self.gguf_writer.add_add_eos_token(True)


+@ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
+class DistilBertModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_layer_norm_eps(1e-12)
+        logger.info("gguf: layer norm epsilon = 1e-12")
+        super().set_gguf_parameters()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("distilbert."):
+            name = name[len("distilbert."):]
+
+        # These layers act as MLM head, so we don't need them
+        if name.startswith("vocab_"):
+            return []
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("RobertaModel", "RobertaForSequenceClassification")
 class RobertaModel(BertModel):
     model_arch = gguf.MODEL_ARCH.BERT
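As a quick illustration of what the new DistilBertModel.modify_tensors does, the prefix stripping plus the "vocab_" filter maps HF checkpoint names onto the plain BERT names the parent class expects, while dropping the MLM-head tensors (vocab_transform, vocab_layer_norm, vocab_projector in DistilBertForMaskedLM). Below is a standalone, hypothetical recreation of just that renaming rule, not the converter's actual code path.

def map_distilbert_name(name: str) -> str | None:
    # Strip the "distilbert." prefix so tensors line up with BertModel's mapping.
    if name.startswith("distilbert."):
        name = name[len("distilbert."):]
    # Drop the MLM head; it is not needed for the GGUF encoder graph.
    if name.startswith("vocab_"):
        return None
    return name

assert map_distilbert_name("distilbert.embeddings.word_embeddings.weight") == "embeddings.word_embeddings.weight"
assert map_distilbert_name("vocab_projector.bias") is None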