@@ -523,15 +523,15 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_context_length(n_ctx)
         logger.info(f"gguf: context length = {n_ctx}")
 
-        if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
+        if (n_embd := self.find_hparam(["hidden_size", "n_embd", "dim"], optional=True)) is not None:
             self.gguf_writer.add_embedding_length(n_embd)
             logger.info(f"gguf: embedding length = {n_embd}")
 
-        if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
+        if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
             self.gguf_writer.add_feed_forward_length(n_ff)
             logger.info(f"gguf: feed forward length = {n_ff}")
 
-        if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
+        if (n_head := self.find_hparam(["num_attention_heads", "n_head", "n_heads"], optional=True)) is not None:
             self.gguf_writer.add_head_count(n_head)
             logger.info(f"gguf: head count = {n_head}")
 
@@ -674,12 +674,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
             # ref: https://huggingface.co/tiiuae/falcon-7b
             res = "falcon"
-        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
-            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
-            res = "falcon3"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+            res = "falcon3"
         if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
             # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
             res = "bert-bge-large"
@@ -731,9 +731,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
-        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
-            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
-            res = "chatglm-bpe"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
             # ref: https://huggingface.co/LumiOpen/Viking-7B
             res = "viking"
@@ -764,9 +761,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
             # ref: https://huggingface.co/facebook/chameleon-7b
             res = "chameleon"
-        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
-            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
-            res = "minerva-7b"
         if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
             # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
             res = "roberta-bpe"
@@ -797,15 +791,24 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406":
             # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
             res = "llama4"
-        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
-            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
-            res = "glm4"
         if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3":
             # ref: https://huggingface.co/mistral-community/pixtral-12b
             res = "pixtral"
         if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
             # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
             res = "seed-coder"
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
+            res = "glm4"
+        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
+            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
+            res = "minerva-7b"
 
         if res is None:
             logger.warning("\n")
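All of these chkhsh entries are compared against a fingerprint that get_vocab_base_pre derives from the tokenizer's output on a fixed probe string earlier in the function. A rough sketch of that idea, assuming a Hugging Face tokenizer object and a caller-supplied probe text (the real probe string in the script is much longer):

# Rough sketch of the pre-tokenizer fingerprint idea, assuming a Hugging Face
# tokenizer object; not the script's exact code.
from hashlib import sha256

def pretokenizer_fingerprint(tokenizer, probe_text: str) -> str:
    token_ids = tokenizer.encode(probe_text)
    # hashing the resulting token id list makes the digest sensitive to
    # pre-tokenizer behaviour, so each known tokenizer maps to a stable hash
    return sha256(str(token_ids).encode()).hexdigest()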
@@ -1044,6 +1047,10 @@ def _set_vocab_rwkv_world(self):
         special_vocab.chat_template = "rwkv-world"
         # hack: Add '\n\n' as the EOT token to make it chat normally
         special_vocab._set_special_token("eot", 261)
+        # hack: Override these as they have already been set (incorrectly)
+        special_vocab.special_token_ids["bos"] = 0
+        special_vocab.special_token_ids["eos"] = 0
+
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
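The direct writes into special_token_ids (rather than another _set_special_token call) presumably bypass the helper's keep-first behaviour, so the previously loaded BOS/EOS ids are replaced with token id 0 before add_to_gguf runs. A hedged spot-check of the converted file, assuming gguf-py's GGUFReader and a hypothetical output path; the scalar-unpacking details may differ between gguf-py versions:

# Hedged spot-check of the resulting metadata; the file name is hypothetical and
# the field-unpacking pattern is an assumption about gguf-py's reader layout.
from gguf import GGUFReader

reader = GGUFReader("rwkv-world.gguf")  # hypothetical path to the converted model
for key in ("tokenizer.ggml.bos_token_id", "tokenizer.ggml.eos_token_id", "tokenizer.ggml.eot_token_id"):
    field = reader.get_field(key)
    if field is not None:
        print(key, field.parts[field.data[-1]][0])  # expected: 0, 0, 261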
@@ -3907,6 +3914,26 @@ def _xlmroberta_set_vocab(self) -> None:
         self.gguf_writer.add_add_eos_token(True)
 
 
+@ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
+class DistilBertModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_layer_norm_eps(1e-12)
+        logger.info("gguf: layer norm epsilon = 1e-12")
+        super().set_gguf_parameters()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("distilbert."):
+            name = name[11:]
+
+        # These layers act as MLM head, so we don't need them
+        if name.startswith("vocab_"):
+            return []
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("RobertaModel", "RobertaForSequenceClassification")
 class RobertaModel(BertModel):
     model_arch = gguf.MODEL_ARCH.BERT
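To make the DistilBERT remapping concrete, here is a standalone sketch of what the new modify_tensors does to typical checkpoint tensor names; the names below are illustrative of common Hugging Face DistilBERT layouts, not taken from this PR:

# Standalone illustration of the prefix strip and MLM-head filtering above;
# tensor names are illustrative examples, not read from an actual checkpoint.
checkpoint_names = [
    "distilbert.embeddings.word_embeddings.weight",            # kept, prefix stripped
    "distilbert.transformer.layer.0.attention.q_lin.weight",   # kept, prefix stripped
    "vocab_transform.weight",                                   # dropped (MLM head)
    "vocab_projector.bias",                                     # dropped (MLM head)
]

for name in checkpoint_names:
    if name.startswith("distilbert."):
        name = name[len("distilbert."):]
    if name.startswith("vocab_"):
        continue  # filtered out, never written to the GGUF
    print(name)  # handed on to BertModel's usual BERT tensor mapping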