@@ -802,6 +802,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
             # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
             res = "seed-coder"
+        if chkhsh == "a81863d07e75497e2194eb1a1574d5e5cd4d5f85a87a0728b922bf2bed6fb327":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v3
+            res = "jina-v3"

         if res is None:
             logger.warning("\n")
@@ -3829,26 +3832,59 @@ def _is_tokenizer_xlmroberta(self) -> bool:
 class XLMRobertaModel(BertModel):
     model_arch = gguf.MODEL_ARCH.BERT

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self._xlmroberta_tokenizer_init()
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
+        hparams = kwargs.pop("hparams", None)
+        if hparams is None:
+            hparams = ModelBase.load_hparams(dir_model)
+
+        if hparams.get("lora_adaptations"):
+            self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3
+
+        super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)
+
+        self._tokenizer_is_xlmroberta = False if self.model_arch == gguf.MODEL_ARCH.JINA_BERT_V3 else True
+        if self._tokenizer_is_xlmroberta:
+            self._xlmroberta_tokenizer_init()

     def set_vocab(self):
-        self._xlmroberta_set_vocab()
+        if self._tokenizer_is_xlmroberta:
+            return self._xlmroberta_set_vocab()
+        return super().set_vocab()

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # if name starts with "roberta.", remove the prefix
         # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
         if name.startswith("roberta."):
             name = name[8:]

+        # jina-embeddings-v3
+        if ".parametrizations." in name:
+            name = name.replace(".parametrizations.", ".")
+            if name.endswith(".original"):
+                name = name[:-9]
+
         # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
         if name == "embeddings.position_embeddings.weight":
             if self._position_offset is not None:
                 data_torch = data_torch[self._position_offset:,:]

+        if name.endswith(".lora_A"):
+            # TODO: convert loras
+            return []
+
+        if name.endswith(".lora_B"):
+            # TODO: convert loras
+            return []
+
         return super().modify_tensors(data_torch, name, bid)

+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # jina-embeddings-v3
+        if rotary_emb_base := self.hparams.get("rotary_emb_base"):
+            self.gguf_writer.add_rope_freq_base(rotary_emb_base)
+

 @ModelBase.register("GemmaForCausalLM")
 class GemmaModel(TextModel):
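For reference, a minimal sketch of the tensor-name normalization that the new `modify_tensors` logic applies to jina-embeddings-v3 checkpoints, assuming nothing beyond the hunk above; the sample tensor names are illustrative, not taken from the real model. The `set_gguf_parameters` addition simply forwards `rotary_emb_base` from the model config to the GGUF rope frequency base when present.

```python
# Sketch of the name handling in XLMRobertaModel.modify_tensors above.
# The sample tensor names below are hypothetical, chosen only to exercise each branch.
def normalize_name(name: str) -> str | None:
    if name.startswith("roberta."):            # optional "roberta." prefix is dropped
        name = name[8:]
    if ".parametrizations." in name:           # jina-embeddings-v3 parametrized weights
        name = name.replace(".parametrizations.", ".")
        if name.endswith(".original"):         # "...weight.original" -> "...weight"
            name = name[:-9]
    if name.endswith((".lora_A", ".lora_B")):  # LoRA adapters are skipped for now (TODO)
        return None
    return name


assert normalize_name(
    "roberta.encoder.layer.0.attention.self.query.parametrizations.weight.original"
) == "encoder.layer.0.attention.self.query.weight"
assert normalize_name("encoder.layer.0.mlp.parametrizations.weight.lora_A") is None
```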