@@ -620,6 +620,9 @@ def load_hparams(dir_model: Path, is_mistral_format: bool):
         if "thinker_config" in config:
             # rename for Qwen2.5-Omni
             config["text_config"] = config["thinker_config"]["text_config"]
+        if "language_config" in config:
+            # rename for DeepSeekOCR
+            config["text_config"] = config["language_config"]
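+            # (assumption: DeepSeek-OCR's config.json nests the text-model settings under
+            #  "language_config", analogous to the "thinker_config" case above)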
         return config
 
     @classmethod
@@ -1442,7 +1445,7 @@ class MmprojModel(ModelBase):
     preprocessor_config: dict[str, Any]
     global_config: dict[str, Any]
 
-    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
+    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "width.clip-l-14-224.layers", "sam_vit_b.layers"]
 
     has_vision_encoder: bool = True  # by default
     has_audio_encoder: bool = False
@@ -1488,13 +1491,31 @@ def __init__(self, *args, **kwargs):
         # TODO @ngxson : this is a hack to support both vision and audio encoders
         have_multiple_encoders = self.has_audio_encoder and self.has_vision_encoder
         self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys, True)
+        # FIXME: DeepseekOCRVisionModel specific hack
+        if self.block_count is None:
+            if isinstance(self, DeepseekOCRVisionModel):
+                clip_block_count = self.hparams['width']['clip-l-14-224']['layers']
+                sam_block_count = self.hparams['width']['sam_vit_b']['layers']
+                if clip_block_count is not None:
+                    self.block_count = clip_block_count
+                if sam_block_count is not None:
+                    self.block_count = sam_block_count if self.block_count is None else self.block_count + sam_block_count
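+                # e.g. an assumed 24-layer CLIP-L plus a 12-layer SAM ViT-B would give
+                # block_count = 36; both towers are counted so the tensor name map below
+                # covers every layer (illustrative sizes, not read from the checkpoint)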
+            if self.block_count is None:
+                raise KeyError(f"could not find block count using any of: {self.n_block_keys}")
         self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
 
         # load preprocessor config
         self.preprocessor_config = {}
         if not self.is_mistral_format:
-            with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
-                self.preprocessor_config = json.load(f)
+            # load preprocessor_config.json if present, otherwise fall back to
+            # processing_config.json
+            if (self.dir_model / "preprocessor_config.json").is_file():
+                with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
+                    self.preprocessor_config = json.load(f)
+            elif (self.dir_model / "processing_config.json").is_file():
+                with open(self.dir_model / "processing_config.json", "r", encoding="utf-8") as f:
+                    self.preprocessor_config = json.load(f)
 
     def get_vision_config(self) -> dict[str, Any] | None:
         config_name = "vision_config" if not self.is_mistral_format else "vision_encoder"
@@ -5770,6 +5791,61 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
         return []  # skip other tensors
 
+@ModelBase.register("DeepseekOCRForCausalLM")
+class DeepseekOCRVisionModel(MmprojModel):
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DEEPSEEKOCR)
+        # default values below are taken from HF transformers code
+        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
+        self.gguf_writer.add_vision_use_gelu(True)
+        # calculate proj_scale_factor (used by tinygemma3 test model)
+        image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
+        n_per_side = int(image_seq_length ** 0.5)
+        image_size = self.hparams["image_size"]
+        patch_size = self.hparams["patch_size"]
+        proj_scale_factor = (image_size // patch_size) // n_per_side
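+        # e.g. with illustrative values image_size=1024, patch_size=16 and image_seq_length=256:
+        # n_per_side = 16 and proj_scale_factor = (1024 // 16) // 16 = 4, the default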
+        if proj_scale_factor > 0 and proj_scale_factor != 4:
+            # we only need to write this if it's not the default value
+            # in this case, we are converting a test model
+            self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor)
+
+    def get_vision_config(self) -> dict[str, Any] | None:
+        # prefer the top-level "vision_config" if present; otherwise fall back to the
+        # default MmprojModel lookup
+        orig_vision_config = self.global_config.get("vision_config")
+        return orig_vision_config if orig_vision_config is not None else super().get_vision_config()
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        # related to https://github.com/ggml-org/llama.cpp/issues/13025
+        if "input_projection" in name:
+            return gguf.GGMLQuantizationType.F16
+        if ".embeddings." in name:
+            return gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if "vision_model.head." in name:
+            return []  # skip redundant tensors for tinygemma3
+
+        if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
+                or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
+            # process vision tensors
+            name = name.replace("_weight", ".weight")
+
+            # correct norm value; only this "soft_emb_norm" needs to be corrected as it's part of the Gemma projector
+            # the other norm values are part of the SigLIP model, and they are already correct
+            # ref code: Gemma3RMSNorm
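+            # (Gemma3RMSNorm computes x * (1 + weight), so the stored weight is offset by -1;
+            #  adding 1 below converts it to the conventional multiplicative scale)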
+            if "soft_emb_norm.weight" in name:
+                logger.info(f"Correcting norm value for '{name}'")
+                data_torch = data_torch + 1
+
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return []  # skip other tensors
+
 
 @ModelBase.register("Gemma3nForConditionalGeneration")
 class Gemma3NModel(Gemma3Model):
@@ -6943,6 +7019,7 @@ def prepare_tensors(self):
 @ModelBase.register(
     "DeepseekV2ForCausalLM",
     "DeepseekV3ForCausalLM",
+    "DeepseekOCRForCausalLM",
     "KimiVLForConditionalGeneration",
 )
 class DeepseekV2Model(TextModel):
@@ -7009,31 +7086,35 @@ def set_gguf_parameters(self):
 
         super().set_gguf_parameters()
         hparams = self.hparams
+        kv_lora_rank = hparams["kv_lora_rank"] if hparams.get("kv_lora_rank") is not None else 512
+        routed_scaling_factor = hparams.get("routed_scaling_factor", 1.0)
+        norm_topk_prob = hparams.get("norm_topk_prob", False)
+        scoring_func = hparams.get("scoring_func", "softmax")
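+        # (some configs, e.g. DeepSeek-OCR's, may omit these keys; the fallbacks above
+        # are assumed to match the DeepSeek-V2 defaults)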
 
         self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
         if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
             self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
-        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+        self.gguf_writer.add_kv_lora_rank(kv_lora_rank)
 
         # note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
-        self.gguf_writer.add_key_length(hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"])
-        self.gguf_writer.add_value_length(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length(kv_lora_rank + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length(kv_lora_rank)
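+        # e.g. DeepSeek-V2 uses kv_lora_rank=512 and qk_rope_head_dim=64, so the compressed
+        # KV cache stores 576-wide keys and 512-wide values per token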
         self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
         self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
 
         self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
         self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
         self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
-        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
-        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+        self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
+        self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
 
-        if hparams["scoring_func"] == "sigmoid":
+        if scoring_func == "sigmoid":
             self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
-        elif hparams["scoring_func"] == "softmax":
+        elif scoring_func == "softmax":
             self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
         else:
-            raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
+            raise ValueError(f"Unsupported scoring_func value: {scoring_func}")
 
         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
 
@@ -7043,12 +7124,14 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
             self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
             self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6))
 
     _experts: list[dict[str, Tensor]] | None = None
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # skip vision tensors and remove "language_model." for Kimi-VL
-        if "vision_tower" in name or "multi_modal_projector" in name:
+        if "vision_" in name or "multi_modal_projector" in name \
+                or "image_newline" in name or "model.projector" in name or "sam_model" in name or "view_seperator" in name:
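+            # (the extra patterns cover DeepSeek-OCR's vision-side tensors, including the
+            # checkpoint's "view_seperator" spelling; they are excluded from the text-model
+            # GGUF, and the vision encoder is converted by DeepseekOCRVisionModel instead)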
             return []
 
         if name.startswith("language_model."):