@@ -236,6 +236,7 @@ class ModelType(Enum):
 
     LlaMA4 = ModelTypeTagChatImageIn + 0x0000001
     Gemma3Vis = ModelTypeTagChatImageIn + 0x0000011
+    DotsOCR = ModelTypeTagChatImageIn + 0x0000020
 
     Qwen2Audio = ModelTypeTagChatAudioIn + 0x0000001
 
@@ -7948,6 +7949,96 @@ def get_weight_names(config):
 
         return weight_names
 
+class DotsOCRConverter(BaseConverter):
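+    # Reuses QWen2Converter for the language-model part; the vision_tower.* tensors are renamed to the vision_model.* layout listed in get_weight_names below.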
+    MODEL_TYPE = ModelType.DotsOCR
+
+    @classmethod
+    def state_dict_pp(cls, config, state_dict):
+        r = {}
+        for name in state_dict:
+            tensor: torch.Tensor = state_dict[name]
+            if not name.startswith('vision_tower'):
+                r[name] = tensor
+                continue
+
+            if name.startswith('vision_tower.blocks.'):
+                name = name.replace('vision_tower.blocks.', 'vision_model.layers.')
+                if '.attn.proj.' in name:
+                    name = name.replace('.proj.', '.o_proj.')
+                    r[name] = tensor
+                elif '.qkv.' in name:
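+                    # fused qkv projection: q, k and v each span hidden_size rows along dim 0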
+                    hidden_size = config.vision_config['hidden_size']
+                    q, k, v = tensor.split([hidden_size, hidden_size, hidden_size], dim=0)
+                    r[name.replace('.qkv.', '.q_proj.')] = q
+                    r[name.replace('.qkv.', '.k_proj.')] = k
+                    r[name.replace('.qkv.', '.v_proj.')] = v
+                elif '.fc3' in name:
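+                    # MLP naming: fc1 -> gate_proj, fc2 -> down_proj, fc3 -> up_proj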
+                    r[name.replace('.fc3.', '.up_proj.')] = tensor
+                elif '.fc2' in name:
+                    r[name.replace('.fc2.', '.down_proj.')] = tensor
+                elif '.fc1' in name:
+                    r[name.replace('.fc1.', '.gate_proj.')] = tensor
+                else:
+                    r[name] = tensor
+            elif name.startswith('vision_tower.merger'):
+                name = name.replace('vision_tower.merger.', 'vision_model.merger.')
+
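+                # rename the merger MLP's indexed layers (mlp.0 / mlp.2) to fc0 / fc1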
+                if '.mlp.0.' in name:
+                    name = name.replace('.mlp.0.', '.mlp.fc0.')
+                elif '.mlp.2.' in name:
+                    name = name.replace('.mlp.2.', '.mlp.fc1.')
+
+                r[name] = tensor
+            elif name.startswith('vision_tower.patch_embed.patchifier.'):
+                name = name.replace('vision_tower.patch_embed.patchifier.', 'vision_model.patch_embed.')
+                r[name] = tensor
+            else:
+                name = name.replace('vision_tower.', 'vision_model.')
+                r[name] = tensor
+
+        return r
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
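+        # only vision configs with post-norm and without bias are handled; the language-model config is dumped via QWen2Converter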
+        vis_config = AttributeDict(config.vision_config)
+        assert vis_config.post_norm
+        assert not vis_config.use_bias
+        QWen2Converter.dump_config(f, config, ggml_type)
+
+    @staticmethod
+    def get_weight_names(config):
+        weight_names = QWen2Converter.get_weight_names(config)
+
+        vis_config = AttributeDict(config.vision_config)
+
+        for i in range(vis_config.num_hidden_layers):
+            weight_names += [
+                f"vision_model.layers.{i}.attn.q_proj.weight",
+                f"vision_model.layers.{i}.attn.k_proj.weight",
+                f"vision_model.layers.{i}.attn.v_proj.weight",
+                f"vision_model.layers.{i}.attn.o_proj.weight",
+                f"vision_model.layers.{i}.mlp.up_proj.weight",
+                f"vision_model.layers.{i}.mlp.down_proj.weight",
+                f"vision_model.layers.{i}.mlp.gate_proj.weight",
+                f"vision_model.layers.{i}.norm1.weight",
+                f"vision_model.layers.{i}.norm2.weight",
+            ]
+
+        weight_names += [
+            "vision_model.merger.ln_q.bias",
+            "vision_model.merger.ln_q.weight",
+            "vision_model.merger.mlp.fc0.bias",
+            "vision_model.merger.mlp.fc0.weight",
+            "vision_model.merger.mlp.fc1.bias",
+            "vision_model.merger.mlp.fc1.weight",
+            "vision_model.patch_embed.norm.weight",
+            "vision_model.patch_embed.proj.bias",
+            "vision_model.patch_embed.proj.weight",
+            "vision_model.post_trunk_norm.weight",
+        ]
+
+        return weight_names
+
 def convert_grok_1_base(args, vocab, ggml_type):
     def ffn_size(emb_size, widening_factor):
         _ffn_size = int(widening_factor * emb_size) * 2 // 3
@@ -8561,6 +8652,8 @@ def main():
     elif arch == 'MultiModalityCausalLM':
         assert JanusConverter.is_proper_config(config)
         JanusConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
+    elif arch.endswith('DotsOCRForCausalLM'):
+        DotsOCRConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'deepseek-r1-distill-qwen3':
         QWen3Converter.MODEL_TYPE = ModelType.DeepSeek_R1_Distill_QWen3
         QWen3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)