Skip to content

Commit 84f755f

Browse files
committed
support dots.ocr
1 parent 5030b1e commit 84f755f

File tree

11 files changed

+783
-43
lines changed

11 files changed

+783
-43
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ set(core_files src/backend.cpp
7676
models/decilm.cpp
7777
models/deepseek.cpp
7878
models/dolphinphi2.cpp
79+
models/dots.cpp
7980
models/ernie.cpp
8081
models/exaone.cpp
8182
models/falcon.cpp

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ LittleAcademia[<a href="https://github.com/foldl/little-academia" style="text-
3131

3232
**What's New:**
3333

34+
* 2025-10-13: dots.ocr
3435
* 2025-10-10: [I can draw](./docs/multimodal.md): Janus-Pro
3536
* 2025-09-23: Qwen2.5-VL
3637
* 2025-09-15: Ling/Ring-mini-2.0

convert.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,7 @@ class ModelType(Enum):
236236

237237
LlaMA4 = ModelTypeTagChatImageIn + 0x0000001
238238
Gemma3Vis = ModelTypeTagChatImageIn + 0x0000011
239+
DotsOCR = ModelTypeTagChatImageIn + 0x0000020
239240

240241
Qwen2Audio = ModelTypeTagChatAudioIn + 0x0000001
241242

@@ -7948,6 +7949,96 @@ def get_weight_names(config):
79487949

79497950
return weight_names
79507951

7952+
class DotsOCRConverter(BaseConverter):
    """Converter for dots.ocr checkpoints (`DotsOCRForCausalLM`).

    The language-model half is identical to Qwen2, so config dumping and the
    LM weight list are delegated to ``QWen2Converter``; this class only maps
    the ``vision_tower.*`` tensor names onto the ``vision_model.*`` layout
    expected by the C++ runtime.
    """

    MODEL_TYPE = ModelType.DotsOCR

    # Vision-block MLP renames: HF uses fc1/fc2/fc3, the runtime expects
    # gate/down/up projections. Each entry is (marker, replacement for
    # marker + '.').
    _MLP_RENAMES = (
        ('.fc3', '.up_proj.'),
        ('.fc2', '.down_proj.'),
        ('.fc1', '.gate_proj.'),
    )

    @classmethod
    def state_dict_pp(cls, config, state_dict):
        """Rename vision-tower tensors to the runtime's naming scheme.

        Non-vision tensors pass through untouched. Fused ``qkv`` tensors are
        split into separate q/k/v projections (equal thirds along dim 0,
        each of size ``vision_config['hidden_size']``).
        """
        out = {}
        for name, param in state_dict.items():
            param: torch.Tensor
            if not name.startswith('vision_tower'):
                out[name] = param
                continue

            if name.startswith('vision_tower.blocks.'):
                new_name = name.replace('vision_tower.blocks.', 'vision_model.layers.')
                if '.attn.proj.' in new_name:
                    out[new_name.replace('.proj.', '.o_proj.')] = param
                elif '.qkv.' in new_name:
                    # q, k and v each span hidden_size rows of the fused tensor.
                    hidden = config.vision_config['hidden_size']
                    q, k, v = param.split([hidden, hidden, hidden], dim=0)
                    for proj, piece in (('.q_proj.', q), ('.k_proj.', k), ('.v_proj.', v)):
                        out[new_name.replace('.qkv.', proj)] = piece
                else:
                    for marker, replacement in cls._MLP_RENAMES:
                        if marker in new_name:
                            out[new_name.replace(marker + '.', replacement)] = param
                            break
                    else:
                        # norms and anything else keep their (re-rooted) name
                        out[new_name] = param
            elif name.startswith('vision_tower.merger'):
                new_name = name.replace('vision_tower.merger.', 'vision_model.merger.')
                # the merger MLP is an nn.Sequential: index 0 -> fc0, index 2 -> fc1
                if '.mlp.0.' in new_name:
                    new_name = new_name.replace('.mlp.0.', '.mlp.fc0.')
                elif '.mlp.2.' in new_name:
                    new_name = new_name.replace('.mlp.2.', '.mlp.fc1.')
                out[new_name] = param
            elif name.startswith('vision_tower.patch_embed.patchifier.'):
                out[name.replace('vision_tower.patch_embed.patchifier.',
                                 'vision_model.patch_embed.')] = param
            else:
                out[name.replace('vision_tower.', 'vision_model.')] = param

        return out

    @staticmethod
    def dump_config(f, config, ggml_type):
        """Write the model config; only the Qwen2-style LM part is dumped.

        The asserts pin the vision-tower variant this converter supports
        (post-norm enabled, no attention bias).
        """
        vis_config = AttributeDict(config.vision_config)
        assert vis_config.post_norm
        assert not vis_config.use_bias
        QWen2Converter.dump_config(f, config, ggml_type)

    @staticmethod
    def get_weight_names(config):
        """Return LM weight names (from Qwen2) plus the vision-tower names."""
        weight_names = QWen2Converter.get_weight_names(config)

        vis_config = AttributeDict(config.vision_config)

        for i in range(vis_config.num_hidden_layers):
            prefix = f"vision_model.layers.{i}"
            weight_names += [
                f"{prefix}.attn.q_proj.weight",
                f"{prefix}.attn.k_proj.weight",
                f"{prefix}.attn.v_proj.weight",
                f"{prefix}.attn.o_proj.weight",
                f"{prefix}.mlp.up_proj.weight",
                f"{prefix}.mlp.down_proj.weight",
                f"{prefix}.mlp.gate_proj.weight",
                f"{prefix}.norm1.weight",
                f"{prefix}.norm2.weight",
            ]

        weight_names += [
            "vision_model.merger.ln_q.bias",
            "vision_model.merger.ln_q.weight",
            "vision_model.merger.mlp.fc0.bias",
            "vision_model.merger.mlp.fc0.weight",
            "vision_model.merger.mlp.fc1.bias",
            "vision_model.merger.mlp.fc1.weight",
            "vision_model.patch_embed.norm.weight",
            "vision_model.patch_embed.proj.bias",
            "vision_model.patch_embed.proj.weight",
            "vision_model.post_trunk_norm.weight",
        ]

        return weight_names
79518042
def convert_grok_1_base(args, vocab, ggml_type):
79528043
def ffn_size(emb_size, widening_factor):
79538044
_ffn_size = int(widening_factor * emb_size) * 2 // 3
@@ -8561,6 +8652,8 @@ def main():
85618652
elif arch == 'MultiModalityCausalLM':
85628653
assert JanusConverter.is_proper_config(config)
85638654
JanusConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
8655+
elif arch.endswith('DotsOCRForCausalLM'):
8656+
DotsOCRConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
85648657
elif arch == 'deepseek-r1-distill-qwen3':
85658658
QWen3Converter.MODEL_TYPE = ModelType.DeepSeek_R1_Distill_QWen3
85668659
QWen3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)

docs/models.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,14 @@ Please use `--format completion` for these models.
376376

377377
Note: Use `--set do-split 1` to enable _Split_.
378378

379+
## OCR Models
380+
381+
* dots.ocr (`DotsOCRForCausalLM`)
382+
* [x] [3B](https://huggingface.co/rednote-hilab/dots.ocr/tree/ba670c5dcf03ff4e02015558c95b4042f5dce069)
383+
384+
Note: Prompt for OCR: _{{image:...}}Extract the text content from this image_. [Here](https://github.com/rednote-hilab/dots.ocr/blob/master/dots_ocr/utils/prompts.py)
385+
are other prompts for OCR. Use `+single-turn` to discard history automatically.
386+
379387
## RAG Models
380388

381389
* Text Embedding (`XLMRobertaModel`)

0 commit comments

Comments
 (0)