Skip to content

Commit 431d519

Browse files
authored
fix: parsing MinerU result of VLM backend (#1380)
1 parent ef2645c commit 431d519

File tree

1 file changed

+72
-1
lines changed

1 file changed

+72
-1
lines changed

aperag/docparser/mineru_common.py

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,9 @@ class BlockType:
115115
TableBody = "table_body"
116116
TableCaption = "table_caption"
117117
TableFootnote = "table_footnote"
118+
Code = "code"
119+
CodeBody = "code_body"
120+
CodeCaption = "code_caption"
118121

119122

120123
class ContentType:
@@ -145,13 +148,28 @@ def convert_para(
145148
}
146149
)
147150

148-
if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
151+
if para_type in [BlockType.Text, BlockType.Index]:
149152
return [
150153
TextPart(
151154
content=merge_para_with_text(para_block),
152155
metadata=metadata,
153156
)
154157
]
158+
elif para_type == BlockType.List:
159+
# The output of VLM backend for the List type is different than the pipeline backend.
160+
# See https://opendatalab.github.io/MinerU/reference/output_files/#intermediate-processing-results-middlejson_1
161+
if para_block.get("sub_type") is None:
162+
# The `sub_type` field is exclusive to the VLM backend.
163+
# Its absence indicates the pipeline backend is in use.
164+
return [
165+
TextPart(
166+
content=merge_para_with_text(para_block),
167+
metadata=metadata,
168+
)
169+
]
170+
else:
171+
# In VLM backend, the List block is a second-level block.
172+
return _convert_list_para(image_dir, para_block, metadata)
155173
elif para_type == BlockType.Title:
156174
title_level = para_block.get("level", 1)
157175
return [
@@ -172,6 +190,9 @@ def convert_para(
172190
return _convert_image_para(image_dir, para_block, metadata)
173191
elif para_type == BlockType.Table:
174192
return _convert_table_para(image_dir, para_block, metadata)
193+
elif para_type == BlockType.Code:
194+
# Code blocks are exclusive to the VLM backend.
195+
return _convert_code_para(image_dir, para_block, metadata)
175196

176197
return []
177198

@@ -291,3 +312,53 @@ def _convert_table_para(image_dir: Path, para_block: dict[str, Any], metadata: d
291312
url=asset_url,
292313
)
293314
return [asset_bin_part, img_part]
315+
316+
317+
def _convert_list_para(image_dir: Path, para_block: dict[str, Any], metadata: dict[str, Any]) -> list[Part]:
318+
items: list[str] = []
319+
for block in para_block["blocks"]:
320+
if block["type"] == BlockType.Text:
321+
items.append(merge_para_with_text(block))
322+
323+
if len(items) == 0:
324+
return []
325+
326+
result: list[Part] = []
327+
for item in items:
328+
result.append(TextPart(content=item, metadata=metadata))
329+
return result
330+
331+
332+
def _convert_code_para(image_dir: Path, para_block: dict[str, Any], metadata: dict[str, Any]) -> list[Part]:
333+
code_body = ""
334+
code_caption = ""
335+
for block in para_block["blocks"]:
336+
block_type = block["type"]
337+
if block_type == BlockType.CodeBody:
338+
for line in block["lines"]:
339+
for span in line["spans"]:
340+
if span["type"] == ContentType.Text:
341+
code_body += span["content"] + "\n"
342+
elif block_type == BlockType.CodeCaption:
343+
for line in block["lines"]:
344+
for span in line["spans"]:
345+
if span["type"] == ContentType.Text:
346+
code_caption += span["content"] + "\n"
347+
348+
result = []
349+
if code_caption:
350+
code_caption_part = TextPart(
351+
content=code_caption,
352+
metadata=metadata,
353+
)
354+
result.append(code_caption_part)
355+
356+
if code_body:
357+
# TODO: add a CodePart
358+
code_body_part = TextPart(
359+
content=code_body,
360+
metadata=metadata,
361+
)
362+
result.append(code_body_part)
363+
364+
return result

0 commit comments

Comments
 (0)