Skip to content

Commit 583aa1a

Browse files
authored
Merge pull request #72 from opendatalab/dev
support 2.5 pro
2 parents 2d04c0a + e04608c commit 583aa1a

18 files changed

+558
-60
lines changed

mineru_vl_utils/logits_processor/vllm_v1_no_repeat_ngram.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from typing import Any
44

5+
from loguru import logger
56
import torch
67
from vllm.config import VllmConfig
78

@@ -52,7 +53,7 @@ def update_state(self, batch_update: BatchUpdate | None) -> None:
5253
val = _get_int_value(params.extra_args, "no_repeat_ngram_size")
5354
no_repeat_ngram_size = 0 if (val is None or val < 0) else val
5455
if isinstance(params.extra_args, dict) and params.extra_args.get("debug"):
55-
print(f"Request {index}: no_repeat_ngram_size = {no_repeat_ngram_size}")
56+
logger.debug("Request {}: no_repeat_ngram_size = {}", index, no_repeat_ngram_size)
5657
self.req_info[index] = (no_repeat_ngram_size, output_tok_ids, {})
5758

5859
for a_index, b_index, direct in batch_update.moved:

mineru_vl_utils/mineru_client.py

Lines changed: 95 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,12 @@ def __init__(
7878
"<|rotate_left|>": 270,
7979
}
8080

81+
# Block types that prepare_for_extract subjects to the image-analysis checks
# (standalone test and minimum-size test) before cropping them.
IMAGE_ANALYSIS_TYPES = {"image", "chart"}
82+
# Block types treated as containers when looking for "image_caption" blocks
# that lie inside another block; such captions are removed from the list.
IMAGE_CAPTION_CONTAINER_TYPES = {"image", "chart", "image_block"}
83+
# Default minimum cover ratio (intersection area / candidate area) at which
# _find_covered_block_indices reports a block as covered by a container.
INTERNAL_BLOCK_THRESHOLD = 0.9
84+
# Minimum bbox width AND height for image analysis. Presumably expressed as a
# fraction of the page, since bboxes are multiplied by the image size before
# cropping -- TODO confirm.
IMAGE_ANALYSIS_MIN_BLOCK_SIZE = 0.1
85+
# Alternative criterion: a block whose bbox area (width * height) exceeds this
# value is also eligible for image analysis.
IMAGE_ANALYSIS_MIN_BLOCK_AREA = 0.01
86+
8187

8288
def _convert_bbox(bbox: Sequence[int] | Sequence[str]) -> list[float] | None:
8389
bbox = tuple(map(int, bbox))
@@ -98,12 +104,8 @@ def _parse_angle(tail: str) -> Literal[None, 0, 90, 180, 270]:
98104
return None
99105

100106

101-
def _parse_merge_type(tail: str) -> Literal[None, 'src', 'tgt']:
102-
if "txt_contd_src" in tail:
103-
return "src"
104-
elif "txt_contd_tgt" in tail:
105-
return "tgt"
106-
return None
107+
def _parse_merge_prev(tail: str) -> bool:
108+
return "txt_contd_tgt" in tail
107109

108110

109111
class MinerUClientHelper:
@@ -137,6 +139,57 @@ def __init__(
137139
self.enable_table_formula_eq_wrap = enable_table_formula_eq_wrap
138140
self.debug = debug
139141

142+
@staticmethod
143+
def _bbox_intersection_area(a: Sequence[float], b: Sequence[float]) -> float:
144+
x1 = max(a[0], b[0])
145+
y1 = max(a[1], b[1])
146+
x2 = min(a[2], b[2])
147+
y2 = min(a[3], b[3])
148+
if x2 <= x1 or y2 <= y1:
149+
return 0.0
150+
return (x2 - x1) * (y2 - y1)
151+
152+
@classmethod
def _bbox_cover_ratio(cls, inner: Sequence[float], outer: Sequence[float]) -> float:
    """Return the fraction of *inner*'s area that lies within *outer*.

    Args:
        inner: Box ``(x1, y1, x2, y2)`` whose coverage is measured.
        outer: Box ``(x1, y1, x2, y2)`` that may cover *inner*.

    Returns:
        Intersection area divided by the area of *inner*; ``0.0`` when
        *inner* has zero area (negative extents are clamped to zero).
    """
    inner_width = max(0.0, inner[2] - inner[0])
    inner_height = max(0.0, inner[3] - inner[1])
    inner_area = inner_width * inner_height
    if inner_area != 0:
        overlap = cls._bbox_intersection_area(inner, outer)
        return overlap / inner_area
    # A zero-area box cannot be meaningfully covered; avoid dividing by zero.
    return 0.0
158+
159+
@classmethod
def _find_covered_block_indices(
    cls,
    blocks: Sequence[ContentBlock],
    candidate_types: set[str],
    container_types: set[str],
    threshold: float = INTERNAL_BLOCK_THRESHOLD,
) -> set[int]:
    """Find candidate blocks that are largely covered by a container block.

    Args:
        blocks: Blocks to inspect; indices in the result refer to this
            sequence.
        candidate_types: Block types that may be reported as covered.
        container_types: Block types that may act as the covering block.
        threshold: Minimum cover ratio (see ``_bbox_cover_ratio``) for a
            candidate to count as covered.

    Returns:
        Indices of candidate blocks whose cover ratio against at least one
        other container block reaches *threshold*.
    """
    container_positions = [
        pos for pos, blk in enumerate(blocks) if blk.type in container_types
    ]
    hits: set[int] = set()
    if container_positions:
        for pos, blk in enumerate(blocks):
            if blk.type not in candidate_types:
                continue
            # A block never counts as its own container; stop at first match.
            if any(
                other != pos
                and cls._bbox_cover_ratio(blk.bbox, blocks[other].bbox) >= threshold
                for other in container_positions
            ):
                hits.add(pos)
    return hits
182+
183+
@staticmethod
def _is_eligible_for_image_analysis(block: ContentBlock) -> bool:
    """Decide whether *block* is large enough to be sent to image analysis.

    A block qualifies when both its bbox width and height exceed
    ``IMAGE_ANALYSIS_MIN_BLOCK_SIZE``, or when its bbox area exceeds
    ``IMAGE_ANALYSIS_MIN_BLOCK_AREA``.

    Args:
        block: Block whose ``bbox`` is ``(x1, y1, x2, y2)``.

    Returns:
        ``True`` if either size criterion is met, otherwise ``False``.
    """
    left, top, right, bottom = block.bbox
    span_x = right - left
    span_y = bottom - top
    if span_x > IMAGE_ANALYSIS_MIN_BLOCK_SIZE and span_y > IMAGE_ANALYSIS_MIN_BLOCK_SIZE:
        return True
    # Fall back to the area criterion for long, thin blocks.
    return span_x * span_y > IMAGE_ANALYSIS_MIN_BLOCK_AREA
192+
140193
def resize_by_need(self, image: Image.Image) -> Image.Image:
141194
edge_ratio = max(image.size) / min(image.size)
142195
if edge_ratio > self.max_image_edge_ratio:
@@ -169,19 +222,26 @@ def parse_layout_output(self, output: str) -> list[ContentBlock]:
169222
x1, y1, x2, y2, ref_type, rotate_token, tail = match.groups()
170223
bbox = _convert_bbox((x1, y1, x2, y2))
171224
if bbox is None:
172-
print(f"Warning: invalid bbox in line: {match.group(0)}")
225+
logger.warning("Invalid bbox in layout output line: {}", match.group(0))
173226
continue # Skip invalid bbox
174227
ref_type = ref_type.lower()
228+
if ref_type == "inline_formula":
229+
if self.debug:
230+
logger.debug("Skipping inline formula block in layout output: {}", match.group(0))
231+
continue
175232
if ref_type not in BLOCK_TYPES:
176-
print(f"Warning: unknown block type in line: {match.group(0)}")
233+
logger.warning("Unknown block type in layout output line: {}", match.group(0))
177234
continue # Skip unknown block types
178235
angle = _parse_angle(rotate_token) if rotate_token else None
179236
if angle is None:
180-
print(f"Warning: no angle found in line: {match.group(0)}")
181-
merge_type = _parse_merge_type(tail)
182-
blocks.append(ContentBlock(ref_type, bbox, angle=angle, merge_type=merge_type))
237+
logger.warning("No angle found in layout output line: {}", match.group(0))
238+
if ref_type == "text":
239+
merge_prev = _parse_merge_prev(tail)
240+
blocks.append(ContentBlock(ref_type, bbox, angle=angle, merge_prev=merge_prev))
241+
else:
242+
blocks.append(ContentBlock(ref_type, bbox, angle=angle))
183243
if not matched and output.strip():
184-
print(f"Warning: output does not match layout format: {output}")
244+
logger.warning("Layout output does not match expected format: {}", output)
185245
return blocks
186246

187247
def prepare_for_extract(
@@ -190,13 +250,27 @@ def prepare_for_extract(
190250
blocks: list[ContentBlock],
191251
not_extract_list: list[str] | None = None,
192252
) -> tuple[list[Image.Image | bytes], list[str], list[SamplingParams | None], list[int]]:
253+
internal_caption_indices = self._find_covered_block_indices(
254+
blocks,
255+
candidate_types={"image_caption"},
256+
container_types=IMAGE_CAPTION_CONTAINER_TYPES,
257+
)
258+
if internal_caption_indices:
259+
blocks[:] = [block for idx, block in enumerate(blocks) if idx not in internal_caption_indices]
260+
261+
non_standalone_visual_indices = self._find_covered_block_indices(
262+
blocks,
263+
candidate_types=IMAGE_ANALYSIS_TYPES,
264+
container_types={"image_block"},
265+
)
266+
193267
image = get_rgb_image(image)
194268
width, height = image.size
195269
block_images: list[Image.Image | bytes] = []
196270
prompts: list[str] = []
197271
sampling_params: list[SamplingParams | None] = []
198272
indices: list[int] = []
199-
skip_list = {"list", "equation_block"}
273+
skip_list = {"list", "equation_block", "image_block"}
200274
if not self.image_analysis:
201275
skip_list.update({"image", "chart"})
202276
if not_extract_list:
@@ -214,12 +288,17 @@ def prepare_for_extract(
214288
continue # Skip blocks that should not be extracted.
215289
if block.type == "image" and is_absorbed_table_image(block):
216290
continue
291+
if block.type in IMAGE_ANALYSIS_TYPES:
292+
if idx in non_standalone_visual_indices:
293+
continue
294+
if not self._is_eligible_for_image_analysis(block):
295+
continue
217296
table_image_prepared = False
218297
x1, y1, x2, y2 = block.bbox
219298
scaled_bbox = (x1 * width, y1 * height, x2 * width, y2 * height)
220299
block_image = image.crop(scaled_bbox)
221300
if block_image.width < 1 or block_image.height < 1:
222-
print(f"Warning: cropped block image has invalid size {block_image.size}")
301+
logger.warning("Cropped block image has invalid size {}", block_image.size)
223302
continue
224303
if block.type == "table":
225304
image_indices = table_to_images.get(idx, [])
@@ -253,7 +332,7 @@ def post_process(self, blocks: list[ContentBlock]) -> list[ContentBlock]:
253332
debug=self.debug,
254333
)
255334
except Exception as e:
256-
print(f"Warning: post-processing failed with error: {e}")
335+
logger.warning("Post-processing failed with error: {}", e)
257336
clean_blocks = [block for block in blocks if not (block.type == "image" and is_absorbed_table_image(block))]
258337
return cleanup_table_image_metadata(clean_blocks)
259338

@@ -383,7 +462,7 @@ def __init__(
383462
elif env_debug_value.lower() in ["false", "0", "no"]:
384463
debug = False
385464
else:
386-
logger.warning(f"unknown MINERU_VL_DEBUG_ENABLE config: {env_debug_value}, pass")
465+
logger.warning("unknown MINERU_VL_DEBUG_ENABLE config: {}, pass", env_debug_value)
387466

388467
if backend == "transformers":
389468
if model is None or processor is None:

mineru_vl_utils/post_process/__init__.py

100755100644
Lines changed: 41 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from loguru import logger
2+
13
from ..structs import ContentBlock
24
from .equation_big import try_fix_equation_big
35
from .equation_block import do_handle_equation_block
@@ -10,7 +12,9 @@
1012
from .text_inline_spacing import try_fix_macro_spacing_in_markdown
1113
from .text_display2inline import try_convert_display_to_inline
1214
from .text_move_underscores_outside import try_move_underscores_outside
15+
from .image_analysis_postprocess import convert_markdown_table_to_html, process_image_or_chart
1316
from .otsl2html import convert_otsl_to_html
17+
from .json2markdown import json2md
1418
from .table_image_processor import (
1519
cleanup_table_image_metadata,
1620
is_absorbed_table_image,
@@ -59,10 +63,43 @@ def simple_process(
5963
try:
6064
content = convert_otsl_to_html(content)
6165
except Exception as e:
62-
print("Warning: Failed to convert OTSL to HTML: ", e)
63-
print("Content: ", block.content)
66+
logger.warning("Failed to convert OTSL to HTML: {}; content: {}", e, block.content)
6467
content = replace_table_image_tokens(content, block.get(TABLE_IMAGE_TOKEN_MAP_KEY))
6568
block.content = replace_table_formula_delimiters(content, enabled=enable_table_formula_eq_wrap)
69+
if block.type in {"image", "chart"} and block.content:
70+
try:
71+
block_image_analysis_result = process_image_or_chart(block.content)
72+
class_name = block_image_analysis_result["class"]
73+
content = block_image_analysis_result["content"]
74+
if class_name == "pure_table":
75+
block.type = "table"
76+
table_html = convert_markdown_table_to_html(content)
77+
if table_html is None:
78+
logger.warning("Failed to convert markdown table to HTML: {}", content)
79+
block.content = content
80+
else:
81+
block.content = replace_table_formula_delimiters(
82+
table_html,
83+
enabled=enable_table_formula_eq_wrap,
84+
)
85+
elif class_name == "pure_formula":
86+
block.type = "equation"
87+
block.content = content
88+
elif class_name == "chart":
89+
block.type = "chart"
90+
block["sub_type"] = block_image_analysis_result["sub_class"]
91+
block.content = content
92+
else:
93+
block.type = "image"
94+
block["sub_type"] = class_name
95+
if class_name == "natural_image" or not content:
96+
block.content = block_image_analysis_result["caption"]
97+
else:
98+
block.content = content
99+
100+
except Exception as e:
101+
logger.warning("Failed to process image/chart: {}; content: {}", e, block.content)
102+
block.content = None # or keep original content, depending on your preference
66103
return blocks
67104

68105

@@ -90,17 +127,15 @@ def post_process(
90127
try:
91128
block.content = _process_equation(block.content, debug=debug)
92129
except Exception as e:
93-
print("Warning: Failed to process equation: ", e)
94-
print("Content: ", block.content)
130+
logger.warning("Failed to process equation: {}; content: {}", e, block.content)
95131

96132
elif block.type == "text" and block.content:
97133
try:
98134
block.content = try_convert_display_to_inline(block.content, debug=debug)
99135
block.content = try_fix_macro_spacing_in_markdown(block.content, debug=debug)
100136
block.content = try_move_underscores_outside(block.content, debug=debug)
101137
except Exception as e:
102-
print("Warning: Failed to process text: ", e)
103-
print("Content: ", block.content)
138+
logger.warning("Failed to process text: {}; content: {}", e, block.content)
104139

105140
if handle_equation_block:
106141
blocks = do_handle_equation_block(blocks, debug=debug)

mineru_vl_utils/post_process/equation_big.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import re
22

3+
from loguru import logger
4+
35

46
def try_fix_equation_big(latex: str, debug: bool = False) -> str:
57

@@ -480,6 +482,6 @@ def try_fix_equation_big(latex: str, debug: bool = False) -> str:
480482
latex = re.sub(r"\\bigtimes", r"\\times", latex)
481483

482484
if debug and original_latex != latex:
483-
print(f"Fixed equation big from: {original_latex} to: {latex}")
485+
logger.debug("Fixed equation big from: {} to: {}", original_latex, latex)
484486

485487
return latex

mineru_vl_utils/post_process/equation_block.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import re
22

3+
from loguru import logger
4+
35
from ..structs import ContentBlock
46

57

@@ -66,7 +68,7 @@ def do_handle_equation_block(
6668

6769
if debug:
6870
for idx, span_indices in sem_equation_spans.items():
69-
print(f"Combined equation_block at idx {idx} with spans at {span_indices}")
71+
logger.debug("Combined equation_block at idx {} with spans at {}", idx, span_indices)
7072

7173
out_blocks: list[ContentBlock] = []
7274
for idx in range(len(blocks)):

mineru_vl_utils/post_process/equation_delimeters.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
from loguru import logger
2+
3+
14
def try_fix_equation_delimeters(latex: str, debug: bool = False) -> str:
25

36
new_latex = latex.strip()
@@ -8,7 +11,7 @@ def try_fix_equation_delimeters(latex: str, debug: bool = False) -> str:
811
new_latex = new_latex.strip()
912

1013
if debug and new_latex != latex:
11-
print(f"Fixed equation delimeters from: {latex} to: {new_latex}")
14+
logger.debug("Fixed equation delimiters from: {} to: {}", latex, new_latex)
1215
return new_latex
1316

1417

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import re
22

3+
from loguru import logger
4+
35

46
def try_fix_equation_double_subscript(latex: str, debug: bool = False) -> str:
57
pattern = r"_\s*\{([^{}]|\{[^{}]*\})*\}\s*_\s*\{([^{}]|\{[^{}]*\})*\}"
68
if not re.search(pattern, latex):
79
return latex
810
new_latex = re.sub(pattern, "", latex)
911
if debug:
10-
print(f"Fixed equation double-subscript from: {latex} to: {new_latex}")
12+
logger.debug("Fixed equation double-subscript from: {} to: {}", latex, new_latex)
1113
return new_latex

mineru_vl_utils/post_process/equation_fix_eqqcolon.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import re
22

3+
from loguru import logger
4+
35

46
def try_fix_equation_eqqcolon(latex: str, debug: bool = False) -> str:
57
new_latex = re.sub(r"\\eqqcolon", "=:", latex)
68
new_latex = re.sub(r"\\coloneqq", ":=", new_latex)
79
if debug and new_latex != latex:
8-
print(f"Fixed equation eq-colon from: {latex} to: {new_latex}")
10+
logger.debug("Fixed equation eq-colon from: {} to: {}", latex, new_latex)
911
return new_latex
1012

1113

mineru_vl_utils/post_process/equation_left_right.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import re
22

3+
from loguru import logger
4+
35
VALID_LEFT_TOKEN_LIST = [
46
"\\left\\lbrace",
57
"\\left\\lVert",
@@ -365,7 +367,7 @@ def try_match_equation_left_right(latex: str, debug: bool = False) -> str:
365367
fixed_latex = fix_left_right_mismatch(latex)
366368

367369
if debug:
368-
print(f"Trying to fix left-right mismatch in equation: {latex}")
369-
print(f"Fixed equation: {fixed_latex}")
370+
logger.debug("Trying to fix left-right mismatch in equation: {}", latex)
371+
logger.debug("Fixed equation: {}", fixed_latex)
370372

371373
return fixed_latex

0 commit comments

Comments
 (0)