opendatalab
diff --git a/‎mineru_vl_utils/logits_processor/vllm_v1_no_repeat_ngram.py‎
Lines changed: 2 additions & 1 deletion b/‎mineru_vl_utils/logits_processor/vllm_v1_no_repeat_ngram.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎mineru_vl_utils/mineru_client.py‎
Lines changed: 95 additions & 16 deletions b/‎mineru_vl_utils/mineru_client.py‎
Lines changed: 95 additions & 16 deletions
diff --git a/‎mineru_vl_utils/post_process/__init__.py‎
100755100644
Lines changed: 41 additions & 6 deletions b/‎mineru_vl_utils/post_process/__init__.py‎
100755100644
Lines changed: 41 additions & 6 deletions
diff --git a/‎mineru_vl_utils/post_process/equation_big.py‎
Lines changed: 3 additions & 1 deletion b/‎mineru_vl_utils/post_process/equation_big.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎mineru_vl_utils/post_process/equation_block.py‎
Lines changed: 3 additions & 1 deletion b/‎mineru_vl_utils/post_process/equation_block.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎mineru_vl_utils/post_process/equation_delimeters.py‎
Lines changed: 4 additions & 1 deletion b/‎mineru_vl_utils/post_process/equation_delimeters.py‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎mineru_vl_utils/post_process/equation_double_subscript.py‎
Lines changed: 3 additions & 1 deletion b/‎mineru_vl_utils/post_process/equation_double_subscript.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎mineru_vl_utils/post_process/equation_fix_eqqcolon.py‎
Lines changed: 3 additions & 1 deletion b/‎mineru_vl_utils/post_process/equation_fix_eqqcolon.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎mineru_vl_utils/post_process/equation_left_right.py‎
Lines changed: 4 additions & 2 deletions b/‎mineru_vl_utils/post_process/equation_left_right.py‎
Lines changed: 4 additions & 2 deletions
@@ -2,6 +2,7 @@
 
 from typing import Any
 
+from loguru import logger
 import torch
 from vllm.config import VllmConfig
 
@@ -52,7 +53,7 @@ def update_state(self, batch_update: BatchUpdate | None) -> None:
             val = _get_int_value(params.extra_args, "no_repeat_ngram_size")
             no_repeat_ngram_size = 0 if (val is None or val < 0) else val
             if isinstance(params.extra_args, dict) and params.extra_args.get("debug"):
-                print(f"Request {index}: no_repeat_ngram_size = {no_repeat_ngram_size}")
+                logger.debug("Request {}: no_repeat_ngram_size = {}", index, no_repeat_ngram_size)
             self.req_info[index] = (no_repeat_ngram_size, output_tok_ids, {})
 
         for a_index, b_index, direct in batch_update.moved:
 
@@ -78,6 +78,12 @@ def __init__(
     "<|rotate_left|>": 270,
 }
 
+# Block types that prepare_for_extract runs through the image-analysis checks
+# (skipped when nested in an image_block or when too small).
+IMAGE_ANALYSIS_TYPES = {"image", "chart"}
+# Block types treated as containers when looking for image_caption blocks that
+# lie inside them; such captions are removed before extraction.
+IMAGE_CAPTION_CONTAINER_TYPES = {"image", "chart", "image_block"}
+# Default minimum cover ratio (intersection area / candidate's own area) for a
+# candidate block to count as covered by a container block.
+INTERNAL_BLOCK_THRESHOLD = 0.9
+# Lower bounds compared against a block's bbox width/height and bbox area.
+# NOTE(review): bbox values are multiplied by the image width/height before
+# cropping, so these look like fractions of the page image - confirm.
+IMAGE_ANALYSIS_MIN_BLOCK_SIZE = 0.1
+IMAGE_ANALYSIS_MIN_BLOCK_AREA = 0.01
+
 
 def _convert_bbox(bbox: Sequence[int] | Sequence[str]) -> list[float] | None:
     bbox = tuple(map(int, bbox))
@@ -98,12 +104,8 @@ def _parse_angle(tail: str) -> Literal[None, 0, 90, 180, 270]:
     return None
 
 
-def _parse_merge_type(tail: str) -> Literal[None, 'src', 'tgt']:
-    if "txt_contd_src" in tail:
-        return "src"
-    elif "txt_contd_tgt" in tail:
-        return "tgt"
-    return None
+def _parse_merge_prev(tail: str) -> bool:
+    """Return True when the layout line's tail contains the ``txt_contd_tgt`` marker."""
+    return "txt_contd_tgt" in tail
 
 
 class MinerUClientHelper:
@@ -137,6 +139,57 @@ def __init__(
         self.enable_table_formula_eq_wrap = enable_table_formula_eq_wrap
         self.debug = debug
 
+    @staticmethod
+    def _bbox_intersection_area(a: Sequence[float], b: Sequence[float]) -> float:
+        """Return the overlap area of two boxes indexed as ``(x1, y1, x2, y2)``.
+
+        Returns ``0.0`` when the boxes do not overlap or only touch along an
+        edge or corner.
+        """
+        # Overlap rectangle: the larger of the two minima and the smaller of the two maxima.
+        x1 = max(a[0], b[0])
+        y1 = max(a[1], b[1])
+        x2 = min(a[2], b[2])
+        y2 = min(a[3], b[3])
+        # An empty or inverted overlap rectangle means there is no shared area.
+        if x2 <= x1 or y2 <= y1:
+            return 0.0
+        return (x2 - x1) * (y2 - y1)
+
+    @classmethod
+    def _bbox_cover_ratio(cls, inner: Sequence[float], outer: Sequence[float]) -> float:
+        """Return the fraction of ``inner``'s area that lies inside ``outer``.
+
+        The ratio is relative to ``inner`` only, so it is not symmetric.
+        Returns ``0.0`` when ``inner`` has no area (zero or negative extent).
+        """
+        # Negative extents are clamped to zero so a malformed box yields area 0.
+        inner_area = max(0.0, inner[2] - inner[0]) * max(0.0, inner[3] - inner[1])
+        if inner_area == 0:
+            return 0.0
+        return cls._bbox_intersection_area(inner, outer) / inner_area
+
+    @classmethod
+    def _find_covered_block_indices(
+        cls,
+        blocks: Sequence[ContentBlock],
+        candidate_types: set[str],
+        container_types: set[str],
+        threshold: float = INTERNAL_BLOCK_THRESHOLD,
+    ) -> set[int]:
+        """Return indices of candidate blocks that are covered by a container block.
+
+        Args:
+            blocks: Blocks to inspect; returned indices refer to positions in this sequence.
+            candidate_types: Block types that may be reported as covered.
+            container_types: Block types whose bbox may cover a candidate.
+            threshold: Minimum cover ratio (share of the candidate's own area
+                inside the container) for the candidate to count as covered.
+
+        Returns:
+            The set of indices of covered candidates; empty when ``blocks``
+            contains no container-type block.
+        """
+        container_indices = [idx for idx, block in enumerate(blocks) if block.type in container_types]
+        if not container_indices:
+            return set()
+
+        covered_indices: set[int] = set()
+        for idx, block in enumerate(blocks):
+            if block.type not in candidate_types:
+                continue
+            for container_idx in container_indices:
+                # A type can be both candidate and container; never test a block against itself.
+                if idx == container_idx:
+                    continue
+                if cls._bbox_cover_ratio(block.bbox, blocks[container_idx].bbox) >= threshold:
+                    covered_indices.add(idx)
+                    # One covering container is enough.
+                    break
+        return covered_indices
+
+    @staticmethod
+    def _is_eligible_for_image_analysis(block: ContentBlock) -> bool:
+        """Return True when the block's bbox is large enough for image analysis.
+
+        A block qualifies when both its width and height exceed
+        ``IMAGE_ANALYSIS_MIN_BLOCK_SIZE``, or when its area exceeds
+        ``IMAGE_ANALYSIS_MIN_BLOCK_AREA``.
+        """
+        x1, y1, x2, y2 = block.bbox
+        width = x2 - x1
+        height = y2 - y1
+        # NOTE(review): with the current constants (0.1 and 0.01) the size clause
+        # implies the area clause for positive extents, so the area test appears
+        # to decide the result on its own - confirm that `or` is intended.
+        return (
+            (width > IMAGE_ANALYSIS_MIN_BLOCK_SIZE and height > IMAGE_ANALYSIS_MIN_BLOCK_SIZE)
+            or width * height > IMAGE_ANALYSIS_MIN_BLOCK_AREA
+        )
+
     def resize_by_need(self, image: Image.Image) -> Image.Image:
         edge_ratio = max(image.size) / min(image.size)
         if edge_ratio > self.max_image_edge_ratio:
@@ -169,19 +222,26 @@ def parse_layout_output(self, output: str) -> list[ContentBlock]:
             x1, y1, x2, y2, ref_type, rotate_token, tail = match.groups()
             bbox = _convert_bbox((x1, y1, x2, y2))
             if bbox is None:
-                print(f"Warning: invalid bbox in line: {match.group(0)}")
+                logger.warning("Invalid bbox in layout output line: {}", match.group(0))
                 continue  # Skip invalid bbox
             ref_type = ref_type.lower()
+            if ref_type == "inline_formula":
+                if self.debug:
+                    logger.debug("Skipping inline formula block in layout output: {}", match.group(0))
+                continue
             if ref_type not in BLOCK_TYPES:
-                print(f"Warning: unknown block type in line: {match.group(0)}")
+                logger.warning("Unknown block type in layout output line: {}", match.group(0))
                 continue  # Skip unknown block types
             angle = _parse_angle(rotate_token) if rotate_token else None
             if angle is None:
-                print(f"Warning: no angle found in line: {match.group(0)}")
-            merge_type = _parse_merge_type(tail)
-            blocks.append(ContentBlock(ref_type, bbox, angle=angle, merge_type=merge_type))
+                logger.warning("No angle found in layout output line: {}", match.group(0))
+            if ref_type == "text":
+                merge_prev = _parse_merge_prev(tail)
+                blocks.append(ContentBlock(ref_type, bbox, angle=angle, merge_prev=merge_prev))
+            else:
+                blocks.append(ContentBlock(ref_type, bbox, angle=angle))
         if not matched and output.strip():
-            print(f"Warning: output does not match layout format: {output}")
+            logger.warning("Layout output does not match expected format: {}", output)
         return blocks
 
     def prepare_for_extract(
@@ -190,13 +250,27 @@ def prepare_for_extract(
         blocks: list[ContentBlock],
         not_extract_list: list[str] | None = None,
     ) -> tuple[list[Image.Image | bytes], list[str], list[SamplingParams | None], list[int]]:
+        internal_caption_indices = self._find_covered_block_indices(
+            blocks,
+            candidate_types={"image_caption"},
+            container_types=IMAGE_CAPTION_CONTAINER_TYPES,
+        )
+        if internal_caption_indices:
+            blocks[:] = [block for idx, block in enumerate(blocks) if idx not in internal_caption_indices]
+
+        non_standalone_visual_indices = self._find_covered_block_indices(
+            blocks,
+            candidate_types=IMAGE_ANALYSIS_TYPES,
+            container_types={"image_block"},
+        )
+
         image = get_rgb_image(image)
         width, height = image.size
         block_images: list[Image.Image | bytes] = []
         prompts: list[str] = []
         sampling_params: list[SamplingParams | None] = []
         indices: list[int] = []
-        skip_list = {"list", "equation_block"}
+        skip_list = {"list", "equation_block", "image_block"}
         if not self.image_analysis:
             skip_list.update({"image", "chart"})
         if not_extract_list:
@@ -214,12 +288,17 @@ def prepare_for_extract(
                 continue  # Skip blocks that should not be extracted.
             if block.type == "image" and is_absorbed_table_image(block):
                 continue
+            if block.type in IMAGE_ANALYSIS_TYPES:
+                if idx in non_standalone_visual_indices:
+                    continue
+                if not self._is_eligible_for_image_analysis(block):
+                    continue
             table_image_prepared = False
             x1, y1, x2, y2 = block.bbox
             scaled_bbox = (x1 * width, y1 * height, x2 * width, y2 * height)
             block_image = image.crop(scaled_bbox)
             if block_image.width < 1 or block_image.height < 1:
-                print(f"Warning: cropped block image has invalid size {block_image.size}")
+                logger.warning("Cropped block image has invalid size {}", block_image.size)
                 continue
             if block.type == "table":
                 image_indices = table_to_images.get(idx, [])
@@ -253,7 +332,7 @@ def post_process(self, blocks: list[ContentBlock]) -> list[ContentBlock]:
                 debug=self.debug,
             )
         except Exception as e:
-            print(f"Warning: post-processing failed with error: {e}")
+            logger.warning("Post-processing failed with error: {}", e)
             clean_blocks = [block for block in blocks if not (block.type == "image" and is_absorbed_table_image(block))]
             return cleanup_table_image_metadata(clean_blocks)
 
@@ -383,7 +462,7 @@ def __init__(
             elif env_debug_value.lower() in ["false", "0", "no"]:
                 debug = False
             else:
-                logger.warning(f"unknown MINERU_VL_DEBUG_ENABLE config: {env_debug_value}, pass")
+                logger.warning("unknown MINERU_VL_DEBUG_ENABLE config: {}, pass", env_debug_value)
 
         if backend == "transformers":
             if model is None or processor is None:
 
@@ -1,3 +1,5 @@
+from loguru import logger
+
 from ..structs import ContentBlock
 from .equation_big import try_fix_equation_big
 from .equation_block import do_handle_equation_block
@@ -10,7 +12,9 @@
 from .text_inline_spacing import try_fix_macro_spacing_in_markdown
 from .text_display2inline import try_convert_display_to_inline
 from .text_move_underscores_outside import try_move_underscores_outside
+from .image_analysis_postprocess import convert_markdown_table_to_html, process_image_or_chart
 from .otsl2html import convert_otsl_to_html
+from .json2markdown import json2md
 from .table_image_processor import (
     cleanup_table_image_metadata,
     is_absorbed_table_image,
@@ -59,10 +63,43 @@ def simple_process(
             try:
                 content = convert_otsl_to_html(content)
             except Exception as e:
-                print("Warning: Failed to convert OTSL to HTML: ", e)
-                print("Content: ", block.content)
+                logger.warning("Failed to convert OTSL to HTML: {}; content: {}", e, block.content)
             content = replace_table_image_tokens(content, block.get(TABLE_IMAGE_TOKEN_MAP_KEY))
             block.content = replace_table_formula_delimiters(content, enabled=enable_table_formula_eq_wrap)
+        if block.type in {"image", "chart"} and block.content:
+            try:
+                block_image_analysis_result = process_image_or_chart(block.content)
+                class_name = block_image_analysis_result["class"]
+                content = block_image_analysis_result["content"]
+                if class_name == "pure_table":
+                    block.type = "table"
+                    table_html = convert_markdown_table_to_html(content)
+                    if table_html is None:
+                        logger.warning("Failed to convert markdown table to HTML: {}", content)
+                        block.content = content
+                    else:
+                        block.content = replace_table_formula_delimiters(
+                            table_html,
+                            enabled=enable_table_formula_eq_wrap,
+                        )
+                elif class_name == "pure_formula":
+                    block.type = "equation"
+                    block.content = content
+                elif class_name == "chart":
+                    block.type = "chart"
+                    block["sub_type"] = block_image_analysis_result["sub_class"]
+                    block.content = content
+                else:
+                    block.type = "image"
+                    block["sub_type"] = class_name
+                    if class_name == "natural_image" or not content:
+                        block.content = block_image_analysis_result["caption"]
+                    else:
+                        block.content = content
+
+            except Exception as e:
+                logger.warning("Failed to process image/chart: {}; content: {}", e, block.content)
+                block.content = None  # Discard the content when image/chart post-processing fails.
     return blocks
 
 
@@ -90,17 +127,15 @@ def post_process(
             try:
                 block.content = _process_equation(block.content, debug=debug)
             except Exception as e:
-                print("Warning: Failed to process equation: ", e)
-                print("Content: ", block.content)
+                logger.warning("Failed to process equation: {}; content: {}", e, block.content)
 
         elif block.type == "text" and block.content:
             try:
                 block.content = try_convert_display_to_inline(block.content, debug=debug)
                 block.content = try_fix_macro_spacing_in_markdown(block.content, debug=debug)
                 block.content = try_move_underscores_outside(block.content, debug=debug)
             except Exception as e:
-                print("Warning: Failed to process text: ", e)
-                print("Content: ", block.content)
+                logger.warning("Failed to process text: {}; content: {}", e, block.content)
 
     if handle_equation_block:
         blocks = do_handle_equation_block(blocks, debug=debug)
 
@@ -1,5 +1,7 @@
 import re
 
+from loguru import logger
+
 
 def try_fix_equation_big(latex: str, debug: bool = False) -> str:
 
@@ -480,6 +482,6 @@ def try_fix_equation_big(latex: str, debug: bool = False) -> str:
     latex = re.sub(r"\\bigtimes", r"\\times", latex)
 
     if debug and original_latex != latex:
-        print(f"Fixed equation big from: {original_latex} to: {latex}")
+        logger.debug("Fixed equation big from: {} to: {}", original_latex, latex)
 
     return latex
@@ -1,5 +1,7 @@
 import re
 
+from loguru import logger
+
 from ..structs import ContentBlock
 
 
@@ -66,7 +68,7 @@ def do_handle_equation_block(
 
     if debug:
         for idx, span_indices in sem_equation_spans.items():
-            print(f"Combined equation_block at idx {idx} with spans at {span_indices}")
+            logger.debug("Combined equation_block at idx {} with spans at {}", idx, span_indices)
 
     out_blocks: list[ContentBlock] = []
     for idx in range(len(blocks)):
 
@@ -1,3 +1,6 @@
+from loguru import logger
+
+
 def try_fix_equation_delimeters(latex: str, debug: bool = False) -> str:
 
     new_latex = latex.strip()
@@ -8,7 +11,7 @@ def try_fix_equation_delimeters(latex: str, debug: bool = False) -> str:
     new_latex = new_latex.strip()
 
     if debug and new_latex != latex:
-        print(f"Fixed equation delimeters from: {latex} to: {new_latex}")
+        logger.debug("Fixed equation delimiters from: {} to: {}", latex, new_latex)
     return new_latex
 
 
 
@@ -1,11 +1,13 @@
 import re
 
+from loguru import logger
+
 
 def try_fix_equation_double_subscript(latex: str, debug: bool = False) -> str:
+    """Remove every run of two consecutive braced subscripts from ``latex``.
+
+    Returns ``latex`` unchanged when no such run is found. When ``debug`` is
+    true, the before/after strings are logged.
+    """
+    # Matches `_{...}` followed by another `_{...}` (optional whitespace around
+    # them); each braced group may contain one level of nested braces.
     pattern = r"_\s*\{([^{}]|\{[^{}]*\})*\}\s*_\s*\{([^{}]|\{[^{}]*\})*\}"
     if not re.search(pattern, latex):
         return latex
+    # NOTE(review): the whole match is replaced with "", so both subscripts are
+    # dropped rather than merged - presumably intentional; confirm.
     new_latex = re.sub(pattern, "", latex)
     if debug:
-        print(f"Fixed equation double-subscript from: {latex} to: {new_latex}")
+        logger.debug("Fixed equation double-subscript from: {} to: {}", latex, new_latex)
     return new_latex
@@ -1,11 +1,13 @@
 import re
 
+from loguru import logger
+
 
 def try_fix_equation_eqqcolon(latex: str, debug: bool = False) -> str:
+    """Replace the ``eqqcolon`` and ``coloneqq`` LaTeX macros with ``=:`` and ``:=``.
+
+    Returns the rewritten string (equal to ``latex`` when neither macro occurs).
+    When ``debug`` is true and something changed, the before/after strings are logged.
+    """
     new_latex = re.sub(r"\\eqqcolon", "=:", latex)
     new_latex = re.sub(r"\\coloneqq", ":=", new_latex)
     if debug and new_latex != latex:
-        print(f"Fixed equation eq-colon from: {latex} to: {new_latex}")
+        logger.debug("Fixed equation eq-colon from: {} to: {}", latex, new_latex)
     return new_latex
 
 
 
@@ -1,5 +1,7 @@
 import re
 
+from loguru import logger
+
 VALID_LEFT_TOKEN_LIST = [
     "\\left\\lbrace",
     "\\left\\lVert",
@@ -365,7 +367,7 @@ def try_match_equation_left_right(latex: str, debug: bool = False) -> str:
     fixed_latex = fix_left_right_mismatch(latex)
 
     if debug:
-        print(f"Trying to fix left-right mismatch in equation: {latex}")
-        print(f"Fixed equation: {fixed_latex}")
+        logger.debug("Trying to fix left-right mismatch in equation: {}", latex)
+        logger.debug("Fixed equation: {}", fixed_latex)
 
     return fixed_latex