@@ -78,6 +78,12 @@ def __init__(
7878 "<|rotate_left|>" : 270 ,
7979}
8080
# Block types treated as visual content: prepare_for_extract applies the
# "covered by an image_block" and minimum-size checks to blocks of these types.
81+ IMAGE_ANALYSIS_TYPES = {"image" , "chart" }
# Container types an "image_caption" block is tested against; captions found
# to be covered by one of these are removed in prepare_for_extract.
82+ IMAGE_CAPTION_CONTAINER_TYPES = {"image" , "chart" , "image_block" }
# Default threshold for _find_covered_block_indices: the fraction of a
# candidate's bbox area that must lie inside a container bbox.
83+ INTERNAL_BLOCK_THRESHOLD = 0.9
# Width AND height must both exceed this for a block to pass the edge test in
# _is_eligible_for_image_analysis.
# NOTE(review): bbox values look normalized to [0, 1] (they are multiplied by
# the image width/height before cropping) - confirm.
84+ IMAGE_ANALYSIS_MIN_BLOCK_SIZE = 0.1
# Alternative criterion: a block also qualifies when width * height exceeds
# this value, even if one edge is below the size limit.
85+ IMAGE_ANALYSIS_MIN_BLOCK_AREA = 0.01
86+
8187
8288def _convert_bbox (bbox : Sequence [int ] | Sequence [str ]) -> list [float ] | None :
8389 bbox = tuple (map (int , bbox ))
@@ -98,12 +104,8 @@ def _parse_angle(tail: str) -> Literal[None, 0, 90, 180, 270]:
98104 return None
99105
100106
101- def _parse_merge_type (tail : str ) -> Literal [None , 'src' , 'tgt' ]:
102- if "txt_contd_src" in tail :
103- return "src"
104- elif "txt_contd_tgt" in tail :
105- return "tgt"
106- return None
107+ def _parse_merge_prev (tail : str ) -> bool :
108+ return "txt_contd_tgt" in tail
107109
108110
109111class MinerUClientHelper :
@@ -137,6 +139,57 @@ def __init__(
137139 self .enable_table_formula_eq_wrap = enable_table_formula_eq_wrap
138140 self .debug = debug
139141
142+ @staticmethod
143+ def _bbox_intersection_area (a : Sequence [float ], b : Sequence [float ]) -> float :
144+ x1 = max (a [0 ], b [0 ])
145+ y1 = max (a [1 ], b [1 ])
146+ x2 = min (a [2 ], b [2 ])
147+ y2 = min (a [3 ], b [3 ])
148+ if x2 <= x1 or y2 <= y1 :
149+ return 0.0
150+ return (x2 - x1 ) * (y2 - y1 )
151+
152+ @classmethod
153+ def _bbox_cover_ratio (cls , inner : Sequence [float ], outer : Sequence [float ]) -> float :
154+ inner_area = max (0.0 , inner [2 ] - inner [0 ]) * max (0.0 , inner [3 ] - inner [1 ])
155+ if inner_area == 0 :
156+ return 0.0
157+ return cls ._bbox_intersection_area (inner , outer ) / inner_area
158+
@classmethod
def _find_covered_block_indices(
    cls,
    blocks: Sequence[ContentBlock],
    candidate_types: set[str],
    container_types: set[str],
    threshold: float = INTERNAL_BLOCK_THRESHOLD,
) -> set[int]:
    """Find candidate blocks that sit (almost) entirely inside a container.

    Args:
        blocks: Blocks to inspect; positions in this sequence are the
            indices that get returned.
        candidate_types: Block types that may be reported as covered.
        container_types: Block types that may act as the covering container.
        threshold: Minimum share of a candidate's bbox area that must fall
            inside a single container's bbox.

    Returns:
        Indices of candidate blocks covered by at least one *other* block
        of a container type; empty when there are no containers.
    """
    container_positions = [pos for pos, item in enumerate(blocks) if item.type in container_types]
    if not container_positions:
        return set()

    def is_covered(pos: int, candidate: ContentBlock) -> bool:
        # A block never counts as its own container; any() stops at the
        # first container that reaches the threshold.
        return any(
            other != pos and cls._bbox_cover_ratio(candidate.bbox, blocks[other].bbox) >= threshold
            for other in container_positions
        )

    return {
        pos
        for pos, item in enumerate(blocks)
        if item.type in candidate_types and is_covered(pos, item)
    }
182+
@staticmethod
def _is_eligible_for_image_analysis(block: ContentBlock) -> bool:
    """Decide whether *block* is large enough to be sent for image analysis.

    A block qualifies when both edges exceed
    ``IMAGE_ANALYSIS_MIN_BLOCK_SIZE``, or when its bbox area exceeds
    ``IMAGE_ANALYSIS_MIN_BLOCK_AREA``.

    Args:
        block: Block whose ``bbox`` is ``(x1, y1, x2, y2)``.

    Returns:
        ``True`` if either size criterion is met, else ``False``.
    """
    left, top, right, bottom = block.bbox
    box_width, box_height = right - left, bottom - top
    exceeds_both_edges = (
        box_width > IMAGE_ANALYSIS_MIN_BLOCK_SIZE and box_height > IMAGE_ANALYSIS_MIN_BLOCK_SIZE
    )
    # The area test lets long, thin blocks through even if one edge is short.
    return exceeds_both_edges or box_width * box_height > IMAGE_ANALYSIS_MIN_BLOCK_AREA
192+
140193 def resize_by_need (self , image : Image .Image ) -> Image .Image :
141194 edge_ratio = max (image .size ) / min (image .size )
142195 if edge_ratio > self .max_image_edge_ratio :
@@ -169,19 +222,26 @@ def parse_layout_output(self, output: str) -> list[ContentBlock]:
169222 x1 , y1 , x2 , y2 , ref_type , rotate_token , tail = match .groups ()
170223 bbox = _convert_bbox ((x1 , y1 , x2 , y2 ))
171224 if bbox is None :
172- print ( f"Warning: invalid bbox in line: { match .group (0 )} " )
225+ logger . warning ( "Invalid bbox in layout output line: {}" , match .group (0 ))
173226 continue # Skip invalid bbox
174227 ref_type = ref_type .lower ()
228+ if ref_type == "inline_formula" :
229+ if self .debug :
230+ logger .debug ("Skipping inline formula block in layout output: {}" , match .group (0 ))
231+ continue
175232 if ref_type not in BLOCK_TYPES :
176- print ( f"Warning: unknown block type in line: { match .group (0 )} " )
233+ logger . warning ( "Unknown block type in layout output line: {}" , match .group (0 ))
177234 continue # Skip unknown block types
178235 angle = _parse_angle (rotate_token ) if rotate_token else None
179236 if angle is None :
180- print (f"Warning: no angle found in line: { match .group (0 )} " )
181- merge_type = _parse_merge_type (tail )
182- blocks .append (ContentBlock (ref_type , bbox , angle = angle , merge_type = merge_type ))
237+ logger .warning ("No angle found in layout output line: {}" , match .group (0 ))
238+ if ref_type == "text" :
239+ merge_prev = _parse_merge_prev (tail )
240+ blocks .append (ContentBlock (ref_type , bbox , angle = angle , merge_prev = merge_prev ))
241+ else :
242+ blocks .append (ContentBlock (ref_type , bbox , angle = angle ))
183243 if not matched and output .strip ():
184- print ( f"Warning: output does not match layout format: { output } " )
244+ logger . warning ( "Layout output does not match expected format: {}" , output )
185245 return blocks
186246
187247 def prepare_for_extract (
@@ -190,13 +250,27 @@ def prepare_for_extract(
190250 blocks : list [ContentBlock ],
191251 not_extract_list : list [str ] | None = None ,
192252 ) -> tuple [list [Image .Image | bytes ], list [str ], list [SamplingParams | None ], list [int ]]:
253+ internal_caption_indices = self ._find_covered_block_indices (
254+ blocks ,
255+ candidate_types = {"image_caption" },
256+ container_types = IMAGE_CAPTION_CONTAINER_TYPES ,
257+ )
258+ if internal_caption_indices :
259+ blocks [:] = [block for idx , block in enumerate (blocks ) if idx not in internal_caption_indices ]
260+
261+ non_standalone_visual_indices = self ._find_covered_block_indices (
262+ blocks ,
263+ candidate_types = IMAGE_ANALYSIS_TYPES ,
264+ container_types = {"image_block" },
265+ )
266+
193267 image = get_rgb_image (image )
194268 width , height = image .size
195269 block_images : list [Image .Image | bytes ] = []
196270 prompts : list [str ] = []
197271 sampling_params : list [SamplingParams | None ] = []
198272 indices : list [int ] = []
199- skip_list = {"list" , "equation_block" }
273+ skip_list = {"list" , "equation_block" , "image_block" }
200274 if not self .image_analysis :
201275 skip_list .update ({"image" , "chart" })
202276 if not_extract_list :
@@ -214,12 +288,17 @@ def prepare_for_extract(
214288 continue # Skip blocks that should not be extracted.
215289 if block .type == "image" and is_absorbed_table_image (block ):
216290 continue
291+ if block .type in IMAGE_ANALYSIS_TYPES :
292+ if idx in non_standalone_visual_indices :
293+ continue
294+ if not self ._is_eligible_for_image_analysis (block ):
295+ continue
217296 table_image_prepared = False
218297 x1 , y1 , x2 , y2 = block .bbox
219298 scaled_bbox = (x1 * width , y1 * height , x2 * width , y2 * height )
220299 block_image = image .crop (scaled_bbox )
221300 if block_image .width < 1 or block_image .height < 1 :
222- print ( f"Warning: cropped block image has invalid size { block_image .size } " )
301+ logger . warning ( "Cropped block image has invalid size {}" , block_image .size )
223302 continue
224303 if block .type == "table" :
225304 image_indices = table_to_images .get (idx , [])
@@ -253,7 +332,7 @@ def post_process(self, blocks: list[ContentBlock]) -> list[ContentBlock]:
253332 debug = self .debug ,
254333 )
255334 except Exception as e :
256- print ( f"Warning: post -processing failed with error: { e } " )
335+ logger . warning ( "Post -processing failed with error: {}" , e )
257336 clean_blocks = [block for block in blocks if not (block .type == "image" and is_absorbed_table_image (block ))]
258337 return cleanup_table_image_metadata (clean_blocks )
259338
@@ -383,7 +462,7 @@ def __init__(
383462 elif env_debug_value .lower () in ["false" , "0" , "no" ]:
384463 debug = False
385464 else :
386- logger .warning (f "unknown MINERU_VL_DEBUG_ENABLE config: { env_debug_value } , pass" )
465+ logger .warning ("unknown MINERU_VL_DEBUG_ENABLE config: {}, pass" , env_debug_value )
387466
388467 if backend == "transformers" :
389468 if model is None or processor is None :
0 commit comments