Skip to content

Commit 560b7e4

Browse files
author
Jin Zhen Jiang
committed
feat: add simple_post_process mode
1 parent ceee5af commit 560b7e4

File tree

2 files changed

+23
-6
lines changed

2 files changed

+23
-6
lines changed

mineru_vl_utils/mineru_client.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ def __init__(
8888
layout_image_size: tuple[int, int],
8989
min_image_edge: int,
9090
max_image_edge_ratio: float,
91+
simple_post_process: bool,
9192
handle_equation_block: bool,
9293
abandon_list: bool,
9394
abandon_paratext: bool,
@@ -99,6 +100,7 @@ def __init__(
99100
self.layout_image_size = layout_image_size
100101
self.min_image_edge = min_image_edge
101102
self.max_image_edge_ratio = max_image_edge_ratio
103+
self.simple_post_process = simple_post_process
102104
self.handle_equation_block = handle_equation_block
103105
self.abandon_list = abandon_list
104106
self.abandon_paratext = abandon_paratext
@@ -190,6 +192,7 @@ def post_process(self, blocks: list[ContentBlock]) -> list[ContentBlock]:
190192
try:
191193
return post_process(
192194
blocks,
195+
simple_post_process=self.simple_post_process,
193196
handle_equation_block=self.handle_equation_block,
194197
abandon_list=self.abandon_list,
195198
abandon_paratext=self.abandon_paratext,
@@ -298,6 +301,7 @@ def __init__(
298301
layout_image_size: tuple[int, int] = (1036, 1036),
299302
min_image_edge: int = 28,
300303
max_image_edge_ratio: float = 50,
304+
simple_post_process: bool = False,
301305
handle_equation_block: bool = True,
302306
abandon_list: bool = False,
303307
abandon_paratext: bool = False,
@@ -337,6 +341,7 @@ def __init__(
337341
)
338342
if processor is None:
339343
processor = AutoProcessor.from_pretrained(model_path, use_fast=True)
344+
340345
elif backend == "mlx-engine":
341346
if model is None or processor is None:
342347
if not model_path:
@@ -414,6 +419,7 @@ def __init__(
414419
layout_image_size=layout_image_size,
415420
min_image_edge=min_image_edge,
416421
max_image_edge_ratio=max_image_edge_ratio,
422+
simple_post_process=simple_post_process,
417423
handle_equation_block=handle_equation_block,
418424
abandon_list=abandon_list,
419425
abandon_paratext=abandon_paratext,

mineru_vl_utils/post_process/__init__.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,20 +37,31 @@ def _add_equation_brackets(content: str) -> str:
3737
return content
3838

3939

40+
def simple_process(blocks: list[ContentBlock]) -> list[ContentBlock]:
    """Convert the content of every non-empty table block from OTSL to HTML.

    Blocks are modified in place. A block is converted only when its
    ``type`` is ``"table"`` and its ``content`` is truthy; all other blocks
    are left untouched.

    Conversion is best-effort: if ``convert_otsl_to_html`` raises, a warning
    and the offending content are printed, and the block keeps its original
    content.

    Args:
        blocks: The content blocks to process.

    Returns:
        The same list object that was passed in.
    """
    for item in blocks:
        # Guard clause: only non-empty table blocks need conversion.
        if item.type != "table" or not item.content:
            continue
        try:
            item.content = convert_otsl_to_html(item.content)
        except Exception as err:
            # Deliberately broad: one malformed table must not abort the
            # processing of the remaining blocks.
            print("Warning: Failed to convert OTSL to HTML: ", err)
            print("Content: ", item.content)
    return blocks
49+
50+
4051
def post_process(
4152
blocks: list[ContentBlock],
53+
simple_post_process: bool,
4254
handle_equation_block: bool,
4355
abandon_list: bool,
4456
abandon_paratext: bool,
4557
debug: bool = False,
4658
) -> list[ContentBlock]:
59+
blocks = simple_process(blocks)
60+
61+
if simple_post_process:
62+
return blocks
63+
4764
for block in blocks:
48-
if block.type == "table" and block.content:
49-
try:
50-
block.content = convert_otsl_to_html(block.content)
51-
except Exception as e:
52-
print("Warning: Failed to convert OTSL to HTML: ", e)
53-
print("Content: ", block.content)
5465
if block.type == "equation" and block.content:
5566
try:
5667
block.content = _process_equation(block.content, debug=debug)

0 commit comments

Comments
 (0)