Skip to content

Commit f40e9d3

Browse files
authored
Merge pull request #41 from opendatalab/dev
Dev
2 parents f3da2e7 + 30f0cd4 commit f40e9d3

File tree

5 files changed: 139 additions (+139) and 113 deletions (-113).

5 files changed: 139 additions (+139) and 113 deletions (-113).

mineru_vl_utils/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,6 @@
33

44
from mineru_vl_utils.version import __version__
55

6-
76
__lazy_attrs__ = {
87
"MinerUClient": (".mineru_client", "MinerUClient"),
98
"MinerUSamplingParams": (".mineru_client", "MinerUSamplingParams"),

mineru_vl_utils/mineru_client.py

Lines changed: 59 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -154,16 +154,22 @@ def prepare_for_extract(
154154
self,
155155
image: Image.Image,
156156
blocks: list[ContentBlock],
157+
not_extract_list: list[str] | None = None,
157158
) -> tuple[list[Image.Image | bytes], list[str], list[SamplingParams | None], list[int]]:
158159
image = get_rgb_image(image)
159160
width, height = image.size
160161
block_images: list[Image.Image | bytes] = []
161162
prompts: list[str] = []
162163
sampling_params: list[SamplingParams | None] = []
163164
indices: list[int] = []
165+
skip_list = {"image", "list", "equation_block"}
166+
if not_extract_list:
167+
for not_extract_type in not_extract_list:
168+
if not_extract_type in BLOCK_TYPES:
169+
skip_list.add(not_extract_type)
164170
for idx, block in enumerate(blocks):
165-
if block.type in ("image", "list", "equation_block"):
166-
continue # Skip image blocks.
171+
if block.type in skip_list:
172+
continue # Skip blocks that should not be extracted.
167173
x1, y1, x2, y2 = block.bbox
168174
scaled_bbox = (x1 * width, y1 * height, x2 * width, y2 * height)
169175
block_image = image.crop(scaled_bbox)
@@ -181,13 +187,17 @@ def prepare_for_extract(
181187
return block_images, prompts, sampling_params, indices
182188

183189
def post_process(self, blocks: list[ContentBlock]) -> list[ContentBlock]:
184-
return post_process(
185-
blocks,
186-
handle_equation_block=self.handle_equation_block,
187-
abandon_list=self.abandon_list,
188-
abandon_paratext=self.abandon_paratext,
189-
debug=self.debug,
190-
)
190+
try:
191+
return post_process(
192+
blocks,
193+
handle_equation_block=self.handle_equation_block,
194+
abandon_list=self.abandon_list,
195+
abandon_paratext=self.abandon_paratext,
196+
debug=self.debug,
197+
)
198+
except Exception as e:
199+
print(f"Warning: post-processing failed with error: {e}")
200+
return blocks
191201

192202
def batch_prepare_for_layout(
193203
self,
@@ -212,10 +222,11 @@ def batch_prepare_for_extract(
212222
executor: Executor | None,
213223
images: list[Image.Image],
214224
blocks_list: list[list[ContentBlock]],
225+
not_extract_list: list[str] | None = None,
215226
) -> list[tuple[list[Image.Image | bytes], list[str], list[SamplingParams | None], list[int]]]:
216227
if executor is None:
217-
return [self.prepare_for_extract(im, bls) for im, bls in zip(images, blocks_list)]
218-
return list(executor.map(self.prepare_for_extract, images, blocks_list))
228+
return [self.prepare_for_extract(im, bls, not_extract_list) for im, bls in zip(images, blocks_list)]
229+
return list(executor.map(self.prepare_for_extract, images, blocks_list, [not_extract_list] * len(images)))
219230

220231
def batch_post_process(
221232
self,
@@ -247,9 +258,10 @@ async def aio_prepare_for_extract(
247258
executor: Executor | None,
248259
image: Image.Image,
249260
blocks: list[ContentBlock],
261+
not_extract_list: list[str] | None = None,
250262
) -> tuple[list[Image.Image | bytes], list[str], list[SamplingParams | None], list[int]]:
251263
loop = asyncio.get_running_loop()
252-
return await loop.run_in_executor(executor, self.prepare_for_extract, image, blocks)
264+
return await loop.run_in_executor(executor, self.prepare_for_extract, image, blocks, not_extract_list)
253265

254266
async def aio_post_process(
255267
self,
@@ -607,9 +619,10 @@ def two_step_extract(
607619
self,
608620
image: Image.Image,
609621
priority: int | None = None,
622+
not_extract_list: list[str] | None = None,
610623
) -> list[ContentBlock]:
611624
blocks = self.layout_detect(image, priority)
612-
block_images, prompts, params, indices = self.helper.prepare_for_extract(image, blocks)
625+
block_images, prompts, params, indices = self.helper.prepare_for_extract(image, blocks, not_extract_list)
613626
outputs = self.client.batch_predict(block_images, prompts, params, priority)
614627
for idx, output in zip(indices, outputs):
615628
blocks[idx].content = output
@@ -620,10 +633,16 @@ async def aio_two_step_extract(
620633
image: Image.Image,
621634
priority: int | None = None,
622635
semaphore: asyncio.Semaphore | None = None,
636+
not_extract_list: list[str] | None = None,
623637
) -> list[ContentBlock]:
624638
semaphore = semaphore or asyncio.Semaphore(self.max_concurrency)
625639
blocks = await self.aio_layout_detect(image, priority, semaphore)
626-
block_images, prompts, params, indices = await self.helper.aio_prepare_for_extract(self.executor, image, blocks)
640+
block_images, prompts, params, indices = await self.helper.aio_prepare_for_extract(
641+
self.executor,
642+
image,
643+
blocks,
644+
not_extract_list,
645+
)
627646
outputs = await self.client.aio_batch_predict(block_images, prompts, params, priority, semaphore=semaphore)
628647
for idx, output in zip(indices, outputs):
629648
blocks[idx].content = output
@@ -633,13 +652,14 @@ def concurrent_two_step_extract(
633652
self,
634653
images: list[Image.Image],
635654
priority: Sequence[int | None] | int | None = None,
655+
not_extract_list: list[str] | None = None,
636656
) -> list[list[ContentBlock]]:
637657
try:
638658
loop = asyncio.get_running_loop()
639659
except RuntimeError:
640660
loop = None
641661

642-
task = self.aio_concurrent_two_step_extract(images, priority)
662+
task = self.aio_concurrent_two_step_extract(images, priority, not_extract_list)
643663

644664
if loop is not None:
645665
return loop.run_until_complete(task)
@@ -650,6 +670,7 @@ async def aio_concurrent_two_step_extract(
650670
self,
651671
images: list[Image.Image],
652672
priority: Sequence[int | None] | int | None = None,
673+
not_extract_list: list[str] | None = None,
653674
semaphore: asyncio.Semaphore | None = None,
654675
) -> list[list[ContentBlock]]:
655676
if priority is None and self.incremental_priority:
@@ -658,7 +679,7 @@ async def aio_concurrent_two_step_extract(
658679
priority = [priority] * len(images)
659680
semaphore = semaphore or asyncio.Semaphore(self.max_concurrency)
660681
return await gather_tasks(
661-
tasks=[self.aio_two_step_extract(*args, semaphore) for args in zip(images, priority)],
682+
tasks=[self.aio_two_step_extract(*args, semaphore, not_extract_list) for args in zip(images, priority)],
662683
use_tqdm=self.use_tqdm,
663684
tqdm_desc="Two Step Extraction",
664685
)
@@ -667,6 +688,7 @@ def stepping_two_step_extract(
667688
self,
668689
images: list[Image.Image],
669690
priority: Sequence[int | None] | int | None = None,
691+
not_extract_list: list[str] | None = None,
670692
) -> list[list[ContentBlock]]:
671693
if priority is None and self.incremental_priority:
672694
priority = list(range(len(images)))
@@ -675,7 +697,12 @@ def stepping_two_step_extract(
675697
all_prompts: list[str] = []
676698
all_params: list[SamplingParams | None] = []
677699
all_indices: list[tuple[int, int]] = []
678-
prepared_inputs = self.helper.batch_prepare_for_extract(self.executor, images, blocks_list)
700+
prepared_inputs = self.helper.batch_prepare_for_extract(
701+
self.executor,
702+
images,
703+
blocks_list,
704+
not_extract_list,
705+
)
679706
for img_idx, (block_images, prompts, params, indices) in enumerate(prepared_inputs):
680707
all_images.extend(block_images)
681708
all_prompts.extend(prompts)
@@ -690,6 +717,7 @@ async def aio_stepping_two_step_extract(
690717
self,
691718
images: list[Image.Image],
692719
priority: Sequence[int | None] | int | None = None,
720+
not_extract_list: list[str] | None = None,
693721
semaphore: asyncio.Semaphore | None = None,
694722
) -> list[list[ContentBlock]]:
695723
if priority is None and self.incremental_priority:
@@ -701,7 +729,14 @@ async def aio_stepping_two_step_extract(
701729
all_params: list[SamplingParams | None] = []
702730
all_indices: list[tuple[int, int]] = []
703731
prepared_inputs = await gather_tasks(
704-
tasks=[self.helper.aio_prepare_for_extract(self.executor, *args) for args in zip(images, blocks_list)],
732+
tasks=[
733+
self.helper.aio_prepare_for_extract(
734+
self.executor,
735+
*args,
736+
not_extract_list,
737+
)
738+
for args in zip(images, blocks_list)
739+
],
705740
use_tqdm=self.use_tqdm,
706741
tqdm_desc="Extract Preparation",
707742
)
@@ -731,20 +766,22 @@ def batch_two_step_extract(
731766
self,
732767
images: list[Image.Image],
733768
priority: Sequence[int | None] | int | None = None,
769+
not_extract_list: list[str] | None = None,
734770
) -> list[list[ContentBlock]]:
735771
if self.batching_mode == "concurrent":
736-
return self.concurrent_two_step_extract(images, priority)
772+
return self.concurrent_two_step_extract(images, priority, not_extract_list)
737773
else: # self.batching_mode == "stepping"
738-
return self.stepping_two_step_extract(images, priority)
774+
return self.stepping_two_step_extract(images, priority, not_extract_list)
739775

740776
async def aio_batch_two_step_extract(
741777
self,
742778
images: list[Image.Image],
743779
priority: Sequence[int | None] | int | None = None,
780+
not_extract_list: list[str] | None = None,
744781
semaphore: asyncio.Semaphore | None = None,
745782
) -> list[list[ContentBlock]]:
746783
semaphore = semaphore or asyncio.Semaphore(self.max_concurrency)
747784
if self.batching_mode == "concurrent":
748-
return await self.aio_concurrent_two_step_extract(images, priority, semaphore)
785+
return await self.aio_concurrent_two_step_extract(images, priority, not_extract_list, semaphore)
749786
else: # self.batching_mode == "stepping"
750-
return await self.aio_stepping_two_step_extract(images, priority, semaphore)
787+
return await self.aio_stepping_two_step_extract(images, priority, not_extract_list, semaphore)

mineru_vl_utils/post_process/__init__.py

Lines changed: 13 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,11 @@
1-
from .equation_unbalanced_braces import try_fix_unbalanced_braces
21
from ..structs import ContentBlock
2+
from .equation_big import try_fix_equation_big
33
from .equation_block import do_handle_equation_block
44
from .equation_double_subscript import try_fix_equation_double_subscript
55
from .equation_fix_eqqcolon import try_fix_equation_eqqcolon
6-
from .equation_big import try_fix_equation_big
7-
from .equation_leq import try_fix_equation_leq
86
from .equation_left_right import try_match_equation_left_right
7+
from .equation_leq import try_fix_equation_leq
8+
from .equation_unbalanced_braces import try_fix_unbalanced_braces
99
from .otsl2html import convert_otsl_to_html
1010

1111
PARATEXT_TYPES = {
@@ -46,9 +46,17 @@ def post_process(
4646
) -> list[ContentBlock]:
4747
for block in blocks:
4848
if block.type == "table" and block.content:
49-
block.content = convert_otsl_to_html(block.content)
49+
try:
50+
block.content = convert_otsl_to_html(block.content)
51+
except Exception as e:
52+
print("Warning: Failed to convert OTSL to HTML: ", e)
53+
print("Content: ", block.content)
5054
if block.type == "equation" and block.content:
51-
block.content = _process_equation(block.content, debug=debug)
55+
try:
56+
block.content = _process_equation(block.content, debug=debug)
57+
except Exception as e:
58+
print("Warning: Failed to process equation: ", e)
59+
print("Content: ", block.content)
5260

5361
if handle_equation_block:
5462
blocks = do_handle_equation_block(blocks, debug=debug)

mineru_vl_utils/version.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
__version__ = "0.1.17"
1+
__version__ = "0.1.17.1"

0 commit comments

Comments
 (0)