@@ -154,16 +154,22 @@ def prepare_for_extract(
154154 self ,
155155 image : Image .Image ,
156156 blocks : list [ContentBlock ],
157+ not_extract_list : list [str ] | None = None ,
157158 ) -> tuple [list [Image .Image | bytes ], list [str ], list [SamplingParams | None ], list [int ]]:
158159 image = get_rgb_image (image )
159160 width , height = image .size
160161 block_images : list [Image .Image | bytes ] = []
161162 prompts : list [str ] = []
162163 sampling_params : list [SamplingParams | None ] = []
163164 indices : list [int ] = []
165+ skip_list = {"image" , "list" , "equation_block" }
166+ if not_extract_list :
167+ for not_extract_type in not_extract_list :
168+ if not_extract_type in BLOCK_TYPES :
169+ skip_list .add (not_extract_type )
164170 for idx , block in enumerate (blocks ):
165- if block .type in ( "image" , "list" , "equation_block" ) :
166- continue # Skip image blocks.
171+ if block .type in skip_list :
172+ continue # Skip blocks that should not be extracted .
167173 x1 , y1 , x2 , y2 = block .bbox
168174 scaled_bbox = (x1 * width , y1 * height , x2 * width , y2 * height )
169175 block_image = image .crop (scaled_bbox )
@@ -181,13 +187,17 @@ def prepare_for_extract(
181187 return block_images , prompts , sampling_params , indices
182188
def post_process(self, blocks: list[ContentBlock]) -> list[ContentBlock]:
    """Post-process ``blocks`` with this helper's configured options.

    Delegates to the module-level ``post_process`` function, passing
    ``handle_equation_block``, ``abandon_list``, ``abandon_paratext`` and
    ``debug`` from this instance.

    Post-processing is treated as best-effort: if the delegate raises, the
    failure is logged (with traceback) and the ``blocks`` argument is
    returned instead of propagating the error.

    Args:
        blocks: Content blocks to post-process.

    Returns:
        The delegate's result, or the ``blocks`` argument itself when the
        delegate raises.
    """
    # Local import: the file's import section is outside the visible region.
    import logging

    try:
        return post_process(
            blocks,
            handle_equation_block=self.handle_equation_block,
            abandon_list=self.abandon_list,
            abandon_paratext=self.abandon_paratext,
            debug=self.debug,
        )
    except Exception as exc:
        # Deliberate best-effort boundary. Log through `logging` rather than
        # `print` so the traceback is kept and callers can filter or route it.
        # NOTE(review): if the delegate mutates `blocks` in place before
        # failing, the returned list may be partially processed - confirm.
        logging.getLogger(__name__).warning(
            "post-processing failed with error: %s",
            exc,
            exc_info=True,
        )
        return blocks
191201
192202 def batch_prepare_for_layout (
193203 self ,
@@ -212,10 +222,11 @@ def batch_prepare_for_extract(
212222 executor : Executor | None ,
213223 images : list [Image .Image ],
214224 blocks_list : list [list [ContentBlock ]],
225+ not_extract_list : list [str ] | None = None ,
215226 ) -> list [tuple [list [Image .Image | bytes ], list [str ], list [SamplingParams | None ], list [int ]]]:
216227 if executor is None :
217- return [self .prepare_for_extract (im , bls ) for im , bls in zip (images , blocks_list )]
218- return list (executor .map (self .prepare_for_extract , images , blocks_list ))
228+ return [self .prepare_for_extract (im , bls , not_extract_list ) for im , bls in zip (images , blocks_list )]
229+ return list (executor .map (self .prepare_for_extract , images , blocks_list , [ not_extract_list ] * len ( images ) ))
219230
220231 def batch_post_process (
221232 self ,
@@ -247,9 +258,10 @@ async def aio_prepare_for_extract(
247258 executor : Executor | None ,
248259 image : Image .Image ,
249260 blocks : list [ContentBlock ],
261+ not_extract_list : list [str ] | None = None ,
250262 ) -> tuple [list [Image .Image | bytes ], list [str ], list [SamplingParams | None ], list [int ]]:
251263 loop = asyncio .get_running_loop ()
252- return await loop .run_in_executor (executor , self .prepare_for_extract , image , blocks )
264+ return await loop .run_in_executor (executor , self .prepare_for_extract , image , blocks , not_extract_list )
253265
254266 async def aio_post_process (
255267 self ,
@@ -607,9 +619,10 @@ def two_step_extract(
607619 self ,
608620 image : Image .Image ,
609621 priority : int | None = None ,
622+ not_extract_list : list [str ] | None = None ,
610623 ) -> list [ContentBlock ]:
611624 blocks = self .layout_detect (image , priority )
612- block_images , prompts , params , indices = self .helper .prepare_for_extract (image , blocks )
625+ block_images , prompts , params , indices = self .helper .prepare_for_extract (image , blocks , not_extract_list )
613626 outputs = self .client .batch_predict (block_images , prompts , params , priority )
614627 for idx , output in zip (indices , outputs ):
615628 blocks [idx ].content = output
@@ -620,10 +633,16 @@ async def aio_two_step_extract(
620633 image : Image .Image ,
621634 priority : int | None = None ,
622635 semaphore : asyncio .Semaphore | None = None ,
636+ not_extract_list : list [str ] | None = None ,
623637 ) -> list [ContentBlock ]:
624638 semaphore = semaphore or asyncio .Semaphore (self .max_concurrency )
625639 blocks = await self .aio_layout_detect (image , priority , semaphore )
626- block_images , prompts , params , indices = await self .helper .aio_prepare_for_extract (self .executor , image , blocks )
640+ block_images , prompts , params , indices = await self .helper .aio_prepare_for_extract (
641+ self .executor ,
642+ image ,
643+ blocks ,
644+ not_extract_list ,
645+ )
627646 outputs = await self .client .aio_batch_predict (block_images , prompts , params , priority , semaphore = semaphore )
628647 for idx , output in zip (indices , outputs ):
629648 blocks [idx ].content = output
@@ -633,13 +652,14 @@ def concurrent_two_step_extract(
633652 self ,
634653 images : list [Image .Image ],
635654 priority : Sequence [int | None ] | int | None = None ,
655+ not_extract_list : list [str ] | None = None ,
636656 ) -> list [list [ContentBlock ]]:
637657 try :
638658 loop = asyncio .get_running_loop ()
639659 except RuntimeError :
640660 loop = None
641661
642- task = self .aio_concurrent_two_step_extract (images , priority )
662+ task = self .aio_concurrent_two_step_extract (images , priority , not_extract_list )
643663
644664 if loop is not None :
645665 return loop .run_until_complete (task )
@@ -650,6 +670,7 @@ async def aio_concurrent_two_step_extract(
650670 self ,
651671 images : list [Image .Image ],
652672 priority : Sequence [int | None ] | int | None = None ,
673+ not_extract_list : list [str ] | None = None ,
653674 semaphore : asyncio .Semaphore | None = None ,
654675 ) -> list [list [ContentBlock ]]:
655676 if priority is None and self .incremental_priority :
@@ -658,7 +679,7 @@ async def aio_concurrent_two_step_extract(
658679 priority = [priority ] * len (images )
659680 semaphore = semaphore or asyncio .Semaphore (self .max_concurrency )
660681 return await gather_tasks (
661- tasks = [self .aio_two_step_extract (* args , semaphore ) for args in zip (images , priority )],
682+ tasks = [self .aio_two_step_extract (* args , semaphore , not_extract_list ) for args in zip (images , priority )],
662683 use_tqdm = self .use_tqdm ,
663684 tqdm_desc = "Two Step Extraction" ,
664685 )
@@ -667,6 +688,7 @@ def stepping_two_step_extract(
667688 self ,
668689 images : list [Image .Image ],
669690 priority : Sequence [int | None ] | int | None = None ,
691+ not_extract_list : list [str ] | None = None ,
670692 ) -> list [list [ContentBlock ]]:
671693 if priority is None and self .incremental_priority :
672694 priority = list (range (len (images )))
@@ -675,7 +697,12 @@ def stepping_two_step_extract(
675697 all_prompts : list [str ] = []
676698 all_params : list [SamplingParams | None ] = []
677699 all_indices : list [tuple [int , int ]] = []
678- prepared_inputs = self .helper .batch_prepare_for_extract (self .executor , images , blocks_list )
700+ prepared_inputs = self .helper .batch_prepare_for_extract (
701+ self .executor ,
702+ images ,
703+ blocks_list ,
704+ not_extract_list ,
705+ )
679706 for img_idx , (block_images , prompts , params , indices ) in enumerate (prepared_inputs ):
680707 all_images .extend (block_images )
681708 all_prompts .extend (prompts )
@@ -690,6 +717,7 @@ async def aio_stepping_two_step_extract(
690717 self ,
691718 images : list [Image .Image ],
692719 priority : Sequence [int | None ] | int | None = None ,
720+ not_extract_list : list [str ] | None = None ,
693721 semaphore : asyncio .Semaphore | None = None ,
694722 ) -> list [list [ContentBlock ]]:
695723 if priority is None and self .incremental_priority :
@@ -701,7 +729,14 @@ async def aio_stepping_two_step_extract(
701729 all_params : list [SamplingParams | None ] = []
702730 all_indices : list [tuple [int , int ]] = []
703731 prepared_inputs = await gather_tasks (
704- tasks = [self .helper .aio_prepare_for_extract (self .executor , * args ) for args in zip (images , blocks_list )],
732+ tasks = [
733+ self .helper .aio_prepare_for_extract (
734+ self .executor ,
735+ * args ,
736+ not_extract_list ,
737+ )
738+ for args in zip (images , blocks_list )
739+ ],
705740 use_tqdm = self .use_tqdm ,
706741 tqdm_desc = "Extract Preparation" ,
707742 )
def batch_two_step_extract(
    self,
    images: list[Image.Image],
    priority: Sequence[int | None] | int | None = None,
    not_extract_list: list[str] | None = None,
) -> list[list[ContentBlock]]:
    """Run two-step extraction over ``images`` using the configured batching mode.

    Args:
        images: Page images to process.
        priority: A single priority, one priority per image, or ``None``;
            forwarded unchanged.
        not_extract_list: Optional block type names that should not be
            extracted; forwarded unchanged.

    Returns:
        Whatever the selected implementation returns (annotated as one list
        of content blocks per image).
    """
    if self.batching_mode == "concurrent":
        return self.concurrent_two_step_extract(
            images,
            priority,
            not_extract_list=not_extract_list,
        )
    # Every other batching mode is handled as "stepping".
    return self.stepping_two_step_extract(
        images,
        priority,
        not_extract_list=not_extract_list,
    )
739775
async def aio_batch_two_step_extract(
    self,
    images: list[Image.Image],
    priority: Sequence[int | None] | int | None = None,
    not_extract_list: list[str] | None = None,
    semaphore: asyncio.Semaphore | None = None,
) -> list[list[ContentBlock]]:
    """Asynchronously run two-step extraction using the configured batching mode.

    Args:
        images: Page images to process.
        priority: A single priority, one priority per image, or ``None``;
            forwarded unchanged.
        not_extract_list: Optional block type names that should not be
            extracted; forwarded unchanged.
        semaphore: Concurrency limiter to share with the selected
            implementation. When omitted (or falsy), a new semaphore sized
            by ``self.max_concurrency`` is created.

    Returns:
        Whatever the selected implementation returns (annotated as one list
        of content blocks per image).
    """
    semaphore = semaphore or asyncio.Semaphore(self.max_concurrency)
    # `not_extract_list` precedes `semaphore` in the signatures, so both are
    # forwarded by keyword to keep a semaphore out of the `not_extract_list`
    # slot.
    if self.batching_mode == "concurrent":
        return await self.aio_concurrent_two_step_extract(
            images,
            priority,
            not_extract_list=not_extract_list,
            semaphore=semaphore,
        )
    # Every other batching mode is handled as "stepping".
    return await self.aio_stepping_two_step_extract(
        images,
        priority,
        not_extract_list=not_extract_list,
        semaphore=semaphore,
    )
0 commit comments