@@ -87,44 +87,44 @@ def from_file(
8787 """Creates a DocumentLayout from a pdf file."""
8888 logger .info (f"Reading PDF for file: { filename } ..." )
8989
90- # Store pdf images for later use
91- output_dir = create_image_output_dir (filename )
92- layouts , _image_paths = load_pdf (
93- filename ,
94- pdf_image_dpi ,
95- output_folder = output_dir ,
96- path_only = True ,
97- )
98- image_paths = cast (List [str ], _image_paths )
99- if len (layouts ) > len (image_paths ):
100- raise RuntimeError (
101- "Some images were not loaded. "
102- "Check that poppler is installed and in your $PATH." ,
90+ with tempfile .TemporaryDirectory () as temp_dir :
91+ layouts , _image_paths = load_pdf (
92+ filename ,
93+ pdf_image_dpi ,
94+ output_folder = temp_dir ,
95+ path_only = True ,
10396 )
104- pages : List [PageLayout ] = []
105- if fixed_layouts is None :
106- fixed_layouts = [None for _ in layouts ]
107- for i , (image_path , layout , fixed_layout ) in enumerate (
108- zip (image_paths , layouts , fixed_layouts ),
109- ):
110- # NOTE(robinson) - In the future, maybe we detect the page number and default
111- # to the index if it is not detected
112- with Image .open (image_path ) as image :
113- page = PageLayout .from_image (
114- image ,
115- image_path = image_path ,
116- number = i + 1 ,
117- detection_model = detection_model ,
118- element_extraction_model = element_extraction_model ,
119- layout = layout ,
120- ocr_strategy = ocr_strategy ,
121- ocr_languages = ocr_languages ,
122- ocr_mode = ocr_mode ,
123- fixed_layout = fixed_layout ,
124- extract_tables = extract_tables ,
97+ image_paths = cast (List [str ], _image_paths )
98+ if len (layouts ) > len (image_paths ):
99+ raise RuntimeError (
100+ "Some images were not loaded. "
101+ "Check that poppler is installed and in your $PATH." ,
125102 )
126- pages .append (page )
127- return cls .from_pages (pages )
103+
104+ pages : List [PageLayout ] = []
105+ if fixed_layouts is None :
106+ fixed_layouts = [None for _ in layouts ]
107+ for i , (image_path , layout , fixed_layout ) in enumerate (
108+ zip (image_paths , layouts , fixed_layouts ),
109+ ):
110+ # NOTE(robinson) - In the future, maybe we detect the page number and default
111+ # to the index if it is not detected
112+ with Image .open (image_path ) as image :
113+ page = PageLayout .from_image (
114+ image ,
115+ number = i + 1 ,
116+ document_filename = filename ,
117+ detection_model = detection_model ,
118+ element_extraction_model = element_extraction_model ,
119+ layout = layout ,
120+ ocr_strategy = ocr_strategy ,
121+ ocr_languages = ocr_languages ,
122+ ocr_mode = ocr_mode ,
123+ fixed_layout = fixed_layout ,
124+ extract_tables = extract_tables ,
125+ )
126+ pages .append (page )
127+ return cls .from_pages (pages )
128128
129129 @classmethod
130130 def from_image_file (
@@ -180,7 +180,8 @@ def __init__(
180180 image : Image .Image ,
181181 layout : Optional [List [TextRegion ]],
182182 image_metadata : Optional [dict ] = None ,
183- image_path : Optional [Union [str , PurePath ]] = None ,
183+ image_path : Optional [Union [str , PurePath ]] = None , # TODO: Deprecate
184+ document_filename : Optional [Union [str , PurePath ]] = None ,
184185 detection_model : Optional [UnstructuredObjectDetectionModel ] = None ,
185186 element_extraction_model : Optional [UnstructuredElementExtractionModel ] = None ,
186187 ocr_strategy : str = "auto" ,
@@ -196,6 +197,7 @@ def __init__(
196197 self .image_metadata = image_metadata
197198 self .image_path = image_path
198199 self .image_array : Union [np .ndarray , None ] = None
200+ self .document_filename = document_filename
199201 self .layout = layout
200202 self .number = number
201203 self .detection_model = detection_model
@@ -305,7 +307,11 @@ def _get_image_array(self) -> Union[np.ndarray, None]:
305307 self .image_array = np .array (image )
306308 return self .image_array
307309
308- def annotate (self , colors : Optional [Union [List [str ], str ]] = None ) -> Image .Image :
310+ def annotate (
311+ self ,
312+ colors : Optional [Union [List [str ], str ]] = None ,
313+ image_dpi : int = 200 ,
314+ ) -> Image .Image :
309315 """Annotates the elements on the page image."""
310316 if colors is None :
311317 colors = ["red" for _ in self .elements ]
@@ -315,18 +321,46 @@ def annotate(self, colors: Optional[Union[List[str], str]] = None) -> Image.Imag
315321 if len (colors ) < len (self .elements ):
316322 n_copies = (len (self .elements ) // len (colors )) + 1
317323 colors = colors * n_copies
318- img = self .image .copy () if self .image else Image .open (self .image_path )
324+
325+ # Hotload image if it hasn't been loaded yet
326+ if self .image :
327+ img = self .image .copy ()
328+ elif self .image_path :
329+ img = Image .open (self .image_path )
330+ else :
331+ img = self ._get_image (self .document_filename , self .number , image_dpi )
319332
320333 for el , color in zip (self .elements , colors ):
321334 if isinstance (el , Rectangle ):
322335 img = draw_bbox (img , el , color = color )
336+
323337 return img
324338
339+ def _get_image (self , filename , page_number , pdf_image_dpi : int = 200 ) -> Image .Image :
340+ """Hotloads a page image from a pdf file."""
341+
342+ with tempfile .TemporaryDirectory () as temp_dir :
343+ _image_paths = pdf2image .convert_from_path (
344+ filename ,
345+ dpi = pdf_image_dpi ,
346+ output_folder = temp_dir ,
347+ paths_only = True ,
348+ )
349+ image_paths = cast (List [str ], _image_paths )
350+ if page_number > len (image_paths ):
351+ raise ValueError (
352+ f"Page number { page_number } is greater than the number of pages in the PDF." ,
353+ )
354+
355+ with Image .open (image_paths [page_number - 1 ]) as image :
356+ return image .copy ()
357+
325358 @classmethod
326359 def from_image (
327360 cls ,
328361 image : Image .Image ,
329- image_path : Optional [Union [str , PurePath ]],
362+ image_path : Optional [Union [str , PurePath ]] = None ,
363+ document_filename : Optional [Union [str , PurePath ]] = None ,
330364 number : int = 1 ,
331365 detection_model : Optional [UnstructuredObjectDetectionModel ] = None ,
332366 element_extraction_model : Optional [UnstructuredElementExtractionModel ] = None ,
@@ -363,6 +397,9 @@ def from_image(
363397 "height" : page .image .height if page .image else None ,
364398 }
365399 page .image_path = os .path .abspath (image_path ) if image_path else None
400+ page .document_filename = os .path .abspath (document_filename ) if document_filename else None
401+
402+ # Clear the image to save memory
366403 page .image = None
367404
368405 return page
@@ -480,7 +517,7 @@ def get_element_from_block(
480517def load_pdf (
481518 filename : str ,
482519 dpi : int = 200 ,
483- output_folder : Union [str , PurePath ] = None , # type: ignore
520+ output_folder : Optional [ Union [str , PurePath ]] = None ,
484521 path_only : bool = False ,
485522) -> Tuple [List [List [TextRegion ]], Union [List [Image .Image ], List [str ]]]:
486523 """Loads the image and word objects from a pdf using pdfplumber and the image renderings of the
@@ -509,30 +546,23 @@ def load_pdf(
509546 if path_only and not output_folder :
510547 raise ValueError ("output_folder must be specified if path_only is true" )
511548
512- images = pdf2image .convert_from_path (
513- filename ,
514- dpi = dpi ,
515- output_folder = output_folder ,
516- paths_only = path_only ,
517- )
549+ if output_folder is not None :
550+ images = pdf2image .convert_from_path (
551+ filename ,
552+ dpi = dpi ,
553+ output_folder = output_folder ,
554+ paths_only = path_only ,
555+ )
556+ else :
557+ images = pdf2image .convert_from_path (
558+ filename ,
559+ dpi = dpi ,
560+ paths_only = path_only ,
561+ )
518562
519563 return layouts , images
520564
521565
522- def create_image_output_dir (
523- filename : Union [str , PurePath ],
524- ) -> Union [str , PurePath ]:
525- """Creates a directory to store the converted images from the pdf pages and returns the
526- directory path"""
527- parent_dir = os .path .abspath (os .path .dirname (filename ))
528- f_name_without_extension = os .path .splitext (os .path .basename (filename ))[0 ]
529-
530- # Add a suffix to avoid conflicts in case original file doesn't have an extension
531- output_dir = os .path .join (parent_dir , f"{ f_name_without_extension } _images" )
532- os .makedirs (output_dir , exist_ok = True )
533- return output_dir
534-
535-
536566def parse_ocr_data (ocr_data : dict ) -> List [TextRegion ]:
537567 """
538568 Parse the OCR result data to extract a list of TextRegion objects.
0 commit comments