|
27 | 27 | merge_inferred_layout_with_ocr_layout, |
28 | 28 | ) |
29 | 29 | from unstructured_inference.inference.ordering import order_layout |
| 30 | +from unstructured_inference.inference.pdf import get_images_from_pdf_element |
30 | 31 | from unstructured_inference.logger import logger |
31 | 32 | from unstructured_inference.models.base import get_model |
32 | 33 | from unstructured_inference.models.detectron2onnx import ( |
|
37 | 38 | UnstructuredObjectDetectionModel, |
38 | 39 | ) |
39 | 40 | from unstructured_inference.patches.pdfminer import parse_keyword |
| 41 | +from unstructured_inference.utils import write_image |
40 | 42 | from unstructured_inference.visualize import draw_bbox |
41 | 43 |
|
42 | 44 | # NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix |
@@ -356,6 +358,33 @@ def get_elements_from_layout(self, layout: List[TextRegion]) -> List[LayoutEleme |
356 | 358 | ] |
357 | 359 | return elements |
358 | 360 |
|
def extract_images(self, output_dir_path: Optional[str] = None):
    """Crop each "Image" element out of the page image and save it as a file.

    Every element in ``self.elements`` that has a location and whose ``type`` is
    ``"Image"`` is cropped from ``self.image`` using its ``x1, y1, x2, y2`` box and
    written to ``figure-<page number>-<n>.jpg``, where ``n`` counts the image
    elements on this page starting at 1. On success the written path is stored on
    the element as ``image_path``. A failure on one element is logged and skipped;
    its number is not reused.

    Args:
        output_dir_path: Directory to write the figures into. When falsy, a
            ``figures`` directory under the current working directory is used.
            The directory is created if it does not exist.
    """
    if not output_dir_path:
        output_dir_path = os.path.join(os.getcwd(), "figures")
    os.makedirs(output_dir_path, exist_ok=True)

    # The page image can be released (set to None) once a page has been built.
    # Without it there is nothing to crop, so report and stop instead of failing
    # with an AttributeError on the first image element.
    if self.image is None:
        logger.warning(
            "Image Extraction Error: page image is not loaded, skipping image extraction",
        )
        return

    figure_number = 0
    for el in self.elements:
        if isinstance(el, LocationlessLayoutElement) or el.type != "Image":
            continue

        # Count the element before attempting the write so that file numbering
        # follows the element order even when an earlier extraction failed.
        figure_number += 1
        output_f_path = os.path.join(
            output_dir_path,
            f"figure-{self.number}-{figure_number}.jpg",
        )
        try:
            cropped_image = self.image.crop((el.x1, el.y1, el.x2, el.y2))
            write_image(cropped_image, output_f_path)
        except (ValueError, IOError):
            logger.warning("Image Extraction Error: Skipping the failed image", exc_info=True)
        else:
            # Only record the path once the file has actually been written.
            el.image_path = output_f_path
| 387 | + |
359 | 388 | def _get_image_array(self) -> Union[np.ndarray, None]: |
360 | 389 | """Converts the raw image into a numpy array.""" |
361 | 390 | if self.image_array is None: |
@@ -439,11 +468,12 @@ def from_image( |
439 | 468 | ocr_mode: str = OCRMode.FULL_PAGE.value, |
440 | 469 | extract_tables: bool = False, |
441 | 470 | fixed_layout: Optional[List[TextRegion]] = None, |
442 | | - **kwargs, |
| 471 | + supplement_with_ocr_elements: bool = True, |
| 472 | + extract_images_in_pdf: bool = False, |
| 473 | + image_output_dir_path: Optional[str] = None, |
| 474 | + analysis: bool = False, |
443 | 475 | ): |
444 | 476 | """Creates a PageLayout from an already-loaded PIL Image.""" |
445 | | - analysis = kwargs.get("analysis", False) |
446 | | - supplement_with_ocr_elements = kwargs.get("supplement_with_ocr_elements", True) |
447 | 477 |
|
448 | 478 | page = cls( |
449 | 479 | number=number, |
@@ -474,6 +504,9 @@ def from_image( |
474 | 504 | page.image_path = os.path.abspath(image_path) if image_path else None |
475 | 505 | page.document_filename = os.path.abspath(document_filename) if document_filename else None |
476 | 506 |
|
| 507 | + if extract_images_in_pdf: |
| 508 | + page.extract_images(image_output_dir_path) |
| 509 | + |
477 | 510 | # Clear the image to save memory |
478 | 511 | page.image = None |
479 | 512 |
|
@@ -602,21 +635,29 @@ def load_pdf( |
602 | 635 | ) -> Tuple[List[List[TextRegion]], Union[List[Image.Image], List[str]]]: |
603 | 636 | """Loads the image and word objects from a pdf using pdfplumber and the image renderings of the |
604 | 637 | pdf pages using pdf2image""" |
| 638 | + |
605 | 639 | layouts = [] |
606 | 640 | for page in extract_pages(filename): |
607 | | - layout = [] |
| 641 | + layout: List[TextRegion] = [] |
608 | 642 | height = page.height |
609 | 643 | for element in page: |
610 | 644 | x1, y2, x2, y1 = element.bbox |
611 | 645 | y1 = height - y1 |
612 | 646 | y2 = height - y2 |
613 | 647 | # Coefficient to rescale bounding box to be compatible with images |
614 | 648 | coef = dpi / 72 |
615 | | - _text, element_class = ( |
616 | | - (element.get_text(), EmbeddedTextRegion) |
617 | | - if hasattr(element, "get_text") |
618 | | - else (None, ImageTextRegion) |
619 | | - ) |
| 649 | + |
| 650 | + if hasattr(element, "get_text"): |
| 651 | + _text = element.get_text() |
| 652 | + element_class = EmbeddedTextRegion # type: ignore |
| 653 | + else: |
| 654 | + embedded_images = get_images_from_pdf_element(element) |
| 655 | + if len(embedded_images) > 0: |
| 656 | + _text = None |
| 657 | + element_class = ImageTextRegion # type: ignore |
| 658 | + else: |
| 659 | + continue |
| 660 | + |
620 | 661 | text_region = element_class(x1 * coef, y1 * coef, x2 * coef, y2 * coef, text=_text) |
621 | 662 |
|
622 | 663 | if text_region.area() > 0: |
|
0 commit comments