Skip to content

Image Extraction pypdf #178

@namtr92

Description

@namtr92

Looking at the code base:

def extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
        """Extract images from a PDF page and get the text using images_to_text.

        Args:
            page: The page object from which to extract images.

        Returns:
            str: The extracted text from the images on the page.
        """
        if not self.images_parser:
            return ""
        import pypdf
        from PIL import Image

        if "/XObject" not in cast(dict, page["/Resources"]).keys():
            return ""

        xObject = page["/Resources"]["/XObject"].get_object()
        images = []
        for obj in xObject:
            np_image: Any = None
            if xObject[obj]["/Subtype"] == "/Image":
                img_filter = (
                    xObject[obj]["/Filter"][1:]
                    if type(xObject[obj]["/Filter"]) is pypdf.generic._base.NameObject
                    else xObject[obj]["/Filter"][0][1:]
                )
                if img_filter in _PDF_FILTER_WITHOUT_LOSS:
                    height, width = xObject[obj]["/Height"], xObject[obj]["/Width"]

                    np_image = np.frombuffer(
                        xObject[obj].get_data(), dtype=np.uint8
                    ).reshape(height, width, -1)
                elif img_filter in _PDF_FILTER_WITH_LOSS:
                    np_image = np.array(Image.open(io.BytesIO(xObject[obj].get_data())))

                else:
                    logger.warning("Unknown PDF Filter!")
                if np_image is not None:
                    image_bytes = io.BytesIO()

                    if image_bytes.getbuffer().nbytes == 0:
                        continue

                    Image.fromarray(np_image).save(image_bytes, format="PNG")
                    blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
                    image_text = next(self.images_parser.lazy_parse(blob)).page_content
                    images.append(
                        _format_inner_image(blob, image_text, self.images_inner_format)
                    )
        return _FORMAT_IMAGE_STR.format(
            image_text=_JOIN_IMAGES.join(filter(None, images))
        )

It should be like this, otherwise the image cannot be extract:

if np_image is not None:
    image_bytes = io.BytesIO()
    Image.fromarray(np_image).save(image_bytes, format="PNG") #save image before check
    logger.info(f"image_bytes: {image_bytes.getbuffer().nbytes}")
    if image_bytes.getbuffer().nbytes == 0:
        continue
                    

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions