False color imaging #2956

YikunHan42 · 2024-01-02T07:25:34Z

YikunHan42
Jan 2, 2024

Hi Jorj, PyMuPDF is an excellent tool!

I am trying to extract images from scientific literature PDFs, but some of the images are not color-mapped correctly. Do you have any ideas or possible solutions? Thank you very much for your help in advance!

Example with correct colors:

Example with wrong colors:

Here is the original code:

import fitz  # PyMuPDF
import os

def extract_images_from_pdf(pdf_path, output_folder):
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    doc = fitz.open(pdf_path)

    for i in range(len(doc)):
        page = doc.load_page(i)
        image_list = page.get_images(full=True)

        for image_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]

            image_filename = f"{pdf_name}_page_{i+1}_img_{image_index+1}.png"
            image_output_path = os.path.join(output_folder, image_filename)

            with open(image_output_path, "wb") as f:
                f.write(image_bytes)

            print(f"Image saved to {image_output_path}")

    doc.close()

def extract_images_from_folder(pdf_folder, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for filename in os.listdir(pdf_folder):
        if filename.lower().endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, filename)
            extract_images_from_pdf(pdf_path, output_folder)

# Example usage
pdf_folder = "./pdf"  # Folder containing PDFs
output_folder = "./output"  # Output folder for all images
extract_images_from_folder(pdf_folder, output_folder)

Answered by JorjMcKie

Jan 2, 2024

Thanks for the report. Please also include the document or page to reproduce.
Looking at your code and image though suggests that you ignored the mask image and only converted the base image.
Therefore the alpha channel is not included.

View full answer

JorjMcKie · 2024-01-02T08:14:42Z

JorjMcKie
Jan 2, 2024
Maintainer

Thanks for the report. Please also include the document or page to reproduce.
Looking at your code and image though suggests that you ignored the mask image and only converted the base image.
Therefore the alpha channel is not included.

0 replies

YikunHan42 · 2024-01-02T08:54:35Z

YikunHan42
Jan 2, 2024
Author

Thanks for your quick reponse. I took a look at the documentation and changed my code

import fitz  # PyMuPDF
import os
def extract_images_from_pdf(pdf_path, output_folder):
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    doc = fitz.open(pdf_path)

    for i in range(len(doc)):
        page = doc.load_page(i)
        image_list = page.get_images(full=True)

        for image_index, img in enumerate(image_list):
            xref = img[0]
            base_image_info = doc.extract_image(xref)

            # Create a Pixmap object from the image
            try:
                pix = fitz.Pixmap(doc, xref)
            except Exception as e:
                print(f"An error occurred while creating pixmap for image xref {xref}: {e}")
                continue  # Skip this image and move to the next one

            # Check if the image has an associated smask (alpha channel)
            smask_xref = base_image_info.get("smask")
            if smask_xref > 0:
                # Create Pixmap objects for the image and smask
                pix1 = fitz.Pixmap(doc, xref)
                pix2 = fitz.Pixmap(doc, smask_xref)
                # Check if pix1 is not grayscale, as we cannot directly assign alpha to grayscale images
                if not pix1.alpha and pix1.n - pix2.n == 4:  # smask must be grayscale
                    # Combine the base image and the mask
                    pix1.set_alpha(pix2.samples)  # set alpha from smask
                pix = pix1  # use pix1 as the final image

            # If the image is CMYK, convert it to RGB
            if pix and pix.colorspace and pix.colorspace.n == 4:  # CMYK has 4 components
                pix = fitz.Pixmap(fitz.csRGB, pix)

            # Define the output image path with PDF name as prefix
            image_filename = f"{pdf_name}_page_{i + 1}_img_{image_index + 1}.png"
            image_output_path = os.path.join(output_folder, image_filename)

            # Save the image as PNG
            pix.save(image_output_path)
            print(f"Image saved to {image_output_path}")

            # Clean up the Pixmap objects to free memory
            pix = None
            pix1 = None
            pix2 = None

    doc.close()


def extract_images_from_folder(pdf_folder, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for filename in os.listdir(pdf_folder):
        if filename.lower().endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, filename)
            extract_images_from_pdf(pdf_path, output_folder)


# Example usage
pdf_folder = "./pdf"  # Folder containing PDFs
output_folder = "./output"  # Output folder for all images
extract_images_from_folder(pdf_folder, output_folder)

Seemingly, it works for most pdf, but it returns (here is a truncation of output)

Image saved to ./output\10.1016+j.cej.2022.134617_page_1_img_3.png
Image saved to ./output\10.1016+j.cej.2022.134617_page_4_img_1.png
Image saved to ./output\10.1016+j.cej.2022.134617_page_4_img_2.png
Image saved to ./output\10.1016+j.cej.2022.134617_page_5_img_1.png
Image saved to ./output\10.1016+j.cej.2022.134617_page_6_img_1.png
Image saved to ./output\10.1016+j.cej.2022.134617_page_7_img_1.png
Image saved to ./output\10.1016+j.cej.2022.134617_page_8_img_1.png
Image saved to ./output\10.1016+j.ces.2021.117402_page_1_img_1.png
Image saved to ./output\10.1016+j.ces.2021.117402_page_1_img_2.png
Image saved to ./output\10.1016+j.ces.2021.117402_page_1_img_3.png
Image saved to ./output\10.1016+j.ces.2021.117402_page_4_img_1.png
Image saved to ./output\10.1016+j.ces.2021.117402_page_4_img_2.png
Image saved to ./output\10.1016+j.ces.2021.117402_page_5_img_1.png
Image saved to ./output\10.1016+j.ces.2021.117402_page_5_img_2.png
Image saved to ./output\10.1016+j.ces.2021.117402_page_6_img_1.png
Image saved to ./output\10.1016+j.coelec.2021.100898_page_1_img_1.png
Traceback (most recent call last):
  File "D:\Study\cas\pdf2jpg.py", line 67, in <module>
    extract_images_from_folder(pdf_folder, output_folder)
  File "D:\Study\cas\pdf2jpg.py", line 61, in extract_images_from_folder
    extract_images_from_pdf(pdf_path, output_folder)
  File "D:\Study\cas\pdf2jpg.py", line 43, in extract_images_from_pdf
    pix.save(image_output_path)
  File "D:\Study\cas\venv\lib\site-packages\fitz\fitz.py", line 7486, in save
    return self._writeIMG(filename, idx, jpg_quality)
  File "D:\Study\cas\venv\lib\site-packages\fitz\fitz.py", line 7454, in _writeIMG
    return _fitz.Pixmap__writeIMG(self, filename, format, jpg_quality)
RuntimeError: pixmap must be grayscale or rgb to write as png

Process finished with exit code 1

Here is the specific pdf to reproduce.
10.1016+j.coelec.2021.100898.pdf

0 replies

JorjMcKie · 2024-01-02T09:02:39Z

JorjMcKie
Jan 2, 2024
Maintainer

For all I can see, everything works as designed now with your changes.
I am therefore converting this to a "Discussions" item for subsequent communications.

0 replies

JorjMcKie · 2024-01-02T09:06:41Z

JorjMcKie
Jan 2, 2024
Maintainer

The failing image has a colorspace other than grayscale or RGB - probably CMYK.
This cannot be saved as PNG - this is what the message says.
So you must check for colorspace and convert to RGB if not already GRAY / RGB.

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

False color imaging #2956

Uh oh!

{{title}}

Uh oh!

Replies: 4 comments

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Select a reply

Uh oh!

False color imaging #2956

Uh oh!

YikunHan42 Jan 2, 2024

Replies: 4 comments

Uh oh!

JorjMcKie Jan 2, 2024 Maintainer

Uh oh!

YikunHan42 Jan 2, 2024 Author

Uh oh!

JorjMcKie Jan 2, 2024 Maintainer

Uh oh!

JorjMcKie Jan 2, 2024 Maintainer

YikunHan42
Jan 2, 2024

JorjMcKie
Jan 2, 2024
Maintainer

YikunHan42
Jan 2, 2024
Author

JorjMcKie
Jan 2, 2024
Maintainer

JorjMcKie
Jan 2, 2024
Maintainer