False color imaging #2956
-
Hi Jorj, I am trying to extract images from scientific literature PDFs, but some of the images are not color-mapped correctly. Do you have any ideas or possible solutions? Thank you very much for your help in advance! Here is the original code: import fitz # PyMuPDF
import os
def extract_images_from_pdf(pdf_path, output_folder):
pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
doc = fitz.open(pdf_path)
for i in range(len(doc)):
page = doc.load_page(i)
image_list = page.get_images(full=True)
for image_index, img in enumerate(image_list):
xref = img[0]
base_image = doc.extract_image(xref)
image_bytes = base_image["image"]
image_filename = f"{pdf_name}_page_{i+1}_img_{image_index+1}.png"
image_output_path = os.path.join(output_folder, image_filename)
with open(image_output_path, "wb") as f:
f.write(image_bytes)
print(f"Image saved to {image_output_path}")
doc.close()
def extract_images_from_folder(pdf_folder, output_folder):
if not os.path.exists(output_folder):
os.makedirs(output_folder)
for filename in os.listdir(pdf_folder):
if filename.lower().endswith(".pdf"):
pdf_path = os.path.join(pdf_folder, filename)
extract_images_from_pdf(pdf_path, output_folder)
# Example usage
pdf_folder = "./pdf" # Folder containing PDFs
output_folder = "./output" # Output folder for all images
extract_images_from_folder(pdf_folder, output_folder) |
Beta Was this translation helpful? Give feedback.
Replies: 4 comments
-
Thanks for the report. Please also include the document or page to reproduce. |
Beta Was this translation helpful? Give feedback.
-
Thanks for your quick reponse. I took a look at the documentation and changed my code import fitz # PyMuPDF
import os
def extract_images_from_pdf(pdf_path, output_folder):
pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
doc = fitz.open(pdf_path)
for i in range(len(doc)):
page = doc.load_page(i)
image_list = page.get_images(full=True)
for image_index, img in enumerate(image_list):
xref = img[0]
base_image_info = doc.extract_image(xref)
# Create a Pixmap object from the image
try:
pix = fitz.Pixmap(doc, xref)
except Exception as e:
print(f"An error occurred while creating pixmap for image xref {xref}: {e}")
continue # Skip this image and move to the next one
# Check if the image has an associated smask (alpha channel)
smask_xref = base_image_info.get("smask")
if smask_xref > 0:
# Create Pixmap objects for the image and smask
pix1 = fitz.Pixmap(doc, xref)
pix2 = fitz.Pixmap(doc, smask_xref)
# Check if pix1 is not grayscale, as we cannot directly assign alpha to grayscale images
if not pix1.alpha and pix1.n - pix2.n == 4: # smask must be grayscale
# Combine the base image and the mask
pix1.set_alpha(pix2.samples) # set alpha from smask
pix = pix1 # use pix1 as the final image
# If the image is CMYK, convert it to RGB
if pix and pix.colorspace and pix.colorspace.n == 4: # CMYK has 4 components
pix = fitz.Pixmap(fitz.csRGB, pix)
# Define the output image path with PDF name as prefix
image_filename = f"{pdf_name}_page_{i + 1}_img_{image_index + 1}.png"
image_output_path = os.path.join(output_folder, image_filename)
# Save the image as PNG
pix.save(image_output_path)
print(f"Image saved to {image_output_path}")
# Clean up the Pixmap objects to free memory
pix = None
pix1 = None
pix2 = None
doc.close()
def extract_images_from_folder(pdf_folder, output_folder):
if not os.path.exists(output_folder):
os.makedirs(output_folder)
for filename in os.listdir(pdf_folder):
if filename.lower().endswith(".pdf"):
pdf_path = os.path.join(pdf_folder, filename)
extract_images_from_pdf(pdf_path, output_folder)
# Example usage
pdf_folder = "./pdf" # Folder containing PDFs
output_folder = "./output" # Output folder for all images
extract_images_from_folder(pdf_folder, output_folder) Seemingly, it works for most pdf, but it returns (here is a truncation of output) Image saved to ./output\10.1016+j.cej.2022.134617_page_1_img_3.png
Image saved to ./output\10.1016+j.cej.2022.134617_page_4_img_1.png
Image saved to ./output\10.1016+j.cej.2022.134617_page_4_img_2.png
Image saved to ./output\10.1016+j.cej.2022.134617_page_5_img_1.png
Image saved to ./output\10.1016+j.cej.2022.134617_page_6_img_1.png
Image saved to ./output\10.1016+j.cej.2022.134617_page_7_img_1.png
Image saved to ./output\10.1016+j.cej.2022.134617_page_8_img_1.png
Image saved to ./output\10.1016+j.ces.2021.117402_page_1_img_1.png
Image saved to ./output\10.1016+j.ces.2021.117402_page_1_img_2.png
Image saved to ./output\10.1016+j.ces.2021.117402_page_1_img_3.png
Image saved to ./output\10.1016+j.ces.2021.117402_page_4_img_1.png
Image saved to ./output\10.1016+j.ces.2021.117402_page_4_img_2.png
Image saved to ./output\10.1016+j.ces.2021.117402_page_5_img_1.png
Image saved to ./output\10.1016+j.ces.2021.117402_page_5_img_2.png
Image saved to ./output\10.1016+j.ces.2021.117402_page_6_img_1.png
Image saved to ./output\10.1016+j.coelec.2021.100898_page_1_img_1.png
Traceback (most recent call last):
File "D:\Study\cas\pdf2jpg.py", line 67, in <module>
extract_images_from_folder(pdf_folder, output_folder)
File "D:\Study\cas\pdf2jpg.py", line 61, in extract_images_from_folder
extract_images_from_pdf(pdf_path, output_folder)
File "D:\Study\cas\pdf2jpg.py", line 43, in extract_images_from_pdf
pix.save(image_output_path)
File "D:\Study\cas\venv\lib\site-packages\fitz\fitz.py", line 7486, in save
return self._writeIMG(filename, idx, jpg_quality)
File "D:\Study\cas\venv\lib\site-packages\fitz\fitz.py", line 7454, in _writeIMG
return _fitz.Pixmap__writeIMG(self, filename, format, jpg_quality)
RuntimeError: pixmap must be grayscale or rgb to write as png
Process finished with exit code 1 Here is the specific pdf to reproduce. |
Beta Was this translation helpful? Give feedback.
-
For all I can see, everything works as designed now with your changes. |
Beta Was this translation helpful? Give feedback.
-
The failing image has a colorspace other than grayscale or RGB - probably CMYK. |
Beta Was this translation helpful? Give feedback.
Thanks for the report. Please also include the document or page to reproduce.
Looking at your code and image though suggests that you ignored the mask image and only converted the base image.
Therefore the alpha channel is not included.