fix: handle image of size 0 bytes in PyPDFParser (#84)

soucosmo · web-flow · commit 5200f7cec69d · 2025-06-02T16:53:25.000-04:00
# Fix - Skip the current loop iteration when the image is 0 bytes. Note: the problem was reported by my coworker @luma-pires.
diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -457,6 +457,10 @@ def extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
                     logger.warning("Unknown PDF Filter!")
                 if np_image is not None:
                     image_bytes = io.BytesIO()
+
+                    if image_bytes.getbuffer().nbytes == 0:
+                        continue
+
                     Image.fromarray(np_image).save(image_bytes, format="PNG")
                     blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
                     image_text = next(self.images_parser.lazy_parse(blob)).page_content
@@ -1108,6 +1112,9 @@ def _extract_images_from_page(
                     pix.height, pix.width, -1
                 )
                 image_bytes = io.BytesIO()
+                if image_bytes.getbuffer().nbytes == 0:
+                    continue
+
                 numpy.save(image_bytes, image)
                 blob = Blob.from_data(
                     image_bytes.getvalue(), mime_type="application/x-npy"