Skip to content

Commit 5200f7c

Browse files
authored
fix: handle image of size 0 bytes in PyPDFParser (#84)
# Fix - Skip the current `for` loop iteration if the image has 0 bytes. Note: The problem was reported by my coworker, @luma-pires.
1 parent a4dac55 commit 5200f7c

File tree

1 file changed

+7
-0
lines changed
  • libs/community/langchain_community/document_loaders/parsers

1 file changed

+7
-0
lines changed

libs/community/langchain_community/document_loaders/parsers/pdf.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,10 @@ def extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
457457
logger.warning("Unknown PDF Filter!")
458458
if np_image is not None:
459459
image_bytes = io.BytesIO()
460+
461+
if image_bytes.getbuffer().nbytes == 0:
462+
continue
463+
460464
Image.fromarray(np_image).save(image_bytes, format="PNG")
461465
blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
462466
image_text = next(self.images_parser.lazy_parse(blob)).page_content
@@ -1108,6 +1112,9 @@ def _extract_images_from_page(
11081112
pix.height, pix.width, -1
11091113
)
11101114
image_bytes = io.BytesIO()
1115+
if image_bytes.getbuffer().nbytes == 0:
1116+
continue
1117+
11111118
numpy.save(image_bytes, image)
11121119
blob = Blob.from_data(
11131120
image_bytes.getvalue(), mime_type="application/x-npy"

0 commit comments

Comments
 (0)