Skip to content

Commit 8e5d2a4

Browse files
authored
community[patch]: update PyPDFParser to take into account filters returned as arrays (#30489)
Image parsing hits a bug because the object extracted for `/Filter` is sometimes an array and sometimes a string (a single name). This change handles both cases. Fixes [Issue 30098](#30098)
1 parent 422ba4c commit 8e5d2a4

File tree

1 file changed

+8
-2
lines changed
  • libs/community/langchain_community/document_loaders/parsers

1 file changed

+8
-2
lines changed

libs/community/langchain_community/document_loaders/parsers/pdf.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -428,6 +428,7 @@ def extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
428428
"""
429429
if not self.images_parser:
430430
return ""
431+
import pypdf
431432
from PIL import Image
432433

433434
if "/XObject" not in cast(dict, page["/Resources"]).keys():
@@ -438,13 +439,18 @@ def extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
438439
for obj in xObject:
439440
np_image: Any = None
440441
if xObject[obj]["/Subtype"] == "/Image":
441-
if xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITHOUT_LOSS:
442+
img_filter = (
443+
xObject[obj]["/Filter"][1:]
444+
if type(xObject[obj]["/Filter"]) is pypdf.generic._base.NameObject
445+
else xObject[obj]["/Filter"][0][1:]
446+
)
447+
if img_filter in _PDF_FILTER_WITHOUT_LOSS:
442448
height, width = xObject[obj]["/Height"], xObject[obj]["/Width"]
443449

444450
np_image = np.frombuffer(
445451
xObject[obj].get_data(), dtype=np.uint8
446452
).reshape(height, width, -1)
447-
elif xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITH_LOSS:
453+
elif img_filter in _PDF_FILTER_WITH_LOSS:
448454
np_image = np.array(Image.open(io.BytesIO(xObject[obj].get_data())))
449455

450456
else:

0 commit comments

Comments
 (0)