community[patch]: update PyPDFParser to take into account filters returned as arrays (#30489)

pprados · web-flow · commit 8e5d2a44ce42 · 2025-03-26T14:16:54.000-04:00
Image parsing fails because the object extracted for `/Filter` is sometimes returned as an array and sometimes as a string. Fixes [Issue 30098](#30098).
diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -428,6 +428,7 @@ def extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
         """
         if not self.images_parser:
             return ""
+        import pypdf
         from PIL import Image
 
         if "/XObject" not in cast(dict, page["/Resources"]).keys():
@@ -438,13 +439,18 @@ def extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
         for obj in xObject:
             np_image: Any = None
             if xObject[obj]["/Subtype"] == "/Image":
-                if xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITHOUT_LOSS:
+                img_filter = (
+                    xObject[obj]["/Filter"][1:]
+                    if type(xObject[obj]["/Filter"]) is pypdf.generic._base.NameObject
+                    else xObject[obj]["/Filter"][0][1:]
+                )
+                if img_filter in _PDF_FILTER_WITHOUT_LOSS:
                     height, width = xObject[obj]["/Height"], xObject[obj]["/Width"]
 
                     np_image = np.frombuffer(
                         xObject[obj].get_data(), dtype=np.uint8
                     ).reshape(height, width, -1)
-                elif xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITH_LOSS:
+                elif img_filter in _PDF_FILTER_WITH_LOSS:
                     np_image = np.array(Image.open(io.BytesIO(xObject[obj].get_data())))
 
                 else: