ROB: Handle some None values for broken PDF files (#3230)

stefan6419846 · web-flow · commit 9ee8a5a2f073 · 2025-04-01T15:35:33.000+02:00
These issues were discovered while trying to extract text and images
from some PDF files which were incomplete, but had been partially repaired.
diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -1044,10 +1044,11 @@ def get_contents(self) -> Optional[ContentStream]:
                 pdf = cast(IndirectObject, self.indirect_reference).pdf
             except AttributeError:
                 pdf = None
-            obj = self[PG.CONTENTS].get_object()
-            if isinstance(obj, NullObject):
+            obj = self[PG.CONTENTS]
+            if is_null_or_none(obj):
                 return None
-            return ContentStream(obj, pdf)
+            resolved_object = obj.get_object()
+            return ContentStream(resolved_object, pdf)
         return None
 
     def replace_contents(
@@ -1846,8 +1847,8 @@ def _extract_text(
             # file as not damaged, no need to check for TJ or Tj
             return ""
 
-        if "/Font" in resources_dict:
-            for f in cast(DictionaryObject, resources_dict["/Font"]):
+        if "/Font" in resources_dict and (font := resources_dict["/Font"]):
+            for f in cast(DictionaryObject, font):
                 cmaps[f] = build_char_map(f, space_width, obj)
         cmap: Tuple[
             Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
@@ -1864,7 +1865,7 @@ def _extract_text(
             )
             if not isinstance(content, ContentStream):
                 content = ContentStream(content, pdf, "bytes")
-        except KeyError:  # no content can be extracted (certainly empty page)
+        except (AttributeError, KeyError):  # no content can be extracted (certainly empty page)
             return ""
         # We check all strings are TextStringObjects. ByteStringObjects
         # are strings where the byte->string encoding was unknown, so adding
diff --git a/tests/test_page.py b/tests/test_page.py
@@ -5,7 +5,8 @@
 from io import BytesIO
 from pathlib import Path
 from random import shuffle
-from typing import List, Tuple
+from typing import Any, List, Tuple
+from unittest import mock
 
 import pytest
 
@@ -1471,3 +1472,36 @@ def test_recursive_get_page_from_node():
     writer.insert_page(writer.pages[0], -1)
     with pytest.raises(ValueError):
         writer.insert_page(writer.pages[0], -10)
+
+
+def test_get_contents__none_type():
+    # We can observe this in reality as well, but these documents might be
+    # confidential. Thus, use a more complex dummy implementation here, as
+    # assigning a value of `None` is not possible from code, but only from
+    # the PDF files themselves.
+    class MyPage(PageObject):
+        def __contains__(self, item) -> bool:
+            assert item == "/Contents"
+            return True
+
+        def __getitem__(self, item) -> Any:
+            assert item == "/Contents"
+
+    page = MyPage()
+    assert page.get_contents() is None
+
+
+def test_extract_text__none_type():
+    class MyPage(PageObject):
+        def __getitem__(self, item) -> Any:
+            if item == "/Contents":
+                return None
+            return super().__getitem__(item)
+
+    page = MyPage()
+    resources = DictionaryObject()
+    none_reference = IndirectObject(1, 0, None)
+    resources[NameObject("/Font")] = none_reference
+    page[NameObject("/Resources")] = resources
+    with mock.patch.object(none_reference, "get_object", return_value=None):
+        assert page.extract_text() == ""