Skip to content

Commit 9ee8a5a

Browse files
ROB: Handle some None values for broken PDF files (#3230)
These issues were discovered while extracting text and images from some PDF files that were incomplete and had only been partially repaired.
1 parent 7f7fd95 commit 9ee8a5a

File tree

2 files changed

+42
-7
lines changed

2 files changed

+42
-7
lines changed

pypdf/_page.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1044,10 +1044,11 @@ def get_contents(self) -> Optional[ContentStream]:
10441044
pdf = cast(IndirectObject, self.indirect_reference).pdf
10451045
except AttributeError:
10461046
pdf = None
1047-
obj = self[PG.CONTENTS].get_object()
1048-
if isinstance(obj, NullObject):
1047+
obj = self[PG.CONTENTS]
1048+
if is_null_or_none(obj):
10491049
return None
1050-
return ContentStream(obj, pdf)
1050+
resolved_object = obj.get_object()
1051+
return ContentStream(resolved_object, pdf)
10511052
return None
10521053

10531054
def replace_contents(
@@ -1846,8 +1847,8 @@ def _extract_text(
18461847
# file as not damaged, no need to check for TJ or Tj
18471848
return ""
18481849

1849-
if "/Font" in resources_dict:
1850-
for f in cast(DictionaryObject, resources_dict["/Font"]):
1850+
if "/Font" in resources_dict and (font := resources_dict["/Font"]):
1851+
for f in cast(DictionaryObject, font):
18511852
cmaps[f] = build_char_map(f, space_width, obj)
18521853
cmap: Tuple[
18531854
Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
@@ -1864,7 +1865,7 @@ def _extract_text(
18641865
)
18651866
if not isinstance(content, ContentStream):
18661867
content = ContentStream(content, pdf, "bytes")
1867-
except KeyError: # no content can be extracted (certainly empty page)
1868+
except (AttributeError, KeyError): # no content can be extracted (certainly empty page)
18681869
return ""
18691870
# We check all strings are TextStringObjects. ByteStringObjects
18701871
# are strings where the byte->string encoding was unknown, so adding

tests/test_page.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
from io import BytesIO
66
from pathlib import Path
77
from random import shuffle
8-
from typing import List, Tuple
8+
from typing import Any, List, Tuple
9+
from unittest import mock
910

1011
import pytest
1112

@@ -1471,3 +1472,36 @@ def test_recursive_get_page_from_node():
14711472
writer.insert_page(writer.pages[0], -1)
14721473
with pytest.raises(ValueError):
14731474
writer.insert_page(writer.pages[0], -10)
1475+
1476+
1477+
def test_get_contents__none_type():
1478+
# This can be observed in real documents as well, but those might be
1479+
# confidential. Thus, use a more complex dummy implementation here,
1480+
# because a value of `None` cannot be assigned from code; it can only
1481+
# originate from the PDF file itself.
1482+
class MyPage(PageObject):
1483+
def __contains__(self, item) -> bool:
1484+
assert item == "/Contents"
1485+
return True
1486+
1487+
def __getitem__(self, item) -> Any:
1488+
assert item == "/Contents"
1489+
1490+
page = MyPage()
1491+
assert page.get_contents() is None
1492+
1493+
1494+
def test_extract_text__none_type():
1495+
class MyPage(PageObject):
1496+
def __getitem__(self, item) -> Any:
1497+
if item == "/Contents":
1498+
return None
1499+
return super().__getitem__(item)
1500+
1501+
page = MyPage()
1502+
resources = DictionaryObject()
1503+
none_reference = IndirectObject(1, 0, None)
1504+
resources[NameObject("/Font")] = none_reference
1505+
page[NameObject("/Resources")] = resources
1506+
with mock.patch.object(none_reference, "get_object", return_value=None):
1507+
assert page.extract_text() == ""

0 commit comments

Comments
 (0)