5959from .constants import ImageAttributes as IA
6060from .constants import LzwFilterParameters as LZW
6161from .constants import StreamAttributes as SA
62- from .errors import DependencyError , PdfReadError , PdfStreamError
62+ from .errors import DependencyError , LimitReachedError , PdfReadError , PdfStreamError
6363from .generic import (
6464 ArrayObject ,
6565 DictionaryObject ,
6969 is_null_or_none ,
7070)
7171
ZLIB_MAX_OUTPUT_LENGTH = 75_000_000


def _decompress_with_limit(data: bytes) -> bytes:
    """
    Inflate ``data`` with zlib while capping the output size.

    At most ``ZLIB_MAX_OUTPUT_LENGTH`` bytes are produced; a value of 0
    disables the cap, since zlib treats ``max_length=0`` as "unlimited".

    Args:
        data: The zlib-compressed input bytes.

    Returns:
        The decompressed bytes.

    Raises:
        LimitReachedError: If compressed input remains unconsumed once
            the output cap has been reached.

    """
    inflater = zlib.decompressobj()
    inflated = inflater.decompress(data, max_length=ZLIB_MAX_OUTPUT_LENGTH)
    pending = inflater.unconsumed_tail
    if not pending:
        return inflated
    raise LimitReachedError(
        f"Limit reached while decompressing. {len(pending)} bytes remaining."
    )
7284
def decompress(data: bytes) -> bytes:
    """
    Decompress the given data using zlib.

    This function attempts to decompress the input data using zlib.
    If the decompression fails due to a zlib error, it falls back
    to using a decompression object with a larger window size.

    Please note that the output length is limited to avoid memory
    issues. If you need to process larger content streams, consider
    adapting ``pypdf.filters.ZLIB_MAX_OUTPUT_LENGTH``. In case you
    are only dealing with trusted inputs and/or want to disable these
    limits, set the value to `0`.

    Args:
        data: The input data to be decompressed.

    Returns:
        The decompressed data.

    Raises:
        LimitReachedError: If the decompressed output would exceed
            ``ZLIB_MAX_OUTPUT_LENGTH`` while the limit is enabled.

    """
    try:
        return _decompress_with_limit(data)
    except zlib.error:
        # First quick approach: There are known issues with faulty added bytes to the
        # tail of the encoded stream from early Adobe Distiller or Pitstop versions
        # with CR char as the default line separator (assumed by reverse engineering)
        # that breaks the decoding process in the end.
        #
        # Try first to cut off some of the tail byte by byte, but limited to not
        # iterate through too many loops and kill the performance for large streams,
        # to then allow the final fallback to run. Added this intermediate attempt,
        # because starting from the head of the stream byte by byte kills completely
        # the performance for large streams (e.g., 6 MB) with the tail-byte-issue
        # and takes ages. This solution is really fast:
        max_tail_cut_off_bytes: int = 8
        for i in range(1, min(max_tail_cut_off_bytes + 1, len(data))):
            try:
                return _decompress_with_limit(data[:-i])
            except zlib.error:
                pass

        # If still failing, then try with increased window size, feeding the
        # stream one byte at a time and skipping bytes that raise.
        decompressor = zlib.decompressobj(zlib.MAX_WBITS | 32)
        # Bug fix: zlib treats max_length=0 as "unlimited", which is how the
        # documented opt-out (ZLIB_MAX_OUTPUT_LENGTH = 0) works in the paths
        # above. The limit accounting below must therefore be skipped when
        # the limit is disabled; otherwise the first decompressed chunk
        # would spuriously raise LimitReachedError.
        limit_enabled = ZLIB_MAX_OUTPUT_LENGTH > 0
        remaining_limit = ZLIB_MAX_OUTPUT_LENGTH
        # Collect chunks and join once at the end instead of quadratic
        # bytes concatenation; slice single bytes lazily instead of
        # materializing a list of len(data) one-byte objects.
        chunks = []
        for index in range(len(data)):
            try:
                decompressed = decompressor.decompress(
                    data[index:index + 1], max_length=remaining_limit
                )
            except zlib.error:
                continue
            chunks.append(decompressed)
            if limit_enabled:
                remaining_limit -= len(decompressed)
                if remaining_limit <= 0:
                    raise LimitReachedError(
                        f"Limit reached while decompressing. {len(data) - index} bytes remaining."
                    )
        return b"".join(chunks)
121144
122145
123146class FlateDecode :
@@ -732,7 +755,7 @@ def decode_stream_data(stream: Any) -> bytes:
732755 if not isinstance (decode_parms , (list , tuple )):
733756 decode_parms = (decode_parms ,)
734757 data : bytes = stream ._data
735- # If there is not data to decode we should not try to decode the data .
758+ # If there is no data to decode, we should not try to decode it .
736759 if not data :
737760 return data
738761 for filter_name , params in zip (filters , decode_parms ):
0 commit comments