5959from .constants import ImageAttributes as IA
6060from .constants import LzwFilterParameters as LZW
6161from .constants import StreamAttributes as SA
62- from .errors import DependencyError , PdfReadError , PdfStreamError
62+ from .errors import DependencyError , LimitReachedError , PdfReadError , PdfStreamError
6363from .generic import (
6464 ArrayObject ,
6565 DictionaryObject ,
6969 is_null_or_none ,
7070)
7171
ZLIB_MAX_OUTPUT_LENGTH = 75_000_000


def _decompress_with_limit(data: bytes) -> bytes:
    """
    Inflate ``data`` with zlib while capping the output size.

    At most ``ZLIB_MAX_OUTPUT_LENGTH`` bytes are produced; a value of 0
    disables the cap, since zlib treats ``max_length=0`` as "unlimited".

    Args:
        data: The zlib-compressed input bytes.

    Returns:
        The decompressed bytes.

    Raises:
        LimitReachedError: If compressed input remains unconsumed once
            the output cap has been reached.

    """
    inflater = zlib.decompressobj()
    inflated = inflater.decompress(data, max_length=ZLIB_MAX_OUTPUT_LENGTH)
    pending = inflater.unconsumed_tail
    if not pending:
        return inflated
    raise LimitReachedError(
        f"Limit reached while decompressing. {len(pending)} bytes remaining."
    )
7284
def decompress(data: bytes) -> bytes:
    """
    Decompress the given data using zlib.

    This function attempts to decompress the input data using zlib.
    If the decompression fails due to a zlib error, it falls back
    to using a decompression object with a larger window size.

    Please note that the output length is limited to avoid memory
    issues. If you need to process larger content streams, consider
    adapting ``pypdf.filters.ZLIB_MAX_OUTPUT_LENGTH``. In case you
    are only dealing with trusted inputs and/or want to disable these
    limits, set the value to `0`.

    Args:
        data: The input data to be decompressed.

    Returns:
        The decompressed data.

    Raises:
        LimitReachedError: If the decompressed output would exceed
            ``ZLIB_MAX_OUTPUT_LENGTH`` while the limit is enabled.

    """
    try:
        return _decompress_with_limit(data)
    except zlib.error:
        # First quick approach: There are known issues with faulty added bytes to the
        # tail of the encoded stream from early Adobe Distiller or Pitstop versions
        # with CR char as the default line separator (assumed by reverse engineering)
        # that breaks the decoding process in the end.
        #
        # Try first to cut off some of the tail byte by byte, but limited to not
        # iterate through too many loops and kill the performance for large streams,
        # to then allow the final fallback to run. Added this intermediate attempt,
        # because starting from the head of the stream byte by byte kills completely
        # the performance for large streams (e.g., 6 MB) with the tail-byte-issue
        # and takes ages. This solution is really fast:
        max_tail_cut_off_bytes: int = 8
        for i in range(1, min(max_tail_cut_off_bytes + 1, len(data))):
            try:
                return _decompress_with_limit(data[:-i])
            except zlib.error:
                pass

        # If still failing, then try with increased window size, feeding the
        # stream one byte at a time and skipping bytes that raise.
        decompressor = zlib.decompressobj(zlib.MAX_WBITS | 32)
        # Bug fix: zlib treats max_length=0 as "unlimited", which is how the
        # documented opt-out (ZLIB_MAX_OUTPUT_LENGTH = 0) works in the paths
        # above. The limit accounting below must therefore be skipped when
        # the limit is disabled; otherwise the first decompressed chunk
        # would spuriously raise LimitReachedError.
        limit_enabled = ZLIB_MAX_OUTPUT_LENGTH > 0
        remaining_limit = ZLIB_MAX_OUTPUT_LENGTH
        # Collect chunks and join once at the end instead of quadratic
        # bytes concatenation; slice single bytes lazily instead of
        # materializing a list of len(data) one-byte objects.
        chunks = []
        for index in range(len(data)):
            try:
                decompressed = decompressor.decompress(
                    data[index:index + 1], max_length=remaining_limit
                )
            except zlib.error:
                continue
            chunks.append(decompressed)
            if limit_enabled:
                remaining_limit -= len(decompressed)
                if remaining_limit <= 0:
                    raise LimitReachedError(
                        f"Limit reached while decompressing. {len(data) - index} bytes remaining."
                    )
        return b"".join(chunks)
121144
122145
123146class FlateDecode :
@@ -732,7 +755,7 @@ def decode_stream_data(stream: Any) -> bytes:
732755 if not isinstance (decode_parms , (list , tuple )):
733756 decode_parms = (decode_parms ,)
734757 data : bytes = stream ._data
735- # If there is not data to decode we should not try to decode the data .
758+ # If there is no data to decode, we should not try to decode it .
736759 if not data :
737760 return data
738761 for filter_name , params in zip (filters , decode_parms ):
0 commit comments