Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 28 additions & 6 deletions Lib/pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,11 @@ def __init__(self, value):
__all__.extend(x for x in dir() if x.isupper() and not x.startswith('_'))


# Data larger than this will be read in chunks, to prevent extreme
# overallocation.  A size prefix read from the stream is untrusted: a
# small, malicious pickle could otherwise advertise a huge size and force
# a huge up-front allocation.  Chunked, geometrically growing reads cap
# the initial buffer at this many bytes.
_MIN_READ_BUF_SIZE = (1 << 20)


class _Framer:

_FRAME_SIZE_MIN = 4
Expand Down Expand Up @@ -288,7 +293,7 @@ def read(self, n):
"pickle exhausted before end of frame")
return data
else:
return self.file_read(n)
return self._chunked_file_read(n)

def readline(self):
if self.current_frame:
Expand All @@ -303,11 +308,23 @@ def readline(self):
else:
return self.file_readline()

def _chunked_file_read(self, size):
    # Read up to *size* bytes from the underlying file, growing the
    # buffer geometrically (starting at _MIN_READ_BUF_SIZE) so that a
    # forged size prefix cannot force a huge up-front allocation.
    # May return fewer than *size* bytes if the stream is exhausted.
    total = min(size, _MIN_READ_BUF_SIZE)
    data = self.file_read(total)
    while total < size:
        if len(data) != total:
            # Short read: the stream ended early; stop growing.
            break
        step = min(total, size - total)
        data += self.file_read(step)
        total += step
    return data

def load_frame(self, frame_size):
    # Begin a new frame of *frame_size* bytes; the previous frame, if
    # any, must already have been fully consumed.
    if self.current_frame:
        if self.current_frame.read() != b'':
            raise UnpicklingError(
                "beginning of a new frame before end of current frame")
    # Read in bounded, geometrically growing chunks so a forged frame
    # size cannot trigger an enormous allocation before any data exists.
    data = self._chunked_file_read(frame_size)
    if len(data) < frame_size:
        raise EOFError
    self.current_frame = io.BytesIO(data)


# Tools used for pickling.
Expand Down Expand Up @@ -1497,12 +1514,17 @@ def load_binbytes8(self):
dispatch[BINBYTES8[0]] = load_binbytes8

def load_bytearray8(self):
    # BYTEARRAY8: an 8-byte little-endian length followed by that many
    # payload bytes.
    size, = unpack('<Q', self.read(8))
    if size > maxsize:
        raise UnpicklingError("BYTEARRAY8 exceeds system's maximum size "
                              "of %d bytes" % maxsize)
    # Start with a bounded buffer and grow it geometrically, so a forged
    # length cannot force allocating *size* bytes before the data is
    # actually present in the stream.
    filled = min(size, _MIN_READ_BUF_SIZE)
    buf = bytearray(filled)
    if self.readinto(buf) == filled:
        while filled < size and len(buf) == filled:
            step = min(filled, size - filled)
            buf += self.read(step)
            filled += step
    self.append(buf)
dispatch[BYTEARRAY8[0]] = load_bytearray8

Expand Down
153 changes: 150 additions & 3 deletions Lib/test/pickletester.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,15 @@ def count_opcode(code, pickle):
def identity(x):
return x

def itersize(start, stop):
    # Yield a geometrically increasing sequence of sizes from *start*
    # up to *stop* for tests; *stop* itself is always the final value.
    current = start
    while current < stop:
        yield current
        current *= 2
    yield stop


class UnseekableIO(io.BytesIO):
def peek(self, *args):
Expand Down Expand Up @@ -853,9 +862,8 @@ def assert_is_copy(self, obj, objcopy, msg=None):
self.assertEqual(getattr(obj, slot, None),
getattr(objcopy, slot, None), msg=msg)

def check_unpickling_error(self, errors, data):
with self.subTest(data=data), \
self.assertRaises(errors):
def check_unpickling_error_strict(self, errors, data):
with self.assertRaises(errors):
try:
self.loads(data)
except BaseException as exc:
Expand All @@ -864,6 +872,10 @@ def check_unpickling_error(self, errors, data):
(data, exc.__class__.__name__, exc))
raise

def check_unpickling_error(self, errors, data):
    # Like check_unpickling_error_strict(), but runs inside a subTest()
    # so a failure is reported together with the offending pickle *data*.
    with self.subTest(data=data):
        self.check_unpickling_error_strict(errors, data)

def test_load_from_data0(self):
self.assert_is_copy(self._testdata, self.loads(DATA0))

Expand Down Expand Up @@ -1135,6 +1147,141 @@ def test_negative_32b_binput(self):
dumped = b'\x80\x03X\x01\x00\x00\x00ar\xff\xff\xff\xff.'
self.check_unpickling_error(ValueError, dumped)

def test_too_large_put(self):
    # Test that PUT with a large id does not cause allocation of a
    # too large memo table.
    # The result is ([], []) rather than an error because PUT merely
    # records the list under the given memo key and GET reads it back:
    # arbitrarily large keys are valid as long as the memo is stored
    # sparsely instead of being preallocated as a flat table.
    data = lambda n: (b'((lp' + str(n).encode() + b'\n' +
                      b'g' + str(n).encode() + b'\nt.')
    #  0: ( MARK
    #  1: (     MARK
    #  2: l         LIST   (MARK at 1)
    #  3: p     PUT    1000000000000
    # 18: g     GET    1000000000000
    # 33: t     TUPLE  (MARK at 0)
    # 34: . STOP
    for idx in [10**6, 10**9, 10**12]:
        if idx > sys.maxsize:
            continue
        self.assertEqual(self.loads(data(idx)), ([],)*2)

def test_too_large_long_binput(self):
    # Test that LONG_BINPUT with large id does not cause allocation of
    # too large memo table.
    # Loads to ([], []) rather than raising: LONG_BINPUT just stores the
    # list under the given memo key and LONG_BINGET fetches it back, so
    # any key is legal when the memo is not a preallocated flat array.
    data = lambda n: (b'(]r' + struct.pack('<I', n) +
                      b'j' + struct.pack('<I', n) + b't.')
    #  0: ( MARK
    #  1: ]     EMPTY_LIST
    #  2: r     LONG_BINPUT 4294967295
    #  7: j     LONG_BINGET 4294967295
    # 12: t     TUPLE      (MARK at 0)
    # 13: . STOP
    for idx in itersize(1 << 20, min(sys.maxsize, (1 << 32) - 1)):
        self.assertEqual(self.loads(data(idx)), ([],)*2)

def _test_truncated_data(self, dumped, expected_error=None):
    # Test that instructions to read large data without providing
    # such amount of data do not cause large memory usage.
    # *expected_error* is an (exception, regex) pair suitable for
    # assertRaisesRegex(); defaults to self.truncated_data_error.
    # Exercised three ways: in-memory loads(), a buffered file, and an
    # unbuffered file, since the chunked-read paths differ.
    if expected_error is None:
        expected_error = self.truncated_data_error
    # BytesIO
    with self.assertRaisesRegex(*expected_error):
        self.loads(dumped)
    if hasattr(self, 'unpickler'):
        try:
            with open(TESTFN, 'wb') as f:
                f.write(dumped)
            # buffered file
            with open(TESTFN, 'rb') as f:
                u = self.unpickler(f)
                with self.assertRaisesRegex(*expected_error):
                    u.load()
            # unbuffered file
            with open(TESTFN, 'rb', buffering=0) as f:
                u = self.unpickler(f)
                with self.assertRaisesRegex(*expected_error):
                    u.load()
        finally:
            os_helper.unlink(TESTFN)

def test_truncated_large_binstring(self):
    # BINSTRING advertises a 4-byte size while only 5 payload bytes
    # follow; loading must fail without first allocating *size* bytes.
    data = lambda size: b'T' + struct.pack('<I', size) + b'.' * 5
    # 0: T BINSTRING '....'
    # 9: . STOP
    self.assertEqual(self.loads(data(4)), '....')  # self-testing
    for size in itersize(1 << 10, min(sys.maxsize - 5, (1 << 31) - 1)):
        self._test_truncated_data(data(size))
    # 1 << 31 is negative as a signed 32-bit count.
    self._test_truncated_data(data(1 << 31),
        (pickle.UnpicklingError, 'truncated|exceeds|negative byte count'))

def test_truncated_large_binunicode(self):
    # BINUNICODE advertises a 4-byte size while only 5 payload bytes
    # follow; loading must fail without a huge up-front allocation.
    data = lambda size: b'X' + struct.pack('<I', size) + b'.' * 5
    # 0: X BINUNICODE '....'
    # 9: . STOP
    self.assertEqual(self.loads(data(4)), '....')  # self-testing
    for size in itersize(1 << 10, min(sys.maxsize - 5, (1 << 32) - 1)):
        self._test_truncated_data(data(size))

def test_truncated_large_binbytes(self):
    # BINBYTES advertises a 4-byte size while only 5 payload bytes
    # follow; loading must fail without a huge up-front allocation.
    data = lambda size: b'B' + struct.pack('<I', size) + b'.' * 5
    # 0: B BINBYTES b'....'
    # 9: . STOP
    self.assertEqual(self.loads(data(4)), b'....')  # self-testing
    for size in itersize(1 << 10, min(sys.maxsize, 1 << 31)):
        self._test_truncated_data(data(size))

def test_truncated_large_long4(self):
    # LONG4 advertises a 4-byte digit count while only 5 payload bytes
    # follow; loading must fail without a huge up-front allocation.
    data = lambda size: b'\x8b' + struct.pack('<I', size) + b'.' * 5
    # 0: \x8b LONG4 0x2e2e2e2e
    # 9: . STOP
    self.assertEqual(self.loads(data(4)), 0x2e2e2e2e)  # self-testing
    for size in itersize(1 << 10, min(sys.maxsize - 5, (1 << 31) - 1)):
        self._test_truncated_data(data(size))
    # 1 << 31 is negative as a signed 32-bit count.
    self._test_truncated_data(data(1 << 31),
        (pickle.UnpicklingError, 'LONG pickle has negative byte count'))

def test_truncated_large_frame(self):
    # FRAME advertises an 8-byte frame size while only 2 bytes follow;
    # loading must fail without allocating the advertised frame size.
    data = lambda size: b'\x95' + struct.pack('<Q', size) + b'N.'
    #  0: \x95 FRAME 2
    #  9: N NONE
    # 10: . STOP
    self.assertIsNone(self.loads(data(2)))  # self-testing
    for size in itersize(1 << 10, sys.maxsize - 9):
        self._test_truncated_data(data(size))
    if sys.maxsize + 1 < 1 << 64:
        # Frame size that cannot fit in Py_ssize_t.
        self._test_truncated_data(data(sys.maxsize + 1),
            ((OverflowError, ValueError),
             'FRAME length exceeds|frame size > sys.maxsize'))

def test_truncated_large_binunicode8(self):
    # BINUNICODE8 advertises an 8-byte size while only 5 payload bytes
    # follow; loading must fail without a huge up-front allocation.
    data = lambda size: b'\x8d' + struct.pack('<Q', size) + b'.' * 5
    #  0: \x8d BINUNICODE8 '....'
    # 13: . STOP
    self.assertEqual(self.loads(data(4)), '....')  # self-testing
    for size in itersize(1 << 10, sys.maxsize - 9):
        self._test_truncated_data(data(size))
    if sys.maxsize + 1 < 1 << 64:
        # Size that cannot fit in Py_ssize_t.
        self._test_truncated_data(data(sys.maxsize + 1), self.size_overflow_error)

def test_truncated_large_binbytes8(self):
    # BINBYTES8 advertises an 8-byte size while only 5 payload bytes
    # follow; loading must fail without a huge up-front allocation.
    data = lambda size: b'\x8e' + struct.pack('<Q', size) + b'.' * 5
    #  0: \x8e BINBYTES8 b'....'
    # 13: . STOP
    self.assertEqual(self.loads(data(4)), b'....')  # self-testing
    for size in itersize(1 << 10, sys.maxsize):
        self._test_truncated_data(data(size))
    if sys.maxsize + 1 < 1 << 64:
        # Size that cannot fit in Py_ssize_t.
        self._test_truncated_data(data(sys.maxsize + 1), self.size_overflow_error)

def test_truncated_large_bytearray8(self):
    # BYTEARRAY8 advertises an 8-byte size while only 5 payload bytes
    # follow; loading must fail without a huge up-front allocation.
    data = lambda size: b'\x96' + struct.pack('<Q', size) + b'.' * 5
    #  0: \x96 BYTEARRAY8 bytearray(b'....')
    # 13: . STOP
    self.assertEqual(self.loads(data(4)), bytearray(b'....'))  # self-testing
    for size in itersize(1 << 10, sys.maxsize):
        self._test_truncated_data(data(size))
    if sys.maxsize + 1 < 1 << 64:
        # Size that cannot fit in Py_ssize_t.
        self._test_truncated_data(data(sys.maxsize + 1), self.size_overflow_error)

def test_badly_escaped_string(self):
self.check_unpickling_error(ValueError, b"S'\\'\n.")

Expand Down
8 changes: 7 additions & 1 deletion Lib/test/test_pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ class PyUnpicklerTests(AbstractUnpickleTests, unittest.TestCase):
truncated_errors = (pickle.UnpicklingError, EOFError,
AttributeError, ValueError,
struct.error, IndexError, ImportError)
truncated_data_error = (EOFError, '')
size_overflow_error = (pickle.UnpicklingError, 'exceeds')

def loads(self, buf, **kwds):
f = io.BytesIO(buf)
Expand Down Expand Up @@ -96,6 +98,8 @@ class InMemoryPickleTests(AbstractPickleTests, AbstractUnpickleTests,
truncated_errors = (pickle.UnpicklingError, EOFError,
AttributeError, ValueError,
struct.error, IndexError, ImportError)
truncated_data_error = ((pickle.UnpicklingError, EOFError), '')
size_overflow_error = ((OverflowError, pickle.UnpicklingError), 'exceeds')

def dumps(self, arg, protocol=None, **kwargs):
return pickle.dumps(arg, protocol, **kwargs)
Expand Down Expand Up @@ -368,6 +372,8 @@ class CUnpicklerTests(PyUnpicklerTests):
unpickler = _pickle.Unpickler
bad_stack_errors = (pickle.UnpicklingError,)
truncated_errors = (pickle.UnpicklingError,)
truncated_data_error = (pickle.UnpicklingError, 'truncated')
size_overflow_error = (OverflowError, 'exceeds')

class CPicklingErrorTests(PyPicklingErrorTests):
pickler = _pickle.Pickler
Expand Down Expand Up @@ -471,7 +477,7 @@ def test_pickler(self):
0) # Write buffer is cleared after every dump().

def test_unpickler(self):
basesize = support.calcobjsize('2P2n2P 2P2n2i5P 2P3n8P2n2i')
basesize = support.calcobjsize('2P2n3P 2P2n2i5P 2P3n8P2n2i')
unpickler = _pickle.Unpickler
P = struct.calcsize('P') # Size of memo table entry.
n = struct.calcsize('n') # Size of mark table entry.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
:mod:`pickle` now avoids allocating an arbitrarily large amount of memory
based on small untrusted input when unpickling specially crafted data.
Loading
Loading