From 822230d411fed5d92706be9413440dea093e2905 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka
Date: Mon, 20 May 2024 12:35:59 +0300
Subject: [PATCH 01/12] gh-115952: Fix vulnerability in the pickle module

Loading small data which does not even involve arbitrary code execution
could consume an arbitrarily large amount of memory. There were three
issues:

* PUT and LONG_BINPUT with a large argument (the C implementation only).
  Since the memo is implemented in C as a contiguous dynamic array, a
  single opcode can cause it to be resized to an arbitrary size. Now the
  sparsity of memo indices is limited.
* BINBYTES, BINBYTES8 and BYTEARRAY8 with a large argument. They allocated
  the bytes or bytearray object of the specified size before reading into
  it. Now they read very large data in chunks.
* BINSTRING, BINUNICODE, LONG4, BINUNICODE8 and FRAME with a large
  argument. They read the whole data by calling the read() method of the
  underlying file object, which usually allocates the bytes object of the
  specified size before reading into it. Now they read very large data in
  chunks.
---
 Lib/pickle.py                                 |  34 ++-
 Lib/test/pickletester.py                      | 102 ++++++++-
 Lib/test/test_pickle.py                       |  38 ++++
 ...-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst |   3 +
 Modules/_pickle.c                             | 209 ++++++++++++------
 5 files changed, 304 insertions(+), 82 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Library/2024-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst

diff --git a/Lib/pickle.py b/Lib/pickle.py
index 33c97c8c5efb28..2e190a0e02124e 100644
--- a/Lib/pickle.py
+++ b/Lib/pickle.py
@@ -191,6 +191,11 @@ def __init__(self, value):
 __all__.extend([x for x in dir() if re.match("[A-Z][A-Z0-9_]+$", x)])
 
 
+# Data larger that this will be read in chunks, to prevent extreme
+# overallocation.
+SAFE_BUF_SIZE = (1 << 20)
+
+
 class _Framer:
 
     _FRAME_SIZE_MIN = 4
@@ -289,7 +294,7 @@ def read(self, n):
                     "pickle exhausted before end of frame")
             return data
         else:
-            return self.file_read(n)
+            return self.safe_file_read(n)
 
     def readline(self):
         if self.current_frame:
@@ -304,11 +309,23 @@ def readline(self):
         else:
             return self.file_readline()
 
+    def safe_file_read(self, size):
+        cursize = min(size, SAFE_BUF_SIZE)
+        b = self.file_read(cursize)
+        while cursize < size and len(b) == cursize:
+            delta = min(cursize, size - cursize)
+            b += self.file_read(delta)
+            cursize += delta
+        return b
+
     def load_frame(self, frame_size):
        if self.current_frame and self.current_frame.read() != b'':
            raise UnpicklingError(
                "beginning of a new frame before end of current frame")
-        self.current_frame = io.BytesIO(self.file_read(frame_size))
+        data = self.safe_file_read(frame_size)
+        if len(data) < frame_size:
+            raise EOFError
+        self.current_frame = io.BytesIO(data)
 
 
 # Tools used for pickling.
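
For orientation while reading the rest of this patch, here is a minimal standalone sketch of the chunked-read loop that safe_file_read() above implements. The read_chunked helper, its file_read parameter and the 8 GiB example are illustrative only and not part of the patch; SAFE_BUF_SIZE mirrors the 1 MiB constant added above. The size prefix is never trusted up front: the first request is capped at SAFE_BUF_SIZE and later requests grow geometrically, so a truncated stream ends the loop after a small allocation.

import io

SAFE_BUF_SIZE = 1 << 20  # 1 MiB, same cap as the constant added above

def read_chunked(file_read, size):
    # Request at most SAFE_BUF_SIZE first; grow the request geometrically
    # and stop as soon as the stream returns less than was asked for.
    cursize = min(size, SAFE_BUF_SIZE)
    b = file_read(cursize)
    while cursize < size and len(b) == cursize:
        delta = min(cursize, size - cursize)
        b += file_read(delta)
        cursize += delta
    return b

# A FRAME or BINBYTES8 header claiming 8 GiB but backed by only five bytes
# of data now costs a few bytes instead of an 8 GiB allocation attempt.
assert read_chunked(io.BytesIO(b"abcde").read, 8 << 30) == b"abcde"

The C changes later in this patch (_Unpickler_ReadFromFile, load_counted_binbytes, load_counted_bytearray) follow the same pattern.
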
@@ -1378,12 +1395,17 @@ def load_binbytes8(self): dispatch[BINBYTES8[0]] = load_binbytes8 def load_bytearray8(self): - len, = unpack(' maxsize: + size, = unpack(' maxsize: raise UnpicklingError("BYTEARRAY8 exceeds system's maximum size " "of %d bytes" % maxsize) - b = bytearray(len) - self.readinto(b) + cursize = min(size, SAFE_BUF_SIZE) + b = bytearray(cursize) + if self.readinto(b) == cursize: + while cursize < size and len(b) == cursize: + delta = min(cursize, size - cursize) + b += self.read(delta) + cursize += delta self.append(b) dispatch[BYTEARRAY8[0]] = load_bytearray8 diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 93e7dbbd103934..769fd99fb1381c 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -853,9 +853,8 @@ def assert_is_copy(self, obj, objcopy, msg=None): self.assertEqual(getattr(obj, slot, None), getattr(objcopy, slot, None), msg=msg) - def check_unpickling_error(self, errors, data): - with self.subTest(data=data), \ - self.assertRaises(errors): + def check_unpickling_error_strict(self, errors, data): + with self.assertRaises(errors): try: self.loads(data) except BaseException as exc: @@ -864,6 +863,10 @@ def check_unpickling_error(self, errors, data): (data, exc.__class__.__name__, exc)) raise + def check_unpickling_error(self, errors, data): + with self.subTest(data=data): + self.check_unpickling_error_strict(errors, data) + def test_load_from_data0(self): self.assert_is_copy(self._testdata, self.loads(DATA0)) @@ -1115,6 +1118,99 @@ def test_negative_32b_binput(self): dumped = b'\x80\x03X\x01\x00\x00\x00ar\xff\xff\xff\xff.' self.check_unpickling_error(ValueError, dumped) + def itersize(self, start, stop): + size = start + while size < stop: + yield size + size <<= 1 + + def _test_truncated_data(self, dumped, expected_error=None): + if expected_error is None: + expected_error = self.truncated_data_error + # BytesIO + with self.assertRaisesRegex(*expected_error): + self.loads(dumped) + if hasattr(self, 'unpickler'): + try: + with open(TESTFN, 'wb') as f: + f.write(dumped) + # buffered file + with open(TESTFN, 'rb') as f: + u = self.unpickler(f) + with self.assertRaisesRegex(*expected_error): + u.load() + # unbuffered file + with open(TESTFN, 'rb', buffering=0) as f: + u = self.unpickler(f) + with self.assertRaisesRegex(*expected_error): + u.load() + finally: + os_helper.unlink(TESTFN) + + def test_truncated_large_binstring(self): + data = lambda size: b'T' + struct.pack(' sys.maxsize')) + + def test_truncated_large_binunicode8(self): + data = lambda size: b'\x8d' + struct.pack('readinto) { + /* readinto() not supported on file-like object, fall back to read() + * and copy into destination buffer (bpo-39681) */ + PyObject* len = PyLong_FromSsize_t(n); + if (len == NULL) { + return -1; + } + PyObject* data = _Pickle_FastCall(self->read, len); + if (data == NULL) { + return -1; + } + if (!PyBytes_Check(data)) { + PyErr_Format(PyExc_ValueError, + "read() returned non-bytes object (%R)", + Py_TYPE(data)); + Py_DECREF(data); + return -1; + } + Py_ssize_t read_size = PyBytes_GET_SIZE(data); + if (read_size < n) { + Py_DECREF(data); + return bad_readline(state); + } + memcpy(buf, PyBytes_AS_STRING(data), n); + Py_DECREF(data); + return n; + } + + /* Call readinto() into user buffer */ + PyObject *buf_obj = PyMemoryView_FromMemory(buf, n, PyBUF_WRITE); + if (buf_obj == NULL) { + return -1; + } + PyObject *read_size_obj = _Pickle_FastCall(self->readinto, buf_obj); + if (read_size_obj == NULL) { + return -1; + } + Py_ssize_t read_size = 
PyLong_AsSsize_t(read_size_obj); + Py_DECREF(read_size_obj); + + if (read_size < 0) { + if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ValueError, + "readinto() returned negative size"); + } + return -1; + } + if (read_size < n) { + return bad_readline(state); + } + return n; +} + /* If reading from a file, we need to only pull the bytes we need, since there may be multiple pickle objects arranged contiguously in the same input buffer. @@ -1256,7 +1319,7 @@ static const Py_ssize_t READ_WHOLE_LINE = -1; causing the Unpickler to go back to the file for more data. Use the returned size to tell you how much data you can process. */ static Py_ssize_t -_Unpickler_ReadFromFile(UnpicklerObject *self, Py_ssize_t n) +_Unpickler_ReadFromFile(PickleState *state, UnpicklerObject *self, Py_ssize_t n) { PyObject *data; Py_ssize_t read_size; @@ -1268,6 +1331,9 @@ _Unpickler_ReadFromFile(UnpicklerObject *self, Py_ssize_t n) if (n == READ_WHOLE_LINE) { data = PyObject_CallNoArgs(self->readline); + if (data == NULL) { + return -1; + } } else { PyObject *len; @@ -1292,13 +1358,28 @@ _Unpickler_ReadFromFile(UnpicklerObject *self, Py_ssize_t n) return n; } } - len = PyLong_FromSsize_t(n); + Py_ssize_t cursize = Py_MIN(n, SAFE_BUF_SIZE); + len = PyLong_FromSsize_t(cursize); if (len == NULL) return -1; data = _Pickle_FastCall(self->read, len); + if (data == NULL) { + return -1; + } + while (cursize < n) { + Py_ssize_t prevsize = cursize; + cursize += Py_MIN(cursize, n - cursize); + if (_PyBytes_Resize(&data, cursize) < 0) { + return -1; + } + if (_Unpickler_ReadIntoFromFile(state, self, + PyBytes_AS_STRING(data) + prevsize, cursize - prevsize) < 0) + { + Py_DECREF(data); + return -1; + } + } } - if (data == NULL) - return -1; read_size = _Unpickler_SetStringInput(self, data); Py_DECREF(data); @@ -1325,7 +1406,7 @@ _Unpickler_ReadImpl(UnpicklerObject *self, PickleState *st, char **s, Py_ssize_t return bad_readline(st); /* Extend the buffer to satisfy desired size */ - num_read = _Unpickler_ReadFromFile(self, n); + num_read = _Unpickler_ReadFromFile(st, self, n); if (num_read < 0) return -1; if (num_read < n) @@ -1372,57 +1453,7 @@ _Unpickler_ReadInto(PickleState *state, UnpicklerObject *self, char *buf, return -1; } - if (!self->readinto) { - /* readinto() not supported on file-like object, fall back to read() - * and copy into destination buffer (bpo-39681) */ - PyObject* len = PyLong_FromSsize_t(n); - if (len == NULL) { - return -1; - } - PyObject* data = _Pickle_FastCall(self->read, len); - if (data == NULL) { - return -1; - } - if (!PyBytes_Check(data)) { - PyErr_Format(PyExc_ValueError, - "read() returned non-bytes object (%R)", - Py_TYPE(data)); - Py_DECREF(data); - return -1; - } - Py_ssize_t read_size = PyBytes_GET_SIZE(data); - if (read_size < n) { - Py_DECREF(data); - return bad_readline(state); - } - memcpy(buf, PyBytes_AS_STRING(data), n); - Py_DECREF(data); - return n; - } - - /* Call readinto() into user buffer */ - PyObject *buf_obj = PyMemoryView_FromMemory(buf, n, PyBUF_WRITE); - if (buf_obj == NULL) { - return -1; - } - PyObject *read_size_obj = _Pickle_FastCall(self->readinto, buf_obj); - if (read_size_obj == NULL) { - return -1; - } - Py_ssize_t read_size = PyLong_AsSsize_t(read_size_obj); - Py_DECREF(read_size_obj); - - if (read_size < 0) { - if (!PyErr_Occurred()) { - PyErr_SetString(PyExc_ValueError, - "readinto() returned negative size"); - } - return -1; - } - if (read_size < n) { - return bad_readline(state); - } - return n; + return _Unpickler_ReadIntoFromFile(state, self, buf, n); } 
/* Read `n` bytes from the unpickler's data source, storing the result in `*s`. @@ -1482,7 +1513,7 @@ _Unpickler_Readline(PickleState *state, UnpicklerObject *self, char **result) if (!self->read) return bad_readline(state); - num_read = _Unpickler_ReadFromFile(self, READ_WHOLE_LINE); + num_read = _Unpickler_ReadFromFile(state, self, READ_WHOLE_LINE); if (num_read < 0) return -1; if (num_read == 0 || self->input_buffer[num_read - 1] != '\n') @@ -1526,11 +1557,15 @@ _Unpickler_MemoGet(UnpicklerObject *self, size_t idx) /* Returns -1 (with an exception set) on failure, 0 on success. This takes its own reference to `value`. */ static int -_Unpickler_MemoPut(UnpicklerObject *self, size_t idx, PyObject *value) +_Unpickler_MemoPut(PickleState *st, UnpicklerObject *self, size_t idx, PyObject *value) { PyObject *old_item; if (idx >= self->memo_size) { + if (idx > self->memo_len * 2 + (1 << 17)) { + PyErr_SetString(st->UnpicklingError, "too sparse memo indices"); + return -1; + } if (_Unpickler_ResizeMemoList(self, idx * 2) < 0) return -1; assert(idx < self->memo_size); @@ -5451,13 +5486,28 @@ load_counted_binbytes(PickleState *state, UnpicklerObject *self, int nbytes) return -1; } - bytes = PyBytes_FromStringAndSize(NULL, size); - if (bytes == NULL) - return -1; - if (_Unpickler_ReadInto(state, self, PyBytes_AS_STRING(bytes), size) < 0) { - Py_DECREF(bytes); + Py_ssize_t cursize = Py_MIN(size, SAFE_BUF_SIZE); + Py_ssize_t prevsize = 0; + bytes = PyBytes_FromStringAndSize(NULL, cursize); + if (bytes == NULL) { return -1; } + while (1) { + if (_Unpickler_ReadInto(state, self, + PyBytes_AS_STRING(bytes) + prevsize, cursize - prevsize) < 0) + { + Py_DECREF(bytes); + return -1; + } + if (cursize >= size) { + break; + } + prevsize = cursize; + cursize += Py_MIN(cursize, size - cursize); + if (_PyBytes_Resize(&bytes, cursize) < 0) { + return -1; + } + } PDATA_PUSH(self->stack, bytes, -1); return 0; @@ -5482,14 +5532,27 @@ load_counted_bytearray(PickleState *state, UnpicklerObject *self) return -1; } - bytearray = PyByteArray_FromStringAndSize(NULL, size); + Py_ssize_t cursize = Py_MIN(size, SAFE_BUF_SIZE); + Py_ssize_t prevsize = 0; + bytearray = PyByteArray_FromStringAndSize(NULL, cursize); if (bytearray == NULL) { return -1; } - char *str = PyByteArray_AS_STRING(bytearray); - if (_Unpickler_ReadInto(state, self, str, size) < 0) { - Py_DECREF(bytearray); - return -1; + while (1) { + if (_Unpickler_ReadInto(state, self, + PyByteArray_AS_STRING(bytearray) + prevsize, + cursize - prevsize) < 0) { + Py_DECREF(bytearray); + return -1; + } + if (cursize >= size) { + break; + } + prevsize = cursize; + cursize += Py_MIN(cursize, size - cursize); + if (PyByteArray_Resize(bytearray, cursize) < 0) { + return -1; + } } PDATA_PUSH(self->stack, bytearray, -1); @@ -6279,7 +6342,7 @@ load_put(PickleState *state, UnpicklerObject *self) return -1; } - return _Unpickler_MemoPut(self, idx, value); + return _Unpickler_MemoPut(state, self, idx, value); } static int @@ -6298,7 +6361,7 @@ load_binput(PickleState *state, UnpicklerObject *self) idx = Py_CHARMASK(s[0]); - return _Unpickler_MemoPut(self, idx, value); + return _Unpickler_MemoPut(state, self, idx, value); } static int @@ -6322,7 +6385,7 @@ load_long_binput(PickleState *state, UnpicklerObject *self) return -1; } - return _Unpickler_MemoPut(self, idx, value); + return _Unpickler_MemoPut(state, self, idx, value); } static int @@ -6334,7 +6397,7 @@ load_memoize(PickleState *state, UnpicklerObject *self) return Pdata_stack_underflow(state, self->stack); value = 
self->stack->data[Py_SIZE(self->stack) - 1]; - return _Unpickler_MemoPut(self, self->memo_len, value); + return _Unpickler_MemoPut(state, self, self->memo_len, value); } static int @@ -7432,7 +7495,7 @@ Unpickler_set_memo(UnpicklerObject *self, PyObject *obj, void *Py_UNUSED(ignored "memo key must be positive integers."); goto error; } - if (_Unpickler_MemoPut(self, idx, value) < 0) + if (_Unpickler_MemoPut(state, self, idx, value) < 0) goto error; } } From 88f1461cfa9c3f921be1c04ddb6e09581d6375c0 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 20 May 2024 14:01:54 +0300 Subject: [PATCH 02/12] Try to fix tests of 32-bit platforms. --- Lib/test/pickletester.py | 17 ++++++++--------- Lib/test/test_pickle.py | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 769fd99fb1381c..66902599dd538e 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -1123,6 +1123,7 @@ def itersize(self, start, stop): while size < stop: yield size size <<= 1 + yield stop def _test_truncated_data(self, dumped, expected_error=None): if expected_error is None: @@ -1150,29 +1151,27 @@ def _test_truncated_data(self, dumped, expected_error=None): def test_truncated_large_binstring(self): data = lambda size: b'T' + struct.pack(' Date: Mon, 20 May 2024 20:51:10 +0300 Subject: [PATCH 03/12] Try to fix more tests on 32-bit platforms. --- Lib/test/pickletester.py | 6 +++--- Lib/test/test_pickle.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 66902599dd538e..fff54a6f22f23e 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -1151,7 +1151,7 @@ def _test_truncated_data(self, dumped, expected_error=None): def test_truncated_large_binstring(self): data = lambda size: b'T' + struct.pack(' Date: Wed, 22 May 2024 16:18:17 +0300 Subject: [PATCH 04/12] Apply suggestions from code review --- Lib/pickle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index 2e190a0e02124e..9be5c2401c7a9e 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -191,7 +191,7 @@ def __init__(self, value): __all__.extend([x for x in dir() if re.match("[A-Z][A-Z0-9_]+$", x)]) -# Data larger that this will be read in chunks, to prevent extreme +# Data larger than this will be read in chunks, to prevent extreme # overallocation. SAFE_BUF_SIZE = (1 << 20) From d0e667eb848cf28e027645b13546ec961b623f57 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 29 Jun 2024 11:10:31 +0300 Subject: [PATCH 05/12] Remove empty lines. --- Lib/test/test_pickle.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/Lib/test/test_pickle.py b/Lib/test/test_pickle.py index 809bdab06260df..9deec6dc881761 100644 --- a/Lib/test/test_pickle.py +++ b/Lib/test/test_pickle.py @@ -62,7 +62,6 @@ def test_too_large_put(self): for idx in [10**6, 10**9, 10**12]: self.assertEqual(self.loads(data(idx)), ([],)*2) - def test_too_large_long_binput(self): data = lambda n: (b'(]r' + struct.pack(' Date: Sun, 30 Jun 2024 13:32:39 +0300 Subject: [PATCH 06/12] Change names, add more commentis and update the NEWS entry. 
--- Lib/pickle.py | 12 ++++++------ Lib/test/pickletester.py | 2 ++ ...24-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst | 5 ++--- Modules/_pickle.c | 17 ++++++++++++----- 4 files changed, 22 insertions(+), 14 deletions(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index e287e162925989..1e10459e58405e 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -193,7 +193,7 @@ def __init__(self, value): # Data larger than this will be read in chunks, to prevent extreme # overallocation. -SAFE_BUF_SIZE = (1 << 20) +_MIN_READ_BUF_SIZE = (1 << 20) class _Framer: @@ -294,7 +294,7 @@ def read(self, n): "pickle exhausted before end of frame") return data else: - return self.safe_file_read(n) + return self._chunked_file_read(n) def readline(self): if self.current_frame: @@ -309,8 +309,8 @@ def readline(self): else: return self.file_readline() - def safe_file_read(self, size): - cursize = min(size, SAFE_BUF_SIZE) + def _chunked_file_read(self, size): + cursize = min(size, _MIN_READ_BUF_SIZE) b = self.file_read(cursize) while cursize < size and len(b) == cursize: delta = min(cursize, size - cursize) @@ -322,7 +322,7 @@ def load_frame(self, frame_size): if self.current_frame and self.current_frame.read() != b'': raise UnpicklingError( "beginning of a new frame before end of current frame") - data = self.safe_file_read(frame_size) + data = self._chunked_file_read(frame_size) if len(data) < frame_size: raise EOFError self.current_frame = io.BytesIO(data) @@ -1419,7 +1419,7 @@ def load_bytearray8(self): if size > maxsize: raise UnpicklingError("BYTEARRAY8 exceeds system's maximum size " "of %d bytes" % maxsize) - cursize = min(size, SAFE_BUF_SIZE) + cursize = min(size, _MIN_READ_BUF_SIZE) b = bytearray(cursize) if self.readinto(b) == cursize: while cursize < size and len(b) == cursize: diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 6184837b26afd8..2aeee4fef213b3 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -1119,6 +1119,8 @@ def test_negative_32b_binput(self): self.check_unpickling_error(ValueError, dumped) def itersize(self, start, stop): + # Produce geometrical increasing sequence from start to stop + # (inclusively) for tests. size = start while size < stop: yield size diff --git a/Misc/NEWS.d/next/Library/2024-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst b/Misc/NEWS.d/next/Library/2024-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst index 3d4de4be87ecf5..cbe0e53e8fabb0 100644 --- a/Misc/NEWS.d/next/Library/2024-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst +++ b/Misc/NEWS.d/next/Library/2024-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst @@ -1,3 +1,2 @@ -Fix a vulnerability in the :mod:`pickle` module which allowed to consume -arbitrary large amount of memory when loading a small data which does not -even involve arbitrary code execution. +:mod:`pickle` now avoids allocation of arbitrary large amount of memory +when unpickling specially crafted data. diff --git a/Modules/_pickle.c b/Modules/_pickle.c index 4de33ab6cdd87c..51c91fb3e53e8b 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -155,7 +155,7 @@ enum { PREFETCH = 8192 * 16, /* Data larger that this will be read in chunks, to prevent extreme overallocation. 
*/ - SAFE_BUF_SIZE = 1 << 20, + MIN_READ_BUF_SIZE = 1 << 20, FRAME_SIZE_MIN = 4, FRAME_SIZE_TARGET = 64 * 1024, @@ -1358,7 +1358,7 @@ _Unpickler_ReadFromFile(PickleState *state, UnpicklerObject *self, Py_ssize_t n) return n; } } - Py_ssize_t cursize = Py_MIN(n, SAFE_BUF_SIZE); + Py_ssize_t cursize = Py_MIN(n, MIN_READ_BUF_SIZE); len = PyLong_FromSsize_t(cursize); if (len == NULL) return -1; @@ -1562,7 +1562,14 @@ _Unpickler_MemoPut(PickleState *st, UnpicklerObject *self, size_t idx, PyObject PyObject *old_item; if (idx >= self->memo_size) { - if (idx > self->memo_len * 2 + (1 << 17)) { + /* MAX_MEMO_INDICES_GAP was introduced mainly for making testing of + * PUT, BINPUT and LONG_BINPUT opcodes simpler. It should be more + * than 1<<16 for LONG_BINPUT. + * The standard pickler never produces data that requires more than 0. + * The Python code does not have such limitation. + */ + const int MAX_MEMO_INDICES_GAP = 1 << 17; + if (idx > self->memo_len * 2 + MAX_MEMO_INDICES_GAP) { PyErr_SetString(st->UnpicklingError, "too sparse memo indices"); return -1; } @@ -5485,7 +5492,7 @@ load_counted_binbytes(PickleState *state, UnpicklerObject *self, int nbytes) return -1; } - Py_ssize_t cursize = Py_MIN(size, SAFE_BUF_SIZE); + Py_ssize_t cursize = Py_MIN(size, MIN_READ_BUF_SIZE); Py_ssize_t prevsize = 0; bytes = PyBytes_FromStringAndSize(NULL, cursize); if (bytes == NULL) { @@ -5531,7 +5538,7 @@ load_counted_bytearray(PickleState *state, UnpicklerObject *self) return -1; } - Py_ssize_t cursize = Py_MIN(size, SAFE_BUF_SIZE); + Py_ssize_t cursize = Py_MIN(size, MIN_READ_BUF_SIZE); Py_ssize_t prevsize = 0; bytearray = PyByteArray_FromStringAndSize(NULL, cursize); if (bytearray == NULL) { From 184984db7e85c19fae6258a471e7cbe527069c2f Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 6 Sep 2024 10:59:12 +0300 Subject: [PATCH 07/12] Support arbitrary non-continuous memo keys. --- Lib/test/pickletester.py | 12 +++++ Lib/test/test_pickle.py | 32 +----------- Modules/_pickle.c | 110 ++++++++++++++++++++++++--------------- 3 files changed, 82 insertions(+), 72 deletions(-) diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 548ec60bd34a66..b5b4da00049886 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -1127,6 +1127,18 @@ def itersize(self, start, stop): size <<= 1 yield stop + def test_too_large_put(self): + data = lambda n: (b'((lp' + str(n).encode() + b'\n' + + b'g' + str(n).encode() + b'\nt.') + for idx in [10**6, 10**9, 10**12]: + self.assertEqual(self.loads(data(idx)), ([],)*2) + + def test_too_large_long_binput(self): + data = lambda n: (b'(]r' + struct.pack('= self->memo_size) - return NULL; - - return self->memo[idx]; + PyObject *value; + if (idx < self->memo_size) { + value = self->memo[idx]; + if (value != NULL) { + return value; + } + } + if (self->memo_dict != NULL) { + PyObject *key = PyLong_FromSsize_t(idx); + if (key == NULL) { + return NULL; + } + if (idx < self->memo_size) { + (void)PyDict_Pop(self->memo_dict, key, &value); + self->memo[idx] = value; + } + else { + value = PyDict_GetItemWithError(self->memo_dict, key); + } + Py_DECREF(key); + if (value != NULL || PyErr_Occurred()) { + return value; + } + } + PyErr_Format(st->UnpicklingError, "Memo value not found at index %zd", idx); + return NULL; } /* Returns -1 (with an exception set) on failure, 0 on success. This takes its own reference to `value`. 
*/ static int -_Unpickler_MemoPut(PickleState *st, UnpicklerObject *self, size_t idx, PyObject *value) +_Unpickler_MemoPut(UnpicklerObject *self, size_t idx, PyObject *value) { PyObject *old_item; if (idx >= self->memo_size) { - /* MAX_MEMO_INDICES_GAP was introduced mainly for making testing of - * PUT, BINPUT and LONG_BINPUT opcodes simpler. It should be more - * than 1<<16 for LONG_BINPUT. - * The standard pickler never produces data that requires more than 0. - * The Python code does not have such limitation. - */ - const int MAX_MEMO_INDICES_GAP = 1 << 17; - if (idx > self->memo_len * 2 + MAX_MEMO_INDICES_GAP) { - PyErr_SetString(st->UnpicklingError, "too sparse memo indices"); - return -1; + if (idx > self->memo_len * 2) { + /* The memo keys are too sparse. Use a dict instead of + * a continuous array for the memo. */ + if (self->memo_dict == NULL) { + self->memo_dict = PyDict_New(); + if (self->memo_dict == NULL) { + return -1; + } + } + PyObject *key = PyLong_FromSize_t(idx); + if (key == NULL) { + return -1; + } + + if (PyDict_SetItem(self->memo_dict, key, value) < 0) { + Py_DECREF(key); + return -1; + } + Py_DECREF(key); + return 0; } if (_Unpickler_ResizeMemoList(self, idx * 2) < 0) return -1; @@ -1642,6 +1675,7 @@ _Unpickler_New(PyObject *module) self->memo = memo; self->memo_size = MEMO_SIZE; self->memo_len = 0; + self->memo_dict = NULL; self->persistent_load = NULL; memset(&self->buffer, 0, sizeof(Py_buffer)); self->input_buffer = NULL; @@ -6149,20 +6183,15 @@ load_get(PickleState *st, UnpicklerObject *self) if (key == NULL) return -1; idx = PyLong_AsSsize_t(key); + Py_DECREF(key); if (idx == -1 && PyErr_Occurred()) { - Py_DECREF(key); return -1; } - value = _Unpickler_MemoGet(self, idx); + value = _Unpickler_MemoGet(st, self, idx); if (value == NULL) { - if (!PyErr_Occurred()) { - PyErr_Format(st->UnpicklingError, "Memo value not found at index %ld", idx); - } - Py_DECREF(key); return -1; } - Py_DECREF(key); PDATA_APPEND(self->stack, value, -1); return 0; @@ -6180,13 +6209,8 @@ load_binget(PickleState *st, UnpicklerObject *self) idx = Py_CHARMASK(s[0]); - value = _Unpickler_MemoGet(self, idx); + value = _Unpickler_MemoGet(st, self, idx); if (value == NULL) { - PyObject *key = PyLong_FromSsize_t(idx); - if (key != NULL) { - PyErr_Format(st->UnpicklingError, "Memo value not found at index %ld", idx); - Py_DECREF(key); - } return -1; } @@ -6206,13 +6230,8 @@ load_long_binget(PickleState *st, UnpicklerObject *self) idx = calc_binsize(s, 4); - value = _Unpickler_MemoGet(self, idx); + value = _Unpickler_MemoGet(st, self, idx); if (value == NULL) { - PyObject *key = PyLong_FromSsize_t(idx); - if (key != NULL) { - PyErr_Format(st->UnpicklingError, "Memo value not found at index %ld", idx); - Py_DECREF(key); - } return -1; } @@ -6337,7 +6356,7 @@ load_put(PickleState *state, UnpicklerObject *self) return -1; } - return _Unpickler_MemoPut(state, self, idx, value); + return _Unpickler_MemoPut(self, idx, value); } static int @@ -6356,7 +6375,7 @@ load_binput(PickleState *state, UnpicklerObject *self) idx = Py_CHARMASK(s[0]); - return _Unpickler_MemoPut(state, self, idx, value); + return _Unpickler_MemoPut(self, idx, value); } static int @@ -6380,7 +6399,7 @@ load_long_binput(PickleState *state, UnpicklerObject *self) return -1; } - return _Unpickler_MemoPut(state, self, idx, value); + return _Unpickler_MemoPut(self, idx, value); } static int @@ -6392,7 +6411,7 @@ load_memoize(PickleState *state, UnpicklerObject *self) return Pdata_stack_underflow(state, self->stack); value = 
self->stack->data[Py_SIZE(self->stack) - 1]; - return _Unpickler_MemoPut(state, self, self->memo_len, value); + return _Unpickler_MemoPut(self, self->memo_len, value); } static int @@ -7141,6 +7160,13 @@ _pickle_Unpickler___sizeof___impl(UnpicklerObject *self) size_t res = _PyObject_SIZE(Py_TYPE(self)); if (self->memo != NULL) res += self->memo_size * sizeof(PyObject *); + if (self->memo_dict != NULL) { + size_t s = _PySys_GetSizeOf(self->memo_dict); + if (s == (size_t)-1) { + return -1; + } + res += s; + } if (self->marks != NULL) res += (size_t)self->marks_size * sizeof(Py_ssize_t); if (self->input_line != NULL) @@ -7175,6 +7201,7 @@ Unpickler_clear(UnpicklerObject *self) self->buffer.buf = NULL; } + Py_CLEAR(self->memo_dict); _Unpickler_MemoCleanup(self); PyMem_Free(self->marks); self->marks = NULL; @@ -7209,6 +7236,7 @@ Unpickler_traverse(UnpicklerObject *self, visitproc visit, void *arg) Py_VISIT(self->stack); Py_VISIT(self->persistent_load); Py_VISIT(self->buffers); + Py_VISIT(self->memo_dict); PyObject **memo = self->memo; if (memo) { Py_ssize_t i = self->memo_size; @@ -7514,7 +7542,7 @@ Unpickler_set_memo(UnpicklerObject *self, PyObject *obj, void *Py_UNUSED(ignored "memo key must be positive integers."); goto error; } - if (_Unpickler_MemoPut(state, self, idx, value) < 0) + if (_Unpickler_MemoPut(self, idx, value) < 0) goto error; } } From f0c0728107d9fbad2f5b3b4aed2787e8a6d6a4c1 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Fri, 27 Sep 2024 14:38:42 -0700 Subject: [PATCH 08/12] Reworded NEWS a bit. --- .../Library/2024-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2024-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst b/Misc/NEWS.d/next/Library/2024-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst index cbe0e53e8fabb0..814326a0b53864 100644 --- a/Misc/NEWS.d/next/Library/2024-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst +++ b/Misc/NEWS.d/next/Library/2024-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst @@ -1,2 +1,2 @@ -:mod:`pickle` now avoids allocation of arbitrary large amount of memory -when unpickling specially crafted data. +:mod:`pickle` now avoids allocating an arbitrary large amount of memory +based on small untrusted input when unpickling specially crafted data. From c72d0952268c760cf7d95bcf88abc6a34879c08a Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 28 Sep 2024 10:27:00 +0300 Subject: [PATCH 09/12] Fix C to Python integer conversion. --- Modules/_pickle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/_pickle.c b/Modules/_pickle.c index 50d8971a3719d1..1a92376a4e1f26 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -1558,7 +1558,7 @@ _Unpickler_MemoGet(PickleState *st, UnpicklerObject *self, size_t idx) } } if (self->memo_dict != NULL) { - PyObject *key = PyLong_FromSsize_t(idx); + PyObject *key = PyLong_FromSize_t(idx); if (key == NULL) { return NULL; } From e89bfeabd717e91c3e5c159cabc8f865d291fc19 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 28 Sep 2024 10:27:41 +0300 Subject: [PATCH 10/12] Add more comments. 
--- Lib/test/pickletester.py | 72 ++++++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 18 deletions(-) diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 411e4bd19e8274..e4245f495356a9 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -74,6 +74,15 @@ def count_opcode(code, pickle): def identity(x): return x +def itersize(start, stop): + # Produce geometrical increasing sequence from start to stop + # (inclusively) for tests. + size = start + while size < stop: + yield size + size <<= 1 + yield stop + class UnseekableIO(io.BytesIO): def peek(self, *args): @@ -1118,28 +1127,38 @@ def test_negative_32b_binput(self): dumped = b'\x80\x03X\x01\x00\x00\x00ar\xff\xff\xff\xff.' self.check_unpickling_error(ValueError, dumped) - def itersize(self, start, stop): - # Produce geometrical increasing sequence from start to stop - # (inclusively) for tests. - size = start - while size < stop: - yield size - size <<= 1 - yield stop - def test_too_large_put(self): + # Test that PUT with large id does not cause allocation of + # too large memo table. data = lambda n: (b'((lp' + str(n).encode() + b'\n' + b'g' + str(n).encode() + b'\nt.') + # 0: ( MARK + # 1: ( MARK + # 2: l LIST (MARK at 1) + # 3: p PUT 1000000000000 + # 18: g GET 1000000000000 + # 33: t TUPLE (MARK at 0) + # 34: . STOP for idx in [10**6, 10**9, 10**12]: self.assertEqual(self.loads(data(idx)), ([],)*2) def test_too_large_long_binput(self): + # Test that LONG_BINPUT with large id does not cause allocation of + # too large memo table. data = lambda n: (b'(]r' + struct.pack(' Date: Sat, 28 Sep 2024 14:12:56 +0300 Subject: [PATCH 11/12] Fix test on 32-bit platforms. --- Lib/test/pickletester.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index e4245f495356a9..f15a3ab8fa8383 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -1140,6 +1140,8 @@ def test_too_large_put(self): # 33: t TUPLE (MARK at 0) # 34: . STOP for idx in [10**6, 10**9, 10**12]: + if idx > sys.maxsize: + continue self.assertEqual(self.loads(data(idx)), ([],)*2) def test_too_large_long_binput(self): From 20aa1bf6b293e8a656d1a4cedc3b9c9c17039ab6 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 8 Apr 2025 17:21:48 +0300 Subject: [PATCH 12/12] Fix __sizeof__. --- Lib/test/test_pickle.py | 2 +- Modules/_pickle.c | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/Lib/test/test_pickle.py b/Lib/test/test_pickle.py index 4060949e449b85..c88154c27bf7a0 100644 --- a/Lib/test/test_pickle.py +++ b/Lib/test/test_pickle.py @@ -477,7 +477,7 @@ def test_pickler(self): 0) # Write buffer is cleared after every dump(). def test_unpickler(self): - basesize = support.calcobjsize('2P2n2P 2P2n2i5P 2P3n8P2n2i') + basesize = support.calcobjsize('2P2n3P 2P2n2i5P 2P3n8P2n2i') unpickler = _pickle.Unpickler P = struct.calcsize('P') # Size of memo table entry. n = struct.calcsize('n') # Size of mark table entry. 
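
The one-character change to the layout string in test_unpickler() above reflects that the C Unpickler struct grew one pointer-sized field, presumably the memo_dict pointer introduced earlier in this series. A quick sanity check of that reading (the old/new names below are only for illustration):

import struct

old = struct.calcsize('2P2n2P 2P2n2i5P 2P3n8P2n2i')
new = struct.calcsize('2P2n3P 2P2n2i5P 2P3n8P2n2i')
# The difference is exactly one pointer, on both 32-bit and 64-bit builds.
assert new - old == struct.calcsize('P')
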
diff --git a/Modules/_pickle.c b/Modules/_pickle.c index 1423092df8f412..7a14fdc6302bd1 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -7311,13 +7311,6 @@ _pickle_Unpickler___sizeof___impl(UnpicklerObject *self) size_t res = _PyObject_SIZE(Py_TYPE(self)); if (self->memo != NULL) res += self->memo_size * sizeof(PyObject *); - if (self->memo_dict != NULL) { - size_t s = _PySys_GetSizeOf(self->memo_dict); - if (s == (size_t)-1) { - return -1; - } - res += s; - } if (self->marks != NULL) res += (size_t)self->marks_size * sizeof(Py_ssize_t); if (self->input_line != NULL)
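
Taken together, the memo changes in [PATCH 07/12] leave the C unpickler with a two-level memo: a flat array for dense indices, plus a dict fallback once a PUT index would more than double the array (the pure-Python Unpickler already uses a plain dict). The class below is a hypothetical pure-Python model of that strategy, written only to illustrate _Unpickler_MemoPut/_Unpickler_MemoGet; it is not code from this series.

class HybridMemo:
    """Illustrative model of the hybrid memo used by the C unpickler."""

    def __init__(self):
        self._dense = [None] * 32   # fast path: contiguous indices
        self._sparse = {}           # fallback: widely spaced (crafted) indices
        self._len = 0               # number of values stored so far

    def put(self, idx, value):
        if idx >= len(self._dense):
            if idx > self._len * 2:
                # Too sparse: a dict entry avoids resizing the array to an
                # attacker-controlled size (mirrors idx > memo_len * 2 in C).
                self._sparse[idx] = value
                self._len += 1
                return
            # Grow geometrically, like _Unpickler_ResizeMemoList.
            self._dense.extend([None] * (2 * idx - len(self._dense)))
        self._dense[idx] = value
        self._len += 1

    def get(self, idx):
        if idx < len(self._dense) and self._dense[idx] is not None:
            return self._dense[idx]
        return self._sparse[idx]    # missing key -> "Memo value not found"


memo = HybridMemo()
memo.put(10**12, [])        # LONG_BINPUT-style huge index: no huge allocation
memo.put(0, "first")        # ordinary MEMOIZE-style index: stays in the array
assert memo.get(10**12) == [] and memo.get(0) == "first"

Because GET, BINGET and LONG_BINGET must now consult both structures, _Unpickler_MemoGet takes the module state and raises UnpicklingError itself when an index is missing, as the tests added in this series exercise.
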