From 822230d411fed5d92706be9413440dea093e2905 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka
Date: Mon, 20 May 2024 12:35:59 +0300
Subject: [PATCH 01/12] gh-115952: Fix vulnerability in the pickle module

Loading small data which does not even involve arbitrary code execution
could consume an arbitrarily large amount of memory. There were three
issues:

* PUT and LONG_BINPUT with a large argument (the C implementation only).
  Since the memo is implemented in C as a contiguous dynamic array, a
  single opcode can cause it to be resized to an arbitrary size. Now the
  sparsity of memo indices is limited.
* BINBYTES, BINBYTES8 and BYTEARRAY8 with a large argument. They allocated
  the bytes or bytearray object of the specified size before reading into
  it. Now they read very large data in chunks.
* BINSTRING, BINUNICODE, LONG4, BINUNICODE8 and FRAME with a large
  argument. They read the whole data by calling the read() method of the
  underlying file object, which usually allocates the bytes object of the
  specified size before reading into it. Now they read very large data in
  chunks.
---
 Lib/pickle.py                                 |  34 ++-
 Lib/test/pickletester.py                      | 102 ++++++++-
 Lib/test/test_pickle.py                       |  38 ++++
 ...-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst |   3 +
 Modules/_pickle.c                             | 209 ++++++++++++------
 5 files changed, 304 insertions(+), 82 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Library/2024-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst

diff --git a/Lib/pickle.py b/Lib/pickle.py
index 33c97c8c5efb28..2e190a0e02124e 100644
--- a/Lib/pickle.py
+++ b/Lib/pickle.py
@@ -191,6 +191,11 @@ def __init__(self, value):
 __all__.extend([x for x in dir() if re.match("[A-Z][A-Z0-9_]+$", x)])
 
 
+# Data larger that this will be read in chunks, to prevent extreme
+# overallocation.
+SAFE_BUF_SIZE = (1 << 20)
+
+
 class _Framer:
 
     _FRAME_SIZE_MIN = 4
@@ -289,7 +294,7 @@ def read(self, n):
                     "pickle exhausted before end of frame")
             return data
         else:
-            return self.file_read(n)
+            return self.safe_file_read(n)
 
     def readline(self):
         if self.current_frame:
@@ -304,11 +309,23 @@ def readline(self):
         else:
             return self.file_readline()
 
+    def safe_file_read(self, size):
+        cursize = min(size, SAFE_BUF_SIZE)
+        b = self.file_read(cursize)
+        while cursize < size and len(b) == cursize:
+            delta = min(cursize, size - cursize)
+            b += self.file_read(delta)
+            cursize += delta
+        return b
+
     def load_frame(self, frame_size):
        if self.current_frame and self.current_frame.read() != b'':
            raise UnpicklingError(
                "beginning of a new frame before end of current frame")
-        self.current_frame = io.BytesIO(self.file_read(frame_size))
+        data = self.safe_file_read(frame_size)
+        if len(data) < frame_size:
+            raise EOFError
+        self.current_frame = io.BytesIO(data)
 
 
 # Tools used for pickling.
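
For orientation while reading the rest of this patch, here is a minimal standalone sketch of the chunked-read loop that safe_file_read() above implements. The read_chunked helper, its file_read parameter and the 8 GiB example are illustrative only and not part of the patch; SAFE_BUF_SIZE mirrors the 1 MiB constant added above. The size prefix is never trusted up front: the first request is capped at SAFE_BUF_SIZE and later requests grow geometrically, so a truncated stream ends the loop after a small allocation.

import io

SAFE_BUF_SIZE = 1 << 20  # 1 MiB, same cap as the constant added above

def read_chunked(file_read, size):
    # Request at most SAFE_BUF_SIZE first; grow the request geometrically
    # and stop as soon as the stream returns less than was asked for.
    cursize = min(size, SAFE_BUF_SIZE)
    b = file_read(cursize)
    while cursize < size and len(b) == cursize:
        delta = min(cursize, size - cursize)
        b += file_read(delta)
        cursize += delta
    return b

# A FRAME or BINBYTES8 header claiming 8 GiB but backed by only five bytes
# of data now costs a few bytes instead of an 8 GiB allocation attempt.
assert read_chunked(io.BytesIO(b"abcde").read, 8 << 30) == b"abcde"

The C changes later in this patch (_Unpickler_ReadFromFile, load_counted_binbytes, load_counted_bytearray) follow the same pattern.
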
@@ -1378,12 +1395,17 @@ def load_binbytes8(self): dispatch[BINBYTES8[0]] = load_binbytes8 def load_bytearray8(self): - len, = unpack(' maxsize: + size, = unpack(' maxsize: raise UnpicklingError("BYTEARRAY8 exceeds system's maximum size " "of %d bytes" % maxsize) - b = bytearray(len) - self.readinto(b) + cursize = min(size, SAFE_BUF_SIZE) + b = bytearray(cursize) + if self.readinto(b) == cursize: + while cursize < size and len(b) == cursize: + delta = min(cursize, size - cursize) + b += self.read(delta) + cursize += delta self.append(b) dispatch[BYTEARRAY8[0]] = load_bytearray8 diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 93e7dbbd103934..769fd99fb1381c 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -853,9 +853,8 @@ def assert_is_copy(self, obj, objcopy, msg=None): self.assertEqual(getattr(obj, slot, None), getattr(objcopy, slot, None), msg=msg) - def check_unpickling_error(self, errors, data): - with self.subTest(data=data), \ - self.assertRaises(errors): + def check_unpickling_error_strict(self, errors, data): + with self.assertRaises(errors): try: self.loads(data) except BaseException as exc: @@ -864,6 +863,10 @@ def check_unpickling_error(self, errors, data): (data, exc.__class__.__name__, exc)) raise + def check_unpickling_error(self, errors, data): + with self.subTest(data=data): + self.check_unpickling_error_strict(errors, data) + def test_load_from_data0(self): self.assert_is_copy(self._testdata, self.loads(DATA0)) @@ -1115,6 +1118,99 @@ def test_negative_32b_binput(self): dumped = b'\x80\x03X\x01\x00\x00\x00ar\xff\xff\xff\xff.' self.check_unpickling_error(ValueError, dumped) + def itersize(self, start, stop): + size = start + while size < stop: + yield size + size <<= 1 + + def _test_truncated_data(self, dumped, expected_error=None): + if expected_error is None: + expected_error = self.truncated_data_error + # BytesIO + with self.assertRaisesRegex(*expected_error): + self.loads(dumped) + if hasattr(self, 'unpickler'): + try: + with open(TESTFN, 'wb') as f: + f.write(dumped) + # buffered file + with open(TESTFN, 'rb') as f: + u = self.unpickler(f) + with self.assertRaisesRegex(*expected_error): + u.load() + # unbuffered file + with open(TESTFN, 'rb', buffering=0) as f: + u = self.unpickler(f) + with self.assertRaisesRegex(*expected_error): + u.load() + finally: + os_helper.unlink(TESTFN) + + def test_truncated_large_binstring(self): + data = lambda size: b'T' + struct.pack(' sys.maxsize')) + + def test_truncated_large_binunicode8(self): + data = lambda size: b'\x8d' + struct.pack('readinto) { + /* readinto() not supported on file-like object, fall back to read() + * and copy into destination buffer (bpo-39681) */ + PyObject* len = PyLong_FromSsize_t(n); + if (len == NULL) { + return -1; + } + PyObject* data = _Pickle_FastCall(self->read, len); + if (data == NULL) { + return -1; + } + if (!PyBytes_Check(data)) { + PyErr_Format(PyExc_ValueError, + "read() returned non-bytes object (%R)", + Py_TYPE(data)); + Py_DECREF(data); + return -1; + } + Py_ssize_t read_size = PyBytes_GET_SIZE(data); + if (read_size < n) { + Py_DECREF(data); + return bad_readline(state); + } + memcpy(buf, PyBytes_AS_STRING(data), n); + Py_DECREF(data); + return n; + } + + /* Call readinto() into user buffer */ + PyObject *buf_obj = PyMemoryView_FromMemory(buf, n, PyBUF_WRITE); + if (buf_obj == NULL) { + return -1; + } + PyObject *read_size_obj = _Pickle_FastCall(self->readinto, buf_obj); + if (read_size_obj == NULL) { + return -1; + } + Py_ssize_t read_size = 
PyLong_AsSsize_t(read_size_obj); + Py_DECREF(read_size_obj); + + if (read_size < 0) { + if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ValueError, + "readinto() returned negative size"); + } + return -1; + } + if (read_size < n) { + return bad_readline(state); + } + return n; +} + /* If reading from a file, we need to only pull the bytes we need, since there may be multiple pickle objects arranged contiguously in the same input buffer. @@ -1256,7 +1319,7 @@ static const Py_ssize_t READ_WHOLE_LINE = -1; causing the Unpickler to go back to the file for more data. Use the returned size to tell you how much data you can process. */ static Py_ssize_t -_Unpickler_ReadFromFile(UnpicklerObject *self, Py_ssize_t n) +_Unpickler_ReadFromFile(PickleState *state, UnpicklerObject *self, Py_ssize_t n) { PyObject *data; Py_ssize_t read_size; @@ -1268,6 +1331,9 @@ _Unpickler_ReadFromFile(UnpicklerObject *self, Py_ssize_t n) if (n == READ_WHOLE_LINE) { data = PyObject_CallNoArgs(self->readline); + if (data == NULL) { + return -1; + } } else { PyObject *len; @@ -1292,13 +1358,28 @@ _Unpickler_ReadFromFile(UnpicklerObject *self, Py_ssize_t n) return n; } } - len = PyLong_FromSsize_t(n); + Py_ssize_t cursize = Py_MIN(n, SAFE_BUF_SIZE); + len = PyLong_FromSsize_t(cursize); if (len == NULL) return -1; data = _Pickle_FastCall(self->read, len); + if (data == NULL) { + return -1; + } + while (cursize < n) { + Py_ssize_t prevsize = cursize; + cursize += Py_MIN(cursize, n - cursize); + if (_PyBytes_Resize(&data, cursize) < 0) { + return -1; + } + if (_Unpickler_ReadIntoFromFile(state, self, + PyBytes_AS_STRING(data) + prevsize, cursize - prevsize) < 0) + { + Py_DECREF(data); + return -1; + } + } } - if (data == NULL) - return -1; read_size = _Unpickler_SetStringInput(self, data); Py_DECREF(data); @@ -1325,7 +1406,7 @@ _Unpickler_ReadImpl(UnpicklerObject *self, PickleState *st, char **s, Py_ssize_t return bad_readline(st); /* Extend the buffer to satisfy desired size */ - num_read = _Unpickler_ReadFromFile(self, n); + num_read = _Unpickler_ReadFromFile(st, self, n); if (num_read < 0) return -1; if (num_read < n) @@ -1372,57 +1453,7 @@ _Unpickler_ReadInto(PickleState *state, UnpicklerObject *self, char *buf, return -1; } - if (!self->readinto) { - /* readinto() not supported on file-like object, fall back to read() - * and copy into destination buffer (bpo-39681) */ - PyObject* len = PyLong_FromSsize_t(n); - if (len == NULL) { - return -1; - } - PyObject* data = _Pickle_FastCall(self->read, len); - if (data == NULL) { - return -1; - } - if (!PyBytes_Check(data)) { - PyErr_Format(PyExc_ValueError, - "read() returned non-bytes object (%R)", - Py_TYPE(data)); - Py_DECREF(data); - return -1; - } - Py_ssize_t read_size = PyBytes_GET_SIZE(data); - if (read_size < n) { - Py_DECREF(data); - return bad_readline(state); - } - memcpy(buf, PyBytes_AS_STRING(data), n); - Py_DECREF(data); - return n; - } - - /* Call readinto() into user buffer */ - PyObject *buf_obj = PyMemoryView_FromMemory(buf, n, PyBUF_WRITE); - if (buf_obj == NULL) { - return -1; - } - PyObject *read_size_obj = _Pickle_FastCall(self->readinto, buf_obj); - if (read_size_obj == NULL) { - return -1; - } - Py_ssize_t read_size = PyLong_AsSsize_t(read_size_obj); - Py_DECREF(read_size_obj); - - if (read_size < 0) { - if (!PyErr_Occurred()) { - PyErr_SetString(PyExc_ValueError, - "readinto() returned negative size"); - } - return -1; - } - if (read_size < n) { - return bad_readline(state); - } - return n; + return _Unpickler_ReadIntoFromFile(state, self, buf, n); } 
/* Read `n` bytes from the unpickler's data source, storing the result in `*s`. @@ -1482,7 +1513,7 @@ _Unpickler_Readline(PickleState *state, UnpicklerObject *self, char **result) if (!self->read) return bad_readline(state); - num_read = _Unpickler_ReadFromFile(self, READ_WHOLE_LINE); + num_read = _Unpickler_ReadFromFile(state, self, READ_WHOLE_LINE); if (num_read < 0) return -1; if (num_read == 0 || self->input_buffer[num_read - 1] != '\n') @@ -1526,11 +1557,15 @@ _Unpickler_MemoGet(UnpicklerObject *self, size_t idx) /* Returns -1 (with an exception set) on failure, 0 on success. This takes its own reference to `value`. */ static int -_Unpickler_MemoPut(UnpicklerObject *self, size_t idx, PyObject *value) +_Unpickler_MemoPut(PickleState *st, UnpicklerObject *self, size_t idx, PyObject *value) { PyObject *old_item; if (idx >= self->memo_size) { + if (idx > self->memo_len * 2 + (1 << 17)) { + PyErr_SetString(st->UnpicklingError, "too sparse memo indices"); + return -1; + } if (_Unpickler_ResizeMemoList(self, idx * 2) < 0) return -1; assert(idx < self->memo_size); @@ -5451,13 +5486,28 @@ load_counted_binbytes(PickleState *state, UnpicklerObject *self, int nbytes) return -1; } - bytes = PyBytes_FromStringAndSize(NULL, size); - if (bytes == NULL) - return -1; - if (_Unpickler_ReadInto(state, self, PyBytes_AS_STRING(bytes), size) < 0) { - Py_DECREF(bytes); + Py_ssize_t cursize = Py_MIN(size, SAFE_BUF_SIZE); + Py_ssize_t prevsize = 0; + bytes = PyBytes_FromStringAndSize(NULL, cursize); + if (bytes == NULL) { return -1; } + while (1) { + if (_Unpickler_ReadInto(state, self, + PyBytes_AS_STRING(bytes) + prevsize, cursize - prevsize) < 0) + { + Py_DECREF(bytes); + return -1; + } + if (cursize >= size) { + break; + } + prevsize = cursize; + cursize += Py_MIN(cursize, size - cursize); + if (_PyBytes_Resize(&bytes, cursize) < 0) { + return -1; + } + } PDATA_PUSH(self->stack, bytes, -1); return 0; @@ -5482,14 +5532,27 @@ load_counted_bytearray(PickleState *state, UnpicklerObject *self) return -1; } - bytearray = PyByteArray_FromStringAndSize(NULL, size); + Py_ssize_t cursize = Py_MIN(size, SAFE_BUF_SIZE); + Py_ssize_t prevsize = 0; + bytearray = PyByteArray_FromStringAndSize(NULL, cursize); if (bytearray == NULL) { return -1; } - char *str = PyByteArray_AS_STRING(bytearray); - if (_Unpickler_ReadInto(state, self, str, size) < 0) { - Py_DECREF(bytearray); - return -1; + while (1) { + if (_Unpickler_ReadInto(state, self, + PyByteArray_AS_STRING(bytearray) + prevsize, + cursize - prevsize) < 0) { + Py_DECREF(bytearray); + return -1; + } + if (cursize >= size) { + break; + } + prevsize = cursize; + cursize += Py_MIN(cursize, size - cursize); + if (PyByteArray_Resize(bytearray, cursize) < 0) { + return -1; + } } PDATA_PUSH(self->stack, bytearray, -1); @@ -6279,7 +6342,7 @@ load_put(PickleState *state, UnpicklerObject *self) return -1; } - return _Unpickler_MemoPut(self, idx, value); + return _Unpickler_MemoPut(state, self, idx, value); } static int @@ -6298,7 +6361,7 @@ load_binput(PickleState *state, UnpicklerObject *self) idx = Py_CHARMASK(s[0]); - return _Unpickler_MemoPut(self, idx, value); + return _Unpickler_MemoPut(state, self, idx, value); } static int @@ -6322,7 +6385,7 @@ load_long_binput(PickleState *state, UnpicklerObject *self) return -1; } - return _Unpickler_MemoPut(self, idx, value); + return _Unpickler_MemoPut(state, self, idx, value); } static int @@ -6334,7 +6397,7 @@ load_memoize(PickleState *state, UnpicklerObject *self) return Pdata_stack_underflow(state, self->stack); value = 
self->stack->data[Py_SIZE(self->stack) - 1]; - return _Unpickler_MemoPut(self, self->memo_len, value); + return _Unpickler_MemoPut(state, self, self->memo_len, value); } static int @@ -7432,7 +7495,7 @@ Unpickler_set_memo(UnpicklerObject *self, PyObject *obj, void *Py_UNUSED(ignored "memo key must be positive integers."); goto error; } - if (_Unpickler_MemoPut(self, idx, value) < 0) + if (_Unpickler_MemoPut(state, self, idx, value) < 0) goto error; } } From 88f1461cfa9c3f921be1c04ddb6e09581d6375c0 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 20 May 2024 14:01:54 +0300 Subject: [PATCH 02/12] Try to fix tests of 32-bit platforms. --- Lib/test/pickletester.py | 17 ++++++++--------- Lib/test/test_pickle.py | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 769fd99fb1381c..66902599dd538e 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -1123,6 +1123,7 @@ def itersize(self, start, stop): while size < stop: yield size size <<= 1 + yield stop def _test_truncated_data(self, dumped, expected_error=None): if expected_error is None: @@ -1150,29 +1151,27 @@ def _test_truncated_data(self, dumped, expected_error=None): def test_truncated_large_binstring(self): data = lambda size: b'T' + struct.pack(' Date: Mon, 20 May 2024 20:51:10 +0300 Subject: [PATCH 03/12] Try to fix more tests on 32-bit platforms. --- Lib/test/pickletester.py | 6 +++--- Lib/test/test_pickle.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 66902599dd538e..fff54a6f22f23e 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -1151,7 +1151,7 @@ def _test_truncated_data(self, dumped, expected_error=None): def test_truncated_large_binstring(self): data = lambda size: b'T' + struct.pack(' Date: Wed, 22 May 2024 16:18:17 +0300 Subject: [PATCH 04/12] Apply suggestions from code review --- Lib/pickle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index 2e190a0e02124e..9be5c2401c7a9e 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -191,7 +191,7 @@ def __init__(self, value): __all__.extend([x for x in dir() if re.match("[A-Z][A-Z0-9_]+$", x)]) -# Data larger that this will be read in chunks, to prevent extreme +# Data larger than this will be read in chunks, to prevent extreme # overallocation. SAFE_BUF_SIZE = (1 << 20) From d0e667eb848cf28e027645b13546ec961b623f57 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 29 Jun 2024 11:10:31 +0300 Subject: [PATCH 05/12] Remove empty lines. --- Lib/test/test_pickle.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/Lib/test/test_pickle.py b/Lib/test/test_pickle.py index 809bdab06260df..9deec6dc881761 100644 --- a/Lib/test/test_pickle.py +++ b/Lib/test/test_pickle.py @@ -62,7 +62,6 @@ def test_too_large_put(self): for idx in [10**6, 10**9, 10**12]: self.assertEqual(self.loads(data(idx)), ([],)*2) - def test_too_large_long_binput(self): data = lambda n: (b'(]r' + struct.pack(' Date: Sun, 30 Jun 2024 13:32:39 +0300 Subject: [PATCH 06/12] Change names, add more commentis and update the NEWS entry. 
--- Lib/pickle.py | 12 ++++++------ Lib/test/pickletester.py | 2 ++ ...24-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst | 5 ++--- Modules/_pickle.c | 17 ++++++++++++----- 4 files changed, 22 insertions(+), 14 deletions(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index e287e162925989..1e10459e58405e 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -193,7 +193,7 @@ def __init__(self, value): # Data larger than this will be read in chunks, to prevent extreme # overallocation. -SAFE_BUF_SIZE = (1 << 20) +_MIN_READ_BUF_SIZE = (1 << 20) class _Framer: @@ -294,7 +294,7 @@ def read(self, n): "pickle exhausted before end of frame") return data else: - return self.safe_file_read(n) + return self._chunked_file_read(n) def readline(self): if self.current_frame: @@ -309,8 +309,8 @@ def readline(self): else: return self.file_readline() - def safe_file_read(self, size): - cursize = min(size, SAFE_BUF_SIZE) + def _chunked_file_read(self, size): + cursize = min(size, _MIN_READ_BUF_SIZE) b = self.file_read(cursize) while cursize < size and len(b) == cursize: delta = min(cursize, size - cursize) @@ -322,7 +322,7 @@ def load_frame(self, frame_size): if self.current_frame and self.current_frame.read() != b'': raise UnpicklingError( "beginning of a new frame before end of current frame") - data = self.safe_file_read(frame_size) + data = self._chunked_file_read(frame_size) if len(data) < frame_size: raise EOFError self.current_frame = io.BytesIO(data) @@ -1419,7 +1419,7 @@ def load_bytearray8(self): if size > maxsize: raise UnpicklingError("BYTEARRAY8 exceeds system's maximum size " "of %d bytes" % maxsize) - cursize = min(size, SAFE_BUF_SIZE) + cursize = min(size, _MIN_READ_BUF_SIZE) b = bytearray(cursize) if self.readinto(b) == cursize: while cursize < size and len(b) == cursize: diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 6184837b26afd8..2aeee4fef213b3 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -1119,6 +1119,8 @@ def test_negative_32b_binput(self): self.check_unpickling_error(ValueError, dumped) def itersize(self, start, stop): + # Produce geometrical increasing sequence from start to stop + # (inclusively) for tests. size = start while size < stop: yield size diff --git a/Misc/NEWS.d/next/Library/2024-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst b/Misc/NEWS.d/next/Library/2024-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst index 3d4de4be87ecf5..cbe0e53e8fabb0 100644 --- a/Misc/NEWS.d/next/Library/2024-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst +++ b/Misc/NEWS.d/next/Library/2024-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst @@ -1,3 +1,2 @@ -Fix a vulnerability in the :mod:`pickle` module which allowed to consume -arbitrary large amount of memory when loading a small data which does not -even involve arbitrary code execution. +:mod:`pickle` now avoids allocation of arbitrary large amount of memory +when unpickling specially crafted data. diff --git a/Modules/_pickle.c b/Modules/_pickle.c index 4de33ab6cdd87c..51c91fb3e53e8b 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -155,7 +155,7 @@ enum { PREFETCH = 8192 * 16, /* Data larger that this will be read in chunks, to prevent extreme overallocation. 
*/ - SAFE_BUF_SIZE = 1 << 20, + MIN_READ_BUF_SIZE = 1 << 20, FRAME_SIZE_MIN = 4, FRAME_SIZE_TARGET = 64 * 1024, @@ -1358,7 +1358,7 @@ _Unpickler_ReadFromFile(PickleState *state, UnpicklerObject *self, Py_ssize_t n) return n; } } - Py_ssize_t cursize = Py_MIN(n, SAFE_BUF_SIZE); + Py_ssize_t cursize = Py_MIN(n, MIN_READ_BUF_SIZE); len = PyLong_FromSsize_t(cursize); if (len == NULL) return -1; @@ -1562,7 +1562,14 @@ _Unpickler_MemoPut(PickleState *st, UnpicklerObject *self, size_t idx, PyObject PyObject *old_item; if (idx >= self->memo_size) { - if (idx > self->memo_len * 2 + (1 << 17)) { + /* MAX_MEMO_INDICES_GAP was introduced mainly for making testing of + * PUT, BINPUT and LONG_BINPUT opcodes simpler. It should be more + * than 1<<16 for LONG_BINPUT. + * The standard pickler never produces data that requires more than 0. + * The Python code does not have such limitation. + */ + const int MAX_MEMO_INDICES_GAP = 1 << 17; + if (idx > self->memo_len * 2 + MAX_MEMO_INDICES_GAP) { PyErr_SetString(st->UnpicklingError, "too sparse memo indices"); return -1; } @@ -5485,7 +5492,7 @@ load_counted_binbytes(PickleState *state, UnpicklerObject *self, int nbytes) return -1; } - Py_ssize_t cursize = Py_MIN(size, SAFE_BUF_SIZE); + Py_ssize_t cursize = Py_MIN(size, MIN_READ_BUF_SIZE); Py_ssize_t prevsize = 0; bytes = PyBytes_FromStringAndSize(NULL, cursize); if (bytes == NULL) { @@ -5531,7 +5538,7 @@ load_counted_bytearray(PickleState *state, UnpicklerObject *self) return -1; } - Py_ssize_t cursize = Py_MIN(size, SAFE_BUF_SIZE); + Py_ssize_t cursize = Py_MIN(size, MIN_READ_BUF_SIZE); Py_ssize_t prevsize = 0; bytearray = PyByteArray_FromStringAndSize(NULL, cursize); if (bytearray == NULL) { From 184984db7e85c19fae6258a471e7cbe527069c2f Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 6 Sep 2024 10:59:12 +0300 Subject: [PATCH 07/12] Support arbitrary non-continuous memo keys. --- Lib/test/pickletester.py | 12 +++++ Lib/test/test_pickle.py | 32 +----------- Modules/_pickle.c | 110 ++++++++++++++++++++++++--------------- 3 files changed, 82 insertions(+), 72 deletions(-) diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 548ec60bd34a66..b5b4da00049886 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -1127,6 +1127,18 @@ def itersize(self, start, stop): size <<= 1 yield stop + def test_too_large_put(self): + data = lambda n: (b'((lp' + str(n).encode() + b'\n' + + b'g' + str(n).encode() + b'\nt.') + for idx in [10**6, 10**9, 10**12]: + self.assertEqual(self.loads(data(idx)), ([],)*2) + + def test_too_large_long_binput(self): + data = lambda n: (b'(]r' + struct.pack('= self->memo_size) - return NULL; - - return self->memo[idx]; + PyObject *value; + if (idx < self->memo_size) { + value = self->memo[idx]; + if (value != NULL) { + return value; + } + } + if (self->memo_dict != NULL) { + PyObject *key = PyLong_FromSsize_t(idx); + if (key == NULL) { + return NULL; + } + if (idx < self->memo_size) { + (void)PyDict_Pop(self->memo_dict, key, &value); + self->memo[idx] = value; + } + else { + value = PyDict_GetItemWithError(self->memo_dict, key); + } + Py_DECREF(key); + if (value != NULL || PyErr_Occurred()) { + return value; + } + } + PyErr_Format(st->UnpicklingError, "Memo value not found at index %zd", idx); + return NULL; } /* Returns -1 (with an exception set) on failure, 0 on success. This takes its own reference to `value`. 
*/ static int -_Unpickler_MemoPut(PickleState *st, UnpicklerObject *self, size_t idx, PyObject *value) +_Unpickler_MemoPut(UnpicklerObject *self, size_t idx, PyObject *value) { PyObject *old_item; if (idx >= self->memo_size) { - /* MAX_MEMO_INDICES_GAP was introduced mainly for making testing of - * PUT, BINPUT and LONG_BINPUT opcodes simpler. It should be more - * than 1<<16 for LONG_BINPUT. - * The standard pickler never produces data that requires more than 0. - * The Python code does not have such limitation. - */ - const int MAX_MEMO_INDICES_GAP = 1 << 17; - if (idx > self->memo_len * 2 + MAX_MEMO_INDICES_GAP) { - PyErr_SetString(st->UnpicklingError, "too sparse memo indices"); - return -1; + if (idx > self->memo_len * 2) { + /* The memo keys are too sparse. Use a dict instead of + * a continuous array for the memo. */ + if (self->memo_dict == NULL) { + self->memo_dict = PyDict_New(); + if (self->memo_dict == NULL) { + return -1; + } + } + PyObject *key = PyLong_FromSize_t(idx); + if (key == NULL) { + return -1; + } + + if (PyDict_SetItem(self->memo_dict, key, value) < 0) { + Py_DECREF(key); + return -1; + } + Py_DECREF(key); + return 0; } if (_Unpickler_ResizeMemoList(self, idx * 2) < 0) return -1; @@ -1642,6 +1675,7 @@ _Unpickler_New(PyObject *module) self->memo = memo; self->memo_size = MEMO_SIZE; self->memo_len = 0; + self->memo_dict = NULL; self->persistent_load = NULL; memset(&self->buffer, 0, sizeof(Py_buffer)); self->input_buffer = NULL; @@ -6149,20 +6183,15 @@ load_get(PickleState *st, UnpicklerObject *self) if (key == NULL) return -1; idx = PyLong_AsSsize_t(key); + Py_DECREF(key); if (idx == -1 && PyErr_Occurred()) { - Py_DECREF(key); return -1; } - value = _Unpickler_MemoGet(self, idx); + value = _Unpickler_MemoGet(st, self, idx); if (value == NULL) { - if (!PyErr_Occurred()) { - PyErr_Format(st->UnpicklingError, "Memo value not found at index %ld", idx); - } - Py_DECREF(key); return -1; } - Py_DECREF(key); PDATA_APPEND(self->stack, value, -1); return 0; @@ -6180,13 +6209,8 @@ load_binget(PickleState *st, UnpicklerObject *self) idx = Py_CHARMASK(s[0]); - value = _Unpickler_MemoGet(self, idx); + value = _Unpickler_MemoGet(st, self, idx); if (value == NULL) { - PyObject *key = PyLong_FromSsize_t(idx); - if (key != NULL) { - PyErr_Format(st->UnpicklingError, "Memo value not found at index %ld", idx); - Py_DECREF(key); - } return -1; } @@ -6206,13 +6230,8 @@ load_long_binget(PickleState *st, UnpicklerObject *self) idx = calc_binsize(s, 4); - value = _Unpickler_MemoGet(self, idx); + value = _Unpickler_MemoGet(st, self, idx); if (value == NULL) { - PyObject *key = PyLong_FromSsize_t(idx); - if (key != NULL) { - PyErr_Format(st->UnpicklingError, "Memo value not found at index %ld", idx); - Py_DECREF(key); - } return -1; } @@ -6337,7 +6356,7 @@ load_put(PickleState *state, UnpicklerObject *self) return -1; } - return _Unpickler_MemoPut(state, self, idx, value); + return _Unpickler_MemoPut(self, idx, value); } static int @@ -6356,7 +6375,7 @@ load_binput(PickleState *state, UnpicklerObject *self) idx = Py_CHARMASK(s[0]); - return _Unpickler_MemoPut(state, self, idx, value); + return _Unpickler_MemoPut(self, idx, value); } static int @@ -6380,7 +6399,7 @@ load_long_binput(PickleState *state, UnpicklerObject *self) return -1; } - return _Unpickler_MemoPut(state, self, idx, value); + return _Unpickler_MemoPut(self, idx, value); } static int @@ -6392,7 +6411,7 @@ load_memoize(PickleState *state, UnpicklerObject *self) return Pdata_stack_underflow(state, self->stack); value = 
self->stack->data[Py_SIZE(self->stack) - 1]; - return _Unpickler_MemoPut(state, self, self->memo_len, value); + return _Unpickler_MemoPut(self, self->memo_len, value); } static int @@ -7141,6 +7160,13 @@ _pickle_Unpickler___sizeof___impl(UnpicklerObject *self) size_t res = _PyObject_SIZE(Py_TYPE(self)); if (self->memo != NULL) res += self->memo_size * sizeof(PyObject *); + if (self->memo_dict != NULL) { + size_t s = _PySys_GetSizeOf(self->memo_dict); + if (s == (size_t)-1) { + return -1; + } + res += s; + } if (self->marks != NULL) res += (size_t)self->marks_size * sizeof(Py_ssize_t); if (self->input_line != NULL) @@ -7175,6 +7201,7 @@ Unpickler_clear(UnpicklerObject *self) self->buffer.buf = NULL; } + Py_CLEAR(self->memo_dict); _Unpickler_MemoCleanup(self); PyMem_Free(self->marks); self->marks = NULL; @@ -7209,6 +7236,7 @@ Unpickler_traverse(UnpicklerObject *self, visitproc visit, void *arg) Py_VISIT(self->stack); Py_VISIT(self->persistent_load); Py_VISIT(self->buffers); + Py_VISIT(self->memo_dict); PyObject **memo = self->memo; if (memo) { Py_ssize_t i = self->memo_size; @@ -7514,7 +7542,7 @@ Unpickler_set_memo(UnpicklerObject *self, PyObject *obj, void *Py_UNUSED(ignored "memo key must be positive integers."); goto error; } - if (_Unpickler_MemoPut(state, self, idx, value) < 0) + if (_Unpickler_MemoPut(self, idx, value) < 0) goto error; } } From f0c0728107d9fbad2f5b3b4aed2787e8a6d6a4c1 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Fri, 27 Sep 2024 14:38:42 -0700 Subject: [PATCH 08/12] Reworded NEWS a bit. --- .../Library/2024-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2024-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst b/Misc/NEWS.d/next/Library/2024-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst index cbe0e53e8fabb0..814326a0b53864 100644 --- a/Misc/NEWS.d/next/Library/2024-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst +++ b/Misc/NEWS.d/next/Library/2024-05-20-12-35-52.gh-issue-115952.J6n_Kf.rst @@ -1,2 +1,2 @@ -:mod:`pickle` now avoids allocation of arbitrary large amount of memory -when unpickling specially crafted data. +:mod:`pickle` now avoids allocating an arbitrary large amount of memory +based on small untrusted input when unpickling specially crafted data. From c72d0952268c760cf7d95bcf88abc6a34879c08a Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 28 Sep 2024 10:27:00 +0300 Subject: [PATCH 09/12] Fix C to Python integer conversion. --- Modules/_pickle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/_pickle.c b/Modules/_pickle.c index 50d8971a3719d1..1a92376a4e1f26 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -1558,7 +1558,7 @@ _Unpickler_MemoGet(PickleState *st, UnpicklerObject *self, size_t idx) } } if (self->memo_dict != NULL) { - PyObject *key = PyLong_FromSsize_t(idx); + PyObject *key = PyLong_FromSize_t(idx); if (key == NULL) { return NULL; } From e89bfeabd717e91c3e5c159cabc8f865d291fc19 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 28 Sep 2024 10:27:41 +0300 Subject: [PATCH 10/12] Add more comments. 
--- Lib/test/pickletester.py | 72 ++++++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 18 deletions(-) diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 411e4bd19e8274..e4245f495356a9 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -74,6 +74,15 @@ def count_opcode(code, pickle): def identity(x): return x +def itersize(start, stop): + # Produce geometrical increasing sequence from start to stop + # (inclusively) for tests. + size = start + while size < stop: + yield size + size <<= 1 + yield stop + class UnseekableIO(io.BytesIO): def peek(self, *args): @@ -1118,28 +1127,38 @@ def test_negative_32b_binput(self): dumped = b'\x80\x03X\x01\x00\x00\x00ar\xff\xff\xff\xff.' self.check_unpickling_error(ValueError, dumped) - def itersize(self, start, stop): - # Produce geometrical increasing sequence from start to stop - # (inclusively) for tests. - size = start - while size < stop: - yield size - size <<= 1 - yield stop - def test_too_large_put(self): + # Test that PUT with large id does not cause allocation of + # too large memo table. data = lambda n: (b'((lp' + str(n).encode() + b'\n' + b'g' + str(n).encode() + b'\nt.') + # 0: ( MARK + # 1: ( MARK + # 2: l LIST (MARK at 1) + # 3: p PUT 1000000000000 + # 18: g GET 1000000000000 + # 33: t TUPLE (MARK at 0) + # 34: . STOP for idx in [10**6, 10**9, 10**12]: self.assertEqual(self.loads(data(idx)), ([],)*2) def test_too_large_long_binput(self): + # Test that LONG_BINPUT with large id does not cause allocation of + # too large memo table. data = lambda n: (b'(]r' + struct.pack(' Date: Sat, 28 Sep 2024 14:12:56 +0300 Subject: [PATCH 11/12] Fix test on 32-bit platforms. --- Lib/test/pickletester.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index e4245f495356a9..f15a3ab8fa8383 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -1140,6 +1140,8 @@ def test_too_large_put(self): # 33: t TUPLE (MARK at 0) # 34: . STOP for idx in [10**6, 10**9, 10**12]: + if idx > sys.maxsize: + continue self.assertEqual(self.loads(data(idx)), ([],)*2) def test_too_large_long_binput(self): From 20aa1bf6b293e8a656d1a4cedc3b9c9c17039ab6 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 8 Apr 2025 17:21:48 +0300 Subject: [PATCH 12/12] Fix __sizeof__. --- Lib/test/test_pickle.py | 2 +- Modules/_pickle.c | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/Lib/test/test_pickle.py b/Lib/test/test_pickle.py index 4060949e449b85..c88154c27bf7a0 100644 --- a/Lib/test/test_pickle.py +++ b/Lib/test/test_pickle.py @@ -477,7 +477,7 @@ def test_pickler(self): 0) # Write buffer is cleared after every dump(). def test_unpickler(self): - basesize = support.calcobjsize('2P2n2P 2P2n2i5P 2P3n8P2n2i') + basesize = support.calcobjsize('2P2n3P 2P2n2i5P 2P3n8P2n2i') unpickler = _pickle.Unpickler P = struct.calcsize('P') # Size of memo table entry. n = struct.calcsize('n') # Size of mark table entry. 
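
The one-character change to the layout string in test_unpickler() above reflects that the C Unpickler struct grew one pointer-sized field, presumably the memo_dict pointer introduced earlier in this series. A quick sanity check of that reading (the old/new names below are only for illustration):

import struct

old = struct.calcsize('2P2n2P 2P2n2i5P 2P3n8P2n2i')
new = struct.calcsize('2P2n3P 2P2n2i5P 2P3n8P2n2i')
# The difference is exactly one pointer, on both 32-bit and 64-bit builds.
assert new - old == struct.calcsize('P')
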
diff --git a/Modules/_pickle.c b/Modules/_pickle.c index 1423092df8f412..7a14fdc6302bd1 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -7311,13 +7311,6 @@ _pickle_Unpickler___sizeof___impl(UnpicklerObject *self) size_t res = _PyObject_SIZE(Py_TYPE(self)); if (self->memo != NULL) res += self->memo_size * sizeof(PyObject *); - if (self->memo_dict != NULL) { - size_t s = _PySys_GetSizeOf(self->memo_dict); - if (s == (size_t)-1) { - return -1; - } - res += s; - } if (self->marks != NULL) res += (size_t)self->marks_size * sizeof(Py_ssize_t); if (self->input_line != NULL)
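
Taken together, the memo changes in [PATCH 07/12] leave the C unpickler with a two-level memo: a flat array for dense indices, plus a dict fallback once a PUT index would more than double the array (the pure-Python Unpickler already uses a plain dict). The class below is a hypothetical pure-Python model of that strategy, written only to illustrate _Unpickler_MemoPut/_Unpickler_MemoGet; it is not code from this series.

class HybridMemo:
    """Illustrative model of the hybrid memo used by the C unpickler."""

    def __init__(self):
        self._dense = [None] * 32   # fast path: contiguous indices
        self._sparse = {}           # fallback: widely spaced (crafted) indices
        self._len = 0               # number of values stored so far

    def put(self, idx, value):
        if idx >= len(self._dense):
            if idx > self._len * 2:
                # Too sparse: a dict entry avoids resizing the array to an
                # attacker-controlled size (mirrors idx > memo_len * 2 in C).
                self._sparse[idx] = value
                self._len += 1
                return
            # Grow geometrically, like _Unpickler_ResizeMemoList.
            self._dense.extend([None] * (2 * idx - len(self._dense)))
        self._dense[idx] = value
        self._len += 1

    def get(self, idx):
        if idx < len(self._dense) and self._dense[idx] is not None:
            return self._dense[idx]
        return self._sparse[idx]    # missing key -> "Memo value not found"


memo = HybridMemo()
memo.put(10**12, [])        # LONG_BINPUT-style huge index: no huge allocation
memo.put(0, "first")        # ordinary MEMOIZE-style index: stays in the array
assert memo.get(10**12) == [] and memo.get(0) == "first"

Because GET, BINGET and LONG_BINGET must now consult both structures, _Unpickler_MemoGet takes the module state and raises UnpicklingError itself when an index is missing, as the tests added in this series exercise.
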