diff --git a/Lib/pickle.py b/Lib/pickle.py index 4fa3632d1a738f..acacede994f51a 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -190,6 +190,11 @@ def __init__(self, value): __all__.extend(x for x in dir() if x.isupper() and not x.startswith('_')) +# Data larger than this will be read in chunks, to prevent extreme +# overallocation. +_MIN_READ_BUF_SIZE = (1 << 20) + + class _Framer: _FRAME_SIZE_MIN = 4 @@ -288,7 +293,7 @@ def read(self, n): "pickle exhausted before end of frame") return data else: - return self.file_read(n) + return self._chunked_file_read(n) def readline(self): if self.current_frame: @@ -303,11 +308,23 @@ def readline(self): else: return self.file_readline() + def _chunked_file_read(self, size): + cursize = min(size, _MIN_READ_BUF_SIZE) + b = self.file_read(cursize) + while cursize < size and len(b) == cursize: + delta = min(cursize, size - cursize) + b += self.file_read(delta) + cursize += delta + return b + def load_frame(self, frame_size): if self.current_frame and self.current_frame.read() != b'': raise UnpicklingError( "beginning of a new frame before end of current frame") - self.current_frame = io.BytesIO(self.file_read(frame_size)) + data = self._chunked_file_read(frame_size) + if len(data) < frame_size: + raise EOFError + self.current_frame = io.BytesIO(data) # Tools used for pickling. @@ -1497,12 +1514,17 @@ def load_binbytes8(self): dispatch[BINBYTES8[0]] = load_binbytes8 def load_bytearray8(self): - len, = unpack(' maxsize: + size, = unpack(' maxsize: raise UnpicklingError("BYTEARRAY8 exceeds system's maximum size " "of %d bytes" % maxsize) - b = bytearray(len) - self.readinto(b) + cursize = min(size, _MIN_READ_BUF_SIZE) + b = bytearray(cursize) + if self.readinto(b) == cursize: + while cursize < size and len(b) == cursize: + delta = min(cursize, size - cursize) + b += self.read(delta) + cursize += delta self.append(b) dispatch[BYTEARRAY8[0]] = load_bytearray8 diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index bdc7ef62943a28..d13de25de648f3 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -74,6 +74,15 @@ def count_opcode(code, pickle): def identity(x): return x +def itersize(start, stop): + # Produce geometrical increasing sequence from start to stop + # (inclusively) for tests. + size = start + while size < stop: + yield size + size <<= 1 + yield stop + class UnseekableIO(io.BytesIO): def peek(self, *args): @@ -853,9 +862,8 @@ def assert_is_copy(self, obj, objcopy, msg=None): self.assertEqual(getattr(obj, slot, None), getattr(objcopy, slot, None), msg=msg) - def check_unpickling_error(self, errors, data): - with self.subTest(data=data), \ - self.assertRaises(errors): + def check_unpickling_error_strict(self, errors, data): + with self.assertRaises(errors): try: self.loads(data) except BaseException as exc: @@ -864,6 +872,10 @@ def check_unpickling_error(self, errors, data): (data, exc.__class__.__name__, exc)) raise + def check_unpickling_error(self, errors, data): + with self.subTest(data=data): + self.check_unpickling_error_strict(errors, data) + def test_load_from_data0(self): self.assert_is_copy(self._testdata, self.loads(DATA0)) @@ -1135,6 +1147,141 @@ def test_negative_32b_binput(self): dumped = b'\x80\x03X\x01\x00\x00\x00ar\xff\xff\xff\xff.' self.check_unpickling_error(ValueError, dumped) + def test_too_large_put(self): + # Test that PUT with large id does not cause allocation of + # too large memo table. + data = lambda n: (b'((lp' + str(n).encode() + b'\n' + + b'g' + str(n).encode() + b'\nt.') + # 0: ( MARK + # 1: ( MARK + # 2: l LIST (MARK at 1) + # 3: p PUT 1000000000000 + # 18: g GET 1000000000000 + # 33: t TUPLE (MARK at 0) + # 34: . STOP + for idx in [10**6, 10**9, 10**12]: + if idx > sys.maxsize: + continue + self.assertEqual(self.loads(data(idx)), ([],)*2) + + def test_too_large_long_binput(self): + # Test that LONG_BINPUT with large id does not cause allocation of + # too large memo table. + data = lambda n: (b'(]r' + struct.pack(' sys.maxsize')) + + def test_truncated_large_binunicode8(self): + data = lambda size: b'\x8d' + struct.pack('readinto) { + /* readinto() not supported on file-like object, fall back to read() + * and copy into destination buffer (bpo-39681) */ + PyObject* len = PyLong_FromSsize_t(n); + if (len == NULL) { + return -1; + } + PyObject* data = _Pickle_FastCall(self->read, len); + if (data == NULL) { + return -1; + } + if (!PyBytes_Check(data)) { + PyErr_Format(PyExc_ValueError, + "read() returned non-bytes object (%R)", + Py_TYPE(data)); + Py_DECREF(data); + return -1; + } + Py_ssize_t read_size = PyBytes_GET_SIZE(data); + if (read_size < n) { + Py_DECREF(data); + return bad_readline(state); + } + memcpy(buf, PyBytes_AS_STRING(data), n); + Py_DECREF(data); + return n; + } + + /* Call readinto() into user buffer */ + PyObject *buf_obj = PyMemoryView_FromMemory(buf, n, PyBUF_WRITE); + if (buf_obj == NULL) { + return -1; + } + PyObject *read_size_obj = _Pickle_FastCall(self->readinto, buf_obj); + if (read_size_obj == NULL) { + return -1; + } + Py_ssize_t read_size = PyLong_AsSsize_t(read_size_obj); + Py_DECREF(read_size_obj); + + if (read_size < 0) { + if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ValueError, + "readinto() returned negative size"); + } + return -1; + } + if (read_size < n) { + return bad_readline(state); + } + return n; +} + /* If reading from a file, we need to only pull the bytes we need, since there may be multiple pickle objects arranged contiguously in the same input buffer. @@ -1269,7 +1333,7 @@ static const Py_ssize_t READ_WHOLE_LINE = -1; causing the Unpickler to go back to the file for more data. Use the returned size to tell you how much data you can process. */ static Py_ssize_t -_Unpickler_ReadFromFile(UnpicklerObject *self, Py_ssize_t n) +_Unpickler_ReadFromFile(PickleState *state, UnpicklerObject *self, Py_ssize_t n) { PyObject *data; Py_ssize_t read_size; @@ -1281,6 +1345,9 @@ _Unpickler_ReadFromFile(UnpicklerObject *self, Py_ssize_t n) if (n == READ_WHOLE_LINE) { data = PyObject_CallNoArgs(self->readline); + if (data == NULL) { + return -1; + } } else { PyObject *len; @@ -1309,13 +1376,28 @@ _Unpickler_ReadFromFile(UnpicklerObject *self, Py_ssize_t n) return n; } } - len = PyLong_FromSsize_t(n); + Py_ssize_t cursize = Py_MIN(n, MIN_READ_BUF_SIZE); + len = PyLong_FromSsize_t(cursize); if (len == NULL) return -1; data = _Pickle_FastCall(self->read, len); + if (data == NULL) { + return -1; + } + while (cursize < n) { + Py_ssize_t prevsize = cursize; + cursize += Py_MIN(cursize, n - cursize); + if (_PyBytes_Resize(&data, cursize) < 0) { + return -1; + } + if (_Unpickler_ReadIntoFromFile(state, self, + PyBytes_AS_STRING(data) + prevsize, cursize - prevsize) < 0) + { + Py_DECREF(data); + return -1; + } + } } - if (data == NULL) - return -1; read_size = _Unpickler_SetStringInput(self, data); Py_DECREF(data); @@ -1342,7 +1424,7 @@ _Unpickler_ReadImpl(UnpicklerObject *self, PickleState *st, char **s, Py_ssize_t return bad_readline(st); /* Extend the buffer to satisfy desired size */ - num_read = _Unpickler_ReadFromFile(self, n); + num_read = _Unpickler_ReadFromFile(st, self, n); if (num_read < 0) return -1; if (num_read < n) @@ -1389,57 +1471,7 @@ _Unpickler_ReadInto(PickleState *state, UnpicklerObject *self, char *buf, return -1; } - if (!self->readinto) { - /* readinto() not supported on file-like object, fall back to read() - * and copy into destination buffer (bpo-39681) */ - PyObject* len = PyLong_FromSsize_t(n); - if (len == NULL) { - return -1; - } - PyObject* data = _Pickle_FastCall(self->read, len); - if (data == NULL) { - return -1; - } - if (!PyBytes_Check(data)) { - PyErr_Format(PyExc_ValueError, - "read() returned non-bytes object (%R)", - Py_TYPE(data)); - Py_DECREF(data); - return -1; - } - Py_ssize_t read_size = PyBytes_GET_SIZE(data); - if (read_size < n) { - Py_DECREF(data); - return bad_readline(state); - } - memcpy(buf, PyBytes_AS_STRING(data), n); - Py_DECREF(data); - return n; - } - - /* Call readinto() into user buffer */ - PyObject *buf_obj = PyMemoryView_FromMemory(buf, n, PyBUF_WRITE); - if (buf_obj == NULL) { - return -1; - } - PyObject *read_size_obj = _Pickle_FastCall(self->readinto, buf_obj); - if (read_size_obj == NULL) { - return -1; - } - Py_ssize_t read_size = PyLong_AsSsize_t(read_size_obj); - Py_DECREF(read_size_obj); - - if (read_size < 0) { - if (!PyErr_Occurred()) { - PyErr_SetString(PyExc_ValueError, - "readinto() returned negative size"); - } - return -1; - } - if (read_size < n) { - return bad_readline(state); - } - return n; + return _Unpickler_ReadIntoFromFile(state, self, buf, n); } /* Read `n` bytes from the unpickler's data source, storing the result in `*s`. @@ -1499,7 +1531,7 @@ _Unpickler_Readline(PickleState *state, UnpicklerObject *self, char **result) if (!self->read) return bad_readline(state); - num_read = _Unpickler_ReadFromFile(self, READ_WHOLE_LINE); + num_read = _Unpickler_ReadFromFile(state, self, READ_WHOLE_LINE); if (num_read < 0) return -1; if (num_read == 0 || self->input_buffer[num_read - 1] != '\n') @@ -1532,12 +1564,34 @@ _Unpickler_ResizeMemoList(UnpicklerObject *self, size_t new_size) /* Returns NULL if idx is out of bounds. */ static PyObject * -_Unpickler_MemoGet(UnpicklerObject *self, size_t idx) +_Unpickler_MemoGet(PickleState *st, UnpicklerObject *self, size_t idx) { - if (idx >= self->memo_size) - return NULL; - - return self->memo[idx]; + PyObject *value; + if (idx < self->memo_size) { + value = self->memo[idx]; + if (value != NULL) { + return value; + } + } + if (self->memo_dict != NULL) { + PyObject *key = PyLong_FromSize_t(idx); + if (key == NULL) { + return NULL; + } + if (idx < self->memo_size) { + (void)PyDict_Pop(self->memo_dict, key, &value); + self->memo[idx] = value; + } + else { + value = PyDict_GetItemWithError(self->memo_dict, key); + } + Py_DECREF(key); + if (value != NULL || PyErr_Occurred()) { + return value; + } + } + PyErr_Format(st->UnpicklingError, "Memo value not found at index %zd", idx); + return NULL; } /* Returns -1 (with an exception set) on failure, 0 on success. @@ -1548,6 +1602,27 @@ _Unpickler_MemoPut(UnpicklerObject *self, size_t idx, PyObject *value) PyObject *old_item; if (idx >= self->memo_size) { + if (idx > self->memo_len * 2) { + /* The memo keys are too sparse. Use a dict instead of + * a continuous array for the memo. */ + if (self->memo_dict == NULL) { + self->memo_dict = PyDict_New(); + if (self->memo_dict == NULL) { + return -1; + } + } + PyObject *key = PyLong_FromSize_t(idx); + if (key == NULL) { + return -1; + } + + if (PyDict_SetItem(self->memo_dict, key, value) < 0) { + Py_DECREF(key); + return -1; + } + Py_DECREF(key); + return 0; + } if (_Unpickler_ResizeMemoList(self, idx * 2) < 0) return -1; assert(idx < self->memo_size); @@ -1617,6 +1692,7 @@ _Unpickler_New(PyObject *module) self->memo = memo; self->memo_size = MEMO_SIZE; self->memo_len = 0; + self->memo_dict = NULL; self->persistent_load = NULL; self->persistent_load_attr = NULL; memset(&self->buffer, 0, sizeof(Py_buffer)); @@ -5590,13 +5666,28 @@ load_counted_binbytes(PickleState *state, UnpicklerObject *self, int nbytes) return -1; } - bytes = PyBytes_FromStringAndSize(NULL, size); - if (bytes == NULL) - return -1; - if (_Unpickler_ReadInto(state, self, PyBytes_AS_STRING(bytes), size) < 0) { - Py_DECREF(bytes); + Py_ssize_t cursize = Py_MIN(size, MIN_READ_BUF_SIZE); + Py_ssize_t prevsize = 0; + bytes = PyBytes_FromStringAndSize(NULL, cursize); + if (bytes == NULL) { return -1; } + while (1) { + if (_Unpickler_ReadInto(state, self, + PyBytes_AS_STRING(bytes) + prevsize, cursize - prevsize) < 0) + { + Py_DECREF(bytes); + return -1; + } + if (cursize >= size) { + break; + } + prevsize = cursize; + cursize += Py_MIN(cursize, size - cursize); + if (_PyBytes_Resize(&bytes, cursize) < 0) { + return -1; + } + } PDATA_PUSH(self->stack, bytes, -1); return 0; @@ -5621,14 +5712,27 @@ load_counted_bytearray(PickleState *state, UnpicklerObject *self) return -1; } - bytearray = PyByteArray_FromStringAndSize(NULL, size); + Py_ssize_t cursize = Py_MIN(size, MIN_READ_BUF_SIZE); + Py_ssize_t prevsize = 0; + bytearray = PyByteArray_FromStringAndSize(NULL, cursize); if (bytearray == NULL) { return -1; } - char *str = PyByteArray_AS_STRING(bytearray); - if (_Unpickler_ReadInto(state, self, str, size) < 0) { - Py_DECREF(bytearray); - return -1; + while (1) { + if (_Unpickler_ReadInto(state, self, + PyByteArray_AS_STRING(bytearray) + prevsize, + cursize - prevsize) < 0) { + Py_DECREF(bytearray); + return -1; + } + if (cursize >= size) { + break; + } + prevsize = cursize; + cursize += Py_MIN(cursize, size - cursize); + if (PyByteArray_Resize(bytearray, cursize) < 0) { + return -1; + } } PDATA_PUSH(self->stack, bytearray, -1); @@ -6230,20 +6334,15 @@ load_get(PickleState *st, UnpicklerObject *self) if (key == NULL) return -1; idx = PyLong_AsSsize_t(key); + Py_DECREF(key); if (idx == -1 && PyErr_Occurred()) { - Py_DECREF(key); return -1; } - value = _Unpickler_MemoGet(self, idx); + value = _Unpickler_MemoGet(st, self, idx); if (value == NULL) { - if (!PyErr_Occurred()) { - PyErr_Format(st->UnpicklingError, "Memo value not found at index %ld", idx); - } - Py_DECREF(key); return -1; } - Py_DECREF(key); PDATA_APPEND(self->stack, value, -1); return 0; @@ -6261,13 +6360,8 @@ load_binget(PickleState *st, UnpicklerObject *self) idx = Py_CHARMASK(s[0]); - value = _Unpickler_MemoGet(self, idx); + value = _Unpickler_MemoGet(st, self, idx); if (value == NULL) { - PyObject *key = PyLong_FromSsize_t(idx); - if (key != NULL) { - PyErr_Format(st->UnpicklingError, "Memo value not found at index %ld", idx); - Py_DECREF(key); - } return -1; } @@ -6287,13 +6381,8 @@ load_long_binget(PickleState *st, UnpicklerObject *self) idx = calc_binsize(s, 4); - value = _Unpickler_MemoGet(self, idx); + value = _Unpickler_MemoGet(st, self, idx); if (value == NULL) { - PyObject *key = PyLong_FromSsize_t(idx); - if (key != NULL) { - PyErr_Format(st->UnpicklingError, "Memo value not found at index %ld", idx); - Py_DECREF(key); - } return -1; } @@ -7258,6 +7347,7 @@ Unpickler_clear(PyObject *op) self->buffer.buf = NULL; } + Py_CLEAR(self->memo_dict); _Unpickler_MemoCleanup(self); PyMem_Free(self->marks); self->marks = NULL; @@ -7294,6 +7384,7 @@ Unpickler_traverse(PyObject *op, visitproc visit, void *arg) Py_VISIT(self->persistent_load); Py_VISIT(self->persistent_load_attr); Py_VISIT(self->buffers); + Py_VISIT(self->memo_dict); PyObject **memo = self->memo; if (memo) { Py_ssize_t i = self->memo_size;