diff --git a/Doc/library/json.rst b/Doc/library/json.rst index 4e7046d6d8f6ac..2358c3f3530003 100644 --- a/Doc/library/json.rst +++ b/Doc/library/json.rst @@ -260,7 +260,7 @@ Basic Usage .. function:: load(fp, *, cls=None, object_hook=None, parse_float=None, \ parse_int=None, parse_constant=None, \ - object_pairs_hook=None, **kw) + object_pairs_hook=None, cache_keys=True, **kw) Deserialize *fp* to a Python object using the :ref:`JSON-to-Python conversion table <json-to-py-table>`. @@ -321,6 +321,11 @@ Basic Usage Default ``None``. :type parse_constant: :term:`callable` | None + :param bool cache_keys: + If true, repeated keys are re-used across dictionaries, leading + to lower memory usage but slower decoding. + Default ``True``. + :raises JSONDecodeError: When the data being deserialized is not a valid JSON document. @@ -345,7 +350,11 @@ Basic Usage conversion length limitation <int_max_str_digits>` to help avoid denial of service attacks. -.. function:: loads(s, *, cls=None, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, object_pairs_hook=None, **kw) + .. versionchanged:: next + + * Added the optional *cache_keys* parameter. + +.. function:: loads(s, *, cls=None, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, object_pairs_hook=None, cache_keys=True, **kw) Identical to :func:`load`, but instead of a file-like object, deserialize *s* (a :class:`str`, :class:`bytes` or :class:`bytearray` diff --git a/Lib/json/__init__.py b/Lib/json/__init__.py index 1d972d22ded072..6deafcadbce63a 100644 --- a/Lib/json/__init__.py +++ b/Lib/json/__init__.py @@ -272,7 +272,8 @@ def detect_encoding(b): def load(fp, *, cls=None, object_hook=None, parse_float=None, - parse_int=None, parse_constant=None, object_pairs_hook=None, **kw): + parse_int=None, parse_constant=None, object_pairs_hook=None, + cache_keys=True, **kw): """Deserialize ``fp`` (a ``.read()``-supporting file-like object containing a JSON document) to a Python object. 
@@ -293,11 +294,13 @@ def load(fp, *, cls=None, object_hook=None, parse_float=None, return loads(fp.read(), cls=cls, object_hook=object_hook, parse_float=parse_float, parse_int=parse_int, - parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw) + parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, + cache_keys=cache_keys, **kw) def loads(s, *, cls=None, object_hook=None, parse_float=None, - parse_int=None, parse_constant=None, object_pairs_hook=None, **kw): + parse_int=None, parse_constant=None, object_pairs_hook=None, + cache_keys=True, **kw): """Deserialize ``s`` (a ``str``, ``bytes`` or ``bytearray`` instance containing a JSON document) to a Python object. @@ -327,6 +330,9 @@ def loads(s, *, cls=None, object_hook=None, parse_float=None, This can be used to raise an exception if invalid JSON numbers are encountered. + If ``cache_keys`` is true, repeated keys are re-used across + dictionaries, leading to lower memory usage but slower decoding. + To use a custom ``JSONDecoder`` subclass, specify it with the ``cls`` kwarg; otherwise ``JSONDecoder`` is used. 
""" @@ -342,7 +348,8 @@ def loads(s, *, cls=None, object_hook=None, parse_float=None, if (cls is None and object_hook is None and parse_int is None and parse_float is None and - parse_constant is None and object_pairs_hook is None and not kw): + parse_constant is None and object_pairs_hook is None and + cache_keys and not kw): return _default_decoder.decode(s) if cls is None: cls = JSONDecoder @@ -356,4 +363,6 @@ def loads(s, *, cls=None, object_hook=None, parse_float=None, kw['parse_int'] = parse_int if parse_constant is not None: kw['parse_constant'] = parse_constant + if not cache_keys: + kw['cache_keys'] = cache_keys return cls(**kw).decode(s) diff --git a/Lib/json/decoder.py b/Lib/json/decoder.py index ff4bfcdcc407b9..5c7042ed19d023 100644 --- a/Lib/json/decoder.py +++ b/Lib/json/decoder.py @@ -135,7 +135,7 @@ def py_scanstring(s, end, strict=True, def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook, - memo=None, _w=WHITESPACE.match, _ws=WHITESPACE_STR): + memo=None, cache_keys=True, _w=WHITESPACE.match, _ws=WHITESPACE_STR): s, end = s_and_end pairs = [] pairs_append = pairs.append @@ -166,7 +166,8 @@ def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook, end += 1 while True: key, end = scanstring(s, end, strict) - key = memo_get(key, key) + if cache_keys: + key = memo_get(key, key) # To skip some function call overhead we optimize the fast paths where # the JSON key separator is ": " or just ":". if s[end:end + 1] != ':': @@ -291,7 +292,7 @@ class JSONDecoder(object): def __init__(self, *, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, strict=True, - object_pairs_hook=None): + object_pairs_hook=None, cache_keys=True): """``object_hook``, if specified, will be called with the result of every JSON object decoded and its return value will be used in place of the given ``dict``. 
This can be used to provide custom @@ -323,6 +324,9 @@ def __init__(self, *, object_hook=None, parse_float=None, characters will be allowed inside strings. Control characters in this context are those with character codes in the 0-31 range, including ``'\\t'`` (tab), ``'\\n'``, ``'\\r'`` and ``'\\0'``. + + If ``cache_keys`` is true, repeated keys are re-used across + dictionaries, leading to lower memory usage but slower decoding. """ self.object_hook = object_hook self.parse_float = parse_float or float @@ -330,6 +334,7 @@ def __init__(self, *, object_hook=None, parse_float=None, self.parse_constant = parse_constant or _CONSTANTS.__getitem__ self.strict = strict self.object_pairs_hook = object_pairs_hook + self.cache_keys = cache_keys self.parse_object = JSONObject self.parse_array = JSONArray self.parse_string = scanstring diff --git a/Lib/json/scanner.py b/Lib/json/scanner.py index 090897515fe2f3..a3e904b3b1d788 100644 --- a/Lib/json/scanner.py +++ b/Lib/json/scanner.py @@ -18,6 +18,7 @@ def py_make_scanner(context): parse_string = context.parse_string match_number = NUMBER_RE.match strict = context.strict + cache_keys = context.cache_keys parse_float = context.parse_float parse_int = context.parse_int parse_constant = context.parse_constant @@ -35,7 +36,7 @@ def _scan_once(string, idx): return parse_string(string, idx + 1, strict) elif nextchar == '{': return parse_object((string, idx + 1), strict, - _scan_once, object_hook, object_pairs_hook, memo) + _scan_once, object_hook, object_pairs_hook, memo, cache_keys) elif nextchar == '[': return parse_array((string, idx + 1), _scan_once) elif nextchar == 'n' and string[idx:idx + 4] == 'null': diff --git a/Lib/test/test_json/test_decode.py b/Lib/test/test_json/test_decode.py index 2250af964c022b..d3cea0809ba517 100644 --- a/Lib/test/test_json/test_decode.py +++ b/Lib/test/test_json/test_decode.py @@ -1,6 +1,7 @@ import decimal from io import StringIO from collections import OrderedDict +from functools 
import partial from test.test_json import PyTest, CTest from test import support @@ -89,6 +90,19 @@ def test_keys_reuse(self): self.check_keys_reuse(s, decoder.decode) self.assertFalse(decoder.memo) + def check_no_keys_reuse(self, source, loads): + rval = loads(source) + (a, b), (c, d) = sorted(rval[0]), sorted(rval[1]) + self.assertIsNot(a, c) + self.assertIsNot(b, d) + + def test_no_keys_reuse(self): + s = '[{"a_key": 1, "b_\xe9": 2}, {"a_key": 3, "b_\xe9": 4}]' + self.check_no_keys_reuse(s, partial(self.loads, cache_keys=False)) + decoder = self.json.decoder.JSONDecoder(cache_keys=False) + self.check_no_keys_reuse(s, decoder.decode) + self.assertFalse(decoder.memo) + def test_extra_data(self): s = '[1, 2, 3]5' msg = 'Extra data' diff --git a/Misc/NEWS.d/next/Library/2025-04-01-07-18-10.gh-issue-131955.o-v72F.rst b/Misc/NEWS.d/next/Library/2025-04-01-07-18-10.gh-issue-131955.o-v72F.rst new file mode 100644 index 00000000000000..030ce7516fed42 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-04-01-07-18-10.gh-issue-131955.o-v72F.rst @@ -0,0 +1 @@ +Allow disabling internal string key caching in :func:`json.load` and :func:`json.loads` with the new *cache_keys* parameter. 
diff --git a/Modules/_json.c b/Modules/_json.c index cd8e697916226b..04c67e1df14eee 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -21,6 +21,7 @@ typedef struct _PyScannerObject { PyObject_HEAD signed char strict; + signed char cache_keys; PyObject *object_hook; PyObject *object_pairs_hook; PyObject *parse_float; @@ -32,6 +33,7 @@ typedef struct _PyScannerObject { static PyMemberDef scanner_members[] = { {"strict", Py_T_BOOL, offsetof(PyScannerObject, strict), Py_READONLY, "strict"}, + {"cache_keys", Py_T_BOOL, offsetof(PyScannerObject, cache_keys), Py_READONLY, "cache_keys"}, {"object_hook", _Py_T_OBJECT, offsetof(PyScannerObject, object_hook), Py_READONLY, "object_hook"}, {"object_pairs_hook", _Py_T_OBJECT, offsetof(PyScannerObject, object_pairs_hook), Py_READONLY}, {"parse_float", _Py_T_OBJECT, offsetof(PyScannerObject, parse_float), Py_READONLY, "parse_float"}, @@ -710,10 +712,12 @@ _parse_object_unicode(PyScannerObject *s, PyObject *memo, PyObject *pystr, Py_ss key = scanstring_unicode(pystr, idx + 1, s->strict, &next_idx); if (key == NULL) goto bail; - if (PyDict_SetDefaultRef(memo, key, key, &memokey) < 0) { - goto bail; + if (memo != Py_None) { + if (PyDict_SetDefaultRef(memo, key, key, &memokey) < 0) { + goto bail; + } + Py_SETREF(key, memokey); } - Py_SETREF(key, memokey); idx = next_idx; /* skip whitespace between key and : delimiter, read :, skip whitespace */ @@ -1124,8 +1128,10 @@ scan_once_unicode(PyScannerObject *s, PyObject *memo, PyObject *pystr, Py_ssize_ } static PyObject * -scanner_call(PyObject *self, PyObject *args, PyObject *kwds) +scanner_call(PyObject *op, PyObject *args, PyObject *kwds) { + PyScannerObject *self = PyScannerObject_CAST(op); + /* Python callable interface to scan_once_{str,unicode} */ PyObject *pystr; PyObject *rval; @@ -1142,12 +1148,17 @@ scanner_call(PyObject *self, PyObject *args, PyObject *kwds) return NULL; } - PyObject *memo = PyDict_New(); - if (memo == NULL) { - return NULL; + PyObject *memo; + if 
(self->cache_keys) { + memo = PyDict_New(); + if (memo == NULL) { + return NULL; + } + } + else { + memo = Py_None; } - rval = scan_once_unicode(PyScannerObject_CAST(self), - memo, pystr, idx, &next_idx); + rval = scan_once_unicode(self, memo, pystr, idx, &next_idx); Py_DECREF(memo); if (rval == NULL) return NULL; @@ -1160,6 +1171,7 @@ scanner_new(PyTypeObject *type, PyObject *args, PyObject *kwds) PyScannerObject *s; PyObject *ctx; PyObject *strict; + PyObject *cache_keys; static char *kwlist[] = {"context", NULL}; if (!PyArg_ParseTupleAndKeywords(args, kwds, "O:make_scanner", kwlist, &ctx)) @@ -1178,6 +1190,13 @@ scanner_new(PyTypeObject *type, PyObject *args, PyObject *kwds) Py_DECREF(strict); if (s->strict < 0) goto bail; + cache_keys = PyObject_GetAttrString(ctx, "cache_keys"); + if (cache_keys == NULL) + goto bail; + s->cache_keys = PyObject_IsTrue(cache_keys); + Py_DECREF(cache_keys); + if (s->cache_keys < 0) + goto bail; s->object_hook = PyObject_GetAttrString(ctx, "object_hook"); if (s->object_hook == NULL) goto bail;