Skip to content

Commit dec624e

Browse files
nineteendo, blurb-it[bot], serhiy-storchaka, vstinner
authored
gh-135336: Add fast path to json string encoding (#133239)
Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
Co-authored-by: Serhiy Storchaka <[email protected]>
Co-authored-by: Victor Stinner <[email protected]>
1 parent 7ab68cd commit dec624e

File tree

2 files changed

+145
-44
lines changed

2 files changed

+145
-44
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
:mod:`json` now encodes strings up to 2.2x faster if they consist solely of characters that don’t require escaping.

Modules/_json.c

Lines changed: 144 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ typedef struct _PyEncoderObject {
5151
char sort_keys;
5252
char skipkeys;
5353
int allow_nan;
54-
PyCFunction fast_encode;
54+
int (*fast_encode)(PyUnicodeWriter *, PyObject *);
5555
} PyEncoderObject;
5656

5757
#define PyEncoderObject_CAST(op) ((PyEncoderObject *)(op))
@@ -102,8 +102,10 @@ static PyObject *
102102
_encoded_const(PyObject *obj);
103103
static void
104104
raise_errmsg(const char *msg, PyObject *s, Py_ssize_t end);
105-
static PyObject *
106-
encoder_encode_string(PyEncoderObject *s, PyObject *obj);
105+
static int
106+
_steal_accumulate(PyUnicodeWriter *writer, PyObject *stolen);
107+
static int
108+
encoder_write_string(PyEncoderObject *s, PyUnicodeWriter *writer, PyObject *obj);
107109
static PyObject *
108110
encoder_encode_float(PyEncoderObject *s, PyObject *obj);
109111

@@ -146,22 +148,11 @@ ascii_escape_unichar(Py_UCS4 c, unsigned char *output, Py_ssize_t chars)
146148
return chars;
147149
}
148150

149-
static PyObject *
150-
ascii_escape_unicode(PyObject *pystr)
151+
static Py_ssize_t
152+
ascii_escape_size(const void *input, int kind, Py_ssize_t input_chars)
151153
{
152-
/* Take a PyUnicode pystr and return a new ASCII-only escaped PyUnicode */
153154
Py_ssize_t i;
154-
Py_ssize_t input_chars;
155155
Py_ssize_t output_size;
156-
Py_ssize_t chars;
157-
PyObject *rval;
158-
const void *input;
159-
Py_UCS1 *output;
160-
int kind;
161-
162-
input_chars = PyUnicode_GET_LENGTH(pystr);
163-
input = PyUnicode_DATA(pystr);
164-
kind = PyUnicode_KIND(pystr);
165156

166157
/* Compute the output size */
167158
for (i = 0, output_size = 2; i < input_chars; i++) {
@@ -181,11 +172,22 @@ ascii_escape_unicode(PyObject *pystr)
181172
}
182173
if (output_size > PY_SSIZE_T_MAX - d) {
183174
PyErr_SetString(PyExc_OverflowError, "string is too long to escape");
184-
return NULL;
175+
return -1;
185176
}
186177
output_size += d;
187178
}
188179

180+
return output_size;
181+
}
182+
183+
static PyObject *
184+
ascii_escape_unicode_and_size(const void *input, int kind, Py_ssize_t input_chars, Py_ssize_t output_size)
185+
{
186+
Py_ssize_t i;
187+
Py_ssize_t chars;
188+
PyObject *rval;
189+
Py_UCS1 *output;
190+
189191
rval = PyUnicode_New(output_size, 127);
190192
if (rval == NULL) {
191193
return NULL;
@@ -210,23 +212,62 @@ ascii_escape_unicode(PyObject *pystr)
210212
}
211213

212214
static PyObject *
213-
escape_unicode(PyObject *pystr)
215+
ascii_escape_unicode(PyObject *pystr)
216+
{
217+
/* Take a PyUnicode pystr and return a new ASCII-only escaped PyUnicode */
218+
Py_ssize_t input_chars = PyUnicode_GET_LENGTH(pystr);
219+
const void *input = PyUnicode_DATA(pystr);
220+
int kind = PyUnicode_KIND(pystr);
221+
222+
Py_ssize_t output_size = ascii_escape_size(input, kind, input_chars);
223+
if (output_size < 0) {
224+
return NULL;
225+
}
226+
227+
return ascii_escape_unicode_and_size(input, kind, input_chars, output_size);
228+
}
229+
230+
static int
231+
write_escaped_ascii(PyUnicodeWriter *writer, PyObject *pystr)
214232
{
215-
/* Take a PyUnicode pystr and return a new escaped PyUnicode */
216-
Py_ssize_t i;
217233
Py_ssize_t input_chars;
218-
Py_ssize_t output_size;
219-
Py_ssize_t chars;
220-
PyObject *rval;
221234
const void *input;
222235
int kind;
223-
Py_UCS4 maxchar;
224236

225-
maxchar = PyUnicode_MAX_CHAR_VALUE(pystr);
226237
input_chars = PyUnicode_GET_LENGTH(pystr);
227238
input = PyUnicode_DATA(pystr);
228239
kind = PyUnicode_KIND(pystr);
229240

241+
Py_ssize_t output_size = ascii_escape_size(input, kind, input_chars);
242+
if (output_size < 0) {
243+
return -1;
244+
}
245+
246+
if (output_size == input_chars + 2) {
247+
/* No need to escape anything */
248+
if (PyUnicodeWriter_WriteChar(writer, '"') < 0) {
249+
return -1;
250+
}
251+
if (PyUnicodeWriter_WriteStr(writer, pystr) < 0) {
252+
return -1;
253+
}
254+
return PyUnicodeWriter_WriteChar(writer, '"');
255+
}
256+
257+
PyObject *rval = ascii_escape_unicode_and_size(input, kind, input_chars, output_size);
258+
if (rval == NULL) {
259+
return -1;
260+
}
261+
262+
return _steal_accumulate(writer, rval);
263+
}
264+
265+
static Py_ssize_t
266+
escape_size(const void *input, int kind, Py_ssize_t input_chars)
267+
{
268+
Py_ssize_t i;
269+
Py_ssize_t output_size;
270+
230271
/* Compute the output size */
231272
for (i = 0, output_size = 2; i < input_chars; i++) {
232273
Py_UCS4 c = PyUnicode_READ(kind, input, i);
@@ -244,11 +285,21 @@ escape_unicode(PyObject *pystr)
244285
}
245286
if (output_size > PY_SSIZE_T_MAX - d) {
246287
PyErr_SetString(PyExc_OverflowError, "string is too long to escape");
247-
return NULL;
288+
return -1;
248289
}
249290
output_size += d;
250291
}
251292

293+
return output_size;
294+
}
295+
296+
static PyObject *
297+
escape_unicode_and_size(const void *input, int kind, Py_UCS4 maxchar, Py_ssize_t input_chars, Py_ssize_t output_size)
298+
{
299+
Py_ssize_t i;
300+
Py_ssize_t chars;
301+
PyObject *rval;
302+
252303
rval = PyUnicode_New(output_size, maxchar);
253304
if (rval == NULL)
254305
return NULL;
@@ -303,6 +354,55 @@ escape_unicode(PyObject *pystr)
303354
return rval;
304355
}
305356

357+
static PyObject *
358+
escape_unicode(PyObject *pystr)
359+
{
360+
/* Take a PyUnicode pystr and return a new escaped PyUnicode */
361+
Py_ssize_t input_chars = PyUnicode_GET_LENGTH(pystr);
362+
const void *input = PyUnicode_DATA(pystr);
363+
int kind = PyUnicode_KIND(pystr);
364+
Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(pystr);
365+
366+
Py_ssize_t output_size = escape_size(input, kind, input_chars);
367+
if (output_size < 0) {
368+
return NULL;
369+
}
370+
371+
return escape_unicode_and_size(input, kind, maxchar, input_chars, output_size);
372+
}
373+
374+
static int
375+
write_escaped_unicode(PyUnicodeWriter *writer, PyObject *pystr)
376+
{
377+
Py_ssize_t input_chars = PyUnicode_GET_LENGTH(pystr);
378+
const void *input = PyUnicode_DATA(pystr);
379+
int kind = PyUnicode_KIND(pystr);
380+
Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(pystr);
381+
382+
Py_ssize_t output_size = escape_size(input, kind, input_chars);
383+
if (output_size < 0) {
384+
return -1;
385+
}
386+
387+
if (output_size == input_chars + 2) {
388+
/* No need to escape anything */
389+
if (PyUnicodeWriter_WriteChar(writer, '"') < 0) {
390+
return -1;
391+
}
392+
if (PyUnicodeWriter_WriteStr(writer, pystr) < 0) {
393+
return -1;
394+
}
395+
return PyUnicodeWriter_WriteChar(writer, '"');
396+
}
397+
398+
PyObject *rval = escape_unicode_and_size(input, kind, maxchar, input_chars, output_size);
399+
if (rval == NULL) {
400+
return -1;
401+
}
402+
403+
return _steal_accumulate(writer, rval);
404+
}
405+
306406
static void
307407
raise_errmsg(const char *msg, PyObject *s, Py_ssize_t end)
308408
{
@@ -1256,8 +1356,11 @@ encoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12561356

12571357
if (PyCFunction_Check(s->encoder)) {
12581358
PyCFunction f = PyCFunction_GetFunction(s->encoder);
1259-
if (f == py_encode_basestring_ascii || f == py_encode_basestring) {
1260-
s->fast_encode = f;
1359+
if (f == py_encode_basestring_ascii) {
1360+
s->fast_encode = write_escaped_ascii;
1361+
}
1362+
else if (f == py_encode_basestring) {
1363+
s->fast_encode = write_escaped_unicode;
12611364
}
12621365
}
12631366

@@ -1438,24 +1541,27 @@ encoder_encode_float(PyEncoderObject *s, PyObject *obj)
14381541
return PyFloat_Type.tp_repr(obj);
14391542
}
14401543

1441-
static PyObject *
1442-
encoder_encode_string(PyEncoderObject *s, PyObject *obj)
1544+
static int
1545+
encoder_write_string(PyEncoderObject *s, PyUnicodeWriter *writer, PyObject *obj)
14431546
{
14441547
/* Return the JSON representation of a string */
14451548
PyObject *encoded;
14461549

14471550
if (s->fast_encode) {
1448-
return s->fast_encode(NULL, obj);
1551+
return s->fast_encode(writer, obj);
14491552
}
14501553
encoded = PyObject_CallOneArg(s->encoder, obj);
1451-
if (encoded != NULL && !PyUnicode_Check(encoded)) {
1554+
if (encoded == NULL) {
1555+
return -1;
1556+
}
1557+
if (!PyUnicode_Check(encoded)) {
14521558
PyErr_Format(PyExc_TypeError,
14531559
"encoder() must return a string, not %.80s",
14541560
Py_TYPE(encoded)->tp_name);
14551561
Py_DECREF(encoded);
1456-
return NULL;
1562+
return -1;
14571563
}
1458-
return encoded;
1564+
return _steal_accumulate(writer, encoded);
14591565
}
14601566

14611567
static int
@@ -1486,10 +1592,7 @@ encoder_listencode_obj(PyEncoderObject *s, PyUnicodeWriter *writer,
14861592
return PyUnicodeWriter_WriteASCII(writer, "false", 5);
14871593
}
14881594
else if (PyUnicode_Check(obj)) {
1489-
PyObject *encoded = encoder_encode_string(s, obj);
1490-
if (encoded == NULL)
1491-
return -1;
1492-
return _steal_accumulate(writer, encoded);
1595+
return encoder_write_string(s, writer, obj);
14931596
}
14941597
else if (PyLong_Check(obj)) {
14951598
if (PyLong_CheckExact(obj)) {
@@ -1578,7 +1681,7 @@ encoder_encode_key_value(PyEncoderObject *s, PyUnicodeWriter *writer, bool *firs
15781681
PyObject *item_separator)
15791682
{
15801683
PyObject *keystr = NULL;
1581-
PyObject *encoded;
1684+
int rv;
15821685

15831686
if (PyUnicode_Check(key)) {
15841687
keystr = Py_NewRef(key);
@@ -1624,13 +1727,10 @@ encoder_encode_key_value(PyEncoderObject *s, PyUnicodeWriter *writer, bool *firs
16241727
}
16251728
}
16261729

1627-
encoded = encoder_encode_string(s, keystr);
1730+
rv = encoder_write_string(s, writer, keystr);
16281731
Py_DECREF(keystr);
1629-
if (encoded == NULL) {
1630-
return -1;
1631-
}
16321732

1633-
if (_steal_accumulate(writer, encoded) < 0) {
1733+
if (rv < 0) {
16341734
return -1;
16351735
}
16361736
if (PyUnicodeWriter_WriteStr(writer, s->key_separator) < 0) {

0 commit comments

Comments
 (0)