Skip to content

Commit a0dcf05

Browse files
committed
gh-139156: Optimize _PyUnicode_EncodeCharmap()
Specialize _PyUnicode_EncodeCharmap() for EncodingMapType, which is used by Python codecs such as iso8859_15.
1 parent c7b11b7 commit a0dcf05

File tree

1 file changed

+60
-14
lines changed

1 file changed

+60
-14
lines changed

Objects/unicodeobject.c

Lines changed: 60 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6440,6 +6440,8 @@ _PyUnicode_EncodeUTF16(PyObject *str,
64406440
#endif
64416441

64426442
if (kind == PyUnicode_1BYTE_KIND) {
6443+
// gh-139156: Don't use PyBytesWriter API here since it has an overhead
6444+
// on short strings
64436445
PyObject *v = PyBytes_FromStringAndSize(NULL, nsize * 2);
64446446
if (v == NULL) {
64456447
return NULL;
@@ -8857,11 +8859,15 @@ charmapencode_output(Py_UCS4 c, PyObject *mapping,
88578859
if (Py_IS_TYPE(mapping, &EncodingMapType)) {
88588860
int res = encoding_map_lookup(c, mapping);
88598861
Py_ssize_t requiredsize = *outpos+1;
8860-
if (res == -1)
8862+
if (res == -1) {
88618863
return enc_FAILED;
8862-
if (outsize<requiredsize)
8863-
if (charmapencode_resize(writer, outpos, requiredsize))
8864+
}
8865+
8866+
if (outsize<requiredsize) {
8867+
if (charmapencode_resize(writer, outpos, requiredsize)) {
88648868
return enc_EXCEPTION;
8869+
}
8870+
}
88658871
outstart = _PyBytesWriter_GetData(writer);
88668872
outstart[(*outpos)++] = (char)res;
88678873
return enc_SUCCESS;
@@ -8902,7 +8908,7 @@ charmapencode_output(Py_UCS4 c, PyObject *mapping,
89028908
return enc_SUCCESS;
89038909
}
89048910

8905-
/* handle an error in PyUnicode_EncodeCharmap
8911+
/* handle an error in _PyUnicode_EncodeCharmap()
89068912
Return 0 on success, -1 on error */
89078913
static int
89088914
charmap_encoding_error(
@@ -9080,23 +9086,63 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,
90809086
Py_ssize_t respos = 0;
90819087
_Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
90829088

9083-
while (inpos<size) {
9084-
Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
9085-
/* try to encode it */
9086-
charmapencode_result x = charmapencode_output(ch, mapping, writer, &respos);
9087-
if (x==enc_EXCEPTION) /* error */
9088-
goto onError;
9089-
if (x==enc_FAILED) { /* unencodable character */
9089+
if (Py_IS_TYPE(mapping, &EncodingMapType)) {
9090+
char *outstart = _PyBytesWriter_GetData(writer);
9091+
Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);
9092+
9093+
while (inpos<size) {
9094+
Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
9095+
9096+
/* try to encode it */
9097+
int res = encoding_map_lookup(ch, mapping);
9098+
Py_ssize_t requiredsize = respos+1;
9099+
if (res == -1) {
9100+
goto enc_FAILED;
9101+
}
9102+
9103+
if (outsize<requiredsize) {
9104+
if (charmapencode_resize(writer, &respos, requiredsize)) {
9105+
goto onError;
9106+
}
9107+
outstart = _PyBytesWriter_GetData(writer);
9108+
outsize = _PyBytesWriter_GetSize(writer);
9109+
}
9110+
outstart[respos++] = (char)res;
9111+
9112+
/* done with this character => adjust input position */
9113+
++inpos;
9114+
continue;
9115+
9116+
enc_FAILED:
90909117
if (charmap_encoding_error(unicode, &inpos, mapping,
90919118
&exc,
90929119
&error_handler, &error_handler_obj, errors,
90939120
writer, &respos)) {
90949121
goto onError;
90959122
}
9123+
outstart = _PyBytesWriter_GetData(writer);
9124+
outsize = _PyBytesWriter_GetSize(writer);
90969125
}
9097-
else {
9098-
/* done with this character => adjust input position */
9099-
++inpos;
9126+
}
9127+
else {
9128+
while (inpos<size) {
9129+
Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
9130+
/* try to encode it */
9131+
charmapencode_result x = charmapencode_output(ch, mapping, writer, &respos);
9132+
if (x==enc_EXCEPTION) /* error */
9133+
goto onError;
9134+
if (x==enc_FAILED) { /* unencodable character */
9135+
if (charmap_encoding_error(unicode, &inpos, mapping,
9136+
&exc,
9137+
&error_handler, &error_handler_obj, errors,
9138+
writer, &respos)) {
9139+
goto onError;
9140+
}
9141+
}
9142+
else {
9143+
/* done with this character => adjust input position */
9144+
++inpos;
9145+
}
91009146
}
91019147
}
91029148

0 commit comments

Comments
 (0)