Skip to content

Commit ea4f7ad

Browse files
committed
Fix xmlcharrefreplace codecs handler.
1 parent 2e66302 commit ea4f7ad

File tree

2 files changed

+109
-91
lines changed

2 files changed

+109
-91
lines changed

Lib/test/test_capi/test_codecs.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -843,7 +843,8 @@ def test_codec_replace_errors_handler(self):
843843

844844
def test_codec_xmlcharrefreplace_errors_handler(self):
845845
handler = _testcapi.codec_xmlcharrefreplace_errors
846-
self.do_test_codec_errors_handler(handler, self.unicode_encode_errors)
846+
self.do_test_codec_errors_handler(handler, self.unicode_encode_errors,
847+
safe=True)
847848

848849
def test_codec_backslashreplace_errors_handler(self):
849850
handler = _testcapi.codec_backslashreplace_errors
@@ -853,12 +854,12 @@ def test_codec_namereplace_errors_handler(self):
853854
handler = _testlimitedcapi.codec_namereplace_errors
854855
self.do_test_codec_errors_handler(handler, self.unicode_encode_errors)
855856

856-
def do_test_codec_errors_handler(self, handler, exceptions):
857+
def do_test_codec_errors_handler(self, handler, exceptions, *, safe=False):
857858
at_least_one = False
858859
for exc in exceptions:
859860
# See https://github.com/python/cpython/issues/123378 and related
860861
# discussion and issues for details.
861-
if self._exception_may_crash(exc):
862+
if not safe and self._exception_may_crash(exc):
862863
continue
863864

864865
at_least_one = True

Python/codecs.c

Lines changed: 105 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -755,100 +755,117 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)
755755

756756
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
757757
{
758-
if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
759-
PyObject *restuple;
760-
PyObject *object;
761-
Py_ssize_t i;
762-
Py_ssize_t start;
763-
Py_ssize_t end;
764-
PyObject *res;
765-
Py_UCS1 *outp;
766-
Py_ssize_t ressize;
767-
Py_UCS4 ch;
768-
if (PyUnicodeEncodeError_GetStart(exc, &start))
769-
return NULL;
770-
if (PyUnicodeEncodeError_GetEnd(exc, &end))
771-
return NULL;
772-
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
773-
return NULL;
774-
if (end - start > PY_SSIZE_T_MAX / (2+7+1))
775-
end = start + PY_SSIZE_T_MAX / (2+7+1);
776-
for (i = start, ressize = 0; i < end; ++i) {
777-
/* object is guaranteed to be "ready" */
778-
ch = PyUnicode_READ_CHAR(object, i);
779-
if (ch<10)
780-
ressize += 2+1+1;
781-
else if (ch<100)
782-
ressize += 2+2+1;
783-
else if (ch<1000)
784-
ressize += 2+3+1;
785-
else if (ch<10000)
786-
ressize += 2+4+1;
787-
else if (ch<100000)
788-
ressize += 2+5+1;
789-
else if (ch<1000000)
790-
ressize += 2+6+1;
791-
else
792-
ressize += 2+7+1;
758+
if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
759+
wrong_exception_type(exc);
760+
return NULL;
761+
}
762+
763+
Py_ssize_t start, end;
764+
if (PyUnicodeEncodeError_GetStart(exc, &start)) {
765+
return NULL;
766+
}
767+
if (PyUnicodeEncodeError_GetEnd(exc, &end)) {
768+
return NULL;
769+
}
770+
if (end <= start) {
771+
// gh-123378 will handle negative end or start (for now we crash)
772+
return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end);
773+
}
774+
775+
PyObject *obj = PyUnicodeEncodeError_GetObject(exc);
776+
if (obj == NULL) {
777+
return NULL;
778+
}
779+
780+
if (end - start > PY_SSIZE_T_MAX / 10) {
781+
end = start + PY_SSIZE_T_MAX / 10;
782+
}
783+
784+
end = Py_MIN(end, PyUnicode_GET_LENGTH(obj));
785+
786+
Py_ssize_t ressize = 0;
787+
for (Py_ssize_t i = start; i < end; ++i) {
788+
/* object is guaranteed to be "ready" */
789+
Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
790+
// The number of characters that each character 'ch' contributes
791+
// in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch}.
792+
if (ch < 10) {
793+
ressize += 4;
793794
}
794-
/* allocate replacement */
795-
res = PyUnicode_New(ressize, 127);
796-
if (res == NULL) {
797-
Py_DECREF(object);
798-
return NULL;
795+
else if (ch < 100) {
796+
ressize += 5;
799797
}
800-
outp = PyUnicode_1BYTE_DATA(res);
801-
/* generate replacement */
802-
for (i = start; i < end; ++i) {
803-
int digits;
804-
int base;
805-
ch = PyUnicode_READ_CHAR(object, i);
806-
*outp++ = '&';
807-
*outp++ = '#';
808-
if (ch<10) {
809-
digits = 1;
810-
base = 1;
811-
}
812-
else if (ch<100) {
813-
digits = 2;
814-
base = 10;
815-
}
816-
else if (ch<1000) {
817-
digits = 3;
818-
base = 100;
819-
}
820-
else if (ch<10000) {
821-
digits = 4;
822-
base = 1000;
823-
}
824-
else if (ch<100000) {
825-
digits = 5;
826-
base = 10000;
827-
}
828-
else if (ch<1000000) {
829-
digits = 6;
830-
base = 100000;
831-
}
832-
else {
833-
digits = 7;
834-
base = 1000000;
835-
}
836-
while (digits-->0) {
837-
*outp++ = '0' + ch/base;
838-
ch %= base;
839-
base /= 10;
840-
}
841-
*outp++ = ';';
798+
else if (ch < 1000) {
799+
ressize += 6;
800+
}
801+
else if (ch < 10000) {
802+
ressize += 7;
803+
}
804+
else if (ch < 100000) {
805+
ressize += 8;
806+
}
807+
else if (ch < 1000000) {
808+
ressize += 9;
809+
}
810+
else {
811+
assert(ch < 10000000);
812+
ressize += 10;
842813
}
843-
assert(_PyUnicode_CheckConsistency(res, 1));
844-
restuple = Py_BuildValue("(Nn)", res, end);
845-
Py_DECREF(object);
846-
return restuple;
847814
}
848-
else {
849-
wrong_exception_type(exc);
815+
816+
/* allocate replacement */
817+
PyObject *res = PyUnicode_New(ressize, 127);
818+
if (res == NULL) {
819+
Py_DECREF(obj);
850820
return NULL;
851821
}
822+
Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
823+
/* generate replacement */
824+
for (Py_ssize_t i = start; i < end; ++i) {
825+
int digits, base;
826+
Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
827+
if (ch < 10) {
828+
digits = 1;
829+
base = 1;
830+
}
831+
else if (ch < 100) {
832+
digits = 2;
833+
base = 10;
834+
}
835+
else if (ch < 1000) {
836+
digits = 3;
837+
base = 100;
838+
}
839+
else if (ch < 10000) {
840+
digits = 4;
841+
base = 1000;
842+
}
843+
else if (ch < 100000) {
844+
digits = 5;
845+
base = 10000;
846+
}
847+
else if (ch < 1000000) {
848+
digits = 6;
849+
base = 100000;
850+
}
851+
else {
852+
assert(ch < 10000000);
853+
digits = 7;
854+
base = 1000000;
855+
}
856+
*outp++ = '&';
857+
*outp++ = '#';
858+
while (digits-- > 0) {
859+
*outp++ = '0' + ch / base;
860+
ch %= base;
861+
base /= 10;
862+
}
863+
*outp++ = ';';
864+
}
865+
assert(_PyUnicode_CheckConsistency(res, 1));
866+
PyObject *restuple = Py_BuildValue("(Nn)", res, end);
867+
Py_DECREF(obj);
868+
return restuple;
852869
}
853870

854871
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)

0 commit comments

Comments
 (0)