Skip to content

Commit 7dfec2e

Browse files
committed
Merge remote-tracking branch 'upstream/main' into feat/codecs/xmlcharrefreplace-handler-129173
2 parents 329c039 + fa6a814 commit 7dfec2e

File tree

1 file changed

+82
-40
lines changed

1 file changed

+82
-40
lines changed

Python/codecs.c

Lines changed: 82 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -780,6 +780,26 @@ codec_handler_write_unicode_dec(Py_UCS1 **p, Py_UCS4 ch)
780780
*(*p)++ = ';';
781781
}
782782

783+
/*
784+
* Create a Unicode string containing 'count' copies of the official
785+
* Unicode REPLACEMENT CHARACTER (0xFFFD).
786+
*/
787+
/* Returns the string created by PyUnicode_New(), or NULL if that call fails. */
static PyObject *
788+
codec_handler_unicode_replacement_character(Py_ssize_t count)
789+
{
790+
PyObject *res = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER);
791+
if (res == NULL) {
792+
return NULL;
793+
}
794+
/* A non-empty result is expected to use the 2-byte kind; count == 0 is exempted. */
assert(count == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
795+
Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res);
796+
/* Write U+FFFD into each of the 'count' slots; the body never runs when count is 0. */
for (Py_ssize_t i = 0; i < count; ++i) {
797+
outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
798+
}
799+
assert(_PyUnicode_CheckConsistency(res, 1));
800+
return res;
801+
}
802+
783803

784804
// --- handler: 'strict' ------------------------------------------------------
785805

@@ -825,50 +845,71 @@ PyObject *PyCodec_IgnoreErrors(PyObject *exc)
825845
}
826846

827847

828-
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
848+
// --- handler: 'replace' -----------------------------------------------------
849+
850+
/*
 * 'replace' handling for a UnicodeEncodeError.
 *
 * Builds a string of 'slen' question marks and returns it together with
 * 'end' as the value produced by Py_BuildValue("(Nn)", ...).
 * Returns NULL if the exception parameters cannot be fetched or if the
 * string cannot be allocated.
 */
static PyObject *
851+
_PyCodec_ReplaceUnicodeEncodeError(PyObject *exc)
829852
{
830853
/* NOTE(review): 'start' is filled in below but not read afterwards. */
Py_ssize_t start, end, slen;
854+
if (_PyUnicodeError_GetParams(exc, NULL, NULL,
855+
&start, &end, &slen, false) < 0)
856+
{
857+
return NULL;
858+
}
859+
/* presumably slen is the length of the offending range -- TODO confirm
 * against _PyUnicodeError_GetParams(). */
PyObject *res = PyUnicode_New(slen, '?');
860+
if (res == NULL) {
861+
return NULL;
862+
}
863+
assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND);
864+
Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
865+
/* Fill the whole 1-byte buffer with '?'. */
memset(outp, '?', sizeof(Py_UCS1) * slen);
866+
assert(_PyUnicode_CheckConsistency(res, 1));
867+
/* Per the Py_BuildValue() documentation, "N" stores 'res' without adding
 * a reference, so no separate release of 'res' happens here. */
return Py_BuildValue("(Nn)", res, end);
868+
}
831869

832-
if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
833-
if (_PyUnicodeError_GetParams(exc, NULL, NULL,
834-
&start, &end, &slen, false) < 0) {
835-
return NULL;
836-
}
837-
PyObject *res = PyUnicode_New(slen, '?');
838-
if (res == NULL) {
839-
return NULL;
840-
}
841-
assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND);
842-
Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
843-
memset(outp, '?', sizeof(Py_UCS1) * slen);
844-
assert(_PyUnicode_CheckConsistency(res, 1));
845-
return Py_BuildValue("(Nn)", res, end);
870+
871+
static PyObject *
872+
_PyCodec_ReplaceUnicodeDecodeError(PyObject *exc)
873+
{
874+
Py_ssize_t end;
875+
if (PyUnicodeDecodeError_GetEnd(exc, &end) < 0) {
876+
return NULL;
846877
}
847-
else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
848-
if (_PyUnicodeError_GetParams(exc, NULL, NULL,
849-
NULL, &end, NULL, true) < 0) {
850-
return NULL;
851-
}
852-
return Py_BuildValue("(Cn)",
853-
(int)Py_UNICODE_REPLACEMENT_CHARACTER,
854-
end);
878+
PyObject *res = codec_handler_unicode_replacement_character(1);
879+
if (res == NULL) {
880+
return NULL;
855881
}
856-
else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
857-
if (_PyUnicodeError_GetParams(exc, NULL, NULL,
858-
&start, &end, &slen, false) < 0) {
859-
return NULL;
860-
}
861-
PyObject *res = PyUnicode_New(slen, Py_UNICODE_REPLACEMENT_CHARACTER);
862-
if (res == NULL) {
863-
return NULL;
864-
}
865-
assert(slen == 0 || PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
866-
Py_UCS2 *outp = PyUnicode_2BYTE_DATA(res);
867-
for (Py_ssize_t i = 0; i < slen; ++i) {
868-
outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
869-
}
870-
assert(_PyUnicode_CheckConsistency(res, 1));
871-
return Py_BuildValue("(Nn)", res, end);
882+
return Py_BuildValue("(Nn)", res, end);
883+
}
884+
885+
886+
/*
 * 'replace' handling for a UnicodeTranslateError.
 *
 * Asks codec_handler_unicode_replacement_character() for a string of
 * 'slen' characters and returns it together with 'end' as the value
 * produced by Py_BuildValue("(Nn)", ...).
 * Returns NULL if the exception parameters cannot be fetched or if the
 * replacement string cannot be created.
 */
static PyObject *
887+
_PyCodec_ReplaceUnicodeTranslateError(PyObject *exc)
888+
{
889+
/* NOTE(review): 'start' is filled in below but not read afterwards. */
Py_ssize_t start, end, slen;
890+
if (_PyUnicodeError_GetParams(exc, NULL, NULL,
891+
&start, &end, &slen, false) < 0)
892+
{
893+
return NULL;
894+
}
895+
PyObject *res = codec_handler_unicode_replacement_character(slen);
896+
if (res == NULL) {
897+
return NULL;
898+
}
899+
/* Per the Py_BuildValue() documentation, "N" stores 'res' without adding
 * a reference, so no separate release of 'res' happens here. */
return Py_BuildValue("(Nn)", res, end);
900+
}
901+
902+
903+
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
904+
{
905+
if (_PyIsUnicodeEncodeError(exc)) {
906+
return _PyCodec_ReplaceUnicodeEncodeError(exc);
907+
}
908+
else if (_PyIsUnicodeDecodeError(exc)) {
909+
return _PyCodec_ReplaceUnicodeDecodeError(exc);
910+
}
911+
else if (_PyIsUnicodeTranslateError(exc)) {
912+
return _PyCodec_ReplaceUnicodeTranslateError(exc);
872913
}
873914
else {
874915
wrong_exception_type(exc);
@@ -1467,7 +1508,8 @@ ignore_errors(PyObject *Py_UNUSED(self), PyObject *exc)
14671508
}
14681509

14691510

1470-
static PyObject *replace_errors(PyObject *self, PyObject *exc)
1511+
/*
 * Thin wrapper around PyCodec_ReplaceErrors(): the first argument is
 * marked unused and 'exc' is forwarded unchanged; the callee's result
 * is returned as-is.
 */
static inline PyObject *
1512+
replace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
14711513
{
14721514
return PyCodec_ReplaceErrors(exc);
14731515
}

0 commit comments

Comments
 (0)