Skip to content

Commit 9d99097

Browse files
committed
extract some logic
1 parent a8880d1 commit 9d99097

File tree

1 file changed

+70
-71
lines changed

1 file changed

+70
-71
lines changed

Python/codecs.c

Lines changed: 70 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -676,6 +676,60 @@ wrong_exception_type(PyObject *exc)
676676
PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeTranslateError)
677677

678678

679+
// --- codecs handlers: utilities ---------------------------------------------
680+
681+
/*
682+
* Return the number of characters (including special prefixes)
683+
* needed to represent 'ch' by _codec_handler_write_unicode_hex().
684+
*/
685+
static inline Py_ssize_t
686+
_codec_handler_unicode_hex_width(Py_UCS4 ch)
687+
{
688+
if (ch >= 0x10000) {
689+
// format: '\\' + 'U' + 8 hex digits
690+
return 1 + 1 + 8;
691+
}
692+
else if (ch >= 0x100) {
693+
// format: '\\' + 'u' + 4 hex digits
694+
return 1 + 1 + 4;
695+
}
696+
else {
697+
// format: '\\' + 'x' + 2 hex digits
698+
return 1 + 1 + 2;
699+
}
700+
}
701+
702+
703+
/*
704+
* Write the hexadecimal representation of 'ch' to the buffer pointed by 'p'
705+
* using 2, 4, or 8 characters prefixed by '\x', '\u', or '\U' respectively.
706+
*/
707+
static inline void
708+
_codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch)
709+
{
710+
*(*p)++ = '\\';
711+
if (ch >= 0x10000) {
712+
*(*p)++ = 'U';
713+
*(*p)++ = Py_hexdigits[(ch >> 28) & 0xf];
714+
*(*p)++ = Py_hexdigits[(ch >> 24) & 0xf];
715+
*(*p)++ = Py_hexdigits[(ch >> 20) & 0xf];
716+
*(*p)++ = Py_hexdigits[(ch >> 16) & 0xf];
717+
*(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
718+
*(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
719+
}
720+
else if (ch >= 0x100) {
721+
*(*p)++ = 'u';
722+
*(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
723+
*(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
724+
}
725+
else {
726+
*(*p)++ = 'x';
727+
}
728+
*(*p)++ = Py_hexdigits[(ch >> 4) & 0xf];
729+
*(*p)++ = Py_hexdigits[ch & 0xf];
730+
}
731+
732+
679733
// --- handler: 'strict' ------------------------------------------------------
680734

681735
PyObject *PyCodec_StrictErrors(PyObject *exc)
@@ -942,17 +996,8 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
942996

943997
Py_ssize_t ressize = 0;
944998
for (Py_ssize_t i = start; i < end; ++i) {
945-
/* object is guaranteed to be "ready" */
946999
Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
947-
if (c >= 0x10000) {
948-
ressize += 1 + 1 + 8;
949-
}
950-
else if (c >= 0x100) {
951-
ressize += 1 + 1 + 4;
952-
}
953-
else {
954-
ressize += 1 + 1 + 2;
955-
}
1000+
ressize += _codec_handler_unicode_hex_width(c);
9561001
}
9571002
PyObject *res = PyUnicode_New(ressize, 127);
9581003
if (res == NULL) {
@@ -962,26 +1007,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
9621007
Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
9631008
for (Py_ssize_t i = start; i < end; ++i) {
9641009
Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
965-
*outp++ = '\\';
966-
if (c >= 0x00010000) {
967-
*outp++ = 'U';
968-
*outp++ = Py_hexdigits[(c >> 28) & 0xf];
969-
*outp++ = Py_hexdigits[(c >> 24) & 0xf];
970-
*outp++ = Py_hexdigits[(c >> 20) & 0xf];
971-
*outp++ = Py_hexdigits[(c >> 16) & 0xf];
972-
*outp++ = Py_hexdigits[(c >> 12) & 0xf];
973-
*outp++ = Py_hexdigits[(c >> 8) & 0xf];
974-
}
975-
else if (c >= 0x100) {
976-
*outp++ = 'u';
977-
*outp++ = Py_hexdigits[(c >> 12) & 0xf];
978-
*outp++ = Py_hexdigits[(c >> 8) & 0xf];
979-
}
980-
else {
981-
*outp++ = 'x';
982-
}
983-
*outp++ = Py_hexdigits[(c >> 4) & 0xf];
984-
*outp++ = Py_hexdigits[c & 0xf];
1010+
_codec_handler_write_unicode_hex(&outp, c);
9851011
}
9861012
assert(_PyUnicode_CheckConsistency(res, 1));
9871013
Py_DECREF(obj);
@@ -1012,82 +1038,55 @@ PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
10121038
return NULL;
10131039
}
10141040

1015-
char buffer[256]; /* NAME_MAXLEN */
1016-
Py_ssize_t i = start, ressize = 0, replsize;
1017-
for (; i < end; ++i) {
1018-
// If 'c' is recognized by getname(), the corresponding replacement
1019-
// is '\\' + 'U' + '{' + NAME + '}', namely 1 + 1 + 1 + len(NAME) + 1
1020-
// characters. Otherwise, the replacement is obtained similarly as
1021-
// in PyCodec_BackslashReplaceErrors().
1022-
Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
1041+
char buffer[256]; /* NAME_MAXLEN in unicodename_db.h */
1042+
Py_ssize_t imax = start, ressize = 0, replsize;
1043+
for (; imax < end; ++imax) {
1044+
Py_UCS4 c = PyUnicode_READ_CHAR(obj, imax);
10231045
if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
1024-
// failures of 'getname()' are ignored by the handler
1025-
replsize = 1 + 1 + 1 + (int)strlen(buffer) + 1;
1026-
}
1027-
else if (c >= 0x10000) {
1028-
replsize = 1 + 1 + 8;
1029-
}
1030-
else if (c >= 0x100) {
1031-
replsize = 1 + 1 + 4;
1046+
// If 'c' is recognized by getname(), the corresponding replacement
1047+
// is '\\' + 'N' + '{' + NAME + '}', i.e. 1 + 1 + 1 + len(NAME) + 1
1048+
// characters. Failures of 'getname()' are ignored by the handler.
1049+
replsize = 1 + 1 + 1 + strlen(buffer) + 1;
10321050
}
10331051
else {
1034-
replsize = 1 + 1 + 2;
1052+
replsize = _codec_handler_unicode_hex_width(c);
10351053
}
10361054
if (ressize > PY_SSIZE_T_MAX - replsize) {
10371055
break;
10381056
}
10391057
ressize += replsize;
10401058
}
10411059

1042-
end = i;
10431060
PyObject *res = PyUnicode_New(ressize, 127);
10441061
if (res == NULL) {
10451062
Py_DECREF(obj);
10461063
return NULL;
10471064
}
10481065

10491066
Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
1050-
for (Py_ssize_t i = start; i < end; ++i) {
1067+
for (Py_ssize_t i = start; i < imax; ++i) {
10511068
Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
1052-
*outp++ = '\\';
10531069
if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
1054-
// failures of 'getname()' are ignored by the handler
1070+
*outp++ = '\\';
10551071
*outp++ = 'N';
10561072
*outp++ = '{';
10571073
(void)strcpy((char *)outp, buffer);
10581074
outp += strlen(buffer);
10591075
*outp++ = '}';
1060-
continue;
1061-
}
1062-
1063-
if (c >= 0x00010000) {
1064-
*outp++ = 'U';
1065-
*outp++ = Py_hexdigits[(c >> 28) & 0xf];
1066-
*outp++ = Py_hexdigits[(c >> 24) & 0xf];
1067-
*outp++ = Py_hexdigits[(c >> 20) & 0xf];
1068-
*outp++ = Py_hexdigits[(c >> 16) & 0xf];
1069-
*outp++ = Py_hexdigits[(c >> 12) & 0xf];
1070-
*outp++ = Py_hexdigits[(c >> 8) & 0xf];
1071-
}
1072-
else if (c >= 0x100) {
1073-
*outp++ = 'u';
1074-
*outp++ = Py_hexdigits[(c >> 12) & 0xf];
1075-
*outp++ = Py_hexdigits[(c >> 8) & 0xf];
10761076
}
10771077
else {
1078-
*outp++ = 'x';
1078+
_codec_handler_write_unicode_hex(&outp, c);
10791079
}
1080-
*outp++ = Py_hexdigits[(c >> 4) & 0xf];
1081-
*outp++ = Py_hexdigits[c & 0xf];
10821080
}
10831081

10841082
assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
10851083
assert(_PyUnicode_CheckConsistency(res, 1));
1086-
PyObject *restuple = Py_BuildValue("(Nn)", res, end);
1084+
PyObject *restuple = Py_BuildValue("(Nn)", res, imax);
10871085
Py_DECREF(obj);
10881086
return restuple;
10891087
}
10901088

1089+
10911090
#define ENC_UNKNOWN -1
10921091
#define ENC_UTF8 0
10931092
#define ENC_UTF16BE 1

0 commit comments

Comments
 (0)