@@ -730,6 +730,25 @@ codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch)
730730}
731731
732732
733+ /*
734+ * Determine the number of digits for a decimal representation of Unicode
735+ * codepoint 'ch' (by design, Unicode codepoints are limited to 7 digits).
736+ */
737+ static inline int
738+ n_decimal_digits_for_codepoint (Py_UCS4 ch )
739+ {
740+ if (ch < 10 ) return 1 ;
741+ if (ch < 100 ) return 2 ;
742+ if (ch < 1000 ) return 3 ;
743+ if (ch < 10000 ) return 4 ;
744+ if (ch < 100000 ) return 5 ;
745+ if (ch < 1000000 ) return 6 ;
746+ if (ch < 10000000 ) return 7 ;
747+ // Unicode codepoints are limited to 1114111 (7 decimal digits)
748+ Py_UNREACHABLE ();
749+ }
750+
751+
733752/*
734753 * Create a Unicode string containing 'count' copies of the official
735754 * Unicode REPLACEMENT CHARACTER (0xFFFD).
@@ -867,9 +886,12 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)
867886 }
868887}
869888
889+
890+ // --- handler: 'xmlcharrefreplace' -------------------------------------------
891+
870892PyObject * PyCodec_XMLCharRefReplaceErrors (PyObject * exc )
871893{
872- if (!PyObject_TypeCheck (exc , ( PyTypeObject * ) PyExc_UnicodeEncodeError )) {
894+ if (!_PyIsUnicodeEncodeError (exc )) {
873895 wrong_exception_type (exc );
874896 return NULL ;
875897 }
@@ -896,30 +918,11 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
896918
897919 Py_ssize_t ressize = 0 ;
898920 for (Py_ssize_t i = start ; i < end ; ++ i ) {
899- /* object is guaranteed to be "ready" */
900921 Py_UCS4 ch = PyUnicode_READ_CHAR (obj , i );
901- if (ch < 10 ) {
902- ressize += 2 + 1 + 1 ;
903- }
904- else if (ch < 100 ) {
905- ressize += 2 + 2 + 1 ;
906- }
907- else if (ch < 1000 ) {
908- ressize += 2 + 3 + 1 ;
909- }
910- else if (ch < 10000 ) {
911- ressize += 2 + 4 + 1 ;
912- }
913- else if (ch < 100000 ) {
914- ressize += 2 + 5 + 1 ;
915- }
916- else if (ch < 1000000 ) {
917- ressize += 2 + 6 + 1 ;
918- }
919- else {
920- assert (ch < 10000000 );
921- ressize += 2 + 7 + 1 ;
922- }
922+ int k = n_decimal_digits_for_codepoint (ch );
923+ assert (k != 0 );
924+ assert (k <= 7 );
925+ ressize += 2 + k + 1 ;
923926 }
924927
925928 /* allocate replacement */
@@ -931,45 +934,20 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
931934 Py_UCS1 * outp = PyUnicode_1BYTE_DATA (res );
932935 /* generate replacement */
933936 for (Py_ssize_t i = start ; i < end ; ++ i ) {
934- int digits , base ;
935937 Py_UCS4 ch = PyUnicode_READ_CHAR (obj , i );
936- if (ch < 10 ) {
937- digits = 1 ;
938- base = 1 ;
939- }
940- else if (ch < 100 ) {
941- digits = 2 ;
942- base = 10 ;
943- }
944- else if (ch < 1000 ) {
945- digits = 3 ;
946- base = 100 ;
947- }
948- else if (ch < 10000 ) {
949- digits = 4 ;
950- base = 1000 ;
951- }
952- else if (ch < 100000 ) {
953- digits = 5 ;
954- base = 10000 ;
955- }
956- else if (ch < 1000000 ) {
957- digits = 6 ;
958- base = 100000 ;
959- }
960- else {
961- assert (ch < 10000000 );
962- digits = 7 ;
963- base = 1000000 ;
964- }
938+ /*
939+ * Write the decimal representation of 'ch' to the buffer pointed to by
940+ * 'outp', using at most 7 digits prefixed by '&#' and suffixed by ';'.
941+ */
965942 * outp ++ = '&' ;
966943 * outp ++ = '#' ;
967- while (digits -- > 0 ) {
968- assert (base >= 1 );
969- * outp ++ = '0' + ch / base ;
970- ch %= base ;
971- base /= 10 ;
944+ Py_UCS1 * digit_end = outp + n_decimal_digits_for_codepoint (ch );
945+ for (Py_UCS1 * p_digit = digit_end - 1 ; p_digit >= outp ; -- p_digit ) {
946+ * p_digit = '0' + (ch % 10 );
947+ ch /= 10 ;
972948 }
949+ assert (ch == 0 );
950+ outp = digit_end ;
973951 * outp ++ = ';' ;
974952 }
975953 assert (_PyUnicode_CheckConsistency (res , 1 ));
@@ -1517,7 +1495,8 @@ replace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
15171495}
15181496
15191497
1520- static PyObject * xmlcharrefreplace_errors (PyObject * self , PyObject * exc )
1498+ static inline PyObject *
1499+ xmlcharrefreplace_errors (PyObject * Py_UNUSED (self ), PyObject * exc )
15211500{
15221501 return PyCodec_XMLCharRefReplaceErrors (exc );
15231502}
0 commit comments