pythongh-129173: simplify PyCodec_XMLCharRefReplaceErrors logic (python#129894)

picnixz · encukou · web-flow · commit f693f8422700 · 2025-03-03T11:43:22.000Z
Writing the decimal representation of a Unicode codepoint only requires knowing the number of digits.

---------

Co-authored-by: Petr Viktorin <encukou@gmail.com>
diff --git a/Python/codecs.c b/Python/codecs.c
@@ -730,6 +730,25 @@ codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch)
 }
 
 
+/*
+ * Return how many decimal digits (1 to 7) are needed to write the Unicode
+ * codepoint 'ch'; values of 10000000 or more hit Py_UNREACHABLE().
+ */
+static inline int
+n_decimal_digits_for_codepoint(Py_UCS4 ch)
+{
+    if (ch < 10) return 1;
+    if (ch < 100) return 2;
+    if (ch < 1000) return 3;
+    if (ch < 10000) return 4;
+    if (ch < 100000) return 5;
+    if (ch < 1000000) return 6;
+    if (ch < 10000000) return 7;
+    // Unicode codepoints are limited to 1114111 (7 decimal digits)
+    Py_UNREACHABLE();
+}
+
+
 /*
  * Create a Unicode string containing 'count' copies of the official
  * Unicode REPLACEMENT CHARACTER (0xFFFD).
@@ -867,9 +886,12 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)
     }
 }
 
+
+// --- handler: 'xmlcharrefreplace' -------------------------------------------
+
 PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
 {
-    if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
+    if (!_PyIsUnicodeEncodeError(exc)) {
         wrong_exception_type(exc);
         return NULL;
     }
@@ -896,30 +918,11 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
 
     Py_ssize_t ressize = 0;
     for (Py_ssize_t i = start; i < end; ++i) {
-        /* object is guaranteed to be "ready" */
         Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
-        if (ch < 10) {
-            ressize += 2 + 1 + 1;
-        }
-        else if (ch < 100) {
-            ressize += 2 + 2 + 1;
-        }
-        else if (ch < 1000) {
-            ressize += 2 + 3 + 1;
-        }
-        else if (ch < 10000) {
-            ressize += 2 + 4 + 1;
-        }
-        else if (ch < 100000) {
-            ressize += 2 + 5 + 1;
-        }
-        else if (ch < 1000000) {
-            ressize += 2 + 6 + 1;
-        }
-        else {
-            assert(ch < 10000000);
-            ressize += 2 + 7 + 1;
-        }
+        int k = n_decimal_digits_for_codepoint(ch);
+        assert(k != 0);
+        assert(k <= 7);
+        ressize += 2 + k + 1;
     }
 
     /* allocate replacement */
@@ -931,45 +934,20 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
     Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
     /* generate replacement */
     for (Py_ssize_t i = start; i < end; ++i) {
-        int digits, base;
         Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
-        if (ch < 10) {
-            digits = 1;
-            base = 1;
-        }
-        else if (ch < 100) {
-            digits = 2;
-            base = 10;
-        }
-        else if (ch < 1000) {
-            digits = 3;
-            base = 100;
-        }
-        else if (ch < 10000) {
-            digits = 4;
-            base = 1000;
-        }
-        else if (ch < 100000) {
-            digits = 5;
-            base = 10000;
-        }
-        else if (ch < 1000000) {
-            digits = 6;
-            base = 100000;
-        }
-        else {
-            assert(ch < 10000000);
-            digits = 7;
-            base = 1000000;
-        }
+        /*
+         * Write the decimal representation of 'ch' at 'outp' using at most
+         * 7 digits, prefixed by '&#' and suffixed by ';'.
+         */
         *outp++ = '&';
         *outp++ = '#';
-        while (digits-- > 0) {
-            assert(base >= 1);
-            *outp++ = '0' + ch / base;
-            ch %= base;
-            base /= 10;
+        Py_UCS1 *digit_end = outp + n_decimal_digits_for_codepoint(ch);
+        for (Py_UCS1 *p_digit = digit_end - 1; p_digit >= outp; --p_digit) {
+            *p_digit = '0' + (ch % 10);
+            ch /= 10;
         }
+        assert(ch == 0);
+        outp = digit_end;
         *outp++ = ';';
     }
     assert(_PyUnicode_CheckConsistency(res, 1));
@@ -1517,7 +1495,8 @@ replace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
 }
 
 
-static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
+static inline PyObject *
+xmlcharrefreplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
 {
     return PyCodec_XMLCharRefReplaceErrors(exc);
 }