C

StanFromIreland · StanFromIreland · commit 92873d68937c · 2025-07-14T09:34:06.000+01:00
diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py
@@ -26,7 +26,7 @@
 
 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
 
-"""#"
+"""
 
 import codecs
 import sys
@@ -37,10 +37,23 @@
 _import_tail = ['*']
 _aliases = aliases.aliases
 
+
+_norm_encoding_map = (
+    #0123456789ABCDEF0123456789ABCDEF
+    '                                '
+    '              . 0123456789      '
+    ' ABCDEFGHIJKLMNOPQRSTUVWXYZ     '
+    ' abcdefghijklmnopqrstuvwxyz     '
+    '                                '
+    '                                '
+    '                                '
+    '                                ')
+
+
 class CodecRegistryError(LookupError, SystemError):
     pass
 
-def normalize_encoding(encoding):
+def normalize_encoding(encoding, /):
 
     """ Normalize an encoding name.
 
@@ -55,18 +68,10 @@ def normalize_encoding(encoding):
     if isinstance(encoding, bytes):
         encoding = str(encoding, "ascii")
 
-    chars = []
-    punct = False
-    for c in encoding:
-        if c.isalnum() or c == '.':
-            if punct and chars:
-                chars.append('_')
-            if c.isascii():
-                chars.append(c)
-            punct = False
-        else:
-            punct = True
-    return ''.join(chars)
+    s = encoding.translate(_norm_encoding_map)
+    return '_'.join(s.split())
+
+from _codecs import _normalize_encoding as normalize_encoding
 
 def search_function(encoding):
 
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
@@ -3895,11 +3895,13 @@ def search_function(encoding):
         self.assertEqual(NOT_FOUND, codecs.lookup('a\xe9\u20ac-8'))
 
     def test_encodings_normalize_encoding(self):
-        # encodings.normalize_encoding() ignores non-ASCII characters.
         normalize = encodings.normalize_encoding
         self.assertEqual(normalize('utf_8'), 'utf_8')
-        self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8')
         self.assertEqual(normalize('utf   8'), 'utf_8')
+
+        # encodings.normalize_encoding() does not accept non-ASCII characters.
+        self.assertRaises(UnicodeEncodeError, normalize, 'utf\xE9\u20AC\U0010ffff-8')
+
         # encodings.normalize_encoding() doesn't convert
         # characters to lower case.
         self.assertEqual(normalize('UTF 8'), 'UTF_8')
diff --git a/Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst b/Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst
@@ -0,0 +1,4 @@
+:mod:`encodings`: Improve :func:`~encodings.normalize_encoding` performance
+by implementing the function in C using the private
+``_Py_normalize_encoding`` which has been modified to make lowercase
+conversion optional.
diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c
@@ -1022,6 +1022,44 @@ _codecs_lookup_error_impl(PyObject *module, const char *name)
     return PyCodec_LookupError(name);
 }
 
+extern int _Py_normalize_encoding(const char *, char *, size_t, int);
+
+/*[clinic input]
+_codecs._normalize_encoding
+    encoding: str(encoding='ascii')
+    /
+
+Normalize an encoding name without converting it to lower case (to_lower == 0).
+Used for encodings.normalize_encoding.
+[clinic start generated code]*/
+
+static PyObject *
+_codecs__normalize_encoding_impl(PyObject *module, char *encoding)
+/*[clinic end generated code: output=d5e3a4b5266fbe96 input=ca002bbc262228f1]*/
+{
+    size_t len = strlen(encoding);
+    if (len > PY_SSIZE_T_MAX) {
+        PyErr_SetString(PyExc_OverflowError, "encoding is too large");
+        return NULL;
+    }
+
+    char *normalized = PyMem_Malloc(len + 1);
+    if (normalized == NULL) {
+        return PyErr_NoMemory();
+    }
+
+    if (!_Py_normalize_encoding(encoding, normalized, len + 1, 0)) {
+        PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
+        PyMem_Free(normalized);
+        return NULL;
+    }
+
+    PyObject *v = PyUnicode_FromString(normalized);
+    PyMem_Free(normalized);
+    return v;
+}
+
+
 /* --- Module API --------------------------------------------------------- */
 
 static PyMethodDef _codecs_functions[] = {
@@ -1071,6 +1109,7 @@ static PyMethodDef _codecs_functions[] = {
     _CODECS_REGISTER_ERROR_METHODDEF
     _CODECS__UNREGISTER_ERROR_METHODDEF
     _CODECS_LOOKUP_ERROR_METHODDEF
+    _CODECS__NORMALIZE_ENCODING_METHODDEF
     {NULL, NULL}                /* sentinel */
 };
 
diff --git a/Modules/clinic/_codecsmodule.c.h b/Modules/clinic/_codecsmodule.c.h
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -3587,13 +3587,14 @@ PyUnicode_FromEncodedObject(PyObject *obj,
     return v;
 }
 
-/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
-   also convert to lowercase. Return 1 on success, or 0 on error (encoding is
-   longer than lower_len-1). */
+/* Normalize an encoding name like encodings.normalize_encoding().
+   Optionally convert to lowercase by setting *to_lower* to 1.
+   Return 1 on success, or 0 on error (encoding is longer than lower_len-1). */
 int
 _Py_normalize_encoding(const char *encoding,
                        char *lower,
-                       size_t lower_len)
+                       size_t lower_len,
+                       int to_lower)
 {
     const char *e;
     char *l;
@@ -3624,7 +3625,7 @@ _Py_normalize_encoding(const char *encoding,
             if (l == l_end) {
                 return 0;
             }
-            *l++ = Py_TOLOWER(c);
+            *l++ = to_lower ? Py_TOLOWER(c) : c;
         }
         else {
             punct = 1;
@@ -3659,7 +3660,7 @@ PyUnicode_Decode(const char *s,
     }
 
     /* Shortcuts for common default encodings */
-    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
+    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
         char *lower = buflower;
 
         /* Fast paths */
@@ -3916,7 +3917,7 @@ PyUnicode_AsEncodedString(PyObject *unicode,
     }
 
     /* Shortcuts for common default encodings */
-    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
+    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
         char *lower = buflower;
 
         /* Fast paths */
diff --git a/Python/codecs.c b/Python/codecs.c
@@ -90,7 +90,7 @@ PyCodec_Unregister(PyObject *search_function)
     return 0;
 }
 
-extern int _Py_normalize_encoding(const char *, char *, size_t);
+extern int _Py_normalize_encoding(const char *, char *, size_t, int);
 
 /* Convert a string to a normalized Python string(decoded from UTF-8): all characters are
    converted to lower case, spaces and hyphens are replaced with underscores. */
@@ -108,10 +108,11 @@ PyObject *normalizestring(const char *string)
     }
 
     encoding = PyMem_Malloc(len + 1);
-    if (encoding == NULL)
+    if (encoding == NULL) {
         return PyErr_NoMemory();
+    }
 
-    if (!_Py_normalize_encoding(string, encoding, len + 1))
+    if (!_Py_normalize_encoding(string, encoding, len + 1, 1))
     {
         PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
         PyMem_Free(encoding);
diff --git a/Python/fileutils.c b/Python/fileutils.c
@@ -180,7 +180,7 @@ _Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs)
 
 #define USE_FORCE_ASCII
 
-extern int _Py_normalize_encoding(const char *, char *, size_t);
+extern int _Py_normalize_encoding(const char *, char *, size_t, int);
 
 /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale
    and POSIX locale. nl_langinfo(CODESET) announces an alias of the
@@ -231,7 +231,7 @@ check_force_ascii(void)
     }
 
     char encoding[20];   /* longest name: "iso_646.irv_1991\0" */
-    if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) {
+    if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding), 1)) {
         goto error;
     }
 

Original file line number	Diff line number	Diff line change
`@@ -90,7 +90,7 @@ PyCodec_Unregister(PyObject *search_function)`
`90`	`90`	`return 0;`
`91`	`91`	`}`
`92`	`92`
`93`		`-extern int _Py_normalize_encoding(const char *, char *, size_t);`
	`93`	`+extern int _Py_normalize_encoding(const char *, char *, size_t, int);`
`94`	`94`
`95`	`95`	`/* Convert a string to a normalized Python string(decoded from UTF-8): all characters are`
`96`	`96`	`converted to lower case, spaces and hyphens are replaced with underscores. */`
`@@ -108,10 +108,11 @@ PyObject *normalizestring(const char *string)`
`108`	`108`	`}`
`109`	`109`
`110`	`110`	`encoding = PyMem_Malloc(len + 1);`
`111`		`- if (encoding == NULL)`
	`111`	`+ if (encoding == NULL) {`
`112`	`112`	`return PyErr_NoMemory();`
	`113`	`+ }`
`113`	`114`
`114`		`- if (!_Py_normalize_encoding(string, encoding, len + 1))`
	`115`	`+ if (!_Py_normalize_encoding(string, encoding, len + 1, 1))`
`115`	`116`	`{`
`116`	`117`	`PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");`
`117`	`118`	`PyMem_Free(encoding);`
Original file line number	Diff line number	Diff line change
`@@ -180,7 +180,7 @@ _Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs)`
`180`	`180`
`181`	`181`	`#define USE_FORCE_ASCII`
`182`	`182`
`183`		`-extern int _Py_normalize_encoding(const char *, char *, size_t);`
	`183`	`+extern int _Py_normalize_encoding(const char *, char *, size_t, int);`
`184`	`184`
`185`	`185`	`/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale`
`186`	`186`	`and POSIX locale. nl_langinfo(CODESET) announces an alias of the`
`@@ -231,7 +231,7 @@ check_force_ascii(void)`
`231`	`231`	`}`
`232`	`232`
`233`	`233`	`char encoding[20]; /* longest name: "iso_646.irv_1991\0" */`
`234`		`- if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) {`
	`234`	`+ if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding), 1)) {`
`235`	`235`	`goto error;`
`236`	`236`	`}`
`237`	`237`