Skip to content

Commit af58a6f

Browse files
gh-88886: Remove excessive encoding name normalization (GH-137167)
The codecs lookup function now performs only minimal normalization of the encoding name before passing it to the search functions: all ASCII letters are converted to lower case and spaces are replaced with hyphens. Excessive normalization broke third-party codec providers, such as python-iconv. This reverts "bpo-37751: Fix codecs.lookup() normalization (GH-15092)" (commit 20f59fe).
1 parent 6b7b9d0 commit af58a6f

File tree

5 files changed

+54
-45
lines changed

5 files changed

+54
-45
lines changed

Doc/library/codecs.rst

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -68,11 +68,21 @@ The full details for each codec can also be looked up directly:
6868
Looks up the codec info in the Python codec registry and returns a
6969
:class:`CodecInfo` object as defined below.
7070

71-
Encodings are first looked up in the registry's cache. If not found, the list of
71+
This function first normalizes the *encoding*: all ASCII letters are
72+
converted to lower case, spaces are replaced with hyphens.
73+
Then the encoding is looked up in the registry's cache. If not found, the list of
7274
registered search functions is scanned. If no :class:`CodecInfo` object is
7375
found, a :exc:`LookupError` is raised. Otherwise, the :class:`CodecInfo` object
7476
is stored in the cache and returned to the caller.
7577

78+
.. versionchanged:: 3.9
79+
Any characters other than ASCII letters, digits, and dots are converted to underscores.
80+
81+
.. versionchanged:: next
82+
Characters are no longer converted to underscores.
83+
Spaces are converted to hyphens.
84+
85+
7686
.. class:: CodecInfo(encode, decode, streamreader=None, streamwriter=None, incrementalencoder=None, incrementaldecoder=None, name=None)
7787

7888
Codec details when looking up the codec registry. The constructor
@@ -167,14 +177,11 @@ function:
167177
.. function:: register(search_function, /)
168178

169179
Register a codec search function. Search functions are expected to take one
170-
argument, being the encoding name in all lower case letters with hyphens
171-
and spaces converted to underscores, and return a :class:`CodecInfo` object.
180+
argument, being the encoding name in all lower case letters with spaces
181+
converted to hyphens, and return a :class:`CodecInfo` object.
172182
In case a search function cannot find a given encoding, it should return
173183
``None``.
174184

175-
.. versionchanged:: 3.9
176-
Hyphens and spaces are converted to underscore.
177-
178185

179186
.. function:: unregister(search_function, /)
180187

@@ -1065,7 +1072,7 @@ or with dictionaries as mapping tables. The following table lists the codecs by
10651072
name, together with a few common aliases, and the languages for which the
10661073
encoding is likely used. Neither the list of aliases nor the list of languages
10671074
is meant to be exhaustive. Notice that spelling alternatives that only differ in
1068-
case or use a hyphen instead of an underscore are also valid aliases
1075+
case or use a space or a hyphen instead of an underscore are also valid aliases
10691076
because they are equivalent when normalized by
10701077
:func:`~encodings.normalize_encoding`. For example, ``'utf-8'`` is a valid
10711078
alias for the ``'utf_8'`` codec.

Lib/test/test_capi/test_codecs.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -630,7 +630,6 @@ def test_codec_known_encoding(self):
630630
for name in [
631631
encoding_name,
632632
encoding_name.upper(),
633-
encoding_name.replace('_', '-'),
634633
]:
635634
with self.subTest(name):
636635
self.assertTrue(_testcapi.codec_known_encoding(name))

Lib/test/test_codecs.py

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3873,26 +3873,22 @@ def test_rot13_func(self):
38733873
class CodecNameNormalizationTest(unittest.TestCase):
38743874
"""Test codec name normalization"""
38753875
def test_codecs_lookup(self):
3876-
FOUND = (1, 2, 3, 4)
3877-
NOT_FOUND = (None, None, None, None)
38783876
def search_function(encoding):
3879-
if encoding == "aaa_8":
3880-
return FOUND
3877+
if encoding.startswith("test."):
3878+
return (encoding, 2, 3, 4)
38813879
else:
3882-
return NOT_FOUND
3880+
return None
38833881

38843882
codecs.register(search_function)
38853883
self.addCleanup(codecs.unregister, search_function)
3886-
self.assertEqual(FOUND, codecs.lookup('aaa_8'))
3887-
self.assertEqual(FOUND, codecs.lookup('AAA-8'))
3888-
self.assertEqual(FOUND, codecs.lookup('AAA---8'))
3889-
self.assertEqual(FOUND, codecs.lookup('AAA 8'))
3890-
self.assertEqual(FOUND, codecs.lookup('aaa\xe9\u20ac-8'))
3891-
self.assertEqual(NOT_FOUND, codecs.lookup('AAA.8'))
3892-
self.assertEqual(NOT_FOUND, codecs.lookup('AAA...8'))
3893-
self.assertEqual(NOT_FOUND, codecs.lookup('BBB-8'))
3894-
self.assertEqual(NOT_FOUND, codecs.lookup('BBB.8'))
3895-
self.assertEqual(NOT_FOUND, codecs.lookup('a\xe9\u20ac-8'))
3884+
self.assertEqual(codecs.lookup('test.aaa_8'), ('test.aaa_8', 2, 3, 4))
3885+
self.assertEqual(codecs.lookup('TEST.AAA-8'), ('test.aaa-8', 2, 3, 4))
3886+
self.assertEqual(codecs.lookup('TEST.AAA 8'), ('test.aaa-8', 2, 3, 4))
3887+
self.assertEqual(codecs.lookup('TEST.AAA---8'), ('test.aaa---8', 2, 3, 4))
3888+
self.assertEqual(codecs.lookup('TEST.AAA   8'), ('test.aaa---8', 2, 3, 4))
3889+
self.assertEqual(codecs.lookup('TEST.AAA\xe9\u20ac-8'), ('test.aaa\xe9\u20ac-8', 2, 3, 4))
3890+
self.assertEqual(codecs.lookup('TEST.AAA.8'), ('test.aaa.8', 2, 3, 4))
3891+
self.assertEqual(codecs.lookup('TEST.AAA...8'), ('test.aaa...8', 2, 3, 4))
38963892

38973893
def test_encodings_normalize_encoding(self):
38983894
# encodings.normalize_encoding() ignores non-ASCII characters.
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
The codecs lookup function now again performs only minimal normalization of
2+
the encoding name before passing it to the search functions: all ASCII
3+
letters are converted to lower case, spaces are replaced with hyphens.
4+
This restores the pre-Python 3.9 behavior.

Python/codecs.c

Lines changed: 25 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -85,43 +85,46 @@ PyCodec_Unregister(PyObject *search_function)
8585

8686
extern int _Py_normalize_encoding(const char *, char *, size_t);
8787

88-
/* Convert a string to a normalized Python string(decoded from UTF-8): all characters are
89-
converted to lower case, spaces and hyphens are replaced with underscores. */
88+
/* Convert a string to a normalized Python string: all ASCII letters are
89+
converted to lower case, spaces are replaced with hyphens. */
9090

91-
static
92-
PyObject *normalizestring(const char *string)
91+
static PyObject*
92+
normalizestring(const char *string)
9393
{
94+
size_t i;
9495
size_t len = strlen(string);
95-
char *encoding;
96+
char *p;
9697
PyObject *v;
9798

9899
if (len > PY_SSIZE_T_MAX) {
99100
PyErr_SetString(PyExc_OverflowError, "string is too large");
100101
return NULL;
101102
}
102103

103-
encoding = PyMem_Malloc(len + 1);
104-
if (encoding == NULL)
104+
p = PyMem_Malloc(len + 1);
105+
if (p == NULL)
105106
return PyErr_NoMemory();
106-
107-
if (!_Py_normalize_encoding(string, encoding, len + 1))
108-
{
109-
PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
110-
PyMem_Free(encoding);
111-
return NULL;
112-
}
113-
114-
v = PyUnicode_FromString(encoding);
115-
PyMem_Free(encoding);
107+
for (i = 0; i < len; i++) {
108+
char ch = string[i];
109+
if (ch == ' ')
110+
ch = '-';
111+
else
112+
ch = Py_TOLOWER(Py_CHARMASK(ch));
113+
p[i] = ch;
114+
}
115+
p[i] = '\0';
116+
v = PyUnicode_FromString(p);
117+
PyMem_Free(p);
116118
return v;
117119
}
118120

119121
/* Lookup the given encoding and return a tuple providing the codec
120122
facilities.
121123
122-
The encoding string is looked up converted to all lower-case
123-
characters. This makes encodings looked up through this mechanism
124-
effectively case-insensitive.
124+
The encoding string is looked up with ASCII letters converted to all
125+
lower case. This makes encodings looked up through this mechanism
126+
effectively case-insensitive. Spaces are replaced with hyphens for
127+
names like "US ASCII" and "ISO 8859-1".
125128
126129
If no codec is found, a LookupError is set and NULL returned.
127130
@@ -142,8 +145,8 @@ PyObject *_PyCodec_Lookup(const char *encoding)
142145
assert(interp->codecs.initialized);
143146

144147
/* Convert the encoding to a normalized Python string: all
145-
characters are converted to lower case, spaces and hyphens are
146-
replaced with underscores. */
148+
ASCII letters are converted to lower case, spaces are
149+
replaced with hyphens. */
147150
PyObject *v = normalizestring(encoding);
148151
if (v == NULL) {
149152
return NULL;

0 commit comments

Comments
 (0)