Skip to content

Commit 92873d6

Browse files
C
1 parent 609d5ad commit 92873d6

File tree

8 files changed

+110
-29
lines changed

8 files changed

+110
-29
lines changed

Lib/encodings/__init__.py

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
2727
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
2828
29-
"""#"
29+
"""
3030

3131
import codecs
3232
import sys
@@ -37,10 +37,23 @@
3737
_import_tail = ['*']
3838
_aliases = aliases.aliases
3939

40+
41+
_norm_encoding_map = (
42+
#0123456789ABCDEF0123456789ABCDEF
43+
' '
44+
' . 0123456789 '
45+
' ABCDEFGHIJKLMNOPQRSTUVWXYZ '
46+
' abcdefghijklmnopqrstuvwxyz '
47+
' '
48+
' '
49+
' '
50+
' ')
51+
52+
4053
class CodecRegistryError(LookupError, SystemError):
4154
pass
4255

43-
def normalize_encoding(encoding):
56+
def normalize_encoding(encoding, /):
4457

4558
""" Normalize an encoding name.
4659
@@ -55,18 +68,10 @@ def normalize_encoding(encoding):
5568
if isinstance(encoding, bytes):
5669
encoding = str(encoding, "ascii")
5770

58-
chars = []
59-
punct = False
60-
for c in encoding:
61-
if c.isalnum() or c == '.':
62-
if punct and chars:
63-
chars.append('_')
64-
if c.isascii():
65-
chars.append(c)
66-
punct = False
67-
else:
68-
punct = True
69-
return ''.join(chars)
71+
s = encoding.translate(_norm_encoding_map)
72+
return '_'.join(s.split())
73+
74+
from _codecs import _normalize_encoding as normalize_encoding
7075

7176
def search_function(encoding):
7277

Lib/test/test_codecs.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3895,11 +3895,13 @@ def search_function(encoding):
38953895
self.assertEqual(NOT_FOUND, codecs.lookup('a\xe9\u20ac-8'))
38963896

38973897
def test_encodings_normalize_encoding(self):
3898-
# encodings.normalize_encoding() ignores non-ASCII characters.
38993898
normalize = encodings.normalize_encoding
39003899
self.assertEqual(normalize('utf_8'), 'utf_8')
3901-
self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8')
39023900
self.assertEqual(normalize('utf 8'), 'utf_8')
3901+
3902+
# encodings.normalize_encoding() does not accept non-ASCII characters.
3903+
self.assertRaises(UnicodeEncodeError, normalize, 'utf\xE9\u20AC\U0010ffff-8')
3904+
39033905
# encodings.normalize_encoding() doesn't convert
39043906
# characters to lower case.
39053907
self.assertEqual(normalize('UTF 8'), 'UTF_8')
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
:mod:`encodings`: Improve :func:`~encodings.normalize_encoding` performance
2+
by implementing the function in C using the private
3+
``_Py_normalize_encoding`` which has been modified to make lowercase
4+
conversion optional.

Modules/_codecsmodule.c

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1022,6 +1022,44 @@ _codecs_lookup_error_impl(PyObject *module, const char *name)
10221022
return PyCodec_LookupError(name);
10231023
}
10241024

1025+
extern int _Py_normalize_encoding(const char *, char *, size_t, int);
1026+
1027+
/*[clinic input]
1028+
_codecs._normalize_encoding
1029+
encoding: str(encoding='ascii')
1030+
/
1031+
1032+
Normalize an encoding name, while not converting to lower case (to_lower == 0).
1033+
Used for encodings.normalize_encoding.
1034+
[clinic start generated code]*/
1035+
1036+
static PyObject *
1037+
_codecs__normalize_encoding_impl(PyObject *module, char *encoding)
1038+
/*[clinic end generated code: output=d5e3a4b5266fbe96 input=ca002bbc262228f1]*/
1039+
{
1040+
size_t len = strlen(encoding);
1041+
if (len > PY_SSIZE_T_MAX) {
1042+
PyErr_SetString(PyExc_OverflowError, "encoding is too large");
1043+
return NULL;
1044+
}
1045+
1046+
char *normalized = PyMem_Malloc(len + 1);
1047+
if (normalized == NULL) {
1048+
return PyErr_NoMemory();
1049+
}
1050+
1051+
if (!_Py_normalize_encoding(encoding, normalized, len + 1, 0)) {
1052+
PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
1053+
PyMem_Free(normalized);
1054+
return NULL;
1055+
}
1056+
1057+
PyObject *v = PyUnicode_FromString(normalized);
1058+
PyMem_Free(normalized);
1059+
return v;
1060+
}
1061+
1062+
10251063
/* --- Module API --------------------------------------------------------- */
10261064

10271065
static PyMethodDef _codecs_functions[] = {
@@ -1071,6 +1109,7 @@ static PyMethodDef _codecs_functions[] = {
10711109
_CODECS_REGISTER_ERROR_METHODDEF
10721110
_CODECS__UNREGISTER_ERROR_METHODDEF
10731111
_CODECS_LOOKUP_ERROR_METHODDEF
1112+
_CODECS__NORMALIZE_ENCODING_METHODDEF
10741113
{NULL, NULL} /* sentinel */
10751114
};
10761115

Modules/clinic/_codecsmodule.c.h

Lines changed: 30 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Objects/unicodeobject.c

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3587,13 +3587,14 @@ PyUnicode_FromEncodedObject(PyObject *obj,
35873587
return v;
35883588
}
35893589

3590-
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
3591-
also convert to lowercase. Return 1 on success, or 0 on error (encoding is
3592-
longer than lower_len-1). */
3590+
/* Normalize an encoding name like encodings.normalize_encoding()
3591+
Optionally convert to lowercase by setting *to_lower* to 1.
3592+
Return 1 on success, or 0 on error (encoding is longer than lower_len-1). */
35933593
int
35943594
_Py_normalize_encoding(const char *encoding,
35953595
char *lower,
3596-
size_t lower_len)
3596+
size_t lower_len,
3597+
int to_lower)
35973598
{
35983599
const char *e;
35993600
char *l;
@@ -3624,7 +3625,7 @@ _Py_normalize_encoding(const char *encoding,
36243625
if (l == l_end) {
36253626
return 0;
36263627
}
3627-
*l++ = Py_TOLOWER(c);
3628+
*l++ = to_lower ? Py_TOLOWER(c) : c;
36283629
}
36293630
else {
36303631
punct = 1;
@@ -3659,7 +3660,7 @@ PyUnicode_Decode(const char *s,
36593660
}
36603661

36613662
/* Shortcuts for common default encodings */
3662-
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3663+
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
36633664
char *lower = buflower;
36643665

36653666
/* Fast paths */
@@ -3916,7 +3917,7 @@ PyUnicode_AsEncodedString(PyObject *unicode,
39163917
}
39173918

39183919
/* Shortcuts for common default encodings */
3919-
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3920+
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
39203921
char *lower = buflower;
39213922

39223923
/* Fast paths */

Python/codecs.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ PyCodec_Unregister(PyObject *search_function)
9090
return 0;
9191
}
9292

93-
extern int _Py_normalize_encoding(const char *, char *, size_t);
93+
extern int _Py_normalize_encoding(const char *, char *, size_t, int);
9494

9595
/* Convert a string to a normalized Python string (decoded from UTF-8): all characters are
9696
converted to lower case, spaces and hyphens are replaced with underscores. */
@@ -108,10 +108,11 @@ PyObject *normalizestring(const char *string)
108108
}
109109

110110
encoding = PyMem_Malloc(len + 1);
111-
if (encoding == NULL)
111+
if (encoding == NULL) {
112112
return PyErr_NoMemory();
113+
}
113114

114-
if (!_Py_normalize_encoding(string, encoding, len + 1))
115+
if (!_Py_normalize_encoding(string, encoding, len + 1, 1))
115116
{
116117
PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
117118
PyMem_Free(encoding);

Python/fileutils.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ _Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs)
180180

181181
#define USE_FORCE_ASCII
182182

183-
extern int _Py_normalize_encoding(const char *, char *, size_t);
183+
extern int _Py_normalize_encoding(const char *, char *, size_t, int);
184184

185185
/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale
186186
and POSIX locale. nl_langinfo(CODESET) announces an alias of the
@@ -231,7 +231,7 @@ check_force_ascii(void)
231231
}
232232

233233
char encoding[20]; /* longest name: "iso_646.irv_1991\0" */
234-
if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) {
234+
if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding), 1)) {
235235
goto error;
236236
}
237237

0 commit comments

Comments
 (0)