Skip to content

Commit 03c5cdc

Browse files
committed
PB-2241: added character conversion mappings for more special characters
1 parent a0a7aaa commit 03c5cdc

File tree

1 file changed

+54
-4
lines changed

1 file changed

+54
-4
lines changed

scripts/utils.py

Lines changed: 54 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,59 @@
1 1
import re
2 2

3 3

4-
def normalize_umlauts(text):
5-
"""Maps German umlauts to their ASCII equivalents."""
6-
mapping = {'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue', 'ß': 'ss'}
4+
def normalize_special_characters(text):
5+
"""Maps special characters and common European accents to their ASCII equivalents."""
6+
mapping = {
7+
'ä': 'ae',
8+
'ö': 'oe',
9+
'ü': 'ue',
10+
'Ä': 'Ae',
11+
'Ö': 'Oe',
12+
'Ü': 'Ue',
13+
'ß': 'ss',
14+
'é': 'e',
15+
'è': 'e',
16+
'ê': 'e',
17+
'ë': 'e',
18+
'à': 'a',
19+
'â': 'a',
20+
'á': 'a',
21+
'ã': 'a',
22+
'ò': 'o',
23+
'ô': 'o',
24+
'ó': 'o',
25+
'õ': 'o',
26+
'ù': 'u',
27+
'û': 'u',
28+
'ú': 'u',
29+
'ì': 'i',
30+
'î': 'i',
31+
'í': 'i',
32+
'ï': 'i',
33+
'ç': 'c',
34+
'ñ': 'n',
35+
'É': 'E',
36+
'È': 'E',
37+
'Ê': 'E',
38+
'Ë': 'E',
39+
'À': 'A',
40+
'Â': 'A',
41+
'Á': 'A',
42+
'Ã': 'A',
43+
'Ò': 'O',
44+
'Ô': 'O',
45+
'Ó': 'O',
46+
'Õ': 'O',
47+
'Ù': 'U',
48+
'Û': 'U',
49+
'Ú': 'U',
50+
'Ì': 'I',
51+
'Î': 'I',
52+
'Í': 'I',
53+
'Ï': 'I',
54+
'Ç': 'C',
55+
'Ñ': 'N'
56+
}
7 57
for char, replacement in mapping.items():
8 58
text = text.replace(char, replacement)
9 59
return text
@@ -15,7 +65,7 @@ def sanitize_name(text):
15 65
non-alphanumeric characters with hyphens.
16 66
"""
17 67
# Normalize Umlauts
18-
clean_name = normalize_umlauts(text)
68+
clean_name = normalize_special_characters(text)
19 69
# Replace non-allowed chars with hyphens
20 70
clean_name = re.sub(r'[^a-zA-Z0-9-]+', '-', clean_name)
21 71
# Strip leading/trailing hyphens and spaces

0 commit comments

Comments
 (0)