Skip to content

Commit 6a82aba

Browse files
committed
PB-2241: simplified util script for sanitizing file names
1 parent 03c5cdc commit 6a82aba

File tree

1 file changed

+17
-55
lines changed

1 file changed

+17
-55
lines changed

scripts/utils.py

Lines changed: 17 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,62 +1,22 @@
11
import re
2+
import unicodedata
3+
4+
# Define the German-specific mapping once
5+
GERMAN_UMLAUTS_MAPPING = str.maketrans({
6+
'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue', 'ß': 'ss'
7+
})
28

39

410
def normalize_special_characters(text):
5-
"""Maps special characters and common European accents to their ASCII equivalents."""
6-
mapping = {
7-
'ä': 'ae',
8-
'ö': 'oe',
9-
'ü': 'ue',
10-
'Ä': 'Ae',
11-
'Ö': 'Oe',
12-
'Ü': 'Ue',
13-
'ß': 'ss',
14-
'é': 'e',
15-
'è': 'e',
16-
'ê': 'e',
17-
'ë': 'e',
18-
'à': 'a',
19-
'â': 'a',
20-
'á': 'a',
21-
'ã': 'a',
22-
'ò': 'o',
23-
'ô': 'o',
24-
'ó': 'o',
25-
'õ': 'o',
26-
'ù': 'u',
27-
'û': 'u',
28-
'ú': 'u',
29-
'ì': 'i',
30-
'î': 'i',
31-
'í': 'i',
32-
'ï': 'i',
33-
'ç': 'c',
34-
'ñ': 'n',
35-
'É': 'E',
36-
'È': 'E',
37-
'Ê': 'E',
38-
'Ë': 'E',
39-
'À': 'A',
40-
'Â': 'A',
41-
'Á': 'A',
42-
'Ã': 'A',
43-
'Ò': 'O',
44-
'Ô': 'O',
45-
'Ó': 'O',
46-
'Õ': 'O',
47-
'Ù': 'U',
48-
'Û': 'U',
49-
'Ú': 'U',
50-
'Ì': 'I',
51-
'Î': 'I',
52-
'Í': 'I',
53-
'Ï': 'I',
54-
'Ç': 'C',
55-
'Ñ': 'N'
56-
}
57-
for char, replacement in mapping.items():
58-
text = text.replace(char, replacement)
59-
return text
11+
# Step 1: Handle the German umlauts explicitly (so that ö becomes oe, and so on).
12+
text = text.translate(GERMAN_UMLAUTS_MAPPING)
13+
14+
# Step 2: Decompose remaining accents (é -> e + ´)
15+
# NFKD separates the base character from the "combining" accent mark
16+
text = unicodedata.normalize('NFKD', text)
17+
18+
# Filter out the combining marks (the accents) and rejoin
19+
return "".join(c for c in text if not unicodedata.combining(c))
6020

6121

6222
def sanitize_name(text):
@@ -68,5 +28,7 @@ def sanitize_name(text):
6828
clean_name = normalize_special_characters(text)
6929
# Replace non-allowed chars with hyphens
7030
clean_name = re.sub(r'[^a-zA-Z0-9-]+', '-', clean_name)
31+
# Collapse runs of consecutive hyphens (e.g. --) into a single hyphen
32+
clean_name = re.sub(r'-+', '-', clean_name)
7133
# Strip leading/trailing hyphens (spaces were already replaced by hyphens above)
7234
return clean_name.strip('-')

0 commit comments

Comments
 (0)