|
import re

# NON_ALPHA matches exactly one character that is NOT in the whitelist below:
# ASCII letters, ASCII digits, underscore, and the listed Unicode ranges.
# For example, ``NON_ALPHA.sub("", text)`` deletes every character that is
# not whitelisted, and ``NON_ALPHA.search(text) is None`` means the text
# consists solely of whitelisted characters.
#
# NOTE: apart from Latin-1 Supplement, the ranges are whole Unicode blocks,
# so block-internal punctuation and digits (e.g. ARABIC COMMA U+060C,
# DEVANAGARI DANDA U+0964) are whitelisted as well.
NON_ALPHA = re.compile(
    "[^"
    # --- Latin ---
    # Latin-1 Supplement, letters only.  The full block U+0080-U+00FF also
    # holds the C1 control characters, NO-BREAK SPACE and symbols such as
    # inverted question/exclamation marks, guillemets, currency signs,
    # MULTIPLICATION SIGN (U+00D7) and DIVISION SIGN (U+00F7); none of those
    # are letters, so they are deliberately left out of the whitelist.
    "\u00AA\u00B5\u00BA"  # feminine ordinal, micro sign, masculine ordinal
    "\u00C0-\u00D6"  # A-grave .. O-diaeresis (stops before U+00D7)
    "\u00D8-\u00F6"  # O-stroke .. o-diaeresis (stops before U+00F7)
    "\u00F8-\u00FF"  # o-stroke .. y-diaeresis
    "\u0100-\u017F"  # Latin Extended-A (Central European, Baltic, etc.)
    "\u0180-\u024F"  # Latin Extended-B (additional European languages)
    "\u1E00-\u1EFF"  # Latin Extended Additional (Vietnamese, some African languages)
    # --- Cyrillic and Greek ---
    "\u0400-\u04FF"  # Cyrillic (Russian, Bulgarian, Serbian, etc.)
    "\u0370-\u03FF"  # Greek and Coptic
    "\u1F00-\u1FFF"  # Greek Extended
    # --- Arabic, Indic and African scripts ---
    "\u0600-\u06FF"  # Arabic
    "\u0750-\u077F"  # Arabic Supplement
    "\u07C0-\u07FF"  # N'Ko
    "\u0900-\u097F"  # Devanagari (Hindi, Marathi, Sanskrit)
    "\u1200-\u137F"  # Ethiopic (Amharic, Tigrinya)
    "\u2D30-\u2D7F"  # Tifinagh (Berber languages)
    "\uA500-\uA63F"  # Vai (West African)
    # --- East Asian scripts ---
    "\u4E00-\u9FFF"  # CJK Unified Ideographs (Chinese; also Japanese kanji, Korean hanja)
    "\uAC00-\uD7AF"  # Hangul Syllables (Korean)
    "\u3040-\u309F\u30A0-\u30FF"  # Hiragana and Katakana (Japanese)
    # --- Further South Asian scripts ---
    "\u0B80-\u0BFF"  # Tamil
    "\u0C00-\u0C7F"  # Telugu
    "\u0C80-\u0CFF"  # Kannada
    "\u0D00-\u0D7F"  # Malayalam
    "\u0980-\u09FF"  # Bengali
    "\u0A00-\u0A7F"  # Gurmukhi (Punjabi)
    "\u0A80-\u0AFF"  # Gujarati
    "\u0B00-\u0B7F"  # Oriya
    # --- ASCII ---
    "A-Za-z_0-9"  # Basic Latin letters, digits, and underscore
    "]"
)
0 commit comments