Skip to content

Commit 7df0ae8

Browse files
committed
update phonemis library
1 parent 90ec4b9 commit 7df0ae8

File tree

6 files changed

+182
-0
lines changed

6 files changed

+182
-0
lines changed
Binary file not shown.
Binary file not shown.

packages/react-native-executorch/third-party/include/phonemis/preprocessor/constants.h

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,184 @@ inline const std::unordered_map<std::int64_t, std::string> kLargeCardinals = {
3636
{1000000000000LL, "trillion"}};
3737
} // namespace num2words::constants
3838

39+
// ----------------------------
40+
// unicode processing constants
41+
// ----------------------------
42+
namespace unicode::constants {

// Lookup table that folds accented / non-ASCII Latin letters to a plain ASCII
// replacement string.
//
// Key:   a single Unicode code point (char32_t).
// Value: the ASCII replacement. It is usually one letter, but ligatures and
//        the German sharp s expand to two ("AE", "ae", "OE", "oe", "ss").
//
// Diacritics are simply stripped; no language-specific transliteration is
// applied (e.g. U'Ä' maps to "A", not "AE").
//
// Every code point is listed exactly once, under the first language section
// that introduces it. Letters shared between languages (e.g. U'Ü', U'É',
// U'Ó') are therefore NOT repeated in later sections: std::unordered_map
// silently ignores duplicate keys in an initializer list, so repeated entries
// would be dead and could hide a conflicting mapping.
//
// NOTE(review): presumably consumed by the preprocessor's unicode
// normalization step - confirm against the caller.
inline const std::unordered_map<char32_t, std::string> kForeignToLatin = {
    // Polish
    {U'Ą', "A"},
    {U'ą', "a"},
    {U'Ć', "C"},
    {U'ć', "c"},
    {U'Ę', "E"},
    {U'ę', "e"},
    {U'Ł', "L"},
    {U'ł', "l"},
    {U'Ń', "N"},
    {U'ń', "n"},
    {U'Ó', "O"},
    {U'ó', "o"},
    {U'Ś', "S"},
    {U'ś', "s"},
    {U'Ź', "Z"},
    {U'ź', "z"},
    {U'Ż', "Z"},
    {U'ż', "z"},

    // German
    {U'Ä', "A"},
    {U'ä', "a"},
    {U'Ö', "O"},
    {U'ö', "o"},
    {U'Ü', "U"},
    {U'ü', "u"},
    {U'ß', "ss"},

    // French (Ü/ü: see German)
    {U'À', "A"},
    {U'à', "a"},
    {U'Â', "A"},
    {U'â', "a"},
    {U'Æ', "AE"},
    {U'æ', "ae"},
    {U'Ç', "C"},
    {U'ç', "c"},
    {U'É', "E"},
    {U'é', "e"},
    {U'È', "E"},
    {U'è', "e"},
    {U'Ê', "E"},
    {U'ê', "e"},
    {U'Ë', "E"},
    {U'ë', "e"},
    {U'Î', "I"},
    {U'î', "i"},
    {U'Ï', "I"},
    {U'ï', "i"},
    {U'Ô', "O"},
    {U'ô', "o"},
    {U'Œ', "OE"},
    {U'œ', "oe"},
    {U'Ù', "U"},
    {U'ù', "u"},
    {U'Û', "U"},
    {U'û', "u"},

    // Spanish (É/é: see French, Ó/ó: see Polish, Ü/ü: see German)
    {U'Á', "A"},
    {U'á', "a"},
    {U'Í', "I"},
    {U'í', "i"},
    {U'Ú', "U"},
    {U'ú', "u"},
    {U'Ñ', "N"},
    {U'ñ', "n"},

    // Italian (remaining accented vowels: see French, Spanish and Polish)
    {U'Ì', "I"},
    {U'ì', "i"},
    {U'Ò', "O"},
    {U'ò', "o"},

    // Scandinavian (Æ/æ: see French)
    {U'Å', "A"},
    {U'å', "a"},
    {U'Ø', "O"},
    {U'ø', "o"},

    // Hungarian (acute vowels: see Spanish/French/Polish, Ö/ö and Ü/ü: see
    // German)
    {U'Ő', "O"},
    {U'ő', "o"},
    {U'Ű', "U"},
    {U'ű', "u"},

    // Czech/Slovak (acute vowels: see Spanish/French/Polish)
    {U'Č', "C"},
    {U'č', "c"},
    {U'Ď', "D"},
    {U'ď', "d"},
    {U'Ě', "E"},
    {U'ě', "e"},
    {U'Ň', "N"},
    {U'ň', "n"},
    {U'Ř', "R"},
    {U'ř', "r"},
    {U'Š', "S"},
    {U'š', "s"},
    {U'Ť', "T"},
    {U'ť', "t"},
    {U'Ů', "U"},
    {U'ů', "u"},
    {U'Ý', "Y"},
    {U'ý', "y"},
    {U'Ž', "Z"},
    {U'ž', "z"},

    // Romanian (Â/â and Î/î: see French)
    {U'Ă', "A"},
    {U'ă', "a"},
    {U'Ș', "S"},
    {U'ș', "s"},
    {U'Ț', "T"},
    {U'ț', "t"}};

} // namespace unicode::constants
216+
39217
// ---------------
40218
// other constants
41219
// ---------------

packages/react-native-executorch/third-party/include/phonemis/preprocessor/tools.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55

66
namespace phonemis::preprocessor {
77

8+
// Normalizes the text by replacing all foreign characters
9+
// with their Latin-only equivalents.
10+
std::string normalize_unicode(const std::string &text);
11+
812
// Divides a monolithic text into multiple sentences.
913
// A sentence always ends with an end-of-sentence character (defined in
1014
// constants.h).

0 commit comments

Comments
 (0)