Skip to content

Commit 7e137a9

Browse files
committed
Add Hungarian normalisation files and implement normaliser logic
- Created `changes.csv` and `force_changes.csv` for Hungarian text normalisation with various mappings. - Implemented `normaliser.py` to handle loading of changes, applying replacements, and normalising text. - Added functionality for handling dates, times, ordinals, and removing unwanted characters. - Introduced a simple normaliser for any language with basic structure and example files.
1 parent 4b4359b commit 7e137a9

File tree

8 files changed

+2233
-659
lines changed

8 files changed

+2233
-659
lines changed

src/f5_tts/infer/infer_gradio.py

Lines changed: 537 additions & 659 deletions
Large diffs are not rendered by default.

src/f5_tts/infer/infer_gradio_original.py

Lines changed: 944 additions & 0 deletions
Large diffs are not rendered by default.

src/f5_tts/infer/normalisers/hun/changes.csv

Lines changed: 429 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
%, százalék
2+
ninjutsu, nindzsucu
3+
tweet, tvít
4+
chips, csipsz
5+
ly, j
Lines changed: 257 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,257 @@
1+
import csv
2+
import re
3+
from num2words import num2words
4+
import os
5+
6+
# Directory that contains this module; the CSV loaders below join their
# file names onto it.
base_dir = os.path.dirname(os.path.abspath(__file__))
8+
9+
def load_force_changes(filename="force_changes.csv"):
    """Load the unconditional ("force") replacement table from a CSV file.

    Each non-blank row must hold exactly two comma-separated fields,
    ``key, value``; surrounding whitespace is stripped from both.

    Args:
        filename: CSV file name, resolved relative to ``base_dir``.

    Returns:
        dict mapping each key to its replacement, in file order.

    Raises:
        ValueError: if a non-blank row does not have exactly two fields.
        OSError: if the file cannot be opened.
    """
    file_path = os.path.join(base_dir, filename)
    force_changes = {}
    # newline='' is what the csv module asks for so that it can handle line
    # endings (and quoted embedded newlines) itself.
    with open(file_path, encoding='utf-8', newline='') as csvfile:
        for line_no, row in enumerate(csv.reader(csvfile), start=1):
            cells = [cell.strip() for cell in row]
            if not any(cells):
                # Blank or whitespace-only line: nothing to load.
                continue
            if len(cells) != 2:
                raise ValueError(
                    f'{file_path}:{line_no}: expected "key, value", '
                    f'got {len(cells)} field(s): {row!r}'
                )
            key, value = cells
            if not key:
                # An empty search key can never be applied meaningfully.
                continue
            force_changes[key] = value
    return force_changes
20+
21+
def apply_force_changes(text, force_changes):
    """Apply every forced replacement to *text* as a plain substring swap.

    Matching is case-sensitive and ignores word boundaries. Each hit is
    replaced by the value padded with one space on both sides, so the
    replacement is always separated from its neighbours.

    Args:
        text: input string.
        force_changes: mapping of substring -> replacement, applied in
            iteration order (later entries see the output of earlier ones).

    Returns:
        The text with all replacements applied.
    """
    for key, value in force_changes.items():
        if not key:
            # str.replace('', x) would insert x between every character.
            continue
        text = text.replace(key, f' {value} ')
    return text
26+
27+
def load_changes(filename="changes.csv"):
    """Load the whole-word replacement table from a CSV file.

    Each non-blank row must hold exactly two comma-separated fields,
    ``key, value``; surrounding whitespace is stripped from both.

    Args:
        filename: CSV file name, resolved relative to ``base_dir``.

    Returns:
        dict mapping each key to its replacement, in file order.

    Raises:
        ValueError: if a non-blank row does not have exactly two fields.
        OSError: if the file cannot be opened.
    """
    file_path = os.path.join(base_dir, filename)
    changes = {}
    # newline='' is what the csv module asks for so that it can handle line
    # endings (and quoted embedded newlines) itself.
    with open(file_path, encoding='utf-8', newline='') as csvfile:
        for line_no, row in enumerate(csv.reader(csvfile), start=1):
            cells = [cell.strip() for cell in row]
            if not any(cells):
                # Blank or whitespace-only line: nothing to load.
                continue
            if len(cells) != 2:
                raise ValueError(
                    f'{file_path}:{line_no}: expected "key, value", '
                    f'got {len(cells)} field(s): {row!r}'
                )
            key, value = cells
            if not key:
                # An empty search key can never be applied meaningfully.
                continue
            changes[key] = value
    return changes
38+
39+
def apply_changes(text, changes):
    """Replace whole-word, case-insensitive occurrences of each key.

    Args:
        text: input string.
        changes: mapping of word -> replacement, applied in iteration order.

    Returns:
        The text with all replacements applied.
    """
    for key, value in changes.items():
        if not key:
            # r'\b\b' would match at every word boundary in the text.
            continue
        pattern = r'\b{}\b'.format(re.escape(key))
        # A callable replacement inserts the value literally; passing the
        # string itself would make re.sub expand backslash escapes and group
        # references that happen to appear in the CSV value.
        text = re.sub(
            pattern,
            lambda _match, _literal=value: _literal,
            text,
            flags=re.IGNORECASE,
        )
    return text
45+
46+
# Hungarian ordinal words keyed by their number (1-10); append further
# words to the tuple to extend the table.
ordinals = dict(enumerate(
    (
        'első',
        'második',
        'harmadik',
        'negyedik',
        'ötödik',
        'hatodik',
        'hetedik',
        'nyolcadik',
        'kilencedik',
        'tizedik',
    ),
    start=1,
))
59+
60+
def replace_ordinals(text, ordinals):
    """Spell out "<number>." ordinals that are not sentence-final.

    A number followed by a dot is rewritten as an ordinal word (the dot is
    dropped) unless the dot:
      * is the last non-whitespace character of the text,
      * is followed by sentence punctuation (``.``, ``!``, ``?``), or
      * is immediately followed by another digit (decimals, versions, ...).

    Args:
        text: input string.
        ordinals: mapping of int -> ordinal word; numbers missing from it
            fall back to ``num2words(..., to='ordinal', lang='hu')``.

    Returns:
        The text with ordinals replaced.
    """
    # The negative lookahead carries the whole "is this an ordinal?"
    # decision, so the callback only has to produce the word.
    pattern = r'(\d+)\.(?!\d|\s*$|\s*[.!?])'

    def repl(match):
        num = int(match.group(1))
        if num in ordinals:
            return ordinals[num]
        # Only consult num2words when the table has no entry.
        return num2words(num, to='ordinal', lang='hu')

    return re.sub(pattern, repl, text)
73+
74+
# Month abbreviation (lower-case, with trailing dot) -> full Hungarian month
# name. Some months have more than one accepted abbreviation.
months = {
    abbr: full_name
    for full_name, abbrs in (
        ('január', ('jan.',)),
        ('február', ('feb.',)),
        ('március', ('márc.', 'már.')),
        ('április', ('ápr.',)),
        ('május', ('máj.',)),
        ('június', ('jún.',)),
        ('július', ('júl.',)),
        ('augusztus', ('aug.',)),
        ('szeptember', ('szept.', 'szep.')),
        ('október', ('okt.',)),
        ('november', ('nov.',)),
        ('december', ('dec.',)),
    )
    for abbr in abbrs
}
90+
91+
# Month number (1-12) -> full Hungarian month name.
months_numbers = dict(enumerate(
    (
        'január',
        'február',
        'március',
        'április',
        'május',
        'június',
        'július',
        'augusztus',
        'szeptember',
        'október',
        'november',
        'december',
    ),
    start=1,
))
105+
106+
# Day of month (1-31) -> Hungarian spoken form used in dates.
day_words = dict(enumerate(
    (
        'elseje',
        'másodika',
        'harmadika',
        'negyedike',
        'ötödike',
        'hatodika',
        'hetedike',
        'nyolcadika',
        'kilencedike',
        'tizedike',
        'tizenegyedike',
        'tizenkettedike',
        'tizenharmadika',
        'tizennegyedike',
        'tizenötödike',
        'tizenhatodika',
        'tizenhetedike',
        'tizennyolcadika',
        'tizenkilencedike',
        'huszadika',
        'huszonegyedike',
        'huszonkettedike',
        'huszonharmadika',
        'huszonnegyedike',
        'huszonötödike',
        'huszonhatodika',
        'huszonhetedike',
        'huszonnyolcadika',
        'huszonkilencedike',
        'harmincadika',
        'harmincegyedike',
    ),
    start=1,
))
139+
140+
def day_to_text(day):
    """Return the spoken Hungarian form of a day-of-month number.

    Args:
        day: day number as an int.

    Returns:
        The entry from ``day_words`` when there is one; otherwise the
        cardinal from ``num2words`` with ``'ika'`` appended.
    """
    if day in day_words:
        return day_words[day]
    # Only build the fallback when the table has no entry for this day.
    return num2words(day, lang='hu') + 'ika'
143+
144+
def _day_with_on_suffix(day_text):
    """Return *day_text* inflected for "on that day" (written "-án/-én").

    Hungarian lengthens a word-final "a"/"e" before the "-n" ending
    ("huszonharmadika" -> "huszonharmadikán", "ötödike" -> "ötödikén"),
    so appending a bare "n" would produce a misspelt word.
    """
    if day_text.endswith('a'):
        return day_text[:-1] + 'án'
    if day_text.endswith('e'):
        return day_text[:-1] + 'én'
    return day_text + 'n'


def replace_dates(text):
    """Rewrite written dates as spoken Hungarian.

    Recognised forms, tried in this order:
      1. ``2015.10.23.``   (whitespace after the dots is allowed)
      2. ``2015.okt.23.``  (whitespace around the abbreviation is allowed)
      3. ``okt.23.``
      4. ``okt. 23-án`` / ``-én`` / ``-jén``

    Month abbreviations are matched case-insensitively; the ``months``
    table itself is keyed in lower case. A numeric date whose month is not
    in ``months_numbers`` is left untouched.

    Args:
        text: input string.

    Returns:
        The text with recognised dates spelled out.
    """
    month_abbrs = '|'.join(re.escape(abbr) for abbr in months)

    def month_name(abbr):
        # Fall back to the matched text if the table has no such key.
        return months.get(abbr.lower(), abbr)

    # 1. Year.Month.Day with a numeric month (2015.10.23. / 2015. 10. 23.)
    def repl_numeric(match):
        year, month, day = (int(group) for group in match.groups())
        if month not in months_numbers:
            # Not a real month: better to keep the original text than to
            # emit a date with the month silently missing.
            return match.group(0)
        year_text = num2words(year, lang='hu')
        month_text = months_numbers[month]
        day_text = day_to_text(day)
        return f'{year_text} {month_text} {day_text}'

    text = re.sub(r'(\d{4})\.\s*(\d{1,2})\.\s*(\d{1,2})\.', repl_numeric, text)

    # 2. Year.MonthAbbr.Day (2015.okt.23. / 2015. okt. 23.)
    def repl_year_abbr(match):
        year_text = num2words(int(match.group(1)), lang='hu')
        month_text = month_name(match.group(2))
        day_text = day_to_text(int(match.group(3)))
        return f'{year_text} {month_text} {day_text}'

    text = re.sub(
        r'(\d{4})\.\s*(' + month_abbrs + r')\s*(\d{1,2})\.',
        repl_year_abbr,
        text,
        flags=re.IGNORECASE,
    )

    # 3. MonthAbbr.Day (okt.23.)
    def repl_abbr(match):
        month_text = month_name(match.group(1))
        day_text = day_to_text(int(match.group(2)))
        return f'{month_text} {day_text}'

    text = re.sub(
        r'(' + month_abbrs + r')(\d{1,2})\.',
        repl_abbr,
        text,
        flags=re.IGNORECASE,
    )

    # 4. MonthAbbr. Day-suffix (okt. 23-án, nov. 5-én, jan. 1-jén)
    def repl_abbr_suffixed(match):
        month_text = month_name(match.group(1))
        day_text = _day_with_on_suffix(day_to_text(int(match.group(2))))
        return f'{month_text} {day_text}'

    text = re.sub(
        r'(' + month_abbrs + r')\s+(\d{1,2})-(?:án|én|jén)',
        repl_abbr_suffixed,
        text,
        flags=re.IGNORECASE,
    )

    return text
193+
194+
def replace_times(text):
    """Spell out clock times written as ``H:MM`` or ``H:MM:SS``.

    Hours, minutes and (when present) seconds are converted with
    ``num2words`` and joined with the Hungarian unit words.

    Args:
        text: input string.

    Returns:
        The text with every recognised time replaced by its spoken form.
    """
    def to_words(digits):
        return num2words(int(digits), lang='hu')

    def spoken(match):
        hh, mm, ss = match.groups()
        spoken_time = f'{to_words(hh)} óra {to_words(mm)} perc'
        if not ss:
            # No seconds component was written.
            return spoken_time
        return spoken_time + f' {to_words(ss)} másodperc'

    return re.sub(r'(\d{1,2}):(\d{2})(?::(\d{2}))?', spoken, text)
211+
212+
def replace_numbers(text):
    """Spell out every standalone run of digits in Hungarian.

    Args:
        text: input string.

    Returns:
        The text with each word-bounded integer replaced by
        ``num2words(value, lang='hu')``.
    """
    return re.sub(
        r'\b\d+\b',
        lambda found: num2words(int(found.group(0)), lang='hu'),
        text,
    )
220+
221+
def remove_duplicate_spaces(text):
    """Collapse each whitespace run to one space and trim both ends."""
    # str.split() with no separator splits on whitespace runs and drops
    # leading/trailing empties, so joining the pieces normalises spacing.
    return ' '.join(text.split())
225+
226+
def remove_unwanted_characters(text):
    """Replace each unwanted punctuation character with a single space.

    The affected characters are ``* - " ' : ( ) / # @``. Runs of spaces
    left behind are not collapsed here.
    """
    unwanted = re.compile(r'[*\-\"\'\:\(\)/#@]')
    return unwanted.sub(' ', text)
231+
232+
def add_prefix(text):
    """Lower-case *text* and put ``"... "`` in front of it."""
    lowered = text.lower()
    return f'... {lowered}'
235+
236+
def normalize(text):
    """Run the full Hungarian normalisation pipeline on *text*.

    Both CSV tables are loaded on every call. The steps then run in a
    fixed order: forced replacements, whole-word replacements, dates,
    times, ordinals, remaining numbers, unwanted-character removal,
    whitespace clean-up and finally lower-casing with the "... " prefix.

    Args:
        text: raw input string.

    Returns:
        The normalised string.
    """
    forced_table = load_force_changes('force_changes.csv')
    word_table = load_changes('changes.csv')

    # Order matters: dates and times must be consumed before the generic
    # ordinal/number passes see their digits.
    pipeline = (
        lambda s: apply_force_changes(s, forced_table),
        lambda s: apply_changes(s, word_table),
        replace_dates,
        replace_times,
        lambda s: replace_ordinals(s, ordinals),
        replace_numbers,
        remove_unwanted_characters,
        remove_duplicate_spaces,
        add_prefix,
    )
    for step in pipeline:
        text = step(text)

    return text
252+
253+
if __name__ == "__main__":
    # Demo: normalise a sample sentence containing a percentage, a clock
    # time and a date, then print the result.
    print(normalize(
        "Ez egy példa, KENY, szöveg 10% és 7:15 időponttal 2015.10.23. dátummal. Chartmen"
    ))
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
example_need_to_change, example_changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
#, hashtag
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import csv
2+
import re
3+
from num2words import num2words
4+
import os
5+
6+
# Directory that contains this module; the CSV loaders below join their
# file names onto it.
base_dir = os.path.dirname(os.path.abspath(__file__))
7+
8+
def load_force_changes(filename="force_changes.csv"):
    """Load the unconditional ("force") replacement table from a CSV file.

    Each non-blank row must hold exactly two comma-separated fields,
    ``key, value``; surrounding whitespace is stripped from both.

    Args:
        filename: CSV file name, resolved relative to ``base_dir``.

    Returns:
        dict mapping each key to its replacement, in file order.

    Raises:
        ValueError: if a non-blank row does not have exactly two fields.
        OSError: if the file cannot be opened.
    """
    file_path = os.path.join(base_dir, filename)
    force_changes = {}
    # newline='' is what the csv module asks for so that it can handle line
    # endings (and quoted embedded newlines) itself.
    with open(file_path, encoding='utf-8', newline='') as csvfile:
        for line_no, row in enumerate(csv.reader(csvfile), start=1):
            cells = [cell.strip() for cell in row]
            if not any(cells):
                # Blank or whitespace-only line: nothing to load.
                continue
            if len(cells) != 2:
                raise ValueError(
                    f'{file_path}:{line_no}: expected "key, value", '
                    f'got {len(cells)} field(s): {row!r}'
                )
            key, value = cells
            if not key:
                # An empty search key can never be applied meaningfully.
                continue
            force_changes[key] = value
    return force_changes
18+
19+
def apply_force_changes(text, force_changes):
    """Apply every forced replacement to *text* as a plain substring swap.

    Matching is case-sensitive and ignores word boundaries. Each hit is
    replaced by the value padded with one space on both sides.

    Args:
        text: input string.
        force_changes: mapping of substring -> replacement, applied in
            iteration order (later entries see the output of earlier ones).

    Returns:
        The text with all replacements applied.
    """
    for key, value in force_changes.items():
        if not key:
            # str.replace('', x) would insert x between every character.
            continue
        text = text.replace(key, f' {value} ')
    return text
23+
24+
def load_changes(filename="changes.csv"):
    """Load the whole-word replacement table from a CSV file.

    Each non-blank row must hold exactly two comma-separated fields,
    ``key, value``; surrounding whitespace is stripped from both.

    Args:
        filename: CSV file name, resolved relative to ``base_dir``.

    Returns:
        dict mapping each key to its replacement, in file order.

    Raises:
        ValueError: if a non-blank row does not have exactly two fields.
        OSError: if the file cannot be opened.
    """
    file_path = os.path.join(base_dir, filename)
    changes = {}
    # newline='' is what the csv module asks for so that it can handle line
    # endings (and quoted embedded newlines) itself.
    with open(file_path, encoding='utf-8', newline='') as csvfile:
        for line_no, row in enumerate(csv.reader(csvfile), start=1):
            cells = [cell.strip() for cell in row]
            if not any(cells):
                # Blank or whitespace-only line: nothing to load.
                continue
            if len(cells) != 2:
                raise ValueError(
                    f'{file_path}:{line_no}: expected "key, value", '
                    f'got {len(cells)} field(s): {row!r}'
                )
            key, value = cells
            if not key:
                # An empty search key can never be applied meaningfully.
                continue
            changes[key] = value
    return changes
34+
35+
def apply_changes(text, changes):
    """Replace whole-word, case-insensitive occurrences of each key.

    Args:
        text: input string.
        changes: mapping of word -> replacement, applied in iteration order.

    Returns:
        The text with all replacements applied.
    """
    for key, value in changes.items():
        if not key:
            # r'\b\b' would match at every word boundary in the text.
            continue
        pattern = r'\b{}\b'.format(re.escape(key))
        # A callable replacement inserts the value literally; passing the
        # string itself would make re.sub expand backslash escapes and group
        # references that happen to appear in the CSV value.
        text = re.sub(
            pattern,
            lambda _match, _literal=value: _literal,
            text,
            flags=re.IGNORECASE,
        )
    return text
40+
41+
def remove_duplicate_spaces(text):
    """Collapse each whitespace run to one space and trim both ends."""
    # str.split() with no separator splits on whitespace runs and drops
    # leading/trailing empties, so joining the pieces normalises spacing.
    return ' '.join(text.split())
45+
46+
def add_prefix(text):
    """Lower-case *text* and put ``"... "`` in front of it.

    Per the original author's note, the "... " prefix is there to
    stabilise short predictions.
    """
    lowered = text.lower()
    return f'... {lowered}'
49+
50+
def normalize(text):
    """Run the basic normalisation pipeline on *text*.

    Both CSV tables are loaded on every call. The steps then run in order:
    forced replacements, whole-word replacements, whitespace clean-up and
    finally lower-casing with the "... " prefix.

    Args:
        text: raw input string.

    Returns:
        The normalised string.
    """
    forced_table = load_force_changes('force_changes.csv')
    word_table = load_changes('changes.csv')

    pipeline = (
        lambda s: apply_force_changes(s, forced_table),
        lambda s: apply_changes(s, word_table),
        remove_duplicate_spaces,
        add_prefix,
    )
    for step in pipeline:
        text = step(text)

    return text

0 commit comments

Comments
 (0)