File tree Expand file tree Collapse file tree 5 files changed +25
-18
lines changed
Expand file tree Collapse file tree 5 files changed +25
-18
lines changed Original file line number Diff line number Diff line change 1717
1818wer_metric = evaluate .load ("wer" )
1919
def normalize_text(text):
    """Apply simple, language-agnostic normalization to a transcript.

    The text is lower-cased, stripped of punctuation, and has runs of
    whitespace collapsed to a single space.

    Args:
        text: The string to normalize, or ``None``.

    Returns:
        The normalized string; ``""`` when ``text`` is ``None``.
    """
    if text is None:
        return ""

    # Remove capitalization.
    text = text.lower()

    # Remove punctuation. ``\w`` also matches "_", so the underscore has to
    # be listed explicitly or it would survive this step.
    text = re.sub(r"[^\w\s]|_", "", text)

    # Collapse runs of whitespace and trim both ends.
    return re.sub(r"\s+", " ", text).strip()
3320
3421def main (args ):
3522 DATA_CACHE_DIR = os .path .join (os .getcwd (), "audio_cache" )
@@ -181,8 +168,8 @@ def download_audio_files(batch):
181168 transcriptions = transcriptions [0 ]
182169
183170 references = all_data ["references" ]
184- references = [normalize_text (ref ) for ref in references ]
185- predictions = [normalize_text (pred .text ) for pred in transcriptions ]
171+ references = [data_utils . ml_normalizer (ref ) for ref in references ]
172+ predictions = [data_utils . ml_normalizer (pred .text ) for pred in transcriptions ]
186173
187174 avg_time = total_time / len (all_data ["audio_filepaths" ])
188175
Original file line number Diff line number Diff line change @@ -9,7 +9,8 @@ export PYTHONPATH="..":$PYTHONPATH
99# Configuration
1010MODEL_ID=" nvidia/canary-1b-flash"
1111
12- BATCH_SIZE=128
12+ BATCH_SIZE=64
13+
1314DEVICE_ID=0
1415
1516# Available datasets and languages
Original file line number Diff line number Diff line change 1- from .normalizer import EnglishTextNormalizer
1+ from .normalizer import EnglishTextNormalizer , BasicMultilingualTextNormalizer
Original file line number Diff line number Diff line change 11from datasets import load_dataset , Audio
2- from normalizer import EnglishTextNormalizer
2+ from normalizer import EnglishTextNormalizer , BasicMultilingualTextNormalizer
33
44from .eval_utils import read_manifest , write_manifest
55
@@ -30,6 +30,8 @@ def get_text(sample):
3030
3131normalizer = EnglishTextNormalizer ()
3232
33+ ml_normalizer = BasicMultilingualTextNormalizer ()
34+
3335
3436def normalize (batch ):
3537 batch ["original_text" ] = get_text (batch )
Original file line number Diff line number Diff line change @@ -92,6 +92,23 @@ def __call__(self, s: str):
9292 return s
9393
9494
class BasicMultilingualTextNormalizer:
    """Basic text normalizer intended for non-English transcripts.

    Calling an instance lower-cases the text, drops bracketed and
    parenthesized spans, runs the configured symbol-cleaning helper, and
    finally strips remaining punctuation and collapses whitespace.

    Args:
        remove_diacritics: When true (the default) the instance cleans text
            with ``remove_symbols_and_diacritics``; otherwise it uses
            ``remove_symbols``. Both helpers are defined elsewhere in this
            module.
    """

    def __init__(self, remove_diacritics: bool = True):
        self.clean = remove_symbols_and_diacritics if remove_diacritics else remove_symbols

    def __call__(self, s: str):
        """Return the normalized form of ``s``.

        ``None`` is accepted and normalizes to ``""`` so that missing
        references or predictions do not crash evaluation.
        """
        if s is None:
            return ""

        s = s.lower()
        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parentheses
        # Lower-case again after cleaning, in case the helper changes letter case.
        s = self.clean(s).lower()

        # Remove punctuation and extra spaces.
        s = re.sub(r"[^\w\s]", "", s)
        s = re.sub(r"\s+", " ", s).strip()

        return s
110+
111+
95112class EnglishNumberNormalizer :
96113 """
97114 Convert any spelled-out numbers into arabic numbers, while handling:
You can’t perform that action at this time.
0 commit comments