Add multilingual (ML) text normalizer

nithinraok · nithinraok · commit 0402d4b51bc3 · 2025-08-06T11:33:22.000-07:00
Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com>
diff --git a/nemo_asr/run_eval_ml.py b/nemo_asr/run_eval_ml.py
@@ -17,19 +17,6 @@
 
 wer_metric = evaluate.load("wer")
 
-def normalize_text(text):
-    """Simple text normalization for non english languages"""
-    if text is None:
-        return ""
-    # Remove capitalization
-    text = text.lower()
-    
-    # Remove punctuation
-    text = re.sub(r'[^\w\s]', '', text)
-    
-    # Remove extra spaces
-    text = re.sub(r'\s+', ' ', text).strip()
-    return text
 
 def main(args):
     DATA_CACHE_DIR = os.path.join(os.getcwd(), "audio_cache")
@@ -181,8 +168,8 @@ def download_audio_files(batch):
         transcriptions = transcriptions[0]
     
     references = all_data["references"]
-    references = [normalize_text(ref) for ref in references]
-    predictions = [normalize_text(pred.text) for pred in transcriptions]
+    references = [data_utils.ml_normalizer(ref) for ref in references]
+    predictions = [data_utils.ml_normalizer(pred.text) for pred in transcriptions]
 
     avg_time = total_time / len(all_data["audio_filepaths"])
 
diff --git a/nemo_asr/run_nemo_ml.sh b/nemo_asr/run_nemo_ml.sh
@@ -9,7 +9,8 @@ export PYTHONPATH="..":$PYTHONPATH
 # Configuration
 MODEL_ID="nvidia/canary-1b-flash" 
 
-BATCH_SIZE=128 
+BATCH_SIZE=64
+
 DEVICE_ID=0
 
 # Available datasets and languages
diff --git a/normalizer/__init__.py b/normalizer/__init__.py
@@ -1 +1 @@
-from .normalizer import EnglishTextNormalizer
+from .normalizer import EnglishTextNormalizer, BasicMultilingualTextNormalizer
diff --git a/normalizer/data_utils.py b/normalizer/data_utils.py
@@ -1,5 +1,5 @@
 from datasets import load_dataset, Audio
-from normalizer import EnglishTextNormalizer
+from normalizer import EnglishTextNormalizer, BasicMultilingualTextNormalizer
 
 from .eval_utils import read_manifest, write_manifest
 
@@ -30,6 +30,8 @@ def get_text(sample):
 
 normalizer = EnglishTextNormalizer()
 
+ml_normalizer = BasicMultilingualTextNormalizer()
+
 
 def normalize(batch):
     batch["original_text"] = get_text(batch)
diff --git a/normalizer/normalizer.py b/normalizer/normalizer.py
@@ -92,6 +92,23 @@ def __call__(self, s: str):
         return s
 
 
+class BasicMultilingualTextNormalizer:
+    def __init__(self, remove_diacritics: bool = True):
+        self.clean = remove_symbols_and_diacritics if remove_diacritics else remove_symbols
+
+    def __call__(self, s: str):
+        s = s.lower()
+        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
+        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
+        s = self.clean(s).lower()
+
+        # Remove punctuations and extra spaces
+        s = re.sub(r"[^\w\s]", "", s)
+        s = re.sub(r"\s+", " ", s).strip()
+
+        return s
+
+
 class EnglishNumberNormalizer:
     """
     Convert any spelled-out numbers into arabic numbers, while handling:

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-from .normalizer import EnglishTextNormalizer`
	`1`	`+from .normalizer import EnglishTextNormalizer, BasicMultilingualTextNormalizer`