Commit b7ae64b

added pnc flag for canary nemo asr eval
Signed-off-by: KunalDhawan <[email protected]>
1 parent: da27d35

2 files changed: 27 additions, 4 deletions

nemo_asr/run_canary.sh

Lines changed: 10 additions & 0 deletions
@@ -3,6 +3,7 @@
 export PYTHONPATH="..":$PYTHONPATH
 
 MODEL_IDs=("nvidia/canary-1b")
+PNC=False
 BATCH_SIZE=64
 DEVICE_ID=0
 
@@ -19,6 +20,7 @@ do
     --dataset="ami" \
     --split="test" \
     --device=${DEVICE_ID} \
+    --pnc=${PNC} \
     --batch_size=${BATCH_SIZE} \
     --max_eval_samples=-1
 
@@ -28,6 +30,7 @@ do
     --dataset="earnings22" \
     --split="test" \
     --device=${DEVICE_ID} \
+    --pnc=${PNC} \
     --batch_size=${BATCH_SIZE} \
     --max_eval_samples=-1
 
@@ -37,6 +40,7 @@ do
     --dataset="gigaspeech" \
     --split="test" \
     --device=${DEVICE_ID} \
+    --pnc=${PNC} \
     --batch_size=${BATCH_SIZE} \
     --max_eval_samples=-1
 
@@ -46,6 +50,7 @@ do
     --dataset="librispeech" \
     --split="test.clean" \
     --device=${DEVICE_ID} \
+    --pnc=${PNC} \
     --batch_size=${BATCH_SIZE} \
     --max_eval_samples=-1
 
@@ -55,6 +60,7 @@ do
     --dataset="librispeech" \
     --split="test.other" \
     --device=${DEVICE_ID} \
+    --pnc=${PNC} \
     --batch_size=${BATCH_SIZE} \
     --max_eval_samples=-1
 
@@ -64,6 +70,7 @@ do
     --dataset="spgispeech" \
     --split="test" \
     --device=${DEVICE_ID} \
+    --pnc=${PNC} \
     --batch_size=${BATCH_SIZE} \
     --max_eval_samples=-1
 
@@ -73,6 +80,7 @@ do
     --dataset="tedlium" \
     --split="test" \
     --device=${DEVICE_ID} \
+    --pnc=${PNC} \
     --batch_size=${BATCH_SIZE} \
     --max_eval_samples=-1
 
@@ -82,6 +90,7 @@ do
     --dataset="voxpopuli" \
     --split="test" \
    --device=${DEVICE_ID} \
+    --pnc=${PNC} \
     --batch_size=${BATCH_SIZE} \
     --max_eval_samples=-1
 
@@ -91,6 +100,7 @@ do
     --dataset="common_voice" \
     --split="test" \
     --device=${DEVICE_ID} \
+    --pnc=${PNC} \
     --batch_size=${BATCH_SIZE} \
     --max_eval_samples=-1
 
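Note on how the two changes interact: run_canary.sh passes the flag as --pnc=${PNC}, so run_eval.py receives the literal string "False". Because the new --pnc argument (declared below with type=bool) converts any non-empty string to True, the Python side presumably only checks whether the flag was supplied at all (pnc is not None) and hard-codes pnc=False in the transcribe call. A minimal, self-contained sketch of that argparse behaviour, for illustration only (not part of the commit):

import argparse

# Illustration: what run_eval.py's "--pnc" sees when run_canary.sh sets PNC=False.
parser = argparse.ArgumentParser()
parser.add_argument("--pnc", type=bool, default=None)

args = parser.parse_args(["--pnc=False"])   # i.e. --pnc=${PNC} with PNC=False
print(args.pnc)            # True  -- bool("False") is True for any non-empty string
print(args.pnc is None)    # False -- so the "pnc is not None" branch is taken
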

nemo_asr/run_eval.py

Lines changed: 17 additions & 4 deletions
@@ -54,15 +54,19 @@ def pack_results(results: list, buffer, transcriptions):
     return results
 
 
-def buffer_audio_and_transcribe(model: ASRModel, dataset, batch_size: int, cache_prefix: str, verbose: bool = True):
+def buffer_audio_and_transcribe(model: ASRModel, dataset, batch_size: int, pnc:bool, cache_prefix: str, verbose: bool = True):
     buffer = []
     results = []
     for sample in tqdm(dataset_iterator(dataset), desc='Evaluating: Sample id', unit='', disable=not verbose):
         buffer.append(sample)
 
         if len(buffer) == batch_size:
             filepaths = write_audio(buffer, cache_prefix)
-            transcriptions = model.transcribe(filepaths, batch_size=batch_size, verbose=False)
+
+            if pnc is not None:
+                transcriptions = model.transcribe(filepaths, batch_size=batch_size, pnc=False, verbose=False)
+            else:
+                transcriptions = model.transcribe(filepaths, batch_size=batch_size, verbose=False)
             # if transcriptions form a tuple (from RNNT), extract just "best" hypothesis
             if type(transcriptions) == tuple and len(transcriptions) == 2:
                 transcriptions = transcriptions[0]
@@ -71,7 +75,10 @@ def buffer_audio_and_transcribe(model: ASRModel, dataset, batch_size: int, cache
 
     if len(buffer) > 0:
         filepaths = write_audio(buffer, cache_prefix)
-        transcriptions = model.transcribe(filepaths, batch_size=batch_size, verbose=False)
+        if pnc is not None:
+            transcriptions = model.transcribe(filepaths, batch_size=batch_size, pnc=False, verbose=False)
+        else:
+            transcriptions = model.transcribe(filepaths, batch_size=batch_size, verbose=False)
         # if transcriptions form a tuple (from RNNT), extract just "best" hypothesis
         if type(transcriptions) == tuple and len(transcriptions) == 2:
             transcriptions = transcriptions[0]
@@ -112,7 +119,7 @@ def main(args):
     # run streamed inference
     cache_prefix = (f"{args.model_id.replace('/', '-')}-{args.dataset_path.replace('/', '')}-"
                     f"{args.dataset.replace('/', '-')}-{args.split}")
-    results = buffer_audio_and_transcribe(asr_model, dataset, args.batch_size, cache_prefix, verbose=True)
+    results = buffer_audio_and_transcribe(asr_model, dataset, args.batch_size, args.pnc, cache_prefix, verbose=True)
     for sample in results:
         predictions.append(data_utils.normalizer(sample["pred_text"]))
         references.append(sample["reference"])
@@ -166,6 +173,12 @@ def main(args):
         default=None,
         help="Number of samples to be evaluated. Put a lower number e.g. 64 for testing this script.",
     )
+    parser.add_argument(
+        "--pnc",
+        type=bool,
+        default=None,
+        help="flag to indicate inference in pnc mode for models that support punctuation and capitalization",
+    )
     parser.add_argument(
         "--no-streaming",
         dest='streaming',
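Because the branch above passes pnc=False to model.transcribe whenever --pnc is supplied, the flag's value itself is never consulted; that matches the PNC=False default in run_canary.sh. If the value were meant to drive punctuation and capitalization directly, one option is an explicit string-to-boolean converter plus forwarding the parsed value. The sketch below is an illustration only, and it assumes (as the diff implies) that the model's transcribe method accepts a pnc keyword argument:

import argparse

def str2bool(value: str) -> bool:
    # Map common textual booleans explicitly instead of relying on bool(str),
    # which treats every non-empty string (even "False") as True.
    if value.lower() in ("true", "t", "1", "yes"):
        return True
    if value.lower() in ("false", "f", "0", "no"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean, got {value!r}")

parser = argparse.ArgumentParser()
parser.add_argument("--pnc", type=str2bool, default=None,
                    help="Enable or disable punctuation and capitalization where supported.")
args = parser.parse_args(["--pnc=False"])   # -> args.pnc is False

# Only pass the keyword when the flag was given, and forward the parsed value
# (assumes transcribe accepts a `pnc` keyword, as the diff implies):
transcribe_kwargs = {} if args.pnc is None else {"pnc": args.pnc}
# transcriptions = model.transcribe(filepaths, batch_size=batch_size, verbose=False, **transcribe_kwargs)

Building the keyword dictionary once would also avoid repeating the if/else in both the batched and flush paths of buffer_audio_and_transcribe.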
