NVIDIA-NeMo · nithinraok · Dec 23, 2025 · Dec 22, 2025 · Dec 22, 2025 · Dec 22, 2025
diff --git a/scripts/dataset_processing/get_commonvoice_data.py b/scripts/dataset_processing/get_commonvoice_data.py
@@ -97,7 +97,7 @@
 
 
 def process_files(csv_file, data_root, num_workers):
-    """ Read *.csv file description, convert mp3 to wav, process text.
+    """Read *.csv file description, convert mp3 to wav, process text.
         Save results to data_root.
 
     Args:
@@ -165,14 +165,13 @@
         commands = [
             'wget',
             '--user-agent',
-            '"Mozilla/5.0 (Windows NT 10.0; WOW64) '
-            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"',
+            'Mozilla/5.0 (Windows NT 10.0; WOW64) '
+            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
@@ -165,7 +165,7 @@
        commands = [
            'wget',
            '--user-agent',
-            'Mozilla/5.0 (Windows NT 10.0; WOW64) '
+            'Mozilla/5.0 (Windows NT 10.0; WOW64) ' +
            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
            '-O',
            output_archive_filename,
@@ -165,7 +165,7 @@
        commands = [
            'wget',
            '--user-agent',
-            'Mozilla/5.0 (Windows NT 10.0; WOW64) '
+            'Mozilla/5.0 (Windows NT 10.0; WOW64) ' +
            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
            '-O',
            output_archive_filename,
             '-O',
             output_archive_filename,
-            f'{COMMON_VOICE_URL}',
+            COMMON_VOICE_URL,
         ]
-        commands = " ".join(commands)
-        subprocess.run(commands, shell=True, stderr=sys.stderr, stdout=sys.stdout, capture_output=False)
+        subprocess.run(commands, shell=False, stderr=sys.stderr, stdout=sys.stdout, capture_output=False)
         filename = f"{args.language}.tar.gz"
         target_file = os.path.join(data_root, os.path.basename(filename))
 

diff --git a/tools/asr_evaluator/utils.py b/tools/asr_evaluator/utils.py
@@ -183,23 +183,26 @@ def run_chunked_inference(cfg: DictConfig) -> DictConfig:
     # 1) change TranscriptionConfig on top of the executed scripts such as speech_to_text_buffered_infer_rnnt.py, or
     # 2) add command as "decoding.strategy=greedy_batch " to below script
 
-    base_cmd = f"python {script_path} \
-    calculate_wer=False \
-    model_path={cfg.model_path} \
-    pretrained_name={cfg.pretrained_name} \
-    dataset_manifest={cfg.test_ds.manifest_filepath} \
-    output_filename={cfg.output_filename} \
-    random_seed={cfg.random_seed} \
-    batch_size={cfg.test_ds.batch_size} \
-    ++num_workers={cfg.test_ds.num_workers} \
-    chunk_len_in_secs={cfg.inference.chunk_len_in_secs} \
-    ++total_buffer_in_secs={cfg.inference.total_buffer_in_secs} \
-    model_stride={cfg.inference.model_stride} \
-    ++timestamps={cfg.inference.timestamps}"
+    base_cmd = [
+        "python",
+        str(script_path),
+        "calculate_wer=False",
+        f"model_path={cfg.model_path}",
+        f"pretrained_name={cfg.pretrained_name}",
+        f"dataset_manifest={cfg.test_ds.manifest_filepath}",
+        f"output_filename={cfg.output_filename}",
+        f"random_seed={cfg.random_seed}",
+        f"batch_size={cfg.test_ds.batch_size}",
+        f"++num_workers={cfg.test_ds.num_workers}",
+        f"chunk_len_in_secs={cfg.inference.chunk_len_in_secs}",
+        f"++total_buffer_in_secs={cfg.inference.total_buffer_in_secs}",
+        f"model_stride={cfg.inference.model_stride}",
+        f"++timestamps={cfg.inference.timestamps}",
+    ]
 
     subprocess.run(
         base_cmd,
-        shell=True,
+        shell=False,
         check=True,
     )
     return cfg
@@ -239,19 +242,25 @@ def run_offline_inference(cfg: DictConfig) -> DictConfig:
         # If need to change other config such as decoding strategy, could either:
         # 1) change TranscriptionConfig on top of the executed scripts such as transcribe_speech.py in examples/asr, or
         # 2) add command as "rnnt_decoding.strategy=greedy_batch " to below script
+        base_cmd = [
+            "python",
+            str(script_path),
+            "calculate_wer=False",
+            f"model_path={cfg.model_path}",
+            f"pretrained_name={cfg.pretrained_name}",
+            f"dataset_manifest={cfg.test_ds.manifest_filepath}",
+            f"output_filename={cfg.output_filename}",
+            f"batch_size={cfg.test_ds.batch_size}",
+            f"num_workers={cfg.test_ds.num_workers}",
+            f"random_seed={cfg.random_seed}",
+            f"eval_config_yaml={f.name}",
+            f"decoder_type={cfg.inference.decoder_type}",
+        ]
+        if hydra_overrides:
+            base_cmd.extend(hydra_overrides.split())
         subprocess.run(
-            f"python {script_path} "
-            f"calculate_wer=False "
-            f"model_path={cfg.model_path} "
-            f"pretrained_name={cfg.pretrained_name} "
-            f"dataset_manifest={cfg.test_ds.manifest_filepath} "
-            f"output_filename={cfg.output_filename} "
-            f"batch_size={cfg.test_ds.batch_size} "
-            f"num_workers={cfg.test_ds.num_workers} "
-            f"random_seed={cfg.random_seed} "
-            f"eval_config_yaml={f.name} "
-            f"decoder_type={cfg.inference.decoder_type} {hydra_overrides}",
-            shell=True,
+            base_cmd,
+            shell=False,
             check=True,
         )