Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions scripts/dataset_processing/get_commonvoice_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@


def process_files(csv_file, data_root, num_workers):
""" Read *.csv file description, convert mp3 to wav, process text.
"""Read *.csv file description, convert mp3 to wav, process text.
Save results to data_root.
Args:
Expand Down Expand Up @@ -165,14 +165,13 @@
commands = [
'wget',
'--user-agent',
'"Mozilla/5.0 (Windows NT 10.0; WOW64) '
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"',
'Mozilla/5.0 (Windows NT 10.0; WOW64) '
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
Comment on lines +168 to +169

Check warning

Code scanning / CodeQL

Implicit string concatenation in a list Warning

Implicit string concatenation. Maybe missing a comma?

Copilot Autofix

AI 26 days ago

To fix this, we should make the concatenation of the two string literals in the commands list explicit. The goal is to keep the User-Agent value exactly the same while avoiding implicit string concatenation inside the list, thereby satisfying CodeQL and improving readability.

The best minimal change is to join the two adjacent literals on lines 168–169 with an explicit + operator, so that the concatenation is clearly intentional while a single User-Agent string is still passed as the value of the --user-agent option. Only the string element in the commands list should be edited; no other behavior or imports change.

Concretely, in scripts/dataset_processing/get_commonvoice_data.py, locate the commands = [ block in main() and replace the two-line implicit concatenation:

            'Mozilla/5.0 (Windows NT 10.0; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',

with an explicit concatenation (split across lines for readability):

            'Mozilla/5.0 (Windows NT 10.0; WOW64) ' +
            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',

No new methods, imports, or definitions are needed.

Suggested changeset 1
scripts/dataset_processing/get_commonvoice_data.py

Autofix patch

Autofix patch
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/scripts/dataset_processing/get_commonvoice_data.py b/scripts/dataset_processing/get_commonvoice_data.py
--- a/scripts/dataset_processing/get_commonvoice_data.py
+++ b/scripts/dataset_processing/get_commonvoice_data.py
@@ -165,7 +165,7 @@
         commands = [
             'wget',
             '--user-agent',
-            'Mozilla/5.0 (Windows NT 10.0; WOW64) '
+            'Mozilla/5.0 (Windows NT 10.0; WOW64) ' +
             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
             '-O',
             output_archive_filename,
EOF
@@ -165,7 +165,7 @@
commands = [
'wget',
'--user-agent',
'Mozilla/5.0 (Windows NT 10.0; WOW64) '
'Mozilla/5.0 (Windows NT 10.0; WOW64) ' +
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
'-O',
output_archive_filename,
Copilot is powered by AI and may make mistakes. Always verify output.
'-O',
output_archive_filename,
f'{COMMON_VOICE_URL}',
COMMON_VOICE_URL,
]
commands = " ".join(commands)
subprocess.run(commands, shell=True, stderr=sys.stderr, stdout=sys.stdout, capture_output=False)
subprocess.run(commands, shell=False, stderr=sys.stderr, stdout=sys.stdout, capture_output=False)
filename = f"{args.language}.tar.gz"
target_file = os.path.join(data_root, os.path.basename(filename))

Expand Down
61 changes: 35 additions & 26 deletions tools/asr_evaluator/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,23 +183,26 @@ def run_chunked_inference(cfg: DictConfig) -> DictConfig:
# 1) change TranscriptionConfig on top of the executed scripts such as speech_to_text_buffered_infer_rnnt.py, or
# 2) add command as "decoding.strategy=greedy_batch " to below script

base_cmd = f"python {script_path} \
calculate_wer=False \
model_path={cfg.model_path} \
pretrained_name={cfg.pretrained_name} \
dataset_manifest={cfg.test_ds.manifest_filepath} \
output_filename={cfg.output_filename} \
random_seed={cfg.random_seed} \
batch_size={cfg.test_ds.batch_size} \
++num_workers={cfg.test_ds.num_workers} \
chunk_len_in_secs={cfg.inference.chunk_len_in_secs} \
++total_buffer_in_secs={cfg.inference.total_buffer_in_secs} \
model_stride={cfg.inference.model_stride} \
++timestamps={cfg.inference.timestamps}"
base_cmd = [
"python",
str(script_path),
"calculate_wer=False",
f"model_path={cfg.model_path}",
f"pretrained_name={cfg.pretrained_name}",
f"dataset_manifest={cfg.test_ds.manifest_filepath}",
f"output_filename={cfg.output_filename}",
f"random_seed={cfg.random_seed}",
f"batch_size={cfg.test_ds.batch_size}",
f"++num_workers={cfg.test_ds.num_workers}",
f"chunk_len_in_secs={cfg.inference.chunk_len_in_secs}",
f"++total_buffer_in_secs={cfg.inference.total_buffer_in_secs}",
f"model_stride={cfg.inference.model_stride}",
f"++timestamps={cfg.inference.timestamps}",
]

subprocess.run(
base_cmd,
shell=True,
shell=False,
check=True,
)
return cfg
Expand Down Expand Up @@ -239,19 +242,25 @@ def run_offline_inference(cfg: DictConfig) -> DictConfig:
# If need to change other config such as decoding strategy, could either:
# 1) change TranscriptionConfig on top of the executed scripts such as transcribe_speech.py in examples/asr, or
# 2) add command as "rnnt_decoding.strategy=greedy_batch " to below script
base_cmd = [
"python",
str(script_path),
"calculate_wer=False",
f"model_path={cfg.model_path}",
f"pretrained_name={cfg.pretrained_name}",
f"dataset_manifest={cfg.test_ds.manifest_filepath}",
f"output_filename={cfg.output_filename}",
f"batch_size={cfg.test_ds.batch_size}",
f"num_workers={cfg.test_ds.num_workers}",
f"random_seed={cfg.random_seed}",
f"eval_config_yaml={f.name}",
f"decoder_type={cfg.inference.decoder_type}",
]
if hydra_overrides:
base_cmd.extend(hydra_overrides.split())
subprocess.run(
f"python {script_path} "
f"calculate_wer=False "
f"model_path={cfg.model_path} "
f"pretrained_name={cfg.pretrained_name} "
f"dataset_manifest={cfg.test_ds.manifest_filepath} "
f"output_filename={cfg.output_filename} "
f"batch_size={cfg.test_ds.batch_size} "
f"num_workers={cfg.test_ds.num_workers} "
f"random_seed={cfg.random_seed} "
f"eval_config_yaml={f.name} "
f"decoder_type={cfg.inference.decoder_type} {hydra_overrides}",
shell=True,
base_cmd,
shell=False,
check=True,
)

Expand Down
Loading