Skip to content

Commit 006b042

Browse files
committed
Switch to hf-audio/esb-datasets-test-only-sorted dataset
1 parent 8e47917 commit 006b042

File tree

2 files changed

+15 -15 lines changed

2 files changed

+15 -15 lines changed

ctranslate2/run_eval.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -49,12 +49,12 @@ def main(args) -> None:
4949
for batch in tqdm(dataset_iterator(dataset), desc=f"Evaluating {args.model_id}"):
5050
segments, _ = asr_model.transcribe(batch["array"], language="en")
5151
outputs = [segment._asdict() for segment in segments]
52-
predictions.extend(
53-
data_utils.normalizer(
54-
"".join([segment["text"] for segment in outputs])
55-
).strip()
56-
)
57-
references.extend(batch["reference"][0])
52+
transcription = data_utils.normalizer(
53+
"".join([segment["text"] for segment in outputs])
54+
).strip()
55+
56+
predictions.append(transcription)
57+
references.append(batch["reference"])
5858

5959
# Write manifest results
6060
manifest_path = data_utils.write_manifest(

ctranslate2/run_whisper.sh

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@ do
1414

1515
python run_eval.py \
1616
--model_id=${MODEL_ID} \
17-
--dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
17+
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
1818
--dataset="ami" \
1919
--split="test" \
2020
--device=${DEVICE_INDEX} \
@@ -23,7 +23,7 @@ do
2323

2424
python run_eval.py \
2525
--model_id=${MODEL_ID} \
26-
--dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
26+
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
2727
--dataset="earnings22" \
2828
--split="test" \
2929
--device=${DEVICE_INDEX} \
@@ -32,7 +32,7 @@ do
3232

3333
python run_eval.py \
3434
--model_id=${MODEL_ID} \
35-
--dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
35+
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
3636
--dataset="gigaspeech" \
3737
--split="test" \
3838
--device=${DEVICE_INDEX} \
@@ -41,7 +41,7 @@ do
4141

4242
python run_eval.py \
4343
--model_id=${MODEL_ID} \
44-
--dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
44+
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
4545
--dataset="librispeech" \
4646
--split="test.clean" \
4747
--device=${DEVICE_INDEX} \
@@ -50,7 +50,7 @@ do
5050

5151
python run_eval.py \
5252
--model_id=${MODEL_ID} \
53-
--dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
53+
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
5454
--dataset="librispeech" \
5555
--split="test.other" \
5656
--device=${DEVICE_INDEX} \
@@ -59,7 +59,7 @@ do
5959

6060
python run_eval.py \
6161
--model_id=${MODEL_ID} \
62-
--dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
62+
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
6363
--dataset="spgispeech" \
6464
--split="test" \
6565
--device=${DEVICE_INDEX} \
@@ -68,7 +68,7 @@ do
6868

6969
python run_eval.py \
7070
--model_id=${MODEL_ID} \
71-
--dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
71+
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
7272
--dataset="tedlium" \
7373
--split="test" \
7474
--device=${DEVICE_INDEX} \
@@ -77,7 +77,7 @@ do
7777

7878
python run_eval.py \
7979
--model_id=${MODEL_ID} \
80-
--dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
80+
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
8181
--dataset="voxpopuli" \
8282
--split="test" \
8383
--device=${DEVICE_INDEX} \
@@ -86,7 +86,7 @@ do
8686

8787
python run_eval.py \
8888
--model_id=${MODEL_ID} \
89-
--dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
89+
--dataset_path="hf-audio/esb-datasets-test-only-sorted" \
9090
--dataset="common_voice" \
9191
--split="test" \
9292
--device=${DEVICE_INDEX} \

0 commit comments

Comments (0)