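Switch to hf-audio/esb-datasets-test-only-sorted dataset

run_eval.py now collects results with list.append instead of
list.extend: extending a list with a string adds one element per
character, so each normalized transcription is now stored as a single
entry, and each batch's reference is appended whole as well.
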
jordimas · jordimas · commit 006b042a81f3 · 2024-07-31T13:19:06.000+02:00
diff --git a/ctranslate2/run_eval.py b/ctranslate2/run_eval.py
@@ -49,12 +49,12 @@ def main(args) -> None:
     for batch in tqdm(dataset_iterator(dataset), desc=f"Evaluating {args.model_id}"):
         segments, _ = asr_model.transcribe(batch["array"], language="en")
         outputs = [segment._asdict() for segment in segments]
-        predictions.extend(
-            data_utils.normalizer(
-                "".join([segment["text"] for segment in outputs])
-            ).strip()
-        )
-        references.extend(batch["reference"][0])
+        transcription = data_utils.normalizer(
+            "".join([segment["text"] for segment in outputs])
+        ).strip()
+
+        predictions.append(transcription)
+        references.append(batch["reference"])
 
     # Write manifest results
     manifest_path = data_utils.write_manifest(
diff --git a/ctranslate2/run_whisper.sh b/ctranslate2/run_whisper.sh
@@ -14,7 +14,7 @@ do
 
     python run_eval.py \
         --model_id=${MODEL_ID} \
-        --dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
+        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
         --dataset="ami" \
         --split="test" \
         --device=${DEVICE_INDEX} \
@@ -23,7 +23,7 @@ do
 
     python run_eval.py \
         --model_id=${MODEL_ID} \
-        --dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
+        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
         --dataset="earnings22" \
         --split="test" \
         --device=${DEVICE_INDEX} \
@@ -32,7 +32,7 @@ do
 
     python run_eval.py \
         --model_id=${MODEL_ID} \
-        --dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
+        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
         --dataset="gigaspeech" \
         --split="test" \
         --device=${DEVICE_INDEX} \
@@ -41,7 +41,7 @@ do
 
     python run_eval.py \
         --model_id=${MODEL_ID} \
-        --dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
+        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
         --dataset="librispeech" \
         --split="test.clean" \
         --device=${DEVICE_INDEX} \
@@ -50,7 +50,7 @@ do
 
     python run_eval.py \
         --model_id=${MODEL_ID} \
-        --dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
+        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
         --dataset="librispeech" \
         --split="test.other" \
         --device=${DEVICE_INDEX} \
@@ -59,7 +59,7 @@ do
 
     python run_eval.py \
         --model_id=${MODEL_ID} \
-        --dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
+        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
         --dataset="spgispeech" \
         --split="test" \
         --device=${DEVICE_INDEX} \
@@ -68,7 +68,7 @@ do
 
     python run_eval.py \
         --model_id=${MODEL_ID} \
-        --dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
+        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
         --dataset="tedlium" \
         --split="test" \
         --device=${DEVICE_INDEX} \
@@ -77,7 +77,7 @@ do
 
     python run_eval.py \
         --model_id=${MODEL_ID} \
-        --dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
+        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
         --dataset="voxpopuli" \
         --split="test" \
         --device=${DEVICE_INDEX} \
@@ -86,7 +86,7 @@ do
 
     python run_eval.py \
         --model_id=${MODEL_ID} \
-        --dataset_path="https://huggingface.co/datasets/hf-audio/esb-datasets-test-only" \
+        --dataset_path="hf-audio/esb-datasets-test-only-sorted" \
         --dataset="common_voice" \
         --split="test" \
         --device=${DEVICE_INDEX} \