55import evaluate
66from collections import defaultdict
77
8+
def read_manifest(manifest_path: str):
    """
    Read a manifest file (jsonl format) and return a list of sample dicts.

    Args:
        manifest_path: Path to the .jsonl manifest, one JSON object per line.

    Returns:
        List of dictionaries, one per non-blank line of the file.
    """
    data = []
    with open(manifest_path, "r", encoding="utf-8") as f:
        for line in f:
            # Guard with strip(): the previous `len(line) > 0` check let a bare
            # newline through, and json.loads("\n") raises JSONDecodeError.
            if line.strip():
                datum = json.loads(line)
                # NOTE(review): tail reconstructed from a fragmented diff view;
                # append/return follow the documented contract — confirm.
                data.append(datum)
    return data
def write_manifest(
    references: list,
    transcriptions: list,
    model_id: str,
    dataset_path: str,
    dataset_name: str,
    split: str,
    audio_length: list = None,
    transcription_time: list = None,
):
    """
    Write a manifest file (jsonl format) and return the path to the file.

    Args:
        references: Ground-truth reference texts, one per sample.
        transcriptions: Predicted transcriptions, one per sample.
        model_id: Model identifier, embedded in the output filename.
        dataset_path: Path to the dataset.
        dataset_name: Name of the dataset.
        split: Dataset split name.
        audio_length: Length of each audio sample in seconds.
        transcription_time: Transcription time of each sample in seconds.

    Returns:
        Path to the manifest file.

    Raises:
        ValueError: If `transcriptions`, `audio_length` or `transcription_time`
            do not match `references` in length.
    """
    # NOTE(review): body reconstructed from a fragmented diff view (a few hunk
    # lines were elided) — confirm nothing else was dropped around the checks.
    if len(references) != len(transcriptions):
        raise ValueError(
            f"The number of samples in `references` ({len(references)}) "
            f"must match `transcriptions` ({len(transcriptions)})."
        )

    if audio_length is not None and len(audio_length) != len(references):
        raise ValueError(
            f"The number of samples in `audio_length` ({len(audio_length)}) "
            f"must match `references` ({len(references)})."
        )
    if transcription_time is not None and len(transcription_time) != len(references):
        raise ValueError(
            f"The number of samples in `transcription_time` ({len(transcription_time)}) "
            f"must match `references` ({len(references)})."
        )

    # Pad missing optional metrics with None so the zip below stays aligned.
    if audio_length is None:
        audio_length = [None] * len(references)
    if transcription_time is None:
        transcription_time = [None] * len(references)

    basedir = "./results/"
    # exist_ok avoids the check-then-create race of `if not exists: makedirs`.
    os.makedirs(basedir, exist_ok=True)

    manifest_path = os.path.join(
        basedir, f"MODEL_{model_id}_DATASET_{dataset_path}_{dataset_name}_{split}.jsonl"
    )

    with open(manifest_path, "w", encoding="utf-8") as f:
        # Distinct loop-variable names: the original rebound `audio_length` /
        # `transcription_time` per iteration, shadowing the lists being zipped.
        # That only worked because zip() captures its arguments up front.
        for idx, (text, transcript, duration, time_taken) in enumerate(
            zip(references, transcriptions, audio_length, transcription_time)
        ):
            datum = {
                "audio_filepath": f"sample_{idx}",  # dummy value for Speech Data Processor
                "duration": duration,
                "time": time_taken,
                "text": text,
                "pred_text": transcript,
            }
            f.write(json.dumps(datum) + "\n")

    return manifest_path
@@ -106,7 +141,7 @@ def parse_filepath(fp: str):
106141 dataset_id = ds_fp .replace ("DATASET_" , "" ).rstrip (".jsonl" )
107142 return model_id , dataset_id
108143
109- # Compute results per dataset
144+ # Compute WER results per dataset, and RTFx over all datasets
110145 results = {}
111146 wer_metric = evaluate .load ("wer" )
112147
@@ -117,34 +152,59 @@ def parse_filepath(fp: str):
117152 references = [datum ["text" ] for datum in manifest ]
118153 predictions = [datum ["pred_text" ] for datum in manifest ]
119154
155+ time = [datum ["time" ] for datum in manifest ]
156+ duration = [datum ["duration" ] for datum in manifest ]
157+ compute_rtfx = all (time ) and all (duration )
158+
120159 wer = wer_metric .compute (references = references , predictions = predictions )
121160 wer = round (100 * wer , 2 )
122161
162+ if compute_rtfx :
163+ audio_length = sum (duration )
164+ inference_time = sum (time )
165+ rtfx = round (sum (duration ) / sum (time ), 4 )
166+ else :
167+ audio_length = inference_time = rtfx = None
168+
123169 result_key = f"{ model_id_of_file } | { dataset_id } "
124- results [result_key ] = wer
170+ results [result_key ] = { " wer" : wer , "audio_length" : audio_length , "inference_time" : inference_time , "rtfx" : rtfx }
125171
126172 print ("*" * 80 )
127173 print ("Results per dataset:" )
128174 print ("*" * 80 )
129175
130176 for k , v in results .items ():
131- print (f"{ k } : WER = { v :0.2f} %" )
177+ metrics = f"{ k } : WER = { v ['wer' ]:0.2f} %"
178+ if v ["rtfx" ] is not None :
179+ metrics += f", RTFx = { v ['rtfx' ]:0.2f} "
180+ print (metrics )
132181
133182 # composite WER should be computed over all datasets and with the same key
134183 composite_wer = defaultdict (float )
184+ composite_audio_length = defaultdict (float )
185+ composite_inference_time = defaultdict (float )
135186 count_entries = defaultdict (int )
136187 for k , v in results .items ():
137188 key = k .split ("|" )[0 ].strip ()
138- composite_wer [key ] += v
189+ composite_wer [key ] += v ["wer" ]
190+ if v ["rtfx" ] is not None :
191+ composite_audio_length [key ] += v ["audio_length" ]
192+ composite_inference_time [key ] += v ["inference_time" ]
193+ else :
194+ composite_audio_length [key ] = composite_inference_time [key ] = None
139195 count_entries [key ] += 1
140196
141197 # normalize scores & print
142198 print ()
143199 print ("*" * 80 )
144- print ("Composite WER :" )
200+ print ("Composite Results :" )
145201 print ("*" * 80 )
146202 for k , v in composite_wer .items ():
147203 wer = v / count_entries [k ]
148204 print (f"{ k } : WER = { wer :0.2f} %" )
205+ for k in composite_audio_length :
206+ if composite_audio_length [k ] is not None :
207+ rtfx = composite_audio_length [k ] / composite_inference_time [k ]
208+ print (f"{ k } : RTFx = { rtfx :0.2f} " )
149209 print ("*" * 80 )
150210 return composite_wer , results
0 commit comments