57 changes: 57 additions & 0 deletions api/run_longform.sh
@@ -0,0 +1,57 @@
#!/bin/bash

export PYTHONPATH="..":$PYTHONPATH

export OPENAI_API_KEY="your_api_key"
export ASSEMBLYAI_API_KEY="your_api_key"
export ELEVENLABS_API_KEY="your_api_key"
export REVAI_API_KEY="your_api_key"
export AQUAVOICE_API_KEY="your_api_key"

MODEL_IDs=(
"openai/gpt-4o-transcribe"
"openai/gpt-4o-mini-transcribe"
"openai/whisper-1"
"assembly/best"
"elevenlabs/scribe_v1"
"revai/machine" # please use --use_url=True
"revai/fusion" # please use --use_url=True
"speechmatics/enhanced"
"aquavoice/avalon-v1-en"
)

MAX_WORKERS=10
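# Forwarded to run_eval.py; bounds how many transcription requests are sent to each provider's API concurrently.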

num_models=${#MODEL_IDs[@]}

for (( i=0; i<${num_models}; i++ ));
do
MODEL_ID=${MODEL_IDs[$i]}
python run_eval.py \
--dataset_path="hf-audio/asr-leaderboard-longform" \
--dataset="earnings21" \
--split="test" \
--model_name ${MODEL_ID} \
--max_workers ${MAX_WORKERS}

python run_eval.py \
--dataset_path="hf-audio/asr-leaderboard-longform" \
--dataset="earnings22" \
--split="test" \
--model_name ${MODEL_ID} \
--max_workers ${MAX_WORKERS}

python run_eval.py \
--dataset_path="hf-audio/asr-leaderboard-longform" \
--dataset="tedlium" \
--split="test" \
--model_name ${MODEL_ID} \
--max_workers ${MAX_WORKERS}

# Evaluate results
RUNDIR=`pwd` && \
cd ../normalizer && \
python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \
cd $RUNDIR

done
46 changes: 46 additions & 0 deletions kyutai/run_kyutai_fongform.sh
@@ -0,0 +1,46 @@
#!/bin/bash

export PYTHONPATH="..":$PYTHONPATH

MODEL_IDs=("kyutai/stt-2.6b-en")
BATCH_SIZE=16
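# --max_eval_samples=-1 below keeps the full split; a small positive value (e.g. 64) subsamples for a quick test.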

num_models=${#MODEL_IDs[@]}
for (( i=0; i<${num_models}; i++ ));
do
MODEL_ID=${MODEL_IDs[$i]}

python run_eval.py \
--model_id=${MODEL_ID} \
--dataset_path="hf-audio/asr-leaderboard-longform" \
--dataset="earnings21" \
--split="test" \
--device=0 \
--batch_size=${BATCH_SIZE} \
--max_eval_samples=-1

python run_eval.py \
--model_id=${MODEL_ID} \
--dataset_path="hf-audio/asr-leaderboard-longform" \
--dataset="earnings22" \
--split="test" \
--device=0 \
--batch_size=${BATCH_SIZE} \
--max_eval_samples=-1

python run_eval.py \
--model_id=${MODEL_ID} \
--dataset_path="hf-audio/asr-leaderboard-longform" \
--dataset="tedlium" \
--split="test" \
--device=0 \
--batch_size=${BATCH_SIZE} \
--max_eval_samples=-1

# Evaluate results
RUNDIR=`pwd` && \
cd ../normalizer && \
python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \
cd $RUNDIR

done
225 changes: 225 additions & 0 deletions nemo_asr/run_eval_long.py
@@ -0,0 +1,225 @@
import argparse

import io
import os
import torch
import evaluate
import soundfile

from tqdm import tqdm
from normalizer import data_utils
import numpy as np

from nemo.collections.asr.models import ASRModel
import time


wer_metric = evaluate.load("wer")


def main(args):

DATA_CACHE_DIR = os.path.join(os.getcwd(), "audio_cache")
DATASET_NAME = args.dataset
SPLIT_NAME = args.split

CACHE_DIR = os.path.join(DATA_CACHE_DIR, DATASET_NAME, SPLIT_NAME)
if not os.path.exists(CACHE_DIR):
os.makedirs(CACHE_DIR)

if args.device >= 0:
device = torch.device(f"cuda:{args.device}")
compute_dtype = torch.bfloat16
else:
device = torch.device("cpu")
compute_dtype = torch.float32


if args.model_id.endswith(".nemo"):
asr_model = ASRModel.restore_from(args.model_id, map_location=device)
else:
asr_model = ASRModel.from_pretrained(args.model_id, map_location=device) # type: ASRModel

if args.longform:
asr_model.change_attention_model("rel_pos_local_attn", [128, 128]) # local attn
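# Limited-context (local) attention keeps the encoder's memory use bounded so hour-long recordings can be transcribed.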
asr_model.to(compute_dtype)
asr_model.eval()

dataset = data_utils.load_data(args)

def download_audio_files(batch, indices):

# write each audio sample to a WAV file in the cache and record its path, duration and reference text on the batch
audio_paths = []
durations = []

# Use global indices for unique filenames across all batches
for global_idx, sample in zip(indices, batch["audio"]):
# Use a unique filename based on global index
audio_path = os.path.join(CACHE_DIR, f"sample_{global_idx}.wav")

if "array" in sample:
audio_array = np.float32(sample["array"])
sample_rate = 16000

elif "bytes" in sample: # added to be compatible with latest datasets library (3.x.x) that produces byte stream
with io.BytesIO(sample["bytes"]) as audio_file:
audio_array, sample_rate = soundfile.read(audio_file, dtype="float32")

else:
raise ValueError("Sample must have either 'array' or 'bytes' key")

if not os.path.exists(audio_path):
os.makedirs(os.path.dirname(audio_path), exist_ok=True)
soundfile.write(audio_path, audio_array, sample_rate)

audio_paths.append(audio_path)
durations.append(len(audio_array) / sample_rate)


batch["references"] = batch["norm_text"]
batch["audio_filepaths"] = audio_paths
batch["durations"] = durations

return batch


if args.max_eval_samples is not None and args.max_eval_samples > 0:
print(f"Subsampling dataset to first {args.max_eval_samples} samples !")
dataset = dataset.take(args.max_eval_samples)

dataset = data_utils.prepare_data(dataset)
if asr_model.cfg.decoding.strategy != "beam":
asr_model.cfg.decoding.strategy = "greedy_batch"
asr_model.change_decoding_strategy(asr_model.cfg.decoding)
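# Fall back to greedy_batch decoding unless the checkpoint's config already requests beam search.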

# preparing the offline (file-based) dataset
dataset = dataset.map(download_audio_files, batch_size=args.batch_size, batched=True, with_indices=True, remove_columns=["audio"])
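# Samples are cached as WAV files on disk; transcribe() below is given these file paths rather than in-memory arrays.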

# Collect audio filepaths, durations and reference texts from the mapped dataset

all_data = {
"audio_filepaths": [],
"durations": [],
"references": [],
}

data_itr = iter(dataset)
for data in tqdm(data_itr, desc="Downloading Samples"):
for key in all_data:
all_data[key].append(data[key])

# Sort audio_filepaths and references based on durations values
sorted_indices = sorted(range(len(all_data["durations"])), key=lambda k: all_data["durations"][k], reverse=True)
all_data["audio_filepaths"] = [all_data["audio_filepaths"][i] for i in sorted_indices]
all_data["references"] = [all_data["references"][i] for i in sorted_indices]
all_data["durations"] = [all_data["durations"][i] for i in sorted_indices]
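# Longest-first ordering groups files of similar length into each batch, which should reduce padding overhead.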

total_time = 0
for _ in range(2): # warmup once and calculate rtf
if _ == 0:
audio_files = all_data["audio_filepaths"][:args.batch_size * 4] # warmup with 4 batches
else:
audio_files = all_data["audio_filepaths"]
start_time = time.time()
with torch.inference_mode(), torch.no_grad():

if 'canary' in args.model_id and 'v2' not in args.model_id:
pnc = 'nopnc'
else:
pnc = 'pnc'
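# Canary v1 is evaluated without punctuation and capitalization; Canary v2 keeps PnC on. Non-Canary models never receive this flag.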

if 'canary' in args.model_id:
transcriptions = asr_model.transcribe(audio_files, batch_size=args.batch_size, verbose=False, pnc=pnc, num_workers=1)
else:
transcriptions = asr_model.transcribe(audio_files, batch_size=args.batch_size, verbose=False, num_workers=1)
end_time = time.time()
if _ == 1:
total_time += end_time - start_time

# normalize transcriptions with English normalizer
if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
transcriptions = transcriptions[0]
predictions = [data_utils.normalizer(pred.text) for pred in transcriptions]

avg_time = total_time / len(all_data["audio_filepaths"])
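# The manifest records this uniform per-file average rather than individually timed transcriptions.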

# Write manifest results (WER and RTFX)
manifest_path = data_utils.write_manifest(
all_data["references"],
predictions,
args.model_id,
args.dataset_path,
args.dataset,
args.split,
audio_length=all_data["durations"],
transcription_time=[avg_time] * len(all_data["audio_filepaths"]),
)

print("Results saved at path:", os.path.abspath(manifest_path))

wer = wer_metric.compute(references=all_data['references'], predictions=predictions)
wer = round(100 * wer, 2)

# transcription_time = sum(all_results["transcription_time"])
audio_length = sum(all_data["durations"])
rtfx = audio_length / total_time
rtfx = round(rtfx, 2)
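# RTFx = total audio duration / wall-clock transcription time; higher means faster than real time.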

print("RTFX:", rtfx)
print("WER:", wer, "%")


if __name__ == "__main__":
parser = argparse.ArgumentParser()

parser.add_argument(
"--model_id", type=str, required=True, help="Model identifier. Should be loadable with NVIDIA NeMo.",
)
parser.add_argument(
'--dataset_path', type=str, default='esb/datasets', help='Dataset path. By default, it is `esb/datasets`'
)
parser.add_argument(
"--dataset",
type=str,
required=True,
help="Dataset name. *E.g.* `'librispeech_asr'` for the LibriSpeech ASR dataset, or `'common_voice'` for Common Voice. The full list of dataset names "
"can be found at `https://huggingface.co/datasets/esb/datasets`",
)
parser.add_argument(
"--split",
type=str,
default="test",
help="Split of the dataset. *E.g.* `'validation'` for the dev split, or `'test'` for the test split.",
)
parser.add_argument(
"--device",
type=int,
default=-1,
help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.",
)
parser.add_argument(
"--batch_size", type=int, default=32, help="Number of samples to go through each streamed batch.",
)
parser.add_argument(
"--max_eval_samples",
type=int,
default=None,
help="Number of samples to evaluate. Use a small value, e.g. 64, to test this script quickly.",
)
parser.add_argument(
"--no-streaming",
dest='streaming',
action="store_false",
help="Choose whether you'd like to download the entire dataset or stream it during the evaluation.",
)
parser.add_argument(
"--longform",
action="store_true",
help="Whether to use longform mode.",
)
parser.set_defaults(streaming=True)
args = parser.parse_args()

main(args)
56 changes: 56 additions & 0 deletions nemo_asr/run_longform.sh
@@ -0,0 +1,56 @@
#!/bin/bash

export PYTHONPATH="..":$PYTHONPATH

# Latest transducer and TDT checkpoints
MODEL_IDs=(
"nvidia/parakeet-tdt-1.1b"
"nvidia/parakeet-rnnt-1.1b"
"nvidia/parakeet-rnnt-0.6b"
"nvidia/stt_en_fastconformer_transducer_large"
"nvidia/stt_en_conformer_transducer_large"
"nvidia/stt_en_conformer_transducer_small"
"nvidia/parakeet-tdt-0.6b-v2"
)

# For CTC models:
# MODEL_IDs=("nvidia/parakeet-ctc-1.1b" "nvidia/parakeet-ctc-0.6b" "nvidia/stt_en_fastconformer_ctc_large" "nvidia/stt_en_conformer_ctc_large" "nvidia/stt_en_conformer_ctc_small")

BATCH_SIZE=1
DEVICE_ID=0
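# Long-form runs use batch size 1; the --longform flag makes run_eval_long.py swap the encoder to limited-context (local) attention.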

num_models=${#MODEL_IDs[@]}

for (( i=0; i<${num_models}; i++ ));
do
MODEL_ID=${MODEL_IDs[$i]}

python run_eval_long.py \
--model_id=${MODEL_ID} \
--dataset_path="hf-audio/asr-leaderboard-longform" \
--dataset="earnings21" \
--split="test" \
--device=${DEVICE_ID} \
--batch_size=${BATCH_SIZE} \
--max_eval_samples=-1 \
--longform

python run_eval_long.py \
--model_id=${MODEL_ID} \
--dataset_path="hf-audio/asr-leaderboard-longform" \
--dataset="earnings22" \
--split="test" \
--device=${DEVICE_ID} \
--batch_size=${BATCH_SIZE} \
--max_eval_samples=-1 \
--longform

python run_eval_long.py \
--model_id=${MODEL_ID} \
--dataset_path="hf-audio/asr-leaderboard-longform" \
--dataset="tedlium" \
--split="test" \
--device=${DEVICE_ID} \
--batch_size=${BATCH_SIZE} \
--max_eval_samples=-1 \
--longform

# Evaluate results
RUNDIR=`pwd` && \
cd ../normalizer && \
python -c "import eval_utils; eval_utils.score_results('${RUNDIR}/results', '${MODEL_ID}')" && \
cd $RUNDIR

done