def extract_language_code(dataset_name: str) -> str:
    """Return the language code encoded as a suffix of *dataset_name*.

    Multilingual dataset names carry the language as the final
    underscore-separated token (``'fleurs_fr'`` -> ``'fr'``,
    ``'mls_es'`` -> ``'es'``). Names without such a suffix (``'ami'``) are
    treated as English.

    The suffix is accepted only when it is shaped like a language code
    (two or three ASCII letters). Anything else -- for example ``'clean'``
    in ``'librispeech_test_clean'`` or the empty token in ``'fleurs_'`` --
    falls back to ``'en'`` instead of being forwarded as a bogus code.

    Args:
        dataset_name: Dataset identifier, e.g. ``'fleurs_fr'`` or ``'ami'``.

    Returns:
        The lower-cased language code, or ``'en'`` when none is recognised.
    """
    # rpartition leaves the separator empty when the name has no underscore.
    _, separator, suffix = dataset_name.rpartition("_")
    looks_like_code = (
        bool(separator)
        and len(suffix) in (2, 3)
        and suffix.isascii()
        and suffix.isalpha()
    )
    # NOTE: this is a shape check, not a lookup against a list of real
    # language codes, so a 2-3 letter non-language suffix is still accepted.
    return suffix.lower() if looks_like_code else "en"