From 74f718925c4ac542d3de75e4b4b9aac137d50f9a Mon Sep 17 00:00:00 2001 From: Steven Zheng Date: Tue, 5 Nov 2024 17:37:22 +0100 Subject: [PATCH 1/2] Update ASR task page --- .../automatic-speech-recognition/data.ts | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/data.ts b/packages/tasks/src/tasks/automatic-speech-recognition/data.ts index 89078ce710..bf6122825e 100644 --- a/packages/tasks/src/tasks/automatic-speech-recognition/data.ts +++ b/packages/tasks/src/tasks/automatic-speech-recognition/data.ts @@ -6,12 +6,16 @@ const taskData: TaskDataCustom = { description: "31,175 hours of multilingual audio-text dataset in 108 languages.", id: "mozilla-foundation/common_voice_17_0", }, + { + description: "Multilingual and diverse audio dataset with 101k hours of audio.", + id: "amphion/Emilia-Dataset", + }, { description: "A dataset with 44.6k hours of English speaker data and 6k hours of other language speakers.", id: "parler-tts/mls_eng", }, { - description: "A multi-lingual audio dataset with 370K hours of audio.", + description: "A multilingual audio dataset with 370K hours of audio.", id: "espnet/yodas", }, ], @@ -54,6 +58,10 @@ const taskData: TaskDataCustom = { description: "An end-to-end model that performs ASR and Speech Translation by MetaAI.", id: "facebook/seamless-m4t-v2-large", }, + { + description: "A powerful multilingual ASR and Speech Translation model by Nvidia.", + id: "nvidia/canary-1b", + }, { description: "Powerful speaker diarization model.", id: "pyannote/speaker-diarization-3.1", @@ -65,13 +73,18 @@ const taskData: TaskDataCustom = { id: "hf-audio/whisper-large-v3", }, { - description: "Fastest speech recognition application.", - id: "sanchit-gandhi/whisper-jax", + description: "Latest ASR model from Useful Sensors.", + id: "mrfakename/Moonshinex", }, { description: "A high quality speech and text translation model by Meta.", id: "facebook/seamless_m4t", }, + { + description: "A powerful multilingual ASR and Speech Translation model by Nvidia", + id: "nvidia/canary-1b", + }, + ], summary: "Automatic Speech Recognition (ASR), also known as Speech to Text (STT), is the task of transcribing a given audio to text. It has many applications, such as voice user interfaces.", From d1705168d44b30a04414180d2ce57a56a117816f Mon Sep 17 00:00:00 2001 From: Steven Zheng Date: Tue, 5 Nov 2024 17:55:09 +0100 Subject: [PATCH 2/2] Update ASR task page --- packages/tasks/src/tasks/automatic-speech-recognition/data.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/data.ts b/packages/tasks/src/tasks/automatic-speech-recognition/data.ts index bf6122825e..622a522644 100644 --- a/packages/tasks/src/tasks/automatic-speech-recognition/data.ts +++ b/packages/tasks/src/tasks/automatic-speech-recognition/data.ts @@ -84,7 +84,6 @@ const taskData: TaskDataCustom = { description: "A powerful multilingual ASR and Speech Translation model by Nvidia", id: "nvidia/canary-1b", }, - ], summary: "Automatic Speech Recognition (ASR), also known as Speech to Text (STT), is the task of transcribing a given audio to text. It has many applications, such as voice user interfaces.",