+import type { AutomaticSpeechRecognitionInput, AutomaticSpeechRecognitionOutput } from "@huggingface/tasks";
 import { InferenceOutputError } from "../../lib/InferenceOutputError";
 import type { BaseArgs, Options, RequestArgs } from "../../types";
 import { base64FromBytes } from "../../utils/base64FromBytes";
 import { request } from "../custom/request";
+import type { LegacyAudioInput } from "./utils";
+import { preparePayload } from "./utils";
+import { omit } from "../../utils/omit";
 
-export type AutomaticSpeechRecognitionArgs = BaseArgs & {
-	/**
-	 * Binary audio data
-	 */
-	data: Blob | ArrayBuffer;
-};
-
-export interface AutomaticSpeechRecognitionOutput {
-	/**
-	 * The text that was recognized from the audio
-	 */
-	text: string;
-}
-
+export type AutomaticSpeechRecognitionArgs = BaseArgs & (AutomaticSpeechRecognitionInput | LegacyAudioInput);
 /**
  * This task reads some audio input and outputs the said words within the audio files.
  * Recommended model (english language): facebook/wav2vec2-large-960h-lv60-self
@@ -25,15 +16,8 @@ export async function automaticSpeechRecognition(
 	args: AutomaticSpeechRecognitionArgs,
 	options?: Options
 ): Promise<AutomaticSpeechRecognitionOutput> {
-	if (args.provider === "fal-ai") {
-		const contentType = args.data instanceof Blob ? args.data.type : "audio/mpeg";
-		const base64audio = base64FromBytes(
-			new Uint8Array(args.data instanceof ArrayBuffer ? args.data : await args.data.arrayBuffer())
-		);
-		(args as RequestArgs & { audio_url: string }).audio_url = `data:${contentType};base64,${base64audio}`;
-		delete (args as RequestArgs & { data: unknown }).data;
-	}
-	const res = await request<AutomaticSpeechRecognitionOutput>(args, {
+	const payload = await buildPayload(args);
+	const res = await request<AutomaticSpeechRecognitionOutput>(payload, {
 		...options,
 		taskHint: "automatic-speech-recognition",
 	});
@@ -43,3 +27,31 @@ export async function automaticSpeechRecognition(
 	}
 	return res;
 }
+
+const FAL_AI_SUPPORTED_BLOB_TYPES = ["audio/mpeg", "audio/mp4", "audio/wav", "audio/x-wav"];
+
+async function buildPayload(args: AutomaticSpeechRecognitionArgs): Promise<RequestArgs> {
+	if (args.provider === "fal-ai") {
+		const blob = "data" in args && args.data instanceof Blob ? args.data : "inputs" in args ? args.inputs : undefined;
+		const contentType = blob?.type;
+		if (!contentType) {
+			throw new Error(
+				`Unable to determine the input's content-type. Make sure you are passing a Blob when using provider fal-ai.`
+			);
+		}
+		if (!FAL_AI_SUPPORTED_BLOB_TYPES.includes(contentType)) {
+			throw new Error(
+				`Provider fal-ai does not support blob type ${contentType} - supported content types are: ${FAL_AI_SUPPORTED_BLOB_TYPES.join(
+					", "
+				)}`
+			);
+		}
+		const base64audio = base64FromBytes(new Uint8Array(await blob.arrayBuffer()));
+		return {
+			...("data" in args ? omit(args, "data") : omit(args, "inputs")),
+			audio_url: `data:${contentType};base64,${base64audio}`,
+		};
+	} else {
+		return preparePayload(args);
+	}
+}
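
For reference, a minimal usage sketch of the function as it reads after this change. The token, model id, and file name are placeholders, and it assumes the package re-exports automaticSpeechRecognition (the same call also works through the HfInference client):

import { readFile } from "node:fs/promises";
import { automaticSpeechRecognition } from "@huggingface/inference";

// Placeholder values: use your own token, audio file, and a model served by fal-ai.
const audio = new Blob([await readFile("sample.wav")], { type: "audio/wav" });

const output = await automaticSpeechRecognition({
	accessToken: "hf_xxx",
	provider: "fal-ai",
	model: "openai/whisper-large-v3",
	data: audio, // legacy `data` field; the `inputs` field from AutomaticSpeechRecognitionInput also works
});

console.log(output.text);

With provider "fal-ai", buildPayload validates the blob's content-type, encodes it as a base64 audio_url data URL, and strips the data/inputs field via omit; for any other provider it falls through to preparePayload.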