+import type { AutomaticSpeechRecognitionInput, AutomaticSpeechRecognitionOutput } from "@huggingface/tasks";
 import { InferenceOutputError } from "../../lib/InferenceOutputError";
 import type { BaseArgs, Options, RequestArgs } from "../../types";
 import { base64FromBytes } from "../../utils/base64FromBytes";
 import { request } from "../custom/request";
+import type { LegacyAudioInput } from "./utils";
+import { preparePayload } from "./utils";
+import { omit } from "../../utils/omit";
 
-export type AutomaticSpeechRecognitionArgs = BaseArgs & {
-	/**
-	 * Binary audio data
-	 */
-	data: Blob | ArrayBuffer;
-};
-
-export interface AutomaticSpeechRecognitionOutput {
-	/**
-	 * The text that was recognized from the audio
-	 */
-	text: string;
-}
-
+export type AutomaticSpeechRecognitionArgs = BaseArgs & (AutomaticSpeechRecognitionInput | LegacyAudioInput);
 /**
  * This task reads some audio input and outputs the said words within the audio files.
  * Recommended model (english language): facebook/wav2vec2-large-960h-lv60-self
@@ -25,15 +16,8 @@ export async function automaticSpeechRecognition(
 	args: AutomaticSpeechRecognitionArgs,
 	options?: Options
 ): Promise<AutomaticSpeechRecognitionOutput> {
-	if (args.provider === "fal-ai") {
-		const contentType = args.data instanceof Blob ? args.data.type : "audio/mpeg";
-		const base64audio = base64FromBytes(
-			new Uint8Array(args.data instanceof ArrayBuffer ? args.data : await args.data.arrayBuffer())
-		);
-		(args as RequestArgs & { audio_url: string }).audio_url = `data:${contentType};base64,${base64audio}`;
-		delete (args as RequestArgs & { data: unknown }).data;
-	}
-	const res = await request<AutomaticSpeechRecognitionOutput>(args, {
+	const payload = await buildPayload(args);
+	const res = await request<AutomaticSpeechRecognitionOutput>(payload, {
 		...options,
 		taskHint: "automatic-speech-recognition",
 	});
@@ -43,3 +27,31 @@ export async function automaticSpeechRecognition(
 	}
 	return res;
 }
+
+const FAL_AI_SUPPORTED_BLOB_TYPES = ["audio/mpeg", "audio/mp4", "audio/wav", "audio/x-wav"];
+
+async function buildPayload(args: AutomaticSpeechRecognitionArgs): Promise<RequestArgs> {
+	if (args.provider === "fal-ai") {
+		const blob = "data" in args && args.data instanceof Blob ? args.data : "inputs" in args ? args.inputs : undefined;
+		const contentType = blob?.type;
+		if (!contentType) {
+			throw new Error(
+				`Unable to determine the input's content-type. Make sure you are passing a Blob when using provider fal-ai.`
+			);
+		}
+		if (!FAL_AI_SUPPORTED_BLOB_TYPES.includes(contentType)) {
+			throw new Error(
+				`Provider fal-ai does not support blob type ${contentType} - supported content types are: ${FAL_AI_SUPPORTED_BLOB_TYPES.join(
+					", "
+				)}`
+			);
+		}
+		const base64audio = base64FromBytes(new Uint8Array(await blob.arrayBuffer()));
+		return {
+			...("data" in args ? omit(args, "data") : omit(args, "inputs")),
+			audio_url: `data:${contentType};base64,${base64audio}`,
+		};
+	} else {
+		return preparePayload(args);
+	}
+}
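
For reference, a minimal usage sketch of the function as it reads after this change. The token, model id, and file name are placeholders, and it assumes the package re-exports automaticSpeechRecognition (the same call also works through the HfInference client):

import { readFile } from "node:fs/promises";
import { automaticSpeechRecognition } from "@huggingface/inference";

// Placeholder values: use your own token, audio file, and a model served by fal-ai.
const audio = new Blob([await readFile("sample.wav")], { type: "audio/wav" });

const output = await automaticSpeechRecognition({
	accessToken: "hf_xxx",
	provider: "fal-ai",
	model: "openai/whisper-large-v3",
	data: audio, // legacy `data` field; the `inputs` field from AutomaticSpeechRecognitionInput also works
});

console.log(output.text);

With provider "fal-ai", buildPayload validates the blob's content-type, encodes it as a base64 audio_url data URL, and strips the data/inputs field via omit; for any other provider it falls through to preparePayload.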