feat: Refactor NovitaTextToVideoTask to use the async API.

viktor2077 · viktor2077 · commit fa7ca4411eed · 2025-05-16T18:53:44.000+08:00
diff --git a/packages/inference/README.md b/packages/inference/README.md
@@ -52,7 +52,7 @@ Currently, we support the following providers:
 - [Fireworks AI](https://fireworks.ai)
 - [Hyperbolic](https://hyperbolic.xyz)
 - [Nebius](https://studio.nebius.ai)
-- [Novita](https://novita.ai/?utm_source=github_huggingface&utm_medium=github_readme&utm_campaign=link)
+- [Novita](https://novita.ai)
 - [Nscale](https://nscale.com)
 - [OVHcloud](https://endpoints.ai.cloud.ovh.net/)
 - [Replicate](https://replicate.com)
@@ -93,6 +93,7 @@ Only a subset of models are supported when requesting third-party providers. You
 - [Cerebras supported models](https://huggingface.co/api/partners/cerebras/models)
 - [Groq supported models](https://console.groq.com/docs/models)
 - [HF Inference API (serverless)](https://huggingface.co/models?inference=warm&sort=trending)
+- [Novita AI supported models](https://huggingface.co/api/partners/novita/models)
 
 ❗**Important note:** To be compatible, the third-party API must adhere to the "standard" shape API we expect on HF model pages for each pipeline task type.
 This is not an issue for LLMs as everyone converged on the OpenAI API anyways, but can be more tricky for other tasks like "text-to-image" or "automatic-speech-recognition" where there exists no standard API. Let us know if any help is needed or if we can make things easier for you!
diff --git a/packages/inference/src/lib/getProviderHelper.ts b/packages/inference/src/lib/getProviderHelper.ts
@@ -120,6 +120,7 @@ export const PROVIDERS: Record<InferenceProvider, Partial<Record<InferenceTask,
 	novita: {
 		conversational: new Novita.NovitaConversationalTask(),
 		"text-generation": new Novita.NovitaTextGenerationTask(),
+		"text-to-video": new Novita.NovitaTextToVideoTask(),
 	},
 	nscale: {
 		"text-to-image": new Nscale.NscaleTextToImageTask(),
diff --git a/packages/inference/src/providers/novita.ts b/packages/inference/src/providers/novita.ts
@@ -17,6 +17,7 @@
 import { InferenceOutputError } from "../lib/InferenceOutputError";
 import { isUrl } from "../lib/isUrl";
 import type { BodyParams, UrlParams } from "../types";
+import { delay } from "../utils/delay";
 import { omit } from "../utils/omit";
 import {
 	BaseConversationalTask,
@@ -26,11 +27,11 @@ import {
 } from "./providerHelper";
 
 const NOVITA_API_BASE_URL = "https://api.novita.ai";
-export interface NovitaOutput {
-	video: {
-		video_url: string;
-	};
+
+export interface NovitaAsyncAPIOutput {
+	task_id: string;
 }
+
 export class NovitaTextGenerationTask extends BaseTextGenerationTask {
 	constructor() {
 		super("novita", NOVITA_API_BASE_URL);
@@ -50,38 +51,88 @@ export class NovitaConversationalTask extends BaseConversationalTask {
 		return "/v3/openai/chat/completions";
 	}
 }
+
 export class NovitaTextToVideoTask extends TaskProviderHelper implements TextToVideoTaskHelper {
 	constructor() {
 		super("novita", NOVITA_API_BASE_URL);
 	}
 
-	makeRoute(params: UrlParams): string {
-		return `/v3/hf/${params.model}`;
+	override makeRoute(params: UrlParams): string {
+		if (params.authMethod !== "provider-key") {
+			return `/v3/async/${params.model}?_subdomain=queue`;
+		}
+		return `/v3/async/${params.model}`;
 	}
 
-	preparePayload(params: BodyParams): Record<string, unknown> {
+	override preparePayload(params: BodyParams): Record<string, unknown> {
+		const { num_inference_steps, ...restParameters } = params.args.parameters as Record<string, unknown>;
 		return {
 			...omit(params.args, ["inputs", "parameters"]),
-			...(params.args.parameters as Record<string, unknown>),
+			...restParameters,
+			steps: num_inference_steps,
 			prompt: params.args.inputs,
 		};
 	}
-	override async getResponse(response: NovitaOutput): Promise<Blob> {
+
+	override async getResponse(
+		response: NovitaAsyncAPIOutput,
+		url?: string,
+		headers?: Record<string, string>
+	): Promise<Blob> {
+		if (!url || !headers) {
+			throw new InferenceOutputError("URL and headers are required for text-to-video task");
+		}
+		const taskId = response.task_id;
+		if (!taskId) {
+			throw new InferenceOutputError("No task ID found in the response");
+		}
+
+		const parsedUrl = new URL(url);
+		const baseUrl = `${parsedUrl.protocol}//${parsedUrl.host}${
+			parsedUrl.host === "router.huggingface.co" ? "/novita" : ""
+		}`;
+		const queryParams = parsedUrl.search;
+		const resultUrl = `${baseUrl}/v3/async/task-result${queryParams ? queryParams + '&' : '?'}task_id=${taskId}`;
+
+		let status = '';
+		let taskResult = undefined;
+
+		while (status !== 'TASK_STATUS_SUCCEED' && status !== 'TASK_STATUS_FAILED') {
+			await delay(500);
+			const resultResponse = await fetch(resultUrl, { headers });
+			if (!resultResponse.ok) {
+				throw new InferenceOutputError("Failed to fetch task result");
+			}
+			try {
+				taskResult = await resultResponse.json();
+				status = taskResult.task.status;
+			} catch (error) {
+				throw new InferenceOutputError("Failed to parse task result");
+			}
+		}
+
+		if (status === 'TASK_STATUS_FAILED') {
+			throw new InferenceOutputError("Task failed");
+		}
+
+		// There will be at most one video in the response.
 		const isValidOutput =
-			typeof response === "object" &&
-			!!response &&
-			"video" in response &&
-			typeof response.video === "object" &&
-			!!response.video &&
-			"video_url" in response.video &&
-			typeof response.video.video_url === "string" &&
-			isUrl(response.video.video_url);
+			typeof taskResult === "object" &&
+			!!taskResult &&
+			"videos" in taskResult &&
+			typeof taskResult.videos === "object" &&
+			!!taskResult.videos &&
+			Array.isArray(taskResult.videos) &&
+			taskResult.videos.length > 0 &&
+			"video_url" in taskResult.videos[0] &&
+			typeof taskResult.videos[0].video_url === "string" &&
+			isUrl(taskResult.videos[0].video_url);
 
 		if (!isValidOutput) {
-			throw new InferenceOutputError("Expected { video: { video_url: string } }");
+			throw new InferenceOutputError("Expected { videos: [{ video_url: string }] }");
 		}
 
-		const urlResponse = await fetch(response.video.video_url);
+		const urlResponse = await fetch(taskResult.videos[0].video_url);
 		return await urlResponse.blob();
 	}
 }
diff --git a/packages/inference/src/tasks/cv/textToVideo.ts b/packages/inference/src/tasks/cv/textToVideo.ts
@@ -3,7 +3,7 @@ import { resolveProvider } from "../../lib/getInferenceProviderMapping";
 import { getProviderHelper } from "../../lib/getProviderHelper";
 import { makeRequestOptions } from "../../lib/makeRequestOptions";
 import type { FalAiQueueOutput } from "../../providers/fal-ai";
-import type { NovitaOutput } from "../../providers/novita";
+import type { NovitaAsyncAPIOutput } from "../../providers/novita";
 import type { ReplicateOutput } from "../../providers/replicate";
 import type { BaseArgs, Options } from "../../types";
 import { innerRequest } from "../../utils/request";
@@ -15,7 +15,7 @@ export type TextToVideoOutput = Blob;
 export async function textToVideo(args: TextToVideoArgs, options?: Options): Promise<TextToVideoOutput> {
 	const provider = await resolveProvider(args.provider, args.model, args.endpointUrl);
 	const providerHelper = getProviderHelper(provider, "text-to-video");
-	const { data: response } = await innerRequest<FalAiQueueOutput | ReplicateOutput | NovitaOutput>(
+	const { data: response } = await innerRequest<FalAiQueueOutput | ReplicateOutput | NovitaAsyncAPIOutput>(
 		args,
 		providerHelper,
 		{