Merged

Changes from 16 commits
13 changes: 12 additions & 1 deletion packages/agents/pnpm-lock.yaml

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion packages/inference/README.md
@@ -1,4 +1,4 @@
# 🤗 Hugging Face Inference Endpoints
# 🤗 Hugging Face Inference

A Typescript powered wrapper for the Hugging Face Inference API (serverless), Inference Endpoints (dedicated), and third-party Inference Providers.
It works with [Inference API (serverless)](https://huggingface.co/docs/api-inference/index) and [Inference Endpoints (dedicated)](https://huggingface.co/docs/inference-endpoints/index), and even with supported third-party Inference Providers.
3 changes: 2 additions & 1 deletion packages/tasks-gen/package.json
@@ -26,6 +26,7 @@
"type-fest": "^3.13.1"
},
"dependencies": {
"@huggingface/tasks": "workspace:^"
"@huggingface/tasks": "workspace:^",
"@huggingface/inference": "workspace:^"
}
}
3 changes: 3 additions & 0 deletions packages/tasks-gen/pnpm-lock.yaml

Some generated files are not rendered by default.

54 changes: 37 additions & 17 deletions packages/tasks-gen/scripts/generate-snippets-fixtures.ts
@@ -19,7 +19,7 @@ import { existsSync as pathExists } from "node:fs";
import * as fs from "node:fs/promises";
import * as path from "node:path/posix";

import type { InferenceSnippet } from "@huggingface/tasks";
import type { InferenceProvider, InferenceSnippet } from "@huggingface/tasks";
import { snippets } from "@huggingface/tasks";

type LANGUAGE = "sh" | "js" | "py";
@@ -28,6 +28,7 @@ const TEST_CASES: {
testName: string;
model: snippets.ModelDataMinimal;
languages: LANGUAGE[];
providers: InferenceProvider[];
opts?: Record<string, unknown>;
}[] = [
{
@@ -39,6 +40,7 @@
inference: "",
},
languages: ["sh", "js", "py"],
providers: ["hf-inference", "together"],
opts: { streaming: false },
},
{
@@ -50,6 +52,7 @@
inference: "",
},
languages: ["sh", "js", "py"],
providers: ["hf-inference"],
opts: { streaming: true },
},
{
@@ -61,6 +64,7 @@
inference: "",
},
languages: ["sh", "js", "py"],
providers: ["hf-inference"],
opts: { streaming: false },
},
{
@@ -72,6 +76,7 @@
inference: "",
},
languages: ["sh", "js", "py"],
providers: ["hf-inference"],
opts: { streaming: true },
},
{
@@ -82,6 +87,7 @@
tags: [],
inference: "",
},
providers: ["hf-inference"],
languages: ["sh", "js", "py"],
},
] as const;
@@ -113,31 +119,41 @@ function getFixtureFolder(testName: string): string {
function generateInferenceSnippet(
model: snippets.ModelDataMinimal,
language: LANGUAGE,
provider: InferenceProvider,
opts?: Record<string, unknown>
): InferenceSnippet[] {
const generatedSnippets = GET_SNIPPET_FN[language](model, "api_token", opts);
const generatedSnippets = GET_SNIPPET_FN[language](model, "api_token", provider, opts);
return Array.isArray(generatedSnippets) ? generatedSnippets : [generatedSnippets];
}

async function getExpectedInferenceSnippet(testName: string, language: LANGUAGE): Promise<InferenceSnippet[]> {
async function getExpectedInferenceSnippet(
testName: string,
language: LANGUAGE,
provider: InferenceProvider
): Promise<InferenceSnippet[]> {
const fixtureFolder = getFixtureFolder(testName);
const files = await fs.readdir(fixtureFolder);

const expectedSnippets: InferenceSnippet[] = [];
for (const file of files.filter((file) => file.endsWith("." + language)).sort()) {
const client = path.basename(file).split(".").slice(1, -1).join("."); // e.g. '0.huggingface.js.js' => "huggingface.js"
for (const file of files.filter((file) => file.endsWith("." + language) && file.includes(`.${provider}.`)).sort()) {
const client = path.basename(file).split(".").slice(1, -2).join("."); // e.g. '0.huggingface.js.replicate.js' => "huggingface.js"
const content = await fs.readFile(path.join(fixtureFolder, file), { encoding: "utf-8" });
expectedSnippets.push(client === "default" ? { content } : { client, content });
expectedSnippets.push({ client, content });
}
return expectedSnippets;
}

async function saveExpectedInferenceSnippet(testName: string, language: LANGUAGE, snippets: InferenceSnippet[]) {
async function saveExpectedInferenceSnippet(
testName: string,
language: LANGUAGE,
provider: InferenceProvider,
snippets: InferenceSnippet[]
) {
const fixtureFolder = getFixtureFolder(testName);
await fs.mkdir(fixtureFolder, { recursive: true });

for (const [index, snippet] of snippets.entries()) {
const file = path.join(fixtureFolder, `${index}.${snippet.client ?? "default"}.${language}`);
const file = path.join(fixtureFolder, `${index}.${snippet.client ?? "default"}.${provider}.${language}`);
await fs.writeFile(file, snippet.content);
}
}
@@ -147,13 +163,15 @@ if (import.meta.vitest) {
const { describe, expect, it } = import.meta.vitest;

describe("inference API snippets", () => {
TEST_CASES.forEach(({ testName, model, languages, opts }) => {
TEST_CASES.forEach(({ testName, model, languages, providers, opts }) => {
describe(testName, () => {
languages.forEach((language) => {
it(language, async () => {
const generatedSnippets = generateInferenceSnippet(model, language, opts);
const expectedSnippets = await getExpectedInferenceSnippet(testName, language);
expect(generatedSnippets).toEqual(expectedSnippets);
providers.forEach((provider) => {
it(language, async () => {
const generatedSnippets = generateInferenceSnippet(model, language, provider, opts);
const expectedSnippets = await getExpectedInferenceSnippet(testName, language, provider);
expect(generatedSnippets).toEqual(expectedSnippets);
});
});
});
});
@@ -166,11 +184,13 @@ if (import.meta.vitest) {
await fs.rm(path.join(rootDirFinder(), "snippets-fixtures"), { recursive: true, force: true });

console.debug(" 🏭 Generating new fixtures...");
TEST_CASES.forEach(({ testName, model, languages, opts }) => {
console.debug(` ${testName} (${languages.join(", ")})`);
TEST_CASES.forEach(({ testName, model, languages, providers, opts }) => {
console.debug(` ${testName} (${languages.join(", ")}) (${providers.join(", ")})`);
languages.forEach(async (language) => {
const generatedSnippets = generateInferenceSnippet(model, language, opts);
await saveExpectedInferenceSnippet(testName, language, generatedSnippets);
providers.forEach(async (provider) => {
const generatedSnippets = generateInferenceSnippet(model, language, provider, opts);
await saveExpectedInferenceSnippet(testName, language, provider, generatedSnippets);
});
});
});
console.log("✅ All done!");
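The script changes above mean fixtures are now generated per provider as well as per language: each fixture file name encodes the snippet index, client, provider, and language (index.client.provider.language) inside the test's folder under snippets-fixtures/. A rough sketch of the resulting layout, assuming a hypothetical test case name and illustrative client names (only the naming pattern itself is taken from the code above):

snippets-fixtures/
  some-test-case/
    0.huggingface.js.hf-inference.js
    0.huggingface_hub.hf-inference.py
    0.openai.together.js
    0.openai.together.py

getExpectedInferenceSnippet filters fixture files on the ".<provider>." segment and recovers the client name from the parts between the index and the provider (e.g. 0.huggingface.js.hf-inference.js gives client "huggingface.js"), so generated and expected snippets are compared per (language, provider) pair.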
@@ -0,0 +1,14 @@
curl 'https://huggingface.co/api/inference-proxy/together/v1/chat/completions' \
-H 'Authorization: Bearer api_token' \
-H 'Content-Type: application/json' \
--data '{
"model": "meta-llama/Llama-3.1-8B-Instruct",
"messages": [
{
"role": "user",
"content": "What is the capital of France?"
}
],
"max_tokens": 500,
"stream": false
}'
@@ -0,0 +1,17 @@
import { HfInference } from "@huggingface/inference";

const client = new HfInference("api_token");

const chatCompletion = await client.chatCompletion({
model: "meta-llama/Llama-3.1-8B-Instruct",
messages: [
{
role: "user",
content: "What is the capital of France?"
}
],
provider: "hf-inference",
max_tokens: 500
});

console.log(chatCompletion.choices[0].message);
@@ -10,6 +10,7 @@ const chatCompletion = await client.chatCompletion({
content: "What is the capital of France?"
}
],
provider: "together",
max_tokens: 500
});

@@ -0,0 +1,21 @@
from huggingface_hub import InferenceClient

client = InferenceClient(
provider="hf-inference",
api_key="api_token"
)

messages = [
{
"role": "user",
"content": "What is the capital of France?"
}
]

completion = client.chat.completions.create(
model="meta-llama/Llama-3.1-8B-Instruct",
messages=messages,
max_tokens=500
)

print(completion.choices[0].message)
@@ -1,6 +1,9 @@
from huggingface_hub import InferenceClient

client = InferenceClient(api_key="api_token")
client = InferenceClient(
provider="together",
api_key="api_token"
)

messages = [
{
@@ -1,8 +1,8 @@
import { OpenAI } from "openai";

const client = new OpenAI({
baseURL: "https://api-inference.huggingface.co/v1/",
apiKey: "api_token"
baseURL: "https://api-inference.huggingface.co/v1/",
apiKey: "api_token"
});

const chatCompletion = await client.chat.completions.create({
@@ -0,0 +1,19 @@
import { OpenAI } from "openai";

const client = new OpenAI({
baseURL: "https://huggingface.co/api/inference-proxy/together",
apiKey: "api_token"
});

const chatCompletion = await client.chat.completions.create({
model: "meta-llama/Llama-3.1-8B-Instruct",
messages: [
{
role: "user",
content: "What is the capital of France?"
}
],
max_tokens: 500
});

console.log(chatCompletion.choices[0].message);
@@ -0,0 +1,21 @@
from openai import OpenAI

client = OpenAI(
base_url="https://huggingface.co/api/inference-proxy/together",
api_key="api_token"
)

messages = [
{
"role": "user",
"content": "What is the capital of France?"
}
]

completion = client.chat.completions.create(
model="meta-llama/Llama-3.1-8B-Instruct",
messages=messages,
max_tokens=500
)

print(completion.choices[0].message)
@@ -12,6 +12,7 @@ const stream = client.chatCompletionStream({
content: "What is the capital of France?"
}
],
provider: "hf-inference",
max_tokens: 500
});

@@ -1,6 +1,9 @@
from huggingface_hub import InferenceClient

client = InferenceClient(api_key="api_token")
client = InferenceClient(
provider="hf-inference",
api_key="api_token"
)

messages = [
{
@@ -2,7 +2,7 @@ import { OpenAI } from "openai";

const client = new OpenAI({
baseURL: "https://api-inference.huggingface.co/v1/",
apiKey: "api_token"
apiKey: "api_token"
});

let out = "";
@@ -21,6 +21,7 @@ const chatCompletion = await client.chatCompletion({
]
}
],
provider: "hf-inference",
max_tokens: 500
});

@@ -1,6 +1,9 @@
from huggingface_hub import InferenceClient

client = InferenceClient(api_key="api_token")
client = InferenceClient(
provider="hf-inference",
api_key="api_token"
)

messages = [
{
@@ -1,8 +1,8 @@
import { OpenAI } from "openai";

const client = new OpenAI({
baseURL: "https://api-inference.huggingface.co/v1/",
apiKey: "api_token"
baseURL: "https://api-inference.huggingface.co/v1/",
apiKey: "api_token"
});

const chatCompletion = await client.chat.completions.create({
@@ -23,6 +23,7 @@ const stream = client.chatCompletionStream({
]
}
],
provider: "hf-inference",
max_tokens: 500
});
