Commit 64a991d

Merge branch 'main' into feat/wavespeedai

2 parents 839e940 + 9af23e5

139 files changed, +867 -172 lines changed


README.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -97,7 +97,7 @@ You can run our packages with vanilla JS, without any bundler, by using a CDN or
 
 ```html
 <script type="module">
-  import { InferenceClient } from 'https://cdn.jsdelivr.net/npm/@huggingface/inference@4.0.0/+esm';
+  import { InferenceClient } from 'https://cdn.jsdelivr.net/npm/@huggingface/inference@4.0.2/+esm';
   import { createRepo, commit, deleteRepo, listFiles } from "https://cdn.jsdelivr.net/npm/@huggingface/[email protected]/+esm";
 </script>
 ```
````

packages/inference/package.json

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,6 +1,6 @@
 {
 	"name": "@huggingface/inference",
-	"version": "4.0.0",
+	"version": "4.0.2",
 	"packageManager": "[email protected]",
 	"license": "MIT",
 	"author": "Hugging Face and Tim Mikeladze <[email protected]>",
```

packages/inference/src/package.ts

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,3 +1,3 @@
 // Generated file from package.json. Issues importing JSON directly when publishing on commonjs/ESM - see https://github.com/microsoft/TypeScript/issues/51783
-export const PACKAGE_VERSION = "4.0.0";
+export const PACKAGE_VERSION = "4.0.2";
 export const PACKAGE_NAME = "@huggingface/inference";
```

packages/inference/src/snippets/getInferenceSnippets.ts

Lines changed: 91 additions & 7 deletions
```diff
@@ -14,7 +14,12 @@ import { makeRequestOptionsFromResolvedModel } from "../lib/makeRequestOptions.j
 import type { InferenceProviderOrPolicy, InferenceTask, RequestArgs } from "../types.js";
 import { templates } from "./templates.exported.js";
 
-export type InferenceSnippetOptions = { streaming?: boolean; billTo?: string } & Record<string, unknown>;
+export type InferenceSnippetOptions = {
+	streaming?: boolean;
+	billTo?: string;
+	accessToken?: string;
+	directRequest?: boolean;
+} & Record<string, unknown>;
 
 const PYTHON_CLIENTS = ["huggingface_hub", "fal_client", "requests", "openai"] as const;
 const JS_CLIENTS = ["fetch", "huggingface.js", "openai"] as const;
@@ -121,11 +126,15 @@ const HF_JS_METHODS: Partial<Record<WidgetType, string>> = {
 	translation: "translation",
 };
 
+// Placeholders to replace with env variable in snippets
+// little hack to support both direct requests and routing => routed requests should start with "hf_"
+const ACCESS_TOKEN_ROUTING_PLACEHOLDER = "hf_token_placeholder";
+const ACCESS_TOKEN_DIRECT_REQUEST_PLACEHOLDER = "not_hf_token_placeholder";
+
 // Snippet generators
 const snippetGenerator = (templateName: string, inputPreparationFn?: InputPreparationFn) => {
 	return (
 		model: ModelDataMinimal,
-		accessToken: string,
 		provider: InferenceProviderOrPolicy,
 		inferenceProviderMapping?: InferenceProviderModelMapping,
 		opts?: InferenceSnippetOptions
@@ -149,13 +158,19 @@ const snippetGenerator = (templateName: string, inputPreparationFn?: InputPrepar
 			console.error(`Failed to get provider helper for ${provider} (${task})`, e);
 			return [];
 		}
+
+		const placeholder = opts?.directRequest
+			? ACCESS_TOKEN_DIRECT_REQUEST_PLACEHOLDER
+			: ACCESS_TOKEN_ROUTING_PLACEHOLDER;
+		const accessTokenOrPlaceholder = opts?.accessToken ?? placeholder;
+
 		/// Prepare inputs + make request
 		const inputs = inputPreparationFn ? inputPreparationFn(model, opts) : { inputs: getModelInputSnippet(model) };
 		const request = makeRequestOptionsFromResolvedModel(
 			providerModelId,
 			providerHelper,
 			{
-				accessToken,
+				accessToken: accessTokenOrPlaceholder,
 				provider,
 				...inputs,
 			} as RequestArgs,
@@ -180,7 +195,7 @@ const snippetGenerator = (templateName: string, inputPreparationFn?: InputPrepar
 
 		/// Prepare template injection data
 		const params: TemplateParams = {
-			accessToken,
+			accessToken: accessTokenOrPlaceholder,
 			authorizationHeader: (request.info.headers as Record<string, string>)?.Authorization,
 			baseUrl: removeSuffix(request.url, "/chat/completions"),
 			fullUrl: request.url,
@@ -248,6 +263,11 @@ const snippetGenerator = (templateName: string, inputPreparationFn?: InputPrepar
 			snippet = `${importSection}\n\n${snippet}`;
 		}
 
+		/// Replace access token placeholder
+		if (snippet.includes(placeholder)) {
+			snippet = replaceAccessTokenPlaceholder(opts?.directRequest, placeholder, snippet, language, provider);
+		}
+
 		/// Snippet is ready!
 		return { language, client: client as string, content: snippet };
 	})
@@ -299,7 +319,6 @@ const snippets: Partial<
 	PipelineType,
 	(
 		model: ModelDataMinimal,
-		accessToken: string,
 		provider: InferenceProviderOrPolicy,
 		inferenceProviderMapping?: InferenceProviderModelMapping,
 		opts?: InferenceSnippetOptions
@@ -339,13 +358,12 @@ const snippets: Partial<
 
 export function getInferenceSnippets(
 	model: ModelDataMinimal,
-	accessToken: string,
 	provider: InferenceProviderOrPolicy,
 	inferenceProviderMapping?: InferenceProviderModelMapping,
 	opts?: Record<string, unknown>
 ): InferenceSnippet[] {
 	return model.pipeline_tag && model.pipeline_tag in snippets
-		? snippets[model.pipeline_tag]?.(model, accessToken, provider, inferenceProviderMapping, opts) ?? []
+		? snippets[model.pipeline_tag]?.(model, provider, inferenceProviderMapping, opts) ?? []
 		: [];
 }
 
@@ -420,3 +438,69 @@ function indentString(str: string): string {
 function removeSuffix(str: string, suffix: string) {
 	return str.endsWith(suffix) ? str.slice(0, -suffix.length) : str;
 }
+
+function replaceAccessTokenPlaceholder(
+	directRequest: boolean | undefined,
+	placeholder: string,
+	snippet: string,
+	language: InferenceSnippetLanguage,
+	provider: InferenceProviderOrPolicy
+): string {
+	// If "opts.accessToken" is not set, the snippets are generated with a placeholder.
+	// Once snippets are rendered, we replace the placeholder with code to fetch the access token from an environment variable.
+
+	// Determine if HF_TOKEN or specific provider token should be used
+	const useHfToken =
+		provider == "hf-inference" || // hf-inference provider => use $HF_TOKEN
+		(!directRequest && // if explicit directRequest => use provider-specific token
+			(!snippet.includes("https://") || // no URL provided => using a client => use $HF_TOKEN
+				snippet.includes("https://router.huggingface.co"))); // explicit routed request => use $HF_TOKEN
+
+	const accessTokenEnvVar = useHfToken
+		? "HF_TOKEN" // e.g. routed request or hf-inference
+		: provider.toUpperCase().replace("-", "_") + "_API_KEY"; // e.g. "REPLICATE_API_KEY"
+
+	// Replace the placeholder with the env variable
+	if (language === "sh") {
+		snippet = snippet.replace(
+			`'Authorization: Bearer ${placeholder}'`,
+			`"Authorization: Bearer $${accessTokenEnvVar}"` // e.g. "Authorization: Bearer $HF_TOKEN"
+		);
+	} else if (language === "python") {
+		snippet = "import os\n" + snippet;
+		snippet = snippet.replace(
+			`"${placeholder}"`,
+			`os.environ["${accessTokenEnvVar}"]` // e.g. os.environ["HF_TOKEN"]
+		);
+		snippet = snippet.replace(
+			`"Bearer ${placeholder}"`,
+			`f"Bearer {os.environ['${accessTokenEnvVar}']}"` // e.g. f"Bearer {os.environ['HF_TOKEN']}"
+		);
+		snippet = snippet.replace(
+			`"Key ${placeholder}"`,
+			`f"Key {os.environ['${accessTokenEnvVar}']}"` // e.g. f"Key {os.environ['FAL_AI_API_KEY']}"
+		);
+		snippet = snippet.replace(
+			`"X-Key ${placeholder}"`,
+			`f"X-Key {os.environ['${accessTokenEnvVar}']}"` // e.g. f"X-Key {os.environ['BLACK_FOREST_LABS_API_KEY']}"
+		);
+	} else if (language === "js") {
+		snippet = snippet.replace(
+			`"${placeholder}"`,
+			`process.env.${accessTokenEnvVar}` // e.g. process.env.HF_TOKEN
+		);
+		snippet = snippet.replace(
+			`Authorization: "Bearer ${placeholder}",`,
+			`Authorization: \`Bearer $\{process.env.${accessTokenEnvVar}}\`,` // e.g. Authorization: `Bearer ${process.env.HF_TOKEN}`,
+		);
+		snippet = snippet.replace(
+			`Authorization: "Key ${placeholder}",`,
+			`Authorization: \`Key $\{process.env.${accessTokenEnvVar}}\`,` // e.g. Authorization: `Key ${process.env.FAL_AI_API_KEY}`,
+		);
+		snippet = snippet.replace(
+			`Authorization: "X-Key ${placeholder}",`,
+			`Authorization: \`X-Key $\{process.env.${accessTokenEnvVar}}\`,` // e.g. Authorization: `X-Key ${process.env.BLACK_FOREST_LABS_AI_API_KEY}`,
+		);
+	}
+	return snippet;
+}
```
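Taken together, this removes the positional `accessToken` argument: token handling now goes through `opts`, and when no token is given the generated snippet reads it from an environment variable. A minimal sketch of how a caller might use the new signature (the import path and model object are illustrative, mirroring the test fixtures added below):

```ts
// Sketch only: assumes getInferenceSnippets is re-exported from the package root.
import { getInferenceSnippets } from "@huggingface/inference";

// Minimal model descriptor, shaped like ModelDataMinimal in the fixtures.
const model = {
	id: "meta-llama/Llama-3.1-8B-Instruct",
	pipeline_tag: "text-generation" as const,
	tags: ["conversational"],
	inference: "",
};

// No accessToken: snippets embed a placeholder that is post-processed into an
// env-variable lookup (e.g. process.env.HF_TOKEN for routed requests).
const routed = getInferenceSnippets(model, "together");

// Explicit accessToken: the literal value ends up in the generated snippet.
const withToken = getInferenceSnippets(model, "hf-inference", undefined, { accessToken: "hf_xxx" });

// directRequest: snippets use a provider-specific variable,
// e.g. TOGETHER_API_KEY instead of HF_TOKEN.
const direct = getInferenceSnippets(model, "together", undefined, { directRequest: true });
```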

packages/mcp-client/package.json

Lines changed: 1 addition & 1 deletion
```diff
@@ -34,7 +34,7 @@
 		"prepare": "pnpm run build",
 		"test": "vitest run",
 		"check": "tsc",
-		"agent": "tsx cli.ts"
+		"cli": "tsx cli.ts"
 	},
 	"files": [
 		"src",
```

packages/mcp-client/src/Agent.ts

Lines changed: 18 additions & 8 deletions
```diff
@@ -46,6 +46,7 @@ const exitLoopTools = [taskCompletionTool, askQuestionTool];
 
 export class Agent extends McpClient {
 	private readonly servers: (ServerConfig | StdioServerParameters)[];
+	public readonly prompt: string;
 	protected messages: ChatCompletionInputMessage[];
 
 	constructor({
@@ -73,10 +74,11 @@
 		super(provider ? { provider, endpointUrl, model, apiKey } : { provider, endpointUrl, model, apiKey });
 		/// ^This shenanigan is just here to please an overzealous TS type-checker.
 		this.servers = servers;
+		this.prompt = prompt ?? DEFAULT_SYSTEM_PROMPT;
 		this.messages = [
 			{
 				role: "system",
-				content: prompt ?? DEFAULT_SYSTEM_PROMPT,
+				content: this.prompt,
 			},
 		];
 	}
@@ -86,19 +88,27 @@
 	}
 
 	async *run(
-		input: string,
+		input: string | ChatCompletionInputMessage[],
 		opts: { abortSignal?: AbortSignal } = {}
 	): AsyncGenerator<ChatCompletionStreamOutput | ChatCompletionInputMessageTool> {
-		this.messages.push({
-			role: "user",
-			content: input,
-		});
+		let messages: ChatCompletionInputMessage[];
+		if (typeof input === "string") {
+			/// Use internal array of messages
+			this.messages.push({
+				role: "user",
+				content: input,
+			});
+			messages = this.messages;
+		} else {
+			/// Use the passed messages directly
+			messages = input;
+		}
 
 		let numOfTurns = 0;
 		let nextTurnShouldCallTools = true;
 		while (true) {
 			try {
-				yield* this.processSingleTurnWithTools(this.messages, {
+				yield* this.processSingleTurnWithTools(messages, {
 					exitLoopTools,
 					exitIfFirstChunkNoTool: numOfTurns > 0 && nextTurnShouldCallTools,
 					abortSignal: opts.abortSignal,
@@ -111,7 +121,7 @@
 			}
 			numOfTurns++;
 			// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
-			const currentLast = this.messages.at(-1)!;
+			const currentLast = messages.at(-1)!;
 			debug("current role", currentLast.role);
 			if (
 				currentLast.role === "tool" &&
```
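`run()` now accepts either a plain string, which is appended to the agent's internal history, or a full message array that bypasses it so the caller owns the conversation state. A rough usage sketch, with constructor options abridged and the provider/model values purely illustrative:

```ts
import { Agent } from "@huggingface/mcp-client";

// Sketch: constructor options simplified; see Agent.ts for the full parameter list.
const agent = new Agent({
	provider: "together", // illustrative provider
	model: "meta-llama/Llama-3.1-8B-Instruct",
	apiKey: process.env.HF_TOKEN,
	servers: [], // MCP servers omitted for brevity
});

// 1) String input: pushed onto the agent's internal message history.
for await (const chunk of agent.run("What tools do you have?")) {
	// ChatCompletionStreamOutput chunks and tool-call messages stream here
}

// 2) Message array: used as-is, bypassing the internal history.
for await (const chunk of agent.run([
	{ role: "system", content: agent.prompt }, // prompt is now a public field
	{ role: "user", content: "What tools do you have?" },
])) {
	// ...
}
```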

packages/mcp-client/src/McpClient.ts

Lines changed: 11 additions & 6 deletions
```diff
@@ -160,13 +160,18 @@ export class McpClient {
 			for (const toolCall of delta.tool_calls ?? []) {
 				// aggregating chunks into an encoded arguments JSON object
 				if (!finalToolCalls[toolCall.index]) {
+					/// first chunk of the tool call
 					finalToolCalls[toolCall.index] = toolCall;
-				}
-				if (finalToolCalls[toolCall.index].function.arguments === undefined) {
-					finalToolCalls[toolCall.index].function.arguments = "";
-				}
-				if (toolCall.function.arguments) {
-					finalToolCalls[toolCall.index].function.arguments += toolCall.function.arguments;
+
+					/// ensure .function.arguments is always a string
+					if (finalToolCalls[toolCall.index].function.arguments === undefined) {
+						finalToolCalls[toolCall.index].function.arguments = "";
+					}
+				} else {
+					/// any subsequent chunk to the same tool call
+					if (toolCall.function.arguments) {
+						finalToolCalls[toolCall.index].function.arguments += toolCall.function.arguments;
+					}
 				}
 			}
 			if (opts.exitIfFirstChunkNoTool && numOfChunks <= 2 && Object.keys(finalToolCalls).length === 0) {
```
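For context, this restructuring follows the usual pattern for reassembling streamed tool calls: the first delta for a given index carries the call itself (function name and possibly a first argument fragment), while every later delta for that index only appends to `function.arguments`. A self-contained illustration with simulated deltas (types simplified; not the real stream shape):

```ts
// Simplified stand-in for the streamed tool-call delta type.
type ToolCallDelta = { index: number; function: { name?: string; arguments?: string } };

const finalToolCalls: Record<number, ToolCallDelta> = {};

function aggregate(toolCall: ToolCallDelta): void {
	const existing = finalToolCalls[toolCall.index];
	if (!existing) {
		// first chunk: keep the whole delta and normalize arguments to a string
		finalToolCalls[toolCall.index] = toolCall;
		toolCall.function.arguments ??= "";
	} else if (toolCall.function.arguments) {
		// subsequent chunks: only the argument fragment matters
		existing.function.arguments = (existing.function.arguments ?? "") + toolCall.function.arguments;
	}
}

// Simulated stream: the JSON arguments arrive in three fragments.
for (const delta of [
	{ index: 0, function: { name: "get_time", arguments: '{"timez' } },
	{ index: 0, function: { arguments: 'one": "Europe' } },
	{ index: 0, function: { arguments: '/Paris"}' } },
] satisfies ToolCallDelta[]) {
	aggregate(delta);
}

console.log(finalToolCalls[0].function.arguments); // {"timezone": "Europe/Paris"}
```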

packages/tasks-gen/scripts/generate-snippets-fixtures.ts

Lines changed: 24 additions & 1 deletion
```diff
@@ -240,6 +240,30 @@ const TEST_CASES: {
 		providers: ["hf-inference"],
 		opts: { billTo: "huggingface" },
 	},
+	{
+		testName: "with-access-token",
+		task: "conversational",
+		model: {
+			id: "meta-llama/Llama-3.1-8B-Instruct",
+			pipeline_tag: "text-generation",
+			tags: ["conversational"],
+			inference: "",
+		},
+		providers: ["hf-inference"],
+		opts: { accessToken: "hf_xxx" },
+	},
+	{
+		testName: "explicit-direct-request",
+		task: "conversational",
+		model: {
+			id: "meta-llama/Llama-3.1-8B-Instruct",
+			pipeline_tag: "text-generation",
+			tags: ["conversational"],
+			inference: "",
+		},
+		providers: ["together"],
+		opts: { directRequest: true },
+	},
 	{
 		testName: "text-to-speech",
 		task: "text-to-speech",
@@ -314,7 +338,6 @@ function generateInferenceSnippet(
 ): InferenceSnippet[] {
 	const allSnippets = snippets.getInferenceSnippets(
 		model,
-		"api_token",
 		provider,
 		{
 			hfModelId: model.id,
```

packages/tasks-gen/snippets-fixtures/automatic-speech-recognition/js/fetch/0.hf-inference.js

Lines changed: 1 addition & 1 deletion
```diff
@@ -3,7 +3,7 @@ async function query(data) {
 		"https://router.huggingface.co/hf-inference/models/openai/whisper-large-v3-turbo",
 		{
 			headers: {
-				Authorization: "Bearer api_token",
+				Authorization: `Bearer ${process.env.HF_TOKEN}`,
 				"Content-Type": "audio/flac",
 			},
 			method: "POST",
```

packages/tasks-gen/snippets-fixtures/automatic-speech-recognition/js/huggingface.js/0.hf-inference.js

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,6 +1,6 @@
 import { InferenceClient } from "@huggingface/inference";
 
-const client = new InferenceClient("api_token");
+const client = new InferenceClient(process.env.HF_TOKEN);
 
 const data = fs.readFileSync("sample1.flac");
 
```