Add Cohere provider

alexrs-cohere · alexrs-cohere · commit 429e81f1f841 · 2025-02-27T13:38:34.000Z
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -50,6 +50,7 @@ jobs:
           HF_REPLICATE_KEY: dummy
           HF_SAMBANOVA_KEY: dummy
           HF_TOGETHER_KEY: dummy
+          HF_COHERE_KEY: dummy
 
   browser:
     runs-on: ubuntu-latest
diff --git a/README.md b/README.md
@@ -23,7 +23,7 @@ await uploadFile({
   // Can work with native File in browsers
   file: {
     path: "pytorch_model.bin",
-    content: new Blob(...) 
+    content: new Blob(...)
   }
 });
 
@@ -39,7 +39,7 @@ await inference.chatCompletion({
   ],
   max_tokens: 512,
   temperature: 0.5,
-  provider: "sambanova", // or together, fal-ai, replicate, …
+  provider: "sambanova", // or together, fal-ai, replicate, cohere, …
 });
 
 await inference.textToImage({
@@ -146,12 +146,12 @@ for await (const chunk of inference.chatCompletionStream({
   console.log(chunk.choices[0].delta.content);
 }
 
-/// Using a third-party provider: 
+/// Using a third-party provider:
 await inference.chatCompletion({
   model: "meta-llama/Llama-3.1-8B-Instruct",
   messages: [{ role: "user", content: "Hello, nice to meet you!" }],
   max_tokens: 512,
-  provider: "sambanova", // or together, fal-ai, replicate, …
+  provider: "sambanova", // or together, fal-ai, replicate, cohere, …
 })
 
 await inference.textToImage({
@@ -211,7 +211,7 @@ await uploadFile({
   // Can work with native File in browsers
   file: {
     path: "pytorch_model.bin",
-    content: new Blob(...) 
+    content: new Blob(...)
   }
 });
 
@@ -244,7 +244,7 @@ console.log(messages); // contains the data
 
 // or you can run the code directly, however you can't check that the code is safe to execute this way, use at your own risk.
 const messages = await agent.run("Draw a picture of a cat wearing a top hat. Then caption the picture and read it out loud.")
-console.log(messages); 
+console.log(messages);
 ```
 
 There are more features of course, check each library's README!
diff --git a/packages/inference/README.md b/packages/inference/README.md
@@ -56,6 +56,7 @@ Currently, we support the following providers:
 - [Sambanova](https://sambanova.ai)
 - [Together](https://together.xyz)
 - [Blackforestlabs](https://blackforestlabs.ai)
+- [Cohere](https://cohere.com)
 
 To send requests to a third-party provider, you have to pass the `provider` parameter to the inference function. Make sure your request is authenticated with an access token.
 ```ts
@@ -80,6 +81,7 @@ Only a subset of models are supported when requesting third-party providers. You
 - [Replicate supported models](https://huggingface.co/api/partners/replicate/models)
 - [Sambanova supported models](https://huggingface.co/api/partners/sambanova/models)
 - [Together supported models](https://huggingface.co/api/partners/together/models)
+- [Cohere supported models](https://huggingface.co/api/partners/cohere/models)
 - [HF Inference API (serverless)](https://huggingface.co/models?inference=warm&sort=trending)
 
 ❗**Important note:** To be compatible, the third-party API must adhere to the "standard" shape API we expect on HF model pages for each pipeline task type.
diff --git a/packages/inference/src/lib/makeRequestOptions.ts b/packages/inference/src/lib/makeRequestOptions.ts
@@ -1,5 +1,6 @@
 import { HF_HUB_URL, HF_ROUTER_URL } from "../config";
 import { BLACK_FOREST_LABS_CONFIG } from "../providers/black-forest-labs";
+import { COHERE_CONFIG } from "../providers/cohere";
 import { FAL_AI_CONFIG } from "../providers/fal-ai";
 import { FIREWORKS_AI_CONFIG } from "../providers/fireworks-ai";
 import { HF_INFERENCE_CONFIG } from "../providers/hf-inference";
@@ -27,6 +28,7 @@ let tasks: Record<string, { models: { id: string }[] }> | null = null;
  */
 const providerConfigs: Record<InferenceProvider, ProviderConfig> = {
 	"black-forest-labs": BLACK_FOREST_LABS_CONFIG,
+	"cohere": COHERE_CONFIG,
 	"fal-ai": FAL_AI_CONFIG,
 	"fireworks-ai": FIREWORKS_AI_CONFIG,
 	"hf-inference": HF_INFERENCE_CONFIG,
diff --git a/packages/inference/src/providers/cohere.ts b/packages/inference/src/providers/cohere.ts
@@ -0,0 +1,42 @@
+/**
+ * See the registered mapping of HF model ID => Cohere model ID here:
+ *
+ * https://huggingface.co/api/partners/cohere/models
+ *
+ * This is a publicly available mapping.
+ *
+ * If you want to try to run inference for a new model locally before it's registered on huggingface.co,
+ * you can add it to the dictionary "HARDCODED_MODEL_ID_MAPPING" in consts.ts, for dev purposes.
+ *
+ * - If you work at Cohere and want to update this mapping, please use the model mapping API we provide on huggingface.co
+ * - If you're a community member and want to add a new supported HF model to Cohere, please open an issue on the present repo
+ * and we will tag Cohere team members.
+ *
+ * Thanks!
+ */
+import type { ProviderConfig, UrlParams, HeaderParams, BodyParams } from "../types";
+
+const COHERE_API_BASE_URL = "https://api.cohere.com";
+
+
+/**
+ * Request payload: the caller's args, with `model` written after the spread so it overrides any `model` in them.
+ */
+const makeBody = ({ args, model }: BodyParams): Record<string, unknown> => {
+	return { ...args, model };
+};
+
+/** Sends the access token as a bearer credential in the `Authorization` header. */
+const makeHeaders = ({ accessToken }: HeaderParams): Record<string, string> =>
+	({ Authorization: `Bearer ${accessToken}` });
+
+// Every request goes to the chat-completions route under the supplied base URL;
+// no other field of `params` is read.
+const makeUrl = ({ baseUrl }: UrlParams): string => `${baseUrl}/compatibility/v1/chat/completions`;
+
+/**
+ * Provider configuration for Cohere.
+ * Bundles the API base URL with the request body, header and URL builders defined above;
+ * imported by makeRequestOptions and registered there under the "cohere" key.
+ */
+export const COHERE_CONFIG: ProviderConfig = { baseUrl: COHERE_API_BASE_URL, makeBody, makeHeaders, makeUrl };
diff --git a/packages/inference/src/providers/consts.ts b/packages/inference/src/providers/consts.ts
@@ -17,6 +17,7 @@ export const HARDCODED_MODEL_ID_MAPPING: Record<InferenceProvider, Record<ModelI
 	 * "Qwen/Qwen2.5-Coder-32B-Instruct": "Qwen2.5-Coder-32B-Instruct",
 	 */
 	"black-forest-labs": {},
+	cohere: {},
 	"fal-ai": {},
 	"fireworks-ai": {},
 	"hf-inference": {},
diff --git a/packages/inference/src/tasks/nlp/chatCompletion.ts b/packages/inference/src/tasks/nlp/chatCompletion.ts
@@ -3,6 +3,109 @@ import type { BaseArgs, Options } from "../../types";
 import { request } from "../custom/request";
 import type { ChatCompletionInput, ChatCompletionOutput } from "@huggingface/tasks";
 
+/** Values allowed for `finish_reason` on the Cohere chat response type below. */
+export type CohereTextGenerationOutputFinishReason =
+	| "COMPLETE"
+	| "STOP_SEQUENCE"
+	| "MAX_TOKENS"
+	| "TOOL_CALL"
+	| "ERROR";
+
+interface CohereInputOutputTokens {
+	input_tokens: number;
+	output_tokens: number;
+}
+
+interface CohereChatCompletionOutputUsage {
+	billed_units: CohereInputOutputTokens;
+	tokens: CohereInputOutputTokens;
+}
+
+interface CohereTopLogprob {
+	logprob: number;
+	token: string;
+}
+
+interface CohereLogprob {
+	logprob: number;
+	token: string;
+	top_logprobs: CohereTopLogprob[];
+}
+
+interface CohereFunctionDefinition {
+	arguments: unknown;
+	description?: string;
+	name: string;
+}
+
+interface CohereToolCall {
+	function: CohereFunctionDefinition;
+	id: string;
+	type: string;
+}
+
+/** Message payload of the response: a list of typed text parts plus optional tool calls. */
+interface CohereMessage {
+	role: string;
+	content: { type: string; text: string }[];
+	tool_calls?: CohereToolCall[];
+}
+
+/** Non-streaming chat response as consumed by convertCohereToChatCompletionOutput; `logprobs` may be absent. */
+interface CohereChatCompletionOutput {
+	id: string;
+	finish_reason: CohereTextGenerationOutputFinishReason;
+	message: CohereMessage;
+	usage: CohereChatCompletionOutputUsage;
+	logprobs?: CohereLogprob[];
+}
+
+/**
+ * Maps a Cohere chat response onto the `ChatCompletionOutput` shape, as a single choice at index 0.
+ * `model` and `system_fingerprint` are fixed placeholders; they are not read from `res`.
+ * @param res - Cohere chat response to convert.
+ * @returns The converted output; the text parts of the message are joined with a single space.
+ */
+function convertCohereToChatCompletionOutput(res: CohereChatCompletionOutput): ChatCompletionOutput {
+	const { input_tokens: promptTokens, output_tokens: completionTokens } = res.usage.tokens;
+	return {
+		id: res.id,
+		// `created` is a Unix timestamp in seconds; Date.now() yields milliseconds.
+		created: Math.floor(Date.now() / 1000),
+		model: "cohere-model",
+		system_fingerprint: "cohere-fingerprint",
+		usage: {
+			completion_tokens: completionTokens,
+			prompt_tokens: promptTokens,
+			total_tokens: promptTokens + completionTokens,
+		},
+		choices: [
+			{
+				finish_reason: res.finish_reason,
+				index: 0,
+				message: {
+					role: res.message.role,
+					content: res.message.content.map((part) => part.text).join(" "),
+					tool_calls: res.message.tool_calls?.map(({ function: fn, id, type }) => ({
+						function: { arguments: fn.arguments, description: fn.description, name: fn.name },
+						id,
+						type,
+					})),
+				},
+				logprobs: res.logprobs
+					? {
+							content: res.logprobs.map(({ logprob, token, top_logprobs }) => ({
+								logprob,
+								token,
+								top_logprobs: top_logprobs.map((top) => ({ logprob: top.logprob, token: top.token })),
+							})),
+					  }
+					: undefined,
+			},
+		],
+	};
+}
+
 /**
  * Use the chat completion endpoint to generate a response to a prompt, using OpenAI message completion API no stream
  */
@@ -31,5 +134,4 @@ export async function chatCompletion(
 	if (!isValidOutput) {
 		throw new InferenceOutputError("Expected ChatCompletionOutput");
 	}
-	return res;
 }
diff --git a/packages/inference/src/tasks/nlp/chatCompletionStream.ts b/packages/inference/src/tasks/nlp/chatCompletionStream.ts
@@ -2,6 +2,117 @@ import type { BaseArgs, Options } from "../../types";
 import { streamingRequest } from "../custom/streamingRequest";
 import type { ChatCompletionInput, ChatCompletionStreamOutput } from "@huggingface/tasks";
 
+/** Values allowed for `finish_reason` on the Cohere stream event type below. */
+export type CohereTextGenerationOutputFinishReason =
+	| "COMPLETE"
+	| "STOP_SEQUENCE"
+	| "MAX_TOKENS"
+	| "TOOL_CALL"
+	| "ERROR";
+
+interface CohereInputOutputTokens {
+	input_tokens: number;
+	output_tokens: number;
+}
+
+interface CohereChatCompletionOutputUsage {
+	billed_units: CohereInputOutputTokens;
+	tokens: CohereInputOutputTokens;
+}
+
+interface CohereTopLogprob {
+	logprob: number;
+	token: string;
+}
+
+interface CohereLogprob {
+	logprob: number;
+	token: string;
+	top_logprobs: CohereTopLogprob[];
+}
+
+interface CohereFunctionDefinition {
+	arguments: unknown;
+	description?: string;
+	name: string;
+}
+
+interface CohereToolCall {
+	function: CohereFunctionDefinition;
+	id: string;
+	type: string;
+}
+
+/** Message carried inside a stream delta: one typed text part plus optional tool calls. */
+interface CohereMessage {
+	role: string;
+	content: { type: string; text: string };
+	tool_calls?: CohereToolCall[];
+}
+
+interface CohereMessageDelta {
+	message: CohereMessage;
+}
+
+/** One stream event as consumed by convertCohereToChatCompletionStreamOutput; only `id` and `delta` are required. */
+interface CohereChatCompletionStreamOutput {
+	id: string;
+	finish_reason?: CohereTextGenerationOutputFinishReason;
+	delta: CohereMessageDelta;
+	usage?: CohereChatCompletionOutputUsage;
+	logprobs?: CohereLogprob[];
+}
+
+/** Converts one Cohere stream event into a single-choice `ChatCompletionStreamOutput` (first tool call only). */
+function convertCohereToChatCompletionStreamOutput(res: CohereChatCompletionStreamOutput): ChatCompletionStreamOutput {
+	const toolCall = res.delta?.message?.tool_calls?.[0]; // undefined when `tool_calls` is missing or empty
+	const toolArgs = toolCall?.function.arguments;
+	return {
+		id: res.id,
+		created: Math.floor(Date.now() / 1000), // Unix timestamp in seconds; Date.now() is in milliseconds
+		model: "cohere-model", // placeholder, not read from `res`
+		system_fingerprint: "cohere-fingerprint", // placeholder, not read from `res`
+		usage: res.usage
+			? {
+					completion_tokens: res.usage.tokens.output_tokens,
+					prompt_tokens: res.usage.tokens.input_tokens,
+					total_tokens: res.usage.tokens.input_tokens + res.usage.tokens.output_tokens,
+			  }
+			: undefined,
+		choices: [
+			{
+				delta: {
+					role: res.delta?.message?.role,
+					content: res.delta?.message?.content?.text,
+					tool_calls: toolCall
+						? {
+								function: {
+									arguments: typeof toolArgs === "string" ? toolArgs : JSON.stringify(toolArgs), // no double-encoding
+									description: toolCall.function.description,
+									name: toolCall.function.name,
+								},
+								id: toolCall.id,
+								index: 0, // only the first tool call is forwarded
+								type: toolCall.type,
+						  }
+						: undefined,
+				},
+				finish_reason: res.finish_reason,
+				index: 0, // a stream event is always mapped to exactly one choice
+				logprobs: res.logprobs
+					? {
+							content: res.logprobs.map(({ logprob, token, top_logprobs }) => ({
+								logprob,
+								token,
+								top_logprobs: top_logprobs.map((top) => ({ logprob: top.logprob, token: top.token })),
+							})),
+					  }
+					: undefined,
+			},
+		],
+	};
+}
+
 /**
  * Use to continue text from a prompt. Same as `textGeneration` but returns generator that can be read one token at a time
  */
diff --git a/packages/inference/src/types.ts b/packages/inference/src/types.ts
@@ -30,6 +30,7 @@ export type InferenceTask = Exclude<PipelineType, "other">;
 
 export const INFERENCE_PROVIDERS = [
 	"black-forest-labs",
+	"cohere",
 	"fal-ai",
 	"fireworks-ai",
 	"hf-inference",
diff --git a/packages/inference/test/HfInference.spec.ts b/packages/inference/test/HfInference.spec.ts
diff --git a/packages/tasks/src/inference-providers.ts b/packages/tasks/src/inference-providers.ts