
Commit ad7950b

alexrs-cohere, Wauplin, and hanouticelina committed
Add Cohere as an Inference Provider (#2888)
* Add Cohere as an Inference Provider
* Use new Cohere OpenAI compatible API
* Update docs/source/en/guides/inference.md
* Update src/huggingface_hub/inference/_providers/cohere.py

Co-authored-by: Célina <[email protected]>
Co-authored-by: Lucain <[email protected]>
1 parent 9323080 commit ad7950b

File tree: 10 files changed, +327 -36 lines changed

docs/source/en/guides/inference.md

Lines changed: 34 additions & 34 deletions
Large diffs are not rendered by default.
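The guide diff is not rendered above; as a quick illustration of what the new provider enables, a chat completion routed through Cohere might look like the following sketch (the API key placeholder is an assumption, and the model ID comes from the tests added in this commit):

    from huggingface_hub import InferenceClient

    # Sketch: send a chat completion through Cohere's OpenAI-compatible API.
    client = InferenceClient(provider="cohere", api_key="hf_***")  # placeholder token

    response = client.chat_completion(
        model="CohereForAI/c4ai-command-r7b-12-2024",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is deep learning?"},
        ],
    )
    print(response.choices[0].message.content)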

src/huggingface_hub/inference/_client.py

Lines changed: 1 addition & 1 deletion

@@ -132,7 +132,7 @@ class InferenceClient:
             path will be appended to the base URL (see the [TGI Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api)
             documentation for details). When passing a URL as `model`, the client will not append any suffix path to it.
         provider (`str`, *optional*):
-            Name of the provider to use for inference. Can be `"black-forest-labs"`, `"fal-ai"`, `"fireworks-ai"`, `"hf-inference"`, `"hyperbolic"`, `"nebius"`, `"novita"`, `"replicate"`, `"sambanova"` or `"together"`.
+            Name of the provider to use for inference. Can be `"black-forest-labs"`, `"cohere"`, `"fal-ai"`, `"fireworks-ai"`, `"hf-inference"`, `"hyperbolic"`, `"nebius"`, `"novita"`, `"replicate"`, `"sambanova"` or `"together"`.
             defaults to hf-inference (Hugging Face Serverless Inference API).
             If model is a URL or `base_url` is passed, then `provider` is not used.
         token (`str` or `bool`, *optional*):
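As the docstring notes, `provider` is ignored whenever `model` is a URL or `base_url` is set; a minimal sketch of that case (the endpoint URL is a placeholder for a self-hosted deployment):

    from huggingface_hub import InferenceClient

    # provider is not used here: requests go straight to the configured endpoint.
    client = InferenceClient(base_url="http://localhost:8080")  # placeholder endpoint
    out = client.chat_completion(messages=[{"role": "user", "content": "Hello!"}])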

src/huggingface_hub/inference/_generated/_async_client.py

Lines changed: 1 addition & 1 deletion

@@ -120,7 +120,7 @@ class AsyncInferenceClient:
             path will be appended to the base URL (see the [TGI Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api)
             documentation for details). When passing a URL as `model`, the client will not append any suffix path to it.
         provider (`str`, *optional*):
-            Name of the provider to use for inference. Can be `"black-forest-labs"`, `"fal-ai"`, `"fireworks-ai"`, `"hf-inference"`, `"hyperbolic"`, `"nebius"`, `"novita"`, `"replicate"`, `"sambanova"` or `"together"`.
+            Name of the provider to use for inference. Can be `"black-forest-labs"`, `"cohere"`, `"fal-ai"`, `"fireworks-ai"`, `"hf-inference"`, `"hyperbolic"`, `"nebius"`, `"novita"`, `"replicate"`, `"sambanova"` or `"together"`.
             defaults to hf-inference (Hugging Face Serverless Inference API).
             If model is a URL or `base_url` is passed, then `provider` is not used.
         token (`str` or `bool`, *optional*):
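The async client gets the same provider option; a minimal sketch of streaming a Cohere-backed completion with it (the token placeholder and model ID are assumptions based on the tests and cassettes below):

    import asyncio

    from huggingface_hub import AsyncInferenceClient

    async def main() -> None:
        client = AsyncInferenceClient(provider="cohere", api_key="hf_***")  # placeholder token
        # With stream=True, chunks are yielded as the provider produces them.
        stream = await client.chat_completion(
            model="CohereForAI/c4ai-command-r7b-12-2024",
            messages=[{"role": "user", "content": "What is deep learning?"}],
            max_tokens=20,
            stream=True,
        )
        async for chunk in stream:
            print(chunk.choices[0].delta.content or "", end="")

    asyncio.run(main())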

src/huggingface_hub/inference/_providers/__init__.py

Lines changed: 5 additions & 0 deletions

@@ -2,6 +2,7 @@
 
 from ._common import TaskProviderHelper
 from .black_forest_labs import BlackForestLabsTextToImageTask
+from .cohere import CohereConversationalTask
 from .fal_ai import (
     FalAIAutomaticSpeechRecognitionTask,
     FalAITextToImageTask,
@@ -20,6 +21,7 @@
 
 PROVIDER_T = Literal[
     "black-forest-labs",
+    "cohere",
     "fal-ai",
     "fireworks-ai",
     "hf-inference",
@@ -35,6 +37,9 @@
     "black-forest-labs": {
         "text-to-image": BlackForestLabsTextToImageTask(),
     },
+    "cohere": {
+        "conversational": CohereConversationalTask(),
+    },
     "fal-ai": {
         "automatic-speech-recognition": FalAIAutomaticSpeechRecognitionTask(),
         "text-to-image": FalAITextToImageTask(),

src/huggingface_hub/inference/_providers/_common.py

Lines changed: 1 addition & 0 deletions

@@ -17,6 +17,7 @@
     #
     # Example:
     # "Qwen/Qwen2.5-Coder-32B-Instruct": "Qwen2.5-Coder-32B-Instruct",
+    "cohere": {},
     "fal-ai": {},
     "fireworks-ai": {},
     "hf-inference": {},
src/huggingface_hub/inference/_providers/cohere.py

Lines changed: 15 additions & 0 deletions

@@ -0,0 +1,15 @@
+from huggingface_hub.inference._providers._common import (
+    BaseConversationalTask,
+)
+
+
+_PROVIDER = "cohere"
+_BASE_URL = "https://api.cohere.com"
+
+
+class CohereConversationalTask(BaseConversationalTask):
+    def __init__(self):
+        super().__init__(provider=_PROVIDER, base_url=_BASE_URL)
+
+    def _prepare_route(self, mapped_model: str) -> str:
+        return "/compatibility/v1/chat/completions"
New test cassette: recorded Cohere chat completion (non-streaming)

Lines changed: 102 additions & 0 deletions

@@ -0,0 +1,102 @@
+interactions:
+- request:
+    body: '{"messages": [{"role": "system", "content": "You are a helpful assistant."},
+      {"role": "user", "content": "What is deep learning?"}], "model": "command-r7b-12-2024",
+      "stream": false}'
+    headers:
+      Accept:
+      - '*/*'
+      Accept-Encoding:
+      - gzip, deflate
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '181'
+      Content-Type:
+      - application/json
+      X-Amzn-Trace-Id:
+      - 204391c6-92c8-4214-a394-04b025f3e86a
+    method: POST
+    uri: https://api.cohere.com/compatibility/v1/chat/completions
+  response:
+    body:
+      string: '{"id":"3b5751bb-10a2-4fc8-95a0-d1e6cfa788b3","choices":[{"index":0,"finish_reason":"stop","message":{"role":"assistant","content":"Deep
+        learning is a subfield of machine learning and artificial intelligence that
+        focuses on training artificial neural networks to learn and make predictions
+        from data. It is inspired by the structure and function of the human brain,
+        particularly the interconnected network of neurons.\n\nIn deep learning, artificial
+        neural networks are composed of multiple layers of interconnected nodes, or
+        \"neurons,\" which process and transform input data. These networks are designed
+        to automatically learn and extract hierarchical representations of data through
+        a process called \"training.\" The training process involves adjusting the
+        network''s internal parameters (weights and biases) to minimize the difference
+        between predicted and actual outputs.\n\nHere are some key characteristics
+        and concepts in deep learning:\n\n1. Neural Networks: Deep learning models
+        are primarily based on artificial neural networks, which are composed of layers
+        of nodes. These networks can have various architectures, such as convolutional
+        neural networks (CNNs) for image processing, recurrent neural networks (RNNs)
+        for sequential data, and transformer networks for natural language processing.\n\n2.
+        Deep Architecture: The term \"deep\" in deep learning refers to the depth
+        of the neural network, meaning it has multiple hidden layers between the input
+        and output layers. These hidden layers enable the network to learn complex
+        patterns and representations from the data.\n\n3. Learning and Training: Deep
+        learning models are trained using large amounts of labeled data and a process
+        called backpropagation. During training, the network adjusts its internal
+        parameters to minimize a loss function, which measures the difference between
+        predicted and actual outputs. This optimization process is typically done
+        using gradient descent or its variants.\n\n4. Feature Learning: One of the
+        key advantages of deep learning is its ability to automatically learn relevant
+        features from raw data. Unlike traditional machine learning, where feature
+        engineering is required, deep learning models can discover and extract features
+        at multiple levels of abstraction.\n\n5. Applications: Deep learning has been
+        applied to a wide range of tasks and domains, including image and speech recognition,
+        natural language processing, object detection, medical diagnosis, game playing
+        (e.g., AlphaGo), and autonomous driving.\n\nDeep learning has revolutionized
+        many areas of artificial intelligence due to its ability to handle complex
+        and large-scale data, learn hierarchical representations, and achieve state-of-the-art
+        performance in various tasks. It has driven significant advancements in areas
+        like computer vision, natural language understanding, and speech recognition."}}],"created":1740653732,"model":"command-r7b-12-2024","object":"chat.completion","usage":{"prompt_tokens":11,"completion_tokens":476,"total_tokens":487}}'
+    headers:
+      Alt-Svc:
+      - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000
+      Transfer-Encoding:
+      - chunked
+      Via:
+      - 1.1 google
+      access-control-expose-headers:
+      - X-Debug-Trace-ID
+      cache-control:
+      - no-cache, no-store, no-transform, must-revalidate, private, max-age=0
+      content-type:
+      - application/json
+      date:
+      - Thu, 27 Feb 2025 10:55:32 GMT
+      expires:
+      - Thu, 01 Jan 1970 00:00:00 UTC
+      num_chars:
+      - '2831'
+      num_tokens:
+      - '487'
+      pragma:
+      - no-cache
+      server:
+      - envoy
+      vary:
+      - Origin
+      x-accel-expires:
+      - '0'
+      x-api-warning:
+      - Please set an API version, for more information please refer to https://docs.cohere.com/versioning-reference
+      - Version is deprecated, for more information please refer to https://docs.cohere.com/versioning-reference
+      x-debug-trace-id:
+      - 430c1e5519b95b094771bcc36304445e
+      x-envoy-upstream-service-time:
+      - '2740'
+      x-trial-endpoint-call-limit:
+      - '100'
+      x-trial-endpoint-call-remaining:
+      - '99'
+    status:
+      code: 200
+      message: OK
+version: 1
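For reference, the interaction recorded above can be reproduced against the compatibility endpoint directly; a minimal sketch with `requests` (the cassette omits credentials, so the bearer-token header below is an assumption):

    import requests

    # Replay of the non-streaming request captured in the cassette above.
    resp = requests.post(
        "https://api.cohere.com/compatibility/v1/chat/completions",
        headers={
            "Content-Type": "application/json",
            "Authorization": "Bearer <COHERE_API_KEY>",  # assumed; not part of the recording
        },
        json={
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "What is deep learning?"},
            ],
            "model": "command-r7b-12-2024",
            "stream": False,
        },
        timeout=30,
    )
    print(resp.json()["choices"][0]["message"]["content"])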
New test cassette: recorded Cohere chat completion (streaming)

Lines changed: 146 additions & 0 deletions

@@ -0,0 +1,146 @@
+interactions:
+- request:
+    body: '{"messages": [{"role": "system", "content": "You are a helpful assistant."},
+      {"role": "user", "content": "What is deep learning?"}], "model": "command-r7b-12-2024",
+      "max_tokens": 20, "stream": true}'
+    headers:
+      Accept:
+      - '*/*'
+      Accept-Encoding:
+      - gzip, deflate
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '198'
+      Content-Type:
+      - application/json
+      X-Amzn-Trace-Id:
+      - 68c492d9-abbd-4d0a-8462-e598765021e4
+    method: POST
+    uri: https://api.cohere.com/compatibility/v1/chat/completions
+  response:
+    body:
+      string: 'data: {"id":"2bb1b33e-53d9-4fae-8958-2e54c1e60f09","choices":[{"index":0,"finish_reason":null,"delta":{"content":"","role":"assistant"}}],"created":1740653733,"model":"command-r7b-12-2024","object":"chat.completion.chunk"}
+
+
+        data: {"id":"2bb1b33e-53d9-4fae-8958-2e54c1e60f09","choices":[{"index":0,"finish_reason":null,"delta":{"content":"Deep"}}],"created":1740653733,"model":"command-r7b-12-2024","object":"chat.completion.chunk"}
+
+
+        data: {"id":"2bb1b33e-53d9-4fae-8958-2e54c1e60f09","choices":[{"index":0,"finish_reason":null,"delta":{"content":"
+        learning"}}],"created":1740653733,"model":"command-r7b-12-2024","object":"chat.completion.chunk"}
+
+
+        data: {"id":"2bb1b33e-53d9-4fae-8958-2e54c1e60f09","choices":[{"index":0,"finish_reason":null,"delta":{"content":"
+        is"}}],"created":1740653733,"model":"command-r7b-12-2024","object":"chat.completion.chunk"}
+
+
+        data: {"id":"2bb1b33e-53d9-4fae-8958-2e54c1e60f09","choices":[{"index":0,"finish_reason":null,"delta":{"content":"
+        a"}}],"created":1740653733,"model":"command-r7b-12-2024","object":"chat.completion.chunk"}
+
+
+        data: {"id":"2bb1b33e-53d9-4fae-8958-2e54c1e60f09","choices":[{"index":0,"finish_reason":null,"delta":{"content":"
+        sub"}}],"created":1740653733,"model":"command-r7b-12-2024","object":"chat.completion.chunk"}
+
+
+        data: {"id":"2bb1b33e-53d9-4fae-8958-2e54c1e60f09","choices":[{"index":0,"finish_reason":null,"delta":{"content":"field"}}],"created":1740653733,"model":"command-r7b-12-2024","object":"chat.completion.chunk"}
+
+
+        data: {"id":"2bb1b33e-53d9-4fae-8958-2e54c1e60f09","choices":[{"index":0,"finish_reason":null,"delta":{"content":"
+        of"}}],"created":1740653733,"model":"command-r7b-12-2024","object":"chat.completion.chunk"}
+
+
+        data: {"id":"2bb1b33e-53d9-4fae-8958-2e54c1e60f09","choices":[{"index":0,"finish_reason":null,"delta":{"content":"
+        machine"}}],"created":1740653733,"model":"command-r7b-12-2024","object":"chat.completion.chunk"}
+
+
+        data: {"id":"2bb1b33e-53d9-4fae-8958-2e54c1e60f09","choices":[{"index":0,"finish_reason":null,"delta":{"content":"
+        learning"}}],"created":1740653733,"model":"command-r7b-12-2024","object":"chat.completion.chunk"}
+
+
+        data: {"id":"2bb1b33e-53d9-4fae-8958-2e54c1e60f09","choices":[{"index":0,"finish_reason":null,"delta":{"content":"
+        and"}}],"created":1740653733,"model":"command-r7b-12-2024","object":"chat.completion.chunk"}
+
+
+        data: {"id":"2bb1b33e-53d9-4fae-8958-2e54c1e60f09","choices":[{"index":0,"finish_reason":null,"delta":{"content":"
+        artificial"}}],"created":1740653733,"model":"command-r7b-12-2024","object":"chat.completion.chunk"}
+
+
+        data: {"id":"2bb1b33e-53d9-4fae-8958-2e54c1e60f09","choices":[{"index":0,"finish_reason":null,"delta":{"content":"
+        intelligence"}}],"created":1740653733,"model":"command-r7b-12-2024","object":"chat.completion.chunk"}
+
+
+        data: {"id":"2bb1b33e-53d9-4fae-8958-2e54c1e60f09","choices":[{"index":0,"finish_reason":null,"delta":{"content":"
+        that"}}],"created":1740653733,"model":"command-r7b-12-2024","object":"chat.completion.chunk"}
+
+
+        data: {"id":"2bb1b33e-53d9-4fae-8958-2e54c1e60f09","choices":[{"index":0,"finish_reason":null,"delta":{"content":"
+        focuses"}}],"created":1740653733,"model":"command-r7b-12-2024","object":"chat.completion.chunk"}
+
+
+        data: {"id":"2bb1b33e-53d9-4fae-8958-2e54c1e60f09","choices":[{"index":0,"finish_reason":null,"delta":{"content":"
+        on"}}],"created":1740653733,"model":"command-r7b-12-2024","object":"chat.completion.chunk"}
+
+
+        data: {"id":"2bb1b33e-53d9-4fae-8958-2e54c1e60f09","choices":[{"index":0,"finish_reason":null,"delta":{"content":"
+        training"}}],"created":1740653733,"model":"command-r7b-12-2024","object":"chat.completion.chunk"}
+
+
+        data: {"id":"2bb1b33e-53d9-4fae-8958-2e54c1e60f09","choices":[{"index":0,"finish_reason":null,"delta":{"content":"
+        artificial"}}],"created":1740653733,"model":"command-r7b-12-2024","object":"chat.completion.chunk"}
+
+
+        data: {"id":"2bb1b33e-53d9-4fae-8958-2e54c1e60f09","choices":[{"index":0,"finish_reason":null,"delta":{"content":"
+        neural"}}],"created":1740653733,"model":"command-r7b-12-2024","object":"chat.completion.chunk"}
+
+
+        data: {"id":"2bb1b33e-53d9-4fae-8958-2e54c1e60f09","choices":[{"index":0,"finish_reason":null,"delta":{"content":"
+        networks"}}],"created":1740653733,"model":"command-r7b-12-2024","object":"chat.completion.chunk"}
+
+
+        data: {"id":"2bb1b33e-53d9-4fae-8958-2e54c1e60f09","choices":[{"index":0,"finish_reason":"length","delta":{}}],"created":1740653733,"model":"command-r7b-12-2024","object":"chat.completion.chunk","usage":{"prompt_tokens":11,"completion_tokens":19,"total_tokens":30}}
+
+
+        data: [DONE]
+
+
+        '
+    headers:
+      Alt-Svc:
+      - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000
+      Transfer-Encoding:
+      - chunked
+      Via:
+      - 1.1 google
+      access-control-expose-headers:
+      - X-Debug-Trace-ID
+      cache-control:
+      - no-cache, no-store, no-transform, must-revalidate, private, max-age=0
+      content-type:
+      - text/event-stream
+      date:
+      - Thu, 27 Feb 2025 10:55:33 GMT
+      expires:
+      - Thu, 01 Jan 1970 00:00:00 UTC
+      pragma:
+      - no-cache
+      server:
+      - envoy
+      vary:
+      - Origin
+      x-accel-expires:
+      - '0'
+      x-api-warning:
+      - Please set an API version, for more information please refer to https://docs.cohere.com/versioning-reference
+      - Version is deprecated, for more information please refer to https://docs.cohere.com/versioning-reference
+      x-debug-trace-id:
+      - 4bc0ce4bda5305b5b60ef6268db5e3a7
+      x-envoy-upstream-service-time:
+      - '88'
+      x-trial-endpoint-call-limit:
+      - '100'
+      x-trial-endpoint-call-remaining:
+      - '98'
+    status:
+      code: 200
+      message: OK
+version: 1
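The streamed body is a series of `data:` server-sent-event lines terminated by `data: [DONE]`; a minimal sketch of consuming such a stream (again assuming a bearer token, which the cassette does not record):

    import json

    import requests

    # Replay of the streaming request captured above, printing tokens as they arrive.
    with requests.post(
        "https://api.cohere.com/compatibility/v1/chat/completions",
        headers={"Authorization": "Bearer <COHERE_API_KEY>"},  # assumed; not part of the recording
        json={
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "What is deep learning?"},
            ],
            "model": "command-r7b-12-2024",
            "max_tokens": 20,
            "stream": True,
        },
        stream=True,
        timeout=30,
    ) as resp:
        for line in resp.iter_lines():
            if not line or not line.startswith(b"data: "):
                continue
            payload = line[len(b"data: "):]
            if payload == b"[DONE]":
                break
            chunk = json.loads(payload)
            print(chunk["choices"][0]["delta"].get("content") or "", end="")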

tests/test_inference_client.py

Lines changed: 3 additions & 0 deletions

@@ -63,6 +63,9 @@
     "black-forest-labs": {
         "text-to-image": "black-forest-labs/FLUX.1-dev",
     },
+    "cohere": {
+        "conversational": "CohereForAI/c4ai-command-r7b-12-2024",
+    },
     "together": {
         "conversational": "meta-llama/Meta-Llama-3-8B-Instruct",
         "text-generation": "meta-llama/Llama-2-70b-hf",

tests/test_inference_providers.py

Lines changed: 19 additions & 0 deletions

@@ -9,6 +9,7 @@
     recursive_merge,
 )
 from huggingface_hub.inference._providers.black_forest_labs import BlackForestLabsTextToImageTask
+from huggingface_hub.inference._providers.cohere import CohereConversationalTask
 from huggingface_hub.inference._providers.fal_ai import (
     FalAIAutomaticSpeechRecognitionTask,
     FalAITextToImageTask,
@@ -110,6 +111,24 @@ def test_get_response_success(self, mocker):
         )
 
 
+class TestCohereConversationalTask:
+    def test_prepare_url(self):
+        helper = CohereConversationalTask()
+        assert helper.task == "conversational"
+        url = helper._prepare_url("cohere_token", "username/repo_name")
+        assert url == "https://api.cohere.com/compatibility/v1/chat/completions"
+
+    def test_prepare_payload_as_dict(self):
+        helper = CohereConversationalTask()
+        payload = helper._prepare_payload_as_dict(
+            [{"role": "user", "content": "Hello!"}], {}, "CohereForAI/command-r7b-12-2024"
+        )
+        assert payload == {
+            "messages": [{"role": "user", "content": "Hello!"}],
+            "model": "CohereForAI/command-r7b-12-2024",
+        }
+
+
 class TestFalAIProvider:
     def test_prepare_headers_fal_ai_key(self):
         """When using direct call, must use Key authorization."""
