Skip to content

Commit 0b15112

Browse files
hanouticelina, Wauplin, and github-actions[bot]
authored and committed
[Inference] Correctly build chat completion URL with query parameters (huggingface#3200)
* fix base url parsing

* add comments

* add test case

* Apply suggestions from code review

Co-authored-by: Lucain <[email protected]>

* Apply style fixes

---------

Co-authored-by: Lucain <[email protected]>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent 018156f commit 0b15112

File tree

4 files changed

+39
-14
lines changed

4 files changed

+39
-14
lines changed

src/huggingface_hub/inference/_client.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -130,9 +130,7 @@ class InferenceClient:
130130
or a URL to a deployed Inference Endpoint. Defaults to None, in which case a recommended model is
131131
automatically selected for the task.
132132
Note: for better compatibility with OpenAI's client, `model` has been aliased as `base_url`. Those 2
133-
arguments are mutually exclusive. If using `base_url` for chat completion, the `/chat/completions` suffix
134-
path will be appended to the base URL (see the [TGI Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api)
135-
documentation for details). When passing a URL as `model`, the client will not append any suffix path to it.
133+
arguments are mutually exclusive. If a URL is passed as `model` or `base_url` for chat completion, the `(/v1)/chat/completions` suffix path will be appended to the URL.
136134
provider (`str`, *optional*):
137135
Name of the provider to use for inference. Can be `"black-forest-labs"`, `"cerebras"`, `"cohere"`, `"fal-ai"`, `"featherless-ai"`, `"fireworks-ai"`, `"groq"`, `"hf-inference"`, `"hyperbolic"`, `"nebius"`, `"novita"`, `"nscale"`, `"openai"`, `"replicate"`, `"sambanova"`, `"swarmind"` or `"together"`.
138136
Defaults to "auto" i.e. the first of the providers available for the model, sorted by the user's order in https://hf.co/settings/inference-providers.

src/huggingface_hub/inference/_generated/_async_client.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -118,9 +118,7 @@ class AsyncInferenceClient:
118118
or a URL to a deployed Inference Endpoint. Defaults to None, in which case a recommended model is
119119
automatically selected for the task.
120120
Note: for better compatibility with OpenAI's client, `model` has been aliased as `base_url`. Those 2
121-
arguments are mutually exclusive. If using `base_url` for chat completion, the `/chat/completions` suffix
122-
path will be appended to the base URL (see the [TGI Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api)
123-
documentation for details). When passing a URL as `model`, the client will not append any suffix path to it.
121+
arguments are mutually exclusive. If a URL is passed as `model` or `base_url` for chat completion, the `(/v1)/chat/completions` suffix path will be appended to the URL.
124122
provider (`str`, *optional*):
125123
Name of the provider to use for inference. Can be `"black-forest-labs"`, `"cerebras"`, `"cohere"`, `"fal-ai"`, `"featherless-ai"`, `"fireworks-ai"`, `"groq"`, `"hf-inference"`, `"hyperbolic"`, `"nebius"`, `"novita"`, `"nscale"`, `"openai"`, `"replicate"`, `"sambanova"` or `"together"`.
126124
Defaults to "auto" i.e. the first of the providers available for the model, sorted by the user's order in https://hf.co/settings/inference-providers.

src/huggingface_hub/inference/_providers/hf_inference.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from functools import lru_cache
33
from pathlib import Path
44
from typing import Any, Dict, Optional, Union
5+
from urllib.parse import urlparse, urlunparse
56

67
from huggingface_hub import constants
78
from huggingface_hub.hf_api import InferenceProviderMapping
@@ -125,18 +126,25 @@ def _prepare_url(self, api_key: str, mapped_model: str) -> str:
125126

126127

127128
def _build_chat_completion_url(model_url: str) -> str:
128-
# Strip trailing /
129-
model_url = model_url.rstrip("/")
129+
parsed = urlparse(model_url)
130+
path = parsed.path.rstrip("/")
130131

131-
# Append /chat/completions if not already present
132-
if model_url.endswith("/v1"):
133-
model_url += "/chat/completions"
132+
# If the path already ends with /chat/completions, we're done!
133+
if path.endswith("/chat/completions"):
134+
return model_url
134135

136+
# Append /chat/completions if not already present
137+
if path.endswith("/v1"):
138+
new_path = path + "/chat/completions"
139+
# If path was empty or just "/", set the full path
140+
elif not path:
141+
new_path = "/v1/chat/completions"
135142
# Append /v1/chat/completions if not already present
136-
if not model_url.endswith("/chat/completions"):
137-
model_url += "/v1/chat/completions"
143+
else:
144+
new_path = path + "/v1/chat/completions"
138145

139-
return model_url
146+
# Reconstruct the URL with the new path and original query parameters.
147+
return urlunparse(parsed._replace(path=new_path))
140148

141149

142150
@lru_cache(maxsize=1)

tests/test_inference_client.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1034,6 +1034,27 @@ def test_chat_completion_error_in_stream():
10341034
f"{LOCAL_TGI_URL}/v1",
10351035
f"{LOCAL_TGI_URL}/v1/chat/completions",
10361036
),
1037+
# With query parameters
1038+
(
1039+
f"{INFERENCE_ENDPOINT_URL}/v1/chat/completions?api-version=1",
1040+
f"{INFERENCE_ENDPOINT_URL}/v1/chat/completions?api-version=1",
1041+
),
1042+
(
1043+
f"{INFERENCE_ENDPOINT_URL}/chat/completions?api-version=1",
1044+
f"{INFERENCE_ENDPOINT_URL}/chat/completions?api-version=1",
1045+
),
1046+
(
1047+
f"{INFERENCE_ENDPOINT_URL}?api-version=1",
1048+
f"{INFERENCE_ENDPOINT_URL}/v1/chat/completions?api-version=1",
1049+
),
1050+
(
1051+
f"{INFERENCE_ENDPOINT_URL}/v1?api-version=1",
1052+
f"{INFERENCE_ENDPOINT_URL}/v1/chat/completions?api-version=1",
1053+
),
1054+
(
1055+
f"{INFERENCE_ENDPOINT_URL}/?api-version=1",
1056+
f"{INFERENCE_ENDPOINT_URL}/v1/chat/completions?api-version=1",
1057+
),
10371058
],
10381059
)
10391060
def test_resolve_chat_completion_url(model_url: str, expected_url: str):

0 commit comments

Comments
 (0)