
Commit b6771b3

fix chat-completion and image-text-to-text docs
1 parent 7cf2959

6 files changed (+213, -138 lines)

docs/api-inference/tasks/chat-completion.md

Lines changed: 96 additions & 7 deletions
@@ -14,20 +14,20 @@ For more details, check out:
 
 ## Chat Completion
 
-Generate a response given a list of messages.
-This is a subtask of [`text-generation`](./text_generation) designed to generate responses in a conversational context.
-
-
+Generate a response given a list of messages in a conversational context, supporting both conversational Language Models (LLMs) and conversational Vision-Language Models (VLMs).
+This is a subtask of [`text-generation`](./text_generation) and [`image-text-to-text`](./image_text_to_text).
 
 ### Recommended models
 
+#### Conversational Large Language Models (LLMs)
 - [google/gemma-2-2b-it](https://huggingface.co/google/gemma-2-2b-it): A text-generation model trained to follow instructions.
 - [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct): Very powerful text generation model trained to follow instructions.
 - [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct): Small yet powerful text generation model.
 - [HuggingFaceH4/starchat2-15b-v0.1](https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1): Strong coding assistant model.
 - [mistralai/Mistral-Nemo-Instruct-2407](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407): Very strong open-source large language model.
 
-
+#### Conversational Vision-Language Models (VLMs)
+- [microsoft/Phi-3.5-vision-instruct](https://huggingface.co/microsoft/Phi-3.5-vision-instruct): Strong image-text-to-text model.
 
 ### Using the API
 
@@ -37,6 +37,8 @@ The API supports:
 * Using grammars, constraints, and tools.
 * Streaming the output
 
+#### Code snippet example for conversational LLMs
+
 
 <inferencesnippet>
 
@@ -70,7 +72,7 @@ for message in client.chat_completion(
     print(message.choices[0].delta.content, end="")
 ```
 
-To use the Python client, see `huggingface_hub`'s [package reference](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion).
+To use the Python client, see `huggingface_hub`'s [package reference](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.conversational_text-generation).
 </python>
 
 <js>
@@ -88,7 +90,94 @@ for await (const chunk of inference.chatCompletionStream({
 }
 ```
 
-To use the JavaScript client, see `huggingface.js`'s [package reference](https://huggingface.co/docs/huggingface.js/inference/classes/HfInference#chatcompletion).
+To use the JavaScript client, see `huggingface.js`'s [package reference](https://huggingface.co/docs/huggingface.js/inference/classes/HfInference#conversationaltext-generation).
+</js>
+
+</inferencesnippet>
+
+
+
+#### Code snippet example for conversational VLMs
+
+
+<inferencesnippet>
+
+<curl>
+```bash
+curl 'https://api-inference.huggingface.co/models/microsoft/Phi-3.5-vision-instruct/v1/chat/completions' \
+-H "Authorization: Bearer hf_***" \
+-H 'Content-Type: application/json' \
+-d '{
+    "model": "microsoft/Phi-3.5-vision-instruct",
+    "messages": [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"}},
+                {"type": "text", "text": "Describe this image in one sentence."}
+            ]
+        }
+    ],
+    "max_tokens": 500,
+    "stream": false
+}'
+
+```
+</curl>
+
+<python>
+```py
+from huggingface_hub import InferenceClient
+
+client = InferenceClient(api_key="hf_***")
+
+image_url = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+
+for message in client.chat_completion(
+    model="microsoft/Phi-3.5-vision-instruct",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": image_url}},
+                {"type": "text", "text": "Describe this image in one sentence."},
+            ],
+        }
+    ],
+    max_tokens=500,
+    stream=True,
+):
+    print(message.choices[0].delta.content, end="")
+```
+
+To use the Python client, see `huggingface_hub`'s [package reference](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.conversational_image-text-to-text).
+</python>
+
+<js>
+```js
+import { HfInference } from "@huggingface/inference";
+
+const inference = new HfInference("hf_***");
+const imageUrl = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg";
+
+for await (const chunk of inference.chatCompletionStream({
+    model: "microsoft/Phi-3.5-vision-instruct",
+    messages: [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": imageUrl}},
+                {"type": "text", "text": "Describe this image in one sentence."},
+            ],
+        }
+    ],
+    max_tokens: 500,
+})) {
+    process.stdout.write(chunk.choices[0]?.delta?.content || "");
+}
+```
+
+To use the JavaScript client, see `huggingface.js`'s [package reference](https://huggingface.co/docs/huggingface.js/inference/classes/HfInference#conversationalimage-text-to-text).
 </js>
 
 </inferencesnippet>
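The VLM snippets above all stream the response. With the same `huggingface_hub` client, a non-streaming call returns one complete message object instead of delta chunks; a minimal sketch, keeping the placeholder token `hf_***` and the model from the diff:

```py
from huggingface_hub import InferenceClient

client = InferenceClient(api_key="hf_***")  # placeholder token, as in the diff

# Same request as the added VLM snippet, but with stream=False the client
# returns a single completion object rather than a stream of deltas.
response = client.chat_completion(
    model="microsoft/Phi-3.5-vision-instruct",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"}},
                {"type": "text", "text": "Describe this image in one sentence."},
            ],
        }
    ],
    max_tokens=500,
    stream=False,
)

# The full generated text lives on the message, not on a delta.
print(response.choices[0].message.content)
```

With `stream=True`, as in the diff, the same content arrives incrementally on `choices[0].delta.content`.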

docs/api-inference/tasks/image-text-to-text.md

Lines changed: 1 addition & 16 deletions
@@ -109,21 +109,6 @@ To use the JavaScript client, see `huggingface.js`'s [package reference](https:/
 
 ### API specification
 
-#### Request
-
-
-
-Some options can be configured by passing headers to the Inference API. Here are the available headers:
-
-| Headers | | |
-| :--- | :--- | :--- |
-| **authorization** | _string_ | Authentication header in the form `'Bearer: hf_****'` when `hf_****` is a personal user access token with Inference API permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens). |
-| **x-use-cache** | _boolean, default to `true`_ | There is a cache layer on the inference API to speed up requests we have already seen. Most models can use those results as they are deterministic (meaning the outputs will be the same anyway). However, if you use a nondeterministic model, you can set this parameter to prevent the caching mechanism from being used, resulting in a real new query. Read more about caching [here](../parameters#caching). |
-| **x-wait-for-model** | _boolean, default to `false`_ | If the model is not ready, wait for it instead of receiving 503. It limits the number of requests required to get your inference done. It is advised to only set this flag to true after receiving a 503 error, as it will limit hanging in your application to known places. Read more about model availability [here](../overview#eligibility). |
-
-For more information about Inference API headers, check out the parameters [guide](../parameters).
-
-#### Response
-
+For the API specification of conversational image-text-to-text models, please refer to the [Chat Completion API documentation](https://huggingface.co/docs/api-inference/tasks/chat-completion#api-specification).
 
 
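The headers documented in the removed table (`authorization`, `x-use-cache`, `x-wait-for-model`) still apply to raw HTTP calls; the commit only moves their documentation to the parameters guide. A minimal sketch of passing them with `requests`, assuming a placeholder model URL, token, and payload:

```py
import requests

# Placeholder endpoint and token; any Inference API model URL works the same way.
API_URL = "https://api-inference.huggingface.co/models/microsoft/Phi-3.5-vision-instruct"
headers = {
    "Authorization": "Bearer hf_***",   # personal user access token
    "x-use-cache": "false",             # bypass the shared cache layer
    "x-wait-for-model": "true",         # wait for a cold model instead of getting a 503
}

response = requests.post(API_URL, headers=headers, json={"inputs": "..."})
print(response.status_code)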
docs/api-inference/tasks/text-generation.md

Lines changed: 35 additions & 31 deletions
@@ -42,50 +42,54 @@ This is only a subset of the supported models. Find the model that suits you bes
 
 <curl>
 ```bash
-curl 'https://api-inference.huggingface.co/models/google/gemma-2-2b-it/v1/chat/completions' \
--H "Authorization: Bearer hf_***" \
--H 'Content-Type: application/json' \
--d '{
-    "model": "google/gemma-2-2b-it",
-    "messages": [{"role": "user", "content": "What is the capital of France?"}],
-    "max_tokens": 500,
-    "stream": false
-}'
-
+curl https://api-inference.huggingface.co/models/google/gemma-2-2b-it \
+    -X POST \
+    -d '{"inputs": "Can you please let us know more details about your "}' \
+    -H 'Content-Type: application/json' \
+    -H "Authorization: Bearer hf_***"
 ```
 </curl>
 
 <python>
 ```py
-from huggingface_hub import InferenceClient
-
-client = InferenceClient(api_key="hf_***")
-
-for message in client.chat_completion(
-    model="google/gemma-2-2b-it",
-    messages=[{"role": "user", "content": "What is the capital of France?"}],
-    max_tokens=500,
-    stream=True,
-):
-    print(message.choices[0].delta.content, end="")
+import requests
+
+API_URL = "https://api-inference.huggingface.co/models/google/gemma-2-2b-it"
+headers = {"Authorization": "Bearer hf_***"}
+
+def query(payload):
+    response = requests.post(API_URL, headers=headers, json=payload)
+    return response.json()
+
+output = query({
+    "inputs": "Can you please let us know more details about your ",
+})
 ```
 
 To use the Python client, see `huggingface_hub`'s [package reference](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.text_generation).
 </python>
 
 <js>
 ```js
-import { HfInference } from "@huggingface/inference";
-
-const inference = new HfInference("hf_***");
-
-for await (const chunk of inference.chatCompletionStream({
-    model: "google/gemma-2-2b-it",
-    messages: [{ role: "user", content: "What is the capital of France?" }],
-    max_tokens: 500,
-})) {
-    process.stdout.write(chunk.choices[0]?.delta?.content || "");
+async function query(data) {
+    const response = await fetch(
+        "https://api-inference.huggingface.co/models/google/gemma-2-2b-it",
+        {
+            headers: {
+                Authorization: "Bearer hf_***",
+                "Content-Type": "application/json",
+            },
+            method: "POST",
+            body: JSON.stringify(data),
+        }
+    );
+    const result = await response.json();
+    return result;
 }
+
+query({"inputs": "Can you please let us know more details about your "}).then((response) => {
+    console.log(JSON.stringify(response));
+});
 ```
 
 To use the JavaScript client, see `huggingface.js`'s [package reference](https://huggingface.co/docs/huggingface.js/inference/classes/HfInference#textgeneration).
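The new snippets send a bare `inputs` payload. The same text-generation endpoint also accepts a `parameters` object for generation settings such as `max_new_tokens`; a minimal sketch extending the Python snippet above, with illustrative values that are not part of the commit:

```py
import requests

API_URL = "https://api-inference.huggingface.co/models/google/gemma-2-2b-it"
headers = {"Authorization": "Bearer hf_***"}  # placeholder token, as in the diff

payload = {
    "inputs": "Can you please let us know more details about your ",
    "parameters": {                # illustrative values, not from the commit
        "max_new_tokens": 150,     # cap the length of the generated continuation
        "temperature": 0.7,        # sample instead of greedy decoding
    },
}

response = requests.post(API_URL, headers=headers, json=payload)
print(response.json())
```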
