Add back Image Text to Text page

Wauplin · Wauplin · commit 8bdbf0e0d644 · 2025-06-25T16:43:16.000+02:00
diff --git a/docs/inference-providers/_toctree.yml b/docs/inference-providers/_toctree.yml
@@ -70,6 +70,8 @@
       title: Image Classification
     - local: tasks/image-segmentation
       title: Image Segmentation
+    - local: tasks/image-text-to-text
+      title: Image-Text to Text
     - local: tasks/image-to-image
       title: Image to Image
     - local: tasks/object-detection
diff --git a/docs/inference-providers/tasks/chat-completion.md b/docs/inference-providers/tasks/chat-completion.md
@@ -33,6 +33,8 @@ This is a subtask of [`text-generation`](https://huggingface.co/docs/inference-p
 
 - [Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct): Strong image-text-to-text model.
 
+Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=image-text-to-text&sort=trending).
+
 ### API Playground
 
 For Chat Completion models, we provide an interactive UI Playground for easier testing:
diff --git a/docs/inference-providers/tasks/image-text-to-text.md b/docs/inference-providers/tasks/image-text-to-text.md
@@ -0,0 +1,43 @@
+<!---
+This markdown file has been generated from a script. Please do not edit it directly.
+For more details, check out:
+- the `generate.ts` script: https://github.com/huggingface/hub-docs/blob/main/scripts/inference-providers/scripts/generate.ts
+- the task template defining the sections in the page: https://github.com/huggingface/hub-docs/tree/main/scripts/inference-providers/templates/task/image-text-to-text.handlebars
+- the input jsonschema specifications used to generate the input markdown table: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/tasks/image-text-to-text/spec/input.json
+- the output jsonschema specifications used to generate the output markdown table: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/tasks/image-text-to-text/spec/output.json
+- the snippets used to generate the example:
+  - curl: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/snippets/curl.ts
+  - python: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/snippets/python.ts
+  - javascript: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/snippets/js.ts
+- the "tasks" content for recommended models: https://huggingface.co/api/tasks
+--->
+
+## Image-Text to Text
+
+Image-text-to-text models take in an image and text prompt and output text. These models are also called vision-language models, or VLMs. The difference from image-to-text models is that these models take an additional text input, not restricting the model to certain use cases like image captioning, and may also be trained to accept a conversation as input.
+
+<Tip>
+
+For more details about the `image-text-to-text` task, check out its [dedicated page](https://huggingface.co/tasks/image-text-to-text)! You will find examples and related materials.
+
+</Tip>
+
+### Recommended models
+
+- [Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct): Strong image-text-to-text model.
+
+Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=image-text-to-text&sort=trending).
+
+### Using the API
+
+
+<InferenceSnippet
+    pipeline=image-text-to-text
+    providersMapping={ {"hf-inference":{"modelId":"google/gemma-3-27b-it","providerModelId":"google/gemma-3-27b-it"},"hyperbolic":{"modelId":"Qwen/Qwen2.5-VL-7B-Instruct","providerModelId":"Qwen/Qwen2.5-VL-7B-Instruct"}} }
+/>
+
+
+
+### API specification
+
+For the API specification of conversational image-text-to-text models, please refer to the [Chat Completion API documentation](https://huggingface.co/docs/inference-providers/tasks/chat-completion#api-specification).
diff --git a/scripts/inference-providers/scripts/generate.ts b/scripts/inference-providers/scripts/generate.ts
@@ -757,24 +757,21 @@ Object.entries(PER_PROVIDER_TASKS).forEach(([provider, tasks]) => {
 async function renderTemplate(
   templateName: string,
   namespace: string,
-  data: JsonObject,
+  data: JsonObject
 ): Promise<string> {
   console.log(`🎨  Rendering ${templateName} (${namespace})`);
   const template = Handlebars.compile(
-    await readTemplate(templateName, namespace),
+    await readTemplate(templateName, namespace)
   );
   return template(data);
 }
 
 await Promise.all(
   TASKS_EXTENDED.map(async (task) => {
-    if (task === "image-text-to-text") {
-      return; // not generated -> merged with chat-completion
-    }
     // @ts-ignore
     const rendered = await renderTemplate(task, "task", DATA);
     await writeTaskDoc(task, rendered);
-  }),
+  })
 );
 
 await Promise.all(
diff --git a/scripts/inference-providers/templates/task/chat-completion.handlebars b/scripts/inference-providers/templates/task/chat-completion.handlebars
@@ -17,6 +17,8 @@ This is a subtask of [`text-generation`](https://huggingface.co/docs/inference-p
 - [{{this.id}}](https://huggingface.co/{{this.id}}): {{this.description}}
 {{/each}}
 
+{{{tips.listModelsLink.image-text-to-text}}}
+
 ### API Playground
 
 For Chat Completion models, we provide an interactive UI Playground for easier testing:
diff --git a/scripts/inference-providers/templates/task/image-text-to-text.handlebars b/scripts/inference-providers/templates/task/image-text-to-text.handlebars
@@ -0,0 +1,21 @@
+## Image-Text to Text
+
+Image-text-to-text models take in an image and text prompt and output text. These models are also called vision-language models, or VLMs. The difference from image-to-text models is that these models take an additional text input, not restricting the model to certain use cases like image captioning, and may also be trained to accept a conversation as input.
+
+{{{tips.linksToTaskPage.image-text-to-text}}}
+
+### Recommended models
+
+{{#each recommendedModels.conversational-image-text-to-text}}
+- [{{this.id}}](https://huggingface.co/{{this.id}}): {{this.description}}
+{{/each}}
+
+{{{tips.listModelsLink.image-text-to-text}}}
+
+### Using the API
+
+{{{snippets.image-text-to-text}}}
+
+### API specification
+
+For the API specification of conversational image-text-to-text models, please refer to the [Chat Completion API documentation](https://huggingface.co/docs/inference-providers/tasks/chat-completion#api-specification).