
Commit a07ba86

geoHeil and dolfim-ibm authored
feat: add image-text-to-text models in transformers (#1772)
* feat(dolphin): add dolphin support
  Signed-off-by: Georg Heiler <[email protected]>
* rename
  Signed-off-by: Georg Heiler <[email protected]>
* reformat
  Signed-off-by: Georg Heiler <[email protected]>
* fix mypy
  Signed-off-by: Georg Heiler <[email protected]>
* add prompt style and examples
  Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Georg Heiler <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
Co-authored-by: Michele Dolfi <[email protected]>
1 parent e25873d commit a07ba86

File tree: 3 files changed, +77 −18 lines changed


docling/datamodel/pipeline_options_vlm_model.py

Lines changed: 7 additions & 0 deletions
```diff
@@ -31,6 +31,12 @@ class TransformersModelType(str, Enum):
     AUTOMODEL = "automodel"
     AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
     AUTOMODEL_CAUSALLM = "automodel-causallm"
+    AUTOMODEL_IMAGETEXTTOTEXT = "automodel-imagetexttotext"
+
+
+class TransformersPromptStyle(str, Enum):
+    CHAT = "chat"
+    RAW = "raw"


 class InlineVlmOptions(BaseVlmOptions):
@@ -44,6 +50,7 @@ class InlineVlmOptions(BaseVlmOptions):

     inference_framework: InferenceFramework
     transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
+    transformers_prompt_style: TransformersPromptStyle = TransformersPromptStyle.CHAT
     response_format: ResponseFormat

     torch_dtype: Optional[str] = None
```
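Taken together, the two additions are small: a new `TransformersModelType` member that selects Hugging Face's `AutoModelForImageTextToText` loader, and a new `TransformersPromptStyle` enum on `InlineVlmOptions` that defaults to `CHAT`, so existing configurations keep their behavior. A minimal sketch of the new values, runnable against this commit of docling; the printed strings are taken directly from the diff above:

```python
from docling.datamodel.pipeline_options_vlm_model import (
    TransformersModelType,
    TransformersPromptStyle,
)

# The new model type maps to transformers' AutoModelForImageTextToText loader.
print(TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT.value)  # automodel-imagetexttotext

# CHAT (the default) keeps the existing chat-template behavior;
# RAW passes the user prompt to the model verbatim.
print(TransformersPromptStyle.CHAT.value)  # chat
print(TransformersPromptStyle.RAW.value)   # raw
```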

docling/models/vlm_models_inline/hf_transformers_model.py

Lines changed: 32 additions & 17 deletions
```diff
@@ -13,6 +13,7 @@
 from docling.datamodel.pipeline_options_vlm_model import (
     InlineVlmOptions,
     TransformersModelType,
+    TransformersPromptStyle,
 )
 from docling.models.base_model import BasePageModel
 from docling.models.utils.hf_model_download import (
@@ -41,6 +42,7 @@ def __init__(
         from transformers import (
             AutoModel,
             AutoModelForCausalLM,
+            AutoModelForImageTextToText,
             AutoModelForVision2Seq,
             AutoProcessor,
             BitsAndBytesConfig,
@@ -91,6 +93,11 @@ def __init__(
                 == TransformersModelType.AUTOMODEL_VISION2SEQ
             ):
                 model_cls = AutoModelForVision2Seq
+            elif (
+                self.vlm_options.transformers_model_type
+                == TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT
+            ):
+                model_cls = AutoModelForImageTextToText

             self.processor = AutoProcessor.from_pretrained(
                 artifacts_path,
@@ -169,7 +176,10 @@ def __call__(
     def formulate_prompt(self, user_prompt: str) -> str:
         """Formulate a prompt for the VLM."""

-        if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
+        if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW:
+            return user_prompt
+
+        elif self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
             _log.debug("Using specialized prompt for Phi-4")
             # more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally

@@ -182,20 +192,25 @@ def formulate_prompt(self, user_prompt: str) -> str:

             return prompt

-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": "This is a page from a document.",
-                    },
-                    {"type": "image"},
-                    {"type": "text", "text": user_prompt},
-                ],
-            }
-        ]
-        prompt = self.processor.apply_chat_template(
-            messages, add_generation_prompt=False
+        elif self.vlm_options.transformers_prompt_style == TransformersPromptStyle.CHAT:
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "This is a page from a document.",
+                        },
+                        {"type": "image"},
+                        {"type": "text", "text": user_prompt},
+                    ],
+                }
+            ]
+            prompt = self.processor.apply_chat_template(
+                messages, add_generation_prompt=False
+            )
+            return prompt
+
+        raise RuntimeError(
+            f"Unknown prompt style `{self.vlm_options.transformers_prompt_style}`. Valid values are {', '.join(s.value for s in TransformersPromptStyle)}."
         )
-        return prompt
```
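The control flow of the reworked `formulate_prompt` is easiest to see in isolation. Below is a self-contained sketch of the same dispatch, with the Phi-4 special case omitted and `processor.apply_chat_template` replaced by a plain f-string stub so it runs without docling or transformers installed; only the `RAW` short-circuit and the trailing `RuntimeError` mirror the diff exactly:

```python
from enum import Enum


class TransformersPromptStyle(str, Enum):
    CHAT = "chat"
    RAW = "raw"


def formulate_prompt(user_prompt: str, style: TransformersPromptStyle) -> str:
    if style == TransformersPromptStyle.RAW:
        # Raw style: hand the caller's string to the model untouched,
        # e.g. Dolphin's "<s>Read text in the image. <Answer/>" prompt.
        return user_prompt
    elif style == TransformersPromptStyle.CHAT:
        # Chat style: in docling this goes through processor.apply_chat_template;
        # stubbed here for illustration only.
        return f"USER: This is a page from a document. <image> {user_prompt}"
    raise RuntimeError(f"Unknown prompt style `{style}`.")


print(formulate_prompt("<s>Read text in the image. <Answer/>", TransformersPromptStyle.RAW))
```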

docs/examples/compare_vlm_models.py

Lines changed: 38 additions & 1 deletion
```diff
@@ -14,11 +14,18 @@
 from tabulate import tabulate

 from docling.datamodel import vlm_model_specs
+from docling.datamodel.accelerator_options import AcceleratorDevice
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     VlmPipelineOptions,
 )
-from docling.datamodel.pipeline_options_vlm_model import InferenceFramework
+from docling.datamodel.pipeline_options_vlm_model import (
+    InferenceFramework,
+    InlineVlmOptions,
+    ResponseFormat,
+    TransformersModelType,
+    TransformersPromptStyle,
+)
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline

@@ -101,6 +108,33 @@ def convert(sources: list[Path], converter: DocumentConverter):
     out_path = Path("scratch")
     out_path.mkdir(parents=True, exist_ok=True)

+    ## Definition of more inline models
+    llava_qwen = InlineVlmOptions(
+        repo_id="llava-hf/llava-interleave-qwen-0.5b-hf",
+        # prompt="Read text in the image.",
+        prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+        # prompt="Parse the reading order of this document.",
+        response_format=ResponseFormat.MARKDOWN,
+        inference_framework=InferenceFramework.TRANSFORMERS,
+        transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+        supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU],
+        scale=2.0,
+        temperature=0.0,
+    )
+
+    # Note that this is not the expected way of using the Dolphin model, but it shows the usage of a raw prompt.
+    dolphin_oneshot = InlineVlmOptions(
+        repo_id="ByteDance/Dolphin",
+        prompt="<s>Read text in the image. <Answer/>",
+        response_format=ResponseFormat.MARKDOWN,
+        inference_framework=InferenceFramework.TRANSFORMERS,
+        transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+        transformers_prompt_style=TransformersPromptStyle.RAW,
+        supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU],
+        scale=2.0,
+        temperature=0.0,
+    )
+
     ## Use VlmPipeline
     pipeline_options = VlmPipelineOptions()
     pipeline_options.generate_page_images = True
@@ -121,6 +155,9 @@ def convert(sources: list[Path], converter: DocumentConverter):
         vlm_model_specs.GRANITE_VISION_TRANSFORMERS,
         vlm_model_specs.PHI4_TRANSFORMERS,
         vlm_model_specs.PIXTRAL_12B_TRANSFORMERS,
+        ## More inline models
+        dolphin_oneshot,
+        llava_qwen,
     ]

     # Remove MLX models if not on Mac
```
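For reference, a sketch of how one of the new option objects plugs into a converter on its own, following the same `DocumentConverter` wiring the example script already uses; `dolphin_oneshot` is the `InlineVlmOptions` instance from the diff above, and `input.pdf` is a placeholder path:

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Point the VLM pipeline at the raw-prompt Dolphin options defined above.
pipeline_options = VlmPipelineOptions()
pipeline_options.vlm_options = dolphin_oneshot
pipeline_options.generate_page_images = True

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)
result = converter.convert("input.pdf")  # placeholder input document
print(result.document.export_to_markdown())
```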
