Commit b8813ee
feat(vlm): Dynamic prompts (#1808)
* Unify temperature options for Vlm models

* Dynamic prompt support with example

* DCO Remediation Commit for Shkarupa Alex <[email protected]>

  I, Shkarupa Alex <[email protected]>, hereby add my Signed-off-by to this commit: 34d446c
  I, Shkarupa Alex <[email protected]>, hereby add my Signed-off-by to this commit: 9c595d5

  Signed-off-by: Shkarupa Alex <[email protected]>

* Replace Page with SegmentedPage

* Fix example HF repo link

  Signed-off-by: Christoph Auer <[email protected]>

* Sign-off

  Signed-off-by: Shkarupa Alex <[email protected]>

* DCO Remediation Commit for Shkarupa Alex <[email protected]>

  I, Shkarupa Alex <[email protected]>, hereby add my Signed-off-by to this commit: 1a16206

  Signed-off-by: Shkarupa Alex <[email protected]>
  Signed-off-by: Shkarupa Alex <[email protected]>

* Use lmstudio-community model

  Signed-off-by: Christoph Auer <[email protected]>

* Swap inference engine to LM Studio

  Signed-off-by: Shkarupa Alex <[email protected]>

---------

Signed-off-by: Shkarupa Alex <[email protected]>
Signed-off-by: Christoph Auer <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
1 parent edd4356 commit b8813ee

6 files changed: +96 −15 lines changed

docling/datamodel/pipeline_options_vlm_model.py

Lines changed: 4 additions & 3 deletions

```diff
@@ -1,6 +1,7 @@
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, Union
 
+from docling_core.types.doc.page import SegmentedPage
 from pydantic import AnyUrl, BaseModel
 from typing_extensions import deprecated
 
@@ -9,9 +10,10 @@
 
 class BaseVlmOptions(BaseModel):
     kind: str
-    prompt: str
+    prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
     scale: float = 2.0
     max_size: Optional[int] = None
+    temperature: float = 0.0
 
 
 class ResponseFormat(str, Enum):
@@ -51,7 +53,6 @@ class InlineVlmOptions(BaseVlmOptions):
         AcceleratorDevice.MPS,
     ]
 
-    temperature: float = 0.0
     stop_strings: List[str] = []
     extra_generation_config: Dict[str, Any] = {}
 
```
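With this change, `prompt` accepts either a plain string or a callable that receives the page's `Optional[SegmentedPage]` and returns the prompt text. A minimal sketch of both forms follows; the import paths mirror the docs example added in this commit, and the model name is a placeholder:

```python
from typing import Optional

from docling_core.types.doc.page import SegmentedPage

from docling.datamodel.pipeline_options import ApiVlmOptions, ResponseFormat

# Static prompt: behaves exactly as before.
static_options = ApiVlmOptions(
    url="http://localhost:1234/v1/chat/completions",
    params=dict(model="my-model"),  # placeholder model name
    prompt="Convert this page to markdown.",
    response_format=ResponseFormat.MARKDOWN,
)


# Dynamic prompt: invoked once per page with the parsed page (may be None).
def page_aware_prompt(page: Optional[SegmentedPage]) -> str:
    if page is None:
        return "Convert this page to markdown."
    w, h = int(page.dimension.width), int(page.dimension.height)
    return f"Convert this {w}x{h} page to markdown."


dynamic_options = ApiVlmOptions(
    url="http://localhost:1234/v1/chat/completions",
    params=dict(model="my-model"),  # placeholder model name
    prompt=page_aware_prompt,
    response_format=ResponseFormat.MARKDOWN,
)
```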

docling/models/api_vlm_model.py

Lines changed: 7 additions & 5 deletions

```diff
@@ -29,12 +29,9 @@ def __init__(
 
         self.timeout = self.vlm_options.timeout
         self.concurrency = self.vlm_options.concurrency
-        self.prompt_content = (
-            f"This is a page from a document.\n{self.vlm_options.prompt}"
-        )
         self.params = {
             **self.vlm_options.params,
-            "temperature": 0,
+            "temperature": self.vlm_options.temperature,
         }
 
     def __call__(
@@ -56,9 +53,14 @@ def _vlm_request(page):
            if hi_res_image.mode != "RGB":
                hi_res_image = hi_res_image.convert("RGB")
 
+            if callable(self.vlm_options.prompt):
+                prompt = self.vlm_options.prompt(page.parsed_page)
+            else:
+                prompt = self.vlm_options.prompt
+
             page_tags = api_image_request(
                 image=hi_res_image,
-                prompt=self.prompt_content,
+                prompt=prompt,
                 url=self.vlm_options.url,
                 timeout=self.timeout,
                 headers=self.vlm_options.headers,
```

docling/models/vlm_models_inline/hf_transformers_model.py

Lines changed: 8 additions & 4 deletions

```diff
@@ -128,7 +128,11 @@ def __call__(
                 )
 
                 # Define prompt structure
-                prompt = self.formulate_prompt()
+                if callable(self.vlm_options.prompt):
+                    user_prompt = self.vlm_options.prompt(page.parsed_page)
+                else:
+                    user_prompt = self.vlm_options.prompt
+                prompt = self.formulate_prompt(user_prompt)
 
                 inputs = self.processor(
                     text=prompt, images=[hi_res_image], return_tensors="pt"
@@ -162,7 +166,7 @@ def __call__(
 
                 yield page
 
-    def formulate_prompt(self) -> str:
+    def formulate_prompt(self, user_prompt: str) -> str:
         """Formulate a prompt for the VLM."""
 
         if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
@@ -173,7 +177,7 @@ def formulate_prompt(self) -> str:
             assistant_prompt = "<|assistant|>"
             prompt_suffix = "<|end|>"
 
-            prompt = f"{user_prompt}<|image_1|>{self.vlm_options.prompt}{prompt_suffix}{assistant_prompt}"
+            prompt = f"{user_prompt}<|image_1|>{user_prompt}{prompt_suffix}{assistant_prompt}"
             _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
 
             return prompt
@@ -187,7 +191,7 @@ def formulate_prompt(self) -> str:
                         "text": "This is a page from a document.",
                     },
                     {"type": "image"},
-                    {"type": "text", "text": self.vlm_options.prompt},
+                    {"type": "text", "text": user_prompt},
                 ],
             }
         ]
```

docling/models/vlm_models_inline/mlx_model.py

Lines changed: 5 additions & 3 deletions

```diff
@@ -56,8 +56,6 @@ def __init__(
         elif (artifacts_path / repo_cache_folder).exists():
             artifacts_path = artifacts_path / repo_cache_folder
 
-        self.param_question = vlm_options.prompt
-
         ## Load the model
         self.vlm_model, self.processor = load(artifacts_path)
         self.config = load_config(artifacts_path)
@@ -86,8 +84,12 @@ def __call__(
                 if hi_res_image.mode != "RGB":
                     hi_res_image = hi_res_image.convert("RGB")
 
+                if callable(self.vlm_options.prompt):
+                    user_prompt = self.vlm_options.prompt(page.parsed_page)
+                else:
+                    user_prompt = self.vlm_options.prompt
                 prompt = self.apply_chat_template(
-                    self.processor, self.config, self.param_question, num_images=1
+                    self.processor, self.config, user_prompt, num_images=1
                 )
 
                 start_time = time.time()
```

docling/pipeline/vlm_pipeline.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -117,6 +117,7 @@ def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
             page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
             if page._backend is not None and page._backend.is_valid():
                 page.size = page._backend.get_size()
+                page.parsed_page = page._backend.get_segmented_page()
 
         return page
 
```
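The `parsed_page` populated here is exactly what a prompt callable receives, and it may be `None` for backends that cannot produce a segmented page. A small sketch of the kind of introspection this enables; the field names follow the `SegmentedPage` usage in the docs example below:

```python
from typing import Optional

from docling_core.types.doc.page import SegmentedPage


def summarize_page(page: Optional[SegmentedPage]) -> str:
    """Return a one-line summary that a dynamic prompt could embed."""
    if page is None:
        # Backends without a segmented page leave parsed_page unset.
        return "no parsed page available"
    return (
        f"{int(page.dimension.width)}x{int(page.dimension.height)} page with "
        f"{len(page.textline_cells)} text lines and "
        f"{len(page.bitmap_resources)} bitmap resources"
    )
```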

docs/examples/vlm_pipeline_api_model.py

Lines changed: 71 additions & 0 deletions

```diff
@@ -1,8 +1,10 @@
 import logging
 import os
 from pathlib import Path
+from typing import Optional
 
 import requests
+from docling_core.types.doc.page import SegmentedPage
 from dotenv import load_dotenv
 
 from docling.datamodel.base_models import InputFormat
@@ -32,6 +34,69 @@ def lms_vlm_options(model: str, prompt: str, format: ResponseFormat):
     return options
 
 
+#### Using LM Studio with OlmOcr model
+
+
+def lms_olmocr_vlm_options(model: str):
+    def _dynamic_olmocr_prompt(page: Optional[SegmentedPage]):
+        if page is None:
+            return (
+                "Below is the image of one page of a document. Just return the plain text"
+                " representation of this document as if you were reading it naturally.\n"
+                "Do not hallucinate.\n"
+            )
+
+        anchor = [
+            f"Page dimensions: {int(page.dimension.width)}x{int(page.dimension.height)}"
+        ]
+
+        for text_cell in page.textline_cells:
+            if not text_cell.text.strip():
+                continue
+            bbox = text_cell.rect.to_bounding_box().to_bottom_left_origin(
+                page.dimension.height
+            )
+            anchor.append(f"[{int(bbox.l)}x{int(bbox.b)}] {text_cell.text}")
+
+        for image_cell in page.bitmap_resources:
+            bbox = image_cell.rect.to_bounding_box().to_bottom_left_origin(
+                page.dimension.height
+            )
+            anchor.append(
+                f"[Image {int(bbox.l)}x{int(bbox.b)} to {int(bbox.r)}x{int(bbox.t)}]"
+            )
+
+        if len(anchor) == 1:
+            anchor.append(
+                f"[Image 0x0 to {int(page.dimension.width)}x{int(page.dimension.height)}]"
+            )
+
+        # Original prompt uses cells sorting. We are skipping it in this demo.
+
+        base_text = "\n".join(anchor)
+
+        return (
+            f"Below is the image of one page of a document, as well as some raw textual"
+            f" content that was previously extracted for it. Just return the plain text"
+            f" representation of this document as if you were reading it naturally.\n"
+            f"Do not hallucinate.\n"
+            f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
+        )
+
+    options = ApiVlmOptions(
+        url="http://localhost:1234/v1/chat/completions",
+        params=dict(
+            model=model,
+        ),
+        prompt=_dynamic_olmocr_prompt,
+        timeout=90,
+        scale=1.0,
+        max_size=1024,  # from OlmOcr pipeline
+        response_format=ResponseFormat.MARKDOWN,
+    )
+    return options
+
+
 #### Using Ollama
 
 
@@ -123,6 +188,12 @@ def main():
     #     format=ResponseFormat.MARKDOWN,
     # )
 
+    # Example using the OlmOcr (dynamic prompt) model with LM Studio:
+    # (uncomment the following lines)
+    # pipeline_options.vlm_options = lms_olmocr_vlm_options(
+    #     model="hf.co/lmstudio-community/olmOCR-7B-0225-preview-GGUF",
+    # )
+
     # Example using the Granite Vision model with Ollama:
     # (uncomment the following lines)
     # pipeline_options.vlm_options = ollama_vlm_options(
```
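For context, a sketch of how this new helper would be wired into a conversion, modelled on the rest of this example file; the input path is a placeholder:

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Remote endpoints (such as a local LM Studio server) require opting in.
pipeline_options = VlmPipelineOptions(enable_remote_services=True)
pipeline_options.vlm_options = lms_olmocr_vlm_options(
    model="hf.co/lmstudio-community/olmOCR-7B-0225-preview-GGUF",
)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)

result = converter.convert("input.pdf")  # placeholder input path
print(result.document.export_to_markdown())
```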
