Commit b8813ee
feat(vlm): Dynamic prompts (#1808)
* Unify temperature options for Vlm models

* Dynamic prompt support with example

* DCO Remediation Commit for Shkarupa Alex <[email protected]>

  I, Shkarupa Alex <[email protected]>, hereby add my Signed-off-by to this commit: 34d446c
  I, Shkarupa Alex <[email protected]>, hereby add my Signed-off-by to this commit: 9c595d5

  Signed-off-by: Shkarupa Alex <[email protected]>

* Replace Page with SegmentedPage

* Fix example HF repo link

  Signed-off-by: Christoph Auer <[email protected]>

* Sign-off

  Signed-off-by: Shkarupa Alex <[email protected]>

* DCO Remediation Commit for Shkarupa Alex <[email protected]>

  I, Shkarupa Alex <[email protected]>, hereby add my Signed-off-by to this commit: 1a16206

  Signed-off-by: Shkarupa Alex <[email protected]>
  Signed-off-by: Shkarupa Alex <[email protected]>

* Use lmstudio-community model

  Signed-off-by: Christoph Auer <[email protected]>

* Swap inference engine to LM Studio

  Signed-off-by: Shkarupa Alex <[email protected]>

---------

Signed-off-by: Shkarupa Alex <[email protected]>
Signed-off-by: Christoph Auer <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
1 parent edd4356 commit b8813ee

6 files changed: +96 −15 lines changed

docling/datamodel/pipeline_options_vlm_model.py

Lines changed: 4 additions & 3 deletions

```diff
@@ -1,6 +1,7 @@
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, Union
 
+from docling_core.types.doc.page import SegmentedPage
 from pydantic import AnyUrl, BaseModel
 from typing_extensions import deprecated
 
@@ -9,9 +10,10 @@
 
 class BaseVlmOptions(BaseModel):
     kind: str
-    prompt: str
+    prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
     scale: float = 2.0
     max_size: Optional[int] = None
+    temperature: float = 0.0
 
 
 class ResponseFormat(str, Enum):
@@ -51,7 +53,6 @@ class InlineVlmOptions(BaseVlmOptions):
         AcceleratorDevice.MPS,
     ]
 
-    temperature: float = 0.0
     stop_strings: List[str] = []
     extra_generation_config: Dict[str, Any] = {}
 
```
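With this change, `prompt` accepts either a plain string or a callable that receives the page's `Optional[SegmentedPage]` and returns the prompt text. A minimal sketch of both forms follows; the import paths mirror the docs example added in this commit, and the model name is a placeholder:

```python
from typing import Optional

from docling_core.types.doc.page import SegmentedPage

from docling.datamodel.pipeline_options import ApiVlmOptions, ResponseFormat

# Static prompt: behaves exactly as before.
static_options = ApiVlmOptions(
    url="http://localhost:1234/v1/chat/completions",
    params=dict(model="my-model"),  # placeholder model name
    prompt="Convert this page to markdown.",
    response_format=ResponseFormat.MARKDOWN,
)


# Dynamic prompt: invoked once per page with the parsed page (may be None).
def page_aware_prompt(page: Optional[SegmentedPage]) -> str:
    if page is None:
        return "Convert this page to markdown."
    w, h = int(page.dimension.width), int(page.dimension.height)
    return f"Convert this {w}x{h} page to markdown."


dynamic_options = ApiVlmOptions(
    url="http://localhost:1234/v1/chat/completions",
    params=dict(model="my-model"),  # placeholder model name
    prompt=page_aware_prompt,
    response_format=ResponseFormat.MARKDOWN,
)
```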

docling/models/api_vlm_model.py

Lines changed: 7 additions & 5 deletions

```diff
@@ -29,12 +29,9 @@ def __init__(
 
         self.timeout = self.vlm_options.timeout
         self.concurrency = self.vlm_options.concurrency
-        self.prompt_content = (
-            f"This is a page from a document.\n{self.vlm_options.prompt}"
-        )
         self.params = {
             **self.vlm_options.params,
-            "temperature": 0,
+            "temperature": self.vlm_options.temperature,
         }
 
     def __call__(
@@ -56,9 +53,14 @@ def _vlm_request(page):
            if hi_res_image.mode != "RGB":
                hi_res_image = hi_res_image.convert("RGB")
 
+            if callable(self.vlm_options.prompt):
+                prompt = self.vlm_options.prompt(page.parsed_page)
+            else:
+                prompt = self.vlm_options.prompt
+
             page_tags = api_image_request(
                 image=hi_res_image,
-                prompt=self.prompt_content,
+                prompt=prompt,
                 url=self.vlm_options.url,
                 timeout=self.timeout,
                 headers=self.vlm_options.headers,
```

docling/models/vlm_models_inline/hf_transformers_model.py

Lines changed: 8 additions & 4 deletions

```diff
@@ -128,7 +128,11 @@ def __call__(
                 )
 
                 # Define prompt structure
-                prompt = self.formulate_prompt()
+                if callable(self.vlm_options.prompt):
+                    user_prompt = self.vlm_options.prompt(page.parsed_page)
+                else:
+                    user_prompt = self.vlm_options.prompt
+                prompt = self.formulate_prompt(user_prompt)
 
                 inputs = self.processor(
                     text=prompt, images=[hi_res_image], return_tensors="pt"
@@ -162,7 +166,7 @@ def __call__(
 
                 yield page
 
-    def formulate_prompt(self) -> str:
+    def formulate_prompt(self, user_prompt: str) -> str:
         """Formulate a prompt for the VLM."""
 
         if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
@@ -173,7 +177,7 @@ def formulate_prompt(self) -> str:
             assistant_prompt = "<|assistant|>"
             prompt_suffix = "<|end|>"
 
-            prompt = f"{user_prompt}<|image_1|>{self.vlm_options.prompt}{prompt_suffix}{assistant_prompt}"
+            prompt = f"{user_prompt}<|image_1|>{user_prompt}{prompt_suffix}{assistant_prompt}"
             _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
 
             return prompt
@@ -187,7 +191,7 @@ def formulate_prompt(self) -> str:
                         "text": "This is a page from a document.",
                     },
                     {"type": "image"},
-                    {"type": "text", "text": self.vlm_options.prompt},
+                    {"type": "text", "text": user_prompt},
                 ],
             }
         ]
```

docling/models/vlm_models_inline/mlx_model.py

Lines changed: 5 additions & 3 deletions

```diff
@@ -56,8 +56,6 @@ def __init__(
         elif (artifacts_path / repo_cache_folder).exists():
             artifacts_path = artifacts_path / repo_cache_folder
 
-        self.param_question = vlm_options.prompt
-
         ## Load the model
         self.vlm_model, self.processor = load(artifacts_path)
         self.config = load_config(artifacts_path)
@@ -86,8 +84,12 @@ def __call__(
                 if hi_res_image.mode != "RGB":
                     hi_res_image = hi_res_image.convert("RGB")
 
+                if callable(self.vlm_options.prompt):
+                    user_prompt = self.vlm_options.prompt(page.parsed_page)
+                else:
+                    user_prompt = self.vlm_options.prompt
                 prompt = self.apply_chat_template(
-                    self.processor, self.config, self.param_question, num_images=1
+                    self.processor, self.config, user_prompt, num_images=1
                 )
 
                 start_time = time.time()
```

docling/pipeline/vlm_pipeline.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -117,6 +117,7 @@ def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
             page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
             if page._backend is not None and page._backend.is_valid():
                 page.size = page._backend.get_size()
+                page.parsed_page = page._backend.get_segmented_page()
 
         return page
 
```
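The `parsed_page` populated here is exactly what a prompt callable receives, and it may be `None` for backends that cannot produce a segmented page. A small sketch of the kind of introspection this enables; the field names follow the `SegmentedPage` usage in the docs example below:

```python
from typing import Optional

from docling_core.types.doc.page import SegmentedPage


def summarize_page(page: Optional[SegmentedPage]) -> str:
    """Return a one-line summary that a dynamic prompt could embed."""
    if page is None:
        # Backends without a segmented page leave parsed_page unset.
        return "no parsed page available"
    return (
        f"{int(page.dimension.width)}x{int(page.dimension.height)} page with "
        f"{len(page.textline_cells)} text lines and "
        f"{len(page.bitmap_resources)} bitmap resources"
    )
```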

docs/examples/vlm_pipeline_api_model.py

Lines changed: 71 additions & 0 deletions

```diff
@@ -1,8 +1,10 @@
 import logging
 import os
 from pathlib import Path
+from typing import Optional
 
 import requests
+from docling_core.types.doc.page import SegmentedPage
 from dotenv import load_dotenv
 
 from docling.datamodel.base_models import InputFormat
@@ -32,6 +34,69 @@ def lms_vlm_options(model: str, prompt: str, format: ResponseFormat):
     return options
 
 
+#### Using LM Studio with OlmOcr model
+
+
+def lms_olmocr_vlm_options(model: str):
+    def _dynamic_olmocr_prompt(page: Optional[SegmentedPage]):
+        if page is None:
+            return (
+                "Below is the image of one page of a document. Just return the plain text"
+                " representation of this document as if you were reading it naturally.\n"
+                "Do not hallucinate.\n"
+            )
+
+        anchor = [
+            f"Page dimensions: {int(page.dimension.width)}x{int(page.dimension.height)}"
+        ]
+
+        for text_cell in page.textline_cells:
+            if not text_cell.text.strip():
+                continue
+            bbox = text_cell.rect.to_bounding_box().to_bottom_left_origin(
+                page.dimension.height
+            )
+            anchor.append(f"[{int(bbox.l)}x{int(bbox.b)}] {text_cell.text}")
+
+        for image_cell in page.bitmap_resources:
+            bbox = image_cell.rect.to_bounding_box().to_bottom_left_origin(
+                page.dimension.height
+            )
+            anchor.append(
+                f"[Image {int(bbox.l)}x{int(bbox.b)} to {int(bbox.r)}x{int(bbox.t)}]"
+            )
+
+        if len(anchor) == 1:
+            anchor.append(
+                f"[Image 0x0 to {int(page.dimension.width)}x{int(page.dimension.height)}]"
+            )
+
+        # Original prompt uses cells sorting. We are skipping it in this demo.
+
+        base_text = "\n".join(anchor)
+
+        return (
+            f"Below is the image of one page of a document, as well as some raw textual"
+            f" content that was previously extracted for it. Just return the plain text"
+            f" representation of this document as if you were reading it naturally.\n"
+            f"Do not hallucinate.\n"
+            f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
+        )
+
+    options = ApiVlmOptions(
+        url="http://localhost:1234/v1/chat/completions",
+        params=dict(
+            model=model,
+        ),
+        prompt=_dynamic_olmocr_prompt,
+        timeout=90,
+        scale=1.0,
+        max_size=1024,  # from OlmOcr pipeline
+        response_format=ResponseFormat.MARKDOWN,
+    )
+    return options
+
+
 #### Using Ollama
 
 
@@ -123,6 +188,12 @@ def main():
     #     format=ResponseFormat.MARKDOWN,
     # )
 
+    # Example using the OlmOcr (dynamic prompt) model with LM Studio:
+    # (uncomment the following lines)
+    # pipeline_options.vlm_options = lms_olmocr_vlm_options(
+    #     model="hf.co/lmstudio-community/olmOCR-7B-0225-preview-GGUF",
+    # )
+
     # Example using the Granite Vision model with Ollama:
     # (uncomment the following lines)
     # pipeline_options.vlm_options = ollama_vlm_options(
```
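For context, a sketch of how this new helper would be wired into a conversion, modelled on the rest of this example file; the input path is a placeholder:

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Remote endpoints (such as a local LM Studio server) require opting in.
pipeline_options = VlmPipelineOptions(enable_remote_services=True)
pipeline_options.vlm_options = lms_olmocr_vlm_options(
    model="hf.co/lmstudio-community/olmOCR-7B-0225-preview-GGUF",
)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)

result = converter.convert("input.pdf")  # placeholder input path
print(result.document.export_to_markdown())
```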
