
Commit a07ba86

geoHeil and dolfim-ibm authored
feat: add image-text-to-text models in transformers (#1772)
* feat(dolphin): add dolphin support
  Signed-off-by: Georg Heiler <[email protected]>
* rename
  Signed-off-by: Georg Heiler <[email protected]>
* reformat
  Signed-off-by: Georg Heiler <[email protected]>
* fix mypy
  Signed-off-by: Georg Heiler <[email protected]>
* add prompt style and examples
  Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Georg Heiler <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
Co-authored-by: Michele Dolfi <[email protected]>
1 parent e25873d commit a07ba86

File tree: 3 files changed, +77 −18 lines changed


docling/datamodel/pipeline_options_vlm_model.py

Lines changed: 7 additions & 0 deletions
```diff
@@ -31,6 +31,12 @@ class TransformersModelType(str, Enum):
     AUTOMODEL = "automodel"
     AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
     AUTOMODEL_CAUSALLM = "automodel-causallm"
+    AUTOMODEL_IMAGETEXTTOTEXT = "automodel-imagetexttotext"
+
+
+class TransformersPromptStyle(str, Enum):
+    CHAT = "chat"
+    RAW = "raw"


 class InlineVlmOptions(BaseVlmOptions):
@@ -44,6 +50,7 @@ class InlineVlmOptions(BaseVlmOptions):

     inference_framework: InferenceFramework
     transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
+    transformers_prompt_style: TransformersPromptStyle = TransformersPromptStyle.CHAT
     response_format: ResponseFormat

     torch_dtype: Optional[str] = None
```
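Taken together, the two additions are small: a new `TransformersModelType` member that selects Hugging Face's `AutoModelForImageTextToText` loader, and a new `TransformersPromptStyle` enum on `InlineVlmOptions` that defaults to `CHAT`, so existing configurations keep their behavior. A minimal sketch of the new values, runnable against this commit of docling; the printed strings are taken directly from the diff above:

```python
from docling.datamodel.pipeline_options_vlm_model import (
    TransformersModelType,
    TransformersPromptStyle,
)

# The new model type maps to transformers' AutoModelForImageTextToText loader.
print(TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT.value)  # automodel-imagetexttotext

# CHAT (the default) keeps the existing chat-template behavior;
# RAW passes the user prompt to the model verbatim.
print(TransformersPromptStyle.CHAT.value)  # chat
print(TransformersPromptStyle.RAW.value)   # raw
```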

docling/models/vlm_models_inline/hf_transformers_model.py

Lines changed: 32 additions & 17 deletions
```diff
@@ -13,6 +13,7 @@
 from docling.datamodel.pipeline_options_vlm_model import (
     InlineVlmOptions,
     TransformersModelType,
+    TransformersPromptStyle,
 )
 from docling.models.base_model import BasePageModel
 from docling.models.utils.hf_model_download import (
@@ -41,6 +42,7 @@ def __init__(
         from transformers import (
             AutoModel,
             AutoModelForCausalLM,
+            AutoModelForImageTextToText,
             AutoModelForVision2Seq,
             AutoProcessor,
             BitsAndBytesConfig,
@@ -91,6 +93,11 @@ def __init__(
                 == TransformersModelType.AUTOMODEL_VISION2SEQ
             ):
                 model_cls = AutoModelForVision2Seq
+            elif (
+                self.vlm_options.transformers_model_type
+                == TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT
+            ):
+                model_cls = AutoModelForImageTextToText

             self.processor = AutoProcessor.from_pretrained(
                 artifacts_path,
@@ -169,7 +176,10 @@ def __call__(
     def formulate_prompt(self, user_prompt: str) -> str:
         """Formulate a prompt for the VLM."""

-        if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
+        if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW:
+            return user_prompt
+
+        elif self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
             _log.debug("Using specialized prompt for Phi-4")
             # more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally

@@ -182,20 +192,25 @@ def formulate_prompt(self, user_prompt: str) -> str:

             return prompt

-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": "This is a page from a document.",
-                    },
-                    {"type": "image"},
-                    {"type": "text", "text": user_prompt},
-                ],
-            }
-        ]
-        prompt = self.processor.apply_chat_template(
-            messages, add_generation_prompt=False
+        elif self.vlm_options.transformers_prompt_style == TransformersPromptStyle.CHAT:
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "This is a page from a document.",
+                        },
+                        {"type": "image"},
+                        {"type": "text", "text": user_prompt},
+                    ],
+                }
+            ]
+            prompt = self.processor.apply_chat_template(
+                messages, add_generation_prompt=False
+            )
+            return prompt
+
+        raise RuntimeError(
+            f"Unknown prompt style `{self.vlm_options.transformers_prompt_style}`. Valid values are {', '.join(s.value for s in TransformersPromptStyle)}."
         )
-        return prompt
```
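The control flow of the reworked `formulate_prompt` is easiest to see in isolation. Below is a self-contained sketch of the same dispatch, with the Phi-4 special case omitted and `processor.apply_chat_template` replaced by a plain f-string stub so it runs without docling or transformers installed; only the `RAW` short-circuit and the trailing `RuntimeError` mirror the diff exactly:

```python
from enum import Enum


class TransformersPromptStyle(str, Enum):
    CHAT = "chat"
    RAW = "raw"


def formulate_prompt(user_prompt: str, style: TransformersPromptStyle) -> str:
    if style == TransformersPromptStyle.RAW:
        # Raw style: hand the caller's string to the model untouched,
        # e.g. Dolphin's "<s>Read text in the image. <Answer/>" prompt.
        return user_prompt
    elif style == TransformersPromptStyle.CHAT:
        # Chat style: in docling this goes through processor.apply_chat_template;
        # stubbed here for illustration only.
        return f"USER: This is a page from a document. <image> {user_prompt}"
    raise RuntimeError(f"Unknown prompt style `{style}`.")


print(formulate_prompt("<s>Read text in the image. <Answer/>", TransformersPromptStyle.RAW))
```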

docs/examples/compare_vlm_models.py

Lines changed: 38 additions & 1 deletion
```diff
@@ -14,11 +14,18 @@
 from tabulate import tabulate

 from docling.datamodel import vlm_model_specs
+from docling.datamodel.accelerator_options import AcceleratorDevice
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     VlmPipelineOptions,
 )
-from docling.datamodel.pipeline_options_vlm_model import InferenceFramework
+from docling.datamodel.pipeline_options_vlm_model import (
+    InferenceFramework,
+    InlineVlmOptions,
+    ResponseFormat,
+    TransformersModelType,
+    TransformersPromptStyle,
+)
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline

@@ -101,6 +108,33 @@ def convert(sources: list[Path], converter: DocumentConverter):
     out_path = Path("scratch")
     out_path.mkdir(parents=True, exist_ok=True)

+    ## Definition of more inline models
+    llava_qwen = InlineVlmOptions(
+        repo_id="llava-hf/llava-interleave-qwen-0.5b-hf",
+        # prompt="Read text in the image.",
+        prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+        # prompt="Parse the reading order of this document.",
+        response_format=ResponseFormat.MARKDOWN,
+        inference_framework=InferenceFramework.TRANSFORMERS,
+        transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+        supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU],
+        scale=2.0,
+        temperature=0.0,
+    )
+
+    # Note that this is not the expected way of using the Dolphin model, but it shows the usage of a raw prompt.
+    dolphin_oneshot = InlineVlmOptions(
+        repo_id="ByteDance/Dolphin",
+        prompt="<s>Read text in the image. <Answer/>",
+        response_format=ResponseFormat.MARKDOWN,
+        inference_framework=InferenceFramework.TRANSFORMERS,
+        transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+        transformers_prompt_style=TransformersPromptStyle.RAW,
+        supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU],
+        scale=2.0,
+        temperature=0.0,
+    )
+
     ## Use VlmPipeline
     pipeline_options = VlmPipelineOptions()
     pipeline_options.generate_page_images = True
@@ -121,6 +155,9 @@ def convert(sources: list[Path], converter: DocumentConverter):
         vlm_model_specs.GRANITE_VISION_TRANSFORMERS,
         vlm_model_specs.PHI4_TRANSFORMERS,
         vlm_model_specs.PIXTRAL_12B_TRANSFORMERS,
+        ## More inline models
+        dolphin_oneshot,
+        llava_qwen,
     ]

     # Remove MLX models if not on Mac
```
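For reference, a sketch of how one of the new option objects plugs into a converter on its own, following the same `DocumentConverter` wiring the example script already uses; `dolphin_oneshot` is the `InlineVlmOptions` instance from the diff above, and `input.pdf` is a placeholder path:

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Point the VLM pipeline at the raw-prompt Dolphin options defined above.
pipeline_options = VlmPipelineOptions()
pipeline_options.vlm_options = dolphin_oneshot
pipeline_options.generate_page_images = True

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)
result = converter.convert("input.pdf")  # placeholder input document
print(result.document.export_to_markdown())
```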
