Skip to content

Commit ed68b47

Browse files
committed
Update layout injection, move to experimental
Signed-off-by: Christoph Auer <[email protected]>
1 parent 72007b9 commit ed68b47

File tree

5 files changed

+45 additions, -14 deletions

docling/experimental/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
"""Experimental modules for Docling.
2+
3+
This package contains experimental features that are under development
4+
and may change or be removed in future versions.
5+
"""
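docling/experimental/datamodel/__init__.py (file path not present in the extracted page; inferred from the docstring below and the renamed module's new location)

Lines changed: 1 addition & 0 deletions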
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"""Experimental datamodel modules."""

docling/datamodel/threaded_layout_vlm_pipeline_options.py renamed to docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,20 +8,23 @@
88
ApiVlmOptions,
99
InlineVlmOptions,
1010
)
11-
from docling.datamodel.vlm_model_specs import SMOLDOCLING_TRANSFORMERS
11+
from docling.datamodel.vlm_model_specs import SMOLDOCLING_MLX, SMOLDOCLING_TRANSFORMERS
1212

1313

1414
class ThreadedLayoutVlmPipelineOptions(PaginatedPipelineOptions):
1515
"""Pipeline options for the threaded layout+VLM pipeline."""
1616

1717
# Inherit page image generation from PaginatedPipelineOptions but enable by default
1818
generate_page_images: bool = True
19+
images_scale: float = 2.0
1920

2021
# VLM configuration (will be enhanced with layout awareness by the pipeline)
21-
vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = SMOLDOCLING_TRANSFORMERS
22+
vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = SMOLDOCLING_MLX
2223

2324
# Layout model configuration
24-
layout_options: LayoutOptions = LayoutOptions(model_spec=DOCLING_LAYOUT_HERON)
25+
layout_options: LayoutOptions = LayoutOptions(
26+
model_spec=DOCLING_LAYOUT_HERON, skip_cell_assignment=True
27+
)
2528

2629
# Threading and batching controls
2730
layout_batch_size: int = 4
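
docling/experimental/pipeline/__init__.py (file path not present in the extracted page; inferred from the docstring below and the renamed module's new location)

Lines changed: 1 addition & 0 deletions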
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"""Experimental pipeline modules."""

docling/pipeline/threaded_layout_vlm_pipeline.py renamed to docling/experimental/pipeline/threaded_layout_vlm_pipeline.py

Lines changed: 32 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
InlineVlmOptions,
2727
)
2828
from docling.datamodel.settings import settings
29-
from docling.datamodel.threaded_layout_vlm_pipeline_options import (
29+
from docling.experimental.datamodel.threaded_layout_vlm_pipeline_options import (
3030
ThreadedLayoutVlmPipelineOptions,
3131
)
3232
from docling.models.api_vlm_model import ApiVlmModel
@@ -86,17 +86,38 @@ def build_prompt(self, page):
8686

8787
# If we have a full Page object with layout predictions, enhance the prompt
8888
if isinstance(page, Page) and page.predictions.layout:
89-
layout_info = []
89+
from docling_core.types.doc.tokens import DocumentToken
90+
91+
layout_elements = []
9092
for cluster in page.predictions.layout.clusters:
91-
# TODO: Format the layout boxes as doctags tokens.
92-
bbox = cluster.bbox
93-
label = str(cluster.label)
94-
coord_str = f"{label}: ({bbox.l:.1f}, {bbox.t:.1f}, {bbox.r:.1f}, {bbox.b:.1f})"
95-
layout_info.append(coord_str)
96-
97-
if layout_info:
98-
layout_injection = (
99-
f"\n\\Layout elements: {'; '.join(layout_info)}"
93+
# Get proper tag name from DocItemLabel
94+
tag_name = DocumentToken.create_token_name_from_doc_item_label(
95+
label=cluster.label
96+
)
97+
98+
# Convert bbox to tuple and get location tokens
99+
bbox_tuple = cluster.bbox.as_tuple()
100+
location_tokens = DocumentToken.get_location(
101+
bbox=bbox_tuple,
102+
page_w=page.size.width,
103+
page_h=page.size.height,
104+
xsize=500,
105+
ysize=500,
106+
)
107+
108+
# Create XML element with DocTags format
109+
xml_element = f"<{tag_name}>{location_tokens}</{tag_name}>"
110+
layout_elements.append(xml_element)
111+
112+
if layout_elements:
113+
# Join elements with newlines and wrap in layout tags
114+
layout_xml = (
115+
"<layout>" + "\n".join(layout_elements) + "</layout>"
116+
)
117+
layout_injection = f"\n{layout_xml}"
118+
119+
print(
120+
f"Layout injection prompt: {base_prompt + layout_injection}"
100121
)
101122
return base_prompt + layout_injection
102123

0 commit comments

Comments
 (0)