Update layout injection, move to experimental

cau-git · cau-git · commit ed68b47c6396 · 2025-09-11T16:42:56.000+02:00
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
diff --git a/docling/experimental/__init__.py b/docling/experimental/__init__.py
@@ -0,0 +1,5 @@
+"""Experimental modules for Docling.
+
+This package contains experimental features that are under development
+and may change or be removed in future versions.
+"""
diff --git a/docling/experimental/datamodel/__init__.py b/docling/experimental/datamodel/__init__.py
@@ -0,0 +1 @@
+"""Experimental datamodel modules."""
diff --git a/docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py b/docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py
@@ -8,20 +8,23 @@
     ApiVlmOptions,
     InlineVlmOptions,
 )
-from docling.datamodel.vlm_model_specs import SMOLDOCLING_TRANSFORMERS
+from docling.datamodel.vlm_model_specs import SMOLDOCLING_MLX, SMOLDOCLING_TRANSFORMERS
 
 
 class ThreadedLayoutVlmPipelineOptions(PaginatedPipelineOptions):
     """Pipeline options for the threaded layout+VLM pipeline."""
 
     # Inherit page image generation from PaginatedPipelineOptions but enable by default
     generate_page_images: bool = True
+    images_scale: float = 2.0
 
     # VLM configuration (will be enhanced with layout awareness by the pipeline)
-    vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = SMOLDOCLING_TRANSFORMERS
+    vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = SMOLDOCLING_MLX
 
     # Layout model configuration
-    layout_options: LayoutOptions = LayoutOptions(model_spec=DOCLING_LAYOUT_HERON)
+    layout_options: LayoutOptions = LayoutOptions(
+        model_spec=DOCLING_LAYOUT_HERON, skip_cell_assignment=True
+    )
 
     # Threading and batching controls
     layout_batch_size: int = 4
diff --git a/docling/experimental/pipeline/__init__.py b/docling/experimental/pipeline/__init__.py
@@ -0,0 +1 @@
+"""Experimental pipeline modules."""
diff --git a/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py b/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py
@@ -26,7 +26,7 @@
     InlineVlmOptions,
 )
 from docling.datamodel.settings import settings
-from docling.datamodel.threaded_layout_vlm_pipeline_options import (
+from docling.experimental.datamodel.threaded_layout_vlm_pipeline_options import (
     ThreadedLayoutVlmPipelineOptions,
 )
 from docling.models.api_vlm_model import ApiVlmModel
@@ -86,17 +86,38 @@ def build_prompt(self, page):
 
                 # If we have a full Page object with layout predictions, enhance the prompt
                 if isinstance(page, Page) and page.predictions.layout:
-                    layout_info = []
+                    from docling_core.types.doc.tokens import DocumentToken
+
+                    layout_elements = []
                     for cluster in page.predictions.layout.clusters:
-                        # TODO: Format the layout boxes as doctags tokens.
-                        bbox = cluster.bbox
-                        label = str(cluster.label)
-                        coord_str = f"{label}: ({bbox.l:.1f}, {bbox.t:.1f}, {bbox.r:.1f}, {bbox.b:.1f})"
-                        layout_info.append(coord_str)
-
-                    if layout_info:
-                        layout_injection = (
-                            f"\n\\Layout elements: {'; '.join(layout_info)}"
+                        # Get proper tag name from DocItemLabel
+                        tag_name = DocumentToken.create_token_name_from_doc_item_label(
+                            label=cluster.label
+                        )
+
+                        # Convert bbox to tuple and get location tokens
+                        bbox_tuple = cluster.bbox.as_tuple()
+                        location_tokens = DocumentToken.get_location(
+                            bbox=bbox_tuple,
+                            page_w=page.size.width,
+                            page_h=page.size.height,
+                            xsize=500,
+                            ysize=500,
+                        )
+
+                        # Create XML element with DocTags format
+                        xml_element = f"<{tag_name}>{location_tokens}</{tag_name}>"
+                        layout_elements.append(xml_element)
+
+                    if layout_elements:
+                        # Join elements with newlines and wrap in layout tags
+                        layout_xml = (
+                            "<layout>" + "\n".join(layout_elements) + "</layout>"
+                        )
+                        layout_injection = f"\n{layout_xml}"
+
+                        print(
+                            f"Layout injection prompt: {base_prompt + layout_injection}"
                         )
                         return base_prompt + layout_injection