9 changes: 7 additions & 2 deletions docling/datamodel/pipeline_options_vlm_model.py
@@ -1,5 +1,5 @@
from enum import Enum
from typing import Any, Dict, List, Literal, Optional, Union
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union

from docling_core.types.doc.page import SegmentedPage
Copilot AI Nov 5, 2025
Duplicate import: SegmentedPage is imported both at line 4 and inside the TYPE_CHECKING block at line 13. The import at line 4 should be removed since it's only used for type annotations and is already imported in the TYPE_CHECKING block.

Suggested change
from docling_core.types.doc.page import SegmentedPage

from pydantic import AnyUrl, BaseModel, ConfigDict
@@ -9,6 +9,11 @@
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.models.utils.generation_utils import GenerationStopper

if TYPE_CHECKING:
from docling_core.types.doc.page import SegmentedPage

from docling.datamodel.base_models import Page


class BaseVlmOptions(BaseModel):
kind: str
@@ -17,7 +22,7 @@ class BaseVlmOptions(BaseModel):
max_size: Optional[int] = None
temperature: float = 0.0

def build_prompt(self, page: Optional[SegmentedPage]) -> str:
def build_prompt(self, page: Optional[Union["Page", "SegmentedPage"]]) -> str:
return self.prompt

def decode_response(self, text: str) -> str:
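Taking the Copilot suggestion together with the new TYPE_CHECKING block, the imports in this module would end up roughly as follows. This is a sketch of the relevant lines only, not the full file; the prompt field is assumed from the fact that build_prompt returns self.prompt.

from enum import Enum
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union

from pydantic import AnyUrl, BaseModel, ConfigDict

if TYPE_CHECKING:
    # Type-only imports: referenced via string annotations, never loaded at runtime.
    from docling_core.types.doc.page import SegmentedPage

    from docling.datamodel.base_models import Page


class BaseVlmOptions(BaseModel):
    kind: str
    prompt: str  # assumed field; build_prompt returns self.prompt
    max_size: Optional[int] = None
    temperature: float = 0.0

    def build_prompt(self, page: Optional[Union["Page", "SegmentedPage"]]) -> str:
        return self.prompt

Since both Page and SegmentedPage appear only as string annotations, dropping the top-level SegmentedPage import (as suggested) keeps the module importable without loading either type at runtime.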
5 changes: 5 additions & 0 deletions docling/experimental/__init__.py
@@ -0,0 +1,5 @@
"""Experimental modules for Docling.

This package contains experimental features that are under development
and may change or be removed in future versions.
"""
1 change: 1 addition & 0 deletions docling/experimental/datamodel/__init__.py
@@ -0,0 +1 @@
"""Experimental datamodel modules."""
31 changes: 31 additions & 0 deletions docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py
@@ -0,0 +1,31 @@
"""Options for the threaded layout+VLM pipeline."""

from typing import Union

from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_HERON
from docling.datamodel.pipeline_options import LayoutOptions, PaginatedPipelineOptions
from docling.datamodel.pipeline_options_vlm_model import (
ApiVlmOptions,
InlineVlmOptions,
)
from docling.datamodel.vlm_model_specs import GRANITEDOCLING_TRANSFORMERS


class ThreadedLayoutVlmPipelineOptions(PaginatedPipelineOptions):
"""Pipeline options for the threaded layout+VLM pipeline."""

images_scale: float = 2.0

# VLM configuration (will be enhanced with layout awareness by the pipeline)
vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = GRANITEDOCLING_TRANSFORMERS

# Layout model configuration
layout_options: LayoutOptions = LayoutOptions(
model_spec=DOCLING_LAYOUT_HERON, skip_cell_assignment=True
)

# Threading and batching controls
layout_batch_size: int = 4
vlm_batch_size: int = 4
batch_timeout_seconds: float = 2.0
queue_max_size: int = 50
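For reference, a minimal instantiation of these options could look like the sketch below. Only fields declared above are used; the override values mirror the demo script further down and are purely illustrative.

from docling.experimental.datamodel.threaded_layout_vlm_pipeline_options import (
    ThreadedLayoutVlmPipelineOptions,
)

# Defaults: GRANITEDOCLING_TRANSFORMERS for the VLM, DOCLING_LAYOUT_HERON for layout.
default_opts = ThreadedLayoutVlmPipelineOptions()

# Smaller batches and a shorter queue for a memory-constrained run.
small_opts = ThreadedLayoutVlmPipelineOptions(
    layout_batch_size=2,
    vlm_batch_size=1,
    queue_max_size=10,
    batch_timeout_seconds=1.0,
)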
169 changes: 169 additions & 0 deletions docling/experimental/demo_layout_vlm.py
@@ -0,0 +1,169 @@
#!/usr/bin/env python3
"""Demo script for the new ThreadedLayoutVlmPipeline.
This script demonstrates the usage of the new pipeline that combines
layout model preprocessing with VLM processing in a threaded manner.
"""

import argparse
import logging
from io import BytesIO
from pathlib import Path

from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.vlm_model_specs import (
GRANITEDOCLING_TRANSFORMERS,
GRANITEDOCLING_VLLM,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.experimental.datamodel.threaded_layout_vlm_pipeline_options import (
ThreadedLayoutVlmPipelineOptions,
)
from docling.experimental.pipeline.threaded_layout_vlm_pipeline import (
ThreadedLayoutVlmPipeline,
)
from docling.pipeline.vlm_pipeline import VlmPipeline

_log = logging.getLogger(__name__)


def _parse_args():
parser = argparse.ArgumentParser(
description="Demo script for the new ThreadedLayoutVlmPipeline"
)
parser.add_argument(
"--input", type=str, required=True, help="Input directory containing PDF files"
)
parser.add_argument(
"--output",
type=str,
default="../results/",
help="Output directory for converted files",
)
return parser.parse_args()


def _get_docs(input_doc_paths):
"""Yield DocumentStream objects from list of input document paths"""
for path in input_doc_paths:
buf = BytesIO(path.read_bytes())
stream = DocumentStream(name=path.name, stream=buf)
yield stream


def demo_threaded_layout_vlm_pipeline(
input_doc_paths: list[Path], out_dir_layout_aware: Path, out_dir_classic_vlm: Path
):
"""Demonstrate the threaded layout+VLM pipeline."""

# Configure pipeline options
print("Configuring pipeline options...")
pipeline_options_layout_aware = ThreadedLayoutVlmPipelineOptions(
# VLM configuration - defaults to GRANITEDOCLING_TRANSFORMERS
vlm_options=GRANITEDOCLING_TRANSFORMERS,
# Layout configuration - defaults to DOCLING_LAYOUT_HERON
# Batch sizes for parallel processing
layout_batch_size=2,
vlm_batch_size=1,
# Queue configuration
queue_max_size=10,
batch_timeout_seconds=1.0,
# Layout coordinate injection
include_layout_coordinates=True,
coordinate_precision=1,
# Image processing
images_scale=2.0,
generate_page_images=True,
)

pipeline_options_classic_vlm = VlmPipelineOptions(vlm_otpions=GRANITEDOCLING_VLLM)
Copilot AI Nov 5, 2025
Corrected spelling of 'vlm_otpions' to 'vlm_options'.

Suggested change
pipeline_options_classic_vlm = VlmPipelineOptions(vlm_otpions=GRANITEDOCLING_VLLM)
pipeline_options_classic_vlm = VlmPipelineOptions(vlm_options=GRANITEDOCLING_VLLM)


# Create converter with the new pipeline
print("Initializing DocumentConverter (this may take a while - loading models)...")
doc_converter_layout_enhanced = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=ThreadedLayoutVlmPipeline,
pipeline_options=pipeline_options_layout_aware,
)
}
)
doc_converter_classic_vlm = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=VlmPipeline,
pipeline_options=pipeline_options_classic_vlm,
),
}
)

print(f"Starting conversion of {len(input_doc_paths)} document(s)...")
result_layout_aware = doc_converter_layout_enhanced.convert_all(
list(_get_docs(input_doc_paths)), raises_on_error=False
)
result_without_layout = doc_converter_classic_vlm.convert_all(
list(_get_docs(input_doc_paths)), raises_on_error=False
)

for conv_result in result_layout_aware:
if conv_result.status == ConversionStatus.FAILURE:
_log.error(f"Conversion failed: {conv_result.status}")
continue

doc_filename = conv_result.input.file.stem
conv_result.document.save_as_doctags(
out_dir_layout_aware / f"{doc_filename}.dt"
)

for conv_result in result_without_layout:
if conv_result.status == ConversionStatus.FAILURE:
_log.error(f"Conversion failed: {conv_result.status}")
continue

doc_filename = conv_result.input.file.stem
conv_result.document.save_as_doctags(out_dir_classic_vlm / f"{doc_filename}.dt")


if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
try:
print("Starting script...")
args = _parse_args()
print(f"Parsed arguments: input={args.input}, output={args.output}")

base_path = Path(args.input)

print(f"Searching for PDFs in: {base_path}")
input_doc_paths = sorted(base_path.rglob("*.*"))
input_doc_paths = [
e
for e in input_doc_paths
if e.name.endswith(".pdf") or e.name.endswith(".PDF")
]

if not input_doc_paths:
_log.error(f"ERROR: No PDF files found in {base_path}")

print(f"Found {len(input_doc_paths)} PDF file(s):")

out_dir_layout_aware = (
Path(args.output) / "layout_aware" / "model_output" / "layout" / "doc_tags"
)
out_dir_classic_vlm = (
Path(args.output) / "classic_vlm" / "model_output" / "layout" / "doc_tags"
)
out_dir_layout_aware.mkdir(parents=True, exist_ok=True)
out_dir_classic_vlm.mkdir(parents=True, exist_ok=True)

_log.info("Calling demo_threaded_layout_vlm_pipeline...")
demo_threaded_layout_vlm_pipeline(
input_doc_paths, out_dir_layout_aware, out_dir_classic_vlm
)
_log.info("Script completed successfully!")
except Exception as e:
print(f"ERROR: {type(e).__name__}: {e}")
import traceback

traceback.print_exc()
raise
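Assuming the module path shown in the file header above, the demo can be run along these lines (paths are placeholders):

python docling/experimental/demo_layout_vlm.py --input /path/to/pdfs --output ./results

The script recursively collects *.pdf / *.PDF files under --input and writes DocTags (.dt) files into <output>/layout_aware/... and <output>/classic_vlm/..., as created in the __main__ block.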
1 change: 1 addition & 0 deletions docling/experimental/pipeline/__init__.py
@@ -0,0 +1 @@
"""Experimental pipeline modules."""