Skip to content

Commit d572cf4

Browse files
author
Peter El Hachem
committed
Add demo_layout file that produces outputs with vs. without layout injection
Signed-off-by: Peter El Hachem <[email protected]>
1 parent f248bbd commit d572cf4

File tree

1 file changed

+151
-0
lines changed

1 file changed

+151
-0
lines changed
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
#!/usr/bin/env python3
2+
"""Demo script for the new ThreadedLayoutVlmPipeline.
3+
4+
This script demonstrates the usage of the new pipeline that combines
5+
layout model preprocessing with VLM processing in a threaded manner.
6+
"""
7+
8+
from pathlib import Path
9+
import argparse
10+
import logging
11+
from io import BytesIO
12+
13+
14+
from docling.datamodel.base_models import InputFormat
15+
from docling.datamodel.vlm_model_specs import GRANITEDOCLING_TRANSFORMERS, GRANITEDOCLING_VLLM
16+
from docling.experimental.datamodel.threaded_layout_vlm_pipeline_options import ThreadedLayoutVlmPipelineOptions
17+
from docling.document_converter import DocumentConverter, PdfFormatOption
18+
from docling.experimental.pipeline.threaded_layout_vlm_pipeline import ThreadedLayoutVlmPipeline
19+
from docling.datamodel.base_models import ConversionStatus, DocumentStream
20+
from docling.pipeline.vlm_pipeline import VlmPipeline
21+
from docling.datamodel.pipeline_options import VlmPipelineOptions
22+
23+
24+
_log = logging.getLogger(__name__)
25+
26+
27+
def _parse_args():
    """Parse the command-line arguments of the demo script.

    Returns:
        The ``argparse.Namespace`` holding ``input`` (required) and
        ``output`` (defaults to ``'../results/'``), both as strings.
    """
    cli = argparse.ArgumentParser(
        description='Demo script for the new ThreadedLayoutVlmPipeline'
    )
    # One (flag, extra keyword arguments) entry per supported option; every
    # option is a plain string.
    option_specs = (
        ('--input', {'required': True, 'help': 'Input directory containing PDF files'}),
        ('--output', {'default': '../results/', 'help': 'Output directory for converted files'}),
    )
    for flag, extra in option_specs:
        cli.add_argument(flag, type=str, **extra)
    return cli.parse_args()
42+
43+
44+
def _get_docs(input_doc_paths):
    '''Lazily wrap each input document path in a DocumentStream.

    A file is read into an in-memory buffer only when the generator reaches
    it; the resulting stream is named after the path's final component.
    '''
    for doc_path in input_doc_paths:
        yield DocumentStream(
            name=doc_path.name,
            stream=BytesIO(doc_path.read_bytes()),
        )
50+
51+
52+
def _save_doctags(conv_results, out_dir: Path) -> None:
    """Save each non-failed conversion result to ``out_dir`` as ``<stem>.dt``.

    Results whose status is ``ConversionStatus.FAILURE`` are logged and
    skipped so that one bad document does not stop the rest.
    """
    for conv_result in conv_results:
        if conv_result.status == ConversionStatus.FAILURE:
            _log.error(
                "Conversion failed for %s: %s",
                conv_result.input.file.name,
                conv_result.status,
            )
            continue

        doc_filename = conv_result.input.file.stem
        conv_result.document.save_as_doctags(out_dir / f"{doc_filename}.dt")


def demo_threaded_layout_vlm_pipeline(input_doc_paths: list[Path], out_dir_layout_aware: Path, out_dir_classic_vlm: Path):
    """Demonstrate the threaded layout+VLM pipeline.

    Converts the same input documents twice -- once with
    ``ThreadedLayoutVlmPipeline`` (layout coordinates injected) and once with
    the classic ``VlmPipeline`` -- and writes the DocTags output of each run
    to its own directory so the two can be compared.

    Args:
        input_doc_paths: Paths of the documents to convert.
        out_dir_layout_aware: Existing directory receiving the ``.dt`` files
            produced by the layout-aware pipeline.
        out_dir_classic_vlm: Existing directory receiving the ``.dt`` files
            produced by the classic VLM pipeline.
    """

    # Configure pipeline options
    print("Configuring pipeline options...")
    pipeline_options_layout_aware = ThreadedLayoutVlmPipelineOptions(
        # VLM configuration - defaults to GRANITEDOCLING_TRANSFORMERS
        vlm_options=GRANITEDOCLING_TRANSFORMERS,
        # Layout configuration - defaults to DOCLING_LAYOUT_HERON

        # Batch sizes for parallel processing
        layout_batch_size=2,
        vlm_batch_size=1,

        # Queue configuration
        queue_max_size=10,
        batch_timeout_seconds=1.0,

        # Layout coordinate injection
        include_layout_coordinates=True,
        coordinate_precision=1,

        # Image processing
        images_scale=2.0,
        generate_page_images=True,
    )

    # The keyword was previously misspelled ("vlm_otpions"), so the requested
    # model spec never reached the classic pipeline.
    # NOTE(review): this run uses the VLLM spec while the layout-aware run
    # uses the TRANSFORMERS spec -- confirm the backend difference is intended.
    pipeline_options_classic_vlm = VlmPipelineOptions(vlm_options=GRANITEDOCLING_VLLM)

    # Create converter with the new pipeline
    print("Initializing DocumentConverter (this may take a while - loading models)...")
    doc_converter_layout_enhanced = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=ThreadedLayoutVlmPipeline,
                pipeline_options=pipeline_options_layout_aware,
            ),
        }
    )
    # Baseline converter: classic VLM pipeline without layout injection.
    doc_converter_classic_vlm = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline,
                pipeline_options=pipeline_options_classic_vlm,
            ),
        }
    )

    print(f"Starting conversion of {len(input_doc_paths)} document(s)...")
    # Each converter gets its own freshly built streams.
    result_layout_aware = doc_converter_layout_enhanced.convert_all(list(_get_docs(input_doc_paths)), raises_on_error=False)
    result_without_layout = doc_converter_classic_vlm.convert_all(list(_get_docs(input_doc_paths)), raises_on_error=False)

    _save_doctags(result_layout_aware, out_dir_layout_aware)
    _save_doctags(result_without_layout, out_dir_classic_vlm)
119+
120+
121+
if __name__ == "__main__":
    # Script entry point: collect the PDFs under --input, prepare one output
    # directory per pipeline, then run the side-by-side conversion demo.
    logging.basicConfig(level=logging.INFO)
    try:
        print("Starting script...")
        args = _parse_args()
        print(f"Parsed arguments: input={args.input}, output={args.output}")

        base_path = Path(args.input)

        print(f"Searching for PDFs in: {base_path}")
        # Match the extension case-insensitively and skip directories that
        # merely happen to be named like a PDF.
        input_doc_paths = sorted(
            p for p in base_path.rglob("*")
            if p.is_file() and p.suffix.lower() == ".pdf"
        )

        if not input_doc_paths:
            _log.error(f"ERROR: No PDF files found in {base_path}")
            # Nothing to convert: stop before the models are loaded.
            # SystemExit is not an Exception subclass, so the handler below
            # does not intercept it.
            raise SystemExit(1)

        print(f"Found {len(input_doc_paths)} PDF file(s):")
        for pdf_path in input_doc_paths:
            print(f"  {pdf_path}")

        out_dir_layout_aware = Path(args.output) / "layout_aware" / "model_output" / "layout" / "doc_tags"
        out_dir_classic_vlm = Path(args.output) / "classic_vlm" / "model_output" / "layout" / "doc_tags"
        out_dir_layout_aware.mkdir(parents=True, exist_ok=True)
        out_dir_classic_vlm.mkdir(parents=True, exist_ok=True)

        _log.info("Calling demo_threaded_layout_vlm_pipeline...")
        demo_threaded_layout_vlm_pipeline(input_doc_paths, out_dir_layout_aware, out_dir_classic_vlm)
        _log.info("Script completed successfully!")
    except Exception as e:
        # Print a one-line summary, then re-raise; the interpreter prints the
        # full traceback for the uncaught exception, so it is not printed here.
        print(f"ERROR: {type(e).__name__}: {e}")
        raise

0 commit comments

Comments
 (0)