|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Demo script for the new ThreadedLayoutVlmPipeline. |
| 3 | +
|
| 4 | +This script demonstrates the usage of the new pipeline that combines |
| 5 | +layout model preprocessing with VLM processing in a threaded manner. |
| 6 | +""" |
| 7 | + |
| 8 | +from pathlib import Path |
| 9 | +import argparse |
| 10 | +import logging |
| 11 | +from io import BytesIO |
| 12 | + |
| 13 | + |
| 14 | +from docling.datamodel.base_models import InputFormat |
| 15 | +from docling.datamodel.vlm_model_specs import GRANITEDOCLING_TRANSFORMERS, GRANITEDOCLING_VLLM |
| 16 | +from docling.experimental.datamodel.threaded_layout_vlm_pipeline_options import ThreadedLayoutVlmPipelineOptions |
| 17 | +from docling.document_converter import DocumentConverter, PdfFormatOption |
| 18 | +from docling.experimental.pipeline.threaded_layout_vlm_pipeline import ThreadedLayoutVlmPipeline |
| 19 | +from docling.datamodel.base_models import ConversionStatus, DocumentStream |
| 20 | +from docling.pipeline.vlm_pipeline import VlmPipeline |
| 21 | +from docling.datamodel.pipeline_options import VlmPipelineOptions |
| 22 | + |
| 23 | + |
| 24 | +_log = logging.getLogger(__name__) |
| 25 | + |
| 26 | + |
| 27 | +def _parse_args(): |
| 28 | + parser = argparse.ArgumentParser(description='Demo script for the new ThreadedLayoutVlmPipeline') |
| 29 | + parser.add_argument( |
| 30 | + '--input', |
| 31 | + type=str, |
| 32 | + required=True, |
| 33 | + help='Input directory containing PDF files' |
| 34 | + ) |
| 35 | + parser.add_argument( |
| 36 | + '--output', |
| 37 | + type=str, |
| 38 | + default='../results/', |
| 39 | + help='Output directory for converted files' |
| 40 | + ) |
| 41 | + return parser.parse_args() |
| 42 | + |
| 43 | + |
def _get_docs(input_doc_paths):
    """Lazily turn each input path into an in-memory DocumentStream.

    A file is read completely into a BytesIO buffer only when its stream is
    requested from the generator. The stream is named after the path's final
    component (``path.name``).
    """
    for doc_path in input_doc_paths:
        payload = doc_path.read_bytes()
        yield DocumentStream(name=doc_path.name, stream=BytesIO(payload))
| 50 | + |
| 51 | + |
def _save_doctags(conv_results, out_dir: Path):
    """Save every non-failed conversion in *conv_results* to *out_dir* as ``<stem>.dt``."""
    for conv_result in conv_results:
        if conv_result.status == ConversionStatus.FAILURE:
            _log.error(f"Conversion failed: {conv_result.status}")
            continue

        doc_filename = conv_result.input.file.stem
        conv_result.document.save_as_doctags(out_dir / f"{doc_filename}.dt")


def demo_threaded_layout_vlm_pipeline(input_doc_paths: list[Path], out_dir_layout_aware: Path, out_dir_classic_vlm: Path):
    """Demonstrate the threaded layout+VLM pipeline next to the classic VLM pipeline.

    The same input documents are converted twice: once with
    ThreadedLayoutVlmPipeline and once with VlmPipeline. Each successful
    result is written as a DocTags file named ``<input stem>.dt``.

    Args:
        input_doc_paths: Paths of the PDF files to convert.
        out_dir_layout_aware: Existing directory for the layout-aware output.
        out_dir_classic_vlm: Existing directory for the classic VLM output.

    Failed conversions are logged and skipped; ``raises_on_error=False`` is
    passed to both converters.
    """

    # Configure pipeline options
    print("Configuring pipeline options...")
    pipeline_options_layout_aware = ThreadedLayoutVlmPipelineOptions(
        # VLM configuration - defaults to GRANITEDOCLING_TRANSFORMERS
        vlm_options=GRANITEDOCLING_TRANSFORMERS,
        # Layout configuration is left unset (original note: defaults to
        # DOCLING_LAYOUT_HERON)

        # Batch sizes for parallel processing
        layout_batch_size=2,
        vlm_batch_size=1,

        # Queue configuration
        queue_max_size=10,
        batch_timeout_seconds=1.0,

        # Layout coordinate injection
        include_layout_coordinates=True,
        coordinate_precision=1,

        # Image processing
        images_scale=2.0,
        generate_page_images=True,
    )

    # The keyword was previously misspelled ("vlm_otpions"), so the intended
    # VLM spec was never applied to the classic pipeline.
    pipeline_options_classic_vlm = VlmPipelineOptions(vlm_options=GRANITEDOCLING_VLLM)

    # Create one converter per pipeline so the outputs can be compared
    print("Initializing DocumentConverter (this may take a while - loading models)...")
    doc_converter_layout_enhanced = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=ThreadedLayoutVlmPipeline,
                pipeline_options=pipeline_options_layout_aware,
            )
        }
    )
    doc_converter_classic_vlm = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline,
                pipeline_options=pipeline_options_classic_vlm,
            ),
        }
    )

    print(f"Starting conversion of {len(input_doc_paths)} document(s)...")
    # Each converter gets its own freshly built streams from _get_docs.
    result_layout_aware = doc_converter_layout_enhanced.convert_all(list(_get_docs(input_doc_paths)), raises_on_error=False)
    result_without_layout = doc_converter_classic_vlm.convert_all(list(_get_docs(input_doc_paths)), raises_on_error=False)

    _save_doctags(result_layout_aware, out_dir_layout_aware)
    _save_doctags(result_without_layout, out_dir_classic_vlm)
| 119 | + |
| 120 | + |
if __name__ == "__main__":
    # Script entry point: collect PDFs under --input, prepare the two output
    # directories under --output, then run the pipeline comparison demo.
    logging.basicConfig(level=logging.INFO)
    try:
        print("Starting script...")
        args = _parse_args()
        print(f"Parsed arguments: input={args.input}, output={args.output}")

        base_path = Path(args.input)

        print(f"Searching for PDFs in: {base_path}")
        # Keep regular files only: rglob("*.*") also matches directories whose
        # name contains a dot, and those cannot be read as documents. The
        # extension check is case-insensitive so ".pdf", ".PDF", ".Pdf" all match.
        input_doc_paths = sorted(
            p for p in base_path.rglob("*.*")
            if p.is_file() and p.name.lower().endswith(".pdf")
        )

        if not input_doc_paths:
            _log.error(f"ERROR: No PDF files found in {base_path}")
            # Stop here instead of loading models just to convert nothing.
            # SystemExit is not an Exception subclass, so the handler below
            # does not intercept it.
            raise SystemExit(1)

        print(f"Found {len(input_doc_paths)} PDF file(s):")

        out_dir_layout_aware = Path(args.output) / "layout_aware" / "model_output" / "layout" / "doc_tags"
        out_dir_classic_vlm = Path(args.output) / "classic_vlm" / "model_output" / "layout" / "doc_tags"
        out_dir_layout_aware.mkdir(parents=True, exist_ok=True)
        out_dir_classic_vlm.mkdir(parents=True, exist_ok=True)

        _log.info("Calling demo_threaded_layout_vlm_pipeline...")
        demo_threaded_layout_vlm_pipeline(input_doc_paths, out_dir_layout_aware, out_dir_classic_vlm)
        _log.info("Script completed successfully!")
    except Exception as e:
        # Top-level boundary: report the failure, then re-raise so the
        # process still exits with an error.
        print(f"ERROR: {type(e).__name__}: {e}")
        import traceback
        traceback.print_exc()
        raise
0 commit comments