@@ -103,12 +103,17 @@ from doctra.parsers.structured_pdf_parser import StructuredPDFParser
103103parser = StructuredPDFParser()
104104
105105# Parser with VLM for structured data extraction
106- parser = StructuredPDFParser(
107- use_vlm = True ,
106+ from doctra.engines.vlm.service import VLMStructuredExtractor
107+
108+ # Initialize VLM engine
109+ vlm_engine = VLMStructuredExtractor(
108110 vlm_provider = " openai" , # or "gemini", "anthropic", "openrouter", "qianfan", "ollama"
109- vlm_api_key = " your_api_key_here"
111+ api_key = " your_api_key_here"
110112)
111113
114+ # Pass VLM engine to parser
115+ parser = StructuredPDFParser(vlm = vlm_engine)
116+
112117# Parse document
113118parser.parse(" document.pdf" )
114119```
@@ -141,6 +146,40 @@ paddle_ocr = PaddleOCREngine(
141146 use_textline_orientation = False # Text line orientation
142147)
143148parser = StructuredPDFParser(ocr_engine = paddle_ocr)
149+
150+ # Option 4: Reuse OCR engine across multiple parsers
151+ shared_ocr = PytesseractOCREngine(lang = " eng" , psm = 6 , oem = 3 )
152+ parser1 = StructuredPDFParser(ocr_engine = shared_ocr)
153+ parser2 = EnhancedPDFParser(ocr_engine = shared_ocr) # Reuse same instance
154+ ```
155+
156+ #### VLM Engine Configuration:
157+
158+ Doctra uses the same dependency injection pattern for VLM engines as it does for OCR engines. You initialize the VLM engine externally and pass it to the parser:
159+
160+ ``` python
161+ from doctra.parsers.structured_pdf_parser import StructuredPDFParser
162+ from doctra.engines.vlm.service import VLMStructuredExtractor
163+
164+ # Option 1: No VLM (default)
165+ parser = StructuredPDFParser() # VLM processing disabled
166+
167+ # Option 2: Initialize VLM engine and pass to parser
168+ vlm_engine = VLMStructuredExtractor(
169+ vlm_provider = " openai" , # or "gemini", "anthropic", "openrouter", "qianfan", "ollama"
170+ vlm_model = " gpt-5" , # Optional, uses default if None
171+ api_key = " your_api_key"
172+ )
173+ parser = StructuredPDFParser(vlm = vlm_engine)
174+
175+ # Option 3: Reuse VLM engine across multiple parsers (EnhancedPDFParser and ChartTablePDFParser must also be imported)
176+ shared_vlm = VLMStructuredExtractor(
177+ vlm_provider = " gemini" ,
178+ api_key = " your_api_key"
179+ )
180+ parser1 = StructuredPDFParser(vlm = shared_vlm)
181+ parser2 = EnhancedPDFParser(vlm = shared_vlm) # Reuse same instance
182+ parser3 = ChartTablePDFParser(vlm = shared_vlm) # Reuse same instance
144183```
145184
146185#### Advanced Configuration:
@@ -156,12 +195,18 @@ ocr_engine = PytesseractOCREngine(
156195 extra_config = " "
157196)
158197
159- parser = StructuredPDFParser(
160- # VLM Settings
161- use_vlm = True ,
198+ # Initialize VLM engine
199+ from doctra.engines.vlm.service import VLMStructuredExtractor
200+
201+ vlm_engine = VLMStructuredExtractor(
162202 vlm_provider = " openai" ,
163- vlm_model = " gpt-5" ,
164- vlm_api_key = " your_api_key" ,
203+ vlm_model = " gpt-5" , # Optional, uses default if None
204+ api_key = " your_api_key"
205+ )
206+
207+ parser = StructuredPDFParser(
208+ # VLM Engine (pass the initialized engine)
209+ vlm = vlm_engine, # or None to disable VLM
165210
166211 # Layout Detection Settings
167212 layout_model_name = " PP-DocLayout_plus-L" ,
@@ -227,18 +272,24 @@ ocr_engine = PytesseractOCREngine(
227272 oem = 3
228273)
229274
275+ # Initialize VLM engine
276+ from doctra.engines.vlm.service import VLMStructuredExtractor
277+
278+ vlm_engine = VLMStructuredExtractor(
279+ vlm_provider = " openai" ,
280+ vlm_model = " gpt-4-vision" , # Optional, uses default if None
281+ api_key = " your_api_key"
282+ )
283+
230284parser = EnhancedPDFParser(
231285 # Image Restoration Settings
232286 use_image_restoration = True ,
233287 restoration_task = " dewarping" , # Correct perspective distortion
234288 restoration_device = " cuda" , # Use GPU for faster processing
235289 restoration_dpi = 300 , # Higher DPI for better quality
236290
237- # VLM Settings
238- use_vlm = True ,
239- vlm_provider = " openai" ,
240- vlm_model = " gpt-4-vision" ,
241- vlm_api_key = " your_api_key" ,
291+ # VLM Engine (pass the initialized engine)
292+ vlm = vlm_engine, # or None to disable VLM
242293
243294 # Layout Detection Settings
244295 layout_model_name = " PP-DocLayout_plus-L" ,
@@ -296,16 +347,22 @@ parser.parse("document.pdf", output_base_dir="my_outputs")
296347#### Advanced Configuration:
297348
298349``` python
350+ # Initialize VLM engine
351+ from doctra.engines.vlm.service import VLMStructuredExtractor
352+
353+ vlm_engine = VLMStructuredExtractor(
354+ vlm_provider = " openai" ,
355+ vlm_model = " gpt-5" , # Optional, uses default if None
356+ api_key = " your_api_key"
357+ )
358+
299359parser = ChartTablePDFParser(
300360 # Extraction Settings
301361 extract_charts = True ,
302362 extract_tables = True ,
303363
304- # VLM Settings
305- use_vlm = True ,
306- vlm_provider = " openai" ,
307- vlm_model = " gpt-5" ,
308- vlm_api_key = " your_api_key" ,
364+ # VLM Engine (pass the initialized engine)
365+ vlm = vlm_engine, # or None to disable VLM
309366
310367 # Layout Detection Settings
311368 layout_model_name = " PP-DocLayout_plus-L" ,
@@ -347,12 +404,18 @@ parser.parse("document.docx")
347404#### Advanced Configuration with VLM:
348405
349406``` python
407+ # Initialize VLM engine
408+ from doctra.engines.vlm.service import VLMStructuredExtractor
409+
410+ vlm_engine = VLMStructuredExtractor(
411+ vlm_provider = " openai" , # or "gemini", "anthropic", "openrouter", "qianfan", "ollama"
412+ vlm_model = " gpt-4-vision" , # Optional, uses default if None
413+ api_key = " your_api_key"
414+ )
415+
350416parser = StructuredDOCXParser(
351- # VLM Settings
352- use_vlm = True ,
353- vlm_provider = " openai" , # or "gemini", "anthropic", "openrouter"
354- vlm_model = " gpt-4-vision" ,
355- vlm_api_key = " your_api_key" ,
417+ # VLM Engine (pass the initialized engine)
418+ vlm = vlm_engine, # or None to disable VLM
356419
357420 # Processing Options
358421 extract_images = True ,
@@ -682,15 +745,21 @@ from doctra.engines.ocr import PytesseractOCREngine
682745# Initialize OCR engine (optional - defaults to PyTesseract if not provided)
683746ocr_engine = PytesseractOCREngine(lang = " eng" , psm = 4 , oem = 3 )
684747
748+ # Initialize VLM engine
749+ from doctra.engines.vlm.service import VLMStructuredExtractor
750+
751+ vlm_engine = VLMStructuredExtractor(
752+ vlm_provider = " openai" ,
753+ api_key = " your_api_key"
754+ )
755+
685756# Initialize enhanced parser with image restoration
686757parser = EnhancedPDFParser(
687758 use_image_restoration = True ,
688759 restoration_task = " dewarping" , # Correct perspective distortion
689760 restoration_device = " cuda" , # Use GPU for faster processing
690761 ocr_engine = ocr_engine, # Pass OCR engine instance
691- use_vlm = True ,
692- vlm_provider = " openai" ,
693- vlm_api_key = " your_api_key"
762+ vlm = vlm_engine # Pass VLM engine instance
694763)
695764
696765# Process scanned document with enhancement
@@ -780,12 +849,18 @@ parser.parse("report.docx")
780849``` python
781850from doctra.parsers.structured_docx_parser import StructuredDOCXParser
782851
852+ # Initialize VLM engine
853+ from doctra.engines.vlm.service import VLMStructuredExtractor
854+
855+ vlm_engine = VLMStructuredExtractor(
856+ vlm_provider = " openai" ,
857+ vlm_model = " gpt-4-vision" , # Optional, uses default if None
858+ api_key = " your_api_key"
859+ )
860+
783861# DOCX parsing with VLM for enhanced analysis
784862parser = StructuredDOCXParser(
785- use_vlm = True ,
786- vlm_provider = " openai" ,
787- vlm_model = " gpt-4-vision" ,
788- vlm_api_key = " your_api_key" ,
863+ vlm = vlm_engine, # Pass VLM engine instance
789864 extract_images = True ,
790865 preserve_formatting = True ,
791866 table_detection = True ,
@@ -807,13 +882,19 @@ parser.parse("financial_report.docx")
807882``` python
808883from doctra.parsers.table_chart_extractor import ChartTablePDFParser
809884
885+ # Initialize VLM engine
886+ from doctra.engines.vlm.service import VLMStructuredExtractor
887+
888+ vlm_engine = VLMStructuredExtractor(
889+ vlm_provider = " openai" ,
890+ api_key = " your_api_key"
891+ )
892+
810893# Initialize parser with VLM
811894parser = ChartTablePDFParser(
812895 extract_charts = True ,
813896 extract_tables = True ,
814- use_vlm = True ,
815- vlm_provider = " openai" ,
816- vlm_api_key = " your_api_key"
897+ vlm = vlm_engine # Pass VLM engine instance
817898)
818899
819900# Process document
@@ -919,9 +1000,11 @@ parser.display_pages_with_boxes("document.pdf")
9191000- ** Flexible Processing** : Standalone image restoration or integrated with parsing
9201001
9211002### 🤖 VLM Integration
922- - Vision Language Model support for structured data extraction
923- - Multiple provider options (OpenAI, Gemini, Anthropic, OpenRouter, Qianfan, Ollama)
924- - Automatic conversion of charts and tables to structured formats
1003+ - ** Dependency Injection Pattern** : Initialize VLM engines externally and pass them to parsers for a clearer API
1004+ - ** Vision Language Model Support** : Structured data extraction from visual elements
1005+ - ** Multiple Provider Options** : OpenAI, Gemini, Anthropic, OpenRouter, Qianfan, Ollama
1006+ - ** Reusable Engines** : Create VLM engine instances once and reuse across multiple parsers
1007+ - ** Automatic Conversion** : Charts and tables converted to structured formats (Excel, HTML, JSON)
9251008
9261009### 📊 Multiple Output Formats
9271010- ** Markdown** : Human-readable document with embedded images and tables
0 commit comments