Skip to content

Commit 2b301c9

Browse files
Merge pull request #78 from AdemBoukhris457/refactor/vlm_engine_dependency
refactor VLM configuration to use dependency pattern
2 parents af743aa + 32b1234 commit 2b301c9

17 files changed

+555
-254
lines changed

README.md

Lines changed: 119 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -103,12 +103,17 @@ from doctra.parsers.structured_pdf_parser import StructuredPDFParser
103103
parser = StructuredPDFParser()
104104

105105
# Parser with VLM for structured data extraction
106-
parser = StructuredPDFParser(
107-
use_vlm=True,
106+
from doctra.engines.vlm.service import VLMStructuredExtractor
107+
108+
# Initialize VLM engine
109+
vlm_engine = VLMStructuredExtractor(
108110
vlm_provider="openai", # or "gemini", "anthropic", "openrouter", "qianfan", "ollama"
109-
vlm_api_key="your_api_key_here"
111+
api_key="your_api_key_here"
110112
)
111113

114+
# Pass VLM engine to parser
115+
parser = StructuredPDFParser(vlm=vlm_engine)
116+
112117
# Parse document
113118
parser.parse("document.pdf")
114119
```
@@ -141,6 +146,40 @@ paddle_ocr = PaddleOCREngine(
141146
use_textline_orientation=False # Text line orientation
142147
)
143148
parser = StructuredPDFParser(ocr_engine=paddle_ocr)
149+
150+
# Option 4: Reuse OCR engine across multiple parsers
151+
shared_ocr = PytesseractOCREngine(lang="eng", psm=6, oem=3)
152+
parser1 = StructuredPDFParser(ocr_engine=shared_ocr)
153+
parser2 = EnhancedPDFParser(ocr_engine=shared_ocr) # Reuse same instance
154+
```
155+
156+
#### VLM Engine Configuration:
157+
158+
Doctra uses the same dependency injection pattern for VLM engines as it does for OCR engines: you initialize the VLM engine externally and pass it to the parser:
159+
160+
```python
161+
from doctra.parsers.structured_pdf_parser import StructuredPDFParser
162+
from doctra.engines.vlm.service import VLMStructuredExtractor
163+
164+
# Option 1: No VLM (default)
165+
parser = StructuredPDFParser() # VLM processing disabled
166+
167+
# Option 2: Initialize VLM engine and pass to parser
168+
vlm_engine = VLMStructuredExtractor(
169+
vlm_provider="openai", # or "gemini", "anthropic", "openrouter", "qianfan", "ollama"
170+
vlm_model="gpt-5", # Optional, uses default if None
171+
api_key="your_api_key"
172+
)
173+
parser = StructuredPDFParser(vlm=vlm_engine)
174+
175+
# Option 3: Reuse VLM engine across multiple parsers
176+
shared_vlm = VLMStructuredExtractor(
177+
vlm_provider="gemini",
178+
api_key="your_api_key"
179+
)
180+
parser1 = StructuredPDFParser(vlm=shared_vlm)
181+
parser2 = EnhancedPDFParser(vlm=shared_vlm) # Reuse same instance
182+
parser3 = ChartTablePDFParser(vlm=shared_vlm) # Reuse same instance
144183
```
145184

146185
#### Advanced Configuration:
@@ -156,12 +195,18 @@ ocr_engine = PytesseractOCREngine(
156195
extra_config=""
157196
)
158197

159-
parser = StructuredPDFParser(
160-
# VLM Settings
161-
use_vlm=True,
198+
# Initialize VLM engine
199+
from doctra.engines.vlm.service import VLMStructuredExtractor
200+
201+
vlm_engine = VLMStructuredExtractor(
162202
vlm_provider="openai",
163-
vlm_model="gpt-5",
164-
vlm_api_key="your_api_key",
203+
vlm_model="gpt-5", # Optional, uses default if None
204+
api_key="your_api_key"
205+
)
206+
207+
parser = StructuredPDFParser(
208+
# VLM Engine (pass the initialized engine)
209+
vlm=vlm_engine, # or None to disable VLM
165210

166211
# Layout Detection Settings
167212
layout_model_name="PP-DocLayout_plus-L",
@@ -227,18 +272,24 @@ ocr_engine = PytesseractOCREngine(
227272
oem=3
228273
)
229274

275+
# Initialize VLM engine
276+
from doctra.engines.vlm.service import VLMStructuredExtractor
277+
278+
vlm_engine = VLMStructuredExtractor(
279+
vlm_provider="openai",
280+
vlm_model="gpt-4-vision", # Optional, uses default if None
281+
api_key="your_api_key"
282+
)
283+
230284
parser = EnhancedPDFParser(
231285
# Image Restoration Settings
232286
use_image_restoration=True,
233287
restoration_task="dewarping", # Correct perspective distortion
234288
restoration_device="cuda", # Use GPU for faster processing
235289
restoration_dpi=300, # Higher DPI for better quality
236290

237-
# VLM Settings
238-
use_vlm=True,
239-
vlm_provider="openai",
240-
vlm_model="gpt-4-vision",
241-
vlm_api_key="your_api_key",
291+
# VLM Engine (pass the initialized engine)
292+
vlm=vlm_engine, # or None to disable VLM
242293

243294
# Layout Detection Settings
244295
layout_model_name="PP-DocLayout_plus-L",
@@ -296,16 +347,22 @@ parser.parse("document.pdf", output_base_dir="my_outputs")
296347
#### Advanced Configuration:
297348

298349
```python
350+
# Initialize VLM engine
351+
from doctra.engines.vlm.service import VLMStructuredExtractor
352+
353+
vlm_engine = VLMStructuredExtractor(
354+
vlm_provider="openai",
355+
vlm_model="gpt-5", # Optional, uses default if None
356+
api_key="your_api_key"
357+
)
358+
299359
parser = ChartTablePDFParser(
300360
# Extraction Settings
301361
extract_charts=True,
302362
extract_tables=True,
303363

304-
# VLM Settings
305-
use_vlm=True,
306-
vlm_provider="openai",
307-
vlm_model="gpt-5",
308-
vlm_api_key="your_api_key",
364+
# VLM Engine (pass the initialized engine)
365+
vlm=vlm_engine, # or None to disable VLM
309366

310367
# Layout Detection Settings
311368
layout_model_name="PP-DocLayout_plus-L",
@@ -347,12 +404,18 @@ parser.parse("document.docx")
347404
#### Advanced Configuration with VLM:
348405

349406
```python
407+
# Initialize VLM engine
408+
from doctra.engines.vlm.service import VLMStructuredExtractor
409+
410+
vlm_engine = VLMStructuredExtractor(
411+
vlm_provider="openai", # or "gemini", "anthropic", "openrouter", "qianfan", "ollama"
412+
vlm_model="gpt-4-vision", # Optional, uses default if None
413+
api_key="your_api_key"
414+
)
415+
350416
parser = StructuredDOCXParser(
351-
# VLM Settings
352-
use_vlm=True,
353-
vlm_provider="openai", # or "gemini", "anthropic", "openrouter"
354-
vlm_model="gpt-4-vision",
355-
vlm_api_key="your_api_key",
417+
# VLM Engine (pass the initialized engine)
418+
vlm=vlm_engine, # or None to disable VLM
356419

357420
# Processing Options
358421
extract_images=True,
@@ -682,15 +745,21 @@ from doctra.engines.ocr import PytesseractOCREngine
682745
# Initialize OCR engine (optional - defaults to PyTesseract if not provided)
683746
ocr_engine = PytesseractOCREngine(lang="eng", psm=4, oem=3)
684747

748+
# Initialize VLM engine
749+
from doctra.engines.vlm.service import VLMStructuredExtractor
750+
751+
vlm_engine = VLMStructuredExtractor(
752+
vlm_provider="openai",
753+
api_key="your_api_key"
754+
)
755+
685756
# Initialize enhanced parser with image restoration
686757
parser = EnhancedPDFParser(
687758
use_image_restoration=True,
688759
restoration_task="dewarping", # Correct perspective distortion
689760
restoration_device="cuda", # Use GPU for faster processing
690761
ocr_engine=ocr_engine, # Pass OCR engine instance
691-
use_vlm=True,
692-
vlm_provider="openai",
693-
vlm_api_key="your_api_key"
762+
vlm=vlm_engine # Pass VLM engine instance
694763
)
695764

696765
# Process scanned document with enhancement
@@ -780,12 +849,18 @@ parser.parse("report.docx")
780849
```python
781850
from doctra.parsers.structured_docx_parser import StructuredDOCXParser
782851

852+
# Initialize VLM engine
853+
from doctra.engines.vlm.service import VLMStructuredExtractor
854+
855+
vlm_engine = VLMStructuredExtractor(
856+
vlm_provider="openai",
857+
vlm_model="gpt-4-vision", # Optional, uses default if None
858+
api_key="your_api_key"
859+
)
860+
783861
# DOCX parsing with VLM for enhanced analysis
784862
parser = StructuredDOCXParser(
785-
use_vlm=True,
786-
vlm_provider="openai",
787-
vlm_model="gpt-4-vision",
788-
vlm_api_key="your_api_key",
863+
vlm=vlm_engine, # Pass VLM engine instance
789864
extract_images=True,
790865
preserve_formatting=True,
791866
table_detection=True,
@@ -807,13 +882,19 @@ parser.parse("financial_report.docx")
807882
```python
808883
from doctra.parsers.table_chart_extractor import ChartTablePDFParser
809884

885+
# Initialize VLM engine
886+
from doctra.engines.vlm.service import VLMStructuredExtractor
887+
888+
vlm_engine = VLMStructuredExtractor(
889+
vlm_provider="openai",
890+
api_key="your_api_key"
891+
)
892+
810893
# Initialize parser with VLM
811894
parser = ChartTablePDFParser(
812895
extract_charts=True,
813896
extract_tables=True,
814-
use_vlm=True,
815-
vlm_provider="openai",
816-
vlm_api_key="your_api_key"
897+
vlm=vlm_engine # Pass VLM engine instance
817898
)
818899

819900
# Process document
@@ -919,9 +1000,11 @@ parser.display_pages_with_boxes("document.pdf")
9191000
- **Flexible Processing**: Standalone image restoration or integrated with parsing
9201001

9211002
### 🤖 VLM Integration
922-
- Vision Language Model support for structured data extraction
923-
- Multiple provider options (OpenAI, Gemini, Anthropic, OpenRouter, Qianfan, Ollama)
924-
- Automatic conversion of charts and tables to structured formats
1003+
- **Dependency Injection Pattern**: Initialize VLM engines externally and pass them to parsers for a clearer API
1004+
- **Vision Language Model Support**: Structured data extraction from visual elements
1005+
- **Multiple Provider Options**: OpenAI, Gemini, Anthropic, OpenRouter, Qianfan, Ollama
1006+
- **Reusable Engines**: Create VLM engine instances once and reuse across multiple parsers
1007+
- **Automatic Conversion**: Charts and tables converted to structured formats (Excel, HTML, JSON)
9251008

9261009
### 📊 Multiple Output Formats
9271010
- **Markdown**: Human-readable document with embedded images and tables

docs/api/parsers.md

Lines changed: 63 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -57,10 +57,18 @@ Comprehensive parser for Microsoft Word documents (.docx files).
5757
```python
5858
from doctra import StructuredPDFParser
5959
from doctra.engines.ocr import PytesseractOCREngine, PaddleOCREngine
60+
from doctra.engines.vlm.service import VLMStructuredExtractor
6061

6162
# Initialize OCR engine (optional - defaults to PyTesseract if None)
6263
ocr_engine = PytesseractOCREngine(lang="eng", psm=4, oem=3)
6364

65+
# Initialize VLM engine (optional - None to disable VLM)
66+
vlm_engine = VLMStructuredExtractor(
67+
vlm_provider="openai",
68+
vlm_model="gpt-4o", # Optional
69+
api_key="your-api-key"
70+
)
71+
6472
parser = StructuredPDFParser(
6573
# Layout Detection
6674
layout_model_name: str = "PP-DocLayout_plus-L",
@@ -70,11 +78,8 @@ parser = StructuredPDFParser(
7078
# OCR Engine (pass initialized engine instance)
7179
ocr_engine: Optional[Union[PytesseractOCREngine, PaddleOCREngine]] = None,
7280

73-
# VLM Settings
74-
use_vlm: bool = False,
75-
vlm_provider: str = None,
76-
vlm_api_key: str = None,
77-
vlm_model: str = None,
81+
# VLM Engine (pass initialized engine instance)
82+
vlm: Optional[VLMStructuredExtractor] = None,
7883

7984
# Split Table Merging
8085
merge_split_tables: bool = False,
@@ -109,6 +114,13 @@ parser.display_pages_with_boxes(
109114

110115
```python
111116
from doctra import EnhancedPDFParser
117+
from doctra.engines.vlm.service import VLMStructuredExtractor
118+
119+
# Initialize VLM engine (optional)
120+
vlm_engine = VLMStructuredExtractor(
121+
vlm_provider="openai",
122+
api_key="your-api-key"
123+
)
112124

113125
parser = EnhancedPDFParser(
114126
# Image Restoration
@@ -117,6 +129,9 @@ parser = EnhancedPDFParser(
117129
restoration_device: str = None,
118130
restoration_dpi: int = 200,
119131

132+
# VLM Engine (pass initialized engine instance)
133+
vlm: Optional[VLMStructuredExtractor] = None,
134+
120135
# All StructuredPDFParser parameters...
121136
)
122137

@@ -131,17 +146,21 @@ parser.parse(
131146

132147
```python
133148
from doctra import ChartTablePDFParser
149+
from doctra.engines.vlm.service import VLMStructuredExtractor
150+
151+
# Initialize VLM engine (optional)
152+
vlm_engine = VLMStructuredExtractor(
153+
vlm_provider="openai",
154+
api_key="your-api-key"
155+
)
134156

135157
parser = ChartTablePDFParser(
136158
# Extraction Settings
137159
extract_charts: bool = True,
138160
extract_tables: bool = True,
139161

140-
# VLM Settings
141-
use_vlm: bool = False,
142-
vlm_provider: str = None,
143-
vlm_api_key: str = None,
144-
vlm_model: str = None,
162+
# VLM Engine (pass initialized engine instance)
163+
vlm: Optional[VLMStructuredExtractor] = None,
145164

146165
# Layout Detection
147166
layout_model_name: str = "PP-DocLayout_plus-L",
@@ -160,13 +179,17 @@ parser.parse(
160179

161180
```python
162181
from doctra import StructuredDOCXParser
182+
from doctra.engines.vlm.service import VLMStructuredExtractor
183+
184+
# Initialize VLM engine (optional)
185+
vlm_engine = VLMStructuredExtractor(
186+
vlm_provider="openai",
187+
api_key="your-api-key"
188+
)
163189

164190
parser = StructuredDOCXParser(
165-
# VLM Settings
166-
use_vlm: bool = False,
167-
vlm_provider: str = None,
168-
vlm_api_key: str = None,
169-
vlm_model: str = None,
191+
# VLM Engine (pass initialized engine instance)
192+
vlm: Optional[VLMStructuredExtractor] = None,
170193

171194
# Processing Options
172195
extract_images: bool = True,
@@ -232,10 +255,31 @@ parser = StructuredPDFParser(ocr_engine=paddle_ocr)
232255

233256
| Parameter | Type | Default | Description |
234257
|-----------|------|---------|-------------|
235-
| `use_vlm` | bool | False | Enable VLM processing |
236-
| `vlm_provider` | str | None | Provider: "openai", "gemini", "anthropic", "openrouter" |
237-
| `vlm_api_key` | str | None | API key for the VLM provider |
238-
| `vlm_model` | str | None | Specific model to use (provider-dependent) |
258+
| `vlm` | `Optional[VLMStructuredExtractor]` | `None` | VLM engine instance. If `None`, VLM processing is disabled. |
259+
260+
**VLM Engine Configuration:**
261+
262+
VLM engines must be initialized externally and passed to the parser. This uses a dependency injection pattern for clearer API design.
263+
264+
**VLMStructuredExtractor Parameters:**
265+
- `vlm_provider` (str, required): VLM provider to use ("openai", "gemini", "anthropic", "openrouter", "qianfan", "ollama")
266+
- `vlm_model` (str, optional): Model name to use (defaults to provider-specific defaults)
267+
- `api_key` (str, optional): API key for the VLM provider. It may be omitted only for Ollama; all other providers require it.
268+
269+
**Example:**
270+
```python
271+
from doctra.engines.vlm.service import VLMStructuredExtractor
272+
273+
# Initialize VLM engine
274+
vlm_engine = VLMStructuredExtractor(
275+
vlm_provider="openai",
276+
vlm_model="gpt-4o", # Optional
277+
api_key="your-api-key"
278+
)
279+
280+
# Pass to parser
281+
parser = StructuredPDFParser(vlm=vlm_engine)
282+
```
239283

240284
### Image Restoration Parameters
241285

0 commit comments

Comments
 (0)