Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 119 additions & 36 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,12 +103,17 @@ from doctra.parsers.structured_pdf_parser import StructuredPDFParser
parser = StructuredPDFParser()

# Parser with VLM for structured data extraction
parser = StructuredPDFParser(
use_vlm=True,
from doctra.engines.vlm.service import VLMStructuredExtractor

# Initialize VLM engine
vlm_engine = VLMStructuredExtractor(
vlm_provider="openai", # or "gemini", "anthropic", "openrouter", "qianfan", "ollama"
vlm_api_key="your_api_key_here"
api_key="your_api_key_here"
)

# Pass VLM engine to parser
parser = StructuredPDFParser(vlm=vlm_engine)

# Parse document
parser.parse("document.pdf")
```
Expand Down Expand Up @@ -141,6 +146,40 @@ paddle_ocr = PaddleOCREngine(
use_textline_orientation=False # Text line orientation
)
parser = StructuredPDFParser(ocr_engine=paddle_ocr)

# Option 4: Reuse OCR engine across multiple parsers
shared_ocr = PytesseractOCREngine(lang="eng", psm=6, oem=3)
parser1 = StructuredPDFParser(ocr_engine=shared_ocr)
parser2 = EnhancedPDFParser(ocr_engine=shared_ocr) # Reuse same instance
```

#### VLM Engine Configuration:

Doctra uses the same dependency injection pattern for VLM engines. You initialize the VLM engine externally and pass it to the parser:

```python
from doctra.parsers.structured_pdf_parser import StructuredPDFParser
from doctra.engines.vlm.service import VLMStructuredExtractor
from doctra import EnhancedPDFParser, ChartTablePDFParser  # needed for Option 3 below

# Option 1: No VLM (default)
parser = StructuredPDFParser() # VLM processing disabled

# Option 2: Initialize VLM engine and pass to parser
vlm_engine = VLMStructuredExtractor(
vlm_provider="openai", # or "gemini", "anthropic", "openrouter", "qianfan", "ollama"
vlm_model="gpt-5", # Optional, uses default if None
api_key="your_api_key"
)
parser = StructuredPDFParser(vlm=vlm_engine)

# Option 3: Reuse VLM engine across multiple parsers
shared_vlm = VLMStructuredExtractor(
vlm_provider="gemini",
api_key="your_api_key"
)
parser1 = StructuredPDFParser(vlm=shared_vlm)
parser2 = EnhancedPDFParser(vlm=shared_vlm) # Reuse same instance
parser3 = ChartTablePDFParser(vlm=shared_vlm) # Reuse same instance
```

#### Advanced Configuration:
Expand All @@ -156,12 +195,18 @@ ocr_engine = PytesseractOCREngine(
extra_config=""
)

parser = StructuredPDFParser(
# VLM Settings
use_vlm=True,
# Initialize VLM engine
from doctra.engines.vlm.service import VLMStructuredExtractor

vlm_engine = VLMStructuredExtractor(
vlm_provider="openai",
vlm_model="gpt-5",
vlm_api_key="your_api_key",
vlm_model="gpt-5", # Optional, uses default if None
api_key="your_api_key"
)

parser = StructuredPDFParser(
# VLM Engine (pass the initialized engine)
vlm=vlm_engine, # or None to disable VLM

# Layout Detection Settings
layout_model_name="PP-DocLayout_plus-L",
Expand Down Expand Up @@ -227,18 +272,24 @@ ocr_engine = PytesseractOCREngine(
oem=3
)

# Initialize VLM engine
from doctra.engines.vlm.service import VLMStructuredExtractor

vlm_engine = VLMStructuredExtractor(
vlm_provider="openai",
vlm_model="gpt-4-vision", # Optional, uses default if None
api_key="your_api_key"
)

parser = EnhancedPDFParser(
# Image Restoration Settings
use_image_restoration=True,
restoration_task="dewarping", # Correct perspective distortion
restoration_device="cuda", # Use GPU for faster processing
restoration_dpi=300, # Higher DPI for better quality

# VLM Settings
use_vlm=True,
vlm_provider="openai",
vlm_model="gpt-4-vision",
vlm_api_key="your_api_key",
# VLM Engine (pass the initialized engine)
vlm=vlm_engine, # or None to disable VLM

# Layout Detection Settings
layout_model_name="PP-DocLayout_plus-L",
Expand Down Expand Up @@ -296,16 +347,22 @@ parser.parse("document.pdf", output_base_dir="my_outputs")
#### Advanced Configuration:

```python
# Initialize VLM engine
from doctra.engines.vlm.service import VLMStructuredExtractor

vlm_engine = VLMStructuredExtractor(
vlm_provider="openai",
vlm_model="gpt-5", # Optional, uses default if None
api_key="your_api_key"
)

parser = ChartTablePDFParser(
# Extraction Settings
extract_charts=True,
extract_tables=True,

# VLM Settings
use_vlm=True,
vlm_provider="openai",
vlm_model="gpt-5",
vlm_api_key="your_api_key",
# VLM Engine (pass the initialized engine)
vlm=vlm_engine, # or None to disable VLM

# Layout Detection Settings
layout_model_name="PP-DocLayout_plus-L",
Expand Down Expand Up @@ -347,12 +404,18 @@ parser.parse("document.docx")
#### Advanced Configuration with VLM:

```python
# Initialize VLM engine
from doctra.engines.vlm.service import VLMStructuredExtractor

vlm_engine = VLMStructuredExtractor(
vlm_provider="openai", # or "gemini", "anthropic", "openrouter", "qianfan", "ollama"
vlm_model="gpt-4-vision", # Optional, uses default if None
api_key="your_api_key"
)

parser = StructuredDOCXParser(
# VLM Settings
use_vlm=True,
vlm_provider="openai", # or "gemini", "anthropic", "openrouter"
vlm_model="gpt-4-vision",
vlm_api_key="your_api_key",
# VLM Engine (pass the initialized engine)
vlm=vlm_engine, # or None to disable VLM

# Processing Options
extract_images=True,
Expand Down Expand Up @@ -682,15 +745,21 @@ from doctra.engines.ocr import PytesseractOCREngine
# Initialize OCR engine (optional - defaults to PyTesseract if not provided)
ocr_engine = PytesseractOCREngine(lang="eng", psm=4, oem=3)

# Initialize VLM engine
from doctra.engines.vlm.service import VLMStructuredExtractor

vlm_engine = VLMStructuredExtractor(
vlm_provider="openai",
api_key="your_api_key"
)

# Initialize enhanced parser with image restoration
parser = EnhancedPDFParser(
use_image_restoration=True,
restoration_task="dewarping", # Correct perspective distortion
restoration_device="cuda", # Use GPU for faster processing
ocr_engine=ocr_engine, # Pass OCR engine instance
use_vlm=True,
vlm_provider="openai",
vlm_api_key="your_api_key"
vlm=vlm_engine # Pass VLM engine instance
)

# Process scanned document with enhancement
Expand Down Expand Up @@ -780,12 +849,18 @@ parser.parse("report.docx")
```python
from doctra.parsers.structured_docx_parser import StructuredDOCXParser

# Initialize VLM engine
from doctra.engines.vlm.service import VLMStructuredExtractor

vlm_engine = VLMStructuredExtractor(
vlm_provider="openai",
vlm_model="gpt-4-vision", # Optional, uses default if None
api_key="your_api_key"
)

# DOCX parsing with VLM for enhanced analysis
parser = StructuredDOCXParser(
use_vlm=True,
vlm_provider="openai",
vlm_model="gpt-4-vision",
vlm_api_key="your_api_key",
vlm=vlm_engine, # Pass VLM engine instance
extract_images=True,
preserve_formatting=True,
table_detection=True,
Expand All @@ -807,13 +882,19 @@ parser.parse("financial_report.docx")
```python
from doctra.parsers.table_chart_extractor import ChartTablePDFParser

# Initialize VLM engine
from doctra.engines.vlm.service import VLMStructuredExtractor

vlm_engine = VLMStructuredExtractor(
vlm_provider="openai",
api_key="your_api_key"
)

# Initialize parser with VLM
parser = ChartTablePDFParser(
extract_charts=True,
extract_tables=True,
use_vlm=True,
vlm_provider="openai",
vlm_api_key="your_api_key"
vlm=vlm_engine # Pass VLM engine instance
)

# Process document
Expand Down Expand Up @@ -919,9 +1000,11 @@ parser.display_pages_with_boxes("document.pdf")
- **Flexible Processing**: Standalone image restoration or integrated with parsing

### 🤖 VLM Integration
- Vision Language Model support for structured data extraction
- Multiple provider options (OpenAI, Gemini, Anthropic, OpenRouter, Qianfan, Ollama)
- Automatic conversion of charts and tables to structured formats
- **Dependency Injection Pattern**: Initialize VLM engines externally and pass them to parsers for a clearer API
- **Vision Language Model Support**: Structured data extraction from visual elements
- **Multiple Provider Options**: OpenAI, Gemini, Anthropic, OpenRouter, Qianfan, Ollama
- **Reusable Engines**: Create VLM engine instances once and reuse across multiple parsers
- **Automatic Conversion**: Charts and tables converted to structured formats (Excel, HTML, JSON)

### 📊 Multiple Output Formats
- **Markdown**: Human-readable document with embedded images and tables
Expand Down
82 changes: 63 additions & 19 deletions docs/api/parsers.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,18 @@ Comprehensive parser for Microsoft Word documents (.docx files).
```python
from doctra import StructuredPDFParser
from doctra.engines.ocr import PytesseractOCREngine, PaddleOCREngine
from doctra.engines.vlm.service import VLMStructuredExtractor

# Initialize OCR engine (optional - defaults to PyTesseract if None)
ocr_engine = PytesseractOCREngine(lang="eng", psm=4, oem=3)

# Initialize VLM engine (optional - None to disable VLM)
vlm_engine = VLMStructuredExtractor(
vlm_provider="openai",
vlm_model="gpt-4o", # Optional
api_key="your-api-key"
)

parser = StructuredPDFParser(
# Layout Detection
layout_model_name: str = "PP-DocLayout_plus-L",
Expand All @@ -70,11 +78,8 @@ parser = StructuredPDFParser(
# OCR Engine (pass initialized engine instance)
ocr_engine: Optional[Union[PytesseractOCREngine, PaddleOCREngine]] = None,

# VLM Settings
use_vlm: bool = False,
vlm_provider: str = None,
vlm_api_key: str = None,
vlm_model: str = None,
# VLM Engine (pass initialized engine instance)
vlm: Optional[VLMStructuredExtractor] = None,

# Split Table Merging
merge_split_tables: bool = False,
Expand Down Expand Up @@ -109,6 +114,13 @@ parser.display_pages_with_boxes(

```python
from doctra import EnhancedPDFParser
from doctra.engines.vlm.service import VLMStructuredExtractor

# Initialize VLM engine (optional)
vlm_engine = VLMStructuredExtractor(
vlm_provider="openai",
api_key="your-api-key"
)

parser = EnhancedPDFParser(
# Image Restoration
Expand All @@ -117,6 +129,9 @@ parser = EnhancedPDFParser(
restoration_device: str = None,
restoration_dpi: int = 200,

# VLM Engine (pass initialized engine instance)
vlm: Optional[VLMStructuredExtractor] = None,

# All StructuredPDFParser parameters...
)

Expand All @@ -131,17 +146,21 @@ parser.parse(

```python
from doctra import ChartTablePDFParser
from doctra.engines.vlm.service import VLMStructuredExtractor

# Initialize VLM engine (optional)
vlm_engine = VLMStructuredExtractor(
vlm_provider="openai",
api_key="your-api-key"
)

parser = ChartTablePDFParser(
# Extraction Settings
extract_charts: bool = True,
extract_tables: bool = True,

# VLM Settings
use_vlm: bool = False,
vlm_provider: str = None,
vlm_api_key: str = None,
vlm_model: str = None,
# VLM Engine (pass initialized engine instance)
vlm: Optional[VLMStructuredExtractor] = None,

# Layout Detection
layout_model_name: str = "PP-DocLayout_plus-L",
Expand All @@ -160,13 +179,17 @@ parser.parse(

```python
from doctra import StructuredDOCXParser
from doctra.engines.vlm.service import VLMStructuredExtractor

# Initialize VLM engine (optional)
vlm_engine = VLMStructuredExtractor(
vlm_provider="openai",
api_key="your-api-key"
)

parser = StructuredDOCXParser(
# VLM Settings
use_vlm: bool = False,
vlm_provider: str = None,
vlm_api_key: str = None,
vlm_model: str = None,
# VLM Engine (pass initialized engine instance)
vlm: Optional[VLMStructuredExtractor] = None,

# Processing Options
extract_images: bool = True,
Expand Down Expand Up @@ -232,10 +255,31 @@ parser = StructuredPDFParser(ocr_engine=paddle_ocr)

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `use_vlm` | bool | False | Enable VLM processing |
| `vlm_provider` | str | None | Provider: "openai", "gemini", "anthropic", "openrouter" |
| `vlm_api_key` | str | None | API key for the VLM provider |
| `vlm_model` | str | None | Specific model to use (provider-dependent) |
| `vlm` | `Optional[VLMStructuredExtractor]` | `None` | VLM engine instance. If `None`, VLM processing is disabled. |

**VLM Engine Configuration:**

VLM engines must be initialized externally and passed to the parser. This dependency injection pattern keeps the parser API clear and lets a single engine instance be reused across multiple parsers.

**VLMStructuredExtractor Parameters:**
- `vlm_provider` (str, required): VLM provider to use ("openai", "gemini", "anthropic", "openrouter", "qianfan", "ollama")
- `vlm_model` (str, optional): Model name to use (defaults to provider-specific defaults)
- `api_key` (str, optional): API key for the VLM provider (required for all providers except Ollama)

**Example:**
```python
from doctra import StructuredPDFParser
from doctra.engines.vlm.service import VLMStructuredExtractor

# Initialize VLM engine
vlm_engine = VLMStructuredExtractor(
vlm_provider="openai",
vlm_model="gpt-4o", # Optional
api_key="your-api-key"
)

# Pass to parser
parser = StructuredPDFParser(vlm=vlm_engine)
```

### Image Restoration Parameters

Expand Down
Loading