Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
d717d46
Revise MCPTool usage example for Streamable HTTP
Hansehart Sep 26, 2025
c12dd25
Clarify connection types in MCPToolset documentation
Hansehart Sep 26, 2025
2b58a33
Merge pull request #1 from Hansehart/patch-1
Hansehart Sep 26, 2025
1611199
fix: Align with hatch run fmt requirements
Hansehart Sep 26, 2025
d8c3ffd
add: MistralOCRDocumentConverter
Hansehart Oct 13, 2025
877a3bc
add: project files
Hansehart Oct 13, 2025
6fd0394
fix: example lib usage
Hansehart Oct 13, 2025
1abfcbb
move: ocr document converter into child /mistral
Hansehart Oct 13, 2025
6e16719
add: example usage with annotations
Hansehart Oct 13, 2025
6416a0c
add: hatch run fmt
Hansehart Oct 13, 2025
e2ec0b6
add: mistralai
Hansehart Oct 13, 2025
a0c2abe
Merge branch 'main' into add-mistral-ocr
Hansehart Oct 13, 2025
df89124
fix: python3.9 compatibility with using Union, List, Optional
Hansehart Oct 14, 2025
fc7e31d
Merge branch 'add-mistral-ocr' of github.com:Hansehart/haystack-core-…
Hansehart Oct 14, 2025
39b6cae
add: new comments and their position
Hansehart Oct 14, 2025
f2170f9
add: moved schemas from init into run to bypass problems with seriali…
Hansehart Oct 14, 2025
7221af4
add: docstring convention
Hansehart Oct 14, 2025
aa8f3bc
add: process multiple documents
Hansehart Oct 14, 2025
1160256
add: robust api handling with catching mistral errors
Hansehart Oct 14, 2025
d351909
add: Union[str, Path, ByteStream] as input
Hansehart Oct 14, 2025
4efc546
add: comment for new inputs
Hansehart Oct 14, 2025
c246153
add: pipeline example
Hansehart Oct 14, 2025
c665cde
fix: example ocr component
Hansehart Oct 14, 2025
0a7cf6a
fix: mistral file upload and pydantic v2 models
Hansehart Oct 14, 2025
0fbf500
add: pipeline example
Hansehart Oct 14, 2025
6620442
add: hint on document annotation page limit
Hansehart Oct 14, 2025
0656d2c
add: mistralai as project dependency
Hansehart Oct 14, 2025
b5ff05f
fix: hatch run fmt
Hansehart Oct 14, 2025
70b81ad
fix: hatch run docs
Hansehart Oct 14, 2025
815c92c
Merge branch 'deepset-ai:main' into add-mistral-ocr
Hansehart Oct 15, 2025
88302b0
add: to dict, from dict
Hansehart Oct 15, 2025
1b48359
Merge branch 'add-mistral-ocr' of github.com:Hansehart/haystack-core-…
Hansehart Oct 15, 2025
5030044
add: exclude mistral from compliance workflow (its apache 2.0)
Hansehart Oct 15, 2025
1e4c0f1
add: 3 initialization tests
Hansehart Oct 15, 2025
8f847f0
add: 4 se test
Hansehart Oct 15, 2025
b1d5729
add: test w/ proper mocking
Hansehart Oct 15, 2025
6a02ecf
add: real api test when env is set
Hansehart Oct 15, 2025
0cb1e8c
add: delete files by default from mistral if uploaded
Hansehart Oct 15, 2025
9b1b29e
fix: mock file deletion
Hansehart Oct 15, 2025
a4fbdb1
fix: hatch run fmt
Hansehart Oct 15, 2025
dbbb30a
Apply suggestion from @anakin87
Hansehart Oct 15, 2025
e57629c
Merge branch 'deepset-ai:main' into add-mistral-ocr
Hansehart Oct 19, 2025
0961855
Update integrations/mistral/src/haystack_integrations/components/conv…
Hansehart Oct 19, 2025
82f38eb
fix: nested try excepts
Hansehart Oct 19, 2025
e358660
Merge branch 'add-mistral-ocr' of github.com:Hansehart/haystack-core-…
Hansehart Oct 19, 2025
fd84e02
add: mention file upload
Hansehart Oct 19, 2025
cc8dd05
Update integrations/mistral/tests/test_ocr_document_converter.py
Hansehart Oct 19, 2025
46b89ff
Update integrations/mistral/tests/test_ocr_document_converter.py
Hansehart Oct 19, 2025
f92fe6d
add: less test code due to pytest.mark..parametrize
Hansehart Oct 19, 2025
99e7989
Merge branch 'add-mistral-ocr' of github.com:Hansehart/haystack-core-…
Hansehart Oct 19, 2025
4a5d316
add: less tests and const class type
Hansehart Oct 19, 2025
6017c15
fix: format
Hansehart Oct 19, 2025
e162a73
Merge branch 'main' into add-mistral-ocr
Hansehart Oct 21, 2025
30fbc23
add: ocr document converter to docusaurus
Hansehart Oct 22, 2025
fa193ec
add: converter to mistral
Hansehart Oct 22, 2025
5f4216e
Merge branch 'main' into add-mistral-ocr
Hansehart Oct 22, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions integrations/mistral/examples/ocr_document_processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# To run this example, you will need to set a `MISTRAL_API_KEY` environment variable.
# This example demonstrates OCR document processing with structured annotations.

from mistralai.models import DocumentURLChunk
from pydantic import BaseModel, Field

from haystack_integrations.components.converters.mistral.ocr_document_converter import (
    MistralOCRDocumentConverter,
)


class ImageAnnotation(BaseModel):
    """Schema for structured annotations of individual image regions (bounding boxes)."""

    image_type: str = Field(..., description="The type of image content")
    description: str = Field(..., description="Brief description of the image")


class DocumentAnnotation(BaseModel):
    """Schema for structured annotations covering the whole document."""

    language: str = Field(..., description="Primary language of the document")
    urls: list[str] = Field(..., description="URLs found in the document")
    topics: list[str] = Field(..., description="Main topics covered in the document")


# The converter asks Mistral for annotations matching the schemas defined above.
ocr_converter = MistralOCRDocumentConverter(
    pages=[2, 3],
    bbox_annotation_schema=ImageAnnotation,
    document_annotation_schema=DocumentAnnotation,
)

# Any public or signed URL pointing at a PDF or image works as a source.
source_chunk = DocumentURLChunk(document_url="https://arxiv.org/pdf/1706.03762")

# Run OCR and unpack the outputs.
ocr_result = ocr_converter.run(source=source_chunk)
documents = ocr_result["documents"]
raw_mistral_response = ocr_result["raw_mistral_response"]
1 change: 1 addition & 0 deletions integrations/mistral/pydoc/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ loaders:
"haystack_integrations.components.embedders.mistral.document_embedder",
"haystack_integrations.components.embedders.mistral.text_embedder",
"haystack_integrations.components.generators.mistral.chat.chat_generator",
"haystack_integrations.components.converters.mistral.ocr_document_converter",
]
ignore_when_discovered: ["__init__"]
processors:
Expand Down
6 changes: 4 additions & 2 deletions integrations/mistral/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ dependencies = [
"pytest-rerunfailures",
"mypy",
"pip",
"pytz"
"pytz",
"mistralai"
]

[tool.hatch.envs.test.scripts]
Expand All @@ -68,7 +69,8 @@ all = 'pytest {args:tests}'
cov-retry = 'all --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x'

types = """mypy -p haystack_integrations.components.embedders.mistral \
-p haystack_integrations.components.generators.mistral {args}"""
-p haystack_integrations.components.generators.mistral \
-p haystack_integrations.components.converters {args}"""

[tool.mypy]
install_types = true
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Re-export the converter at the package level so users can write:
#   from haystack_integrations.components.converters.mistral import MistralOCRDocumentConverter
from haystack_integrations.components.converters.mistral.ocr_document_converter import MistralOCRDocumentConverter

__all__ = ["MistralOCRDocumentConverter"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
"""Mistral OCR Document Converter for Haystack.
A custom Haystack component that uses Mistral's OCR API to extract text from documents.
Takes a signed/public URL to a document and returns extracted text as Haystack Documents.
API Reference:
https://docs.mistral.ai/capabilities/document_ai/basic_ocr/
https://docs.mistral.ai/capabilities/document_ai/annotations/
Usage Example:
```python
from haystack.utils import Secret
from haystack_integrations.components.converters.mistral import MistralOCRDocumentConverter
from mistralai.models import DocumentURLChunk, ImageURLChunk, FileChunk
converter = MistralOCRDocumentConverter(
api_key=Secret.from_env_var("MISTRAL_API_KEY"),
model="mistral-ocr-2505"
)
# Option 1: Process a document URL
doc_source = DocumentURLChunk(document_url="https://example.com/document.pdf")
result = converter.run(source=doc_source)
# Option 2: Process an image URL
img_source = ImageURLChunk(image_url="https://example.com/receipt.jpg")
result = converter.run(source=img_source)
# Option 3: Process a Mistral file ID (See: https://docs.mistral.ai/api/#tag/files)
file_source = FileChunk(file_id="file-abc123")
result = converter.run(source=file_source)
documents = result["documents"]
raw_response = result["raw_mistral_response"]
```
Structured Output Example:
```python
from pydantic import BaseModel, Field
from haystack_integrations.components.converters.mistral import MistralOCRDocumentConverter
# Define schema for structured image annotations
class ImageAnnotation(BaseModel):
image_type: str = Field(..., description="The type of image content")
short_description: str = Field(..., description="Short natural-language description")
summary: str = Field(..., description="Detailed summary of the image content")
# Define schema for structured document annotations
class DocumentAnnotation(BaseModel):
language: str = Field(..., description="Primary language of the document")
chapter_titles: list[str] = Field(..., description="Detected chapter or section titles")
urls: list[str] = Field(..., description="URLs found in the text")
converter = MistralOCRDocumentConverter(
api_key=Secret.from_env_var("MISTRAL_API_KEY"),
model="mistral-ocr-2505",
bbox_annotation_schema=ImageAnnotation,
document_annotation_schema=DocumentAnnotation,
)
doc_source = DocumentURLChunk(document_url="https://example.com/report.pdf")
result = converter.run(source=doc_source)
```
"""

import json
import re
from typing import Type

from haystack import Document, component
from haystack.utils import Secret
from mistralai import Mistral
from mistralai.extra import response_format_from_pydantic_model
from mistralai.models import (
DocumentURLChunk,
FileChunk,
ImageURLChunk,
OCRResponse,
)
from pydantic import BaseModel


@component
class MistralOCRDocumentConverter:
    """
    Extracts text from documents using Mistral's OCR API, with optional structured
    annotations for both individual image regions (bounding boxes) and full documents.

    Accepts a document URL (public or signed) and retrieves the recognized text
    via Mistral's OCR service. Returns a single Haystack Document containing all
    pages concatenated with form feed characters (``\\f``), ensuring compatibility
    with e.g. Haystack's DocumentSplitter for accurate page-wise splitting and
    overlap handling.
    """

    def __init__(
        self,
        api_key: Secret = Secret.from_env_var("MISTRAL_API_KEY"),
        model: str = "mistral-ocr-2505",
        include_image_base64: bool = False,
        pages: list[int] | None = None,
        image_limit: int | None = None,
        image_min_size: int | None = None,
        bbox_annotation_schema: Type[BaseModel] | None = None,
        document_annotation_schema: Type[BaseModel] | None = None,
    ):
        """
        Initialize the MistralOCRDocumentConverter.

        Args:
            api_key: Mistral API key (defaults to the MISTRAL_API_KEY env var).
            model: OCR model to use (default: "mistral-ocr-2505").
            include_image_base64: Include base64 encoded images in the response
                (may significantly increase response size and time).
            pages: Specific pages to process (0-indexed). Defaults to all pages.
            image_limit: Maximum number of images to extract.
            image_min_size: Minimum height and width of images to extract.
            bbox_annotation_schema: Pydantic model for structured annotations per bounding box.
            document_annotation_schema: Pydantic model for structured annotations for the full document.
        """
        self.api_key = api_key
        self.model = model
        self.include_image_base64 = include_image_base64
        self.pages = pages
        self.image_limit = image_limit
        self.image_min_size = image_min_size

        # Keep schemas accessible for filtering downstream
        self.bbox_annotation_schema = bbox_annotation_schema
        self.document_annotation_schema = document_annotation_schema

        # Automatically convert provided Pydantic models into Mistral ResponseFormat schemas
        self.bbox_annotation_format = (
            response_format_from_pydantic_model(bbox_annotation_schema) if bbox_annotation_schema else None
        )
        self.document_annotation_format = (
            response_format_from_pydantic_model(document_annotation_schema) if document_annotation_schema else None
        )

        # Initialize Mistral client
        self.client = Mistral(api_key=self.api_key.resolve_value())

    @component.output_types(documents=list[Document], raw_mistral_response=OCRResponse)
    def run(self, source: DocumentURLChunk | FileChunk | ImageURLChunk) -> dict:
        """
        Extract text from a document using Mistral OCR.

        Args:
            source: Document source to process. Can be one of:
                - DocumentURLChunk: For document URLs (signed or public URLs to PDFs, etc.)
                - ImageURLChunk: For image URLs (signed or public URLs to images)
                - FileChunk: For Mistral file IDs (files previously uploaded to Mistral)

        Returns:
            Dictionary with two keys:
            - "documents": List containing a single Haystack Document.
              The Document contains:
                - content: All pages joined with form feed (``\\f``) separators in markdown format.
                  When using bbox_annotation in any format, image tags will be enriched by your
                  defined descriptions.
                - meta: Aggregated metadata with structure:
                  {"source_page_count": int, "source_total_images": int, "source_*": any}.
                  If document_annotation_format was provided, all annotation fields are unpacked
                  with a 'source_' prefix (e.g., source_language, source_chapter_titles, source_urls).
            - "raw_mistral_response": Raw OCRResponse object from the Mistral API. Contains the
              complete response including per-page details, images, annotations, and usage info.
        """
        # Call Mistral OCR API with the provided source
        ocr_response: OCRResponse = self.client.ocr.process(
            model=self.model,
            document=source,
            include_image_base64=self.include_image_base64,
            pages=self.pages,
            image_limit=self.image_limit,
            image_min_size=self.image_min_size,
            bbox_annotation_format=self.bbox_annotation_format,
            document_annotation_format=self.document_annotation_format,
        )

        # Convert OCR pages to a single Haystack Document
        # We add "\f" separators between pages to differentiate them and make them usable across other components
        page_contents = []
        total_images = 0

        for page in ocr_response.pages:
            # Enrich markdown content with structured image annotations inline
            enriched_content = page.markdown
            for img in page.images:
                if img.image_annotation:
                    # Regex pattern to find ![img-id](img-id) and insert annotation after it
                    pattern = f"!\\[{re.escape(img.id)}\\]\\({re.escape(img.id)}\\)"
                    replacement = f"![{img.id}]({img.id})\n\n**Image Annotation:** {img.image_annotation}\n"
                    # Use a callable replacement so that backslashes or group references
                    # (e.g. "\1", "\g<name>") inside the annotation text are inserted
                    # literally instead of being interpreted by re.sub (which would
                    # corrupt the output or raise re.error).
                    enriched_content = re.sub(pattern, lambda _m, _r=replacement: _r, enriched_content)

            page_contents.append(enriched_content)
            total_images += len(page.images)

        # Join all pages with form feed character (\f) as separator
        all_content = "\f".join(page_contents)

        # Parse and filter document-level annotations to schema-defined fields.
        # Annotation parsing is best-effort: a malformed annotation must not fail the
        # whole OCR run, so decode errors simply yield empty annotation metadata.
        try:
            parsed = json.loads(ocr_response.document_annotation or "{}")
            if self.document_annotation_schema:
                allowed = self.document_annotation_schema.model_fields.keys()
                parsed = {k: v for k, v in parsed.items() if k in allowed}
            doc_annotation_meta = {f"source_{k}": v for k, v in parsed.items()}
        except (ValueError, TypeError):
            # ValueError covers json.JSONDecodeError; TypeError covers a non-string payload
            doc_annotation_meta = {}

        # Create a single Document with aggregated metadata
        document = Document(
            content=all_content,
            meta={
                "source_page_count": len(ocr_response.pages),
                "source_total_images": total_images,
                # Unpack document annotation
                **doc_annotation_meta,
            },
        )

        # Return single document and raw API response for flexibility
        return {"documents": [document], "raw_mistral_response": ocr_response}
Loading