deepset-ai · anakin87 · Oct 23, 2025 · Sep 26, 2025 · Sep 26, 2025 · Sep 26, 2025
@@ -13,13 +13,14 @@ on:
 env:
   CORE_DATADOG_API_KEY: ${{ secrets.CORE_DATADOG_API_KEY }}
   PYTHON_VERSION: "3.10"
-  EXCLUDE_PACKAGES: "(?i)^(azure-identity|fastembed|ragas|tqdm|psycopg).*"
+  EXCLUDE_PACKAGES: "(?i)^(azure-identity|fastembed|ragas|tqdm|psycopg|mistralai).*"
 
   # Exclusions must be explicitly motivated
   #
   # - azure-identity is MIT but the license is not available on PyPI
   # - fastembed is Apache 2.0 but the license on PyPI is unclear ("Other/Proprietary License (Apache License)")
   # - ragas is Apache 2.0 but the license is not available on PyPI
+  # - mistralai is Apache 2.0 but the license is not available on PyPI
 
   # - tqdm is MPL but there are no better alternatives
   # - psycopg is LGPL-3.0 but FOSSA is fine with it

@@ -0,0 +1,87 @@
+# To run this example, you will need to:
+# 1. Set a `MISTRAL_API_KEY` environment variable
+# 2. Place a PDF file named `sample.pdf` in the same directory as this script
+#
+# This example demonstrates OCR document processing with structured annotations,
+# embedding the extracted documents using Mistral embeddings, and storing them
+# in an InMemoryDocumentStore for later retrieval.
+#
+# You can customize the ImageAnnotation and DocumentAnnotation schemas below
+# to extract different structured information from your documents.
+
+from typing import List
+
+from haystack import Pipeline
+from haystack.components.writers import DocumentWriter
+from haystack.document_stores.in_memory import InMemoryDocumentStore
+from mistralai.models import DocumentURLChunk
+from pydantic import BaseModel, Field
+
+from haystack_integrations.components.converters.mistral.ocr_document_converter import (
+    MistralOCRDocumentConverter,
+)
+from haystack_integrations.components.embedders.mistral.document_embedder import (
+    MistralDocumentEmbedder,
+)
+
+
+# Schema for structured per-image (bbox) annotations; passed to the converter below as `bbox_annotation_schema`
+class ImageAnnotation(BaseModel):
+    image_type: str = Field(..., description="The type of image content")
+    description: str = Field(..., description="Brief description of the image")
+
+
+# Schema for structured document-level annotations; passed to the converter below as `document_annotation_schema`
+class DocumentAnnotation(BaseModel):
+    language: str = Field(..., description="Primary language of the document")
+    urls: List[str] = Field(..., description="URLs found in the document")
+    topics: List[str] = Field(..., description="Main topics covered in the document")
+
+
+# In-memory document store that the writer component fills with the embedded documents
+document_store = InMemoryDocumentStore()
+
+# Build the indexing pipeline: converter -> embedder -> writer
+indexing_pipeline = Pipeline()
+
+# Register the three components under the names used when connecting and running the pipeline
+indexing_pipeline.add_component(
+    "converter",
+    MistralOCRDocumentConverter(pages=[0, 1]),  # presumably restricts OCR to these page indices — confirm
+)
+indexing_pipeline.add_component(
+    "embedder",
+    MistralDocumentEmbedder(),
+)
+indexing_pipeline.add_component(
+    "writer",
+    DocumentWriter(document_store=document_store),
+)
+
+# Wire the components: converter output feeds the embedder, embedder output feeds the writer
+indexing_pipeline.connect("converter.documents", "embedder.documents")
+indexing_pipeline.connect("embedder.documents", "writer.documents")
+
+# Sources to convert: a remote PDF referenced by URL and a local PDF file
+sources = [
+    DocumentURLChunk(document_url="https://arxiv.org/pdf/1706.03762"),
+    "./sample.pdf",  # Local PDF file that must sit next to this script
+]
+
+# Run the pipeline, passing the sources and both annotation schemas to the converter
+result = indexing_pipeline.run(
+    {
+        "converter": {
+            "sources": sources,
+            "bbox_annotation_schema": ImageAnnotation,
+            "document_annotation_schema": DocumentAnnotation,
+        }
+    }
+)
+
+
+# Inspect the documents processed by OCR,
+# optionally with enriched content (from the bbox annotation) and semantic metadata (from the document annotation)
+documents = document_store.storage  # NOTE(review): `storage` looks like the store's internal data — confirm vs. filter_documents()
+# Inspect the raw Mistral API response for the unprocessed data, including usage_info
+raw_mistral_response = result["converter"]["raw_mistral_response"]
@@ -5,6 +5,7 @@ loaders:
       "haystack_integrations.components.embedders.mistral.document_embedder",
       "haystack_integrations.components.embedders.mistral.text_embedder",
       "haystack_integrations.components.generators.mistral.chat.chat_generator",
+      "haystack_integrations.components.converters.mistral.ocr_document_converter",
     ]
     ignore_when_discovered: ["__init__"]
 processors:

@@ -23,7 +23,7 @@ classifiers = [
   "Programming Language :: Python :: Implementation :: CPython",
   "Programming Language :: Python :: Implementation :: PyPy",
 ]
-dependencies = ["haystack-ai>=2.19.0"]
+dependencies = ["haystack-ai>=2.19.0", "mistralai>=1.9.11"]
 
 [project.urls]
 Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/mistral#readme"
@@ -58,7 +58,7 @@ dependencies = [
     "pytest-rerunfailures",
     "mypy",
     "pip",
-    "pytz"
+    "pytz",
 ]
 
 [tool.hatch.envs.test.scripts]
@@ -68,7 +68,8 @@ all = 'pytest {args:tests}'
 cov-retry = 'all --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x'
 
 types = """mypy -p haystack_integrations.components.embedders.mistral \
--p haystack_integrations.components.generators.mistral {args}"""
+-p haystack_integrations.components.generators.mistral \
+-p haystack_integrations.components.converters.mistral {args}"""
 
 [tool.mypy]
 install_types = true

@@ -0,0 +1,3 @@
+from .ocr_document_converter import MistralOCRDocumentConverter
+
+__all__ = ["MistralOCRDocumentConverter"]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from .ocr_document_converter import MistralOCRDocumentConverter

		__all__ = ["MistralOCRDocumentConverter"]