diff --git a/.github/workflows/CI_license_compliance.yml b/.github/workflows/CI_license_compliance.yml index 5cc46dd718..f6b7351abb 100644 --- a/.github/workflows/CI_license_compliance.yml +++ b/.github/workflows/CI_license_compliance.yml @@ -13,13 +13,14 @@ on: env: CORE_DATADOG_API_KEY: ${{ secrets.CORE_DATADOG_API_KEY }} PYTHON_VERSION: "3.10" - EXCLUDE_PACKAGES: "(?i)^(azure-identity|fastembed|ragas|tqdm|psycopg).*" + EXCLUDE_PACKAGES: "(?i)^(azure-identity|fastembed|ragas|tqdm|psycopg|mistralai).*" # Exclusions must be explicitly motivated # # - azure-identity is MIT but the license is not available on PyPI # - fastembed is Apache 2.0 but the license on PyPI is unclear ("Other/Proprietary License (Apache License)") # - ragas is Apache 2.0 but the license is not available on PyPI + # - mistralai is Apache 2.0 but the license is not available on PyPI # - tqdm is MLP but there are no better alternatives # - psycopg is LGPL-3.0 but FOSSA is fine with it diff --git a/README.md b/README.md index 00d4515b52..999b66fc93 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta | [llama-stack-haystack](integrations/llama_stack/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/llama-stack-haystack.svg?color=orange)](https://pypi.org/project/llama-stack-haystack) | [![Test / llama-stack](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/llama_stack.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/llama_stack.yml) | | [mcp-haystack](integrations/mcp/) | Tool | [![PyPI - Version](https://img.shields.io/pypi/v/mcp-haystack.svg?color=orange)](https://pypi.org/project/mcp-haystack) | [![Test / mcp](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/mcp.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/mcp.yml) | | [meta-llama-haystack](integrations/meta_llama/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/meta-llama-haystack.svg?color=orange)](https://pypi.org/project/meta-llama-haystack) | [![Test / meta_llama](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/meta_llama.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/meta_llama.yml) | -| [mistral-haystack](integrations/mistral/) | Embedder, Generator | [![PyPI - Version](https://img.shields.io/pypi/v/mistral-haystack.svg)](https://pypi.org/project/mistral-haystack) | [![Test / mistral](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/mistral.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/mistral.yml) | +| [mistral-haystack](integrations/mistral/) | Converter, Embedder, Generator | [![PyPI - Version](https://img.shields.io/pypi/v/mistral-haystack.svg)](https://pypi.org/project/mistral-haystack) | [![Test / mistral](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/mistral.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/mistral.yml) | | [mongodb-atlas-haystack](integrations/mongodb_atlas/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/mongodb-atlas-haystack.svg?color=orange)](https://pypi.org/project/mongodb-atlas-haystack) | [![Test / 
mongodb-atlas](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/mongodb_atlas.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/mongodb_atlas.yml) | | [nvidia-haystack](integrations/nvidia/) | Embedder, Generator, Ranker | [![PyPI - Version](https://img.shields.io/pypi/v/nvidia-haystack.svg?color=orange)](https://pypi.org/project/nvidia-haystack) | [![Test / nvidia](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/nvidia.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/nvidia.yml) | | [ollama-haystack](integrations/ollama/) | Embedder, Generator | [![PyPI - Version](https://img.shields.io/pypi/v/ollama-haystack.svg?color=orange)](https://pypi.org/project/ollama-haystack) | [![Test / ollama](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ollama.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ollama.yml) | diff --git a/integrations/mistral/examples/indexing_ocr_pipeline.py b/integrations/mistral/examples/indexing_ocr_pipeline.py new file mode 100644 index 0000000000..0258231b78 --- /dev/null +++ b/integrations/mistral/examples/indexing_ocr_pipeline.py @@ -0,0 +1,87 @@ +# To run this example, you will need to: +# 1. Set a `MISTRAL_API_KEY` environment variable +# 2. Place a PDF file named `sample.pdf` in the same directory as this script +# +# This example demonstrates OCR document processing with structured annotations, +# embedding the extracted documents using Mistral embeddings, and storing them +# in an InMemoryDocumentStore for later retrieval. +# +# You can customize the ImageAnnotation and DocumentAnnotation schemas below +# to extract different structured information from your documents. 
+ +from typing import List + +from haystack import Pipeline +from haystack.components.writers import DocumentWriter +from haystack.document_stores.in_memory import InMemoryDocumentStore +from mistralai.models import DocumentURLChunk +from pydantic import BaseModel, Field + +from haystack_integrations.components.converters.mistral.ocr_document_converter import ( + MistralOCRDocumentConverter, +) +from haystack_integrations.components.embedders.mistral.document_embedder import ( + MistralDocumentEmbedder, +) + + +# Define schema for structured image annotations (bbox) +class ImageAnnotation(BaseModel): + image_type: str = Field(..., description="The type of image content") + description: str = Field(..., description="Brief description of the image") + + +# Define schema for structured document annotations +class DocumentAnnotation(BaseModel): + language: str = Field(..., description="Primary language of the document") + urls: List[str] = Field(..., description="URLs found in the document") + topics: List[str] = Field(..., description="Main topics covered in the document") + + +# Initialize document store +document_store = InMemoryDocumentStore() + +# Create indexing pipeline +indexing_pipeline = Pipeline() + +# Add components to the pipeline +indexing_pipeline.add_component( + "converter", + MistralOCRDocumentConverter(pages=[0, 1]), +) +indexing_pipeline.add_component( + "embedder", + MistralDocumentEmbedder(), +) +indexing_pipeline.add_component( + "writer", + DocumentWriter(document_store=document_store), +) + +# Connect components +indexing_pipeline.connect("converter.documents", "embedder.documents") +indexing_pipeline.connect("embedder.documents", "writer.documents") + +# Prepare sources: URL and local file +sources = [ + DocumentURLChunk(document_url="https://arxiv.org/pdf/1706.03762"), + "./sample.pdf", # Local PDF file +] + +# Run the pipeline with annotation schemas +result = indexing_pipeline.run( + { + "converter": { + "sources": sources, + "bbox_annotation_schema": ImageAnnotation, + "document_annotation_schema": DocumentAnnotation, + } + } +) + + +# Check out documents processed by OCR. 
+# Content is optionally enriched inline with image annotations (from the bbox schema) and metadata with semantic fields (from the document schema)
+documents = document_store.filter_documents()
+# Check out the raw Mistral API response for unprocessed data and usage info
+raw_mistral_response = result["converter"]["raw_mistral_response"]
diff --git a/integrations/mistral/pydoc/config.yml b/integrations/mistral/pydoc/config.yml
index c26843a549..7ac3b8b999 100644
--- a/integrations/mistral/pydoc/config.yml
+++ b/integrations/mistral/pydoc/config.yml
@@ -5,6 +5,7 @@ loaders:
         "haystack_integrations.components.embedders.mistral.document_embedder",
         "haystack_integrations.components.embedders.mistral.text_embedder",
         "haystack_integrations.components.generators.mistral.chat.chat_generator",
+        "haystack_integrations.components.converters.mistral.ocr_document_converter",
       ]
     ignore_when_discovered: ["__init__"]
 processors:
diff --git a/integrations/mistral/pydoc/config_docusaurus.yml b/integrations/mistral/pydoc/config_docusaurus.yml
index 9340803a2c..275c911e8e 100644
--- a/integrations/mistral/pydoc/config_docusaurus.yml
+++ b/integrations/mistral/pydoc/config_docusaurus.yml
@@ -5,6 +5,7 @@ loaders:
       - haystack_integrations.components.embedders.mistral.document_embedder
       - haystack_integrations.components.embedders.mistral.text_embedder
       - haystack_integrations.components.generators.mistral.chat.chat_generator
+      - haystack_integrations.components.converters.mistral.ocr_document_converter
     search_path:
       - ../src
     type: haystack_pydoc_tools.loaders.CustomPythonLoader
diff --git a/integrations/mistral/pyproject.toml b/integrations/mistral/pyproject.toml
index bcc53c9d3e..15be36f58d 100644
--- a/integrations/mistral/pyproject.toml
+++ b/integrations/mistral/pyproject.toml
@@ -23,7 +23,7 @@ classifiers = [
   "Programming Language :: Python :: Implementation :: CPython",
   "Programming Language :: Python :: Implementation :: PyPy",
 ]
-dependencies = ["haystack-ai>=2.19.0"]
+dependencies = ["haystack-ai>=2.19.0", "mistralai>=1.9.11"]
 
 [project.urls]
 Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/mistral#readme"
@@ -58,7 +58,7 @@ dependencies = [
   "pytest-rerunfailures",
   "mypy",
   "pip",
-  "pytz"
+  "pytz",
 ]
 
 [tool.hatch.envs.test.scripts]
@@ -68,7 +68,8 @@ all = 'pytest {args:tests}'
 cov-retry = 'all --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x'
 types = """mypy -p haystack_integrations.components.embedders.mistral \
--p haystack_integrations.components.generators.mistral {args}"""
+-p haystack_integrations.components.generators.mistral \
+-p haystack_integrations.components.converters {args}"""
 
 [tool.mypy]
 install_types = true
diff --git a/integrations/mistral/src/haystack_integrations/components/converters/mistral/__init__.py b/integrations/mistral/src/haystack_integrations/components/converters/mistral/__init__.py
new file mode 100644
index 0000000000..d1e037a7d3
--- /dev/null
+++ b/integrations/mistral/src/haystack_integrations/components/converters/mistral/__init__.py
@@ -0,0 +1,3 @@
+from .ocr_document_converter import MistralOCRDocumentConverter
+
+__all__ = ["MistralOCRDocumentConverter"]
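The indexing example above stops once documents are written to the store. For completeness, here is a minimal query-side sketch (not part of this diff) showing how the OCR-indexed documents could be retrieved. It assumes `MISTRAL_API_KEY` is set, that `document_store` is the `InMemoryDocumentStore` populated by the indexing pipeline, and the query text is arbitrary:

```python
# Hypothetical companion to examples/indexing_ocr_pipeline.py: embed a query
# with the matching Mistral text embedder and retrieve OCR-indexed documents.
from haystack import Pipeline
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever

from haystack_integrations.components.embedders.mistral.text_embedder import (
    MistralTextEmbedder,
)

query_pipeline = Pipeline()
query_pipeline.add_component("text_embedder", MistralTextEmbedder())
query_pipeline.add_component(
    "retriever", InMemoryEmbeddingRetriever(document_store=document_store)
)
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")

result = query_pipeline.run({"text_embedder": {"text": "What is attention?"}})
for doc in result["retriever"]["documents"]:
    # Metadata set by the converter (e.g., source_page_count) travels with each hit
    print(doc.score, doc.meta.get("source_page_count"))
```

Using `MistralTextEmbedder` for queries keeps query vectors in the same embedding space as the `MistralDocumentEmbedder` output used at indexing time.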
diff --git a/integrations/mistral/src/haystack_integrations/components/converters/mistral/ocr_document_converter.py b/integrations/mistral/src/haystack_integrations/components/converters/mistral/ocr_document_converter.py
new file mode 100644
index 0000000000..47c79fb91a
--- /dev/null
+++ b/integrations/mistral/src/haystack_integrations/components/converters/mistral/ocr_document_converter.py
@@ -0,0 +1,412 @@
+import json
+import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Type, Union
+
+from haystack import Document, component, default_from_dict, default_to_dict, logging
+from haystack.components.converters.utils import get_bytestream_from_source
+from haystack.dataclasses import ByteStream
+from haystack.utils import Secret, deserialize_secrets_inplace
+from mistralai import Mistral
+from mistralai.extra import response_format_from_pydantic_model
+from mistralai.models import (
+    DocumentURLChunk,
+    FileChunk,
+    ImageURLChunk,
+    OCRResponse,
+)
+from pydantic import BaseModel
+
+logger = logging.getLogger(__name__)
+
+
+@component
+class MistralOCRDocumentConverter:
+    """
+    This component extracts text from documents using Mistral's OCR API, with optional structured
+    annotations for both individual image regions (bounding boxes) and full documents.
+
+    Accepts document sources in various formats (str/Path for local files, ByteStream for in-memory data,
+    DocumentURLChunk for document URLs, ImageURLChunk for image URLs, or FileChunk for Mistral file IDs)
+    and retrieves the recognized text via Mistral's OCR service. Local files are automatically uploaded
+    to Mistral's storage.
+    Returns Haystack Documents (one per source) containing all pages concatenated with form feed characters (\\f),
+    ensuring compatibility with Haystack's DocumentSplitter for accurate page-wise splitting and overlap handling.
+
+    **How Annotations Work:**
+    When annotation schemas (`bbox_annotation_schema` or `document_annotation_schema`) are provided,
+    the OCR model first extracts text and structure from the document. Then, a Vision LLM is called
+    to analyze the content and generate structured annotations according to your defined schemas.
+    For more details, see: https://docs.mistral.ai/capabilities/document_ai/annotations/#how-it-works
+
+    **Usage Example:**
+    ```python
+    from haystack.utils import Secret
+    from haystack_integrations.components.converters.mistral import MistralOCRDocumentConverter
+    from mistralai.models import DocumentURLChunk, ImageURLChunk, FileChunk
+
+    converter = MistralOCRDocumentConverter(
+        api_key=Secret.from_env_var("MISTRAL_API_KEY"),
+        model="mistral-ocr-2505"
+    )
+
+    # Process multiple sources
+    sources = [
+        DocumentURLChunk(document_url="https://example.com/document.pdf"),
+        ImageURLChunk(image_url="https://example.com/receipt.jpg"),
+        FileChunk(file_id="file-abc123"),
+    ]
+    result = converter.run(sources=sources)
+
+    documents = result["documents"]  # List of 3 Documents
+    raw_responses = result["raw_mistral_response"]  # List of 3 raw responses
+    ```
+
+    **Structured Output Example:**
+    ```python
+    from typing import List
+
+    from pydantic import BaseModel, Field
+    from mistralai.models import DocumentURLChunk
+
+    from haystack_integrations.components.converters.mistral import MistralOCRDocumentConverter
+
+    # Define schema for structured image annotations
+    class ImageAnnotation(BaseModel):
+        image_type: str = Field(..., description="The type of image content")
+        short_description: str = Field(..., description="Short natural-language description")
+        summary: str = Field(..., description="Detailed summary of the image content")
+
+    # Define schema for structured document annotations
+    class DocumentAnnotation(BaseModel):
+        language: str = Field(..., description="Primary language of the document")
+        chapter_titles: List[str] = Field(..., description="Detected chapter or section titles")
+        urls: List[str] = Field(..., description="URLs found in the text")
+
+    converter = MistralOCRDocumentConverter(
+        model="mistral-ocr-2505",
+    )
+
+    sources =
[DocumentURLChunk(document_url="https://example.com/report.pdf")] + result = converter.run( + sources=sources, + bbox_annotation_schema=ImageAnnotation, + document_annotation_schema=DocumentAnnotation, + ) + + documents = result["documents"] + raw_responses = result["raw_mistral_response"] + ``` + """ + + def __init__( + self, + api_key: Secret = Secret.from_env_var("MISTRAL_API_KEY"), + model: str = "mistral-ocr-2505", + include_image_base64: bool = False, + pages: Optional[List[int]] = None, + image_limit: Optional[int] = None, + image_min_size: Optional[int] = None, + cleanup_uploaded_files: bool = True, + ): + """ + Creates a MistralOCRDocumentConverter component. + + :param api_key: + The Mistral API key. Defaults to the MISTRAL_API_KEY environment variable. + :param model: + The OCR model to use. Default is "mistral-ocr-2505". + See more: https://docs.mistral.ai/getting-started/models/models_overview/ + :param include_image_base64: + If True, includes base64 encoded images in the response. + This may significantly increase response size and processing time. + :param pages: + Specific page numbers to process (0-indexed). If None, processes all pages. + :param image_limit: + Maximum number of images to extract from the document. + :param image_min_size: + Minimum height and width (in pixels) for images to be extracted. + :param cleanup_uploaded_files: + If True, automatically deletes files uploaded to Mistral after processing. + Only affects files uploaded from local sources (str, Path, ByteStream). + Files provided as FileChunk are not deleted. Default is True. + """ + self.api_key = api_key + self.model = model + self.include_image_base64 = include_image_base64 + self.pages = pages + self.image_limit = image_limit + self.image_min_size = image_min_size + self.cleanup_uploaded_files = cleanup_uploaded_files + + # Initialize Mistral client + self.client = Mistral(api_key=self.api_key.resolve_value()) + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ + return default_to_dict( + self, + api_key=self.api_key.to_dict(), + model=self.model, + include_image_base64=self.include_image_base64, + pages=self.pages, + image_limit=self.image_limit, + image_min_size=self.image_min_size, + cleanup_uploaded_files=self.cleanup_uploaded_files, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "MistralOCRDocumentConverter": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. + """ + deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) + return default_from_dict(cls, data) + + @component.output_types(documents=List[Document], raw_mistral_response=List[Dict[str, Any]]) + def run( + self, + sources: List[Union[str, Path, ByteStream, DocumentURLChunk, FileChunk, ImageURLChunk]], + bbox_annotation_schema: Optional[Type[BaseModel]] = None, + document_annotation_schema: Optional[Type[BaseModel]] = None, + ) -> Dict[str, Any]: + """ + Extract text from documents using Mistral OCR. + + :param sources: + List of document sources to process. Each source can be one of: + - str: File path to a local document + - Path: Path object to a local document + - ByteStream: Haystack ByteStream object containing document data + - DocumentURLChunk: Mistral chunk for document URLs (signed or public URLs to PDFs, etc.) 
+ - ImageURLChunk: Mistral chunk for image URLs (signed or public URLs to images) + - FileChunk: Mistral chunk for file IDs (files previously uploaded to Mistral) + :param bbox_annotation_schema: + Optional Pydantic model for structured annotations per bounding box. + When provided, a Vision LLM analyzes each image region and returns structured data. + :param document_annotation_schema: + Optional Pydantic model for structured annotations for the full document. + When provided, a Vision LLM analyzes the entire document and returns structured data. + Note: Document annotation is limited to a maximum of 8 pages. Documents exceeding + this limit will not be processed for document annotation. + + :returns: + A dictionary with the following keys: + - `documents`: List of Haystack Documents (one per source). Each Document has the following structure: + - `content`: All pages joined with form feed (\\f) separators in markdown format. + When using bbox_annotation_schema, image tags will be enriched with your defined descriptions. + - `meta`: Aggregated metadata dictionary with structure: + {"source_page_count": int, "source_total_images": int, "source_*": any}. + If document_annotation_schema was provided, all annotation fields are unpacked + with 'source_' prefix (e.g., source_language, source_chapter_titles, source_urls). + - `raw_mistral_response`: + List of dictionaries containing raw OCR responses from Mistral API (one per source). + Each response includes per-page details, images, annotations, and usage info. + """ + # Convert Pydantic models to Mistral ResponseFormat schemas + bbox_annotation_format = ( + response_format_from_pydantic_model(bbox_annotation_schema) if bbox_annotation_schema else None + ) + document_annotation_format = ( + response_format_from_pydantic_model(document_annotation_schema) if document_annotation_schema else None + ) + + # Process each source + documents = [] + raw_responses = [] + uploaded_file_ids = [] + + for source in sources: + document, raw_response, uploaded_file_id = self._process_single_source( + source, + bbox_annotation_format, + document_annotation_format, + document_annotation_schema, + ) + + # Add results if processing succeeded + if document is not None and raw_response is not None: + documents.append(document) + raw_responses.append(raw_response) + + # Track uploaded file for cleanup even if processing failed + if uploaded_file_id: + uploaded_file_ids.append(uploaded_file_id) + + # Cleanup uploaded files + self._cleanup_uploaded_files(uploaded_file_ids) + + return {"documents": documents, "raw_mistral_response": raw_responses} + + def _process_single_source( + self, + source: Union[str, Path, ByteStream, DocumentURLChunk, FileChunk, ImageURLChunk], + bbox_annotation_format: Optional[Any], + document_annotation_format: Optional[Any], + document_annotation_schema: Optional[Type[BaseModel]], + ) -> tuple[Optional[Document], Optional[Dict[str, Any]], Optional[str]]: + """ + Process a single source and return the document, raw response, and file_id if uploaded. + + :param source: + The source to process. + :param bbox_annotation_format: + Optional response format for bounding box annotations. + :param document_annotation_format: + Optional response format for document annotations. + :param document_annotation_schema: + Optional Pydantic model for document-level annotations. + + :returns: + A tuple of (Document|None, raw_response_dict|None, uploaded_file_id|None). + Returns (None, None, uploaded_file_id) if processing fails but file was uploaded. 
+ """ + uploaded_file_id = None + try: + chunk = self._convert_source_to_chunk(source) + + # Track if we uploaded this file + if isinstance(source, (str, Path, ByteStream)) and isinstance(chunk, FileChunk): + uploaded_file_id = chunk.file_id + + ocr_response: OCRResponse = self.client.ocr.process( + model=self.model, + document=chunk, + include_image_base64=self.include_image_base64, + pages=self.pages, + image_limit=self.image_limit, + image_min_size=self.image_min_size, + bbox_annotation_format=bbox_annotation_format, + document_annotation_format=document_annotation_format, + ) + + document = self._process_ocr_response(ocr_response, document_annotation_schema) + return (document, ocr_response.model_dump(), uploaded_file_id) + except Exception as e: + logger.warning( + "Could not process source {source}. Skipping it. Error: {error}", + source=source, + error=e, + ) + return (None, None, uploaded_file_id) + + def _cleanup_uploaded_files(self, file_ids: List[str]) -> None: + """ + Delete uploaded files from Mistral storage. + + :param file_ids: + List of file IDs to delete. + """ + if not self.cleanup_uploaded_files or not file_ids: + return + + for file_id in file_ids: + try: + self.client.files.delete(file_id=file_id) + except Exception as e: + logger.warning( + "Failed to delete uploaded file {file_id}. Error: {error}", + file_id=file_id, + error=e, + ) + + def _convert_source_to_chunk( + self, + source: Union[str, Path, ByteStream, DocumentURLChunk, FileChunk, ImageURLChunk], + ) -> Union[DocumentURLChunk, FileChunk, ImageURLChunk]: + """ + Convert various source types to Mistral-compatible chunk format. + + Local sources (str, Path, ByteStream) are uploaded to Mistral's storage and returned + as FileChunk. Remote sources (DocumentURLChunk, ImageURLChunk, FileChunk) are returned as-is. + + :param source: + The source to convert. Can be a file path (str/Path), ByteStream, or Mistral chunk type. + + :returns: + A Mistral chunk type (DocumentURLChunk, FileChunk, or ImageURLChunk). + """ + # If already a Mistral chunk type, return as-is + if isinstance(source, (DocumentURLChunk, FileChunk, ImageURLChunk)): + return source + + # Convert str/Path/ByteStream to ByteStream + bytestream = get_bytestream_from_source(source=source) + + # Upload file to Mistral and get file ID + uploaded_file = self.client.files.upload( + file={ + "file_name": bytestream.meta.get("file_path", "document"), + "content": bytestream.data, + }, + purpose="ocr", + ) + + # Return FileChunk with the uploaded file ID + return FileChunk(file_id=uploaded_file.id) + + def _process_ocr_response( + self, + ocr_response: OCRResponse, + document_annotation_schema: Optional[Type[BaseModel]], + ) -> Document: + """ + Convert an OCR response from Mistral API into a single Haystack Document. + + :param ocr_response: + The OCR response object from Mistral API. + :param document_annotation_schema: + Optional Pydantic model for document-level annotations. + + :returns: + A single Haystack Document containing the processed OCR content. 
+ """ + # Convert OCR pages to a single Haystack Document + # We add "\f" separators between pages to differentiate them and make them usable across other components + page_contents = [] + total_images = 0 + + for page in ocr_response.pages: + # Enrich markdown content with structured image annotations inline + enriched_content = page.markdown + for img in page.images: + if img.image_annotation: + # Regex pattern to find ![img-id](img-id) and insert annotation after it + pattern = f"!\\[{re.escape(img.id)}\\]\\({re.escape(img.id)}\\)" + replacement = f"![{img.id}]({img.id})\n\n**Image Annotation:** {img.image_annotation}\n" + enriched_content = re.sub(pattern, replacement, enriched_content) + + page_contents.append(enriched_content) + total_images += len(page.images) + + # Join all pages with form feed character (\f) as separator + all_content = "\f".join(page_contents) + + # Parse and filter document-level annotations to schema-defined fields + try: + parsed = json.loads(ocr_response.document_annotation or "{}") + if document_annotation_schema: + allowed = document_annotation_schema.model_fields.keys() + parsed = {k: v for k, v in parsed.items() if k in allowed} + doc_annotation_meta = {f"source_{k}": v for k, v in parsed.items()} + except Exception: + doc_annotation_meta = {} + + # Create a single Document with aggregated metadata + document = Document( + content=all_content, + meta={ + "source_page_count": len(ocr_response.pages), + "source_total_images": total_images, + # Unpack document annotation + **doc_annotation_meta, + }, + ) + + return document diff --git a/integrations/mistral/src/haystack_integrations/components/converters/py.typed b/integrations/mistral/src/haystack_integrations/components/converters/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/integrations/mistral/tests/test_ocr_document_converter.py b/integrations/mistral/tests/test_ocr_document_converter.py new file mode 100644 index 0000000000..9ca001f732 --- /dev/null +++ b/integrations/mistral/tests/test_ocr_document_converter.py @@ -0,0 +1,608 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +import os +from typing import List +from unittest.mock import MagicMock, patch + +import pytest +from haystack import Document +from haystack.dataclasses import ByteStream +from haystack.utils import Secret +from mistralai.models import DocumentURLChunk, FileChunk, ImageURLChunk +from pydantic import BaseModel, Field + +from haystack_integrations.components.converters.mistral import ( + MistralOCRDocumentConverter, +) + + +class TestMistralOCRDocumentConverter: + CLASS_TYPE = ( + "haystack_integrations.components.converters.mistral.ocr_document_converter.MistralOCRDocumentConverter" + ) + + def test_init_default(self, monkeypatch): + monkeypatch.setenv("MISTRAL_API_KEY", "test-api-key") + converter = MistralOCRDocumentConverter() + + assert converter.api_key == Secret.from_env_var("MISTRAL_API_KEY") + assert converter.model == "mistral-ocr-2505" + assert converter.include_image_base64 is False + assert converter.pages is None + assert converter.image_limit is None + assert converter.image_min_size is None + + def test_init_with_all_optional_parameters(self): + converter = MistralOCRDocumentConverter( + api_key=Secret.from_token("test-api-key"), + model="mistral-ocr-custom", + include_image_base64=True, + pages=[0, 1, 2], + image_limit=10, + image_min_size=100, + ) + + assert converter.api_key == Secret.from_token("test-api-key") + assert converter.model == 
"mistral-ocr-custom" + assert converter.include_image_base64 is True + assert converter.pages == [0, 1, 2] + assert converter.image_limit == 10 + assert converter.image_min_size == 100 + + def test_to_dict(self, monkeypatch): + monkeypatch.setenv("MISTRAL_API_KEY", "test-api-key") + converter = MistralOCRDocumentConverter() + converter_dict = converter.to_dict() + + assert converter_dict == { + "type": self.CLASS_TYPE, + "init_parameters": { + "api_key": { + "env_vars": ["MISTRAL_API_KEY"], + "strict": True, + "type": "env_var", + }, + "model": "mistral-ocr-2505", + "include_image_base64": False, + "pages": None, + "image_limit": None, + "image_min_size": None, + "cleanup_uploaded_files": True, + }, + } + + def test_to_dict_with_custom_parameters(self, monkeypatch): + monkeypatch.setenv("ENV_VAR", "test-api-key") + converter = MistralOCRDocumentConverter( + api_key=Secret.from_env_var("ENV_VAR", strict=False), + model="mistral-ocr-custom", + include_image_base64=True, + pages=[0, 1, 2], + image_limit=10, + image_min_size=100, + cleanup_uploaded_files=False, + ) + converter_dict = converter.to_dict() + + assert converter_dict == { + "type": self.CLASS_TYPE, + "init_parameters": { + "api_key": { + "type": "env_var", + "env_vars": ["ENV_VAR"], + "strict": False, + }, + "model": "mistral-ocr-custom", + "include_image_base64": True, + "pages": [0, 1, 2], + "image_limit": 10, + "image_min_size": 100, + "cleanup_uploaded_files": False, + }, + } + + def test_from_dict(self, monkeypatch): + monkeypatch.setenv("MISTRAL_API_KEY", "test-api-key") + converter_dict = { + "type": self.CLASS_TYPE, + "init_parameters": { + "api_key": { + "env_vars": ["MISTRAL_API_KEY"], + "strict": True, + "type": "env_var", + }, + "model": "mistral-ocr-2505", + "include_image_base64": False, + "pages": None, + "image_limit": None, + "image_min_size": None, + "cleanup_uploaded_files": True, + }, + } + + converter = MistralOCRDocumentConverter.from_dict(converter_dict) + + assert converter.model == "mistral-ocr-2505" + assert converter.include_image_base64 is False + assert converter.pages is None + assert converter.image_limit is None + assert converter.image_min_size is None + assert converter.cleanup_uploaded_files is True + + def test_from_dict_with_custom_parameters(self, monkeypatch): + monkeypatch.setenv("MISTRAL_API_KEY", "test-api-key") + converter_dict = { + "type": self.CLASS_TYPE, + "init_parameters": { + "api_key": { + "env_vars": ["MISTRAL_API_KEY"], + "strict": True, + "type": "env_var", + }, + "model": "mistral-ocr-custom", + "include_image_base64": True, + "pages": [0, 1, 2], + "image_limit": 10, + "image_min_size": 100, + "cleanup_uploaded_files": False, + }, + } + + converter = MistralOCRDocumentConverter.from_dict(converter_dict) + + assert converter.model == "mistral-ocr-custom" + assert converter.include_image_base64 is True + assert converter.pages == [0, 1, 2] + assert converter.image_limit == 10 + assert converter.image_min_size == 100 + assert converter.cleanup_uploaded_files is False + + @pytest.fixture + def mock_ocr_response(self): + """Create a mock OCR response""" + mock_page = MagicMock() + mock_page.markdown = "# Sample Document\n\nThis is page 1." 
+ mock_page.images = [] + + mock_response = MagicMock() + mock_response.pages = [mock_page] + mock_response.document_annotation = None + mock_response.model_dump.return_value = { + "pages": [{"markdown": "# Sample Document\n\nThis is page 1.", "images": []}], + "document_annotation": None, + } + return mock_response + + @pytest.fixture + def mock_ocr_response_with_multiple_pages(self): + """Create a mock OCR response with multiple pages""" + mock_page1 = MagicMock() + mock_page1.markdown = "# Page 1" + mock_page1.images = [] + + mock_page2 = MagicMock() + mock_page2.markdown = "# Page 2" + mock_page2.images = [] + + mock_response = MagicMock() + mock_response.pages = [mock_page1, mock_page2] + mock_response.document_annotation = None + mock_response.model_dump.return_value = { + "pages": [ + {"markdown": "# Page 1", "images": []}, + {"markdown": "# Page 2", "images": []}, + ], + "document_annotation": None, + } + return mock_response + + @pytest.mark.parametrize( + "source", + [ + DocumentURLChunk(document_url="https://example.com/doc.pdf"), + FileChunk(file_id="file-123"), + ImageURLChunk(image_url="https://example.com/image.jpg"), + ], + ids=["document_url_chunk", "file_chunk", "image_url_chunk"], + ) + def test_run_with_remote_chunk_types(self, mock_ocr_response, source): + """Test processing with remote chunk types (DocumentURLChunk, FileChunk, ImageURLChunk)""" + converter = MistralOCRDocumentConverter(api_key=Secret.from_token("test-api-key")) + + with patch.object(converter.client.ocr, "process", return_value=mock_ocr_response): + result = converter.run(sources=[source]) + + assert len(result["documents"]) == 1 + assert isinstance(result["documents"][0], Document) + assert result["documents"][0].content == "# Sample Document\n\nThis is page 1." 
+        # Spot-check metadata once; the structure is identical for all chunk types
+        if isinstance(source, DocumentURLChunk):
+            assert result["documents"][0].meta["source_page_count"] == 1
+            assert result["documents"][0].meta["source_total_images"] == 0
+
+    @pytest.mark.parametrize(
+        "source_type",
+        ["file_path_str", "path_object", "bytestream"],
+    )
+    def test_run_with_local_sources(self, mock_ocr_response, tmp_path, source_type):
+        """Test processing with local source types (str, Path, ByteStream)"""
+        converter = MistralOCRDocumentConverter(api_key=Secret.from_token("test-api-key"))
+
+        # Create temporary file if needed
+        if source_type in ["file_path_str", "path_object"]:
+            test_file = tmp_path / "test.pdf"
+            test_file.write_bytes(b"fake pdf content")
+
+        # Create the source based on type
+        if source_type == "file_path_str":
+            source = str(test_file)
+        elif source_type == "path_object":
+            source = test_file
+        else:  # bytestream
+            source = ByteStream(data=b"fake pdf content", meta={"file_path": "test.pdf"})
+
+        mock_uploaded_file = MagicMock()
+        mock_uploaded_file.id = "uploaded-file-123"
+
+        with patch.object(converter.client.files, "upload", return_value=mock_uploaded_file):
+            with patch.object(converter.client.ocr, "process", return_value=mock_ocr_response):
+                with patch.object(converter.client.files, "delete"):
+                    result = converter.run(sources=[source])
+
+        assert len(result["documents"]) == 1
+        assert isinstance(result["documents"][0], Document)
+        # Spot-check the upload call once; every local source type goes through it
+        if source_type == "file_path_str":
+            converter.client.files.upload.assert_called_once()
+
+    def test_run_with_multiple_sources(self, mock_ocr_response, tmp_path):
+        """Test processing with multiple mixed source types"""
+        converter = MistralOCRDocumentConverter(api_key=Secret.from_token("test-api-key"))
+
+        # Create a temporary file
+        test_file = tmp_path / "test.pdf"
+        test_file.write_bytes(b"fake pdf content")
+
+        mock_uploaded_file = MagicMock()
+        mock_uploaded_file.id = "uploaded-file-123"
+
+        with patch.object(converter.client.files, "upload", return_value=mock_uploaded_file):
+            with patch.object(converter.client.ocr, "process", return_value=mock_ocr_response):
+                with patch.object(converter.client.files, "delete"):
+                    sources = [
+                        DocumentURLChunk(document_url="https://example.com/doc.pdf"),
+                        FileChunk(file_id="file-123"),
+                        str(test_file),
+                    ]
+                    result = converter.run(sources=sources)
+
+        assert len(result["documents"]) == 3
+        assert all(isinstance(doc, Document) for doc in result["documents"])
+
+    def test_run_with_bbox_annotations(self):
+        """Test processing with bbox annotation schema"""
+        converter = MistralOCRDocumentConverter(api_key=Secret.from_token("test-api-key"))
+
+        # Define annotation schema
+        class ImageAnnotation(BaseModel):
+            image_type: str = Field(..., description="Type of image")
+
+        # Create mock response with image annotation
+        mock_image = MagicMock()
+        mock_image.id = "img-1"
+        mock_image.image_annotation = '{"image_type": "diagram"}'
+
+        mock_page = MagicMock()
+        mock_page.markdown = "# Document\n\n![img-1](img-1)"
+        mock_page.images = [mock_image]
+
+        mock_response = MagicMock()
+        mock_response.pages = [mock_page]
+        mock_response.document_annotation = None
+        mock_response.model_dump.return_value = {
+            "pages": [],
+            "document_annotation": None,
+        }
+
+        with patch.object(converter.client.ocr, "process", return_value=mock_response):
+            sources = [DocumentURLChunk(document_url="https://example.com/doc.pdf")]
+            result = converter.run(sources=sources, bbox_annotation_schema=ImageAnnotation)
+
assert len(result["documents"]) == 1 + # Check that image annotation was enriched in content + assert "Image Annotation:" in result["documents"][0].content + + def test_run_with_document_annotations(self): + """Test processing with document annotation schema""" + converter = MistralOCRDocumentConverter(api_key=Secret.from_token("test-api-key")) + + # Define annotation schema + class DocumentAnnotation(BaseModel): + language: str = Field(..., description="Document language") + topics: List[str] = Field(..., description="Main topics") + + # Create mock response with document annotation + mock_page = MagicMock() + mock_page.markdown = "# Document" + mock_page.images = [] + + mock_response = MagicMock() + mock_response.pages = [mock_page] + mock_response.document_annotation = '{"language": "en", "topics": ["AI", "ML"]}' + mock_response.model_dump.return_value = { + "pages": [], + "document_annotation": '{"language": "en", "topics": ["AI", "ML"]}', + } + + with patch.object(converter.client.ocr, "process", return_value=mock_response): + sources = [DocumentURLChunk(document_url="https://example.com/doc.pdf")] + result = converter.run(sources=sources, document_annotation_schema=DocumentAnnotation) + + assert len(result["documents"]) == 1 + # Check that document annotations are in metadata + assert result["documents"][0].meta["source_language"] == "en" + assert result["documents"][0].meta["source_topics"] == ["AI", "ML"] + + def test_run_with_both_annotations(self): + """Test processing with both bbox and document annotation schemas""" + converter = MistralOCRDocumentConverter(api_key=Secret.from_token("test-api-key")) + + class ImageAnnotation(BaseModel): + image_type: str = Field(..., description="Type of image") + + class DocumentAnnotation(BaseModel): + language: str = Field(..., description="Document language") + + # Create mock response + mock_image = MagicMock() + mock_image.id = "img-1" + mock_image.image_annotation = '{"image_type": "chart"}' + + mock_page = MagicMock() + mock_page.markdown = "![img-1](img-1)" + mock_page.images = [mock_image] + + mock_response = MagicMock() + mock_response.pages = [mock_page] + mock_response.document_annotation = '{"language": "en"}' + mock_response.model_dump.return_value = { + "pages": [], + "document_annotation": '{"language": "en"}', + } + + with patch.object(converter.client.ocr, "process", return_value=mock_response): + sources = [DocumentURLChunk(document_url="https://example.com/doc.pdf")] + result = converter.run( + sources=sources, + bbox_annotation_schema=ImageAnnotation, + document_annotation_schema=DocumentAnnotation, + ) + + assert len(result["documents"]) == 1 + assert "Image Annotation:" in result["documents"][0].content + assert result["documents"][0].meta["source_language"] == "en" + + def test_run_with_pages_parameter(self, mock_ocr_response): + """Test that pages parameter is passed to API""" + converter = MistralOCRDocumentConverter(api_key=Secret.from_token("test-api-key"), pages=[0, 1]) + + with patch.object(converter.client.ocr, "process", return_value=mock_ocr_response) as mock_process: + sources = [DocumentURLChunk(document_url="https://example.com/doc.pdf")] + result = converter.run(sources=sources) + + # Verify pages parameter was passed + call_args = mock_process.call_args + assert call_args.kwargs["pages"] == [0, 1] + assert len(result["documents"]) == 1 + + def test_run_handles_api_error(self, mock_ocr_response): + """Test error handling when API fails""" + converter = 
MistralOCRDocumentConverter(api_key=Secret.from_token("test-api-key")) + + with patch.object(converter.client.ocr, "process") as mock_process: + # First call succeeds, second fails, third succeeds + mock_process.side_effect = [ + mock_ocr_response, + Exception("API Error"), + mock_ocr_response, + ] + + sources = [ + DocumentURLChunk(document_url="https://example.com/doc1.pdf"), + DocumentURLChunk(document_url="https://example.com/doc2.pdf"), + DocumentURLChunk(document_url="https://example.com/doc3.pdf"), + ] + result = converter.run(sources=sources) + + # Should only return 2 documents (failed source skipped) + assert len(result["documents"]) == 2 + assert len(result["raw_mistral_response"]) == 2 + + def test_process_ocr_response_multiple_pages(self, mock_ocr_response_with_multiple_pages): + """Test multi-page document with form feed separator""" + converter = MistralOCRDocumentConverter(api_key=Secret.from_token("test-api-key")) + + document = converter._process_ocr_response( + mock_ocr_response_with_multiple_pages, document_annotation_schema=None + ) + + assert isinstance(document, Document) + # Pages should be separated by \f + assert document.content == "# Page 1\f# Page 2" + assert "\f" in document.content + assert document.meta["source_page_count"] == 2 + + def test_process_ocr_response_with_images(self): + """Test metadata extraction with images""" + converter = MistralOCRDocumentConverter(api_key=Secret.from_token("test-api-key")) + + # Create mock response with images + mock_image1 = MagicMock() + mock_image1.id = "img-1" + mock_image1.image_annotation = None + + mock_image2 = MagicMock() + mock_image2.id = "img-2" + mock_image2.image_annotation = None + + mock_page = MagicMock() + mock_page.markdown = "# Document with images" + mock_page.images = [mock_image1, mock_image2] + + mock_response = MagicMock() + mock_response.pages = [mock_page] + mock_response.document_annotation = None + + document = converter._process_ocr_response(mock_response, document_annotation_schema=None) + + assert document.meta["source_page_count"] == 1 + assert document.meta["source_total_images"] == 2 + + def test_run_with_cleanup_disabled(self, mock_ocr_response, tmp_path): + """Test that files are not deleted when cleanup_uploaded_files=False""" + converter = MistralOCRDocumentConverter(api_key=Secret.from_token("test-api-key"), cleanup_uploaded_files=False) + + # Create a temporary file + test_file = tmp_path / "test.pdf" + test_file.write_bytes(b"fake pdf content") + + mock_uploaded_file = MagicMock() + mock_uploaded_file.id = "uploaded-file-123" + + with patch.object(converter.client.files, "upload", return_value=mock_uploaded_file): + with patch.object(converter.client.ocr, "process", return_value=mock_ocr_response): + with patch.object(converter.client.files, "delete") as mock_delete: + sources = [str(test_file)] + result = converter.run(sources=sources) + + # Verify file was uploaded but NOT deleted + assert len(result["documents"]) == 1 + converter.client.files.upload.assert_called_once() + mock_delete.assert_not_called() + + def test_run_cleanup_happens_on_ocr_failure(self, tmp_path): + """Test that cleanup happens even when OCR processing fails""" + converter = MistralOCRDocumentConverter(api_key=Secret.from_token("test-api-key"), cleanup_uploaded_files=True) + + # Create a temporary file + test_file = tmp_path / "test.pdf" + test_file.write_bytes(b"fake pdf content") + + mock_uploaded_file = MagicMock() + mock_uploaded_file.id = "uploaded-file-123" + + with patch.object(converter.client.files, 
"upload", return_value=mock_uploaded_file): + with patch.object(converter.client.ocr, "process", side_effect=Exception("OCR failed")): + with patch.object(converter.client.files, "delete") as mock_delete: + sources = [str(test_file)] + result = converter.run(sources=sources) + + # Verify no documents returned due to failure + assert len(result["documents"]) == 0 + # But file should still be deleted + mock_delete.assert_called_once_with(file_id="uploaded-file-123") + + def test_run_cleanup_failure_does_not_break_flow(self, mock_ocr_response, tmp_path): + """Test that cleanup failures don't break the main flow""" + converter = MistralOCRDocumentConverter(api_key=Secret.from_token("test-api-key"), cleanup_uploaded_files=True) + + # Create a temporary file + test_file = tmp_path / "test.pdf" + test_file.write_bytes(b"fake pdf content") + + mock_uploaded_file = MagicMock() + mock_uploaded_file.id = "uploaded-file-123" + + with patch.object(converter.client.files, "upload", return_value=mock_uploaded_file): + with patch.object(converter.client.ocr, "process", return_value=mock_ocr_response): + with patch.object( + converter.client.files, + "delete", + side_effect=Exception("Delete failed"), + ): + sources = [str(test_file)] + # Should not raise an exception + result = converter.run(sources=sources) + + # Verify document was still processed successfully + assert len(result["documents"]) == 1 + assert isinstance(result["documents"][0], Document) + + def test_run_mixed_sources_only_uploaded_files_deleted(self, mock_ocr_response, tmp_path): + """Test that only uploaded files are deleted, not user-provided chunks""" + converter = MistralOCRDocumentConverter(api_key=Secret.from_token("test-api-key"), cleanup_uploaded_files=True) + + # Create a temporary file + test_file = tmp_path / "test.pdf" + test_file.write_bytes(b"fake pdf content") + + mock_uploaded_file = MagicMock() + mock_uploaded_file.id = "uploaded-file-123" + + with patch.object(converter.client.files, "upload", return_value=mock_uploaded_file): + with patch.object(converter.client.ocr, "process", return_value=mock_ocr_response): + with patch.object(converter.client.files, "delete") as mock_delete: + sources = [ + str(test_file), # This will be uploaded + FileChunk(file_id="user-file-123"), # User-provided + DocumentURLChunk(document_url="https://example.com/doc.pdf"), # URL + ] + result = converter.run(sources=sources) + + # Verify all sources processed + assert len(result["documents"]) == 3 + # Only the uploaded file should be deleted + mock_delete.assert_called_once_with(file_id="uploaded-file-123") + + @pytest.mark.skipif( + not os.environ.get("MISTRAL_API_KEY"), + reason="Export an env var called MISTRAL_API_KEY containing the Mistral API key to run this test.", + ) + @pytest.mark.integration + def test_integration_run_with_document_url(self): + """Integration test with real API call using arxiv PDF""" + converter = MistralOCRDocumentConverter() + + sources = [DocumentURLChunk(document_url="https://arxiv.org/pdf/1706.03762")] + result = converter.run(sources=sources) + + assert len(result["documents"]) == 1 + assert isinstance(result["documents"][0], Document) + assert len(result["documents"][0].content) > 0 + assert result["documents"][0].meta["source_page_count"] > 0 + assert "raw_mistral_response" in result + assert len(result["raw_mistral_response"]) == 1 + + @pytest.mark.skipif( + not os.environ.get("MISTRAL_API_KEY"), + reason="Export an env var called MISTRAL_API_KEY containing the Mistral API key to run this test.", + ) + 
@pytest.mark.integration + def test_integration_run_with_annotations(self): + """Integration test with real API call using annotation schemas""" + converter = MistralOCRDocumentConverter(pages=[0]) # Only process first page for speed + + # Define simple annotation schemas + class ImageAnnotation(BaseModel): + image_type: str = Field( + ..., + description="The type of image content (e.g., diagram, chart, photo)", + ) + + class DocumentAnnotation(BaseModel): + language: str = Field(..., description="The primary language of the document") + + sources = [DocumentURLChunk(document_url="https://arxiv.org/pdf/1706.03762")] + result = converter.run( + sources=sources, + bbox_annotation_schema=ImageAnnotation, + document_annotation_schema=DocumentAnnotation, + ) + + assert len(result["documents"]) == 1 + doc = result["documents"][0] + assert isinstance(doc, Document) + assert len(doc.content) > 0 + # Check if document annotation was added to metadata + assert "source_language" in doc.meta
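The converter's docstring notes that pages are joined with form feed characters for DocumentSplitter compatibility. A minimal sketch of that downstream interaction (not part of this diff; assumes `MISTRAL_API_KEY` is set):

```python
# Split the single OCR Document back into one Document per page:
# DocumentSplitter's split_by="page" cuts exactly on the "\f" separators
# that MistralOCRDocumentConverter inserts between pages.
from haystack.components.preprocessors import DocumentSplitter
from mistralai.models import DocumentURLChunk

from haystack_integrations.components.converters.mistral import (
    MistralOCRDocumentConverter,
)

converter = MistralOCRDocumentConverter()
result = converter.run(
    sources=[DocumentURLChunk(document_url="https://arxiv.org/pdf/1706.03762")]
)

splitter = DocumentSplitter(split_by="page", split_length=1)
splitter.warm_up()
split_docs = splitter.run(documents=result["documents"])["documents"]

# One Document per page, with the page index recorded by the splitter
print(len(split_docs), split_docs[0].meta["page_number"])
```

Running the splitter in a Pipeline would call `warm_up()` automatically; it is invoked explicitly here only because the sketch uses the component standalone.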