feat: allow custom image type categories in MistralOCRConfig

r-dh · r-dh · commit b3b43fef2221 · 2026-02-12T18:06:59.000+01:00
diff --git a/src/raglite/__init__.py b/src/raglite/__init__.py
@@ -1,6 +1,6 @@
 """RAGLite."""
 
-from raglite._config import ImageType, MistralOCRConfig, RAGLiteConfig
+from raglite._config import MistralOCRConfig, RAGLiteConfig
 from raglite._database import Document
 from raglite._delete import delete_documents, delete_documents_by_metadata
 from raglite._eval import answer_evals, evaluate, insert_evals
@@ -25,7 +25,6 @@
     "RAGLiteConfig",
     "MistralOCRConfig",
     "MistralOCRError",
-    "ImageType",
     # Insert
     "Document",
     "insert_documents",
diff --git a/src/raglite/_config.py b/src/raglite/_config.py
@@ -3,7 +3,6 @@
 import contextlib
 import os
 from dataclasses import dataclass, field
-from enum import Enum
 from io import StringIO
 from pathlib import Path
 from typing import Literal
@@ -24,18 +23,9 @@
 cache_path = Path(user_data_dir("raglite", ensure_exists=True))
 
 
-class ImageType(str, Enum):
-    """Type of image detected by OCR."""
-
-    GRAPH = "graph"
-    CHART = "chart"
-    DIAGRAM = "diagram"
-    TABLE = "table"
-    PHOTO = "photo"
-    SCREENSHOT = "screenshot"
-    LOGO = "logo"
-    ICON = "icon"
-    OTHER = "other"
+DEFAULT_IMAGE_TYPES = frozenset(
+    {"graph", "chart", "diagram", "table", "photo", "screenshot", "logo", "icon", "other"}
+)
 
 
 @dataclass(frozen=True)
@@ -46,8 +36,10 @@ class MistralOCRConfig:
     api_key: str | None = None
     # Whether to use vision to describe images in documents.
     include_image_descriptions: bool = True
-    # Image types to exclude from processing (e.g., {ImageType.LOGO, ImageType.ICON}).
-    exclude_image_types: frozenset[ImageType] = frozenset()
+    # Set of image types that Mistral chooses from when classifying each image.
+    image_types: frozenset[str] = DEFAULT_IMAGE_TYPES
+    # Subset of image_types to exclude from the output (e.g., {"logo", "icon"}).
+    exclude_image_types: frozenset[str] = frozenset()
     model: str = "mistral-ocr-latest"
 
 
diff --git a/src/raglite/_mistral_ocr.py b/src/raglite/_mistral_ocr.py
@@ -4,12 +4,13 @@
 import logging
 import os
 import re
+from enum import Enum
 from pathlib import Path
 from typing import Any
 
 from pydantic import BaseModel, Field
 
-from raglite._config import ImageType, MistralOCRConfig
+from raglite._config import MistralOCRConfig
 
 logger = logging.getLogger(__name__)
 
@@ -32,24 +33,28 @@ class MistralOCRError(Exception):
     """Error during MistralOCR processing."""
 
 
-_IMAGE_TYPE_VALUES = ", ".join(t.value for t in ImageType)
+def _build_image_annotation_model(image_types: frozenset[str]) -> type[BaseModel]:
+    """Build an ImageAnnotation Pydantic model with the given image types."""
+    image_type_enum = Enum("ImageType", {t.upper(): t for t in sorted(image_types)}, type=str)  # type: ignore[misc]
+    image_type_values = ", ".join(sorted(image_types))
 
+    class ImageAnnotation(BaseModel):
+        """Schema for vision-based image annotation."""
 
-class ImageAnnotation(BaseModel):
-    """Schema for vision-based image annotation."""
+        image_type: image_type_enum = Field(  # type: ignore[valid-type]
+            ...,
+            description=f"The type of the image. Must be one of: {image_type_values}.",
+        )
+        description: str = Field(
+            ...,
+            description=(
+                "A concise description of the image content. For diagrams and charts, "
+                "describe what is being illustrated. For tables, summarize the data. "
+                "For photos, describe the subject matter."
+            ),
+        )
 
-    image_type: ImageType = Field(
-        ...,
-        description=f"The type of the image. Must be one of: {_IMAGE_TYPE_VALUES}.",
-    )
-    description: str = Field(
-        ...,
-        description=(
-            "A concise description of the image content. For diagrams and charts, "
-            "describe what is being illustrated. For tables, summarize the data. "
-            "For photos, describe the subject matter."
-        ),
-    )
+    return ImageAnnotation
 
 
 def _get_api_key(processor_config: MistralOCRConfig) -> str:
@@ -101,8 +106,9 @@ def _encode_document_base64(doc_path: Path) -> tuple[str, str]:
 def _process_ocr_response(
     ocr_response: Any,
     *,
+    annotation_model: type[BaseModel],
     include_image_descriptions: bool = True,
-    exclude_image_types: frozenset[ImageType] | None = None,
+    exclude_image_types: frozenset[str] | None = None,
 ) -> str:
     """Convert MistralOCR response to markdown string.
 
@@ -113,10 +119,12 @@ def _process_ocr_response(
     ----------
     ocr_response
         Response from Mistral OCR API.
+    annotation_model
+        The Pydantic model used to parse image annotations.
     include_image_descriptions
         Whether to replace image placeholders with annotations.
     exclude_image_types
-        Set of ImageType values to exclude from output.
+        Set of image type strings to exclude from output.
 
     Returns
     -------
@@ -137,12 +145,13 @@ def _process_ocr_response(
                     placeholder_pattern = rf"!\[[^\]]*\]\({re.escape(img.id)}\)"
                     # Parse annotation to check image type for filtering.
                     try:
-                        parsed = ImageAnnotation.model_validate_json(annotation)
-                        if parsed.image_type in exclude_image_types:
+                        parsed: Any = annotation_model.model_validate_json(annotation)
+                        image_type = parsed.image_type.value
+                        if image_type in exclude_image_types:
                             # Remove the image placeholder entirely.
                             page_md = re.sub(placeholder_pattern, "", page_md)
                             continue
-                        replacement = f"[Image ({parsed.image_type.value}): {parsed.description}]"
+                        replacement = f"[Image ({image_type}): {parsed.description}]"
                     except (ValueError, TypeError):
                         # If parsing fails, use raw annotation.
                         replacement = f"[Image: {annotation}]"
@@ -201,13 +210,15 @@ def mistral_ocr_to_markdown(doc_path: Path, *, processor_config: MistralOCRConfi
         "include_image_base64": False,  # We don't need base64, just annotations.
     }
 
+    annotation_model = _build_image_annotation_model(processor_config.image_types)
+
     try:
         client = _get_mistral_client(processor_config)
         # Add bbox annotation format if image descriptions are enabled.
         if processor_config.include_image_descriptions:
             response_format_from_pydantic_model = _get_response_format_converter()
             ocr_params["bbox_annotation_format"] = response_format_from_pydantic_model(
-                ImageAnnotation
+                annotation_model
             )
         ocr_response = client.ocr.process(**ocr_params)
     except (ImportError, ValueError):
@@ -219,6 +230,7 @@ def mistral_ocr_to_markdown(doc_path: Path, *, processor_config: MistralOCRConfi
     # Process response and replace image placeholders with annotations.
     return _process_ocr_response(
         ocr_response,
+        annotation_model=annotation_model,
         include_image_descriptions=processor_config.include_image_descriptions,
         exclude_image_types=processor_config.exclude_image_types,
     )
diff --git a/tests/test_mistral_ocr.py b/tests/test_mistral_ocr.py
@@ -6,8 +6,9 @@
 
 import pytest
 
-from raglite import ImageType, MistralOCRConfig
+from raglite import MistralOCRConfig
 from raglite._mistral_ocr import (
+    _build_image_annotation_model,
     _process_ocr_response,
     mistral_ocr_to_markdown,
 )
@@ -42,10 +43,12 @@ def test_process_ocr_response() -> None:
             ("![](img-r.jpeg)", [("img-r.jpeg", "raw fallback text")]),  # page 2
         ]
     )
+    annotation_model = _build_image_annotation_model(frozenset({"diagram", "logo"}))
     result = _process_ocr_response(
         response,
+        annotation_model=annotation_model,
         include_image_descriptions=True,
-        exclude_image_types=frozenset({ImageType.LOGO}),
+        exclude_image_types=frozenset({"logo"}),
     )
     assert "[Image (diagram): A flowchart]" in result
     assert "Company logo" not in result

Original file line number	Diff line number	Diff line change
`@@ -6,8 +6,9 @@`
`6`	`6`
`7`	`7`	`import pytest`
`8`	`8`
`9`		`-from raglite import ImageType, MistralOCRConfig`
	`9`	`+from raglite import MistralOCRConfig`
`10`	`10`	`from raglite._mistral_ocr import (`
	`11`	`+ _build_image_annotation_model,`
`11`	`12`	`_process_ocr_response,`
`12`	`13`	`mistral_ocr_to_markdown,`
`13`	`14`	`)`
`@@ -42,10 +43,12 @@ def test_process_ocr_response() -> None:`
`42`	`43`	`("![](img-r.jpeg)", [("img-r.jpeg", "raw fallback text")]), # page 2`
`43`	`44`	`]`
`44`	`45`	`)`
	`46`	`+ annotation_model = _build_image_annotation_model(frozenset({"diagram", "logo"}))`
`45`	`47`	`result = _process_ocr_response(`
`46`	`48`	`response,`
	`49`	`+ annotation_model=annotation_model,`
`47`	`50`	`include_image_descriptions=True,`
`48`		`- exclude_image_types=frozenset({ImageType.LOGO}),`
	`51`	`+ exclude_image_types=frozenset({"logo"}),`
`49`	`52`	`)`
`50`	`53`	`assert "[Image (diagram): A flowchart]" in result`
`51`	`54`	`assert "Company logo" not in result`