diff --git a/data/sample.pptx b/data/sample.pptx new file mode 100644 index 00000000..11ec803a Binary files /dev/null and b/data/sample.pptx differ diff --git a/mistral/ocr/ocr_pptx.ipynb b/mistral/ocr/ocr_pptx.ipynb new file mode 100644 index 00000000..1385c2ec --- /dev/null +++ b/mistral/ocr/ocr_pptx.ipynb @@ -0,0 +1,748 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7cc51be0", + "metadata": {}, + "source": [ + "# OCR and Annotations on a PPTX file\n", + "\n", + "In this cookbook, we will explore the basics of Annotations and how to extract structured outputs \n", + "from a PPTX file using the Mistral OCR API.\n", + "\n", + "This notebook runs OCR on `data/sample.pptx` and prints the results inline.\n", + "\n", + "> Set a valid Mistral API key in the `api_key` variable in section 2 below; the notebook does not read **MISTRAL_API_KEY** from the environment." + ] + }, + { + "cell_type": "markdown", + "id": "7d2b105e", + "metadata": {}, + "source": [ + "## 1) Install dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "78599125", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.2\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip3 install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "# Install the Mistral client library\n", + "%pip install -qU mistralai==1.9.3 pydantic==2.11.1\n" + ] + }, + { + "cell_type": "markdown", + "id": "be40c7d3", + "metadata": {}, + "source": [ + "## 2) Configure paths & client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5edc7296", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "Using: ../../data/sample.pptx\n" + ] + } + ], + "source": [ + "import json\n", + "import base64\n", + "from enum import Enum\n", + "from pydantic import BaseModel, Field\n", + "from mistralai import Mistral, DocumentURLChunk\n", + "from mistralai.extra import response_format_from_pydantic_model\n", + "\n", + "# --- File paths ---\n", + "doc_path = \"../../data/sample.pptx\"\n", + "\n", + "# --- Client ---\n", + "api_key = \"YOUR_API_KEY\" # Replace with your API key\n", + "client = Mistral(api_key=api_key)\n", + "\n", + "print(\"Using:\", doc_path)" + ] + }, + { + "cell_type": "markdown", + "id": "647bd914", + "metadata": {}, + "source": [ + "## 3) Define schemas for annotations" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "552f2a11", + "metadata": {}, + "outputs": [], + "source": [ + "class ImageType(str, Enum):\n", + " GRAPH = \"graph\"\n", + " TEXT = \"text\"\n", + " TABLE = \"table\"\n", + " IMAGE = \"image\"\n", + "\n", + "class Image(BaseModel):\n", + " image_type: ImageType = Field(..., description=\"The type of the image. 
Must be one of 'graph', 'text', 'table' or 'image'.\")\n", + " description: str = Field(..., description=\"A description of the image.\")\n", + "\n", + "class Document(BaseModel):\n", + " summary: str = Field(..., description=\"A summary of the document.\")\n", + " models: str = Field(..., description=\"A list of models provided by Mistral AI​.\")" + ] + }, + { + "cell_type": "markdown", + "id": "6a3f4892", + "metadata": {}, + "source": [ + "## 4) Helper: read file(s) as base64" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "425b9df9", + "metadata": {}, + "outputs": [], + "source": [ + "def encode_file_b64(path: str):\n", + " try:\n", + " with open(path, \"rb\") as f:\n", + " return base64.b64encode(f.read()).decode(\"utf-8\")\n", + " except FileNotFoundError:\n", + " print(f\"Error: The file {path} was not found.\")\n", + " return None\n", + " except Exception as e:\n", + " print(\"Error:\", e)\n", + " return None\n", + "\n", + "base64_doc = encode_file_b64(doc_path)\n", + "assert base64_doc is not None, \"Input PPTX not found – check path.\"" + ] + }, + { + "cell_type": "markdown", + "id": "050a31a7", + "metadata": {}, + "source": [ + "## 5) Run OCR (Full Text)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "ce5dfcb1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== OCR Output ===\n", + "{\n", + " \"pages\": [\n", + " {\n", + " \"index\": 0,\n", + " \"markdown\": \"# Frontier AI. \\u000bIn your hands.\\nConfigurable AI for all builders\\n\\n# July 2025\\n\\nConfidential ©2025 Mistral AI\\\\. 
All rights reserved\\\\.\",\n", + " \"images\": [],\n", + " \"dimensions\": null\n", + " },\n", + " {\n", + " \"index\": 1,\n", + " \"markdown\": \"# The Mistral AI model landscape\\n\\n__Our __ __moderation service__ __ to detect __ __harmful__ __ text content__\\n\\n__Our __ __most __ __efficient / powerful__ __edge__ __ models__\\n\\n![img-0.jpeg](img-0.jpeg)\\n\\n![img-1.jpeg](img-1.jpeg)\\n\\n__Our __ __frontier\\\\-class reasoning __ __model__\\n\\n__Our flagship model\\\\, for__ __your __ __most sophisticated needs__\\n\\n![img-2.jpeg](img-2.jpeg)\\n\\n![img-3.jpeg](img-3.jpeg)\\n\\n__Mistral Moderation__\\n\\n__Mistral Medium 3__\\n\\n__Magistral Medium __\\n\\n__The world’s __ __best OCR model __ __to\\\\-date__\\n\\n![img-4.jpeg](img-4.jpeg)\\n\\n__Our __ __embedding model__ __ to make your data accessible__\\n\\n__Our __ __low\\\\-__ __latency code model__ __ for code generation \\\\(FIM\\\\)__\\n\\n__Our __ __agentic LLM__ __ for high\\\\-performance software engineering tasks__\\n\\n![img-5.jpeg](img-5.jpeg)\\n\\n![img-6.jpeg](img-6.jpeg)\\n\\n![img-7.jpeg](img-7.jpeg)\\n\\n__Devstral __\\n\\n__Medium__\\n\\n__Our __ __enterprise\\\\-__ __grade small model__ __\\\\, cost efficient and fast__\\n\\n__Our __ __small reasoning__ __ model__\\n\\n__Our __ __small__ __ __ __agentic LLM__ __ for software engineering tasks__\\n\\n__Open\\\\-__ __source__ __ models__ _Apache 2\\\\.0_\\n\\n__Our __ __frontier speech understanding__ __ models__\\n\\n![img-8.jpeg](img-8.jpeg)\\n\\n![img-9.jpeg](img-9.jpeg)\\n\\n![img-10.jpeg](img-10.jpeg)\\n\\n![img-11.jpeg](img-11.jpeg)\\n\\n__Voxstral Small/Mini__\\n\\n__Mistral __ __Small 3\\\\.2__\\n\\n__Magistral Small __\\n\\nConfidential ©2025 Mistral AI\\\\. 
All rights reserved\\\\.\\n\\n![img-12.jpeg](img-12.jpeg)\",\n", + " \"images\": [\n", + " {\n", + " \"id\": \"img-0.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": null\n", + " },\n", + " {\n", + " \"id\": \"img-1.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": null\n", + " },\n", + " {\n", + " \"id\": \"img-2.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": null\n", + " },\n", + " {\n", + " \"id\": \"img-3.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": null\n", + " },\n", + " {\n", + " \"id\": \"img-4.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": null\n", + " },\n", + " {\n", + " \"id\": \"img-5.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": null\n", + " },\n", + " {\n", + " \"id\": \"img-6.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": null\n", + " },\n", + " {\n", + " \"id\": \"img-7.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " 
\"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": null\n", + " },\n", + " {\n", + " \"id\": \"img-8.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": null\n", + " },\n", + " {\n", + " \"id\": \"img-9.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": null\n", + " },\n", + " {\n", + " \"id\": \"img-10.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": null\n", + " },\n", + " {\n", + " \"id\": \"img-11.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": null\n", + " },\n", + " {\n", + " \"id\": \"img-12.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": null\n", + " }\n", + " ],\n", + " \"dimensions\": null\n", + " },\n", + " {\n", + " \"index\": 2,\n", + " \"markdown\": \"# Mistral Medium Benchmark\\n\\n![img-13.jpeg](img-13.jpeg)\",\n", + " \"images\": [\n", + " {\n", + " \"id\": \"img-13.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": null\n", + " }\n", + " ],\n", + " \"dimensions\": null\n", + " },\n", + " {\n", + " \"index\": 3,\n", + " \"markdown\": \"# Mistral Medium 3.1\\n\\n![img-14.jpeg](img-14.jpeg)\",\n", + " \"images\": [\n", + 
" {\n", + " \"id\": \"img-14.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": null\n", + " }\n", + " ],\n", + " \"dimensions\": null\n", + " },\n", + " {\n", + " \"index\": 4,\n", + " \"markdown\": \"# We’ve delivered for our customers\\n\\n![img-15.jpeg](img-15.jpeg)\\n\\n![img-16.jpeg](img-16.jpeg)\\n\\n![img-17.jpeg](img-17.jpeg)\\n\\n__Use case: __ __Banking Advisor Agent for Customer Support__\\n\\n__Adoption: __ __>5000 tickets automated per month__\\n\\n__Metrics:__ __ 88% accuracy\\\\, 80% decrease in cost per ticket__\\n\\n__Business value: __ __Est\\\\. ~$500k / yr\\\\. saved__\\n\\n__Use case:__ __ Vision\\\\-Language\\\\-Action models\\\\, enhancing defence platforms' ability to understand their environment\\\\, communicate with operators\\\\, and make rapid\\\\, reliable decisions in complex scenarios\\\\. This initiative fosters human\\\\-AI collaboration on the battlefield and revolutionizing defence operations\\\\.__\\n\\n__Use case: __ Comprehensive internal generative AI platform\\n\\n__Adoption: __ 30k\\\\+ users\\\\, 100\\\\+ use cases across banking\\\\, markets\\\\, retail\\n\\n__Metrics: __ ~50% cheaper than Llama and other alternatives\\n\\n__Business value: __ Est\\\\. >$30M / yr productivity unlocked\\n\\n![img-18.jpeg](img-18.jpeg)\\n\\n![img-19.jpeg](img-19.jpeg)\\n\\n![img-20.jpeg](img-20.jpeg)\\n\\n__Use case:__ Enterprise AI agents for customer service and internal RAG applications\\n\\n__Adoption__ : 30k\\\\+ active users\\n\\n__Metrics__ : 20%\\\\+ uplift in daily productivity with task automation\\n\\n__Business value__ : Est\\\\. >$50M / yr\\\\. 
saved\\n\\n__Use case__ : Automatic customer review response\\n\\n__Adoption__ : 3k\\\\+ stores\\\\, 1\\\\.3M monthly reviews\\n\\n__Metrics:__ 80%\\\\+ reduction in review response time\\\\, 80%\\\\+ satisfaction\\n\\n__Business value:__ Est\\\\. >$10M / yr\\\\. saved\\n\\n__Use case:__ SQL assistant Cortex Analyst\\n\\n__Adoption:__ Est\\\\. 100\\\\+ Snowflake customers in production\\n\\n__Metrics__ :  ~90%\\\\+ accuracy\\\\, ~2X single\\\\-shot SQL generation\\n\\n__Business value__ : Est\\\\. >$3M productivity unlocked\\n\\n_Find more stories on:_\\n\\n_https://mistral\\\\.ai/customers_\\n\\nConfidential ©2025 Mistral AI\\\\. All rights reserved\\\\.\\n\\n![img-21.jpeg](img-21.jpeg)\",\n", + " \"images\": [\n", + " {\n", + " \"id\": \"img-15.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": null\n", + " },\n", + " {\n", + " \"id\": \"img-16.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": null\n", + " },\n", + " {\n", + " \"id\": \"img-17.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": null\n", + " },\n", + " {\n", + " \"id\": \"img-18.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": null\n", + " },\n", + " {\n", + " \"id\": \"img-19.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": null\n", + " },\n", + " {\n", + " 
\"id\": \"img-20.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": null\n", + " },\n", + " {\n", + " \"id\": \"img-21.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": null\n", + " }\n", + " ],\n", + " \"dimensions\": null\n", + " }\n", + " ],\n", + " \"model\": \"mistral-ocr-latest\",\n", + " \"usage_info\": {\n", + " \"pages_processed\": 5,\n", + " \"doc_size_bytes\": 7002042\n", + " },\n", + " \"document_annotation\": null\n", + "}\n" + ] + } + ], + "source": [ + "# URL for PPTX so OCR can read it directly\n", + "# Use application/vnd.openxmlformats-officedocument.wordprocessingml.document for DOCX\n", + "data_url = f\"data:application/vnd.openxmlformats-officedocument.presentationml.presentation;base64,{base64_doc}\"\n", + "\n", + "# -- OCR (full) --\n", + "ocr_response = client.ocr.process(\n", + " model=\"mistral-ocr-latest\",\n", + " document={\"type\": \"document_url\", \"document_url\": data_url},\n", + " include_image_base64=True,\n", + ")\n", + "\n", + "ocr_dict = json.loads(ocr_response.model_dump_json())\n", + "\n", + "def truncate_base64(obj):\n", + " if isinstance(obj, dict):\n", + " return {k: truncate_base64(v) for k, v in obj.items()}\n", + " elif isinstance(obj, list):\n", + " return [truncate_base64(item) for item in obj]\n", + " elif isinstance(obj, str) and obj.startswith('...\",\n", + " \"image_annotation\": \"{\\n \\\"image_type\\\": \\\"image\\\",\\n \\\"description\\\": \\\"The image depicts a pixelated, retro-style illustration of a computer or television set. The screen of the device displays a simple, blocky graphic resembling a stylized letter 'H'. 
The overall design is reminiscent of early video game or computer graphics, characterized by its minimalistic and blocky aesthetic.\\\"\\n}\"\n", + " },\n", + " {\n", + " \"id\": \"img-1.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": \"{\\n \\\"image_type\\\": \\\"image\\\",\\n \\\"description\\\": \\\"A pixelated image of a hand pointing to the right, with a background resembling a castle or fortress.\\\"\\n}\"\n", + " },\n", + " {\n", + " \"id\": \"img-2.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": \"{\\n \\\"image_type\\\": \\\"image\\\",\\n \\\"description\\\": \\\"A pixelated image of a mountain with a red and white striped base. The mountain is depicted in blue and yellow pixels, and the base consists of horizontal stripes in red and white.\\\"\\n}\"\n", + " },\n", + " {\n", + " \"id\": \"img-3.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": \"{\\n \\\"image_type\\\": \\\"image\\\",\\n \\\"description\\\": \\\"A pixelated image of a Christmas tree with the letters 'MM' at the base.\\\"\\n}\"\n", + " },\n", + " {\n", + " \"id\": \"img-4.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": \"{\\n \\\"image_type\\\": \\\"image\\\",\\n \\\"description\\\": \\\"A pixelated image of a book with a blue cover and a red ribbon bookmark.\\\"\\n}\"\n", + " },\n", + " {\n", + " \"id\": \"img-5.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " 
\"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": \"{\\n \\\"image_type\\\": \\\"image\\\",\\n \\\"description\\\": \\\"A pixelated image of a computer monitor displaying a green blocky figure.\\\"\\n}\"\n", + " },\n", + " {\n", + " \"id\": \"img-6.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": \"{\\n \\\"image_type\\\": \\\"image\\\",\\n \\\"description\\\": \\\"A pixelated image of a window with a black background and the letters 'M' and 'M' in white pixels. The window has a title bar with three colored buttons: red, yellow, and green.\\\"\\n}\"\n", + " },\n", + " {\n", + " \"id\": \"img-7.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": \"{\\n \\\"image_type\\\": \\\"image\\\",\\n \\\"description\\\": \\\"A pixelated image of a mushroom with a red top and white spots, and a brown stem, set against a black background.\\\"\\n}\"\n", + " },\n", + " {\n", + " \"id\": \"img-8.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": \"{\\n \\\"image_type\\\": \\\"image\\\",\\n \\\"description\\\": \\\"A pixelated image of a hand pointing to the right, with an orange and brown color scheme.\\\"\\n}\"\n", + " },\n", + " {\n", + " \"id\": \"img-9.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": \"{\\n \\\"image_type\\\": \\\"image\\\",\\n \\\"description\\\": \\\"A pixelated image of a robot with a square 
head, a green light on its chest, and a rectangular body standing on a black background.\\\"\\n}\"\n", + " },\n", + " {\n", + " \"id\": \"img-10.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": \"{\\n \\\"image_type\\\": \\\"image\\\",\\n \\\"description\\\": \\\"A pixel art image of a pink flower with green leaves.\\\"\\n}\"\n", + " },\n", + " {\n", + " \"id\": \"img-11.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": \"{\\n \\\"image_type\\\": \\\"image\\\",\\n \\\"description\\\": \\\"A pixelated image of a window with a black background and the letters 'M' and 'M' in white pixels. The window has a title bar with three colored buttons: red, yellow, and green.\\\"\\n}\"\n", + " },\n", + " {\n", + " \"id\": \"img-12.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": \"{\\n \\\"image_type\\\": \\\"image\\\",\\n \\\"description\\\": \\\"The image is a logo for Mistral AI. 
It features a stylized 'M' made up of horizontal bars in a gradient of colors from yellow to red, and the text 'Mistral AI' in white.\\\"\\n}\"\n", + " }\n", + " ],\n", + " \"dimensions\": null\n", + " },\n", + " {\n", + " \"index\": 2,\n", + " \"markdown\": \"# Mistral Medium Benchmark\\n\\n![img-13.jpeg](img-13.jpeg)\",\n", + " \"images\": [\n", + " {\n", + " \"id\": \"img-13.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": \"{\\n \\\"image_type\\\": \\\"table\\\",\\n \\\"description\\\": \\\"This table compares the performance of different AI models across various benchmarks. The benchmarks are categorized into Coding, Instruction Following, Math, Knowledge, Long Context, and Multimodal. Each category includes specific tests, and the performance of each model is represented as a percentage. The models compared are Mistral Medium 3, Llama 4 Maverick, GPT-4o, Claude Sonnet 3.7, Command-A, and DeepSeek 3.1. The table highlights the strengths and weaknesses of each model in different areas, providing a comprehensive overview of their capabilities.\\\"\\n}\"\n", + " }\n", + " ],\n", + " \"dimensions\": null\n", + " },\n", + " {\n", + " \"index\": 3,\n", + " \"markdown\": \"# Mistral Medium 3.1\\n\\n![img-14.jpeg](img-14.jpeg)\",\n", + " \"images\": [\n", + " {\n", + " \"id\": \"img-14.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": \"{\\n \\\"image_type\\\": \\\"graph\\\",\\n \\\"description\\\": \\\"A horizontal bar chart comparing the win rates of Mistral against Llama 4 Maverick across different domains. The x-axis represents the Mistral win rate in percentage, while the y-axis lists the domains: Coding, Multimodal, English, French, Spanish, German, and Arabic. 
Each bar is divided into two segments: the orange segment represents Mistral's win rate, and the brown segment represents Llama 4 Maverick's win rate. The chart shows that Mistral has the highest win rate in Coding (81.82%) and the lowest in Multimodal (53.85%).\\\"\\n}\"\n", + " }\n", + " ],\n", + " \"dimensions\": null\n", + " },\n", + " {\n", + " \"index\": 4,\n", + " \"markdown\": \"# We’ve delivered for our customers\\n\\n![img-15.jpeg](img-15.jpeg)\\n\\n![img-16.jpeg](img-16.jpeg)\\n\\n![img-17.jpeg](img-17.jpeg)\\n\\n__Use case: __ __Banking Advisor Agent for Customer Support__\\n\\n__Adoption: __ __>5000 tickets automated per month__\\n\\n__Metrics:__ __ 88% accuracy\\\\, 80% decrease in cost per ticket__\\n\\n__Business value: __ __Est\\\\. ~$500k / yr\\\\. saved__\\n\\n__Use case:__ __ Vision\\\\-Language\\\\-Action models\\\\, enhancing defence platforms' ability to understand their environment\\\\, communicate with operators\\\\, and make rapid\\\\, reliable decisions in complex scenarios\\\\. This initiative fosters human\\\\-AI collaboration on the battlefield and revolutionizing defence operations\\\\.__\\n\\n__Use case: __ Comprehensive internal generative AI platform\\n\\n__Adoption: __ 30k\\\\+ users\\\\, 100\\\\+ use cases across banking\\\\, markets\\\\, retail\\n\\n__Metrics: __ ~50% cheaper than Llama and other alternatives\\n\\n__Business value: __ Est\\\\. >$30M / yr productivity unlocked\\n\\n![img-18.jpeg](img-18.jpeg)\\n\\n![img-19.jpeg](img-19.jpeg)\\n\\n![img-20.jpeg](img-20.jpeg)\\n\\n__Use case:__ Enterprise AI agents for customer service and internal RAG applications\\n\\n__Adoption__ : 30k\\\\+ active users\\n\\n__Metrics__ : 20%\\\\+ uplift in daily productivity with task automation\\n\\n__Business value__ : Est\\\\. >$50M / yr\\\\. 
saved\\n\\n__Use case__ : Automatic customer review response\\n\\n__Adoption__ : 3k\\\\+ stores\\\\, 1\\\\.3M monthly reviews\\n\\n__Metrics:__ 80%\\\\+ reduction in review response time\\\\, 80%\\\\+ satisfaction\\n\\n__Business value:__ Est\\\\. >$10M / yr\\\\. saved\\n\\n__Use case:__ SQL assistant Cortex Analyst\\n\\n__Adoption:__ Est\\\\. 100\\\\+ Snowflake customers in production\\n\\n__Metrics__ :  ~90%\\\\+ accuracy\\\\, ~2X single\\\\-shot SQL generation\\n\\n__Business value__ : Est\\\\. >$3M productivity unlocked\\n\\n_Find more stories on:_\\n\\n_https://mistral\\\\.ai/customers_\\n\\nConfidential ©2025 Mistral AI\\\\. All rights reserved\\\\.\\n\\n![img-21.jpeg](img-21.jpeg)\",\n", + " \"images\": [\n", + " {\n", + " \"id\": \"img-15.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": \"{\\n \\\"image_type\\\": \\\"image\\\",\\n \\\"description\\\": \\\"The image features a design with a green background on the left side and a black background on the right side. On the green background, there are four white, stylized bird shapes arranged in a circular pattern, creating a sense of movement and dynamism. The birds appear to be flying in a counter-clockwise direction, adding a sense of fluidity to the design. The contrast between the green and black backgrounds, along with the white bird shapes, makes the image visually striking and easy to recognize.\\\"\\n}\"\n", + " },\n", + " {\n", + " \"id\": \"img-16.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": \"{\\n \\\"image_type\\\": \\\"image\\\",\\n \\\"description\\\": \\\"The image displays the word 'Qonto' in a stylized font. 
The text is presented in a gradient of gray shades, transitioning from a darker gray on the left to a lighter gray on the right. The letter 'Q' is particularly distinctive, featuring a circular design with a dot placed slightly off-center to the right.\\\"\\n}\"\n", + " },\n", + " {\n", + " \"id\": \"img-17.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": \"{\\n \\\"image_type\\\": \\\"text\\\",\\n \\\"description\\\": \\\"The image contains the text 'Helsing' in a stylized font.\\\"\\n}\"\n", + " },\n", + " {\n", + " \"id\": \"img-18.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": \"{\\n \\\"image_type\\\": \\\"image\\\",\\n \\\"description\\\": \\\"The image features the logo of CMA CGM, a global shipping and logistics company. The logo consists of the company's name in blue capital letters, with 'CMA' and 'CGM' stacked on top of each other. To the right of the text, there is a stylized red arrow that curves around the text, creating a dynamic and modern look. The background of the image is black, which makes the blue text and red arrow stand out prominently. The phrase 'BETTER WAYS' is written in smaller blue capital letters to the left of the company name, emphasizing the company's commitment to innovative and efficient shipping solutions.\\\"\\n}\"\n", + " },\n", + " {\n", + " \"id\": \"img-19.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": \"{\\n \\\"image_type\\\": \\\"image\\\",\\n \\\"description\\\": \\\"The image displays the logo of Carrefour, a multinational retailer headquartered in France. 
The logo features the word 'Carrefour' in a custom, rounded font, colored in blue. To the right of the text, there is a stylized arrow composed of two colors: blue and red. The arrow forms a shape resembling a shopping cart or a person, with the red part acting as a dynamic element pointing to the right.\\\"\\n}\"\n", + " },\n", + " {\n", + " \"id\": \"img-20.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": \"{\\n \\\"image_type\\\": \\\"image\\\",\\n \\\"description\\\": \\\"The image features the logo of Snowflake, a cloud-based data warehousing company. The logo consists of a stylized snowflake icon on the left and the word 'snowflake' written in a modern, sans-serif font on the right. The design is in a blue color scheme, with the snowflake icon composed of geometric shapes that resemble snowflake patterns.\\\"\\n}\"\n", + " },\n", + " {\n", + " \"id\": \"img-21.jpeg\",\n", + " \"top_left_x\": null,\n", + " \"top_left_y\": null,\n", + " \"bottom_right_x\": null,\n", + " \"bottom_right_y\": null,\n", + " \"image_base64\": \"...\",\n", + " \"image_annotation\": \"{\\n \\\"image_type\\\": \\\"image\\\",\\n \\\"description\\\": \\\"The image displays the logo of Mistral AI. The logo consists of a stylized 'M' made up of stacked, horizontal bars in a gradient of colors transitioning from yellow at the top to red at the bottom. To the right of the stylized 'M', the text 'Mistral AI' is written in a clean, modern font.\\\"\\n}\"\n", + " }\n", + " ],\n", + " \"dimensions\": null\n", + " }\n", + " ],\n", + " \"model\": \"mistral-ocr-latest\",\n", + " \"usage_info\": {\n", + " \"pages_processed\": 5,\n", + " \"doc_size_bytes\": 7002042\n", + " },\n", + " \"document_annotation\": \"{\\n \\\"summary\\\": \\\"The document is a confidential presentation from Mistral AI, dated July 2025. 
It showcases various AI models offered by Mistral AI, including moderation services, edge models, reasoning models, OCR models, embedding models, code generation models, agentic LLMs, enterprise-grade small models, and speech understanding models. The document also highlights customer success stories, demonstrating the adoption and business value of Mistral AI's models in various industries such as banking, defense, and retail. Specific use cases include banking advisor agents, vision-language-action models for defense, comprehensive internal generative AI platforms, enterprise AI agents, automatic customer review responses, and SQL assistants.\\\",\\n \\\"models\\\": \\\"Mistral Moderation, Mistral Medium 3, Magistral Medium, Mistral Medium 3.1, Mistral Small 3.2, Magistral Small, Voxstral Small/Mini, Devstral, Medium\\\"\\n}\"\n", + "}\n" + ] + } + ], + "source": [ + "# -- OCR (annotations) --\n", + "annotations_response = client.ocr.process(\n", + " model=\"mistral-ocr-latest\",\n", + " document=DocumentURLChunk(document_url=data_url),\n", + " bbox_annotation_format=response_format_from_pydantic_model(Image),\n", + " document_annotation_format=response_format_from_pydantic_model(Document),\n", + " image_limit=30\n", + ")\n", + "\n", + "annotations_dict = json.loads(annotations_response.model_dump_json())\n", + "\n", + "# Print nicely\n", + "print(\"=== OCR Annotations Output ===\")\n", + "truncated_annotations = truncate_base64(annotations_dict)\n", + "print(json.dumps(truncated_annotations, indent=2, ensure_ascii=False))\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}