diff --git a/samples-mistral/python/mistral-large3-basics.ipynb b/samples-mistral/python/mistral-large3-basics.ipynb
index ec43f6a1..5b9d0287 100644
--- a/samples-mistral/python/mistral-large3-basics.ipynb
+++ b/samples-mistral/python/mistral-large3-basics.ipynb
@@ -35,7 +35,8 @@
 "source": [
 "import requests\n",
 "import re\n",
- "from IPython.display import Markdown, display"
+ "from IPython.display import Markdown, display\n",
+ "import base64"
 ]
 },
 {
@@ -168,6 +169,83 @@
 "display(Markdown(compoundQuestion2Response.json()[\"choices\"][0][\"message\"][\"content\"]))"
 ]
 },
+ {
+ "cell_type": "markdown",
+ "id": "6d01e485",
+ "metadata": {},
+ "source": [
+ "## Multimodal"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bc9f39d5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!wget -O fred.png \"https://fred.stlouisfed.org/graph/fredgraph.png?g=1On5p&height=490\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "327a5ce9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(\"fred.png\", \"rb\") as image_file:\n",
+ " encoded_string = base64.b64encode(image_file.read()).decode(\"utf-8\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8c82f0fc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "multimodal = {\n",
+ " \"model\": AZURE_AI_FOUNDRY_MODEL_DEPLOYMENT_NAME,\n",
+ " \"messages\": [\n",
+ " {\n",
+ " \"role\": \"user\",\n",
+ " \"content\": [\n",
+ " {\"type\": \"text\", \"text\": \"Describe this image and any trends.\"},\n",
+ " {\n",
+ " \"type\": \"image_url\",\n",
+ " \"image_url\": {\"url\": f\"data:image/png;base64,{encoded_string}\"},\n",
+ " },\n",
+ " ],\n",
+ " }\n",
+ " ],\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1994f786",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "multimodalResponse = requests.post(\n",
+ " url=AZURE_AI_FOUNDRY_PROJECT_ENDPOINT,\n",
+ " json=multimodal,\n",
+ " headers=REQUEST_HEADERS,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "700f37f8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display(Markdown(multimodalResponse.json()[\"choices\"][0][\"message\"][\"content\"]))"
+ ]
+ },
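+ {
+ "cell_type": "markdown",
+ "id": "ab12cd34",
+ "metadata": {},
+ "source": [
+ "The payload above hard-codes the image MIME type, which is easy to get wrong when switching between PNG and JPEG inputs. The next cell is an optional sketch (the helper name `image_to_data_url` is illustrative, not part of any API) that derives the prefix from the file itself with the standard-library `mimetypes` module."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ef56ab78",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import mimetypes\n",
+ "\n",
+ "\n",
+ "def image_to_data_url(path: str) -> str:\n",
+ "    # Guess the MIME type from the file extension; fall back to PNG.\n",
+ "    mime_type, _ = mimetypes.guess_type(path)\n",
+ "    if mime_type is None:\n",
+ "        mime_type = \"image/png\"\n",
+ "    with open(path, \"rb\") as f:\n",
+ "        encoded = base64.b64encode(f.read()).decode(\"utf-8\")\n",
+ "    return f\"data:{mime_type};base64,{encoded}\"\n",
+ "\n",
+ "\n",
+ "# e.g. image_to_data_url(\"fred.png\") -> \"data:image/png;base64,...\""
+ ]
+ },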
 {
 "cell_type": "markdown",
 "id": "c704cdad",
 "metadata": {},
 "source": [
@@ -589,14 +667,6 @@
 "source": [
 "display(Markdown(finance2Response.json()[\"choices\"][0][\"message\"][\"content\"]))"
 ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f77cdb78",
- "metadata": {},
- "outputs": [],
- "source": []
 }
 ],
 "metadata": {
diff --git a/samples-mistral/python/mistral-ocr-with-vlm.ipynb b/samples-mistral/python/mistral-ocr-with-vlm.ipynb
deleted file mode 100644
index 7236acfa..00000000
--- a/samples-mistral/python/mistral-ocr-with-vlm.ipynb
+++ /dev/null
@@ -1,356 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "3a72f557-356b-4730-a30a-08c7735f2e57",
- "metadata": {},
- "outputs": [],
- "source": [
- "import base64\n",
- "import httpx\n",
- "import json\n",
- "import pymupdf\n",
- "\n",
- "from pathlib import Path\n",
- "from typing import Dict, Any, Optional, List"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f24d0a15-8f77-4c0a-a22b-56fe5aa8ca79",
- "metadata": {},
- "source": [
- "# Combining `mistral-ocr` and `mistral-small-2503` for advanced document analysis\n",
- "\n",
- "This notebook showcases an example of combining two Mistral AI models for advanced document analysis:\n",
- "\n",
- "- `mistral-ocr-2503` to extract text and image content from a document,\n",
- "- `mistral-small-2503` to process the extracted text and image content.\n",
- "\n",
- "You will use the [Pixtral 12B](https://arxiv.org/pdf/2410.07073) technical report as an example of a raw input document. The entire workflow will be implemented using Azure AI Foundry deployments.\n",
- "\n",
- "> **Important**: The OCR API endpoint does not have Internet access, so you will have to ensure that the input documents are readily available on your client machine, or download them beforehand."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "767ab6cc-b34c-475c-b53e-670c45140cb2",
- "metadata": {},
- "source": [
- "## 0. Gathering metadata\n",
- "\n",
- "Fill in the following variables with your own endpoint URLs and API keys:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b8ad819a-1be8-4166-a4d6-2ce257b18f92",
- "metadata": {},
- "outputs": [],
- "source": [
- "AZURE_MISTRAL_OCR_ENDPOINT = \"\"\n",
- "AZURE_MISTRAL_OCR_API_KEY = \"\"\n",
- "AZURE_MISTRAL_SMALL_ENDPOINT = \"\"\n",
- "AZURE_MISTRAL_SMALL_API_KEY = \"\""
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ede7a5a9-982e-4d25-a48a-655e51b26bd5",
- "metadata": {},
- "source": [
- "Next, put the (local) path to the document to analyze here:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9962b3a9-a077-4914-9d65-60c9866accb0",
- "metadata": {},
- "outputs": [],
- "source": [
- "INPUT_DOCUMENT_PATH = \"\""
- ]
- },
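- {
- "cell_type": "markdown",
- "id": "aaaa1111-0000-4000-8000-000000000001",
- "metadata": {},
- "source": [
- "Because the OCR endpoint cannot fetch URLs itself, the report has to be downloaded client-side first. The following optional cell is a minimal sketch that grabs the Pixtral 12B report with `httpx` (the local filename is arbitrary; point `INPUT_DOCUMENT_PATH` at wherever you save it):"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "aaaa1111-0000-4000-8000-000000000002",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Download the Pixtral 12B technical report to a local file; the OCR\n",
- "# endpoint has no Internet access, so the PDF must be sent from here.\n",
- "pdf_url = \"https://arxiv.org/pdf/2410.07073\"\n",
- "with httpx.Client(follow_redirects=True) as client:\n",
- "    resp = client.get(pdf_url, timeout=60.0)\n",
- "    resp.raise_for_status()\n",
- "Path(\"pixtral-12b-report.pdf\").write_bytes(resp.content)"
- ]
- },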
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d41d012b-7b64-4992-843e-d78b254cb279", - "metadata": {}, - "outputs": [], - "source": [ - "def _encode_document_pages_to_base64(pdf_doc_path: str) -> List[str]:\n", - " encoded_pages: List[str] = []\n", - " doc = pymupdf.open(pdf_doc_path)\n", - " for page in doc:\n", - " page_bytes = page.get_pixmap().tobytes(\"jpeg\")\n", - " page_b64_encoded = base64.b64encode(page_bytes).decode(\"utf-8\")\n", - " encoded_pages.append(page_b64_encoded)\n", - " return encoded_pages\n", - "\n", - "\n", - "def _encode_document_to_base64(document_path: str) -> str:\n", - " with Path(document_path).open(mode=\"rb\") as f_in:\n", - " doc_encoded = base64.b64encode(f_in.read()).decode(\"utf-8\")\n", - " return doc_encoded\n", - "\n", - "\n", - "def _call_ocr_model(\n", - " endpoint: str, api_key: str, base64_input_data: str\n", - ") -> Dict[str, Any]:\n", - " endpoint_url = f\"{endpoint}/v1/ocr\"\n", - " headers = {\n", - " \"Content-Type\": \"application/json\",\n", - " \"Accept\": \"application/json\",\n", - " \"Authorization\": f\"Bearer {api_key}\",\n", - " }\n", - " payload = {\n", - " \"model\": \"mistral-ocr-2503\",\n", - " \"document\": {\"type\": \"document_url\", \"document_url\": base64_input_data},\n", - " \"include_image_base64\": True,\n", - " }\n", - " with httpx.Client() as client:\n", - " ocr_resp = client.post(\n", - " url=endpoint_url, headers=headers, json=payload, timeout=60.0\n", - " )\n", - " ocr_resp.raise_for_status()\n", - " return ocr_resp.json()\n", - "\n", - "\n", - "def _call_vlm_model(\n", - " endpoint: str,\n", - " api_key: str,\n", - " user_message: Dict[str, Any],\n", - " system_message: Dict[str, str],\n", - ") -> Dict[str, Any]:\n", - " url = f\"{endpoint}/v1/chat/completions\"\n", - " headers = {\n", - " \"Content-Type\": \"application/json\",\n", - " \"Accept\": \"application/json\",\n", - " \"Authorization\": f\"Bearer {api_key}\",\n", - " }\n", - " payload = {\n", - " \"model\": \"mistral-small-2503\",\n", - " \"messages\": [system_message, user_message],\n", - " \"temperature\": 0,\n", - " \"response_format\": {\"type\": \"json_object\"},\n", - " }\n", - " with httpx.Client() as client:\n", - " resp = client.post(url=url, headers=headers, json=payload, timeout=60.0)\n", - " resp.raise_for_status()\n", - " return resp.json()" - ] - }, - { - "cell_type": "markdown", - "id": "00c87850-d1f7-4237-b2a2-90ec32cdb58e", - "metadata": {}, - "source": [ - "## 2. Creating the `Document` class\n", - "\n", - "The `Document` class is designed to manage and process documents, particularly PDFs. It initializes with an optional source file path and provides three primary methods: \n", - "\n", - "- `parse`, which encodes the document to base64 and uses the `mistral-ocr-2503` model to extract text and images,\n", - "- `summarize`, which uses `mistral-small-2503` to summarize the document's content,\n", - "- `describe_figures`, which identifies and describes images within the parsed document using `mistral-small-2503`." 
- {
- "cell_type": "markdown",
- "id": "00c87850-d1f7-4237-b2a2-90ec32cdb58e",
- "metadata": {},
- "source": [
- "## 2. Creating the `Document` class\n",
- "\n",
- "The `Document` class is designed to manage and process documents, particularly PDFs. It initializes with an optional source file path and provides three primary methods:\n",
- "\n",
- "- `parse`, which encodes the document to base64 and uses the `mistral-ocr-2503` model to extract text and images,\n",
- "- `summarize`, which uses `mistral-small-2503` to summarize the document's content,\n",
- "- `describe_figures`, which identifies and describes images within the parsed document using `mistral-small-2503`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "c1e95178-ebc5-42e1-9b73-533d33140b1f",
- "metadata": {},
- "outputs": [],
- "source": [
- "class Document:\n",
- " def __init__(self, source_file: str | None = None):\n",
- " self.source_file: str | None = source_file\n",
- " self.parsed_doc: Dict[str, Any] | None = None\n",
- "\n",
- " def parse(self):\n",
- " encoded_doc = _encode_document_to_base64(document_path=self.source_file)\n",
- " self.parsed_doc = _call_ocr_model(\n",
- " endpoint=AZURE_MISTRAL_OCR_ENDPOINT,\n",
- " api_key=AZURE_MISTRAL_OCR_API_KEY,\n",
- " base64_input_data=f\"data:application/pdf;base64,{encoded_doc}\",\n",
- " )\n",
- "\n",
- " def summarize(self) -> Dict[str, Any]:\n",
- " system_message = {\"role\": \"system\", \"content\": SUMMARIZATION_SYSTEM_MESSAGE}\n",
- " user_message_content: List[Dict[str, Any]] = []\n",
- " pages = self.parsed_doc[\"pages\"]\n",
- " for page in pages:\n",
- " user_message_content.append({\"type\": \"text\", \"text\": page[\"markdown\"]})\n",
- " user_message = {\"role\": \"user\", \"content\": user_message_content}\n",
- " vlm_resp = _call_vlm_model(\n",
- " endpoint=AZURE_MISTRAL_SMALL_ENDPOINT,\n",
- " api_key=AZURE_MISTRAL_SMALL_API_KEY,\n",
- " system_message=system_message,\n",
- " user_message=user_message,\n",
- " )\n",
- " return json.loads(vlm_resp[\"choices\"][0][\"message\"][\"content\"])\n",
- "\n",
- " def describe_figures(self, pages: Optional[List[int]] = None) -> List[Dict[str, Any]]:\n",
- " system_message = {\"role\": \"system\", \"content\": DESC_FIG_SYSTEM_MESSAGE}\n",
- " figures: List[Dict[str, Any]] = []\n",
- " for idx, page in enumerate(self.parsed_doc[\"pages\"]):\n",
- " # Honor the optional page filter; describe all pages when it is None.\n",
- " if pages is not None and idx not in pages:\n",
- " continue\n",
- " for img in page[\"images\"]:\n",
- " user_message = {\n",
- " \"role\": \"user\",\n",
- " \"content\": [\n",
- " {\"type\": \"image_url\", \"image_url\": {\"url\": img[\"image_base64\"]}}\n",
- " ],\n",
- " }\n",
- " vlm_resp = _call_vlm_model(\n",
- " endpoint=AZURE_MISTRAL_SMALL_ENDPOINT,\n",
- " api_key=AZURE_MISTRAL_SMALL_API_KEY,\n",
- " system_message=system_message,\n",
- " user_message=user_message,\n",
- " )\n",
- " desc_dict = json.loads(vlm_resp[\"choices\"][0][\"message\"][\"content\"])\n",
- " fig_desc = {\"page\": idx, \"desc\": desc_dict}\n",
- " figures.append(fig_desc)\n",
- " return figures"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "c15084ab-99e6-4e6f-a505-bb8df75ade70",
- "metadata": {},
- "source": [
- "## 3. Using the `Document` class\n",
- "\n",
- "You can now analyze your documents with the `Document` class! Start by creating an instance, passing the path to the document you wish to study:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a246233a-267b-46cf-8be4-fbcdc53b65b2",
- "metadata": {},
- "outputs": [],
- "source": [
- "doc = Document(INPUT_DOCUMENT_PATH)\n",
- "doc.parse()"
- ]
- },
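- {
- "cell_type": "markdown",
- "id": "aaaa1111-0000-4000-8000-000000000005",
- "metadata": {},
- "source": [
- "If you want to see what the OCR model returned before running the downstream tasks, this optional sketch prints the page count and a short preview of the first page's Markdown (the `pages` and `markdown` fields are the same ones `summarize` and `describe_figures` read):"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "aaaa1111-0000-4000-8000-000000000006",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Peek at the raw OCR output: number of pages and start of page 1.\n",
- "parsed_pages = doc.parsed_doc[\"pages\"]\n",
- "print(f\"Parsed {len(parsed_pages)} pages\")\n",
- "print(parsed_pages[0][\"markdown\"][:500])"
- ]
- },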
- {
- "cell_type": "markdown",
- "id": "f1f25c91-bad5-483e-8fb9-9360e596fcd3",
- "metadata": {},
- "source": [
- "Now that the document has been parsed into text and image blocks by the OCR model, you can run downstream processing on these blocks. For example, you can summarize the document's text:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b419dd09-a226-4877-978e-6eb5fd7fbc57",
- "metadata": {},
- "outputs": [],
- "source": [
- "doc.summarize()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f415a026-16b1-4082-b7cf-7fcd50edc810",
- "metadata": {},
- "source": [
- "You can also leverage the multimodal abilities of `mistral-small-2503` to annotate all the images and figures that are present in the document:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "debb178d-d0ca-4678-b6ca-e376c172a6be",
- "metadata": {},
- "outputs": [],
- "source": [
- "doc.describe_figures()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "6339f28b-99c1-496c-add6-238458c8cc23",
- "metadata": {},
- "source": [
- "## 4. Wrapping up\n",
- "\n",
- "You now have a working example of how to perform advanced analysis on the text and images extracted from a PDF document in Azure AI!"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}