Skip to content

Commit a6ca6d7

Browse files
authored
Merge pull request #271 from bcgov/OCR-Embeedings
Ocr embeedings
2 parents 019d0ac + 05abebf commit a6ca6d7

File tree

4 files changed

+6533
-0
lines changed

4 files changed

+6533
-0
lines changed
Lines changed: 399 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,399 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# 1. PACKAGE INSTALLATION"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": null,
13+
"metadata": {
14+
"tags": []
15+
},
16+
"outputs": [],
17+
"source": [
18+
"# Install required packages\n",
19+
"!pip install boto3\n",
20+
"!pip install pillow\n",
21+
"!pip install ipywidgets"
22+
]
23+
},
24+
{
25+
"cell_type": "markdown",
26+
"metadata": {},
27+
"source": [
28+
"# 2. IMPORTS AND CONFIGURATIONS"
29+
]
30+
},
31+
{
32+
"cell_type": "code",
33+
"execution_count": null,
34+
"metadata": {
35+
"tags": []
36+
},
37+
"outputs": [],
38+
"source": [
39+
# Import necessary libraries
import os
import json
import boto3
import base64
from PIL import Image
from collections import defaultdict
from io import BytesIO

# Define paths and configurations
ROOT_FOLDER = 'images'              # directory tree scanned for images
OUTPUT_FILE = 'image_sonnet.json'   # nested {folder: {filename: description}} JSON store
SUPPORTED_FORMATS = ('.jpg', '.jpeg', '.png', '.gif', '.bmp')
IGNORE_PATTERNS = ('.ipynb_checkpoints', '-checkpoint')

# SECURITY: never hardcode AWS credentials in a notebook — outputs and diffs
# get shared. Read them from the environment instead; the empty-string default
# preserves the previous behavior when the variables are unset.
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
AWS_REGION = "us-east-1"
MODEL_ID = "anthropic.claude-3-5-sonnet-20240620-v1:0"


# Create output file if it doesn't exist, seeded with an empty JSON object
# so later json.load() calls succeed on first run.
if not os.path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, 'w') as f:
        json.dump({}, f)
    print(f"Created empty {OUTPUT_FILE}")
64+
]
65+
},
66+
{
67+
"cell_type": "markdown",
68+
"metadata": {},
69+
"source": [
70+
"# 3. MODEL INITIALIZATION"
71+
]
72+
},
73+
{
74+
"cell_type": "code",
75+
"execution_count": null,
76+
"metadata": {
77+
"tags": []
78+
},
79+
"outputs": [],
80+
"source": [
81+
# Instantiate a Bedrock runtime client from an explicit boto3 session using
# the key constants defined in the configuration cell.
# NOTE(review): when the key constants are empty strings, whether boto3 falls
# back to its default credential chain (env vars, ~/.aws, instance role) or
# fails on invoke should be confirmed against the boto3 Session docs.
session = boto3.Session(
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)
bedrock_runtime_client = session.client("bedrock-runtime", region_name=AWS_REGION)
87+
]
88+
},
89+
{
90+
"cell_type": "markdown",
91+
"metadata": {},
92+
"source": [
93+
"# 4. TEST CONNECTION"
94+
]
95+
},
96+
{
97+
"cell_type": "code",
98+
"execution_count": null,
99+
"metadata": {
100+
"tags": []
101+
},
102+
"outputs": [],
103+
"source": [
104+
# Smoke-test model access with a minimal one-message request before the
# batch run; invoke_model raises if credentials or model access are wrong.
test_request = {
    "anthropic_version": "bedrock-2023-05-31",
    "max_tokens": 200,
    "messages": [
        {
            "role": "user",
            "content": [{"type": "text", "text": "hello world"}],
        }
    ],
}
test_invoke = bedrock_runtime_client.invoke_model(
    modelId=MODEL_ID,
    body=json.dumps(test_request),
)
print("Sonnet Model access confirmed")
123+
]
124+
},
125+
{
126+
"cell_type": "markdown",
127+
"metadata": {},
128+
"source": [
129+
"# 5. HELPER FUNCTIONS"
130+
]
131+
},
132+
{
133+
"cell_type": "code",
134+
"execution_count": null,
135+
"metadata": {},
136+
"outputs": [],
137+
"source": [
138+
def nested_dict():
    """Create an arbitrarily deep auto-vivifying mapping.

    Each missing key materializes another nested_dict, so callers can write
    ``results[a][b][c] = value`` without building intermediate dicts first.
    """
    return defaultdict(nested_dict)
141+
"\n",
142+
def convert_defaultdict_to_dict(d):
    """Recursively convert a (nested) defaultdict into plain dicts.

    Non-defaultdict values — including ordinary dicts — are returned
    unchanged, matching what JSON serialization needs.
    """
    if not isinstance(d, defaultdict):
        return d
    return {key: convert_defaultdict_to_dict(value) for key, value in d.items()}
147+
"\n",
148+
def encode_image(image_path):
    """Load an image file, re-encode it as JPEG, and return it base64-encoded.

    Returns the base64 payload as a UTF-8 ``str``, ready to embed in an
    Anthropic ``image`` content block.
    """
    with Image.open(image_path) as img:
        # JPEG cannot store alpha or palette modes, so normalize to RGB first.
        rgb = img.convert('RGB') if img.mode != 'RGB' else img
        jpeg_buffer = BytesIO()
        rgb.save(jpeg_buffer, format='JPEG')
    return base64.b64encode(jpeg_buffer.getvalue()).decode('utf-8')
158+
"\n",
159+
"\n",
160+
def process_image(image_path):
    """Describe one image with Claude 3.5 Sonnet on Amazon Bedrock.

    Parameters
    ----------
    image_path : str
        Path to the image file; it is re-encoded as JPEG before upload.

    Returns
    -------
    str
        The model's text description (first content element of the response).
    """
    # Encode the image for the base64 image content block.
    image_b64 = encode_image(image_path)

    # Retrieval-oriented instruction prompt (kept verbatim — it is tuned for
    # the downstream RAG indexing pipeline).
    prompt = """Analyze and comprehensively describe the following image in a manner optimized for legal and regulatory indexing and retrieval, ensuring all details are factual and explicitly supported by visible content. Your description will be used for identifying this image in a graph database to support a Retrieval-Augmented Generation (RAG) pipeline for British Columbia (BC) laws. Structure your description according to the following format:

1. Image Type and Category:
- Specify the primary type of image (e.g., diagram, chart, seal, form, table, map, figure, etc.).
- If applicable, identify subcategories, such as "organizational chart," "geographical map," "tax form," or "compliance table."

2. Identifier Information:
- Extract and list any visible document numbers, legal references, or codes.
- Include dates, version numbers, or other temporal markers.
- Note any page numbers or section markers, as well as location indicators (e.g., “Section 5.2” or “Appendix B”).

3. Content Description:
- Summarize the main subject or topic reflected in the image (e.g., “Building Code Regulation Exemptions” or “District Zoning Compliance Map”).
- Extract key terms and specific language visible in the image, especially technical or legal terminology.
- Include all measurements, quantities, percentages, or numerical data.
- Explicitly list proper nouns, regulatory bodies, names of laws, acts, or agencies.

4. Visual Structure and Layout:
- Describe the image's overall organization and structure (e.g., hierarchical elements, visually grouped sections, or thematic divisions).
- Specify relationships between elements (e.g., arrows representing steps in a process, lines indicating relationships, or columns and rows in a table).
- Note any use of color, bolding, or other visual emphasis that enhances meaning or denotes priority.

5. Distinctive Features:
- Identify any unique or notable elements, such as seals, emblems, watermarks, or jurisdiction-specific markings.
- Include symbols, special characters, or formatting that stand out (e.g., "red warning labels," "italicized legal clauses").
- Describe any unusual visual arrangements or stylistic choices.

Guidelines for Description:
- Use precise, searchable language that prioritizes accuracy and completeness.
- DO NOT USE speculative language such as “it appears,” “it might,” or “it seems.”
- Responses should be formulated in a confident and precise tone, without subjective interpretation.
- Include as much specificity as possible, as these descriptions will assist in indexing the image for efficient retrieval.
- Use clear, searchable legal and regulatory terminology wherever applicable.

YOU MUST focus on delivering a carefully considered response with the aim of maximizing retrieval accuracy and relevance."""

    # Anthropic messages payload: the image part precedes the text instructions.
    image_part = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": "image/jpeg",  # encode_image always emits JPEG
            "data": image_b64,
        },
    }
    text_part = {"type": "text", "text": prompt}
    request_payload = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 2000,
        "messages": [{"role": "user", "content": [image_part, text_part]}],
    }

    # Invoke the model through the module-level Bedrock runtime client.
    response = bedrock_runtime_client.invoke_model(
        modelId=MODEL_ID,
        body=json.dumps(request_payload),
    )

    # The response body is a JSON stream; the description text sits in the
    # first element of its 'content' list.
    response_body = json.loads(response['body'].read())
    return response_body['content'][0]['text']
237+
]
238+
},
239+
{
240+
"cell_type": "markdown",
241+
"metadata": {},
242+
"source": [
243+
"# 6. MAIN PROCESSING LOGIC"
244+
]
245+
},
246+
{
247+
"cell_type": "code",
248+
"execution_count": null,
249+
"metadata": {},
250+
"outputs": [],
251+
"source": [
252+
"def main():\n",
253+
" # Initialize results dictionary\n",
254+
" results = nested_dict()\n",
255+
" \n",
256+
" # Load existing descriptions if any\n",
257+
" try:\n",
258+
" with open(OUTPUT_FILE, 'r') as f:\n",
259+
" existing_results = json.load(f)\n",
260+
" # Convert existing results to nested defaultdict\n",
261+
" for key, value in existing_results.items():\n",
262+
" if isinstance(value, dict):\n",
263+
" results[key].update(value)\n",
264+
" else:\n",
265+
" results[key] = value\n",
266+
" print(f\"Loaded existing results from {OUTPUT_FILE}\")\n",
267+
" except json.JSONDecodeError:\n",
268+
" print(f\"Starting with empty results as {OUTPUT_FILE} is empty or invalid\")\n",
269+
"\n",
270+
" # Keep track of all possible image paths\n",
271+
" all_image_paths = set()\n",
272+
" processed_images = set()\n",
273+
"\n",
274+
" # First pass: collect all image paths and already processed images\n",
275+
" for dirpath, dirnames, filenames in os.walk(ROOT_FOLDER):\n",
276+
" # Remove checkpoint directories\n",
277+
" dirnames[:] = [d for d in dirnames if not any(pattern in d for pattern in IGNORE_PATTERNS)]\n",
278+
" \n",
279+
" # Filter for valid image files\n",
280+
" image_files = [\n",
281+
" f for f in filenames \n",
282+
" if f.lower().endswith(SUPPORTED_FORMATS) \n",
283+
" and not any(pattern in f for pattern in IGNORE_PATTERNS)\n",
284+
" ]\n",
285+
"\n",
286+
" for filename in image_files:\n",
287+
" # Get relative path from root folder\n",
288+
" rel_path = os.path.relpath(dirpath, ROOT_FOLDER)\n",
289+
" \n",
290+
" # Store full path for processing\n",
291+
" full_path = os.path.join(dirpath, filename)\n",
292+
" all_image_paths.add(full_path)\n",
293+
"\n",
294+
" # Check if image is already in results\n",
295+
" current_dict = results\n",
296+
" if rel_path != '.':\n",
297+
" try:\n",
298+
" for path_part in rel_path.split(os.sep):\n",
299+
" current_dict = current_dict[path_part]\n",
300+
" if filename in current_dict:\n",
301+
" processed_images.add(full_path)\n",
302+
" except (KeyError, TypeError):\n",
303+
" continue\n",
304+
"\n",
305+
" # Calculate images that need processing\n",
306+
" images_to_process = all_image_paths - processed_images\n",
307+
" \n",
308+
" # Print summary\n",
309+
" print(f\"\\nProcessing Summary:\")\n",
310+
" print(f\"Total images found: {len(all_image_paths)}\")\n",
311+
" print(f\"Already processed: {len(processed_images)}\")\n",
312+
" print(f\"Remaining to process: {len(images_to_process)}\")\n",
313+
" \n",
314+
" # If no new images to process, exit\n",
315+
" if not images_to_process:\n",
316+
" print(\"\\nNo new images to process. Exiting...\")\n",
317+
" return\n",
318+
"\n",
319+
" # Ask for confirmation before proceeding\n",
320+
" proceed = input(f\"\\nProceed with processing {len(images_to_process)} images? (y/n): \")\n",
321+
" if proceed.lower() != 'y':\n",
322+
" print(\"Processing cancelled by user.\")\n",
323+
" return\n",
324+
"\n",
325+
" # Second pass: process only new images\n",
326+
" count = 0\n",
327+
" total = len(images_to_process)\n",
328+
" \n",
329+
" for image_path in sorted(images_to_process): # Sort for consistent ordering\n",
330+
" count += 1\n",
331+
" rel_path = os.path.relpath(os.path.dirname(image_path), ROOT_FOLDER)\n",
332+
" filename = os.path.basename(image_path)\n",
333+
" \n",
334+
" print(f\"\\nProcessing image {count}/{total}: {image_path}\")\n",
335+
" \n",
336+
" # Navigate to correct position in results dictionary\n",
337+
" current_dict = results\n",
338+
" if rel_path != '.':\n",
339+
" for path_part in rel_path.split(os.sep):\n",
340+
" current_dict = current_dict[path_part]\n",
341+
" \n",
342+
" try:\n",
343+
" current_dict[filename] = process_image(image_path)\n",
344+
" print(f\"✓ Successfully processed: {image_path}\")\n",
345+
" \n",
346+
" # Save after each successful processing\n",
347+
" with open(OUTPUT_FILE, 'w') as f:\n",
348+
" json.dump(convert_defaultdict_to_dict(results), f, indent=4)\n",
349+
" print(f\"✓ Progress saved to {OUTPUT_FILE}\")\n",
350+
" \n",
351+
" except Exception as e:\n",
352+
" print(f\"✕ Error processing {image_path}: {str(e)}\")\n",
353+
" continue\n",
354+
"\n",
355+
" print(f\"\\nProcessing complete!\")\n",
356+
" print(f\"Total images processed in this run: {count}\")\n",
357+
" print(f\"Results saved to: {OUTPUT_FILE}\")"
358+
]
359+
},
360+
{
361+
"cell_type": "markdown",
362+
"metadata": {},
363+
"source": [
364+
"# 7. EXECUTION"
365+
]
366+
},
367+
{
368+
"cell_type": "code",
369+
"execution_count": null,
370+
"metadata": {},
371+
"outputs": [],
372+
"source": [
373+
# Standard entry guard: run the batch pipeline only when executed directly.
if __name__ == "__main__":
    main()
375+
]
376+
}
377+
],
378+
"metadata": {
379+
"kernelspec": {
380+
"display_name": "Python 3.9",
381+
"language": "python",
382+
"name": "python3"
383+
},
384+
"language_info": {
385+
"codemirror_mode": {
386+
"name": "ipython",
387+
"version": 3
388+
},
389+
"file_extension": ".py",
390+
"mimetype": "text/x-python",
391+
"name": "python",
392+
"nbconvert_exporter": "python",
393+
"pygments_lexer": "ipython3",
394+
"version": "3.9.16"
395+
}
396+
},
397+
"nbformat": 4,
398+
"nbformat_minor": 4
399+
}

0 commit comments

Comments
 (0)