Skip to content

Commit a6ca6d7

Browse files
authored
Merge pull request #271 from bcgov/OCR-Embeedings
Ocr embeedings
2 parents 019d0ac + 05abebf commit a6ca6d7

File tree

4 files changed

+6533
-0
lines changed

4 files changed

+6533
-0
lines changed
Lines changed: 399 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,399 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# 1. PACKAGE INSTALLATION"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": null,
13+
"metadata": {
14+
"tags": []
15+
},
16+
"outputs": [],
17+
"source": [
18+
"# Install required packages\n",
19+
"!pip install boto3\n",
20+
"!pip install pillow\n",
21+
"!pip install ipywidgets"
22+
]
23+
},
24+
{
25+
"cell_type": "markdown",
26+
"metadata": {},
27+
"source": [
28+
"# 2. IMPORTS AND CONFIGURATIONS"
29+
]
30+
},
31+
{
32+
"cell_type": "code",
33+
"execution_count": null,
34+
"metadata": {
35+
"tags": []
36+
},
37+
"outputs": [],
38+
"source": [
39+
# Import necessary libraries
import os
import json
import boto3
import base64
from PIL import Image
from collections import defaultdict
from io import BytesIO

# Define paths and configurations
ROOT_FOLDER = 'images'              # directory tree scanned for images
OUTPUT_FILE = 'image_sonnet.json'   # nested {folder: {filename: description}} JSON store
SUPPORTED_FORMATS = ('.jpg', '.jpeg', '.png', '.gif', '.bmp')
IGNORE_PATTERNS = ('.ipynb_checkpoints', '-checkpoint')

# SECURITY: never hardcode AWS credentials in a notebook — outputs and diffs
# get shared. Read them from the environment instead; the empty-string default
# preserves the previous behavior when the variables are unset.
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
AWS_REGION = "us-east-1"
MODEL_ID = "anthropic.claude-3-5-sonnet-20240620-v1:0"


# Create output file if it doesn't exist, seeded with an empty JSON object
# so later json.load() calls succeed on first run.
if not os.path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, 'w') as f:
        json.dump({}, f)
    print(f"Created empty {OUTPUT_FILE}")
64+
]
65+
},
66+
{
67+
"cell_type": "markdown",
68+
"metadata": {},
69+
"source": [
70+
"# 3. MODEL INITIALIZATION"
71+
]
72+
},
73+
{
74+
"cell_type": "code",
75+
"execution_count": null,
76+
"metadata": {
77+
"tags": []
78+
},
79+
"outputs": [],
80+
"source": [
81+
# Instantiate a Bedrock runtime client from an explicit boto3 session using
# the key constants defined in the configuration cell.
# NOTE(review): when the key constants are empty strings, whether boto3 falls
# back to its default credential chain (env vars, ~/.aws, instance role) or
# fails on invoke should be confirmed against the boto3 Session docs.
session = boto3.Session(
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)
bedrock_runtime_client = session.client("bedrock-runtime", region_name=AWS_REGION)
87+
]
88+
},
89+
{
90+
"cell_type": "markdown",
91+
"metadata": {},
92+
"source": [
93+
"# 4. TEST CONNECTION"
94+
]
95+
},
96+
{
97+
"cell_type": "code",
98+
"execution_count": null,
99+
"metadata": {
100+
"tags": []
101+
},
102+
"outputs": [],
103+
"source": [
104+
# Smoke-test model access with a minimal one-message request before the
# batch run; invoke_model raises if credentials or model access are wrong.
test_request = {
    "anthropic_version": "bedrock-2023-05-31",
    "max_tokens": 200,
    "messages": [
        {
            "role": "user",
            "content": [{"type": "text", "text": "hello world"}],
        }
    ],
}
test_invoke = bedrock_runtime_client.invoke_model(
    modelId=MODEL_ID,
    body=json.dumps(test_request),
)
print("Sonnet Model access confirmed")
123+
]
124+
},
125+
{
126+
"cell_type": "markdown",
127+
"metadata": {},
128+
"source": [
129+
"# 5. HELPER FUNCTIONS"
130+
]
131+
},
132+
{
133+
"cell_type": "code",
134+
"execution_count": null,
135+
"metadata": {},
136+
"outputs": [],
137+
"source": [
138+
def nested_dict():
    """Create an arbitrarily deep auto-vivifying mapping.

    Each missing key materializes another nested_dict, so callers can write
    ``results[a][b][c] = value`` without building intermediate dicts first.
    """
    return defaultdict(nested_dict)
141+
"\n",
142+
def convert_defaultdict_to_dict(d):
    """Recursively convert a (nested) defaultdict into plain dicts.

    Non-defaultdict values — including ordinary dicts — are returned
    unchanged, matching what JSON serialization needs.
    """
    if not isinstance(d, defaultdict):
        return d
    return {key: convert_defaultdict_to_dict(value) for key, value in d.items()}
147+
"\n",
148+
def encode_image(image_path):
    """Load an image file, re-encode it as JPEG, and return it base64-encoded.

    Returns the base64 payload as a UTF-8 ``str``, ready to embed in an
    Anthropic ``image`` content block.
    """
    with Image.open(image_path) as img:
        # JPEG cannot store alpha or palette modes, so normalize to RGB first.
        rgb = img.convert('RGB') if img.mode != 'RGB' else img
        jpeg_buffer = BytesIO()
        rgb.save(jpeg_buffer, format='JPEG')
    return base64.b64encode(jpeg_buffer.getvalue()).decode('utf-8')
158+
"\n",
159+
"\n",
160+
def process_image(image_path):
    """Describe one image with Claude 3.5 Sonnet on Amazon Bedrock.

    Parameters
    ----------
    image_path : str
        Path to the image file; it is re-encoded as JPEG before upload.

    Returns
    -------
    str
        The model's text description (first content element of the response).
    """
    # Encode the image for the base64 image content block.
    image_b64 = encode_image(image_path)

    # Retrieval-oriented instruction prompt (kept verbatim — it is tuned for
    # the downstream RAG indexing pipeline).
    prompt = """Analyze and comprehensively describe the following image in a manner optimized for legal and regulatory indexing and retrieval, ensuring all details are factual and explicitly supported by visible content. Your description will be used for identifying this image in a graph database to support a Retrieval-Augmented Generation (RAG) pipeline for British Columbia (BC) laws. Structure your description according to the following format:

1. Image Type and Category:
- Specify the primary type of image (e.g., diagram, chart, seal, form, table, map, figure, etc.).
- If applicable, identify subcategories, such as "organizational chart," "geographical map," "tax form," or "compliance table."

2. Identifier Information:
- Extract and list any visible document numbers, legal references, or codes.
- Include dates, version numbers, or other temporal markers.
- Note any page numbers or section markers, as well as location indicators (e.g., “Section 5.2” or “Appendix B”).

3. Content Description:
- Summarize the main subject or topic reflected in the image (e.g., “Building Code Regulation Exemptions” or “District Zoning Compliance Map”).
- Extract key terms and specific language visible in the image, especially technical or legal terminology.
- Include all measurements, quantities, percentages, or numerical data.
- Explicitly list proper nouns, regulatory bodies, names of laws, acts, or agencies.

4. Visual Structure and Layout:
- Describe the image's overall organization and structure (e.g., hierarchical elements, visually grouped sections, or thematic divisions).
- Specify relationships between elements (e.g., arrows representing steps in a process, lines indicating relationships, or columns and rows in a table).
- Note any use of color, bolding, or other visual emphasis that enhances meaning or denotes priority.

5. Distinctive Features:
- Identify any unique or notable elements, such as seals, emblems, watermarks, or jurisdiction-specific markings.
- Include symbols, special characters, or formatting that stand out (e.g., "red warning labels," "italicized legal clauses").
- Describe any unusual visual arrangements or stylistic choices.

Guidelines for Description:
- Use precise, searchable language that prioritizes accuracy and completeness.
- DO NOT USE speculative language such as “it appears,” “it might,” or “it seems.”
- Responses should be formulated in a confident and precise tone, without subjective interpretation.
- Include as much specificity as possible, as these descriptions will assist in indexing the image for efficient retrieval.
- Use clear, searchable legal and regulatory terminology wherever applicable.

YOU MUST focus on delivering a carefully considered response with the aim of maximizing retrieval accuracy and relevance."""

    # Anthropic messages payload: the image part precedes the text instructions.
    image_part = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": "image/jpeg",  # encode_image always emits JPEG
            "data": image_b64,
        },
    }
    text_part = {"type": "text", "text": prompt}
    request_payload = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 2000,
        "messages": [{"role": "user", "content": [image_part, text_part]}],
    }

    # Invoke the model through the module-level Bedrock runtime client.
    response = bedrock_runtime_client.invoke_model(
        modelId=MODEL_ID,
        body=json.dumps(request_payload),
    )

    # The response body is a JSON stream; the description text sits in the
    # first element of its 'content' list.
    response_body = json.loads(response['body'].read())
    return response_body['content'][0]['text']
237+
]
238+
},
239+
{
240+
"cell_type": "markdown",
241+
"metadata": {},
242+
"source": [
243+
"# 6. MAIN PROCESSING LOGIC"
244+
]
245+
},
246+
{
247+
"cell_type": "code",
248+
"execution_count": null,
249+
"metadata": {},
250+
"outputs": [],
251+
"source": [
252+
"def main():\n",
253+
" # Initialize results dictionary\n",
254+
" results = nested_dict()\n",
255+
" \n",
256+
" # Load existing descriptions if any\n",
257+
" try:\n",
258+
" with open(OUTPUT_FILE, 'r') as f:\n",
259+
" existing_results = json.load(f)\n",
260+
" # Convert existing results to nested defaultdict\n",
261+
" for key, value in existing_results.items():\n",
262+
" if isinstance(value, dict):\n",
263+
" results[key].update(value)\n",
264+
" else:\n",
265+
" results[key] = value\n",
266+
" print(f\"Loaded existing results from {OUTPUT_FILE}\")\n",
267+
" except json.JSONDecodeError:\n",
268+
" print(f\"Starting with empty results as {OUTPUT_FILE} is empty or invalid\")\n",
269+
"\n",
270+
" # Keep track of all possible image paths\n",
271+
" all_image_paths = set()\n",
272+
" processed_images = set()\n",
273+
"\n",
274+
" # First pass: collect all image paths and already processed images\n",
275+
" for dirpath, dirnames, filenames in os.walk(ROOT_FOLDER):\n",
276+
" # Remove checkpoint directories\n",
277+
" dirnames[:] = [d for d in dirnames if not any(pattern in d for pattern in IGNORE_PATTERNS)]\n",
278+
" \n",
279+
" # Filter for valid image files\n",
280+
" image_files = [\n",
281+
" f for f in filenames \n",
282+
" if f.lower().endswith(SUPPORTED_FORMATS) \n",
283+
" and not any(pattern in f for pattern in IGNORE_PATTERNS)\n",
284+
" ]\n",
285+
"\n",
286+
" for filename in image_files:\n",
287+
" # Get relative path from root folder\n",
288+
" rel_path = os.path.relpath(dirpath, ROOT_FOLDER)\n",
289+
" \n",
290+
" # Store full path for processing\n",
291+
" full_path = os.path.join(dirpath, filename)\n",
292+
" all_image_paths.add(full_path)\n",
293+
"\n",
294+
" # Check if image is already in results\n",
295+
" current_dict = results\n",
296+
" if rel_path != '.':\n",
297+
" try:\n",
298+
" for path_part in rel_path.split(os.sep):\n",
299+
" current_dict = current_dict[path_part]\n",
300+
" if filename in current_dict:\n",
301+
" processed_images.add(full_path)\n",
302+
" except (KeyError, TypeError):\n",
303+
" continue\n",
304+
"\n",
305+
" # Calculate images that need processing\n",
306+
" images_to_process = all_image_paths - processed_images\n",
307+
" \n",
308+
" # Print summary\n",
309+
" print(f\"\\nProcessing Summary:\")\n",
310+
" print(f\"Total images found: {len(all_image_paths)}\")\n",
311+
" print(f\"Already processed: {len(processed_images)}\")\n",
312+
" print(f\"Remaining to process: {len(images_to_process)}\")\n",
313+
" \n",
314+
" # If no new images to process, exit\n",
315+
" if not images_to_process:\n",
316+
" print(\"\\nNo new images to process. Exiting...\")\n",
317+
" return\n",
318+
"\n",
319+
" # Ask for confirmation before proceeding\n",
320+
" proceed = input(f\"\\nProceed with processing {len(images_to_process)} images? (y/n): \")\n",
321+
" if proceed.lower() != 'y':\n",
322+
" print(\"Processing cancelled by user.\")\n",
323+
" return\n",
324+
"\n",
325+
" # Second pass: process only new images\n",
326+
" count = 0\n",
327+
" total = len(images_to_process)\n",
328+
" \n",
329+
" for image_path in sorted(images_to_process): # Sort for consistent ordering\n",
330+
" count += 1\n",
331+
" rel_path = os.path.relpath(os.path.dirname(image_path), ROOT_FOLDER)\n",
332+
" filename = os.path.basename(image_path)\n",
333+
" \n",
334+
" print(f\"\\nProcessing image {count}/{total}: {image_path}\")\n",
335+
" \n",
336+
" # Navigate to correct position in results dictionary\n",
337+
" current_dict = results\n",
338+
" if rel_path != '.':\n",
339+
" for path_part in rel_path.split(os.sep):\n",
340+
" current_dict = current_dict[path_part]\n",
341+
" \n",
342+
" try:\n",
343+
" current_dict[filename] = process_image(image_path)\n",
344+
" print(f\"✓ Successfully processed: {image_path}\")\n",
345+
" \n",
346+
" # Save after each successful processing\n",
347+
" with open(OUTPUT_FILE, 'w') as f:\n",
348+
" json.dump(convert_defaultdict_to_dict(results), f, indent=4)\n",
349+
" print(f\"✓ Progress saved to {OUTPUT_FILE}\")\n",
350+
" \n",
351+
" except Exception as e:\n",
352+
" print(f\"✕ Error processing {image_path}: {str(e)}\")\n",
353+
" continue\n",
354+
"\n",
355+
" print(f\"\\nProcessing complete!\")\n",
356+
" print(f\"Total images processed in this run: {count}\")\n",
357+
" print(f\"Results saved to: {OUTPUT_FILE}\")"
358+
]
359+
},
360+
{
361+
"cell_type": "markdown",
362+
"metadata": {},
363+
"source": [
364+
"# 7. EXECUTION"
365+
]
366+
},
367+
{
368+
"cell_type": "code",
369+
"execution_count": null,
370+
"metadata": {},
371+
"outputs": [],
372+
"source": [
373+
# Standard entry guard: run the batch pipeline only when executed directly.
if __name__ == "__main__":
    main()
375+
]
376+
}
377+
],
378+
"metadata": {
379+
"kernelspec": {
380+
"display_name": "Python 3.9",
381+
"language": "python",
382+
"name": "python3"
383+
},
384+
"language_info": {
385+
"codemirror_mode": {
386+
"name": "ipython",
387+
"version": 3
388+
},
389+
"file_extension": ".py",
390+
"mimetype": "text/x-python",
391+
"name": "python",
392+
"nbconvert_exporter": "python",
393+
"pygments_lexer": "ipython3",
394+
"version": "3.9.16"
395+
}
396+
},
397+
"nbformat": 4,
398+
"nbformat_minor": 4
399+
}

0 commit comments

Comments
 (0)