Merge pull request #4 from MicrosoftCloudEssentials-LearningHub/IaC-aivision-booster

brown9804 · web-flow · commit 6a3edd060c75 · 2025-07-21T08:02:12.000-06:00
adding ai vision to the IaC - booster for filled capab
diff --git a/.github/workflows/validate_and_fix_markdown.yml b/.github/workflows/validate_and_fix_markdown.yml
@@ -18,6 +18,7 @@ jobs:
         uses: actions/checkout@v4
         with:
           fetch-depth: 0
+          ref: ${{ github.head_ref || github.ref_name }}
 
       - name: Set up Node.js
         uses: actions/setup-node@v3
@@ -35,11 +36,23 @@ jobs:
           git config --global user.email "github-actions[bot]@users.noreply.github.com"
           git config --global user.name "github-actions[bot]"
 
-      - name: Commit and rebase changes
+      - name: Commit and merge changes
         env:
           PR_BRANCH: ${{ github.head_ref || github.ref_name }}
+          GIT_AUTHOR_NAME: github-actions[bot]
+          GIT_AUTHOR_EMAIL: github-actions[bot]@users.noreply.github.com
+          GIT_COMMITTER_NAME: github-actions[bot]
+          GIT_COMMITTER_EMAIL: github-actions[bot]@users.noreply.github.com
         run: |
+          # Ensure we're on the correct branch
+          git switch -c "$PR_BRANCH" || git switch "$PR_BRANCH"
+          
+          # Stage and commit changes if any
           git add -A
-          git commit -m "Fix Markdown syntax issues" || echo "No changes to commit"
-          git pull --rebase origin "$PR_BRANCH" || echo "No rebase needed"
-          git push origin HEAD:"$PR_BRANCH"
+          git diff --staged --quiet || git commit -m "Fix Markdown syntax issues"
+          
+          # Pull and merge existing changes
+          git pull origin "$PR_BRANCH" --no-rebase
+          
+          # Push all changes
+          git push origin "$PR_BRANCH"
diff --git a/.gitignore b/.gitignore
@@ -5,6 +5,7 @@
 *.tfstate
 *.tfstate.*
 .terraform.lock.hcl
+terraform.tfstate.backup
 
 # Crash log files
 crash.log
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# Demo: PDF Layout Extraction with Doc Intelligence (full-code approach)
+# Demo: PDF Layout Extraction with Doc Intelligence <br/> Supporting Multiple Document Versions with Visual Selection Cues (full-code approach)
 
 `Azure Storage + Document Intelligence + Function App +  Cosmos DB`
 
@@ -8,9 +8,16 @@ Costa Rica
 [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/)
 [brown9804](https://github.com/brown9804)
 
-Last updated: 2025-07-16
+Last updated: 2025-07-21
 
-----------
+-----------------------------
+
+> This solution is designed to be flexible and robust, supporting multiple versions of PDF documents with varying layouts—including those that use visual selection cues such as gray fills, hand-drawn Xs, checkmarks, or circles. By building on the [PDFs-Layouts-Processing-Fapp-DocIntelligence](https://github.com/MicrosoftCloudEssentials-LearningHub/PDFs-Layouts-Processing-Fapp-DocIntelligence) repository, we ensure that:
+
+- Table structure and text are extracted using Azure Document Intelligence (Layout model).
+- Visual selection cues are detected using Azure AI Vision or image preprocessing.
+- Visual indicators are mapped to structured data, returning only the selected values in a clean JSON format.
+- The logic is abstracted to support multiple layout variations, so the system adapts easily to new document formats and selection styles.
 
 > [!IMPORTANT]
 > This example is based on a `public network site and is intended for demonstration purposes only`. It showcases how several Azure resources can work together to achieve the desired result. Consider the section below about [Important Considerations for Production Environment](#important-considerations-for-production-environment). Please note that `these demos are intended as a guide and are based on my personal experiences. For official guidance, support, or more detailed information, please refer to Microsoft's official documentation or contact Microsoft directly`: [Microsoft Sales and Support](https://support.microsoft.com/contactus?ContactUsExperienceEntryPointAssetId=S.HP.SMC-HOME)
@@ -440,7 +447,7 @@ Last updated: 2025-07-16
 
 <!-- START BADGE -->
 <div align="center">
-  <img src="https://img.shields.io/badge/Total%20views-55-limegreen" alt="Total views">
-  <p>Refresh Date: 2025-07-16</p>
+  <img src="https://img.shields.io/badge/Total%20views-164-limegreen" alt="Total views">
+  <p>Refresh Date: 2025-07-21</p>
 </div>
 <!-- END BADGE -->
diff --git a/docs/automated-PDFLayoutprocessing-FunctionApp-DocIntellig-AI-Vision.drawio b/docs/automated-PDFLayoutprocessing-FunctionApp-DocIntellig-AI-Vision.drawio
@@ -1,11 +1,11 @@
-<mxfile host="Electron" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) draw.io/27.0.9 Chrome/134.0.6998.205 Electron/35.4.0 Safari/537.36" version="27.0.9">
+<mxfile host="Electron" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) draw.io/28.0.4 Chrome/138.0.7204.97 Electron/37.2.1 Safari/537.36" version="28.0.4">
   <diagram name="Page-1" id="_ZzkEdzZPlF0T37kGrCl">
-    <mxGraphModel dx="1281" dy="1822" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="850" pageHeight="1100" math="0" shadow="0">
+    <mxGraphModel dx="732" dy="1532" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="850" pageHeight="1100" math="0" shadow="0">
       <root>
         <mxCell id="0" />
         <mxCell id="1" parent="0" />
         <mxCell id="SBEox3NDaokPfLYJbtWu-15" value="" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
-          <mxGeometry x="20" width="920" height="620" as="geometry" />
+          <mxGeometry x="20" y="-90" width="920" height="710" as="geometry" />
         </mxCell>
         <mxCell id="SBEox3NDaokPfLYJbtWu-2" value="Storage Account" style="image;aspect=fixed;html=1;points=[];align=center;fontSize=12;image=img/lib/azure2/storage/Storage_Accounts.svg;" parent="1" vertex="1">
           <mxGeometry x="240" y="136" width="75" height="60" as="geometry" />
@@ -27,10 +27,10 @@
         <mxCell id="SBEox3NDaokPfLYJbtWu-3" value="Employee" style="shape=umlActor;verticalLabelPosition=bottom;verticalAlign=top;html=1;outlineConnect=0;" parent="SBEox3NDaokPfLYJbtWu-10" vertex="1">
           <mxGeometry y="30" width="30" height="60" as="geometry" />
         </mxCell>
-        <mxCell id="_wiV1sLz3M6k8l1JJ68s-4" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.017;entryY=0.605;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" source="SBEox3NDaokPfLYJbtWu-12" target="_wiV1sLz3M6k8l1JJ68s-1" edge="1">
+        <mxCell id="_wiV1sLz3M6k8l1JJ68s-4" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.126;entryY=0.408;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" source="SBEox3NDaokPfLYJbtWu-12" target="qB0o09IW0mbKmVrXtbLM-1" edge="1">
           <mxGeometry relative="1" as="geometry">
             <Array as="points">
-              <mxPoint x="540" y="131" />
+              <mxPoint x="510" y="18" />
             </Array>
           </mxGeometry>
         </mxCell>
@@ -43,10 +43,10 @@
           <mxGeometry x="510" y="300" width="68" height="60" as="geometry" />
         </mxCell>
         <mxCell id="SBEox3NDaokPfLYJbtWu-13" value="Resource Group" style="image;sketch=0;aspect=fixed;html=1;points=[];align=center;fontSize=12;image=img/lib/mscae/ResourceGroup.svg;" parent="1" vertex="1">
-          <mxGeometry x="20" width="50" height="40" as="geometry" />
+          <mxGeometry x="20" y="-90" width="50" height="40" as="geometry" />
         </mxCell>
         <mxCell id="SBEox3NDaokPfLYJbtWu-14" value="Subscription" style="image;aspect=fixed;html=1;points=[];align=center;fontSize=12;image=img/lib/azure2/general/Subscriptions.svg;" parent="1" vertex="1">
-          <mxGeometry x="890" y="-20" width="44" height="71" as="geometry" />
+          <mxGeometry x="890" y="-90" width="44" height="71" as="geometry" />
         </mxCell>
         <mxCell id="SBEox3NDaokPfLYJbtWu-16" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=1.004;entryY=0.433;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" source="SBEox3NDaokPfLYJbtWu-12" target="SBEox3NDaokPfLYJbtWu-2" edge="1">
           <mxGeometry relative="1" as="geometry" />
@@ -89,6 +89,16 @@
             <mxPoint as="offset" />
           </mxGeometry>
         </mxCell>
+        <mxCell id="qB0o09IW0mbKmVrXtbLM-1" value="Azure &lt;br&gt;AI Vision&amp;nbsp;" style="image;aspect=fixed;html=1;points=[];align=center;fontSize=12;image=img/lib/azure2/ai_machine_learning/Computer_Vision.svg;" vertex="1" parent="1">
+          <mxGeometry x="550" y="-10" width="68" height="68" as="geometry" />
+        </mxCell>
+        <mxCell id="qB0o09IW0mbKmVrXtbLM-3" style="rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=1;entryY=0.609;entryDx=0;entryDy=0;entryPerimeter=0;edgeStyle=orthogonalEdgeStyle;elbow=vertical;shape=link;" edge="1" parent="1" source="_wiV1sLz3M6k8l1JJ68s-1" target="qB0o09IW0mbKmVrXtbLM-1">
+          <mxGeometry relative="1" as="geometry">
+            <Array as="points">
+              <mxPoint x="710" y="31" />
+            </Array>
+          </mxGeometry>
+        </mxCell>
       </root>
     </mxGraphModel>
   </diagram>
diff --git a/metrics.json b/metrics.json
@@ -26,7 +26,12 @@
   },
   {
     "date": "2025-07-14",
-    "count": 4,
+    "count": 130,
+    "uniques": 2
+  },
+  {
+    "date": "2025-07-15",
+    "count": 2,
     "uniques": 1
   }
 ]
diff --git a/src/function_app.py b/src/function_app.py
@@ -8,6 +8,12 @@
 import uuid
 import json
 
+# For image conversion and vision API
+from typing import List
+from io import BytesIO
+import requests  # For REST API to Vision
+from pdf2image import convert_from_bytes  # For PDF to image conversion
+
 app = func.FunctionApp(http_auth_level=func.AuthLevel.FUNCTION)
 
 ## DEFINITIONS 
@@ -35,13 +41,14 @@ def analyze_pdf(form_recognizer_client, pdf_bytes):
     logging.info(f"Document has {len(result.pages)} page(s), {len(result.tables)} table(s), and {len(result.styles)} style(s).")
     return result
 
-def extract_layout_data(result):
+def extract_layout_data(result, visual_cues: List[dict] = None):
     logging.info("Extracting layout data from analysis result.")
 
     layout_data = {
         "id": str(uuid.uuid4()),
         "pages": []
     }
+    visual_cues = visual_cues or []  # List of dicts with visual cue info per cell
 
     # Log styles
     for idx, style in enumerate(result.styles):
@@ -88,12 +95,16 @@ def extract_layout_data(result):
 
             for cell in table.cells:
                 content = cell.content.strip()
-                table_data["cells"].append({
+                # Find matching visual cue for this cell (if any)
+                cue = next((vc for vc in visual_cues if vc.get("page_number") == page.page_number and vc.get("row_index") == cell.row_index and vc.get("column_index") == cell.column_index), None)
+                cell_info = {
                     "row_index": cell.row_index,
                     "column_index": cell.column_index,
-                    "content": content
-                })
-                logging.info(f"Cell[{cell.row_index}][{cell.column_index}]: '{content}'")
+                    "content": content,
+                    "visual_cue": cue["cue_type"] if cue else None
+                }
+                table_data["cells"].append(cell_info)
+                logging.info(f"Cell[{cell.row_index}][{cell.column_index}]: '{content}', visual_cue: {cell_info['visual_cue']}")
 
             page_data["tables"].append(table_data)
 
@@ -156,6 +167,60 @@ def save_layout_data_to_cosmos(layout_data):
+# AI Vision helpers. They must stay above the "## MAIN" decorator so that
+# @app.blob_trigger decorates the blob handler rather than one of these helpers.
+def call_vision_api(image_bytes, subscription_key, endpoint, timeout=30):
+    """Send one image to the Azure AI Vision v3.2 analyze REST endpoint.
+
+    Args:
+        image_bytes: Encoded image payload, posted as application/octet-stream.
+        subscription_key: Value for the Ocp-Apim-Subscription-Key header.
+        endpoint: Base URL of the Vision resource; a trailing slash is tolerated.
+        timeout: Seconds to wait for the service before giving up.
+
+    Returns:
+        The decoded JSON body of the response.
+
+    Raises:
+        ValueError: If the key or endpoint is missing or empty.
+        requests.RequestException: On a timeout, a connection failure, or an
+            HTTP error status (raised by raise_for_status).
+    """
+    if not subscription_key or not endpoint:
+        raise ValueError("Vision API key and endpoint must both be configured.")
+    # Strip a trailing slash so the joined URL never contains "//vision".
+    vision_url = endpoint.rstrip("/") + "/vision/v3.2/analyze"
+    headers = {
+        'Ocp-Apim-Subscription-Key': subscription_key,
+        'Content-Type': 'application/octet-stream'
+    }
+    params = {
+        'visualFeatures': 'Objects,Color',  # Add more features if needed
+    }
+    # Without a timeout, a stalled request would block the invocation indefinitely.
+    response = requests.post(vision_url, headers=headers, params=params,
+                             data=image_bytes, timeout=timeout)
+    response.raise_for_status()
+    return response.json()
+
+def extract_visual_cues_from_vision(vision_result, page_number):
+    """Turn a Vision analysis result into per-cell visual cues for one page.
+
+    Placeholder: no parsing is implemented yet, so the returned list is always
+    empty. Each cue is meant to be a dict with the keys "page_number",
+    "row_index", "column_index" and "cue_type".
+    """
+    # Example: Detect gray fills, checkmarks, hand-drawn marks
+    cues = []
+    # This is a placeholder. You need to parse vision_result for your cues.
+    # For example, if vision_result['objects'] contains a 'checkmark' or color info for gray fill
+    # cues.append({"page_number": page_number, "row_index": ..., "column_index": ..., "cue_type": "gray_fill"})
+    return cues
+
+def convert_pdf_to_images(pdf_bytes):
+    """Render PDF bytes to page images via pdf2image's convert_from_bytes."""
+    return convert_from_bytes(pdf_bytes)
+
 ## MAIN 
 @app.blob_trigger(arg_name="myblob", path="pdfinvoices/{name}",
                   connection="invoicecontosostorage_STORAGE")
 def BlobTriggerContosoPDFLayoutsDocIntelligence(myblob: func.InputStream):
     logging.info(f"Python blob trigger function processed blob\n"
                  f"Name: {myblob.name}\n"
@@ -176,9 +212,26 @@ def BlobTriggerContosoPDFLayoutsDocIntelligence(myblob: func.InputStream):
         logging.error(f"Error analyzing PDF: {e}")
         return
 
+    # --- Step: Convert PDF to image and call Azure AI Vision ---
+    visual_cues = []
+    try:
+        images = convert_pdf_to_images(pdf_bytes)
+        vision_key = os.getenv("VISION_API_KEY")
+        vision_endpoint = os.getenv("VISION_API_ENDPOINT")
+        for page_num, image in enumerate(images, start=1):
+            img_bytes_io = BytesIO()
+            image.save(img_bytes_io, format='JPEG')
+            img_bytes = img_bytes_io.getvalue()
+            vision_result = call_vision_api(img_bytes, vision_key, vision_endpoint)
+            cues = extract_visual_cues_from_vision(vision_result, page_num)
+            visual_cues.extend(cues)
+        logging.info(f"Visual cues extracted: {visual_cues}")
+    except Exception as e:
+        logging.error(f"Error processing visual cues with AI Vision: {e}")
+
     try:
-        layout_data = extract_layout_data(result)
-        logging.info("Successfully extracted layout data.")
+        layout_data = extract_layout_data(result, visual_cues)
+        logging.info("Successfully extracted and merged layout data.")
     except Exception as e:
         logging.error(f"Error extracting layout data: {e}")
         return
diff --git a/terraform-infrastructure/README.md b/terraform-infrastructure/README.md
@@ -5,7 +5,7 @@ Costa Rica
 [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/)
 [brown9804](https://github.com/brown9804)
 
-Last updated: 2025-07-16
+Last updated: 2025-07-21
 
 ----------
 
@@ -109,7 +109,7 @@ graph TD;
 
 <!-- START BADGE -->
 <div align="center">
-  <img src="https://img.shields.io/badge/Total%20views-55-limegreen" alt="Total views">
-  <p>Refresh Date: 2025-07-16</p>
+  <img src="https://img.shields.io/badge/Total%20views-164-limegreen" alt="Total views">
+  <p>Refresh Date: 2025-07-21</p>
 </div>
 <!-- END BADGE -->
diff --git a/terraform-infrastructure/main.tf b/terraform-infrastructure/main.tf
@@ -289,6 +289,7 @@ resource "azurerm_role_assignment" "contributor" {
   ]
 }
 
+
 # Azure Form Recognizer (Document Intelligence)
 resource "azurerm_cognitive_account" "form_recognizer" {
   name                = var.form_recognizer_name
@@ -299,12 +300,27 @@ resource "azurerm_cognitive_account" "form_recognizer" {
 
   depends_on = [azurerm_resource_group.rg]
 
-  # Output the Form Recognizer name
   provisioner "local-exec" {
     command = "echo Form Recognizer: ${self.name}"
   }
 }
 
+# Azure AI Vision (Cognitive Services)
+resource "azurerm_cognitive_account" "ai_vision" {
+  name                = var.ai_vision_name
+  location            = azurerm_resource_group.rg.location
+  resource_group_name = azurerm_resource_group.rg.name
+  kind                = "CognitiveServices"
+  sku_name            = var.ai_vision_sku
+  tags                = var.ai_vision_tags
+
+  depends_on = [azurerm_resource_group.rg]
+
+  provisioner "local-exec" {
+    command = "echo AI Vision: ${self.name}"
+  }
+}
+
 # We need to assign custom or built-in Cosmos DB SQL roles 
 # (like Cosmos DB Built-in Data Reader, etc.) at the data plane level, 
 # which is not currently supported directly in Terraform as of now.
@@ -373,6 +389,10 @@ resource "azurerm_linux_function_app" "function_app" {
 
     "APPINSIGHTS_INSTRUMENTATIONKEY"        = azurerm_application_insights.appinsights.instrumentation_key
     "APPLICATIONINSIGHTS_CONNECTION_STRING" = azurerm_application_insights.appinsights.connection_string
+
+    # Azure AI Vision settings
+    "VISION_API_ENDPOINT" = azurerm_cognitive_account.ai_vision.endpoint
+    "VISION_API_KEY"      = azurerm_cognitive_account.ai_vision.primary_access_key
   }
 
   depends_on = [
diff --git a/terraform-infrastructure/terraform.tfvars b/terraform-infrastructure/terraform.tfvars
@@ -1,21 +1,30 @@
 # Sample values 
-subscription_id       = "" # "your-subscription_id"
-resource_group_name   = "RG-PDFLayout-Processing-DocIntelligence" # "your-resource-group-name"
-location              = "West US" # "your-location"
+subscription_id     = ""                                        # "your-subscription_id" (placeholder: never commit a real ID; pass it via TF_VAR_subscription_id)
+resource_group_name = "RG-PDFLayout-Processing-DocIntelligence" # "your-resource-group-name"
+location            = "West US"                                 # "your-location"
 # Storage Account
-storage_account_name  = "storageaccbrownpdfix2" # "your-storage-account-name"
+storage_account_name         = "storageaccbrownpdfix2" # "your-storage-account-name"
 storage_account_name_runtime = "runtimestorebrownix2"  # "your-runtime-storage-account-name"
 # Function App
-function_app_name     = "fapdfbrownix2" # "your-function-app-name"
+function_app_name = "fapdfbrownix2" # "your-function-app-name"
 # App Service Plan
 app_service_plan_name = "asppdfbrownix2" # "your-app-service-plan-name"
 # Application Insights
-app_insights_name     = "apppdfbrownix2" # "your-app-insights-name"
+app_insights_name = "apppdfbrownix2" # "your-app-insights-name"
 # Log Analytics Workspace
 log_analytics_workspace_name = "logwspdfbrownix2" # "your-log-analytics-workspace-name"
 # Key Vault
-key_vault_name        = "kvpdfrbrownix2" # "your-key-vault-name"
+key_vault_name = "kvpdfrbrownrix2" # "your-key-vault-name"
 # CosmosDB
 cosmosdb_account_name = "cosmospdfbrownix2" # "your-cosmosdb-account-name"
 # Form Recognizer -> Document Intelligence 
-form_recognizer_name  = "docintelligt01ix2" # "your-document-intelligence-name"
+form_recognizer_name = "docintelligt01ix2" # "your-document-intelligence-name"
+
+# AI Vision Service
+ai_vision_name = "aivisionpdfrbrownix2" # "your-ai-vision-name"
+ai_vision_sku  = "S0"
+ai_vision_tags = {
+  Environment = "Development"
+  Project     = "PDF Processing"
+  Service     = "AI Vision"
+}
diff --git a/terraform-infrastructure/variables.tf b/terraform-infrastructure/variables.tf
@@ -48,6 +48,26 @@ variable "key_vault_name" {
   description = "The name of the Key Vault"
   type        = string
 }
+
+variable "ai_vision_name" {
+  description = "The name of the AI Vision Cognitive Services account"
+  type        = string
+}
+
+variable "ai_vision_sku" {
+  description = "The SKU of the AI Vision Cognitive Services account"
+  type        = string
+  default     = "S0"
+}
+
+variable "ai_vision_tags" {
+  description = "Tags to be applied to the AI Vision resource"
+  type        = map(string)
+  default     = {
+    Environment = "Development"
+    Service     = "AI Vision"
+  }
+}
 variable "cosmosdb_account_name" {
   description = "The name of the CosmosDB account."
   type        = string

Original file line number	Diff line number	Diff line change
`@@ -26,7 +26,12 @@`
`26`	`26`	`},`
`27`	`27`	`{`
`28`	`28`	`"date": "2025-07-14",`
`29`		`- "count": 4,`
	`29`	`+ "count": 130,`
	`30`	`+ "uniques": 2`
	`31`	`+ },`
	`32`	`+ {`
	`33`	`+ "date": "2025-07-15",`
	`34`	`+ "count": 2,`
`30`	`35`	`"uniques": 1`
`31`	`36`	`}`
`32`	`37`	`]`