diff --git a/.github/workflows/validate_and_fix_markdown.yml b/.github/workflows/validate_and_fix_markdown.yml
index 4cef7ef..8bb9f1f 100644
--- a/.github/workflows/validate_and_fix_markdown.yml
+++ b/.github/workflows/validate_and_fix_markdown.yml
@@ -18,6 +18,7 @@ jobs:
uses: actions/checkout@v4
with:
fetch-depth: 0
+ ref: ${{ github.head_ref || github.ref_name }}
- name: Set up Node.js
uses: actions/setup-node@v3
@@ -35,11 +36,23 @@ jobs:
git config --global user.email "github-actions[bot]@users.noreply.github.com"
git config --global user.name "github-actions[bot]"
- - name: Commit and rebase changes
+ - name: Commit and merge changes
env:
PR_BRANCH: ${{ github.head_ref || github.ref_name }}
+ GIT_AUTHOR_NAME: github-actions[bot]
+ GIT_AUTHOR_EMAIL: github-actions[bot]@users.noreply.github.com
+ GIT_COMMITTER_NAME: github-actions[bot]
+ GIT_COMMITTER_EMAIL: github-actions[bot]@users.noreply.github.com
run: |
+ # Create the PR branch locally; if that fails (e.g. it already exists), switch to it
+ git switch -c "$PR_BRANCH" || git switch "$PR_BRANCH"
+
+ # Stage everything, then commit only when something is actually staged
git add -A
- git commit -m "Fix Markdown syntax issues" || echo "No changes to commit"
- git pull --rebase origin "$PR_BRANCH" || echo "No rebase needed"
- git push origin HEAD:"$PR_BRANCH"
+ git diff --staged --quiet || git commit -m "Fix Markdown syntax issues"
+
+ # Merge (rather than rebase onto) the commits already on the remote PR branch
+ git pull origin "$PR_BRANCH" --no-rebase
+
+ # Push the local PR branch to the branch of the same name on origin
+ git push origin "$PR_BRANCH"
diff --git a/.gitignore b/.gitignore
index 7062e66..a57fe2b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@
*.tfstate
*.tfstate.*
.terraform.lock.hcl
+terraform.tfstate.backup
# Crash log files
crash.log
diff --git a/README.md b/README.md
index c85ac63..bdfe19d 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Demo: PDF Layout Extraction with Doc Intelligence (full-code approach)
+# Demo: PDF Layout Extraction with Doc Intelligence
Supporting Multiple Document Versions with Visual Selection Cues (full-code approach)
`Azure Storage + Document Intelligence + Function App + Cosmos DB`
@@ -8,9 +8,16 @@ Costa Rica
[](https://github.com/)
[brown9804](https://github.com/brown9804)
-Last updated: 2025-07-16
+Last updated: 2025-07-21
-----------
+-----------------------------
+
+> This solution is designed to be flexible and robust, supporting multiple versions of PDF documents with varying layouts—including those that use visual selection cues such as gray fills, hand-drawn Xs, checkmarks, or circles. By building on the [PDFs-Layouts-Processing-Fapp-DocIntelligence](https://github.com/MicrosoftCloudEssentials-LearningHub/PDFs-Layouts-Processing-Fapp-DocIntelligence) repository, we ensure that:
+
+- Table structure and text are extracted using Azure Document Intelligence (Layout model).
+- Visual selection cues are detected using Azure AI Vision or image preprocessing.
+- Visual indicators are mapped to structured data, returning only the selected values in a clean JSON format.
+- The logic is abstracted to support multiple layout variations, so the system adapts easily to new document formats and selection styles.
> [!IMPORTANT]
> This example is based on a `public network site and is intended for demonstration purposes only`. It showcases how several Azure resources can work together to achieve the desired result. Consider the section below about [Important Considerations for Production Environment](#important-considerations-for-production-environment). Please note that `these demos are intended as a guide and are based on my personal experiences. For official guidance, support, or more detailed information, please refer to Microsoft's official documentation or contact Microsoft directly`: [Microsoft Sales and Support](https://support.microsoft.com/contactus?ContactUsExperienceEntryPointAssetId=S.HP.SMC-HOME)
@@ -440,7 +447,7 @@ Last updated: 2025-07-16
-

-
Refresh Date: 2025-07-16
+

+
Refresh Date: 2025-07-21
diff --git a/docs/automatedPDFLayoutprocessingFunctionAppDocIntellig.drawio b/docs/automated-PDFLayoutprocessing-FunctionApp-DocIntellig-AI-Vision.drawio
similarity index 81%
rename from docs/automatedPDFLayoutprocessingFunctionAppDocIntellig.drawio
rename to docs/automated-PDFLayoutprocessing-FunctionApp-DocIntellig-AI-Vision.drawio
index de05840..b05d1f0 100644
--- a/docs/automatedPDFLayoutprocessingFunctionAppDocIntellig.drawio
+++ b/docs/automated-PDFLayoutprocessing-FunctionApp-DocIntellig-AI-Vision.drawio
@@ -1,11 +1,11 @@
-
+
-
+
-
+
@@ -27,10 +27,10 @@
-
+
-
+
@@ -43,10 +43,10 @@
-
+
-
+
@@ -89,6 +89,16 @@
+
+
+
+
+
+
+
+
+
+
diff --git a/metrics.json b/metrics.json
index c9fdd61..8665541 100644
--- a/metrics.json
+++ b/metrics.json
@@ -26,7 +26,12 @@
},
{
"date": "2025-07-14",
- "count": 4,
+ "count": 130,
+ "uniques": 2
+ },
+ {
+ "date": "2025-07-15",
+ "count": 2,
"uniques": 1
}
]
\ No newline at end of file
diff --git a/src/function_app.py b/src/function_app.py
index 370c713..2efed7e 100644
--- a/src/function_app.py
+++ b/src/function_app.py
@@ -8,6 +8,12 @@
import uuid
import json
+# For image conversion and vision API
+from typing import List
+from io import BytesIO
+import requests # For REST API to Vision
+from pdf2image import convert_from_bytes # For PDF to image conversion
+
app = func.FunctionApp(http_auth_level=func.AuthLevel.FUNCTION)
## DEFINITIONS
@@ -35,13 +41,14 @@ def analyze_pdf(form_recognizer_client, pdf_bytes):
logging.info(f"Document has {len(result.pages)} page(s), {len(result.tables)} table(s), and {len(result.styles)} style(s).")
return result
-def extract_layout_data(result):
+def extract_layout_data(result, visual_cues: List[dict] = None):
logging.info("Extracting layout data from analysis result.")
layout_data = {
"id": str(uuid.uuid4()),
"pages": []
}
+ visual_cues = visual_cues or [] # List of dicts with visual cue info per cell
# Log styles
for idx, style in enumerate(result.styles):
@@ -88,12 +95,16 @@ def extract_layout_data(result):
for cell in table.cells:
content = cell.content.strip()
- table_data["cells"].append({
+ # Find matching visual cue for this cell (if any)
+ cue = next((vc for vc in visual_cues if vc.get("page_number") == page.page_number and vc.get("row_index") == cell.row_index and vc.get("column_index") == cell.column_index), None)
+ cell_info = {
"row_index": cell.row_index,
"column_index": cell.column_index,
- "content": content
- })
- logging.info(f"Cell[{cell.row_index}][{cell.column_index}]: '{content}'")
+ "content": content,
+ "visual_cue": cue["cue_type"] if cue else None
+ }
+ table_data["cells"].append(cell_info)
+ logging.info(f"Cell[{cell.row_index}][{cell.column_index}]: '{content}', visual_cue: {cell_info['visual_cue']}")
page_data["tables"].append(table_data)
@@ -156,6 +167,56 @@ def save_layout_data_to_cosmos(layout_data):
+## VISION HELPERS
+def call_vision_api(image_bytes, subscription_key, endpoint, timeout=30):
+    """Send one image to the Azure AI Vision v3.2 "analyze" REST endpoint.
+
+    Args:
+        image_bytes: Raw image content, posted as application/octet-stream.
+        subscription_key: Value for the Ocp-Apim-Subscription-Key header.
+        endpoint: Base URL of the Vision resource, with or without a trailing "/".
+        timeout: Seconds to wait for the service before giving up.
+
+    Returns:
+        The response body decoded from JSON.
+
+    Raises:
+        requests.HTTPError: If the service responds with an error status.
+        requests.Timeout: If the service does not answer within `timeout`.
+    """
+    # Drop a trailing slash so the joined URL never contains "//vision".
+    vision_url = endpoint.rstrip("/") + "/vision/v3.2/analyze"
+    headers = {
+        'Ocp-Apim-Subscription-Key': subscription_key,
+        'Content-Type': 'application/octet-stream'
+    }
+    params = {
+        'visualFeatures': 'Objects,Color',  # Add more features if needed
+    }
+    # Without a timeout, requests would wait indefinitely on a stalled connection.
+    response = requests.post(vision_url, headers=headers, params=params,
+                             data=image_bytes, timeout=timeout)
+    response.raise_for_status()
+    return response.json()
+
+def extract_visual_cues_from_vision(vision_result, page_number):
+    """Translate a Vision analysis result into per-cell visual cues (placeholder).
+
+    Currently returns an empty list; the parsing of `vision_result` is still to be
+    written. Each cue is expected to be a dict with the keys "page_number",
+    "row_index", "column_index" and "cue_type".
+    """
+    # Example: Detect gray fills, checkmarks, hand-drawn marks
+    cues = []
+    # This is a placeholder. You need to parse vision_result for your cues.
+    # For example, if vision_result['objects'] contains a 'checkmark' or color info for gray fill
+    # cues.append({"page_number": page_number, "row_index": ..., "column_index": ..., "cue_type": "gray_fill"})
+    return cues
+
+def convert_pdf_to_images(pdf_bytes):
+    """Render the PDF bytes to images with pdf2image's convert_from_bytes."""
+    return convert_from_bytes(pdf_bytes)
+
## MAIN
@app.blob_trigger(arg_name="myblob", path="pdfinvoices/{name}",
connection="invoicecontosostorage_STORAGE")
def BlobTriggerContosoPDFLayoutsDocIntelligence(myblob: func.InputStream):
logging.info(f"Python blob trigger function processed blob\n"
f"Name: {myblob.name}\n"
@@ -176,9 +212,26 @@ def BlobTriggerContosoPDFLayoutsDocIntelligence(myblob: func.InputStream):
logging.error(f"Error analyzing PDF: {e}")
return
+ # --- Step: Convert PDF to image and call Azure AI Vision ---
+ visual_cues = []
+ try:
+ images = convert_pdf_to_images(pdf_bytes)
+ vision_key = os.getenv("VISION_API_KEY")
+ vision_endpoint = os.getenv("VISION_API_ENDPOINT")
+ for page_num, image in enumerate(images, start=1):
+ img_bytes_io = BytesIO()
+ image.save(img_bytes_io, format='JPEG')
+ img_bytes = img_bytes_io.getvalue()
+ vision_result = call_vision_api(img_bytes, vision_key, vision_endpoint)
+ cues = extract_visual_cues_from_vision(vision_result, page_num)
+ visual_cues.extend(cues)
+ logging.info(f"Visual cues extracted: {visual_cues}")
+ except Exception as e:
+ logging.error(f"Error processing visual cues with AI Vision: {e}")
+
try:
- layout_data = extract_layout_data(result)
- logging.info("Successfully extracted layout data.")
+ layout_data = extract_layout_data(result, visual_cues)
+ logging.info("Successfully extracted and merged layout data.")
except Exception as e:
logging.error(f"Error extracting layout data: {e}")
return
diff --git a/terraform-infrastructure/README.md b/terraform-infrastructure/README.md
index 8a56aa7..3a28d91 100644
--- a/terraform-infrastructure/README.md
+++ b/terraform-infrastructure/README.md
@@ -5,7 +5,7 @@ Costa Rica
[](https://github.com/)
[brown9804](https://github.com/brown9804)
-Last updated: 2025-07-16
+Last updated: 2025-07-21
----------
@@ -109,7 +109,7 @@ graph TD;
-

-
Refresh Date: 2025-07-16
+

+
Refresh Date: 2025-07-21
diff --git a/terraform-infrastructure/main.tf b/terraform-infrastructure/main.tf
index 956b5cb..e476224 100644
--- a/terraform-infrastructure/main.tf
+++ b/terraform-infrastructure/main.tf
@@ -289,6 +289,7 @@ resource "azurerm_role_assignment" "contributor" {
]
}
+
# Azure Form Recognizer (Document Intelligence)
resource "azurerm_cognitive_account" "form_recognizer" {
name = var.form_recognizer_name
@@ -299,12 +300,27 @@ resource "azurerm_cognitive_account" "form_recognizer" {
depends_on = [azurerm_resource_group.rg]
- # Output the Form Recognizer name
provisioner "local-exec" {
command = "echo Form Recognizer: ${self.name}"
}
}
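+# Azure AI Vision, provisioned as a Cognitive Services account (kind "CognitiveServices"); its endpoint and key feed the Function App's VISION_API_* settings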
+resource "azurerm_cognitive_account" "ai_vision" {
+ name = var.ai_vision_name
+ location = azurerm_resource_group.rg.location
+ resource_group_name = azurerm_resource_group.rg.name
+ kind = "CognitiveServices"
+ sku_name = var.ai_vision_sku
+ tags = var.ai_vision_tags
+
+ depends_on = [azurerm_resource_group.rg]
+
+ provisioner "local-exec" {
+ command = "echo AI Vision: ${self.name}"
+ }
+}
+
# We need to assign custom or built-in Cosmos DB SQL roles
# (like Cosmos DB Built-in Data Reader, etc.) at the data plane level,
# which is not currently supported directly in Terraform as of now.
@@ -373,6 +389,10 @@ resource "azurerm_linux_function_app" "function_app" {
"APPINSIGHTS_INSTRUMENTATIONKEY" = azurerm_application_insights.appinsights.instrumentation_key
"APPLICATIONINSIGHTS_CONNECTION_STRING" = azurerm_application_insights.appinsights.connection_string
+
+ # Azure AI Vision settings
+ "VISION_API_ENDPOINT" = azurerm_cognitive_account.ai_vision.endpoint
+ "VISION_API_KEY" = azurerm_cognitive_account.ai_vision.primary_access_key
}
depends_on = [
diff --git a/terraform-infrastructure/terraform.tfvars b/terraform-infrastructure/terraform.tfvars
index 12a8bea..2f978b5 100644
--- a/terraform-infrastructure/terraform.tfvars
+++ b/terraform-infrastructure/terraform.tfvars
@@ -1,21 +1,30 @@
# Sample values
-subscription_id = "" # "your-subscription_id"
-resource_group_name = "RG-PDFLayout-Processing-DocIntelligence" # "your-resource-group-name"
-location = "West US" # "your-location"
+subscription_id = "" # "your-subscription_id" (fill in locally; never commit a real ID)
+resource_group_name = "RG-PDFLayout-Processing-DocIntelligence" # "your-resource-group-name"
+location = "West US" # "your-location"
# Storage Account
-storage_account_name = "storageaccbrownpdfix2" # "your-storage-account-name"
+storage_account_name = "storageaccbrownpdfix2" # "your-storage-account-name"
storage_account_name_runtime = "runtimestorebrownix2" # "your-runtime-storage-account-name"
# Function App
-function_app_name = "fapdfbrownix2" # "your-function-app-name"
+function_app_name = "fapdfbrownix2" # "your-function-app-name"
# App Service Plan
app_service_plan_name = "asppdfbrownix2" # "your-app-service-plan-name"
# Application Insights
-app_insights_name = "apppdfbrownix2" # "your-app-insights-name"
+app_insights_name = "apppdfbrownix2" # "your-app-insights-name"
# Log Analytics Workspace
log_analytics_workspace_name = "logwspdfbrownix2" # "your-log-analytics-workspace-name"
# Key Vault
-key_vault_name = "kvpdfrbrownix2" # "your-key-vault-name"
+key_vault_name = "kvpdfrbrownrix2" # "your-key-vault-name"
# CosmosDB
cosmosdb_account_name = "cosmospdfbrownix2" # "your-cosmosdb-account-name"
# Form Recognizer -> Document Intelligence
-form_recognizer_name = "docintelligt01ix2" # "your-document-intelligence-name"
+form_recognizer_name = "docintelligt01ix2" # "your-document-intelligence-name"
+
+# AI Vision Service
+ai_vision_name = "aivisionpdfrbrownix2" # "your-ai-vision-name"
+ai_vision_sku = "S0"
+ai_vision_tags = {
+ Environment = "Development"
+ Project = "PDF Processing"
+ Service = "AI Vision"
+}
diff --git a/terraform-infrastructure/variables.tf b/terraform-infrastructure/variables.tf
index a9ea20a..33a04dd 100644
--- a/terraform-infrastructure/variables.tf
+++ b/terraform-infrastructure/variables.tf
@@ -48,6 +48,26 @@ variable "key_vault_name" {
description = "The name of the Key Vault"
type = string
}
+
+variable "ai_vision_name" {
+ description = "The name of the AI Vision Cognitive Services account"
+ type = string
+}
+
+variable "ai_vision_sku" {
+ description = "The SKU of the AI Vision Cognitive Services account"
+ type = string
+ default = "S0"
+}
+
+variable "ai_vision_tags" {
+ description = "Tags to be applied to the AI Vision resource"
+ type = map(string)
+ default = {
+ Environment = "Development"
+ Service = "AI Vision"
+ }
+}
variable "cosmosdb_account_name" {
description = "The name of the CosmosDB account."
type = string