diff --git a/.github/workflows/validate_and_fix_markdown.yml b/.github/workflows/validate_and_fix_markdown.yml index 4cef7ef..8bb9f1f 100644 --- a/.github/workflows/validate_and_fix_markdown.yml +++ b/.github/workflows/validate_and_fix_markdown.yml @@ -18,6 +18,7 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 0 + ref: ${{ github.head_ref || github.ref_name }} - name: Set up Node.js uses: actions/setup-node@v3 @@ -35,11 +36,23 @@ jobs: git config --global user.email "github-actions[bot]@users.noreply.github.com" git config --global user.name "github-actions[bot]" - - name: Commit and rebase changes + - name: Commit and merge changes env: PR_BRANCH: ${{ github.head_ref || github.ref_name }} + GIT_AUTHOR_NAME: github-actions[bot] + GIT_AUTHOR_EMAIL: github-actions[bot]@users.noreply.github.com + GIT_COMMITTER_NAME: github-actions[bot] + GIT_COMMITTER_EMAIL: github-actions[bot]@users.noreply.github.com run: | + # Ensure we're on the correct branch + git switch -c "$PR_BRANCH" || git switch "$PR_BRANCH" + + # Stage and commit changes if any git add -A - git commit -m "Fix Markdown syntax issues" || echo "No changes to commit" - git pull --rebase origin "$PR_BRANCH" || echo "No rebase needed" - git push origin HEAD:"$PR_BRANCH" + git diff --staged --quiet || git commit -m "Fix Markdown syntax issues" + + # Pull and merge existing changes + git pull origin "$PR_BRANCH" --no-rebase + + # Push all changes + git push origin "$PR_BRANCH" diff --git a/.gitignore b/.gitignore index 7062e66..a57fe2b 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ *.tfstate *.tfstate.* .terraform.lock.hcl +terraform.tfstate.backup # Crash log files crash.log diff --git a/README.md b/README.md index c85ac63..bdfe19d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Demo: PDF Layout Extraction with Doc Intelligence (full-code approach) +# Demo: PDF Layout Extraction with Doc Intelligence
Supporting Multiple Document Versions with Visual Selection Cues (full-code approach) `Azure Storage + Document Intelligence + Function App + Cosmos DB` @@ -8,9 +8,16 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-07-21 ----------- +----------------------------- + +> This solution is designed to be flexible and robust, supporting multiple versions of PDF documents with varying layouts—including those that use visual selection cues such as gray fills, hand-drawn Xs, checkmarks, or circles. By building on the [PDFs-Layouts-Processing-Fapp-DocIntelligence](https://github.com/MicrosoftCloudEssentials-LearningHub/PDFs-Layouts-Processing-Fapp-DocIntelligence) repository, we ensure that: + +- Table structure and text are extracted using Azure Document Intelligence (Layout model). +- Visual selection cues are detected using Azure AI Vision or image preprocessing. +- Visual indicators are mapped to structured data, returning only the selected values in a clean JSON format. +- The logic is abstracted to support multiple layout variations, so the system adapts easily to new document formats and selection styles. > [!IMPORTANT] > This example is based on a `public network site and is intended for demonstration purposes only`. It showcases how several Azure resources can work together to achieve the desired result. Consider the section below about [Important Considerations for Production Environment](#important-considerations-for-production-environment). Please note that `these demos are intended as a guide and are based on my personal experiences. 
For official guidance, support, or more detailed information, please refer to Microsoft's official documentation or contact Microsoft directly`: [Microsoft Sales and Support](https://support.microsoft.com/contactus?ContactUsExperienceEntryPointAssetId=S.HP.SMC-HOME) @@ -440,7 +447,7 @@ Last updated: 2025-07-16
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-07-21

diff --git a/docs/automatedPDFLayoutprocessingFunctionAppDocIntellig.drawio b/docs/automated-PDFLayoutprocessing-FunctionApp-DocIntellig-AI-Vision.drawio similarity index 81% rename from docs/automatedPDFLayoutprocessingFunctionAppDocIntellig.drawio rename to docs/automated-PDFLayoutprocessing-FunctionApp-DocIntellig-AI-Vision.drawio index de05840..b05d1f0 100644 --- a/docs/automatedPDFLayoutprocessingFunctionAppDocIntellig.drawio +++ b/docs/automated-PDFLayoutprocessing-FunctionApp-DocIntellig-AI-Vision.drawio @@ -1,11 +1,11 @@ - + - + - + @@ -27,10 +27,10 @@ - + - + @@ -43,10 +43,10 @@ - + - + @@ -89,6 +89,16 @@ + + + + + + + + + + diff --git a/metrics.json b/metrics.json index c9fdd61..8665541 100644 --- a/metrics.json +++ b/metrics.json @@ -26,7 +26,12 @@ }, { "date": "2025-07-14", - "count": 4, + "count": 130, + "uniques": 2 + }, + { + "date": "2025-07-15", + "count": 2, "uniques": 1 } ] \ No newline at end of file diff --git a/src/function_app.py b/src/function_app.py index 370c713..2efed7e 100644 --- a/src/function_app.py +++ b/src/function_app.py @@ -8,6 +8,12 @@ import uuid import json +# For image conversion and vision API +from typing import List +from io import BytesIO +import requests # For REST API to Vision +from pdf2image import convert_from_bytes # For PDF to image conversion + app = func.FunctionApp(http_auth_level=func.AuthLevel.FUNCTION) ## DEFINITIONS @@ -35,13 +41,14 @@ def analyze_pdf(form_recognizer_client, pdf_bytes): logging.info(f"Document has {len(result.pages)} page(s), {len(result.tables)} table(s), and {len(result.styles)} style(s).") return result -def extract_layout_data(result): +def extract_layout_data(result, visual_cues: List[dict] = None): logging.info("Extracting layout data from analysis result.") layout_data = { "id": str(uuid.uuid4()), "pages": [] } + visual_cues = visual_cues or [] # List of dicts with visual cue info per cell # Log styles for idx, style in enumerate(result.styles): @@ -88,12 +95,16 @@ def 
extract_layout_data(result):
 
             for cell in table.cells:
                 content = cell.content.strip()
-                table_data["cells"].append({
+                # Find matching visual cue for this cell (if any)
+                cue = next((vc for vc in visual_cues if vc.get("page_number") == page.page_number and vc.get("row_index") == cell.row_index and vc.get("column_index") == cell.column_index), None)
+                cell_info = {
                     "row_index": cell.row_index,
                     "column_index": cell.column_index,
-                    "content": content
-                })
-                logging.info(f"Cell[{cell.row_index}][{cell.column_index}]: '{content}'")
+                    "content": content,
+                    "visual_cue": cue["cue_type"] if cue else None
+                }
+                table_data["cells"].append(cell_info)
+                logging.info(f"Cell[{cell.row_index}][{cell.column_index}]: '{content}', visual_cue: {cell_info['visual_cue']}")
 
             page_data["tables"].append(table_data)
 
@@ -156,6 +167,34 @@ def save_layout_data_to_cosmos(layout_data):
 
 ## MAIN
+def call_vision_api(image_bytes, subscription_key, endpoint):
+    """POST image bytes to the Vision v3.2 analyze endpoint and return the JSON body; raises on HTTP error."""
+    vision_url = endpoint.rstrip("/") + "/vision/v3.2/analyze"  # avoid '//vision' when endpoint ends with '/'
+    headers = {
+        'Ocp-Apim-Subscription-Key': subscription_key,
+        'Content-Type': 'application/octet-stream'
+    }
+    params = {
+        'visualFeatures': 'Objects,Color',  # Add more features if needed
+    }
+    response = requests.post(vision_url, headers=headers, params=params, data=image_bytes, timeout=30)  # bounded so a stalled call cannot hang the trigger
+    response.raise_for_status()
+    return response.json()
+
+def extract_visual_cues_from_vision(vision_result, page_number):
+    """Return visual-cue dicts for one page; placeholder that currently always returns an empty list."""
+    # Example: Detect gray fills, checkmarks, hand-drawn marks
+    cues = []
+    # This is a placeholder. You need to parse vision_result for your cues.
+    # For example, if vision_result['objects'] contains a 'checkmark' or color info for gray fill
+    # cues.append({"page_number": page_number, "row_index": ..., "column_index": ..., "cue_type": "gray_fill"})
+    return cues
+
+def convert_pdf_to_images(pdf_bytes):
+    """Convert PDF bytes to page images using pdf2image.convert_from_bytes."""
+    images = convert_from_bytes(pdf_bytes)
+    return images
+
 @app.blob_trigger(arg_name="myblob", path="pdfinvoices/{name}", connection="invoicecontosostorage_STORAGE")
 def BlobTriggerContosoPDFLayoutsDocIntelligence(myblob: func.InputStream):
     logging.info(f"Python blob trigger function processed blob\n"
                  f"Name: {myblob.name}\n"
@@ -176,9 +215,26 @@ def BlobTriggerContosoPDFLayoutsDocIntelligence(myblob: func.InputStream):
         logging.error(f"Error analyzing PDF: {e}")
         return
 
+    # --- Step: Convert PDF to image and call Azure AI Vision ---
+    visual_cues = []
+    try:
+        images = convert_pdf_to_images(pdf_bytes)
+        vision_key = os.getenv("VISION_API_KEY")
+        vision_endpoint = os.getenv("VISION_API_ENDPOINT")
+        for page_num, image in enumerate(images, start=1):
+            img_bytes_io = BytesIO()
+            image.save(img_bytes_io, format='JPEG')
+            img_bytes = img_bytes_io.getvalue()
+            vision_result = call_vision_api(img_bytes, vision_key, vision_endpoint)
+            cues = extract_visual_cues_from_vision(vision_result, page_num)
+            visual_cues.extend(cues)
+        logging.info(f"Visual cues extracted: {visual_cues}")
+    except Exception as e:
+        logging.error(f"Error processing visual cues with AI Vision: {e}")
+
     try:
-        layout_data = extract_layout_data(result)
-        logging.info("Successfully extracted layout data.")
+        layout_data = extract_layout_data(result, visual_cues)
+        logging.info("Successfully extracted and merged layout data.")
     except Exception as e:
         logging.error(f"Error extracting layout data: {e}")
         return
diff --git a/terraform-infrastructure/README.md b/terraform-infrastructure/README.md
index 8a56aa7..3a28d91 100644
--- a/terraform-infrastructure/README.md
+++ b/terraform-infrastructure/README.md
@@ -5,7 +5,7 @@ Costa Rica
 [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/)
[brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-07-21 ---------- @@ -109,7 +109,7 @@ graph TD;
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-07-21

diff --git a/terraform-infrastructure/main.tf b/terraform-infrastructure/main.tf index 956b5cb..e476224 100644 --- a/terraform-infrastructure/main.tf +++ b/terraform-infrastructure/main.tf @@ -289,6 +289,7 @@ resource "azurerm_role_assignment" "contributor" { ] } + # Azure Form Recognizer (Document Intelligence) resource "azurerm_cognitive_account" "form_recognizer" { name = var.form_recognizer_name @@ -299,12 +300,27 @@ resource "azurerm_cognitive_account" "form_recognizer" { depends_on = [azurerm_resource_group.rg] - # Output the Form Recognizer name provisioner "local-exec" { command = "echo Form Recognizer: ${self.name}" } } +# Azure AI Vision (Cognitive Services) +resource "azurerm_cognitive_account" "ai_vision" { + name = var.ai_vision_name + location = azurerm_resource_group.rg.location + resource_group_name = azurerm_resource_group.rg.name + kind = "CognitiveServices" + sku_name = var.ai_vision_sku + tags = var.ai_vision_tags + + depends_on = [azurerm_resource_group.rg] + + provisioner "local-exec" { + command = "echo AI Vision: ${self.name}" + } +} + # We need to assign custom or built-in Cosmos DB SQL roles # (like Cosmos DB Built-in Data Reader, etc.) at the data plane level, # which is not currently supported directly in Terraform as of now. 
@@ -373,6 +389,10 @@ resource "azurerm_linux_function_app" "function_app" {
 
     "APPINSIGHTS_INSTRUMENTATIONKEY"        = azurerm_application_insights.appinsights.instrumentation_key
     "APPLICATIONINSIGHTS_CONNECTION_STRING" = azurerm_application_insights.appinsights.connection_string
+
+    # Azure AI Vision settings
+    "VISION_API_ENDPOINT" = azurerm_cognitive_account.ai_vision.endpoint
+    "VISION_API_KEY"      = azurerm_cognitive_account.ai_vision.primary_access_key
   }
 
   depends_on = [
diff --git a/terraform-infrastructure/terraform.tfvars b/terraform-infrastructure/terraform.tfvars
index 12a8bea..2f978b5 100644
--- a/terraform-infrastructure/terraform.tfvars
+++ b/terraform-infrastructure/terraform.tfvars
@@ -1,21 +1,30 @@
 # Sample values
-subscription_id = "" # "your-subscription_id"
-resource_group_name = "RG-PDFLayout-Processing-DocIntelligence" # "your-resource-group-name"
-location = "West US" # "your-location"
+subscription_id     = ""                                        # "your-subscription_id"
+resource_group_name = "RG-PDFLayout-Processing-DocIntelligence" # "your-resource-group-name"
+location            = "West US"                                 # "your-location"
 # Storage Account
-storage_account_name = "storageaccbrownpdfix2" # "your-storage-account-name"
+storage_account_name         = "storageaccbrownpdfix2" # "your-storage-account-name"
 storage_account_name_runtime = "runtimestorebrownix2" # "your-runtime-storage-account-name"
 # Function App
-function_app_name = "fapdfbrownix2" # "your-function-app-name"
+function_app_name = "fapdfbrownix2" # "your-function-app-name"
 # App Service Plan
 app_service_plan_name = "asppdfbrownix2" # "your-app-service-plan-name"
 # Application Insights
-app_insights_name = "apppdfbrownix2" # "your-app-insights-name"
+app_insights_name = "apppdfbrownix2" # "your-app-insights-name"
 # Log Analytics Workspace
 log_analytics_workspace_name = "logwspdfbrownix2" # "your-log-analytics-workspace-name"
 # Key Vault
-key_vault_name = "kvpdfrbrownix2" # "your-key-vault-name"
+key_vault_name = "kvpdfrbrownrix2"
# "your-key-vault-name" # CosmosDB cosmosdb_account_name = "cosmospdfbrownix2" # "your-cosmosdb-account-name" # Form Recognizer -> Document Intelligence -form_recognizer_name = "docintelligt01ix2" # "your-document-intelligence-name" +form_recognizer_name = "docintelligt01ix2" # "your-document-intelligence-name" + +# AI Vision Service +ai_vision_name = "aivisionpdfrbrownix2" # "your-ai-vision-name" +ai_vision_sku = "S0" +ai_vision_tags = { + Environment = "Development" + Project = "PDF Processing" + Service = "AI Vision" +} diff --git a/terraform-infrastructure/variables.tf b/terraform-infrastructure/variables.tf index a9ea20a..33a04dd 100644 --- a/terraform-infrastructure/variables.tf +++ b/terraform-infrastructure/variables.tf @@ -48,6 +48,26 @@ variable "key_vault_name" { description = "The name of the Key Vault" type = string } + +variable "ai_vision_name" { + description = "The name of the AI Vision Cognitive Services account" + type = string +} + +variable "ai_vision_sku" { + description = "The SKU of the AI Vision Cognitive Services account" + type = string + default = "S0" +} + +variable "ai_vision_tags" { + description = "Tags to be applied to the AI Vision resource" + type = map(string) + default = { + Environment = "Development" + Service = "AI Vision" + } +} variable "cosmosdb_account_name" { description = "The name of the CosmosDB account." type = string