diff --git a/README.md b/README.md index 58ac866..210e09c 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Demo: PDF Layout Extraction with Doc Intelligence
Supporting Multiple Document Versions with Visual Selection Cues (full-code approach) +# Demo: PDF Layout Extraction with Doc Intelligence (full-code approach) `Azure Storage + Document Intelligence + Function App + Cosmos DB` @@ -8,16 +8,9 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-21 +Last updated: 2025-07-16 ------------------------------ - -> This solution is designed to be flexible and robust, supporting multiple versions of PDF documents with varying layouts—including those that use visual selection cues such as gray fills, hand-drawn Xs, checkmarks, or circles. By building on the [PDFs-Layouts-Processing-Fapp-DocIntelligence](https://github.com/MicrosoftCloudEssentials-LearningHub/PDFs-Layouts-Processing-Fapp-DocIntelligence) repository, we ensure that: - -- Table structure and text are extracted using Azure Document Intelligence (Layout model). -- Visual selection cues are detected using Azure AI Vision or image preprocessing. -- Visual indicators are mapped to structured data, returning only the selected values in a clean JSON format. -- The logic is abstracted to support multiple layout variations, so the system adapts easily to new document formats and selection styles. +---------- > [!IMPORTANT] > This example is based on a `public network site and is intended for demonstration purposes only`. It showcases how several Azure resources can work together to achieve the desired result. Consider the section below about [Important Considerations for Production Environment](#important-considerations-for-production-environment). Please note that `these demos are intended as a guide and are based on my personal experiences. For official guidance, support, or more detailed information, please refer to Microsoft's official documentation or contact Microsoft directly`: [Microsoft Sales and Support](https://support.microsoft.com/contactus?ContactUsExperienceEntryPointAssetId=S.HP.SMC-HOME) diff --git a/docs/automated-PDFLayoutprocessing-FunctionApp-DocIntellig-AI-Vision.drawio b/docs/automatedPDFLayoutprocessingFunctionAppDocIntellig.drawio similarity index 81% rename from docs/automated-PDFLayoutprocessing-FunctionApp-DocIntellig-AI-Vision.drawio rename to docs/automatedPDFLayoutprocessingFunctionAppDocIntellig.drawio index b05d1f0..8a5e0fc 100644 --- a/docs/automated-PDFLayoutprocessing-FunctionApp-DocIntellig-AI-Vision.drawio +++ b/docs/automatedPDFLayoutprocessingFunctionAppDocIntellig.drawio @@ -1,105 +1,95 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/function_app.py b/src/function_app.py index 2efed7e..370c713 100644 --- a/src/function_app.py +++ b/src/function_app.py @@ -8,12 +8,6 @@ import uuid import json -# For image conversion and vision API -from typing import List -from io import BytesIO -import requests # For REST API to Vision -from pdf2image import convert_from_bytes # For PDF to image conversion - app = func.FunctionApp(http_auth_level=func.AuthLevel.FUNCTION) ## DEFINITIONS @@ -41,14 +35,13 @@ def analyze_pdf(form_recognizer_client, pdf_bytes): logging.info(f"Document has {len(result.pages)} page(s), {len(result.tables)} table(s), and {len(result.styles)} style(s).") return result -def extract_layout_data(result, visual_cues: List[dict] = None): +def extract_layout_data(result): logging.info("Extracting layout data from analysis result.") layout_data = { "id": str(uuid.uuid4()), "pages": [] } - visual_cues = visual_cues or [] # List of dicts with visual cue info per cell # Log styles for idx, style in enumerate(result.styles): @@ -95,16 +88,12 @@ def extract_layout_data(result, visual_cues: List[dict] = None): for cell in table.cells: content = cell.content.strip() - # Find matching visual cue for this cell (if any) - cue = next((vc for vc in visual_cues if vc.get("page_number") == page.page_number and vc.get("row_index") == cell.row_index and vc.get("column_index") == cell.column_index), None) - cell_info = { + table_data["cells"].append({ "row_index": cell.row_index, "column_index": cell.column_index, - "content": content, - "visual_cue": cue["cue_type"] if cue else None - } - table_data["cells"].append(cell_info) - logging.info(f"Cell[{cell.row_index}][{cell.column_index}]: '{content}', visual_cue: {cell_info['visual_cue']}") + "content": content + }) + logging.info(f"Cell[{cell.row_index}][{cell.column_index}]: '{content}'") page_data["tables"].append(table_data) @@ -167,31 +156,6 @@ def save_layout_data_to_cosmos(layout_data): ## MAIN @app.blob_trigger(arg_name="myblob", path="pdfinvoices/{name}", connection="invoicecontosostorage_STORAGE") -def call_vision_api(image_bytes, subscription_key, endpoint): - vision_url = endpoint + "/vision/v3.2/analyze" - headers = { - 'Ocp-Apim-Subscription-Key': subscription_key, - 'Content-Type': 'application/octet-stream' - } - params = { - 'visualFeatures': 'Objects,Color', # Add more features if needed - } - response = requests.post(vision_url, headers=headers, params=params, data=image_bytes) - response.raise_for_status() - return response.json() - -def extract_visual_cues_from_vision(vision_result, page_number): - # Example: Detect gray fills, checkmarks, hand-drawn marks - cues = [] - # This is a placeholder. You need to parse vision_result for your cues. - # For example, if vision_result['objects'] contains a 'checkmark' or color info for gray fill - # cues.append({"page_number": page_number, "row_index": ..., "column_index": ..., "cue_type": "gray_fill"}) - return cues - -def convert_pdf_to_images(pdf_bytes): - images = convert_from_bytes(pdf_bytes) - return images - def BlobTriggerContosoPDFLayoutsDocIntelligence(myblob: func.InputStream): logging.info(f"Python blob trigger function processed blob\n" f"Name: {myblob.name}\n" @@ -212,26 +176,9 @@ def BlobTriggerContosoPDFLayoutsDocIntelligence(myblob: func.InputStream): logging.error(f"Error analyzing PDF: {e}") return - # --- Step: Convert PDF to image and call Azure AI Vision --- - visual_cues = [] - try: - images = convert_pdf_to_images(pdf_bytes) - vision_key = os.getenv("VISION_API_KEY") - vision_endpoint = os.getenv("VISION_API_ENDPOINT") - for page_num, image in enumerate(images, start=1): - img_bytes_io = BytesIO() - image.save(img_bytes_io, format='JPEG') - img_bytes = img_bytes_io.getvalue() - vision_result = call_vision_api(img_bytes, vision_key, vision_endpoint) - cues = extract_visual_cues_from_vision(vision_result, page_num) - visual_cues.extend(cues) - logging.info(f"Visual cues extracted: {visual_cues}") - except Exception as e: - logging.error(f"Error processing visual cues with AI Vision: {e}") - try: - layout_data = extract_layout_data(result, visual_cues) - logging.info("Successfully extracted and merged layout data.") + layout_data = extract_layout_data(result) + logging.info("Successfully extracted layout data.") except Exception as e: logging.error(f"Error extracting layout data: {e}") return diff --git a/terraform-infrastructure/main.tf b/terraform-infrastructure/main.tf index e476224..956b5cb 100644 --- a/terraform-infrastructure/main.tf +++ b/terraform-infrastructure/main.tf @@ -289,7 +289,6 @@ resource "azurerm_role_assignment" "contributor" { ] } - # Azure Form Recognizer (Document Intelligence) resource "azurerm_cognitive_account" "form_recognizer" { name = var.form_recognizer_name @@ -300,27 +299,12 @@ resource "azurerm_cognitive_account" "form_recognizer" { depends_on = [azurerm_resource_group.rg] + # Output the Form Recognizer name provisioner "local-exec" { command = "echo Form Recognizer: ${self.name}" } } -# Azure AI Vision (Cognitive Services) -resource "azurerm_cognitive_account" "ai_vision" { - name = var.ai_vision_name - location = azurerm_resource_group.rg.location - resource_group_name = azurerm_resource_group.rg.name - kind = "CognitiveServices" - sku_name = var.ai_vision_sku - tags = var.ai_vision_tags - - depends_on = [azurerm_resource_group.rg] - - provisioner "local-exec" { - command = "echo AI Vision: ${self.name}" - } -} - # We need to assign custom or built-in Cosmos DB SQL roles # (like Cosmos DB Built-in Data Reader, etc.) at the data plane level, # which is not currently supported directly in Terraform as of now. @@ -389,10 +373,6 @@ resource "azurerm_linux_function_app" "function_app" { "APPINSIGHTS_INSTRUMENTATIONKEY" = azurerm_application_insights.appinsights.instrumentation_key "APPLICATIONINSIGHTS_CONNECTION_STRING" = azurerm_application_insights.appinsights.connection_string - - # Azure AI Vision settings - "VISION_API_ENDPOINT" = azurerm_cognitive_account.ai_vision.endpoint - "VISION_API_KEY" = azurerm_cognitive_account.ai_vision.primary_access_key } depends_on = [ diff --git a/terraform-infrastructure/terraform.tfvars b/terraform-infrastructure/terraform.tfvars index 2f978b5..12a8bea 100644 --- a/terraform-infrastructure/terraform.tfvars +++ b/terraform-infrastructure/terraform.tfvars @@ -1,30 +1,21 @@ # Sample values -subscription_id = "407f4106-0fd3-42e0-9348-3686dd1e7347" # "your-subscription_id" -resource_group_name = "RG-PDFLayout-Processing-DocIntelligence" # "your-resource-group-name" -location = "West US" # "your-location" +subscription_id = "" # "your-subscription_id" +resource_group_name = "RG-PDFLayout-Processing-DocIntelligence" # "your-resource-group-name" +location = "West US" # "your-location" # Storage Account -storage_account_name = "storageaccbrownpdfix2" # "your-storage-account-name" +storage_account_name = "storageaccbrownpdfix2" # "your-storage-account-name" storage_account_name_runtime = "runtimestorebrownix2" # "your-runtime-storage-account-name" # Function App -function_app_name = "fapdfbrownix2" # "your-function-app-name" +function_app_name = "fapdfbrownix2" # "your-function-app-name" # App Service Plan app_service_plan_name = "asppdfbrownix2" # "your-app-service-plan-name" # Application Insights -app_insights_name = "apppdfbrownix2" # "your-app-insights-name" +app_insights_name = "apppdfbrownix2" # "your-app-insights-name" # Log Analytics Workspace log_analytics_workspace_name = "logwspdfbrownix2" # "your-log-analytics-workspace-name" # Key Vault -key_vault_name = "kvpdfrbrownrix2" # "your-key-vault-name" +key_vault_name = "kvpdfrbrownix2" # "your-key-vault-name" # CosmosDB cosmosdb_account_name = "cosmospdfbrownix2" # "your-cosmosdb-account-name" # Form Recognizer -> Document Intelligence -form_recognizer_name = "docintelligt01ix2" # "your-document-intelligence-name" - -# AI Vision Service -ai_vision_name = "aivisionpdfrbrownix2" # "your-ai-vision-name" -ai_vision_sku = "S0" -ai_vision_tags = { - Environment = "Development" - Project = "PDF Processing" - Service = "AI Vision" -} +form_recognizer_name = "docintelligt01ix2" # "your-document-intelligence-name" diff --git a/terraform-infrastructure/variables.tf b/terraform-infrastructure/variables.tf index 33a04dd..a9ea20a 100644 --- a/terraform-infrastructure/variables.tf +++ b/terraform-infrastructure/variables.tf @@ -48,26 +48,6 @@ variable "key_vault_name" { description = "The name of the Key Vault" type = string } - -variable "ai_vision_name" { - description = "The name of the AI Vision Cognitive Services account" - type = string -} - -variable "ai_vision_sku" { - description = "The SKU of the AI Vision Cognitive Services account" - type = string - default = "S0" -} - -variable "ai_vision_tags" { - description = "Tags to be applied to the AI Vision resource" - type = map(string) - default = { - Environment = "Development" - Service = "AI Vision" - } -} variable "cosmosdb_account_name" { description = "The name of the CosmosDB account." type = string