diff --git a/notebooks/classifier.ipynb b/notebooks/classifier.ipynb index 886c723..878577e 100644 --- a/notebooks/classifier.ipynb +++ b/notebooks/classifier.ipynb @@ -6,17 +6,17 @@ "source": [ "# Azure AI Content Understanding - Classifier and Analyzer Demo\n", "\n", - "This notebook demonstrates how to use Azure AI Content Understanding service to:\n", - "1. Create a classifier to categorize documents\n", + "This notebook demonstrates how to use the Azure AI Content Understanding service to:\n", + "1. Create a classifier for document categorization\n", "2. Create a custom analyzer to extract specific fields\n", - "3. Combine classifier and analyzers to classify, optionally split, and analyze documents in a flexible processing pipeline\n", + "3. Combine classifiers and analyzers to classify, optionally split, and analyze documents within a flexible processing pipeline\n", "\n", - "If you’d like to learn more before getting started, see the official documentation:\n", + "For more detailed information before getting started, refer to the official documentation:\n", "[Understanding Classifiers in Azure AI Services](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/concepts/classifier)\n", "\n", "## Prerequisites\n", - "1. Ensure Azure AI service is configured following [steps](../README.md#configure-azure-ai-service-resource)\n", - "2. Install the required packages to run the sample.\n" + "1. Ensure the Azure AI service is configured by following the [setup steps](../README.md#configure-azure-ai-service-resource).\n", + "2. Install the required packages to run this sample." ] }, { @@ -63,7 +63,7 @@ "source": [ "## 2. Import Azure Content Understanding Client\n", "\n", - "The `AzureContentUnderstandingClient` class handles all API interactions with the Azure AI service." + "The `AzureContentUnderstandingClient` class manages all API interactions with the Azure AI service." 
] }, { @@ -72,14 +72,14 @@ "metadata": {}, "outputs": [], "source": [ - "# Add the parent directory to the path to use shared modules\n", + "# Add the parent directory to the system path to access shared modules\n", "parent_dir = Path(Path.cwd()).parent\n", "sys.path.append(str(parent_dir))\n", "try:\n", " from python.content_understanding_client import AzureContentUnderstandingClient\n", " print(\"✅ Azure Content Understanding Client imported successfully!\")\n", "except ImportError:\n", - " print(\"❌ Error: Make sure 'AzureContentUnderstandingClient.py' is in the same directory as this notebook.\")\n", + " print(\"❌ Error: Ensure 'AzureContentUnderstandingClient.py' exists in the appropriate directory.\")\n", " raise" ] }, @@ -89,11 +89,11 @@ "source": [ "## 3. Configure Azure AI Service Settings and Prepare the Sample\n", "\n", - "Update these settings to match your Azure environment:\n", + "Update the following settings to match your Azure environment:\n", "\n", - "- **AZURE_AI_ENDPOINT**: Your Azure AI service endpoint URL or set up in \".env\" file\n", - "- **AZURE_AI_API_VERSION**: The Azure AI API version to use. Default is \"2025-05-01-preview\". \n", - "- **AZURE_AI_API_KEY**: Your Azure AI service key (optional if using token authentication)\n", + "- **AZURE_AI_ENDPOINT**: Your Azure AI service endpoint URL, or configure it in the \".env\" file\n", + "- **AZURE_AI_API_VERSION**: Azure AI API version to use. 
Defaults to \"2025-05-01-preview\"\n", + "- **AZURE_AI_API_KEY**: Your Azure AI subscription key (optional if using token-based authentication)\n", "- **ANALYZER_SAMPLE_FILE**: Path to the PDF document you want to process" ] }, @@ -103,14 +103,14 @@ "metadata": {}, "outputs": [], "source": [ - "# For authentication, you can use either token-based auth or subscription key, and only one of them is required\n", + "# Authentication supports either token-based or subscription key methods; only one is required\n", "AZURE_AI_ENDPOINT = os.getenv(\"AZURE_AI_ENDPOINT\")\n", - "# IMPORTANT: Replace with your actual subscription key or set up in \".env\" file if not using token auth\n", + "# IMPORTANT: Substitute with your subscription key or configure in \".env\" if not using token auth\n", "AZURE_AI_API_KEY = os.getenv(\"AZURE_AI_API_KEY\")\n", "AZURE_AI_API_VERSION = os.getenv(\"AZURE_AI_API_VERSION\", \"2025-05-01-preview\")\n", "ANALYZER_SAMPLE_FILE = \"../data/mixed_financial_docs.pdf\" # Update this path to your PDF file\n", "\n", - "# Authentication - Using DefaultAzureCredential for token-based auth\n", + "# Use DefaultAzureCredential for token-based authentication\n", "credential = DefaultAzureCredential()\n", "token_provider = get_bearer_token_provider(credential, \"https://cognitiveservices.azure.com/.default\")\n", "\n", @@ -130,16 +130,16 @@ "\n", "The classifier schema defines:\n", "- **Categories**: Document types to classify (e.g., Legal, Medical)\n", - " - **description (Optional)**: An optional field used to provide additional context or hints for categorizing or splitting documents. This can be helpful when the category name alone isn’t descriptive enough. 
If the category name is already clear and self-explanatory, this field can be omitted.\n", - "- **splitMode Options**: Defines how multi-page documents should be split before classification or analysis.\n", + " - **description (Optional)**: Provides additional context or hints for categorizing or splitting documents. Useful when the category name alone is not sufficiently descriptive. Omit if the category name is self-explanatory.\n", + "- **splitMode Options**: Determines how multi-page documents are split before classification or analysis.\n", " - `\"auto\"`: Automatically split based on content. \n", - " For example, if two categories are defined as “invoice” and “application form”:\n", - " - A PDF with only one invoice will be classified as a single document.\n", - " - A PDF containing two invoices and one application form will be automatically split into three classified sections.\n", + " For example, given categories “invoice” and “application form”:\n", + " - A PDF with one invoice will be classified as a single document.\n", + " - A PDF containing two invoices and one application form will be automatically split into three sections classified separately.\n", " - `\"none\"`: No splitting. \n", - " The entire multi-page document is treated as a single unit for classification and analysis.\n", + " The entire multi-page document is treated as one unit for classification and analysis.\n", " - `\"perPage\"`: Split by page. \n", - " Each page is treated as a separate document. This is useful when you’ve built custom analyzers designed to operate on a per-page basis." + " Treats each page as a separate document, useful if custom analyzers operate at the page level." 
] }, { @@ -151,17 +151,17 @@ "# Define document categories and their descriptions\n", "classifier_schema = {\n", " \"categories\": {\n", - " \"Loan application\": { # Both space and underscore are allowed in category names\n", + " \"Loan application\": { # Both spaces and underscores are supported in category names\n", " \"description\": \"Documents submitted by individuals or businesses to request funding, typically including personal or business details, financial history, loan amount, purpose, and supporting documentation.\"\n", " },\n", " \"Invoice\": {\n", " \"description\": \"Billing documents issued by sellers or service providers to request payment for goods or services, detailing items, prices, taxes, totals, and payment terms.\"\n", " },\n", - " \"Bank_Statement\": { # Both space and underscore are allowed in category names\n", - " \"description\": \"Official statements issued by banks that summarize account activity over a period, including deposits, withdrawals, fees, and balances.\"\n", + " \"Bank_Statement\": { # Both spaces and underscores are supported\n", + " \"description\": \"Official statements issued by banks summarizing account activity over a period, including deposits, withdrawals, fees, and balances.\"\n", " },\n", " },\n", - " \"splitMode\": \"auto\" # IMPORTANT: Automatically detect document boundaries. Can change mode for your needs.\n", + " \"splitMode\": \"auto\" # IMPORTANT: Automatically detect document boundaries; adjust as needed.\n", "}\n", "\n", "print(\"📄 Classifier Categories:\")\n", @@ -175,14 +175,14 @@ "source": [ "## 5. 
Initialize Content Understanding Client\n", "\n", - "Create the client that will communicate with Azure AI services.\n", + "Create the client to interact with Azure AI services.\n", "\n", "⚠️ Important:\n", - "You must update the code below to match your Azure authentication method.\n", - "Look for the `# IMPORTANT` comments and modify those sections accordingly.\n", - "If you skip this step, the sample may not run correctly.\n", + "Update the authentication details below to match your Azure setup.\n", + "See the `# IMPORTANT` comments for instructions.\n", + "Skipping this step may result in runtime errors.\n", "\n", - "⚠️ Note: Using a subscription key works, but using a token provider with Azure Active Directory (AAD) is much safer and is highly recommended for production environments." + "⚠️ Note: While subscription key authentication works, using an Azure Active Directory (AAD) token provider is more secure and recommended for production." ] }, { @@ -214,7 +214,7 @@ "source": [ "## 6. Create a Basic Classifier\n", "\n", - "First, we'll create a simple classifier that categorizes documents without additional analysis." + "Create a simple classifier that categorizes documents without performing additional analysis." ] }, { @@ -223,11 +223,11 @@ "metadata": {}, "outputs": [], "source": [ - "# Generate unique classifier ID\n", + "# Generate a unique classifier ID\n", "classifier_id = \"classifier-sample-\" + str(uuid.uuid4())\n", "\n", "try:\n", - " # Create classifier\n", + " # Create the classifier\n", " print(f\"🔨 Creating classifier: {classifier_id}\")\n", " print(\" This may take a few seconds...\")\n", " \n", @@ -253,7 +253,7 @@ "source": [ "## 7. Classify Your Document\n", "\n", - "Now let's use the classifier to categorize your document." + "Use the classifier to categorize your document."
] }, { @@ -263,13 +263,13 @@ "outputs": [], "source": [ "try:\n", - " # Check if document exists\n", + " # Verify that the document exists\n", " if not file_location.exists():\n", " raise FileNotFoundError(f\"Document not found at {file_location}\")\n", " \n", - " # Classify document\n", + " # Classify the document\n", " print(f\"📄 Classifying document: {file_location.name}\")\n", - " print(\"\\n⏳ Processing... This may take a few minutes for large documents.\")\n", + " print(\"\\n⏳ Processing... This may take several minutes for large files.\")\n", " \n", " response = content_understanding_client.begin_classify(classifier_id, file_location=str(file_location))\n", " result = content_understanding_client.poll_result(response, timeout_seconds=360)\n", @@ -278,7 +278,7 @@ " \n", "except FileNotFoundError:\n", " print(f\"\\n❌ Document not found: {file_location}\")\n", - " print(\" Please update file_location to point to your PDF file.\")\n", + " print(\" Please update 'file_location' to point to your PDF file.\")\n", "except Exception as e:\n", " print(f\"\\n❌ Error classifying document: {e}\")" ] @@ -289,7 +289,7 @@ "source": [ "## 8. View Classification Results\n", "\n", - "Let's examine what the classifier found in your document." + "Review the classification results generated for your document." ] }, { @@ -307,14 +307,14 @@ " print(\"=\" * 50)\n", " print(f\"\\nTotal sections found: {len(contents)}\")\n", " \n", - " # Show summary of each classified section\n", + " # Summarize each classified section\n", " print(\"\\n📋 Document Sections:\")\n", " for i, content in enumerate(contents, 1):\n", " print(f\"\\n Section {i}:\")\n", " print(f\" • Category: {content.get('category', 'Unknown')}\")\n", " print(f\" • Pages: {content.get('startPageNumber', '?')} - {content.get('endPageNumber', '?')}\")\n", " \n", - " print(\"\\nFull result:\")\n", + " print(\"\\nFull result output:\")\n", " print(json.dumps(result, indent=2))\n", "else:\n", " print(\"❌ No results available. 
Please run the classification step first.\")" @@ -326,10 +326,8 @@ "source": [ "## 9. Create a Custom Analyzer (Advanced)\n", "\n", - "Now let's create a custom analyzer that can extract specific fields from documents.\n", - "This analyzer will:\n", - "- Extract common fields from loan application documents\n", - "- Generate document excerpts" + "Create a custom analyzer to extract specific fields from documents.\n", + "This example extracts common fields from loan application documents and generates document excerpts." ] }, { @@ -338,17 +336,17 @@ "metadata": {}, "outputs": [], "source": [ - "# Define analyzer schema with custom fields\n", + "# Define the analyzer schema with custom fields\n", "analyzer_schema = {\n", - " \"description\": \"Loan application analyzer - extracts key information from loan application\",\n", - " \"baseAnalyzerId\": \"prebuilt-documentAnalyzer\", # Built on top of the general document analyzer\n", + " \"description\": \"Loan application analyzer - extracts key information from loan applications\",\n", + " \"baseAnalyzerId\": \"prebuilt-documentAnalyzer\", # Extends the general document analyzer\n", " \"config\": {\n", " \"returnDetails\": True,\n", - " \"enableLayout\": True, # Extract layout information\n", - " \"enableBarcode\": False, # Skip barcode detection\n", - " \"enableFormula\": False, # Skip formula detection\n", - " \"estimateFieldSourceAndConfidence\": True, # Set to True if you want to estimate the field location (aka grounding) and confidence\n", - " \"disableContentFiltering\": False,\n", + " \"enableLayout\": True, # Extract layout details\n", + " \"enableBarcode\": False, # Disable barcode detection\n", + " \"enableFormula\": False, # Disable formula detection\n", + " \"estimateFieldSourceAndConfidence\": True, # Enable estimation of field location and confidence\n", + " \"disableContentFiltering\": False\n", " },\n", " \"fieldSchema\": {\n", " \"fields\": {\n", @@ -360,12 +358,12 @@ " \"ApplicantName\": {\n", " 
\"type\": \"string\",\n", " \"method\": \"generate\",\n", - " \"description\": \"The full name of the loan applicant or company.\"\n", + " \"description\": \"Full name of the loan applicant or company.\"\n", " },\n", " \"LoanAmountRequested\": {\n", " \"type\": \"number\",\n", " \"method\": \"generate\",\n", - " \"description\": \"The total amount of loan money requested by the applicant.\"\n", + " \"description\": \"The total loan amount requested by the applicant.\"\n", " },\n", " \"LoanPurpose\": {\n", " \"type\": \"string\",\n", @@ -375,24 +373,24 @@ " \"CreditScore\": {\n", " \"type\": \"number\",\n", " \"method\": \"generate\",\n", - " \"description\": \"The credit score of the applicant, if available.\"\n", + " \"description\": \"Credit score of the applicant, if available.\"\n", " },\n", " \"Summary\": {\n", " \"type\": \"string\",\n", " \"method\": \"generate\",\n", - " \"description\": \"A brief overview of the loan application details.\"\n", + " \"description\": \"A brief summary of the loan application details.\"\n", " }\n", " }\n", " }\n", "}\n", "\n", - "# Generate unique analyzer ID\n", + "# Generate a unique analyzer ID\n", "analyzer_id = \"analyzer-loan-application-\" + str(uuid.uuid4())\n", "\n", - "# Create the analyzer\n", + "# Create the custom analyzer\n", "try:\n", " print(f\"🔨 Creating custom analyzer: {analyzer_id}\")\n", - " print(\"\\n📋 Analyzer will extract:\")\n", + " print(\"\\n📋 The analyzer will extract the following fields:\")\n", " for field_name, field_info in analyzer_schema[\"fieldSchema\"][\"fields\"].items():\n", " print(f\" • {field_name}: {field_info['description']}\")\n", " \n", @@ -413,8 +411,8 @@ "source": [ "## 10. Create an Enhanced Classifier with Custom Analyzer\n", "\n", - "Now we'll create a new classifier that uses the prebuilt invoice analyzer for invoices and our custom analyzer for loan application documents.\n", - "This combines classification with field extraction in one operation."
+ "Now create a new classifier that uses the prebuilt invoice analyzer for invoices and the custom analyzer for loan application documents.\n", + "This combines document classification with field extraction in one operation." ] }, { @@ -423,30 +421,30 @@ "metadata": {}, "outputs": [], "source": [ - "# Generate unique enhanced classifier ID\n", + "# Generate a unique enhanced classifier ID\n", "enhanced_classifier_id = \"classifier-enhanced-\" + str(uuid.uuid4())\n", "\n", - "# Create enhanced classifier schema\n", + "# Define the enhanced classifier schema\n", "enhanced_classifier_schema = {\n", " \"categories\": {\n", - " \"Loan application\": { # Both space and underscore are allowed in category names\n", - " \"description\": \"Documents submitted by individuals or businesses to request funding, typically including personal or business details, financial history, loan amount, purpose, and supporting documentation.\",\n", - " \"analyzerId\": analyzer_id # IMPORTANT: Use created custom analyzer in previous step for loan applications\n", + " \"Loan application\": { # Both spaces and underscores allowed\n", + " \"description\": \"Documents submitted by individuals or businesses requesting funding, including personal/business details, financial history, and supporting documents.\",\n", + " \"analyzerId\": analyzer_id # IMPORTANT: Use the custom analyzer created previously for loan applications\n", " },\n", " \"Invoice\": {\n", - " \"description\": \"Billing documents issued by sellers or service providers to request payment for goods or services, detailing items, prices, taxes, totals, and payment terms.\",\n", + " \"description\": \"Billing documents issued by sellers or service providers requesting payment for goods or services, detailing items, prices, taxes, totals, and payment terms.\",\n", " \"analyzerId\": \"prebuilt-invoice\" # Use prebuilt invoice analyzer for invoices\n", " },\n", - " \"Bank_Statement\": { # Both space and underscore are allowed in category 
names\n", - " \"description\": \"Official statements issued by banks that summarize account activity over a period, including deposits, withdrawals, fees, and balances.\"\n", + " \"Bank_Statement\": { # Both spaces and underscores allowed\n", + " \"description\": \"Official bank statements summarizing account activity over a period, including deposits, withdrawals, fees, and balances.\"\n", " # No analyzer specified - uses default processing\n", " }\n", " },\n", " \"splitMode\": \"auto\"\n", "}\n", "\n", - "# Create the enhanced classifier\n", - "if analyzer_id: # Only create if analyzer was successfully created\n", + "# Create the enhanced classifier only if the custom analyzer was created successfully\n", + "if analyzer_id:\n", " try:\n", " print(f\"🔨 Creating enhanced classifier: {enhanced_classifier_id}\")\n", " print(\"\\n📋 Configuration:\")\n", @@ -462,7 +460,7 @@ " except Exception as e:\n", " print(f\"\\n❌ Error creating enhanced classifier: {e}\")\n", "else:\n", - " print(\"⚠️ Skipping enhanced classifier creation - analyzer was not created successfully.\")" + " print(\"⚠️ Skipping enhanced classifier creation - custom analyzer was not created successfully.\")" ] }, { @@ -471,8 +469,8 @@ "source": [ "## 11. Process Document with Enhanced Classifier\n", "\n", - "Let's process the document again using our enhanced classifier.\n", - "Invoices and loan application documents will now have additional fields extracted." + "Process the document again using the enhanced classifier.\n", + "Invoices and loan applications will now have additional fields extracted." 
] }, { @@ -483,14 +481,14 @@ "source": [ "if 'enhanced_classifier_id' in locals() and analyzer_id:\n", " try:\n", - " # Check if document exists\n", + " # Verify the document exists\n", " if not file_location.exists():\n", " raise FileNotFoundError(f\"Document not found at {file_location}\")\n", " \n", - " # Process with enhanced classifier\n", + " # Process document with enhanced classifier\n", " print(\"📄 Processing document with enhanced classifier\")\n", " print(f\" Document: {file_location.name}\")\n", - " print(\"\\n⏳ Processing with classification + field extraction...\")\n", + " print(\"\\n⏳ Processing with classification and field extraction...\")\n", " \n", " response = content_understanding_client.begin_classify(enhanced_classifier_id, file_location=str(file_location))\n", " enhanced_result = content_understanding_client.poll_result(response, timeout_seconds=360)\n", @@ -509,7 +507,7 @@ "source": [ "## 12. View Enhanced Results with Extracted Fields\n", "\n", - "Let's see the classification results along with the extracted fields from loan application documents." + "Review the classification results alongside extracted fields from loan application documents." 
] }, { @@ -518,7 +516,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Display enhanced results\n", + "# Display enhanced classification results\n", "if 'enhanced_result' in locals() and enhanced_result:\n", " result_data = enhanced_result.get(\"result\", {})\n", " contents = result_data.get(\"contents\", [])\n", @@ -527,7 +525,7 @@ " print(\"=\" * 70)\n", " print(f\"\\nTotal sections found: {len(contents)}\")\n", " \n", - " # Process each section\n", + " # Iterate through each document section\n", " for i, content in enumerate(contents, 1):\n", " print(f\"\\n{'='*70}\")\n", " print(f\"SECTION {i}\")\n", @@ -537,7 +535,7 @@ " print(f\"\\n📁 Category: {category}\")\n", " print(f\"📄 Pages: {content.get('startPageNumber', '?')} - {content.get('endPageNumber', '?')}\")\n", " \n", - " # Show extracted fields from field extraction\n", + " # Display extracted fields if available\n", " fields = content.get('fields', {})\n", " if fields:\n", " print(\"\\n🔍 Extracted Information:\")\n", @@ -552,7 +550,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can also see the fulll JSON result below." + "You can also view the full JSON result below." ] }, { @@ -570,7 +568,7 @@ "source": [ "## Summary and Next Steps\n", "\n", - "Congratulations! You've successfully:\n", + "Congratulations! You have successfully:\n", "1. ✅ Created a basic classifier to categorize documents\n", "2. ✅ Created a custom analyzer to extract specific fields\n", "3. ✅ Combined them into an enhanced classifier for intelligent document processing" @@ -598,4 +596,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +}