Skip to content

Commit 9ddb685

Browse files
revise for new sample
1 parent aea69c4 commit 9ddb685

File tree

3 files changed

+40
-36
lines changed

3 files changed

+40
-36
lines changed

data/MS_Annual_Report_2024.pdf

-132 Bytes
Binary file not shown.

data/mixed_financial_docs.pdf

131 Bytes
Binary file not shown.

notebooks/classifier.ipynb

Lines changed: 40 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@
108108
"# IMPORTANT: Replace with your actual subscription key or set up in \".env\" file if not using token auth\n",
109109
"AZURE_AI_API_KEY = os.getenv(\"AZURE_AI_API_KEY\")\n",
110110
"AZURE_AI_API_VERSION = os.getenv(\"AZURE_AI_API_VERSION\", \"2025-05-01-preview\")\n",
111-
"ANALYZER_SAMPLE_FILE = \"../data/MS_Annual_Report_2024.pdf\" # Update this path to your PDF file\n",
111+
"ANALYZER_SAMPLE_FILE = \"../data/mixed_financial_docs.pdf\" # Update this path to your PDF file\n",
112112
"\n",
113113
"# Authentication - Using DefaultAzureCredential for token-based auth\n",
114114
"credential = DefaultAzureCredential()\n",
@@ -151,18 +151,15 @@
151151
"# Define document categories and their descriptions\n",
152152
"classifier_schema = {\n",
153153
" \"categories\": {\n",
154-
" \"Executive Summary & Strategy\": {\n",
155-
" \"description\": \"Leadership messages, strategic vision, and future outlook.\"\n",
154+
" \"Loan application\": {\n",
155+
" \"description\": \"Documents submitted by individuals or businesses to request funding, typically including personal or business details, financial history, loan amount, purpose, and supporting documentation.\"\n",
156156
" },\n",
157-
" \"Financial Performance & Segment Reporting\": {\n",
158-
" \"description\": \"Overall financial results and detailed performance by business units.\"\n",
157+
" \"Invoice\": {\n",
158+
" \"description\": \"Billing documents issued by sellers or service providers to request payment for goods or services, detailing items, prices, taxes, totals, and payment terms.\"\n",
159159
" },\n",
160-
" \"Operations & Corporate Governance\": {\n",
161-
" \"description\": \"Business operations, governance structure, and risk management.\"\n",
160+
" \"Bank_Statement\": {\n",
161+
" \"description\": \"Official statements issued by banks that summarize account activity over a period, including deposits, withdrawals, fees, and balances.\"\n",
162162
" },\n",
163-
" \"Shareholder Information & Relations\": {\n",
164-
" \"description\": \"Annual meeting details, stock information, and shareholder services.\"\n",
165-
" }\n",
166163
" },\n",
167164
" \"splitMode\": \"auto\" # IMPORTANT: Automatically detect document boundaries. Can change mode for your needs.\n",
168165
"}\n",
@@ -331,7 +328,7 @@
331328
"\n",
332329
"Now let's create a custom analyzer that can extract specific fields from documents.\n",
333330
"This analyzer will:\n",
334-
"- Extract visit dates from medical documents\n",
331+
"- Extract common fields from loan application documents\n",
335332
"- Generate document excerpts"
336333
]
337334
},
@@ -343,7 +340,7 @@
343340
"source": [
344341
"# Define analyzer schema with custom fields\n",
345342
"analyzer_schema = {\n",
346-
" \"description\": \"Medical encounter analyzer - extracts key information from medical records\",\n",
343+
" \"description\": \"Loan application analyzer - extracts key information from loan application\",\n",
347344
" \"baseAnalyzerId\": \"prebuilt-documentAnalyzer\", # Built on top of the general document analyzer\n",
348345
" \"config\": {\n",
349346
" \"returnDetails\": True,\n",
@@ -355,37 +352,42 @@
355352
" },\n",
356353
" \"fieldSchema\": {\n",
357354
" \"fields\": {\n",
358-
" \"ReportDate\": {\n",
355+
" \"ApplicationDate\": {\n",
359356
" \"type\": \"date\",\n",
360357
" \"method\": \"generate\",\n",
361-
" \"description\": \"The publication or filing date of the annual report.\"\n",
358+
" \"description\": \"The date when the loan application was submitted.\"\n",
362359
" },\n",
363-
" \"CompanyName\": {\n",
360+
" \"ApplicantName\": {\n",
364361
" \"type\": \"string\",\n",
365362
" \"method\": \"generate\",\n",
366-
" \"description\": \"The name of the company issuing the report.\"\n",
363+
" \"description\": \"The full name of the loan applicant or company.\"\n",
364+
" },\n",
365+
" \"LoanAmountRequested\": {\n",
366+
" \"type\": \"number\",\n",
367+
" \"method\": \"generate\",\n",
368+
" \"description\": \"The total amount of loan money requested by the applicant.\"\n",
367369
" },\n",
368-
" \"FiscalYear\": {\n",
370+
" \"LoanPurpose\": {\n",
369371
" \"type\": \"string\",\n",
370372
" \"method\": \"generate\",\n",
371-
" \"description\": \"The fiscal year the report covers.\"\n",
373+
" \"description\": \"The stated purpose or reason for the loan.\"\n",
372374
" },\n",
373-
" \"NetIncome\": {\n",
375+
" \"CreditScore\": {\n",
374376
" \"type\": \"number\",\n",
375377
" \"method\": \"generate\",\n",
376-
" \"description\": \"Net income or profit reported for the fiscal year.\"\n",
378+
" \"description\": \"The credit score of the applicant, if available.\"\n",
377379
" },\n",
378380
" \"Summary\": {\n",
379381
" \"type\": \"string\",\n",
380382
" \"method\": \"generate\",\n",
381-
" \"description\": \"Brief summary of the annual report\"\n",
383+
" \"description\": \"A brief overview of the loan application details.\"\n",
382384
" }\n",
383385
" }\n",
384386
" }\n",
385387
"}\n",
386388
"\n",
387389
"# Generate unique analyzer ID\n",
388-
"analyzer_id = \"analyzer-medical-\" + str(uuid.uuid4())\n",
390+
"analyzer_id = \"analyzer-loan-application-\" + str(uuid.uuid4())\n",
389391
"\n",
390392
"# Create the analyzer\n",
391393
"try:\n",
@@ -411,7 +413,7 @@
411413
"source": [
412414
"## 10. Create an Enhanced Classifier with Custom Analyzer\n",
413415
"\n",
414-
"Now we'll create a new classifier that uses our custom analyzer for medical documents.\n",
416+
"Now we'll create a new classifier that uses the prebuilt invoice analyzer for invoices and our custom analyzer for loan application documents.\n",
415417
"This combines classification with field extraction in one operation."
416418
]
417419
},
@@ -427,16 +429,17 @@
427429
"# Create enhanced classifier schema\n",
428430
"enhanced_classifier_schema = {\n",
429431
" \"categories\": {\n",
430-
" \"Legal\": {\n",
431-
" \"description\": \"Legal documents including subpoenas, declarations, contracts, and other legal paperwork.\"\n",
432-
" # No analyzer specified - uses default processing\n",
432+
" \"Loan application\": {\n",
433+
" \"description\": \"Documents submitted by individuals or businesses to request funding, typically including personal or business details, financial history, loan amount, purpose, and supporting documentation.\",\n",
434+
"            \"analyzerId\": analyzer_id  # IMPORTANT: Use the custom analyzer created in the previous step for loan applications\n",
433435
" },\n",
434-
" \"Annual Report\": {\n",
435-
" \"description\": \"Each document must ends with 'end of encounter'. Dont rely on page numbers\",\n",
436-
" \"analyzerId\": analyzer_id # IMPORTANT: Use created custom analyzer in previous step for annual reports\n",
436+
" \"Invoice\": {\n",
437+
" \"description\": \"Billing documents issued by sellers or service providers to request payment for goods or services, detailing items, prices, taxes, totals, and payment terms.\",\n",
438+
"            \"analyzerId\": \"prebuilt-invoice\"  # IMPORTANT: Use the prebuilt invoice analyzer for invoices\n",
437439
" },\n",
438-
" \"Declaration_of_custodian\": {\n",
439-
" \"description\": \"Declarations of custodian documents, often used in legal contexts.\"\n",
440+
" \"Bank_Statement\": {\n",
441+
" \"description\": \"Official statements issued by banks that summarize account activity over a period, including deposits, withdrawals, fees, and balances.\"\n",
442+
" # No analyzer specified - uses default processing\n",
440443
" }\n",
441444
" },\n",
442445
" \"splitMode\": \"auto\"\n",
@@ -447,8 +450,9 @@
447450
" try:\n",
448451
" print(f\"🔨 Creating enhanced classifier: {enhanced_classifier_id}\")\n",
449452
" print(\"\\n📋 Configuration:\")\n",
450-
" print(\" • Legal documents → Standard processing\")\n",
451-
" print(\" • Medical documents → Custom analyzer with field extraction\")\n",
453+
" print(\" • Loan application documents → Custom analyzer with field extraction\")\n",
454+
" print(\" • Invoice documents → Prebuilt invoice analyzer\")\n",
455+
" print(\" • Bank_Statement documents → Standard processing\")\n",
452456
" \n",
453457
" response = content_understanding_client.begin_create_classifier(enhanced_classifier_id, enhanced_classifier_schema)\n",
454458
" result = content_understanding_client.poll_result(response)\n",
@@ -468,7 +472,7 @@
468472
"## 11. Process Document with Enhanced Classifier\n",
469473
"\n",
470474
"Let's process the document again using our enhanced classifier.\n",
471-
"Medical documents will now have additional fields extracted."
475+
"Invoices and loan application documents will now have additional fields extracted."
472476
]
473477
},
474478
{
@@ -505,7 +509,7 @@
505509
"source": [
506510
"## 12. View Enhanced Results with Extracted Fields\n",
507511
"\n",
508-
"Let's see the classification results along with the extracted fields from medical documents."
512+
"Let's see the classification results along with the fields extracted from invoices and loan application documents."
509513
]
510514
},
511515
{
@@ -533,7 +537,7 @@
533537
" print(f\"\\n📁 Category: {category}\")\n",
534538
" print(f\"📄 Pages: {content.get('startPageNumber', '?')} - {content.get('endPageNumber', '?')}\")\n",
535539
" \n",
536-
" # Show extracted fields for medical documents\n",
540+
"            # Show extracted fields, if any were returned for this content\n",
537541
" fields = content.get('fields', {})\n",
538542
" if fields:\n",
539543
" print(\"\\n🔍 Extracted Information:\")\n",

0 commit comments

Comments
 (0)