|
108 | 108 | "# IMPORTANT: Replace with your actual subscription key or set up in \".env\" file if not using token auth\n",
|
109 | 109 | "AZURE_AI_API_KEY = os.getenv(\"AZURE_AI_API_KEY\")\n",
|
110 | 110 | "AZURE_AI_API_VERSION = os.getenv(\"AZURE_AI_API_VERSION\", \"2025-05-01-preview\")\n",
|
111 |
| - "ANALYZER_SAMPLE_FILE = \"../data/MS_Annual_Report_2024.pdf\" # Update this path to your PDF file\n", |
| 111 | + "ANALYZER_SAMPLE_FILE = \"../data/mixed_financial_docs.pdf\" # Update this path to your PDF file\n", |
112 | 112 | "\n",
|
113 | 113 | "# Authentication - Using DefaultAzureCredential for token-based auth\n",
|
114 | 114 | "credential = DefaultAzureCredential()\n",
|
|
151 | 151 | "# Define document categories and their descriptions\n",
|
152 | 152 | "classifier_schema = {\n",
|
153 | 153 | " \"categories\": {\n",
|
154 |
| - " \"Executive Summary & Strategy\": {\n", |
155 |
| - " \"description\": \"Leadership messages, strategic vision, and future outlook.\"\n", |
| 154 | + " \"Loan application\": {\n", |
| 155 | + " \"description\": \"Documents submitted by individuals or businesses to request funding, typically including personal or business details, financial history, loan amount, purpose, and supporting documentation.\"\n", |
156 | 156 | " },\n",
|
157 |
| - " \"Financial Performance & Segment Reporting\": {\n", |
158 |
| - " \"description\": \"Overall financial results and detailed performance by business units.\"\n", |
| 157 | + " \"Invoice\": {\n", |
| 158 | + " \"description\": \"Billing documents issued by sellers or service providers to request payment for goods or services, detailing items, prices, taxes, totals, and payment terms.\"\n", |
159 | 159 | " },\n",
|
160 |
| - " \"Operations & Corporate Governance\": {\n", |
161 |
| - " \"description\": \"Business operations, governance structure, and risk management.\"\n", |
| 160 | + " \"Bank_Statement\": {\n", |
| 161 | + " \"description\": \"Official statements issued by banks that summarize account activity over a period, including deposits, withdrawals, fees, and balances.\"\n", |
162 | 162 | " },\n",
|
163 |
| - " \"Shareholder Information & Relations\": {\n", |
164 |
| - " \"description\": \"Annual meeting details, stock information, and shareholder services.\"\n", |
165 |
| - " }\n", |
166 | 163 | " },\n",
|
167 | 164 | " \"splitMode\": \"auto\" # IMPORTANT: Automatically detect document boundaries. Can change mode for your needs.\n",
|
168 | 165 | "}\n",
|
|
331 | 328 | "\n",
|
332 | 329 | "Now let's create a custom analyzer that can extract specific fields from documents.\n",
|
333 | 330 | "This analyzer will:\n",
|
334 |
| - "- Extract visit dates from medical documents\n", |
| 331 | + "- Extract common fields from loan application documents\n", |
335 | 332 | "- Generate document excerpts"
|
336 | 333 | ]
|
337 | 334 | },
|
|
343 | 340 | "source": [
|
344 | 341 | "# Define analyzer schema with custom fields\n",
|
345 | 342 | "analyzer_schema = {\n",
|
346 |
| - " \"description\": \"Medical encounter analyzer - extracts key information from medical records\",\n", |
| 343 | + " \"description\": \"Loan application analyzer - extracts key information from loan application\",\n", |
347 | 344 | " \"baseAnalyzerId\": \"prebuilt-documentAnalyzer\", # Built on top of the general document analyzer\n",
|
348 | 345 | " \"config\": {\n",
|
349 | 346 | " \"returnDetails\": True,\n",
|
|
355 | 352 | " },\n",
|
356 | 353 | " \"fieldSchema\": {\n",
|
357 | 354 | " \"fields\": {\n",
|
358 |
| - " \"ReportDate\": {\n", |
| 355 | + " \"ApplicationDate\": {\n", |
359 | 356 | " \"type\": \"date\",\n",
|
360 | 357 | " \"method\": \"generate\",\n",
|
361 |
| - " \"description\": \"The publication or filing date of the annual report.\"\n", |
| 358 | + " \"description\": \"The date when the loan application was submitted.\"\n", |
362 | 359 | " },\n",
|
363 |
| - " \"CompanyName\": {\n", |
| 360 | + " \"ApplicantName\": {\n", |
364 | 361 | " \"type\": \"string\",\n",
|
365 | 362 | " \"method\": \"generate\",\n",
|
366 |
| - " \"description\": \"The name of the company issuing the report.\"\n", |
| 363 | + " \"description\": \"The full name of the loan applicant or company.\"\n", |
| 364 | + " },\n", |
| 365 | + " \"LoanAmountRequested\": {\n", |
| 366 | + " \"type\": \"number\",\n", |
| 367 | + " \"method\": \"generate\",\n", |
| 368 | + " \"description\": \"The total amount of loan money requested by the applicant.\"\n", |
367 | 369 | " },\n",
|
368 |
| - " \"FiscalYear\": {\n", |
| 370 | + " \"LoanPurpose\": {\n", |
369 | 371 | " \"type\": \"string\",\n",
|
370 | 372 | " \"method\": \"generate\",\n",
|
371 |
| - " \"description\": \"The fiscal year the report covers.\"\n", |
| 373 | + " \"description\": \"The stated purpose or reason for the loan.\"\n", |
372 | 374 | " },\n",
|
373 |
| - " \"NetIncome\": {\n", |
| 375 | + " \"CreditScore\": {\n", |
374 | 376 | " \"type\": \"number\",\n",
|
375 | 377 | " \"method\": \"generate\",\n",
|
376 |
| - " \"description\": \"Net income or profit reported for the fiscal year.\"\n", |
| 378 | + " \"description\": \"The credit score of the applicant, if available.\"\n", |
377 | 379 | " },\n",
|
378 | 380 | " \"Summary\": {\n",
|
379 | 381 | " \"type\": \"string\",\n",
|
380 | 382 | " \"method\": \"generate\",\n",
|
381 |
| - " \"description\": \"Brief summary of the annual report\"\n", |
| 383 | + " \"description\": \"A brief overview of the loan application details.\"\n", |
382 | 384 | " }\n",
|
383 | 385 | " }\n",
|
384 | 386 | " }\n",
|
385 | 387 | "}\n",
|
386 | 388 | "\n",
|
387 | 389 | "# Generate unique analyzer ID\n",
|
388 |
| - "analyzer_id = \"analyzer-medical-\" + str(uuid.uuid4())\n", |
| 390 | + "analyzer_id = \"analyzer-loan-application-\" + str(uuid.uuid4())\n", |
389 | 391 | "\n",
|
390 | 392 | "# Create the analyzer\n",
|
391 | 393 | "try:\n",
|
|
411 | 413 | "source": [
|
412 | 414 | "## 10. Create an Enhanced Classifier with Custom Analyzer\n",
|
413 | 415 | "\n",
|
414 |
| - "Now we'll create a new classifier that uses our custom analyzer for medical documents.\n", |
| 416 | + "Now we'll create a new classifier that uses the prebuilt invoice analyzer for invoices and our custom analyzer for loan application documents.\n", |
415 | 417 | "This combines classification with field extraction in one operation."
|
416 | 418 | ]
|
417 | 419 | },
|
|
427 | 429 | "# Create enhanced classifier schema\n",
|
428 | 430 | "enhanced_classifier_schema = {\n",
|
429 | 431 | " \"categories\": {\n",
|
430 |
| - " \"Legal\": {\n", |
431 |
| - " \"description\": \"Legal documents including subpoenas, declarations, contracts, and other legal paperwork.\"\n", |
432 |
| - " # No analyzer specified - uses default processing\n", |
| 432 | + " \"Loan application\": {\n", |
| 433 | + " \"description\": \"Documents submitted by individuals or businesses to request funding, typically including personal or business details, financial history, loan amount, purpose, and supporting documentation.\",\n", |
| 434 | + " \"analyzerId\": analyzer_id # IMPORTANT: Use the custom analyzer created in the previous step for loan applications\n", |
433 | 435 | " },\n",
|
434 |
| - " \"Annual Report\": {\n", |
435 |
| - " \"description\": \"Each document must ends with 'end of encounter'. Dont rely on page numbers\",\n", |
436 |
| - " \"analyzerId\": analyzer_id # IMPORTANT: Use created custom analyzer in previous step for annual reports\n", |
| 436 | + " \"Invoice\": {\n", |
| 437 | + " \"description\": \"Billing documents issued by sellers or service providers to request payment for goods or services, detailing items, prices, taxes, totals, and payment terms.\",\n", |
| 438 | + " \"analyzerId\": \"prebuilt-invoice\" # IMPORTANT: Use the prebuilt invoice analyzer for invoices\n", |
437 | 439 | " },\n",
|
438 |
| - " \"Declaration_of_custodian\": {\n", |
439 |
| - " \"description\": \"Declarations of custodian documents, often used in legal contexts.\"\n", |
| 440 | + " \"Bank_Statement\": {\n", |
| 441 | + " \"description\": \"Official statements issued by banks that summarize account activity over a period, including deposits, withdrawals, fees, and balances.\"\n", |
| 442 | + " # No analyzer specified - uses default processing\n", |
440 | 443 | " }\n",
|
441 | 444 | " },\n",
|
442 | 445 | " \"splitMode\": \"auto\"\n",
|
|
447 | 450 | " try:\n",
|
448 | 451 | " print(f\"🔨 Creating enhanced classifier: {enhanced_classifier_id}\")\n",
|
449 | 452 | " print(\"\\n📋 Configuration:\")\n",
|
450 |
| - " print(\" • Legal documents → Standard processing\")\n", |
451 |
| - " print(\" • Medical documents → Custom analyzer with field extraction\")\n", |
| 453 | + " print(\" • Loan application documents → Custom analyzer with field extraction\")\n", |
| 454 | + " print(\" • Invoice documents → Prebuilt invoice analyzer\")\n", |
| 455 | + " print(\" • Bank_Statement documents → Standard processing\")\n", |
452 | 456 | " \n",
|
453 | 457 | " response = content_understanding_client.begin_create_classifier(enhanced_classifier_id, enhanced_classifier_schema)\n",
|
454 | 458 | " result = content_understanding_client.poll_result(response)\n",
|
|
468 | 472 | "## 11. Process Document with Enhanced Classifier\n",
|
469 | 473 | "\n",
|
470 | 474 | "Let's process the document again using our enhanced classifier.\n",
|
471 |
| - "Medical documents will now have additional fields extracted." |
| 475 | + "Invoices and loan application documents will now have additional fields extracted." |
472 | 476 | ]
|
473 | 477 | },
|
474 | 478 | {
|
|
505 | 509 | "source": [
|
506 | 510 | "## 12. View Enhanced Results with Extracted Fields\n",
|
507 | 511 | "\n",
|
508 |
| - "Let's see the classification results along with the extracted fields from medical documents." |
| 512 | + "Let's see the classification results along with the extracted fields from invoices and loan application documents." |
509 | 513 | ]
|
510 | 514 | },
|
511 | 515 | {
|
|
533 | 537 | " print(f\"\\n📁 Category: {category}\")\n",
|
534 | 538 | " print(f\"📄 Pages: {content.get('startPageNumber', '?')} - {content.get('endPageNumber', '?')}\")\n",
|
535 | 539 | " \n",
|
536 |
| - " # Show extracted fields for medical documents\n", |
| 540 | + " # Show the fields extracted by the analyzer assigned to this category (if any)\n", |
537 | 541 | " fields = content.get('fields', {})\n",
|
538 | 542 | " if fields:\n",
|
539 | 543 | " print(\"\\n🔍 Extracted Information:\")\n",
|
|
0 commit comments