Skip to content

Commit 939f68d

Browse files
author
Taniya Mathur
committed
adding maxPagesForClassification configuration in UpdateSchemaConfig
1 parent 0723c2b commit 939f68d

File tree

5 files changed

+59
-21
lines changed

5 files changed

+59
-21
lines changed

lib/idp_common_pkg/idp_common/classification/service.py

Lines changed: 48 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -325,8 +325,8 @@ def _classify_pages_multimodal(self, document: Document) -> Document:
325325
)
326326
all_page_results = list(cached_page_classifications.values())
327327
combined_metering = {}
328-
errors_lock = threading.Lock()
329-
failed_page_exceptions = {}
328+
errors_lock = threading.Lock() # Thread safety for error collection
329+
failed_page_exceptions = {} # Store original exceptions for failed pages
330330

331331
# Determine which pages need classification
332332
pages_to_classify = {}
@@ -343,12 +343,13 @@ def _classify_pages_multimodal(self, document: Document) -> Document:
343343
page_id
344344
].confidence = cached_result.classification.confidence
345345

346-
# Copy metadata to the page
346+
# Copy metadata (including boundary information) to the page
347347
if hasattr(document.pages[page_id], "metadata"):
348348
document.pages[
349349
page_id
350350
].metadata = cached_result.classification.metadata
351351
else:
352+
# If the page doesn't have a metadata attribute, add it
352353
setattr(
353354
document.pages[page_id],
354355
"metadata",
@@ -403,12 +404,13 @@ def _classify_pages_multimodal(self, document: Document) -> Document:
403404
page_id
404405
].confidence = page_result.classification.confidence
405406

406-
# Copy metadata to the page
407+
# Copy metadata (including boundary information) to the page
407408
if hasattr(document.pages[page_id], "metadata"):
408409
document.pages[
409410
page_id
410411
].metadata = page_result.classification.metadata
411412
else:
413+
# If the page doesn't have a metadata attribute, add it
412414
setattr(
413415
document.pages[page_id],
414416
"metadata",
@@ -423,10 +425,12 @@ def _classify_pages_multimodal(self, document: Document) -> Document:
423425
combined_metering, page_metering
424426
)
425427
except Exception as e:
428+
# Capture exception details in the document object instead of raising
426429
error_msg = f"Error classifying page {page_id}: {str(e)}"
427430
logger.error(error_msg)
428431
with errors_lock:
429432
document.errors.append(error_msg)
433+
# Store the original exception for later use
430434
failed_page_exceptions[page_id] = e
431435

432436
# Mark page as unclassified on error
@@ -436,11 +440,13 @@ def _classify_pages_multimodal(self, document: Document) -> Document:
436440
].classification = "error (backoff/retry)"
437441
document.pages[page_id].confidence = 0.0
438442

439-
# Store failed page exceptions in document metadata
443+
# Store failed page exceptions in document metadata for caller to access
440444
if failed_page_exceptions:
441445
logger.info(
442446
f"Processing {len(failed_page_exceptions)} failed page exceptions for document {document.id}"
443447
)
448+
449+
# Store the first encountered exception as the primary failure cause
444450
first_exception = next(iter(failed_page_exceptions.values()))
445451
document.metadata = document.metadata or {}
446452
document.metadata["failed_page_exceptions"] = {
@@ -453,13 +459,37 @@ def _classify_pages_multimodal(self, document: Document) -> Document:
453459
}
454460
for page_id, exc in failed_page_exceptions.items()
455461
}
462+
# Store the primary exception for easy access by caller
456463
document.metadata["primary_exception"] = first_exception
464+
465+
# Cache successful page classifications (only when some pages fail - for retry scenarios)
466+
successful_results = [
467+
r
468+
for r in all_page_results
469+
if "error" not in r.classification.metadata
470+
]
471+
if successful_results:
472+
logger.info(
473+
f"Caching {len(successful_results)} successful page classifications for document {document.id} due to {len(failed_page_exceptions)} failed pages (retry scenario)"
474+
)
475+
self._cache_successful_page_classifications(
476+
document, successful_results
477+
)
478+
else:
479+
logger.warning(
480+
f"No successful page classifications to cache for document {document.id} - all {len(failed_page_exceptions)} pages failed"
481+
)
482+
else:
483+
# All pages succeeded - no need to cache since there won't be retries
484+
logger.info(
485+
f"All pages succeeded for document {document.id} - skipping cache (no retry needed)"
486+
)
457487
else:
458488
logger.info(
459489
f"All {len(cached_page_classifications)} page classifications found in cache"
460490
)
461491

462-
# Group pages into sections
492+
# Group pages into sections only if we have results
463493
document.sections = []
464494
sorted_results = self._sort_page_results(all_page_results)
465495

@@ -506,13 +536,19 @@ def _classify_pages_multimodal(self, document: Document) -> Document:
506536
)
507537

508538
t1 = time.time()
509-
logger.info(f"Time taken for classification: {t1 - t0:.2f} seconds")
539+
logger.info(
540+
f"Document classified with {len(document.sections)} sections in {t1 - t0:.2f} seconds"
541+
)
510542

511543
except Exception as e:
512-
logger.error(f"Error in multimodal classification: {str(e)}")
544+
error_msg = f"Error classifying all document pages: {str(e)}"
513545
document = self._update_document_status(
514-
document, success=False, error_message=str(e)
546+
document, success=False, error_message=error_msg
515547
)
548+
# Store the exception in metadata for caller to access
549+
document.metadata = document.metadata or {}
550+
document.metadata["primary_exception"] = e
551+
# Re-raise the original exception to enable client retries
516552
raise
517553

518554
return document
@@ -1647,6 +1683,9 @@ def classify_document(self, document: Document) -> Document:
16471683
if limited_document.id != document.id: # Pages were actually limited
16481684
# Classify the limited document
16491685
if self.classification_method == self.TEXTBASED_HOLISTIC:
1686+
logger.info(
1687+
f"Classifying limited document with {len(limited_document.pages)} pages using holistic packet method"
1688+
)
16501689
classified_limited = self.holistic_classify_document(
16511690
limited_document
16521691
)

patterns/pattern-2/template.yaml

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,6 @@ Parameters:
9191
- "false"
9292
Description: "Enable Human In The Loop (A2I) for document review"
9393

94-
95-
9694
SageMakerA2IReviewPortalURL:
9795
Type: String
9896
Default: ""
@@ -515,33 +513,38 @@ Resources:
515513
description: "Classification methodology to use"
516514
enum: ["multimodalPageLevelClassification", "textbasedHolisticClassification"]
517515
order: 2
516+
maxPagesForClassification:
517+
type: string
518+
description: "Number of pages to use for classification [When set to a number, forces the entire document to be assigned a single class]"
519+
enum: ["ALL", "1", "2", "3", "5", "10"]
520+
order: 3
518521
temperature:
519522
type: number
520523
minimum: 0
521524
maximum: 1
522525
description: Sampling temperature
523-
order: 3
526+
order: 4
524527
top_k:
525528
type: integer
526529
minimum: 1
527530
description: Sampling Top K
528-
order: 4
531+
order: 5
529532
top_p:
530533
type: number
531534
description: Sampling Top P
532-
order: 5
535+
order: 6
533536
max_tokens:
534537
type: number
535538
description: Max tokens
536-
order: 6
539+
order: 7
537540
system_prompt:
538541
type: string
539542
description: System prompt
540-
order: 7
543+
order: 8
541544
task_prompt:
542545
type: string
543546
description: Task prompt - include placeholders {CLASS_NAMES_AND_DESCRIPTIONS} (replaced with the class names and descriptions for all specified classes), {FEW_SHOT_EXAMPLES} (replaced by classPrompt and image data from examples in class definitions), {DOCUMENT_TEXT} (replaced by the OCR output), and for multi-modal classification {DOCUMENT_IMAGE} (replaced by the page image attachment). Optionally use <<CACHEPOINT>> to separate static and dynamic elements of prompt for Bedrock prompt caching.
544-
order: 8
547+
order: 9
545548
extraction:
546549
order: 4
547550
type: object

patterns/pattern-3/template.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,8 +105,6 @@ Parameters:
105105
- "false"
106106
Description: "Enable Human In The Loop (A2I) for document review"
107107

108-
109-
110108
SageMakerA2IReviewPortalURL:
111109
Type: String
112110
Default: ""

template.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,6 @@ Parameters:
114114
Description: >-
115115
Select the configuration preset for Pattern 1. Each configuration contains pre-tuned settings for specific document processing scenarios - see https://github.com/aws-samples/sample-genai-idp/blob/main/config_library/README.md. Note: This selected configuration will be replaced by the Custom Configuration Path if specified.
116116
117-
118-
119117
# Pattern 2 Parameters
120118

121119
Pattern2Configuration:

0 commit comments

Comments
 (0)