Commit 6624cf8
Author: Bob Strahan
refactor: rename Document utility methods for clarity and consistency
1 parent 4bb81b8, commit 6624cf8

18 files changed, +48 -48 lines changed

CHANGELOG.md
Lines changed: 7 additions & 7 deletions

@@ -13,26 +13,26 @@ SPDX-License-Identifier: MIT-0
 - Transparent handling of both compressed and uncompressed documents in Lambda functions
 - Temporary S3 storage for compressed document state with automatic cleanup via lifecycle policies
 - **New Utility Methods**:
-  - `Document.handle_input_document()`: Automatically detects and decompresses document input from Lambda events
-  - `Document.prepare_output()`: Automatically compresses large documents for Lambda responses
+  - `Document.load_document()`: Automatically detects and decompresses document input from Lambda events
+  - `Document.serialize_document()`: Automatically compresses large documents for Lambda responses
   - `Document.compress()` and `Document.decompress()`: Compression/decompression methods
-- **Lambda Function Integration**: All Pattern-2 and Pattern-3 Lambda functions updated to use compression utilities
+- **Lambda Function Integration**: All relevant Lambda functions updated to use compression utilities
 - **Resolves Step Functions Errors**: Eliminates "result with a size exceeding the maximum number of bytes service limit" errors for large multi-page documents
-- **Multi-Backend OCR Support**
+- **Multi-Backend OCR Support - Pattern 2 and 3**
   - Textract Backend (default): Existing AWS Textract functionality
   - Bedrock Backend: New LLM-based OCR using Claude/Nova models
   - None Backend: Image-only processing without OCR
-- **Bedrock OCR Integration**
+- **Bedrock OCR Integration - Pattern 2 and 3**
   - Customizable system and task prompts for OCR optimization
   - Better handling of complex documents, tables, and forms
   - Layout preservation capabilities
-- **Image Preprocessing**
+- **Image Preprocessing - Pattern 2 and 3**
   - Adaptive Binarization: Improves OCR accuracy on documents with:
     - Uneven lighting or shadows
     - Low contrast text
     - Background noise or gradients
   - Optional feature with configurable enable/disable
-- **YAML Parsing Support for LLM Responses**
+- **YAML Parsing Support for LLM Responses - Pattern 2 and 3**
   - Added comprehensive YAML parsing capabilities to complement existing JSON parsing functionality
   - New `extract_yaml_from_text()` function with robust multi-strategy YAML extraction:
     - YAML in ```yaml and ```yml code blocks
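
The multi-strategy extraction the changelog describes could look roughly like the sketch below. The strategies and signature are assumptions inferred from the bullet list, not the actual `extract_yaml_from_text()` implementation in this repository:

```python
import re

def extract_yaml_from_text(text):
    """Return the first YAML payload found in an LLM response, or None.

    Sketch of a multi-strategy extractor (assumed behavior, not the real code):
    try fenced ```yaml/```yml blocks first, then fall back to the whole text.
    """
    # Strategy 1: YAML inside ```yaml or ```yml code fences.
    match = re.search(r"```ya?ml\s*\n(.*?)```", text, re.DOTALL)
    if match:
        return match.group(1).strip()
    # Strategy 2: fall back to treating the entire response as YAML.
    stripped = text.strip()
    return stripped or None
```

A real implementation would likely validate each candidate with a YAML parser before returning it; this sketch only shows the candidate-selection order.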

lib/idp_common_pkg/README.md
Lines changed: 3 additions & 3 deletions

@@ -53,7 +53,7 @@ The Document model includes automatic compression support to handle large documents
 - **Automatic Compression**: Documents exceeding configurable size thresholds are automatically compressed to S3
 - **Transparent Handling**: Lambda functions seamlessly handle both compressed and uncompressed documents
 - **Section Preservation**: Section IDs are preserved in compressed payloads for Step Functions Map operations
-- **Utility Methods**: Simplified input/output handling with `handle_input_document()` and `prepare_output()`
+- **Utility Methods**: Simplified input/output handling with `load_document()` and `serialize_document()`
 
 ### Usage in Lambda Functions
 
@@ -65,7 +65,7 @@ def lambda_handler(event, context):
     working_bucket = os.environ.get('WORKING_BUCKET')
 
     # Handle input - automatically detects and decompresses if needed
-    document = Document.handle_input_document(
+    document = Document.load_document(
         event_data=event["document"],
         working_bucket=working_bucket,
         logger=logger
@@ -76,7 +76,7 @@ def lambda_handler(event, context):
 
     # Prepare output - automatically compresses if document is large
     response = {
-        "document": document.prepare_output(
+        "document": document.serialize_document(
             working_bucket=working_bucket,
             step_name="classification",
             logger=logger

lib/idp_common_pkg/idp_common/README.md
Lines changed: 4 additions & 4 deletions

@@ -181,14 +181,14 @@ document = Document.from_compressed_or_dict(data, working_bucket)
 
 ```python
 # Handle input - automatically detects and decompresses if needed
-document = Document.handle_input_document(
+document = Document.load_document(
     event_data=event["document"],
     working_bucket=working_bucket,
     logger=logger
 )
 
 # Prepare output - automatically compresses if document is large
-response_data = document.prepare_output(
+response_data = document.serialize_document(
     working_bucket=working_bucket,
     step_name="classification",
     logger=logger,
@@ -214,7 +214,7 @@ def lambda_handler(event, context):
     working_bucket = os.environ.get('WORKING_BUCKET')
 
     # Input handling - works with both compressed and uncompressed documents
-    document = Document.handle_input_document(
+    document = Document.load_document(
         event["document"], working_bucket, logger
     )
 
@@ -223,7 +223,7 @@ def lambda_handler(event, context):
 
     # Output handling - automatically compresses if needed
     return {
-        "document": document.prepare_output(working_bucket, "step_name", logger)
+        "document": document.serialize_document(working_bucket, "step_name", logger)
     }
 ```

lib/idp_common_pkg/idp_common/models.py
Lines changed: 6 additions & 6 deletions

@@ -152,15 +152,15 @@ class Document:
     - from_compressed_or_dict(): Handle both compressed and regular document data
 
     Utility Methods:
-    - handle_input_document(): Process document input from Lambda events
-    - prepare_output(): Prepare document output with automatic compression
+    - load_document(): Process document input from Lambda events
+    - serialize_document(): Prepare document output with automatic compression
 
     Usage Examples:
     # Handle input in Lambda functions
-    document = Document.handle_input_document(event_data, working_bucket, logger)
+    document = Document.load_document(event_data, working_bucket, logger)
 
     # Prepare output with automatic compression
-    response = {"document": document.prepare_output(working_bucket, "step_name", logger)}
+    response = {"document": document.serialize_document(working_bucket, "step_name", logger)}
 
     # Manual compression/decompression
     compressed_data = document.compress(working_bucket, "processing")
@@ -650,7 +650,7 @@ def from_compressed_or_dict(cls, data, bucket=None):
         return cls.from_dict(data)
 
     @classmethod
-    def handle_input_document(cls, event_data, working_bucket, logger=None):
+    def load_document(cls, event_data, working_bucket, logger=None):
         """
         Utility method to handle document input from Lambda events.
         Automatically handles both compressed and uncompressed documents.
@@ -672,7 +672,7 @@ def handle_input_document(cls, event_data, working_bucket, logger=None):
             logger.info("Loaded uncompressed document")
         return cls.from_dict(event_data)
 
-    def prepare_output(
+    def serialize_document(
         self, working_bucket, step_name, logger=None, size_threshold_kb=0
     ):
         """
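
For context, the dispatch that the renamed `load_document()` performs can be sketched as below. The compressed-payload shape and the inline decompression are assumptions for illustration only: the real method defers to `from_compressed_or_dict()`, reads the compressed state from S3, and returns a `Document` instance rather than a plain dict.

```python
import gzip
import json

def load_document(event_data, working_bucket, logger=None):
    """Sketch: return the document payload, decompressing if needed.

    Assumes a hypothetical "compressed" key carrying gzipped JSON bytes;
    the actual marker and S3 indirection in models.py may differ.
    """
    if isinstance(event_data, dict) and "compressed" in event_data:
        # Compressed path: inline bytes here for illustration, where the
        # real implementation would fetch the blob from working_bucket.
        raw = gzip.decompress(event_data["compressed"])
        if logger:
            logger.info("Loaded compressed document")
        return json.loads(raw)
    # Uncompressed path: the event already contains the document dict.
    if logger:
        logger.info("Loaded uncompressed document")
    return event_data
```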

patterns/pattern-2/src/assessment_function/index.py
Lines changed: 2 additions & 2 deletions

@@ -41,7 +41,7 @@ def handler(event, context):
 
     # Convert document data to Document object - handle compression
     working_bucket = os.environ.get('WORKING_BUCKET')
-    document = Document.handle_input_document(document_data, working_bucket, logger)
+    document = Document.load_document(document_data, working_bucket, logger)
    logger.info(f"Processing assessment for document {document.id}, section {section_id}")
 
     # Update document status to ASSESSING
@@ -72,7 +72,7 @@ def handler(event, context):
 
     # Prepare output with automatic compression if needed
     result = {
-        'document': updated_document.prepare_output(working_bucket, f"assessment_{section_id}", logger),
+        'document': updated_document.serialize_document(working_bucket, f"assessment_{section_id}", logger),
         'section_id': section_id
     }

patterns/pattern-2/src/classification_function/index.py
Lines changed: 2 additions & 2 deletions

@@ -34,7 +34,7 @@ def handler(event, context):
 
     # Extract document from the OCR result - handle both compressed and uncompressed
     working_bucket = os.environ.get('WORKING_BUCKET')
-    document = Document.handle_input_document(event["OCRResult"]["document"], working_bucket, logger)
+    document = Document.load_document(event["OCRResult"]["document"], working_bucket, logger)
 
     # Update document status to CLASSIFYING
     document.status = Status.CLASSIFYING
@@ -103,7 +103,7 @@ def handler(event, context):
 
     # Prepare output with automatic compression if needed
     response = {
-        "document": document.prepare_output(working_bucket, "classification", logger)
+        "document": document.serialize_document(working_bucket, "classification", logger)
     }
 
     logger.info(f"Response: {json.dumps(response, default=str)}")

patterns/pattern-2/src/extraction_function/index.py
Lines changed: 2 additions & 2 deletions

@@ -33,7 +33,7 @@ def handler(event, context):
     # For Map state, we get just one section from the document
     # Extract the document and section from the event - handle both compressed and uncompressed
     working_bucket = os.environ.get('WORKING_BUCKET')
-    full_document = Document.handle_input_document(event.get("document", {}), working_bucket, logger)
+    full_document = Document.load_document(event.get("document", {}), working_bucket, logger)
 
     # Get the section ID from the Map state input
     section_input = event.get("section", {})
@@ -97,7 +97,7 @@ def handler(event, context):
     # Prepare output with automatic compression if needed
     response = {
         "section_id": section_id,
-        "document": section_document.prepare_output(working_bucket, f"extraction_{section_id}", logger)
+        "document": section_document.serialize_document(working_bucket, f"extraction_{section_id}", logger)
     }
 
     logger.info(f"Response: {json.dumps(response, default=str)}")

patterns/pattern-2/src/ocr_function/index.py
Lines changed: 1 addition & 1 deletion

@@ -125,7 +125,7 @@ def handler(event, context):
     # Prepare output with automatic compression if needed
     working_bucket = os.environ.get('WORKING_BUCKET')
     response = {
-        "document": document.prepare_output(working_bucket, "ocr", logger)
+        "document": document.serialize_document(working_bucket, "ocr", logger)
     }
 
     logger.info(f"Response: {json.dumps(response, default=str)}")

patterns/pattern-2/src/processresults_function/index.py
Lines changed: 4 additions & 4 deletions

@@ -32,7 +32,7 @@ def handler(event, context):
     # Get the base document from the original classification result - handle both compressed and uncompressed
     working_bucket = os.environ.get('WORKING_BUCKET')
     classification_document_data = event.get("ClassificationResult", {}).get("document", {})
-    document = Document.handle_input_document(classification_document_data, working_bucket, logger)
+    document = Document.load_document(classification_document_data, working_bucket, logger)
 
     extraction_results = event.get("ExtractionResults", [])
 
@@ -51,11 +51,11 @@ def handler(event, context):
        # or extraction result if assessment is disabled
        assessment_document_data = result.get("AssessmentResult", {}).get("document", {})
        if assessment_document_data:
-            section_document = Document.handle_input_document(assessment_document_data, working_bucket, logger)
+            section_document = Document.load_document(assessment_document_data, working_bucket, logger)
        else:
            # No assessment result, try extraction result
            extraction_document_data = result.get("document", {})
-            section_document = Document.handle_input_document(extraction_document_data, working_bucket, logger)
+            section_document = Document.load_document(extraction_document_data, working_bucket, logger)
        if section_document:
            # Add section to document if present
            if section_document.sections:
@@ -80,7 +80,7 @@ def handler(event, context):
 
     # Return the completed document with compression
     response = {
-        "document": document.prepare_output(working_bucket, "processresults", logger)
+        "document": document.serialize_document(working_bucket, "processresults", logger)
     }
 
     logger.info(f"Response: {json.dumps(response, default=str)}")

patterns/pattern-2/src/summarization_function/index.py
Lines changed: 2 additions & 2 deletions

@@ -43,7 +43,7 @@ def handler(event, context):
 
     # Convert data to Document object - handle compression
     working_bucket = os.environ.get('WORKING_BUCKET')
-    document = Document.handle_input_document(document_data, working_bucket, logger)
+    document = Document.load_document(document_data, working_bucket, logger)
 
     # Update document status to SUMMARIZING
     document.status = Status.SUMMARIZING
@@ -74,7 +74,7 @@ def handler(event, context):
 
     # Prepare output with automatic compression if needed
     return {
-        'document': processed_document.prepare_output(working_bucket, "summarization", logger),
+        'document': processed_document.serialize_document(working_bucket, "summarization", logger),
     }
 
     except Exception as e:
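
Since this commit applies the same rename mechanically across 18 files, any downstream callers outside this repository need the identical rewrite. A hypothetical migration helper is sketched below; the rename mapping comes straight from the diff, while the helper's name and directory walk are assumptions:

```python
import re
from pathlib import Path

# Old-to-new method names, taken from this commit's diff.
RENAMES = {
    "handle_input_document": "load_document",
    "prepare_output": "serialize_document",
}

def migrate_source(text):
    """Apply the method renames, matching whole identifiers only."""
    for old, new in RENAMES.items():
        text = re.sub(rf"\b{old}\b", new, text)
    return text

def migrate_tree(root):
    """Rewrite all .py files under root in place; return how many changed."""
    changed = 0
    for path in Path(root).rglob("*.py"):
        original = path.read_text()
        updated = migrate_source(original)
        if updated != original:
            path.write_text(updated)
            changed += 1
    return changed
```

The word-boundary match avoids mangling identifiers that merely contain the old names; callers that built method names dynamically would still need manual review.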
