Merge branch 'feature/evaluation-inside-workflow' into 'develop'

rstrahan · rstrahan · commit 97d8001692c9 · 2025-10-26T16:48:27.000Z
Feature/evaluation inside workflow

See merge request genaiic-reusable-assets/engagement-artifacts/genaiic-idp-accelerator!378
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -29,6 +29,15 @@ SPDX-License-Identifier: MIT-0
 
 ### Changed
 
+- **Migrated Evaluation from EventBridge Trigger to Step Functions Workflow**
+  - Moved evaluation processing from external EventBridge-triggered Lambda to integrated Step Functions workflow step
+  - **Race Condition Eliminated**: Evaluation now runs inside the state machine before WorkflowTracker marks documents COMPLETED, preventing a premature completion status while evaluation is still running
+  - **Config-Driven Control**: Evaluation is now controlled by the `evaluation.enabled` configuration setting instead of a CloudFormation stack parameter, enabling runtime control without stack redeployment
+  - **Enhanced Status Tracking**: Added EVALUATING status to document processing pipeline for better visibility of evaluation progress
+  - **UI Improvements**: Added support for displaying EVALUATING status in processing flow viewer and "NOT ENABLED" badge when evaluation is disabled in configuration
+  - **Consistent Pattern**: Aligns evaluation with summarization and assessment patterns for unified feature control approach
+
+
 - **Migrated UI Build System from Create React App to Vite**
   - Upgraded to Vite 7 for faster build times
   - Updated to React 18, AWS Amplify v6, react-router-dom v6, and Cloudscape Design System
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-0.3.21-rc1
+0.3.21-rc2
diff --git a/config_library/pattern-1/lending-package-sample/config.yaml b/config_library/pattern-1/lending-package-sample/config.yaml
@@ -67,6 +67,7 @@ summarization:
   system_prompt: >-
     You are a document summarization expert who can analyze and summarize documents from various domains including medical, financial, legal, and general business documents. Your task is to create a summary that captures the key information, main points, and important details from the document. Your output must be in valid JSON format. \nSummarization Style: Balanced\\nCreate a balanced summary that provides a moderate level of detail. Include the main points and key supporting information, while maintaining the document's overall structure. Aim for a comprehensive yet concise summary.\n Your output MUST be in valid JSON format with markdown content. You MUST strictly adhere to the output format specified in the instructions.
 evaluation:
+  enabled: true
   llm_method:
     top_p: '0.1'
     max_tokens: '4096'
@@ -520,4 +521,3 @@ pricing:
     units:
       - name: gb_seconds
         price: '1.66667E-5'   # $0.0000166667 per GB-second ($16.67 per 1M GB-seconds)
-
diff --git a/config_library/pattern-2/bank-statement-sample/config.yaml b/config_library/pattern-2/bank-statement-sample/config.yaml
@@ -529,6 +529,7 @@ assessment:
     </extraction-results>
 
 evaluation:
+  enabled: true
   llm_method:
     top_p: '0.1'
     max_tokens: '4096'
@@ -997,4 +998,3 @@ pricing:
     units:
       - name: gb_seconds
         price: '1.66667E-5'   # $0.0000166667 per GB-second ($16.67 per 1M GB-seconds)
-
diff --git a/config_library/pattern-2/lending-package-sample/config.yaml b/config_library/pattern-2/lending-package-sample/config.yaml
@@ -1307,6 +1307,7 @@ assessment:
     {EXTRACTION_RESULTS}
     </extraction-results>
 evaluation:
+  enabled: true
   llm_method:
     top_p: "0.1"
     max_tokens: "4096"
@@ -1776,4 +1777,3 @@ pricing:
     units:
       - name: gb_seconds
         price: "1.66667E-5" # $0.0000166667 per GB-second ($16.67 per 1M GB-seconds)
-
diff --git a/config_library/pattern-2/rvl-cdip-package-sample-with-few-shot-examples/config.yaml b/config_library/pattern-2/rvl-cdip-package-sample-with-few-shot-examples/config.yaml
@@ -957,6 +957,7 @@ assessment:
     {EXTRACTION_RESULTS}
     </extraction-results>
 evaluation:
+  enabled: true
   llm_method:
     top_p: '0.1'
     max_tokens: '4096'
@@ -1509,4 +1510,4 @@ pricing:
   - name: lambda/duration  
     units:
       - name: gb_seconds
-        price: '1.66667E-5'   # $0.0000166667 per GB-second ($16.67 per 1M GB-seconds)
+        price: '1.66667E-5'   # $0.0000166667 per GB-second ($16.67 per 1M GB-seconds)
diff --git a/config_library/pattern-2/rvl-cdip-package-sample/config.yaml b/config_library/pattern-2/rvl-cdip-package-sample/config.yaml
@@ -766,6 +766,7 @@ assessment:
     {EXTRACTION_RESULTS}
     </extraction-results>
 evaluation:
+  enabled: true
   llm_method:
     top_p: '0.1'
     max_tokens: '4096'
@@ -1235,4 +1236,3 @@ pricing:
     units:
       - name: gb_seconds
         price: '1.66667E-5'   # $0.0000166667 per GB-second ($16.67 per 1M GB-seconds)
-
diff --git a/config_library/pattern-3/rvl-cdip-package-sample/config.yaml b/config_library/pattern-3/rvl-cdip-package-sample/config.yaml
@@ -625,6 +625,7 @@ assessment:
     {EXTRACTION_RESULTS}
     </extraction-results>
 evaluation:
+  enabled: true
   llm_method:
     top_p: '0.1'
     max_tokens: '4096'
diff --git a/docs/configuration.md b/docs/configuration.md
@@ -155,7 +155,6 @@ Key parameters that can be configured during CloudFormation deployment:
 
 ### Optional Features
 - `EvaluationBaselineBucketName`: Optional existing bucket for ground truth data
-- `EvaluationAutoEnabled`: Enable automatic accuracy evaluation (default: true)
 - `DocumentKnowledgeBase`: Enable document knowledge base functionality
 - `KnowledgeBaseModelId`: Bedrock model for knowledge base queries
 - `PostProcessingLambdaHookFunctionArn`: Optional Lambda ARN for custom post-processing (see [post-processing-lambda-hook.md](post-processing-lambda-hook.md) for detailed implementation guidance)
diff --git a/docs/evaluation.md b/docs/evaluation.md
@@ -16,9 +16,11 @@ The GenAIIDP solution includes a built-in evaluation framework to assess the acc
    - Use an existing bucket or let the solution create one
    - Can use outputs from another GenAIIDP stack to compare different patterns/prompts
 
-2. **Automatic Evaluation**
-   - When enabled, automatically evaluates each processed document
-   - Compares against baseline data if available
+2. **Integrated Evaluation Step**
+   - Evaluation runs as the final step in the Step Functions workflow (after summarization)
+   - Executes **before** the workflow marks documents as COMPLETED, eliminating race conditions
+   - When `evaluation.enabled: true` is set in the configuration, the step evaluates against baseline data if available
+   - When `evaluation.enabled: false` is set in the configuration, the step still executes but skips processing
    - Generates detailed markdown reports using AI analysis
 
 3. **Evaluation Reports**
@@ -79,21 +81,38 @@ The confidence integration is fully backward compatible:
 
 ## Configuration
 
-Set the following parameters during stack deployment:
+### Stack Deployment Parameters
+
+Set the following parameter during stack deployment:
 
 ```yaml
 EvaluationBaselineBucketName:
   Description: Existing bucket with baseline data, or leave empty to create new bucket
-  
-EvaluationAutoEnabled:
-  Default: true
-  Description: Automatically evaluate each document (if baseline exists)
-  
-EvaluationModelId:
-  Default: "anthropic.claude-3-sonnet-20240229-v1:0"
-  Description: Model to use for evaluation reports (e.g., "us.anthropic.claude-3-7-sonnet-20250219-v1:0")
 ```
 
+### Runtime Configuration
+
+Control evaluation behavior through the configuration file (no stack redeployment needed):
+
+```yaml
+evaluation:
+  enabled: true  # Set to false to disable evaluation processing
+  llm_method:
+    model: "us.anthropic.claude-3-haiku-20240307-v1:0"  # Model for evaluation reports
+    temperature: "0.0"
+    top_p: "0.1"
+    max_tokens: "4096"
+    # Additional model parameters...
+```
+
+**Benefits of Configuration-Based Control:**
+- Enable/disable evaluation without stack redeployment
+- Runtime control similar to summarization and assessment features
+- Zero LLM costs when disabled (step executes but skips processing)
+- Consistent feature control pattern across the solution
+
+### Attribute-Specific Evaluation Methods
+
 You can also configure evaluation methods for specific document classes and attributes through the solution's configuration. The framework supports three types of attributes with different evaluation approaches:
 
 ### Simple Attributes
diff --git a/docs/idp-cli.md b/docs/idp-cli.md
@@ -709,20 +709,10 @@ idp-cli run-inference \
 
 Download the evaluation results to analyze accuracy:
 
-**⏱️ Important Timing Note:** Evaluation processing runs as a separate step after the main document processing completes. This takes an additional 2-3 minutes per document. If you download results immediately after the batch shows "Complete", the evaluation data may not be ready yet.
-
-**Best practice:**
-1. Wait 5-10 minutes after batch completion before downloading evaluation results
-2. Check that the downloaded files include the `evaluation/` directory
-3. If evaluation data is missing, wait a few more minutes and download again
+**✓ Synchronous Evaluation:** Evaluation runs as the final step in the workflow before completion. When a document shows status "COMPLETED", all processing, including evaluation, is finished and results are immediately available for download.
 
 ```bash
-# Wait for evaluation to complete (check status)
-idp-cli status \
-    --stack-name eval-testing \
-    --batch-id eval-run-001
-
-# Download evaluation results
+# Download evaluation results (no waiting needed)
 idp-cli download-results \
     --stack-name eval-testing \
     --batch-id eval-run-001 \
diff --git a/docs/idp-configuration-best-practices.md b/docs/idp-configuration-best-practices.md
@@ -1362,16 +1362,10 @@ Set the following parameters during stack deployment:
 ```yaml
 EvaluationBaselineBucketName:
   Description: Existing bucket with baseline data, or leave empty to create new bucket
-  
-EvaluationAutoEnabled:
-  Default: true
-  Description: Automatically evaluate each document (if baseline exists)
-  
-EvaluationModelId:
-  Default: "anthropic.claude-3-sonnet-20240229-v1:0"
-  Description: Model to use for evaluation reports
 ```
 
+**Note:** Evaluation is now controlled via the configuration file (`evaluation.enabled: true/false`) rather than stack parameters. See the [evaluation.md](./evaluation.md) documentation for details.
+
 ### Evaluation Methods Configuration
 
 Configure evaluation methods for specific document classes and attributes:
diff --git a/lib/idp_common_pkg/idp_common/models.py b/lib/idp_common_pkg/idp_common/models.py
@@ -27,6 +27,7 @@ class Status(Enum):
     POSTPROCESSING = "POSTPROCESSING"  # Document summarization
     HITL_IN_PROGRESS = "HITL_IN_PROGRESS"  # Human-in-the-loop review in progress
     SUMMARIZING = "SUMMARIZING"  # Document summarization
+    EVALUATING = "EVALUATING"  # Document evaluation
     COMPLETED = "COMPLETED"  # All processing completed
     FAILED = "FAILED"  # Processing failed
 
diff --git a/patterns/pattern-1/statemachine/workflow.asl.json b/patterns/pattern-1/statemachine/workflow.asl.json
@@ -167,6 +167,33 @@
                     "BackoffRate": 2
                 }
             ],
+            "Next": "EvaluationStep"
+        },
+        "EvaluationStep": {
+            "Type": "Task",
+            "Resource": "${EvaluationLambdaArn}",
+            "Parameters": {
+                "execution_arn.$": "$$.Execution.Id",
+                "document.$": "$.Result.document"
+            },
+            "ResultPath": "$.Result",
+            "Retry": [
+                {
+                    "ErrorEquals": [
+                        "Lambda.ServiceException",
+                        "Lambda.AWSLambdaException",
+                        "Lambda.SdkClientException",
+                        "Lambda.TooManyRequestsException",
+                        "ServiceQuotaExceededException",
+                        "ThrottlingException",
+                        "ProvisionedThroughputExceededException",
+                        "RequestLimitExceeded"
+                    ],
+                    "IntervalSeconds": 2,
+                    "MaxAttempts": 10,
+                    "BackoffRate": 2
+                }
+            ],
             "Next": "WorkflowComplete"
         },
         "WorkflowComplete": {
diff --git a/patterns/pattern-1/template.yaml b/patterns/pattern-1/template.yaml
@@ -94,6 +94,10 @@ Parameters:
   ConfigLibraryHash:
     Type: String
     Description: "Hash token from config library to force updates when config library changes"
+
+  EvaluationFunctionArn:
+    Type: String
+    Description: "ARN of the Evaluation Lambda function"
     
   EnableHITL:
     Type: String
@@ -362,6 +366,11 @@ Resources:
             type: object
             sectionLabel: Evaluation Inference
             properties:
+              enabled:
+                type: boolean
+                description: Enable or disable evaluation processing
+                default: true
+                order: 0
               llm_method:
                 type: object
                 properties:
@@ -657,6 +666,7 @@ Resources:
         HITLWaitFunctionArn: !GetAtt HITLWaitFunction.Arn
         HITLStatusUpdateFunctionArn: !GetAtt HITLStatusUpdateFunction.Arn
         SummarizationLambdaArn: !GetAtt SummarizationFunction.Arn
+        EvaluationLambdaArn: !Ref EvaluationFunctionArn
         EnableHITL: !Ref EnableHITL
         OutputBucket: !Ref OutputBucket
         WorkingBucket: !Ref WorkingBucket
@@ -679,6 +689,10 @@ Resources:
             FunctionName: !Ref HITLWaitFunction
         - LambdaInvokePolicy:
             FunctionName: !Ref HITLStatusUpdateFunction
+        - Statement:
+            - Effect: Allow
+              Action: lambda:InvokeFunction
+              Resource: !Ref EvaluationFunctionArn
         - CloudWatchLogsFullAccess
 
   StateMachineLogGroup:
diff --git a/patterns/pattern-2/statemachine/workflow.asl.json b/patterns/pattern-2/statemachine/workflow.asl.json
@@ -252,6 +252,34 @@
                     "BackoffRate": 2
                 }
             ],
+            "Next": "EvaluationStep"
+        },
+        "EvaluationStep": {
+            "Type": "Task",
+            "Resource": "${EvaluationLambdaArn}",
+            "Parameters": {
+                "execution_arn.$": "$$.Execution.Id",
+                "document.$": "$"
+            },
+            "ResultPath": "$",
+            "Retry": [
+                {
+                    "ErrorEquals": [
+                        "Sandbox.Timedout",
+                        "Lambda.ServiceException",
+                        "Lambda.AWSLambdaException",
+                        "Lambda.SdkClientException",
+                        "Lambda.TooManyRequestsException",
+                        "ServiceQuotaExceededException",
+                        "ThrottlingException",
+                        "ProvisionedThroughputExceededException",
+                        "RequestLimitExceeded"
+                    ],
+                    "IntervalSeconds": 2,
+                    "MaxAttempts": 10,
+                    "BackoffRate": 2
+                }
+            ],
             "Next": "WorkflowComplete"
         },
         "WorkflowComplete": {
diff --git a/patterns/pattern-2/template.yaml b/patterns/pattern-2/template.yaml
@@ -100,6 +100,10 @@ Parameters:
     Type: String
     Description: "Hash token from config library to force updates when config library changes"
 
+  EvaluationFunctionArn:
+    Type: String
+    Description: "ARN of the Evaluation Lambda function"
+
   EnableXRayTracing:
     Type: String
     Default: 'false'
@@ -1155,6 +1159,11 @@ Resources:
             type: object
             sectionLabel: Evaluation Inference
             properties:
+              enabled:
+                type: boolean
+                description: Enable or disable evaluation processing
+                default: true
+                order: 0
               llm_method:
                 type: object
                 properties:
@@ -2404,6 +2413,7 @@ Resources:
         HITLWaitFunctionArn: !GetAtt HITLWaitFunction.Arn
         HITLStatusUpdateFunctionArn: !GetAtt HITLStatusUpdateFunction.Arn
         SummarizationLambdaArn: !GetAtt SummarizationFunction.Arn
+        EvaluationLambdaArn: !Ref EvaluationFunctionArn
         OutputBucket: !Ref OutputBucket
       Logging:
         Level: ALL
@@ -2429,6 +2439,10 @@ Resources:
             FunctionName: !Ref HITLStatusUpdateFunction
         - LambdaInvokePolicy:
             FunctionName: !Ref SummarizationFunction
+        - Statement:
+            - Effect: Allow
+              Action: lambda:InvokeFunction
+              Resource: !Ref EvaluationFunctionArn
         - CloudWatchLogsFullAccess
 
   StateMachineLogGroup:
diff --git a/patterns/pattern-3/statemachine/workflow.asl.json b/patterns/pattern-3/statemachine/workflow.asl.json
diff --git a/patterns/pattern-3/template.yaml b/patterns/pattern-3/template.yaml
diff --git a/src/lambda/evaluation_function/index.py b/src/lambda/evaluation_function/index.py
diff --git a/src/ui/src/components/step-function-flow/FlowDiagram.jsx b/src/ui/src/components/step-function-flow/FlowDiagram.jsx
diff --git a/src/ui/src/components/step-function-flow/StepDetails.jsx b/src/ui/src/components/step-function-flow/StepDetails.jsx
diff --git a/src/ui/src/components/step-function-flow/StepFunctionFlowViewer.jsx b/src/ui/src/components/step-function-flow/StepFunctionFlowViewer.jsx
diff --git a/template.yaml b/template.yaml