aws-solutions-library-samples
diff --git a/config_library/pattern-1/lending-package-sample/config.yaml b/config_library/pattern-1/lending-package-sample/config.yaml
Lines changed: 102 additions & 0 deletions
diff --git a/config_library/pattern-2/bank-statement-sample/config.yaml b/config_library/pattern-2/bank-statement-sample/config.yaml
Lines changed: 103 additions & 1 deletion
diff --git a/config_library/pattern-2/criteria-validation/config.yaml b/config_library/pattern-2/criteria-validation/config.yaml
Lines changed: 103 additions & 1 deletion
@@ -215,3 +215,105 @@ pricing:
         price: '1.5E-6'
       - name: cacheWriteInputTokens
         price: '1.875E-5'
+discovery:  # Discovery settings: an output-format sample plus prompt/model settings with and without ground truth.
+  output_format:  # NOTE(review): presumably appended after the prompts' "using the below JSON format:" line - confirm in the consumer.
+    sample_json: |-  # literal block, JSON kept verbatim: one "group" entry (groupAttributes) and one "list" entry (listItemTemplate.itemAttributes)
+      {
+          "document_class" : "Form-1040",
+          "document_description" : "Brief summary of the document",
+          "groups" : [
+              {
+                  "name" : "PersonalInformation",
+                  "description" : "Personal information of Tax payer",
+                  "attributeType" : "group",
+                  "groupAttributes" : [
+                      {
+                          "name": "FirstName",
+                          "dataType" : "string",
+                          "description" : "First Name of Taxpayer"
+                      },
+                      {
+                          "name": "Age",
+                          "dataType" : "number",
+                          "description" : "Age of Taxpayer"
+                      }
+                  ]
+              },
+              {
+                  "name" : "Dependents",
+                  "description" : "Dependents of taxpayer",
+                  "attributeType" : "list",
+                  "listItemTemplate": {
+                      "itemAttributes" : [
+                          {
+                              "name": "FirstName",
+                              "dataType" : "string",
+                              "description" : "Dependent first name"
+                          },
+                          {
+                              "name": "Age",
+                              "dataType" : "number",
+                              "description" : "Dependent Age"
+                          }
+                      ]
+                  }
+              }
+          ]
+      }
+  # Model and prompt settings for discovery when a ground-truth JSON is supplied; {ground_truth_json} is presumably filled in by the consumer - confirm.
+  # NOTE(review): this prompt names field_description/data_type, while output_format.sample_json uses description/dataType - confirm which the parser expects.
+  with_ground_truth:
+    top_p: '0.1'
+    temperature: '1.0'
+    user_prompt: >-  # folded scalar: newlines become spaces, so trailing spaces on a line end up inside the prompt
+      This image contains unstructured data. Analyze the data line by line using the provided ground truth as reference.
+      <GROUND_TRUTH_REFERENCE>
+      {ground_truth_json}
+      </GROUND_TRUTH_REFERENCE>
+      Ground truth reference JSON has the fields we are interested in extracting from the document/image. Use the ground truth to optimize field extraction. Match field names, data types, and groupings from the reference.
+      Image may contain multiple pages, process all pages.
+      Extract all field names including those without values.
+      Do not change the group name and field name from ground truth in the extracted data json.
+      Add field_description field for every field which will contain instruction to LLM to extract the field data from the image/document. Add data_type field for every field.
+      Add two fields document_class and document_description.
+      For document_class generate a short name based on the document content like W4, I-9, Paystub.
+      For document_description generate a description about the document in less than 50 words.
+      If the group repeats and follows table format, update the attributeType as "list".
+      Do not extract the values.
+      Format the extracted groups and fields using the below JSON format:
+    model_id: us.amazon.nova-pro-v1:0
+    system_prompt: >-
+      You are an expert in processing forms. Extracting data from images and
+      documents. Use provided ground truth data as reference to optimize field
+      extraction and ensure consistency with expected document structure and
+      field definitions.
+    max_tokens: '10000'
+  # Model and prompt settings for discovery without ground truth. NOTE(review): the prompt asks for field_name/data_type/field_description with '-' separators, while output_format.sample_json uses name/dataType/description - confirm.
+  without_ground_truth:
+    top_p: '0.1'
+    temperature: '1.0'
+    user_prompt: >-  # folded scalar: newlines become spaces (the blank line below yields one real newline), so each sentence needs its own period
+      This image contains forms data. Analyze the form line by line.
+      Image may contain multiple pages, process all the pages.
+      Form may contain multiple name-value pairs in one line.
+      Extract all the names in the form, including name-value pairs that have no value.
+      Organize them into groups, extract field_name, data_type and field_description.
+      Field_name should be less than 60 characters and should not contain spaces; use '-' instead of a space.
+      field_description is a brief description of the field and the location of the field like box number or line number in the form and section of the form.
+      Field_name should be unique within the group.
+      Add two fields document_class and document_description.
+      For document_class generate a short name based on the document content like W4, I-9, Paystub.
+      For document_description generate a description about the document in less than 50 words.
+
+      Group the fields based on the section they are grouped in the form. Group should have attributeType as "group".
+      If the group repeats and follows table format, update the attributeType as "list".
+      Do not extract the values.
+      Return the extracted data in JSON format.
+      Format the extracted groups and fields using the below JSON format:
+    model_id: us.amazon.nova-pro-v1:0
+    system_prompt: >-
+      You are an expert in processing forms. Extracting data from images and
+      documents. Analyze forms line by line to identify field names, data types,
+      and organizational structure. Focus on creating comprehensive blueprints
+      for document processing without extracting actual values.
+    max_tokens: '10000'
@@ -371,7 +371,7 @@ summarization:
 
 assessment:
   enabled: true
-  validation_enabled: true
+  validation_enabled: false
   image:
     target_height: ''
     target_width: ''
@@ -692,3 +692,105 @@ pricing:
         price: '1.5E-6'
       - name: cacheWriteInputTokens
         price: '1.875E-5'
+discovery:  # Discovery settings: an output-format sample plus prompt/model settings with and without ground truth.
+  output_format:  # NOTE(review): presumably appended after the prompts' "using the below JSON format:" line - confirm in the consumer.
+    sample_json: |-  # literal block, JSON kept verbatim: one "group" entry (groupAttributes) and one "list" entry (listItemTemplate.itemAttributes)
+      {
+          "document_class" : "Form-1040",
+          "document_description" : "Brief summary of the document",
+          "groups" : [
+              {
+                  "name" : "PersonalInformation",
+                  "description" : "Personal information of Tax payer",
+                  "attributeType" : "group",
+                  "groupAttributes" : [
+                      {
+                          "name": "FirstName",
+                          "dataType" : "string",
+                          "description" : "First Name of Taxpayer"
+                      },
+                      {
+                          "name": "Age",
+                          "dataType" : "number",
+                          "description" : "Age of Taxpayer"
+                      }
+                  ]
+              },
+              {
+                  "name" : "Dependents",
+                  "description" : "Dependents of taxpayer",
+                  "attributeType" : "list",
+                  "listItemTemplate": {
+                      "itemAttributes" : [
+                          {
+                              "name": "FirstName",
+                              "dataType" : "string",
+                              "description" : "Dependent first name"
+                          },
+                          {
+                              "name": "Age",
+                              "dataType" : "number",
+                              "description" : "Dependent Age"
+                          }
+                      ]
+                  }
+              }
+          ]
+      }
+  # Model and prompt settings for discovery when a ground-truth JSON is supplied; {ground_truth_json} is presumably filled in by the consumer - confirm.
+  # NOTE(review): this prompt names field_description/data_type, while output_format.sample_json uses description/dataType - confirm which the parser expects.
+  with_ground_truth:
+    top_p: '0.1'
+    temperature: '1.0'
+    user_prompt: >-  # folded scalar: newlines become spaces, so trailing spaces on a line end up inside the prompt
+      This image contains unstructured data. Analyze the data line by line using the provided ground truth as reference.
+      <GROUND_TRUTH_REFERENCE>
+      {ground_truth_json}
+      </GROUND_TRUTH_REFERENCE>
+      Ground truth reference JSON has the fields we are interested in extracting from the document/image. Use the ground truth to optimize field extraction. Match field names, data types, and groupings from the reference.
+      Image may contain multiple pages, process all pages.
+      Extract all field names including those without values.
+      Do not change the group name and field name from ground truth in the extracted data json.
+      Add field_description field for every field which will contain instruction to LLM to extract the field data from the image/document. Add data_type field for every field.
+      Add two fields document_class and document_description.
+      For document_class generate a short name based on the document content like W4, I-9, Paystub.
+      For document_description generate a description about the document in less than 50 words.
+      If the group repeats and follows table format, update the attributeType as "list".
+      Do not extract the values.
+      Format the extracted groups and fields using the below JSON format:
+    model_id: us.amazon.nova-pro-v1:0
+    system_prompt: >-
+      You are an expert in processing forms. Extracting data from images and
+      documents. Use provided ground truth data as reference to optimize field
+      extraction and ensure consistency with expected document structure and
+      field definitions.
+    max_tokens: '10000'
+  # Model and prompt settings for discovery without ground truth. NOTE(review): the prompt asks for field_name/data_type/field_description with '-' separators, while output_format.sample_json uses name/dataType/description - confirm.
+  without_ground_truth:
+    top_p: '0.1'
+    temperature: '1.0'
+    user_prompt: >-  # folded scalar: newlines become spaces (the blank line below yields one real newline), so each sentence needs its own period
+      This image contains forms data. Analyze the form line by line.
+      Image may contain multiple pages, process all the pages.
+      Form may contain multiple name-value pairs in one line.
+      Extract all the names in the form, including name-value pairs that have no value.
+      Organize them into groups, extract field_name, data_type and field_description.
+      Field_name should be less than 60 characters and should not contain spaces; use '-' instead of a space.
+      field_description is a brief description of the field and the location of the field like box number or line number in the form and section of the form.
+      Field_name should be unique within the group.
+      Add two fields document_class and document_description.
+      For document_class generate a short name based on the document content like W4, I-9, Paystub.
+      For document_description generate a description about the document in less than 50 words.
+
+      Group the fields based on the section they are grouped in the form. Group should have attributeType as "group".
+      If the group repeats and follows table format, update the attributeType as "list".
+      Do not extract the values.
+      Return the extracted data in JSON format.
+      Format the extracted groups and fields using the below JSON format:
+    model_id: us.amazon.nova-pro-v1:0
+    system_prompt: >-
+      You are an expert in processing forms. Extracting data from images and
+      documents. Analyze forms line by line to identify field names, data types,
+      and organizational structure. Focus on creating comprehensive blueprints
+      for document processing without extracting actual values.
+    max_tokens: '10000'
@@ -4,7 +4,7 @@
 notes: Criteria validation configuration for healthcare/insurance prior authorization
 assessment:
   enabled: true
-  validation_enabled: true
+  validation_enabled: false
 criteria_validation:
   model: us.anthropic.claude-3-5-sonnet-20240620-v1:0
   temperature: 0.0
@@ -212,3 +212,105 @@ pricing:
         price: 0.0000032
       - name: cacheReadInputTokens
         price: 0.0000002
+discovery:  # Discovery settings: an output-format sample plus prompt/model settings with and without ground truth.
+  output_format:  # NOTE(review): presumably appended after the prompts' "using the below JSON format:" line - confirm in the consumer.
+    sample_json: |-  # literal block, JSON kept verbatim: one "group" entry (groupAttributes) and one "list" entry (listItemTemplate.itemAttributes)
+      {
+          "document_class" : "Form-1040",
+          "document_description" : "Brief summary of the document",
+          "groups" : [
+              {
+                  "name" : "PersonalInformation",
+                  "description" : "Personal information of Tax payer",
+                  "attributeType" : "group",
+                  "groupAttributes" : [
+                      {
+                          "name": "FirstName",
+                          "dataType" : "string",
+                          "description" : "First Name of Taxpayer"
+                      },
+                      {
+                          "name": "Age",
+                          "dataType" : "number",
+                          "description" : "Age of Taxpayer"
+                      }
+                  ]
+              },
+              {
+                  "name" : "Dependents",
+                  "description" : "Dependents of taxpayer",
+                  "attributeType" : "list",
+                  "listItemTemplate": {
+                      "itemAttributes" : [
+                          {
+                              "name": "FirstName",
+                              "dataType" : "string",
+                              "description" : "Dependent first name"
+                          },
+                          {
+                              "name": "Age",
+                              "dataType" : "number",
+                              "description" : "Dependent Age"
+                          }
+                      ]
+                  }
+              }
+          ]
+      }
+  # Model and prompt settings for discovery when a ground-truth JSON is supplied; {ground_truth_json} is presumably filled in by the consumer - confirm.
+  # NOTE(review): this prompt names field_description/data_type, while output_format.sample_json uses description/dataType - confirm which the parser expects.
+  with_ground_truth:
+    top_p: '0.1'
+    temperature: '1.0'
+    user_prompt: >-  # folded scalar: newlines become spaces, so trailing spaces on a line end up inside the prompt
+      This image contains unstructured data. Analyze the data line by line using the provided ground truth as reference.
+      <GROUND_TRUTH_REFERENCE>
+      {ground_truth_json}
+      </GROUND_TRUTH_REFERENCE>
+      Ground truth reference JSON has the fields we are interested in extracting from the document/image. Use the ground truth to optimize field extraction. Match field names, data types, and groupings from the reference.
+      Image may contain multiple pages, process all pages.
+      Extract all field names including those without values.
+      Do not change the group name and field name from ground truth in the extracted data json.
+      Add field_description field for every field which will contain instruction to LLM to extract the field data from the image/document. Add data_type field for every field.
+      Add two fields document_class and document_description.
+      For document_class generate a short name based on the document content like W4, I-9, Paystub.
+      For document_description generate a description about the document in less than 50 words.
+      If the group repeats and follows table format, update the attributeType as "list".
+      Do not extract the values.
+      Format the extracted groups and fields using the below JSON format:
+    model_id: us.amazon.nova-pro-v1:0
+    system_prompt: >-
+      You are an expert in processing forms. Extracting data from images and
+      documents. Use provided ground truth data as reference to optimize field
+      extraction and ensure consistency with expected document structure and
+      field definitions.
+    max_tokens: '10000'
+  # Model and prompt settings for discovery without ground truth. NOTE(review): the prompt asks for field_name/data_type/field_description with '-' separators, while output_format.sample_json uses name/dataType/description - confirm.
+  without_ground_truth:
+    top_p: '0.1'
+    temperature: '1.0'
+    user_prompt: >-  # folded scalar: newlines become spaces (the blank line below yields one real newline), so each sentence needs its own period
+      This image contains forms data. Analyze the form line by line.
+      Image may contain multiple pages, process all the pages.
+      Form may contain multiple name-value pairs in one line.
+      Extract all the names in the form, including name-value pairs that have no value.
+      Organize them into groups, extract field_name, data_type and field_description.
+      Field_name should be less than 60 characters and should not contain spaces; use '-' instead of a space.
+      field_description is a brief description of the field and the location of the field like box number or line number in the form and section of the form.
+      Field_name should be unique within the group.
+      Add two fields document_class and document_description.
+      For document_class generate a short name based on the document content like W4, I-9, Paystub.
+      For document_description generate a description about the document in less than 50 words.
+
+      Group the fields based on the section they are grouped in the form. Group should have attributeType as "group".
+      If the group repeats and follows table format, update the attributeType as "list".
+      Do not extract the values.
+      Return the extracted data in JSON format.
+      Format the extracted groups and fields using the below JSON format:
+    model_id: us.amazon.nova-pro-v1:0
+    system_prompt: >-
+      You are an expert in processing forms. Extracting data from images and
+      documents. Analyze forms line by line to identify field names, data types,
+      and organizational structure. Focus on creating comprehensive blueprints
+      for document processing without extracting actual values.
+    max_tokens: '10000'