aws-solutions-library-samples
diff --git a/‎CHANGELOG.md‎
Lines changed: 17 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎lib/idp_common_pkg/idp_common/appsync/service.py‎
Lines changed: 2 additions & 0 deletions b/‎lib/idp_common_pkg/idp_common/appsync/service.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎lib/idp_common_pkg/idp_common/ocr/README.md‎
Lines changed: 23 additions & 22 deletions b/‎lib/idp_common_pkg/idp_common/ocr/README.md‎
Lines changed: 23 additions & 22 deletions
diff --git a/‎lib/idp_common_pkg/idp_common/ocr/service.py‎
Lines changed: 33 additions & 32 deletions b/‎lib/idp_common_pkg/idp_common/ocr/service.py‎
Lines changed: 33 additions & 32 deletions
diff --git a/‎lib/idp_common_pkg/tests/unit/ocr/test_ocr_service.py‎
Lines changed: 16 additions & 14 deletions b/‎lib/idp_common_pkg/tests/unit/ocr/test_ocr_service.py‎
Lines changed: 16 additions & 14 deletions
diff --git a/‎src/api/schema.graphql‎
Lines changed: 2 additions & 0 deletions b/‎src/api/schema.graphql‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/ui/src/components/common/map-document-attributes.js‎
Lines changed: 5 additions & 1 deletion b/‎src/ui/src/components/common/map-document-attributes.js‎
Lines changed: 5 additions & 1 deletion
@@ -7,6 +7,23 @@ SPDX-License-Identifier: MIT-0
 
 ### Added
 
+- **Text Confidence View for Document Pages**
+  - Added support for displaying OCR text confidence data through new `TextConfidenceUri` field
+  - New "Text Confidence View" option in the UI pages panel alongside existing Markdown and Text views
+  - Fixed issues with view persistence - Text Confidence View button now always visible with appropriate messaging when content unavailable
+  - Fixed view toggle behavior - switching between views no longer closes the viewer window
+  - Reordered view buttons to: Markdown View, Text Confidence View, Text View for better user experience
+
+### Changed
+- **Converted text confidence data format from JSON to markdown table for improved readability and reduced token usage**
+  - Removed unnecessary "page_count" field
+  - Changed "text_blocks" array to "text" field containing a markdown table with Text and Confidence columns
+  - Reduces prompt size for assessment service while improving UI readability
+  - OCR confidence values now rounded to one decimal place (e.g., 99.1, 87.3) for cleaner display
+  - Markdown table headers now explicitly left-aligned using `|:-----|:-----------|` format for consistent appearance
+
+
+
 ### Fixed
 
 
 
@@ -117,6 +117,7 @@ def _document_to_update_input(self, document: Document) -> Dict[str, Any]:
                     "Class": page.classification or "",
                     "ImageUri": page.image_uri or "",
                     "TextUri": page.parsed_text_uri or page.raw_text_uri or "",
+                    "TextConfidenceUri": page.text_confidence_uri or "",
                 }
                 pages_data.append(page_data)
 
@@ -290,6 +291,7 @@ def _appsync_to_document(self, appsync_data: Dict[str, Any]) -> Document:
                     page_id=page_id,
                     image_uri=page_data.get("ImageUri"),
                     raw_text_uri=page_data.get("TextUri"),
+                    text_confidence_uri=page_data.get("TextConfidenceUri"),
                     classification=page_data.get("Class"),
                 )
 
 
@@ -15,21 +15,21 @@ The service supports three OCR backends, each with different capabilities and us
 
 ### 1. Textract Backend (Default - Recommended for Assessment)
 - **Technology**: AWS Textract OCR service
-- **Confidence Data**: ✅ Full granular confidence scores per text block
+- **Confidence Data**: ✅ Full granular confidence scores per text line (displayed as markdown table)
 - **Features**: Basic text detection + enhanced document analysis (tables, forms, signatures, layout)
 - **Assessment Quality**: ⭐⭐⭐ Optimal - Real OCR confidence enables accurate assessment
 - **Use Cases**: Standard document processing, when assessment is enabled, production workflows
 
 ### 2. Bedrock Backend (LLM-based OCR)
 - **Technology**: Amazon Bedrock LLMs (Claude, Nova) for text extraction
-- **Confidence Data**: ❌ No confidence data (empty text_blocks array)
+- **Confidence Data**: ❌ No confidence data (displays "No confidence data available from LLM OCR")
 - **Features**: Advanced text understanding, better handling of challenging/degraded documents
 - **Assessment Quality**: ❌ No confidence data for assessment
 - **Use Cases**: Challenging documents where traditional OCR fails, specialized text extraction needs
 
 ### 3. None Backend (Image-only)
 - **Technology**: No OCR processing
-- **Confidence Data**: ❌ Empty confidence data
+- **Confidence Data**: ❌ No confidence data (displays "No OCR performed")
 - **Features**: Image extraction and storage only
 - **Assessment Quality**: ❌ No text confidence for assessment
 - **Use Cases**: Image-only workflows, custom OCR integration
@@ -104,36 +104,37 @@ The format varies by OCR backend:
 **Textract Backend (with confidence data):**
 ```json
 {
-  "page_count": 1,
-  "text_blocks": [
-    {
-      "text": "WESTERN DARK FIRED TOBACCO GROWERS' ASSOCIATION",
-      "confidence": 99.35,
-      "type": "PRINTED"
-    },
-    {
-      "text": "206 Maple Street",
-      "confidence": 91.41,
-      "type": "PRINTED"
-    }
-  ]
+  "text": "| Text | Confidence |\n|:-----|:-----------|\n| WESTERN DARK FIRED TOBACCO GROWERS' ASSOCIATION | 99.4 |\n| 206 Maple Street | 91.4 |\n| Murray, KY 42071 | 98.7 |"
+}
+```
+
+The `text` field contains a markdown table with two columns:
+- **Text**: The extracted text content (with pipe characters escaped as `\|`)
+- **Confidence**: OCR confidence score rounded to one decimal place
+- Lines that Textract classifies as handwriting carry a "(HANDWRITING)" suffix in the Text column
+
+**Bedrock Backend (no confidence data):**
+```json
+{
+  "text": "| Text | Confidence |\n|:-----|:------------|\n| *No confidence data available from LLM OCR* | N/A |"
 }
 ```
 
-**Bedrock/None Backend (no confidence data):**
+**None Backend (no OCR):**
 ```json
 {
-  "page_count": 1,
-  "text_blocks": []
+  "text": "| Text | Confidence |\n|:-----|:------------|\n| *No OCR performed* | N/A |"
 }
 ```
 
 ### Benefits
 
-- **80-90% token reduction** compared to raw Textract output
-- **Preserved assessment data**: Text content, OCR confidence scores, text type (PRINTED/HANDWRITING)
-- **Removed overhead**: Geometric data, relationships, block IDs, and verbose metadata
+- **85-95% token reduction** compared to raw Textract output (markdown table format is more compact than JSON)
+- **Preserved assessment data**: Text content, OCR confidence scores (rounded to one decimal place), and a "(HANDWRITING)" marker on handwritten lines (printed text is left unmarked)
+- **Removed overhead**: Geometric data, relationships, block IDs, verbose metadata, and unnecessary JSON syntax
+- **Improved readability**: Markdown table format is human-readable in both UI and assessment prompts
 - **Cost efficiency**: Significantly reduced LLM inference costs for assessment workflows
+- **UI compatibility**: Displays beautifully in the Text Confidence View using existing markdown rendering
 - **Automated generation**: Created during initial OCR processing, not repeatedly during assessment
 
 ### Usage in Assessment Prompts
 
@@ -622,10 +622,9 @@ def _process_single_page_bedrock(
         )
 
         # Generate and store text confidence data
-        # For Bedrock, we use empty confidence data since LLM OCR doesn't provide real confidence scores
+        # For Bedrock, we use empty markdown table since LLM OCR doesn't provide real confidence scores
         text_confidence_data = {
-            "page_count": 1,
-            "text_blocks": [],  # Empty - no confidence data available from LLM OCR
+            "text": "| Text | Confidence |\n|:-----|:------------|\n| *No confidence data available from LLM OCR* | N/A |"
         }
 
         text_confidence_key = f"{prefix}/pages/{page_id}/textConfidence.json"
@@ -703,8 +702,10 @@ def _process_single_page_none(
             content_type="application/json",
         )
 
-        # Generate minimal text confidence data (empty)
-        text_confidence_data = {"page_count": 1, "text_blocks": []}
+        # Generate minimal text confidence data (empty markdown table)
+        text_confidence_data = {
+            "text": "| Text | Confidence |\n|:-----|:------------|\n| *No OCR performed* | N/A |"
+        }
 
         text_confidence_key = f"{prefix}/pages/{page_id}/textConfidence.json"
         s3.write_content(
@@ -807,11 +808,9 @@ def _generate_text_confidence_data(
         """
         Generate text confidence data from raw OCR to reduce token usage while preserving essential information.
 
-        This method transforms verbose Textract output into a minimal format containing:
+        This method transforms verbose Textract output into a markdown table format containing:
         - Essential text content (LINE blocks only)
-        - OCR confidence scores
-        - Text type (PRINTED/HANDWRITING)
-        - Page count
+        - OCR confidence scores (rounded to 1 decimal point)
 
         Removes geometric data, relationships, block IDs, and other verbose metadata
         that aren't needed for assessment purposes.
@@ -820,29 +819,30 @@ def _generate_text_confidence_data(
             raw_ocr_data: Raw Textract API response
 
         Returns:
-            Text confidence data with ~80-90% token reduction
+            Text confidence data as markdown table with ~80-90% token reduction
         """
-        text_confidence_data = {
-            "page_count": raw_ocr_data.get("DocumentMetadata", {}).get("Pages", 1),
-            "text_blocks": [],
-        }
+        # Start building the markdown table with explicit left alignment
+        markdown_lines = ["| Text | Confidence |", "|:-----|:-----------|"]
 
         blocks = raw_ocr_data.get("Blocks", [])
 
         for block in blocks:
             if block.get("BlockType") == "LINE" and block.get("Text"):
-                text_block = {
-                    "text": block.get("Text", ""),
-                    "confidence": block.get("Confidence"),
-                }
+                text = block.get("Text", "").replace(
+                    "|", "\\|"
+                )  # Escape pipe characters
+                confidence = round(block.get("Confidence", 0.0), 1)
 
-                # Include text type if available (PRINTED vs HANDWRITING)
-                if "TextType" in block:
-                    text_block["type"] = block["TextType"]
+                # Add text type indicator if it's handwriting
+                if block.get("TextType") == "HANDWRITING":
+                    markdown_lines.append(f"| {text} (HANDWRITING) | {confidence} |")
+                else:
+                    markdown_lines.append(f"| {text} | {confidence} |")
 
-                text_confidence_data["text_blocks"].append(text_block)
+        # Join all lines into a single markdown string
+        markdown_table = "\n".join(markdown_lines)
 
-        return text_confidence_data
+        return {"text": markdown_table}
 
     def _parse_textract_response(
         self, response: Dict[str, Any], page_id: int = None
@@ -1070,15 +1070,16 @@ def _process_converted_page(
             content_type="application/json",
         )
 
-        # Generate text confidence data
-        text_confidence_data = {
-            "page_count": 1,
-            "text_blocks": [
-                {"text": line, "confidence": 99.0, "type": "PRINTED"}
-                for line in page_text.split("\n")
-                if line.strip()
-            ],
-        }
+        # Generate text confidence data as markdown table with explicit left alignment
+        markdown_lines = ["| Text | Confidence |", "|:-----|:-----------|"]
+        for line in page_text.split("\n"):
+            if line.strip():
+                # Escape pipe characters in text
+                escaped_line = line.replace("|", "\\|")
+                markdown_lines.append(f"| {escaped_line} | 99.0 |")
+
+        markdown_table = "\n".join(markdown_lines)
+        text_confidence_data = {"text": markdown_table}
 
         text_confidence_key = f"{prefix}/pages/{page_id}/textConfidence.json"
         s3.write_content(
 
@@ -528,20 +528,22 @@ def test_generate_text_confidence_data(self, mock_textract_response):
             service = OcrService()
             result = service._generate_text_confidence_data(mock_textract_response)
 
-            # Verify structure
-            assert "page_count" in result
-            assert "text_blocks" in result
-            assert result["page_count"] == 1
-            assert len(result["text_blocks"]) == 2  # Two LINE blocks
-
-            # Verify text blocks
-            assert result["text_blocks"][0]["text"] == "Sample text line 1"
-            assert result["text_blocks"][0]["confidence"] == 98.5
-            assert result["text_blocks"][0]["type"] == "PRINTED"
-
-            assert result["text_blocks"][1]["text"] == "Sample text line 2"
-            assert result["text_blocks"][1]["confidence"] == 97.2
-            assert result["text_blocks"][1]["type"] == "PRINTED"
+            # Verify structure - now returns markdown table in 'text' field
+            assert "text" in result
+            assert "page_count" not in result  # Removed in new format
+            assert "text_blocks" not in result  # Replaced with markdown table
+
+            # Verify markdown table content
+            markdown_table = result["text"]
+            lines = markdown_table.split("\n")
+
+            # Check header
+            assert lines[0] == "| Text | Confidence |"
+            assert lines[1] == "|:-----|:-----------|"
+
+            # Check data rows
+            assert lines[2] == "| Sample text line 1 | 98.5 |"
+            assert lines[3] == "| Sample text line 2 | 97.2 |"
 
     def test_parse_textract_response_markdown_success(self):
         """Test parsing Textract response to markdown successfully."""
 
@@ -46,6 +46,7 @@ type Page @aws_cognito_user_pools @aws_iam {
   Class: String
   ImageUri: String
   TextUri: String
+  TextConfidenceUri: String
 }
 
 type DocumentList @aws_cognito_user_pools @aws_iam {
@@ -125,6 +126,7 @@ input PageInput {
   Class: String
   ImageUri: String
   TextUri: String
+  TextConfidenceUri: String
 }
 
 type CopyToBaselineResponse @aws_cognito_user_pools {
 
@@ -88,7 +88,11 @@ const mapDocumentsAttributes = (documents) => {
       workflowStatus,
       duration: getDuration(completionTime, initialEventTime),
       sections,
-      pages,
+      pages:
+        pages?.map((page) => ({
+          ...page,
+          TextConfidenceUri: page.TextConfidenceUri || null,
+        })) || [],
       pageCount,
       metering,
       evaluationReportUri,
Original file line number	Diff line number	Diff line change
`@@ -117,6 +117,7 @@ def _document_to_update_input(self, document: Document) -> Dict[str, Any]:`
`117`	`117`	`"Class": page.classification or "",`
`118`	`118`	`"ImageUri": page.image_uri or "",`
`119`	`119`	`"TextUri": page.parsed_text_uri or page.raw_text_uri or "",`
	`120`	`+ "TextConfidenceUri": page.text_confidence_uri or "",`
`120`	`121`	`}`
`121`	`122`	`pages_data.append(page_data)`
`122`	`123`
`@@ -290,6 +291,7 @@ def _appsync_to_document(self, appsync_data: Dict[str, Any]) -> Document:`
`290`	`291`	`page_id=page_id,`
`291`	`292`	`image_uri=page_data.get("ImageUri"),`
`292`	`293`	`raw_text_uri=page_data.get("TextUri"),`
	`294`	`+ text_confidence_uri=page_data.get("TextConfidenceUri"),`
`293`	`295`	`classification=page_data.get("Class"),`
`294`	`296`	`)`
`295`	`297`
Original file line number	Diff line number	Diff line change
`@@ -46,6 +46,7 @@ type Page @aws_cognito_user_pools @aws_iam {`
`46`	`46`	`Class: String`
`47`	`47`	`ImageUri: String`
`48`	`48`	`TextUri: String`
	`49`	`+ TextConfidenceUri: String`
`49`	`50`	`}`
`50`	`51`
`51`	`52`	`type DocumentList @aws_cognito_user_pools @aws_iam {`
`@@ -125,6 +126,7 @@ input PageInput {`
`125`	`126`	`Class: String`
`126`	`127`	`ImageUri: String`
`127`	`128`	`TextUri: String`
	`129`	`+ TextConfidenceUri: String`
`128`	`130`	`}`
`129`	`131`
`130`	`132`	`type CopyToBaselineResponse @aws_cognito_user_pools {`