Enhance reporting data type handling and optimize dependencies

Bob Strahan · Bob Strahan · commit 79af4325be73 · 2025-07-11T20:54:41.000Z
diff --git a/lib/idp_common_pkg/idp_common/reporting/save_reporting_data.py b/lib/idp_common_pkg/idp_common/reporting/save_reporting_data.py
@@ -147,18 +147,47 @@ def _infer_pyarrow_type(self, value: Any) -> pa.DataType:
         else:
             return pa.string()  # Default to string for unknown types
 
+    def _convert_value_to_string(self, value: Any) -> Optional[str]:
+        """
+        Convert any value to string, handling special cases for robust type compatibility.
+
+        Args:
+            value: The value to convert
+
+        Returns:
+            String representation of the value, or None if input is None
+        """
+        if value is None:
+            return None
+        elif isinstance(value, bytes):
+            # Handle binary data
+            try:
+                return value.decode("utf-8")
+            except UnicodeDecodeError:
+                # If the bytes are not valid UTF-8, fall back to a hex string
+                return value.hex()
+        elif isinstance(value, (list, dict)):
+            return json.dumps(value)
+        elif isinstance(value, datetime.datetime):
+            return value.isoformat()
+        elif isinstance(value, (int, float, bool)):
+            return str(value)
+        else:
+            return str(value)
+
     def _flatten_json_data(
         self, data: Dict[str, Any], prefix: str = ""
     ) -> Dict[str, Any]:
         """
-        Flatten nested JSON data with dot notation.
+        Flatten nested JSON data with dot notation and convert all values to strings
+        for robust type compatibility.
 
         Args:
             data: The JSON data to flatten
             prefix: Prefix for nested keys
 
         Returns:
-            Flattened dictionary
+            Flattened dictionary with all values converted to strings
         """
         flattened = {}
 
@@ -172,7 +201,8 @@ def _flatten_json_data(
                 # Convert lists to JSON strings
                 flattened[new_key] = json.dumps(value) if value else None
             else:
-                flattened[new_key] = value
+                # Convert all values to strings for type consistency
+                flattened[new_key] = self._convert_value_to_string(value)
 
         return flattened
 
@@ -217,6 +247,64 @@ def _create_dynamic_schema(self, records: List[Dict[str, Any]]) -> pa.Schema:
 
         return pa.schema(schema_fields)
 
+    def _sanitize_records_for_schema(
+        self, records: List[Dict[str, Any]], schema: pa.Schema
+    ) -> List[Dict[str, Any]]:
+        """
+        Sanitize records to ensure they conform to the schema and handle type compatibility issues.
+
+        Args:
+            records: List of record dictionaries
+            schema: PyArrow schema to conform to
+
+        Returns:
+            List of sanitized records
+        """
+        sanitized_records = []
+
+        for record in records:
+            sanitized_record = {}
+
+            # Process each field in the schema
+            for field in schema:
+                field_name = field.name
+                value = record.get(field_name)
+
+                if value is None:
+                    sanitized_record[field_name] = None
+                elif field.type == pa.string():
+                    # Convert all values to strings for string fields
+                    sanitized_record[field_name] = self._convert_value_to_string(value)
+                elif field.type == pa.timestamp("ms"):
+                    # Handle timestamp fields
+                    if isinstance(value, datetime.datetime):
+                        sanitized_record[field_name] = value
+                    else:
+                        # Try to parse string timestamps
+                        try:
+                            if isinstance(value, str):
+                                sanitized_record[field_name] = (
+                                    datetime.datetime.fromisoformat(
+                                        value.replace("Z", "+00:00")
+                                    )
+                                )
+                            else:
+                                sanitized_record[field_name] = None
+                        except (ValueError, TypeError):
+                            sanitized_record[field_name] = None
+                else:
+                    # For any other types, convert to string as fallback
+                    sanitized_record[field_name] = self._convert_value_to_string(value)
+
+            # Add any fields from the record that aren't in the schema (shouldn't happen with dynamic schema)
+            for field_name, value in record.items():
+                if field_name not in sanitized_record:
+                    sanitized_record[field_name] = self._convert_value_to_string(value)
+
+            sanitized_records.append(sanitized_record)
+
+        return sanitized_records
+
     def save(self, document: Document, data_to_save: List[str]) -> List[Dict[str, Any]]:
         """
         Save document data based on the data_to_save list.
@@ -747,40 +835,10 @@ def save_document_sections(self, document: Document) -> Optional[Dict[str, Any]]
                 # Create dynamic schema for this section's data
                 schema = self._create_dynamic_schema(section_records)
 
-                # Ensure all records conform to the schema by filling missing fields and converting types
-                # With conservative typing, most fields will be strings to prevent type conflicts
-                for record in section_records:
-                    for field in schema:
-                        field_name = field.name
-                        if field_name not in record:
-                            record[field_name] = None
-                        else:
-                            # Convert values to match the expected schema types
-                            value = record[field_name]
-                            if value is not None:
-                                if field.type == pa.string():
-                                    # Convert all values to strings for consistency
-                                    record[field_name] = str(value)
-                                elif field.type == pa.timestamp("ms"):
-                                    # Keep timestamps as datetime objects
-                                    if isinstance(value, datetime.datetime):
-                                        record[field_name] = value
-                                    else:
-                                        # Try to parse string timestamps
-                                        try:
-                                            if isinstance(value, str):
-                                                record[field_name] = (
-                                                    datetime.datetime.fromisoformat(
-                                                        value.replace("Z", "+00:00")
-                                                    )
-                                                )
-                                            else:
-                                                record[field_name] = None
-                                        except (ValueError, TypeError):
-                                            record[field_name] = None
-                                else:
-                                    # For any other types, convert to string as fallback
-                                    record[field_name] = str(value)
+                # Sanitize all records to ensure robust type compatibility
+                section_records = self._sanitize_records_for_schema(
+                    section_records, schema
+                )
 
                 # Create S3 key with separate tables for each section type
                 # document_sections/{section_type}/date={date}/{escaped_doc_id}_section_{section_id}.parquet
diff --git a/lib/idp_common_pkg/tests/unit/reporting/test_save_reporting_data.py b/lib/idp_common_pkg/tests/unit/reporting/test_save_reporting_data.py
@@ -275,7 +275,7 @@ def test_flatten_json_data(self, mock_s3_client):
             "customer.address.street": "123 Main St",
             "customer.address.city": "Anytown",
             "items": '["item1", "item2"]',
-            "total": 150.75,
+            "total": "150.75",  # Now converted to string for type consistency
         }
 
         assert flattened == expected
diff --git a/patterns/pattern-1/src/bda_invoke_function/requirements.txt b/patterns/pattern-1/src/bda_invoke_function/requirements.txt
@@ -1,2 +1 @@
-boto3>=1.37.4
 ../../lib/idp_common_pkg  # common utilities package
diff --git a/patterns/pattern-1/src/hitl-process-function/requirements.txt b/patterns/pattern-1/src/hitl-process-function/requirements.txt
@@ -1,2 +1 @@
-boto3>=1.37.4
 ../../lib/idp_common_pkg  # common utilities package
diff --git a/template.yaml b/template.yaml
@@ -1607,7 +1607,7 @@ Resources:
             "Partitions": { "AddOrUpdateBehavior": "InheritFromTable" },
             "Tables": { "AddOrUpdateBehavior": "MergeNewColumns" }
           },
-          "Grouping": { "TableLevelConfiguration": 2 },
+          "Grouping": { "TableLevelConfiguration": 3 },
           "CreatePartitionIndex": true
         }
       Schedule: !If

Original file line number	Diff line number	Diff line change
`@@ -275,7 +275,7 @@ def test_flatten_json_data(self, mock_s3_client):`
`275`	`275`	`"customer.address.street": "123 Main St",`
`276`	`276`	`"customer.address.city": "Anytown",`
`277`	`277`	`"items": '["item1", "item2"]',`
`278`		`- "total": 150.75,`
	`278`	`+ "total": "150.75", # Now converted to string for type consistency`
`279`	`279`	`}`
`280`	`280`
`281`	`281`	`assert flattened == expected`
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1 @@`
`1`		`-boto3>=1.37.4`
`2`	`1`	`../../lib/idp_common_pkg # common utilities package`
Original file line number	Diff line number	Diff line change
`@@ -1607,7 +1607,7 @@ Resources:`
`1607`	`1607`	`"Partitions": { "AddOrUpdateBehavior": "InheritFromTable" },`
`1608`	`1608`	`"Tables": { "AddOrUpdateBehavior": "MergeNewColumns" }`
`1609`	`1609`	`},`
`1610`		`- "Grouping": { "TableLevelConfiguration": 2 },`
	`1610`	`+ "Grouping": { "TableLevelConfiguration": 3 },`
`1611`	`1611`	`"CreatePartitionIndex": true`
`1612`	`1612`	`}`
`1613`	`1613`	`Schedule: !If`