Skip to content

Commit ab52acb

Browse files
committed
config and example fixes backend
1 parent da5c90b commit ab52acb

File tree

10 files changed

+350
-369
lines changed

10 files changed

+350
-369
lines changed

lib/idp_common_pkg/idp_common/classification/service.py

Lines changed: 2 additions & 134 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
)
4545
from idp_common.models import Document, Section, Status
4646
from idp_common.utils import extract_json_from_text, extract_structured_data_from_text
47+
from idp_common.utils.few_shot_example_builder import build_few_shot_examples_content
4748

4849
logger = logging.getLogger(__name__)
4950

@@ -865,7 +866,7 @@ def _build_content(
865866
content.extend(before_examples_content)
866867

867868
# Add few-shot examples from config
868-
examples_content = self._build_few_shot_examples_content()
869+
examples_content = build_few_shot_examples_content(self.config)
869870
content.extend(examples_content)
870871

871872
# Add the part after examples
@@ -875,139 +876,6 @@ def _build_content(
875876

876877
return content
877878

878-
def _build_few_shot_examples_content(self) -> List[Dict[str, Any]]:
879-
"""
880-
Build content items for few-shot examples from the configuration.
881-
882-
Returns:
883-
List of content items containing text and image content for examples
884-
"""
885-
content = []
886-
classes = self.config.classes or []
887-
888-
for schema in classes:
889-
classification_meta = schema.get(X_AWS_IDP_CLASSIFICATION, {})
890-
examples = classification_meta.get("examples", [])
891-
for example in examples:
892-
class_prompt = example.get("classPrompt")
893-
894-
# Only process this example if it has a non-empty class_prompt
895-
if not class_prompt or not class_prompt.strip():
896-
logger.info(
897-
f"Skipping example with empty classPrompt: {example.get('name')}"
898-
)
899-
continue
900-
901-
content.append({"text": class_prompt})
902-
903-
image_path = example.get("imagePath")
904-
if image_path:
905-
try:
906-
# Load image content from the path
907-
908-
from idp_common import image, s3
909-
910-
# Get list of image files from the path (supports directories/prefixes)
911-
image_files = self._get_image_files_from_path(image_path)
912-
913-
# Process each image file
914-
for image_file_path in image_files:
915-
try:
916-
# Load image content
917-
if image_file_path.startswith("s3://"):
918-
# Direct S3 URI
919-
image_content = s3.get_binary_content(
920-
image_file_path
921-
)
922-
else:
923-
# Local file
924-
with open(image_file_path, "rb") as f:
925-
image_content = f.read()
926-
927-
# Prepare image content for Bedrock
928-
image_attachment = (
929-
image.prepare_bedrock_image_attachment(
930-
image_content
931-
)
932-
)
933-
content.append(image_attachment)
934-
935-
except Exception as e:
936-
logger.warning(
937-
f"Failed to load image {image_file_path}: {e}"
938-
)
939-
continue
940-
941-
except Exception as e:
942-
raise ValueError(
943-
f"Failed to load example images from {image_path}: {e}"
944-
)
945-
946-
return content
947-
948-
def _get_image_files_from_path(self, image_path: str) -> List[str]:
949-
"""
950-
Get list of image files from a path that could be a single file, directory, or S3 prefix.
951-
952-
Args:
953-
image_path: Path to image file, directory, or S3 prefix
954-
955-
Returns:
956-
List of image file paths/URIs sorted by filename
957-
"""
958-
import os
959-
960-
from idp_common import s3
961-
962-
# Handle S3 URIs
963-
if image_path.startswith("s3://"):
964-
# Check if it's a direct file or a prefix
965-
if image_path.endswith(
966-
(".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif", ".webp")
967-
):
968-
# Direct S3 file
969-
return [image_path]
970-
else:
971-
# S3 prefix - list all images
972-
return s3.list_images_from_path(image_path)
973-
else:
974-
# Handle non-S3-URI paths (resolved against CONFIGURATION_BUCKET in S3, or ROOT_DIR on the local filesystem)
975-
config_bucket = os.environ.get("CONFIGURATION_BUCKET")
976-
root_dir = os.environ.get("ROOT_DIR")
977-
978-
if config_bucket:
979-
# Use environment bucket with imagePath as key
980-
s3_uri = f"s3://{config_bucket}/{image_path}"
981-
982-
# Check if it's a direct file or a prefix
983-
if image_path.endswith(
984-
(".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif", ".webp")
985-
):
986-
# Direct S3 file
987-
return [s3_uri]
988-
else:
989-
# S3 prefix - list all images
990-
return s3.list_images_from_path(s3_uri)
991-
elif root_dir:
992-
# Use relative path from ROOT_DIR
993-
full_path = os.path.join(root_dir, image_path)
994-
full_path = os.path.normpath(full_path)
995-
996-
if os.path.isfile(full_path):
997-
# Single local file
998-
return [full_path]
999-
elif os.path.isdir(full_path):
1000-
# Local directory - list all images
1001-
return s3.list_images_from_path(full_path)
1002-
else:
1003-
# Path doesn't exist
1004-
logger.warning(f"Image path does not exist: {full_path}")
1005-
return []
1006-
else:
1007-
raise ValueError(
1008-
"No CONFIGURATION_BUCKET or ROOT_DIR set. Cannot read example images from local filesystem."
1009-
)
1010-
1011879
def classify_page_bedrock(
1012880
self,
1013881
page_id: str,

lib/idp_common_pkg/idp_common/config/migration.py

Lines changed: 8 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,12 @@
1010
SCHEMA_ITEMS,
1111
SCHEMA_REQUIRED,
1212
SCHEMA_DESCRIPTION,
13-
SCHEMA_EXAMPLES,
1413
TYPE_OBJECT,
1514
TYPE_ARRAY,
1615
TYPE_STRING,
1716
# AWS IDP extensions
1817
X_AWS_IDP_DOCUMENT_TYPE,
18+
X_AWS_IDP_EXAMPLES,
1919
X_AWS_IDP_LIST_ITEM_DESCRIPTION,
2020
X_AWS_IDP_ORIGINAL_NAME,
2121
X_AWS_IDP_EVALUATION_METHOD,
@@ -151,9 +151,7 @@ def migrate_legacy_to_schema(
151151

152152
# Migrate examples if present
153153
if LEGACY_EXAMPLES in class_config:
154-
migrated_class[LEGACY_EXAMPLES] = _migrate_examples(
155-
class_config[LEGACY_EXAMPLES]
156-
)
154+
migrated_class[X_AWS_IDP_EXAMPLES] = class_config[LEGACY_EXAMPLES]
157155

158156
legacy_attributes = class_config.get(LEGACY_ATTRIBUTES, [])
159157

@@ -280,98 +278,6 @@ def _migrate_list_attribute(attr: Dict[str, Any]) -> Dict[str, Any]:
280278
return schema_attr
281279

282280

283-
def _migrate_examples(legacy_examples: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
284-
"""
285-
Migrate legacy examples to JSON Schema examples format.
286-
287-
Legacy format has:
288-
- classPrompt: Classification prompt
289-
- attributesPrompt: Extraction prompt
290-
- imagePath: Path to example image
291-
- name: Example name
292-
293-
JSON Schema format uses standard 'examples' array with AWS extensions.
294-
295-
Args:
296-
legacy_examples: List of legacy example dictionaries
297-
298-
Returns:
299-
List of migrated examples in JSON Schema format
300-
"""
301-
if not legacy_examples:
302-
return []
303-
304-
migrated_examples = []
305-
306-
for example in legacy_examples:
307-
# Create a JSON Schema example with AWS IDP extensions
308-
migrated_example = {}
309-
310-
# Preserve the name if present
311-
if LEGACY_NAME in example:
312-
migrated_example[LEGACY_NAME] = example[LEGACY_NAME]
313-
314-
# Add AWS IDP extensions for few-shot learning
315-
if LEGACY_CLASS_PROMPT in example:
316-
migrated_example[X_AWS_IDP_CLASS_PROMPT] = example[LEGACY_CLASS_PROMPT]
317-
318-
if LEGACY_ATTRIBUTES_PROMPT in example:
319-
migrated_example[X_AWS_IDP_ATTRIBUTES_PROMPT] = example[
320-
LEGACY_ATTRIBUTES_PROMPT
321-
]
322-
323-
if LEGACY_IMAGE_PATH in example:
324-
migrated_example[X_AWS_IDP_IMAGE_PATH] = example[LEGACY_IMAGE_PATH]
325-
326-
# Parse the attributes prompt to create actual example values if possible
327-
# This helps with standard JSON Schema validation
328-
if LEGACY_ATTRIBUTES_PROMPT in example:
329-
try:
330-
# Try to extract JSON values from the attributes prompt
331-
import re
332-
import json
333-
334-
prompt = example[LEGACY_ATTRIBUTES_PROMPT]
335-
# Look for JSON-like structure in the prompt
336-
json_match = re.search(r"\{.*\}", prompt, re.DOTALL)
337-
parsed = False
338-
if json_match:
339-
json_str = json_match.group()
340-
try:
341-
example_values = json.loads(json_str)
342-
if isinstance(example_values, dict):
343-
for key, value in example_values.items():
344-
migrated_example[key] = value
345-
parsed = True
346-
except (json.JSONDecodeError, ValueError):
347-
parsed = False
348-
349-
if not parsed:
350-
# Try to find key-value pairs in format: "key": "value"
351-
pairs = re.findall(
352-
r'"([^"]+)":\s*("(?:[^"\\]|\\.)*"|\bnull\b)', prompt
353-
)
354-
if pairs:
355-
# Build a JSON object from the pairs
356-
json_str = (
357-
"{" + ", ".join([f'"{k}": {v}' for k, v in pairs]) + "}"
358-
)
359-
try:
360-
example_values = json.loads(json_str)
361-
# Add the parsed values to the example
362-
for key, value in example_values.items():
363-
migrated_example[key] = value
364-
except (json.JSONDecodeError, ValueError):
365-
pass # If parsing fails, just keep the prompt as-is
366-
except Exception:
367-
pass # Safely ignore parsing errors
368-
369-
if migrated_example: # Only add if we have content
370-
migrated_examples.append(migrated_example)
371-
372-
return migrated_examples
373-
374-
375281
def _add_aws_extensions(legacy_attr: Dict[str, Any], schema: Dict[str, Any]) -> None:
376282
"""Add AWS extension fields back to legacy format (for reverse migration if needed)."""
377283
if X_AWS_IDP_EVALUATION_METHOD in schema:
@@ -542,9 +448,13 @@ def _convert_classes_to_json_schema(
542448
if required:
543449
schema[SCHEMA_REQUIRED] = required
544450

545-
# Add examples if present
451+
# Add examples if present (check both legacy and new key)
546452
if LEGACY_EXAMPLES in doc_type_class and doc_type_class[LEGACY_EXAMPLES]:
547-
schema[SCHEMA_EXAMPLES] = doc_type_class[LEGACY_EXAMPLES]
453+
schema[X_AWS_IDP_EXAMPLES] = doc_type_class[LEGACY_EXAMPLES]
454+
elif (
455+
X_AWS_IDP_EXAMPLES in doc_type_class and doc_type_class[X_AWS_IDP_EXAMPLES]
456+
):
457+
schema[X_AWS_IDP_EXAMPLES] = doc_type_class[X_AWS_IDP_EXAMPLES]
548458

549459
if defs:
550460
schema[DEFS_FIELD] = defs

lib/idp_common_pkg/idp_common/config/models.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -542,6 +542,32 @@ class IDPConfig(BaseModel):
542542
default_factory=EvaluationConfig, description="Evaluation configuration"
543543
)
544544

545+
# Criteria-validation-specific fields (used in pattern-2/criteria-validation)
546+
summary: Optional[Dict[str, Any]] = Field(
547+
default=None, description="Summary configuration for criteria validation"
548+
)
549+
criteria_types: Optional[List[str]] = Field(
550+
default=None, description="List of criteria types for validation"
551+
)
552+
request_bucket: Optional[str] = Field(
553+
default=None, description="S3 bucket for user history/request data"
554+
)
555+
request_history_prefix: Optional[str] = Field(
556+
default=None, description="S3 prefix for request history"
557+
)
558+
criteria_bucket: Optional[str] = Field(
559+
default=None, description="S3 bucket for criteria documents"
560+
)
561+
output_bucket: Optional[str] = Field(
562+
default=None, description="S3 bucket for processing output"
563+
)
564+
textract_page_tracker: Optional[str] = Field(
565+
default=None, description="S3 bucket for Textract page tracking"
566+
)
567+
cost_report_bucket: Optional[str] = Field(
568+
default=None, description="S3 bucket for cost reports"
569+
)
570+
545571
model_config = ConfigDict(
546572
# Do not allow extra fields - all config should be explicit
547573
extra="forbid",

lib/idp_common_pkg/idp_common/config/schema_constants.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,10 @@
5151
# Evaluation method for attribute comparison
5252
X_AWS_IDP_EVALUATION_METHOD = "x-aws-idp-evaluation-method"
5353

54+
55+
56+
X_AWS_IDP_EXAMPLES= "x-aws-idp-examples"
57+
5458
# Valid evaluation methods
5559
EVALUATION_METHOD_EXACT = "EXACT"
5660
EVALUATION_METHOD_NUMERIC_EXACT = "NUMERIC_EXACT"
@@ -118,7 +122,6 @@
118122
SCHEMA_ITEMS = "items"
119123
SCHEMA_REQUIRED = "required"
120124
SCHEMA_DESCRIPTION = "description"
121-
SCHEMA_EXAMPLES = "examples" # Standard JSON Schema examples field
122125

123126
# JSON Schema type values
124127
TYPE_OBJECT = "object"

0 commit comments

Comments
 (0)