Skip to content

Commit 21d4efe

Browse files
committed
Add back document and page content regex functionality
1 parent c2b8e93 commit 21d4efe

File tree

7 files changed

+88
-16
lines changed

7 files changed

+88
-16
lines changed

docs/classification.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -607,7 +607,7 @@ classes:
607607
x-aws-idp-document-type: Payslip
608608
type: object
609609
description: "Employee wage statement showing earnings and deductions"
610-
document_name_regex: "(?i).*(payslip|paystub|salary|wage).*"
610+
x-aws-idp-document-name-regex: "(?i).*(payslip|paystub|salary|wage).*"
611611
properties:
612612
EmployeeName:
613613
type: string
@@ -640,7 +640,7 @@ classes:
640640
x-aws-idp-document-type: Invoice
641641
type: object
642642
description: "Business invoice document"
643-
document_page_content_regex: "(?i)(invoice\\s+number|bill\\s+to|amount\\s+due)"
643+
x-aws-idp-document-page-content-regex: "(?i)(invoice\\s+number|bill\\s+to|amount\\s+due)"
644644
properties:
645645
InvoiceNumber:
646646
type: string
@@ -650,7 +650,7 @@ classes:
650650
x-aws-idp-document-type: Payslip
651651
type: object
652652
description: "Employee wage statement"
653-
document_page_content_regex: "(?i)(gross\\s+pay|net\\s+pay|employee\\s+id)"
653+
x-aws-idp-document-page-content-regex: "(?i)(gross\\s+pay|net\\s+pay|employee\\s+id)"
654654
properties:
655655
EmployeeName:
656656
type: string

lib/idp_common_pkg/idp_common/classification/service.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,9 @@
4040
from idp_common.config.models import IDPConfig
4141
from idp_common.config.schema_constants import (
4242
X_AWS_IDP_CLASSIFICATION,
43+
X_AWS_IDP_DOCUMENT_NAME_REGEX,
4344
X_AWS_IDP_DOCUMENT_TYPE,
45+
X_AWS_IDP_PAGE_CONTENT_REGEX,
4446
)
4547
from idp_common.models import Document, Section, Status
4648
from idp_common.utils import extract_json_from_text, extract_structured_data_from_text
@@ -168,14 +170,21 @@ def _load_document_types(self) -> List[DocumentType]:
168170
classes = self.config.classes
169171
for schema in classes:
170172
classification_meta = schema.get(X_AWS_IDP_CLASSIFICATION, {})
173+
174+
# Support both new top-level format and legacy nested format for regex patterns
175+
document_name_regex = schema.get(
176+
X_AWS_IDP_DOCUMENT_NAME_REGEX
177+
) or classification_meta.get("documentNamePattern")
178+
document_page_content_regex = schema.get(
179+
X_AWS_IDP_PAGE_CONTENT_REGEX
180+
) or classification_meta.get("pageContentPattern")
181+
171182
doc_types.append(
172183
DocumentType(
173184
type_name=schema.get(X_AWS_IDP_DOCUMENT_TYPE, ""),
174185
description=schema.get("description", ""),
175-
document_name_regex=classification_meta.get("documentNamePattern"),
176-
document_page_content_regex=classification_meta.get(
177-
"pageContentPattern"
178-
),
186+
document_name_regex=document_name_regex,
187+
document_page_content_regex=document_page_content_regex,
179188
)
180189
)
181190

lib/idp_common_pkg/idp_common/config/migration.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
X_AWS_IDP_CLASS_PROMPT,
2525
X_AWS_IDP_ATTRIBUTES_PROMPT,
2626
X_AWS_IDP_IMAGE_PATH,
27+
X_AWS_IDP_DOCUMENT_NAME_REGEX,
28+
X_AWS_IDP_PAGE_CONTENT_REGEX,
2729
VALID_EVALUATION_METHODS,
2830
MAX_PROMPT_OVERRIDE_LENGTH,
2931
# Attribute types (for legacy migration only)
@@ -46,6 +48,8 @@
4648
LEGACY_CLASS_PROMPT,
4749
LEGACY_ATTRIBUTES_PROMPT,
4850
LEGACY_IMAGE_PATH,
51+
LEGACY_DOCUMENT_NAME_REGEX,
52+
LEGACY_DOCUMENT_PAGE_CONTENT_REGEX,
4953
)
5054

5155

@@ -153,6 +157,17 @@ def migrate_legacy_to_schema(
153157
if LEGACY_EXAMPLES in class_config:
154158
migrated_class[X_AWS_IDP_EXAMPLES] = class_config[LEGACY_EXAMPLES]
155159

160+
# Migrate regex patterns if present
161+
if LEGACY_DOCUMENT_NAME_REGEX in class_config:
162+
migrated_class[X_AWS_IDP_DOCUMENT_NAME_REGEX] = class_config[
163+
LEGACY_DOCUMENT_NAME_REGEX
164+
]
165+
166+
if LEGACY_DOCUMENT_PAGE_CONTENT_REGEX in class_config:
167+
migrated_class[X_AWS_IDP_PAGE_CONTENT_REGEX] = class_config[
168+
LEGACY_DOCUMENT_PAGE_CONTENT_REGEX
169+
]
170+
156171
legacy_attributes = class_config.get(LEGACY_ATTRIBUTES, [])
157172

158173
for attr in legacy_attributes:
@@ -456,6 +471,17 @@ def _convert_classes_to_json_schema(
456471
):
457472
schema[X_AWS_IDP_EXAMPLES] = doc_type_class[X_AWS_IDP_EXAMPLES]
458473

474+
# Add regex patterns if present
475+
if X_AWS_IDP_DOCUMENT_NAME_REGEX in doc_type_class:
476+
schema[X_AWS_IDP_DOCUMENT_NAME_REGEX] = doc_type_class[
477+
X_AWS_IDP_DOCUMENT_NAME_REGEX
478+
]
479+
480+
if X_AWS_IDP_PAGE_CONTENT_REGEX in doc_type_class:
481+
schema[X_AWS_IDP_PAGE_CONTENT_REGEX] = doc_type_class[
482+
X_AWS_IDP_PAGE_CONTENT_REGEX
483+
]
484+
459485
if defs:
460486
schema[DEFS_FIELD] = defs
461487

lib/idp_common_pkg/idp_common/config/schema_constants.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@
2525
# Classification metadata for document type
2626
X_AWS_IDP_CLASSIFICATION = "x-aws-idp-classification"
2727

28+
# Regex patterns for classification optimization
29+
X_AWS_IDP_DOCUMENT_NAME_REGEX = "x-aws-idp-document-name-regex"
30+
X_AWS_IDP_PAGE_CONTENT_REGEX = "x-aws-idp-document-page-content-regex"
31+
2832
# ============================================================================
2933
# Legacy Attribute Type Values (for migration only)
3034
# ============================================================================
@@ -52,8 +56,7 @@
5256
X_AWS_IDP_EVALUATION_METHOD = "x-aws-idp-evaluation-method"
5357

5458

55-
56-
X_AWS_IDP_EXAMPLES= "x-aws-idp-examples"
59+
X_AWS_IDP_EXAMPLES = "x-aws-idp-examples"
5760

5861
# Valid evaluation methods
5962
EVALUATION_METHOD_EXACT = "EXACT"
@@ -114,6 +117,10 @@
114117
LEGACY_ATTRIBUTES_PROMPT = "attributesPrompt"
115118
LEGACY_IMAGE_PATH = "imagePath"
116119

120+
# Legacy regex field names (snake_case); migrated to the x-aws-idp-* extension keys in the new format
121+
LEGACY_DOCUMENT_NAME_REGEX = "document_name_regex"
122+
LEGACY_DOCUMENT_PAGE_CONTENT_REGEX = "document_page_content_regex"
123+
117124
# ============================================================================
118125
# JSON Schema Standard Property Names
119126
# ============================================================================

notebooks/examples/step2_classification_with_regex.ipynb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@
124124
" 'x-aws-idp-document-type': 'BankStatement',\n",
125125
" 'type': 'object',\n",
126126
" 'description': 'Employee wage statement',\n",
127-
" 'document_name_regex': r'(?i).*(statement).*',\n",
127+
" 'x-aws-idp-document-name-regex': r'(?i).*(statement).*',\n",
128128
" 'properties': {\n",
129129
" 'Name': {\n",
130130
" 'type': 'string',\n",
@@ -192,7 +192,7 @@
192192
" 'x-aws-idp-document-type': 'Payslip',\n",
193193
" 'type': 'object',\n",
194194
" 'description': 'Employee wage statement',\n",
195-
" 'document_page_content_regex': r'(?i)(gross\\s+pay|net\\s+pay|employee\\s+id)',\n",
195+
" 'x-aws-idp-document-page-content-regex': r'(?i)(gross\\s+pay|net\\s+pay|employee\\s+id)',\n",
196196
" 'properties': {\n",
197197
" 'EmployeeName': {\n",
198198
" 'type': 'string',\n",
@@ -206,7 +206,7 @@
206206
" 'x-aws-idp-document-type': 'Invoice',\n",
207207
" 'type': 'object',\n",
208208
" 'description': 'Business invoice',\n",
209-
" 'document_page_content_regex': r'(?i)(invoice\\s+number|bill\\s+to|amount\\s+due)',\n",
209+
" 'x-aws-idp-document-page-content-regex': r'(?i)(invoice\\s+number|bill\\s+to|amount\\s+due)',\n",
210210
" 'properties': {\n",
211211
" 'InvoiceNumber': {\n",
212212
" 'type': 'string',\n",

src/ui/src/components/json-schema-builder/SchemaInspector.jsx

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ import {
1515
X_AWS_IDP_EVALUATION_METHOD,
1616
X_AWS_IDP_CONFIDENCE_THRESHOLD,
1717
X_AWS_IDP_EXAMPLES,
18+
X_AWS_IDP_DOCUMENT_NAME_REGEX,
19+
X_AWS_IDP_PAGE_CONTENT_REGEX,
1820
} from '../../constants/schemaConstants';
1921

2022
const SchemaInspector = ({
@@ -99,10 +101,34 @@ const SchemaInspector = ({
99101
</FormField>
100102

101103
{selectedClass[X_AWS_IDP_DOCUMENT_TYPE] && (
102-
<ExamplesEditor
103-
examples={selectedClass[X_AWS_IDP_EXAMPLES] || []}
104-
onChange={(examples) => onUpdateClass({ [X_AWS_IDP_EXAMPLES]: examples })}
105-
/>
104+
<>
105+
<ExamplesEditor
106+
examples={selectedClass[X_AWS_IDP_EXAMPLES] || []}
107+
onChange={(examples) => onUpdateClass({ [X_AWS_IDP_EXAMPLES]: examples })}
108+
/>
109+
110+
<FormField
111+
label="Document Name Regex (Optional)"
112+
description="Pattern to match document ID/name. When matched, instantly classifies all pages as this type (single-class configs only). Use case-insensitive patterns like (?i).*(invoice|bill).*"
113+
>
114+
<Input
115+
value={selectedClass[X_AWS_IDP_DOCUMENT_NAME_REGEX] || ''}
116+
onChange={({ detail }) => onUpdateClass({ [X_AWS_IDP_DOCUMENT_NAME_REGEX]: detail.value || undefined })}
117+
placeholder="e.g., (?i).*(invoice|bill).*"
118+
/>
119+
</FormField>
120+
121+
<FormField
122+
label="Page Content Regex (Optional)"
123+
description="Pattern to match page text content. When matched during page-level classification, classifies the page as this type. Use case-insensitive patterns like (?i)(invoice\s+number|amount\s+due)"
124+
>
125+
<Input
126+
value={selectedClass[X_AWS_IDP_PAGE_CONTENT_REGEX] || ''}
127+
onChange={({ detail }) => onUpdateClass({ [X_AWS_IDP_PAGE_CONTENT_REGEX]: detail.value || undefined })}
128+
placeholder="e.g., (?i)(invoice\s+number|bill\s+to)"
129+
/>
130+
</FormField>
131+
</>
106132
)}
107133

108134
{usedIn.length > 0 && (

src/ui/src/constants/schemaConstants.js

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,10 @@ export const X_AWS_IDP_DOCUMENT_TYPE = 'x-aws-idp-document-type';
6767
/** Classification metadata for document type */
6868
export const X_AWS_IDP_CLASSIFICATION = 'x-aws-idp-classification';
6969

70+
/** Regex patterns for classification optimization */
71+
export const X_AWS_IDP_DOCUMENT_NAME_REGEX = 'x-aws-idp-document-name-regex';
72+
export const X_AWS_IDP_PAGE_CONTENT_REGEX = 'x-aws-idp-document-page-content-regex';
73+
7074
// ============================================================================
7175
// AWS IDP List-Specific Extensions
7276
// ============================================================================

0 commit comments

Comments
 (0)