Skip to content

Commit af8c0a3

Browse files
author
Bob Strahan
committed
Merge branch 'develop' into feature/s3-vectorstore
2 parents eba01e1 + 5d1aaab commit af8c0a3

File tree

10 files changed

+820
-253
lines changed

10 files changed

+820
-253
lines changed

config_library/pattern-2/bank-statement-sample/config.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -68,6 +68,7 @@ classes:
6868
description: List of all transactions in the statement period
6969
attributeType: list
7070
classification:
71+
maxPagesForClassification: "ALL"
7172
image:
7273
target_height: ''
7374
target_width: ''

config_library/pattern-2/lending-package-sample/config.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -914,6 +914,7 @@ classes:
914914
attributeType: group
915915
classification:
916916
classificationMethod: multimodalPageLevelClassification
917+
maxPagesForClassification: "ALL"
917918
image:
918919
target_height: ''
919920
target_width: ''

config_library/pattern-2/rvl-cdip-package-sample-with-few-shot-examples/config.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -647,6 +647,7 @@ classes:
647647
imagePath: config_library/pattern-2/few_shot_example_with_multimodal_page_classification/example-images/bank-statement-pages/
648648

649649
classification:
650+
maxPagesForClassification: "ALL"
650651
image:
651652
target_height: ''
652653
target_width: ''

config_library/pattern-2/rvl-cdip-package-sample/config.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -307,6 +307,7 @@ classes:
307307
- name: comments
308308
description: Additional notes or remarks about the document. Look for sections labeled 'notes', 'remarks', or 'comments'.
309309
classification:
310+
maxPagesForClassification: "ALL"
310311
image:
311312
target_height: ''
312313
target_width: ''

docs/classification.md

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -181,6 +181,22 @@ When deciding between Text-Based Holistic Classification and MultiModal Page-Lev
181181
182182
## Customizing Classification in Pattern 2
183183
184+
### Configuration Settings
185+
186+
#### Page Limit Configuration
187+
188+
Control how many pages are used for classification:
189+
190+
```yaml
191+
classification:
192+
maxPagesForClassification: "ALL" # Default: use all pages
193+
# Or: "1", "2", "3", etc. - use only first N pages
194+
```
195+
196+
**Important**: When set to a number N (e.g., `"3"`), only the first N pages are classified, and the resulting class is applied to ALL pages in the document. As a consequence, the entire document is assigned a single class and a single section.
197+
198+
### Prompt Components
199+
184200
In Pattern 2, you can customize classification behavior through various prompt components:
185201

186202
### System Prompts

lib/idp_common_pkg/idp_common/classification/service.py

Lines changed: 399 additions & 246 deletions
Large diffs are not rendered by default.
Lines changed: 182 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,182 @@
1+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
# SPDX-License-Identifier: MIT-0
3+
4+
"""
5+
Tests for metering data transfer in limited classification scenarios.
6+
"""
7+
8+
import pytest
9+
from idp_common.classification.service import ClassificationService
10+
from idp_common.models import Document, Page, Section
11+
12+
13+
@pytest.mark.unit
def test_apply_limited_classification_transfers_metering():
    """Metering, errors and metadata from the classified subset reach the full document.

    A five-page document is paired with a two-page classified copy; after
    ``_apply_limited_classification_to_all_pages`` the result must carry the
    classified copy's metering/errors/metadata and a single "invoice" section
    spanning all five pages.
    """
    metering_key = "Classification/bedrock/anthropic.claude-3-sonnet"

    # Service configured to classify only the first two pages.
    classifier = ClassificationService(
        region="us-east-1",
        config={
            "model_id": "anthropic.claude-3-sonnet-20240229-v1:0",
            "classification": {"maxPagesForClassification": "2"},
        },
    )

    # Full five-page document with no prior metering, errors or metadata.
    full_document = Document(
        id="original-doc",
        pages={pid: Page(page_id=pid) for pid in ("1", "2", "3", "4", "5")},
        metering={},
        errors=[],
        metadata={},
    )

    # Two-page classified subset carrying the data that must be transferred.
    partial_document = Document(
        id="classified-doc",
        pages={
            pid: Page(page_id=pid, classification="invoice") for pid in ("1", "2")
        },
        sections=[
            Section(
                section_id="1",
                classification="invoice",
                page_ids=["1", "2"],
                confidence=0.9,
            )
        ],
        metering={
            metering_key: {
                "inputTokens": 1000,
                "outputTokens": 100,
                "totalTokens": 1100,
            }
        },
        errors=["Classification warning"],
        metadata={"processing_time": 2.5},
    )

    outcome = classifier._apply_limited_classification_to_all_pages(
        full_document, partial_document
    )

    # Metering is compared against a fresh literal, not the input dict object.
    assert outcome.metering == {
        metering_key: {
            "inputTokens": 1000,
            "outputTokens": 100,
            "totalTokens": 1100,
        }
    }

    # Errors and metadata travel with the metering.
    assert "Classification warning" in outcome.errors
    assert outcome.metadata["processing_time"] == 2.5

    # Exactly one section, classified as invoice, covering every original page.
    assert len(outcome.sections) == 1
    sole_section = outcome.sections[0]
    assert sole_section.classification == "invoice"
    assert len(sole_section.page_ids) == 5  # All original pages

    # Every page inherits the class with a confidence of 1.0.
    for doc_page in outcome.pages.values():
        assert doc_page.classification == "invoice"
        assert doc_page.confidence == 1.0
93+
94+
95+
@pytest.mark.unit
def test_apply_limited_classification_merges_existing_metering():
    """Classification metering is added alongside metering already on the original document."""
    ocr_key = "OCR/textract/detect_document_text"
    classification_key = "Classification/bedrock/anthropic.claude-3-sonnet"

    classifier = ClassificationService(
        region="us-east-1",
        config={"model_id": "anthropic.claude-3-sonnet-20240229-v1:0"},
    )

    # The original document already carries an OCR metering entry.
    full_document = Document(
        id="original-doc",
        pages={"1": Page(page_id="1")},
        metering={
            ocr_key: {
                "pages": 1,
                "cost": 0.001,
            }
        },
    )

    # The classified copy contributes a separate classification metering entry.
    partial_document = Document(
        id="classified-doc",
        pages={"1": Page(page_id="1", classification="receipt")},
        sections=[
            Section(
                section_id="1",
                classification="receipt",
                page_ids=["1"],
                confidence=0.8,
            )
        ],
        metering={
            classification_key: {
                "inputTokens": 500,
                "outputTokens": 50,
            }
        },
    )

    outcome = classifier._apply_limited_classification_to_all_pages(
        full_document, partial_document
    )

    # Both the pre-existing and the new entry must be present with their values.
    assert ocr_key in outcome.metering
    assert classification_key in outcome.metering
    assert outcome.metering[ocr_key]["pages"] == 1
    assert outcome.metering[classification_key]["inputTokens"] == 500
148+
149+
150+
@pytest.mark.unit
def test_apply_limited_classification_no_metering_data():
    """An empty metering dict on the classified document is handled without error."""
    classifier = ClassificationService(
        region="us-east-1",
        config={"model_id": "anthropic.claude-3-sonnet-20240229-v1:0"},
    )

    # Original document built without an explicit metering argument.
    full_document = Document(
        id="original-doc",
        pages={"1": Page(page_id="1")},
    )

    partial_document = Document(
        id="classified-doc",
        pages={"1": Page(page_id="1", classification="form")},
        sections=[
            Section(
                section_id="1",
                classification="form",
                page_ids=["1"],
                confidence=0.7,
            )
        ],
        metering={},  # Empty metering
    )

    # The call itself must succeed even though there is nothing to transfer.
    outcome = classifier._apply_limited_classification_to_all_pages(
        full_document, partial_document
    )

    # Metering stays empty while the page classification is still applied.
    assert outcome.metering == {}
    assert outcome.pages["1"].classification == "form"

0 commit comments

Comments
 (0)