@@ -57,7 +57,9 @@ This dual approach ensures discovery insights can be leveraged across different
5757- [ Troubleshooting] ( #troubleshooting )
5858 - [ Common Issues] ( #common-issues )
5959 - [ Error Handling] ( #error-handling )
60- - [ Performance Optimization] ( #performance-optimization )
60+ - [ Limitations] ( #limitations )
61+ - [ Known Limitations] ( #known-limitations )
62+
6163
6264## Overview
6365
@@ -133,6 +135,10 @@ This analysis produces structured configuration templates that can be used to co
133135- ** Metadata Storage** : Pattern-neutral job information and progress tracking
134136- ** Event Coordination** : Enables real-time updates and pattern-specific notifications
135137
138+ ** Configuration Table:**
139+ - ** Class Storage** : Discovered document classes are stored in the configuration table as "custom" configuration classes (see the sketch below)
140+
141+
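A minimal boto3 sketch of this storage model is shown below. The table name, key attribute, and item layout are illustrative assumptions, not the deployed schema.

```python
# Sketch: read/write discovered classes as the "custom" configuration entry.
# "ConfigurationTable", the "Configuration" key attribute, and the "classes"
# attribute are assumptions for illustration only.
import boto3

config_table = boto3.resource("dynamodb").Table("ConfigurationTable")

def save_custom_classes(document_classes):
    """Persist discovered classes as the "custom" configuration item."""
    config_table.put_item(
        Item={
            "Configuration": "Custom",    # assumed partition key value
            "classes": document_classes,  # array of discovered class definitions
        }
    )

def load_custom_classes():
    """Read the "custom" classes back for use by any pattern."""
    response = config_table.get_item(Key={"Configuration": "Custom"})
    return response.get("Item", {}).get("classes", [])
```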
136142### Pattern-Specific Implementations
137143
138144#### Pattern 1: BDA Blueprint Automation
@@ -176,8 +182,6 @@ The discovery system is designed to support additional patterns through:
176182# Configuration event structure (pattern-agnostic)
177183{
178184 " eventType" : " CONFIGURATION_UPDATE" ,
179- " pattern" : " pattern-X" ,
180- " discoveryJobId" : " job-12345" ,
181185 " documentClasses" : [... ],
182186 " metadata" : {... }
183187}
@@ -209,28 +213,24 @@ graph TD
209213 G --> H
210214 H --> I[Structure Extraction]
211215 I --> J[Pattern-Neutral Configuration]
212- J --> K{Target Pattern?}
213- K -->|Pattern 1| L[BDA Blueprint Creation]
214- K -->|Pattern 2/3| M[Direct Config Update]
215- K -->|New Pattern| N[Custom Handler]
216- L --> O[Job Completion]
217- M --> O
218- N --> O
219- O --> P[UI Notification]
216+ J --> K[Configuration Table Update]
217+ K --> L[Job Completion]
218+ L --> M[UI Notification]
220219```
221220
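The tail of this flow (configuration table update, job completion, and UI notification) could look roughly like the sketch below. `save_custom_classes()` (sketched earlier) and `publish_ui_notification()` are assumed helpers, and the tracking-table attribute names are illustrative.

```python
# Illustrative sketch of the final steps of the unified flow above.
from datetime import datetime, timezone

def complete_discovery_job(job_id, document_classes, tracking_table):
    # Configuration Table Update: persist the pattern-neutral classes
    save_custom_classes(document_classes)

    # Job Completion: record final status for progress tracking
    tracking_table.update_item(
        Key={"jobId": job_id},
        UpdateExpression="SET #s = :s, completedAt = :t",
        ExpressionAttributeNames={"#s": "status"},  # "status" is a DynamoDB reserved word
        ExpressionAttributeValues={
            ":s": "COMPLETED",
            ":t": datetime.now(timezone.utc).isoformat(),
        },
    )

    # UI Notification: let subscribed clients refresh in real time
    publish_ui_notification(job_id, status="COMPLETED")
```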
222221#### Pattern 1: BDA Blueprint Automation Flow
223222``` mermaid
224223graph TD
225- A[Configuration Update Event] --> B[BDA Discovery Function]
226- B --> C[BDA Blueprint Service]
227- C --> D{Blueprint Exists?}
228- D -->|Yes| E[Check for Changes]
229- D -->|No| F[Create New Blueprint]
230- E -->|Changes Found| G[Update Blueprint]
231- E -->|No Changes| H[Skip Update]
232- F --> I[Schema Converter]
233- G --> I
224+ A[View/Edit Configuration UI] --> B[Save Changes]
225+ B --> C[Configuration Update Event]
226+ C --> D[BDA Discovery Lambda - Blueprint Service]
227+ D --> E{Blueprint Exists?}
228+ E -->|Yes| F[Check for Changes]
229+ E -->|No| G[Create New Blueprint]
230+ F -->|Changes Found| H[Update Blueprint]
231+ F -->|No Changes| N[Skip Update]
232+ G --> I[Schema Converter]
233+ H --> I[Schema Converter]
234234 I --> J[Generate BDA Schema]
235235 J --> K[Create/Update in BDA]
236236 K --> L[Create Blueprint Version]
@@ -492,7 +492,7 @@ discovery:
492492
493493**Group Types:**
494494- ` normal` - Standard field groupings
495- - ` Table ` - Repeating tabular data structures
495+ - ` List ` - Repeating tabular data structures (see the example below)
496496
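For example, a discovered class that mixes both group types might look like the sketch below; the attribute names and field values are illustrative assumptions, not the exact discovery schema.

```python
# Hypothetical discovered class with one "normal" group and one "List" group.
discovered_class = {
    "name": "W4Form",
    "description": "Employee withholding certificate",
    "groups": [
        {
            "name": "EmployeeInfo",
            "type": "normal",  # standard one-off field grouping
            "fields": ["first_name", "last_name", "address"],
        },
        {
            "name": "Allowances",
            "type": "List",    # repeating rows that share one field definition
            "fields": ["allowance_type", "amount"],
        },
    ],
}
```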
497497# # Using the Discovery Module
498498
@@ -609,25 +609,7 @@ Configuration events are triggered when discovery jobs complete and contain patt
609609{
610610 "eventType": "CONFIGURATION_UPDATE",
611611 "source": "discovery-processor",
612- "pattern": "pattern-1",
613- "discoveryJobId": "discovery-job-12345",
614- "timestamp": "2024-01-15T10:30:00Z",
615- "documentClasses": [
616- {
617- "name": "W4Form",
618- "description": "Employee withholding certificate",
619- "groups": [...],
620- "metadata": {
621- "confidence": 0.95,
622- "model_used": "us.amazon.nova-pro-v1:0"
623- }
624- }
625- ],
626- "processingMetadata": {
627- "groundTruthUsed": true,
628- "processingTime": "45.2s",
629- "documentCount": 1
630- }
612+ "timestamp": "2024-01-15T10:30:00Z"
631613}
632614` ` `
633615
@@ -695,10 +677,10 @@ def pattern_x_configuration_handler(event, context):
695677 """
696678 try:
697679 # Extract discovery results from event
698- document_classes = event.get('documentClasses', [])
680+ custom_classes = get_custom_classes_from_config_table()  # retrieve the 'custom' classes from the configuration table (hypothetical helper)
699681
700682 # Transform to pattern-specific format
701- pattern_config = transform_to_pattern_x_format(document_classes )
683+ pattern_config = transform_to_pattern_x_format(custom_classes )
702684
703685 # Update pattern-specific configuration
704686 update_pattern_x_configuration(pattern_config)
@@ -731,51 +713,6 @@ ConfigurationEventSource:
731713 FunctionName: !Ref PatternXConfigurationHandler
732714` ` `
733715
734- # ### Step 3: Implement Schema Transformation
735- ` ` ` python
736- class PatternXSchemaConverter:
737- """
738- Converts pattern-neutral discovery results to Pattern X format.
739- """
740-
741- def convert(self, discovery_result):
742- """
743- Transform discovery document class to Pattern X configuration.
744- """
745- pattern_x_config = {
746- "documentType": discovery_result["name"],
747- "description": discovery_result["description"],
748- "extractionRules": []
749- }
750-
751- # Transform groups and fields
752- for group in discovery_result.get("groups", []):
753- extraction_rule = self._convert_group_to_rule(group)
754- pattern_x_config["extractionRules"].append(extraction_rule)
755-
756- return pattern_x_config
757- ` ` `
758-
759- # ### Step 4: Integration Points
760- ` ` ` python
761- # Configuration update integration
762- def update_pattern_x_configuration(config):
763- """
764- Update Pattern X configuration with discovery results.
765- """
766- # Store in configuration database
767- config_table.put_item(
768- Item={
769- "ConfigurationType": "PatternX",
770- "DocumentClasses": config,
771- "UpdatedAt": datetime.utcnow().isoformat()
772- }
773- )
774-
775- # Trigger any pattern-specific post-processing
776- notify_pattern_x_services(config)
777- ` ` `
778-
779716# ## Benefits of the Generic Event System
780717
781718**🔄 Loose Coupling**: Patterns can implement discovery integration independently
@@ -1278,90 +1215,15 @@ def discovery_with_fallback(discovery_service, document_key, ground_truth_key=No
12781215 )
12791216```
12801217
1281- ### Performance Optimization
1218+ ## Limitations
12821219
1283- ** Document Preprocessing:**
1284- ``` python
1285- def optimize_document_for_discovery (document_path ):
1286- """ Optimize document for better discovery performance."""
1287- # Resize images to optimal dimensions
1288- if document_path.lower().endswith((' .jpg' , ' .jpeg' , ' .png' )):
1289- optimize_image_resolution(document_path, target_dpi = 150 )
1290-
1291- # Split large PDFs into manageable sections
1292- elif document_path.lower().endswith(' .pdf' ):
1293- page_count = get_pdf_page_count(document_path)
1294- if page_count > 10 :
1295- return split_pdf_into_sections(document_path, max_pages = 10 )
1296-
1297- return [document_path]
1298- ```
1299-
1300- ** Batch Processing:**
1301- ``` python
1302- def batch_discovery_processing (document_list , batch_size = 5 ):
1303- """ Process multiple documents efficiently."""
1304- results = []
1305-
1306- for i in range (0 , len (document_list), batch_size):
1307- batch = document_list[i:i + batch_size]
1308-
1309- # Process batch concurrently
1310- with ThreadPoolExecutor(max_workers = batch_size) as executor:
1311- futures = [
1312- executor.submit(process_single_document, doc)
1313- for doc in batch
1314- ]
1315-
1316- batch_results = [
1317- future.result() for future in as_completed(futures)
1318- ]
1319-
1320- results.extend(batch_results)
1321-
1322- # Rate limiting between batches
1323- time.sleep(1 )
1324-
1325- return results
1326- ```
1327-
1328- ** Caching and Reuse:**
1329- ``` python
1330- def cached_discovery_analysis (document_hash , config_hash ):
1331- """ Cache discovery results for reuse."""
1332- cache_key = f " discovery: { document_hash} : { config_hash} "
1333-
1334- # Check cache first
1335- cached_result = get_from_cache(cache_key)
1336- if cached_result:
1337- return cached_result
1338-
1339- # Perform discovery if not cached
1340- result = perform_discovery_analysis()
1341-
1342- # Cache result for future use
1343- set_cache(cache_key, result, ttl = 3600 ) # 1 hour TTL
1344-
1345- return result
1346- ```
1220+ ### Known Limitations
1221+ ** Configuration Table**
1222+ - The discovery feature stores all custom classes as a single array in the configuration table under the "custom" key.
1223+ - DynamoDB enforces a hard limit of 400 KB per item, so the storage model will need to be refactored to spread classes across multiple items as the number of discovered classes grows (see the size-guard sketch below).
1224+ ** Discovery Output Format**
1225+ - The discovery output feeds the configuration viewed and edited in the View/Edit Configuration UI, so the JSON must follow the custom classes format.
1226+ - Output in any other format will result in failure.
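Until the storage model is refactored, a rough guard like the sketch below can flag when the serialized classes array approaches the DynamoDB item limit. The JSON-based size estimate only approximates DynamoDB's attribute-based accounting.

```python
import json

DYNAMODB_ITEM_LIMIT_BYTES = 400 * 1024  # hard per-item limit

def custom_classes_fit_in_one_item(custom_classes, headroom=0.9):
    """Return True if the serialized classes array is safely below the item limit."""
    payload_size = len(json.dumps(custom_classes).encode("utf-8"))
    # Keep headroom for key attributes and DynamoDB's own item overhead
    return payload_size < DYNAMODB_ITEM_LIMIT_BYTES * headroom
```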
13471227
1348- ** Monitoring and Metrics:**
1349- ``` python
1350- def track_discovery_metrics (job_id , start_time , result ):
1351- """ Track discovery performance metrics."""
1352- processing_time = time.time() - start_time
1353-
1354- metrics = {
1355- ' job_id' : job_id,
1356- ' processing_time_seconds' : processing_time,
1357- ' fields_discovered' : count_discovered_fields(result),
1358- ' groups_identified' : count_groups(result),
1359- ' model_used' : result.get(' metadata' , {}).get(' model_id' ),
1360- ' success' : result.get(' status' ) == ' SUCCESS'
1361- }
1362-
1363- # Send to CloudWatch or monitoring system
1364- publish_metrics(metrics)
1365- ```
13661228
13671229The Discovery module provides a powerful foundation for understanding and processing new document types. By following these guidelines and best practices, you can effectively leverage the module to bootstrap document processing workflows and continuously improve their accuracy and coverage.