@@ -27,17 +27,17 @@ def __init__(
2727 self .input_bucket = input_bucket
2828 self .input_prefix = input_prefix
2929 self .region = region or os .environ .get ("AWS_REGION" , "us-east-1" )
30-
30+
3131 # Load configuration
3232 self .config = config or self ._load_default_config ()
33-
33+
3434 # Get discovery configuration
3535 self .discovery_config = self .config .get ("discovery" , {})
36-
36+
3737 # Get model configuration for both scenarios
3838 self .without_gt_config = self .discovery_config .get ("without_ground_truth" , {})
3939 self .with_gt_config = self .discovery_config .get ("with_ground_truth" , {})
40-
40+
4141 # Backward compatibility: use bedrock_model_id if provided
4242 if bedrock_model_id :
4343 self .without_gt_config ["model_id" ] = bedrock_model_id
@@ -76,7 +76,7 @@ def _load_default_config(self):
7676Group the fields based on the section they are grouped in the form. Group should have attributeType as "group".
7777If the group repeats and follows table format, update the attributeType as "list".
7878Do not extract the values.
79- Return the extracted data in JSON format."""
79+ Return the extracted data in JSON format.""" ,
8080 },
8181 "with_ground_truth" : {
8282 "model_id" : "anthropic.claude-3-sonnet-20240229-v1:0" ,
@@ -97,7 +97,7 @@ def _load_default_config(self):
9797For document_class generate a short name based on the document content like W4, I-9, Paystub.
9898For document_description generate a description about the document in less than 50 words.
9999If the group repeats and follows table format, update the attributeType as "list".
100- Do not extract the values."""
100+ Do not extract the values.""" ,
101101 },
102102 "output_format" : {
103103 "sample_json" : """{
@@ -123,7 +123,7 @@ def _load_default_config(self):
123123 }
124124 ]
125125}"""
126- }
126+ },
127127 }
128128 }
129129
@@ -353,16 +353,24 @@ def _load_ground_truth(self, bucket: str, key: str):
353353 def _extract_data_from_document (self , document_content , file_extension ):
354354 try :
355355 # Get configuration for without ground truth
356- model_id = self .without_gt_config .get ("model_id" , "anthropic.claude-3-sonnet-20240229-v1:0" )
357- system_prompt = self .without_gt_config .get ("system_prompt" ,
358- "You are an expert in processing forms. Extracting data from images and documents" )
356+ model_id = self .without_gt_config .get (
357+ "model_id" , "anthropic.claude-3-sonnet-20240229-v1:0"
358+ )
359+ system_prompt = self .without_gt_config .get (
360+ "system_prompt" ,
361+ "You are an expert in processing forms. Extracting data from images and documents" ,
362+ )
359363 temperature = self .without_gt_config .get ("temperature" , 1.0 )
360364 top_p = self .without_gt_config .get ("top_p" , 0.1 )
361365 max_tokens = self .without_gt_config .get ("max_tokens" , 10000 )
362-
366+
363367 # Create user prompt with sample format
364- user_prompt = self .without_gt_config .get ("user_prompt" , self ._prompt_classes_discovery ())
365- sample_format = self .discovery_config .get ("output_format" , {}).get ("sample_json" , self ._sample_output_format ())
368+ user_prompt = self .without_gt_config .get (
369+ "user_prompt" , self ._prompt_classes_discovery ()
370+ )
371+ sample_format = self .discovery_config .get ("output_format" , {}).get (
372+ "sample_json" , self ._sample_output_format ()
373+ )
366374 full_prompt = f"{ user_prompt } \n Format the extracted data using the below JSON format:\n { sample_format } "
367375
368376 # Create content for the user message
@@ -422,25 +430,35 @@ def _extract_data_from_document_with_ground_truth(
422430 """Extract data from document using ground truth as reference."""
423431 try :
424432 # Get configuration for with ground truth
425- model_id = self .with_gt_config .get ("model_id" , "anthropic.claude-3-sonnet-20240229-v1:0" )
426- system_prompt = self .with_gt_config .get ("system_prompt" ,
427- "You are an expert in processing forms. Extracting data from images and documents" )
433+ model_id = self .with_gt_config .get (
434+ "model_id" , "anthropic.claude-3-sonnet-20240229-v1:0"
435+ )
436+ system_prompt = self .with_gt_config .get (
437+ "system_prompt" ,
438+ "You are an expert in processing forms. Extracting data from images and documents" ,
439+ )
428440 temperature = self .with_gt_config .get ("temperature" , 1.0 )
429441 top_p = self .with_gt_config .get ("top_p" , 0.1 )
430442 max_tokens = self .with_gt_config .get ("max_tokens" , 10000 )
431443
432444 # Create enhanced prompt with ground truth
433- user_prompt = self .with_gt_config .get ("user_prompt" ,
434- self ._prompt_classes_discovery_with_ground_truth (ground_truth_data ))
435-
445+ user_prompt = self .with_gt_config .get (
446+ "user_prompt" ,
447+ self ._prompt_classes_discovery_with_ground_truth (ground_truth_data ),
448+ )
449+
436450 # If user_prompt contains placeholder, replace it with ground truth
437451 if "{ground_truth_json}" in user_prompt :
438452 ground_truth_json = json .dumps (ground_truth_data , indent = 2 )
439453 prompt = user_prompt .replace ("{ground_truth_json}" , ground_truth_json )
440454 else :
441- prompt = self ._prompt_classes_discovery_with_ground_truth (ground_truth_data )
442-
443- sample_format = self .discovery_config .get ("output_format" , {}).get ("sample_json" , self ._sample_output_format ())
455+ prompt = self ._prompt_classes_discovery_with_ground_truth (
456+ ground_truth_data
457+ )
458+
459+ sample_format = self .discovery_config .get ("output_format" , {}).get (
460+ "sample_json" , self ._sample_output_format ()
461+ )
444462 full_prompt = f"{ prompt } \n Format the extracted data using the below JSON format:\n { sample_format } "
445463
446464 # Create content for the user message
0 commit comments