1111import os
1212from urllib .parse import urlparse
1313from botocore .exceptions import ClientError
14+ from idp_common .bedrock .client import BedrockClient
1415
1516# Set up logging
1617logger = logging .getLogger ()
1718logger .setLevel (os .environ .get ("LOG_LEVEL" , "INFO" ))
1819# Get LOG_LEVEL from environment variable with INFO as default
1920
def s3_object_exists(bucket, key):
    """Report whether an object is present in S3.

    Issues a HEAD request for ``key`` in ``bucket``.

    Args:
        bucket: Name of the S3 bucket to look in.
        key: Key of the object to probe.

    Returns:
        True when the HEAD request succeeds, False when S3 answers with
        error code ``'404'``.

    Raises:
        ClientError: For any S3 error other than ``'404'``; the original
            exception is re-raised unchanged.
    """
    try:
        boto3.client('s3').head_object(Bucket=bucket, Key=key)
    except ClientError as err:
        # Only a "not found" answer means the object is absent; anything
        # else is a genuine failure the caller must see.
        if err.response['Error']['Code'] != '404':
            raise
        return False
    else:
        return True
31+
def get_full_text(bucket, key):
    """Build the full text of a document from its per-page text objects.

    Reads the document's item from the DynamoDB table named by the
    ``TRACKING_TABLE_NAME`` environment variable (``PK = "doc#<key>"``,
    ``SK = "none"``), orders the entries of its ``Pages`` attribute by
    ``Id``, and downloads the S3 object referenced by each page's
    ``TextUri``.

    Args:
        bucket: Bucket used for a page whose ``TextUri`` is not an
            ``s3://`` URI. For ``s3://bucket/key`` URIs the bucket named
            in the URI is used.
        key: Document identifier; the tracking item is looked up under
            ``doc#<key>``.

    Returns:
        The page texts joined in ``Id`` order, each rendered as
        ``<page-number>{Id}</page-number>\\n{text}\\n\\n``. Pages without a
        ``TextUri`` and pages whose text cannot be loaded are left out;
        the result is ``""`` when there is nothing to include.

    Raises:
        Exception: If no tracking item exists for the document. Any other
            error raised while reading the tracking table is logged and
            re-raised unchanged.
    """
    try:
        dynamodb = boto3.resource('dynamodb')
        tracking_table = dynamodb.Table(os.environ['TRACKING_TABLE_NAME'])

        lookup = tracking_table.get_item(
            Key={'PK': f"doc#{key}", 'SK': 'none'}
        )

        if 'Item' not in lookup:
            logger.info(f"Document {key} not found")
            raise Exception(f"Document {key} not found")

        # A missing or null Pages attribute is treated as "no pages".
        pages = lookup['Item'].get('Pages') or []
        sorted_pages = sorted(pages, key=lambda entry: entry['Id'])

        s3 = boto3.client('s3')
        sections = []

        for page in sorted_pages:
            if 'TextUri' not in page:
                continue

            text_uri = page['TextUri']
            if text_uri.startswith('s3://'):
                # Split "s3://<bucket>/<key>" on the first "/" only, so the
                # key keeps any "/", "#" or "?" it contains and the page is
                # read from the bucket the URI actually names.
                page_bucket, _, text_key = text_uri[len('s3://'):].partition('/')
            else:
                # Not a URI: use the value as a key in the given bucket.
                page_bucket, text_key = bucket, text_uri

            try:
                page_object = s3.get_object(Bucket=page_bucket, Key=text_key)
                page_text = page_object['Body'].read().decode('utf-8')
                sections.append(
                    f"<page-number>{page['Id']}</page-number>\n{page_text}\n\n"
                )
            except Exception as e:
                # Best effort: one unreadable page must not abort the
                # whole document, so log it and move on.
                logger.warning(f"Failed to load page {page['Id']}: {e}")

        return "".join(sections)

    except Exception as e:
        logger.error(f"Error getting document pages: {str(e)}")
        raise
70+
71+
2072def get_summarization_model ():
2173 """Get the summarization model from configuration table"""
2274 try :
@@ -51,106 +103,79 @@ def handler(event, context):
51103 prompt = event ['arguments' ]['prompt' ]
52104 history = event ['arguments' ]['history' ]
53105
54- full_prompt = "You are an assistant that's responsible for getting details from document text attached here based on questions from the user.\n \n "
55- full_prompt += "If you don't know the answer, just say that you don't know. Don't try to make up an answer.\n \n "
56- full_prompt += "Additionally, use the user and assistant responses in the following JSON object to see what's been asked and what the resposes were in the past.\n \n "
57- # full_prompt += "Your response MUST be in the following JSON format: {'content': [{'text': 'String'}]}.\n\n"
58- # full_prompt += "You MUST NOT include outside of that JSON format.\n\n"
59- # full_prompt += "Do NOT include the role or anything else in the response."
60- full_prompt += "The history JSON object is: " + json .dumps (history ) + ".\n \n "
106+ full_prompt = "The history JSON object is: " + json .dumps (history ) + ".\n \n "
61107 full_prompt += "The user's question is: " + prompt + "\n \n "
62108
63109 # this feature is not enabled until the model can be selected on the chat screen
64110 # selectedModelId = event['arguments']['modelId']
65111 selectedModelId = get_summarization_model ()
66112
67113 logger .info (f"Processing S3 URI: { objectKey } " )
114+ logger .info (f"Region: { os .environ ['AWS_REGION' ]} " )
68115
69116 output_bucket = os .environ ['OUTPUT_BUCKET' ]
70117
71- bedrock_runtime = boto3 .client ('bedrock-runtime' , region_name = 'us-west-2' )
118+ bedrock_runtime = boto3 .client ('bedrock-runtime' , region_name = os . environ [ 'AWS_REGION' ] )
72119
73- # Call Bedrock Runtime to get Python code based on the prompt
74120 if (len (objectKey )):
75- # encoded_string = objectKey.encode()
76- # md5_hash = hashlib.md5(encoded_string)
77- # hex_representation = md5_hash.hexdigest()
78-
79- # full text key
80121 fulltext_key = objectKey + '/summary/fulltext.txt'
122+ content_str = ""
123+ s3 = boto3 .client ('s3' )
124+
125+ if not s3_object_exists (output_bucket , fulltext_key ):
126+ logger .info (f"Creating full text file: { fulltext_key } " )
127+ content_str = get_full_text (output_bucket , objectKey )
128+
129+ s3 .put_object (
130+ Bucket = output_bucket ,
131+ Key = fulltext_key ,
132+ Body = content_str .encode ('utf-8' )
133+ )
134+ else :
135+ # read full contents of the object as text
136+ response = s3 .get_object (Bucket = output_bucket , Key = fulltext_key )
137+ content_str = response ['Body' ].read ().decode ('utf-8' )
81138
82139 logger .info (f"Model: { selectedModelId } " )
83140 logger .info (f"Output Bucket: { output_bucket } " )
84141 logger .info (f"Full Text Key: { fulltext_key } " )
85142
86- # read full contents of the object as text
87- s3 = boto3 .client ('s3' )
88- response = s3 .get_object (Bucket = output_bucket , Key = fulltext_key )
89- content_str = response ['Body' ].read ().decode ('utf-8' )
90-
91- message = [
92- {
93- "role" :"user" ,
94- "content" : [
95- {
96- "text" : content_str
97- },
98- {
99- "cachePoint" : {
100- 'type' : 'default'
101- }
102- }
103- ]
104- },
143+ client = BedrockClient ()
144+ # Content with cachepoint tags
145+ content = [
105146 {
106- "role" :"user" ,
107- "content" : [
108- {
109- "text" : full_prompt
110- }
111- ]
147+ "text" : content_str + """
148+ <<CACHEPOINT>>
149+ """ + full_prompt
112150 }
113151 ]
114152
115- # print('invoking model converse')
116-
117- selectedModelId = 'us.amazon.nova-pro-v1:0'
118- response = bedrock_runtime .converse (
119- modelId = selectedModelId ,
120- messages = message
153+ model_response = client .invoke_model (
154+ model_id = "us.amazon.nova-pro-v1:0" ,
155+ system_prompt = "You are an assistant that's responsible for getting details from document text attached here based on questions from the user.\n \n If you don't know the answer, just say that you don't know. Don't try to make up an answer.\n \n Additionally, use the user and assistant responses in the following JSON object to see what's been asked and what the resposes were in the past.\n \n " ,
156+ content = content ,
157+ temperature = 0.0
121158 )
122159
123- token_usage = response ['usage' ]
124- # print(f"Input tokens: {token_usage['inputTokens']}")
125- # print(f"Output tokens: {token_usage['outputTokens']}")
126- # print(f"Total tokens: {token_usage['totalTokens']}")
127- # print(f"cacheReadInputTokens: {token_usage['cacheReadInputTokens']}")
128- # print(f"cacheWriteInputTokens: {token_usage['cacheWriteInputTokens']}")
129- # print(f"Stop reason: {response['stopReason']}")
130-
131-
132- output_message = response ['output' ]['message' ]
133- text_content = output_message ['content' ][0 ]['text' ]
134-
135- chat_response = {"cr" : {"content" : [{"text" : text_content }]}}
160+ text = client .extract_text_from_response (model_response )
161+
162+ chat_response = {"cr" : {"content" : [{"text" : text }]}}
136163 return json .dumps (chat_response )
137164
138-
139-
140165 except ClientError as e :
141166 error_code = e .response ['Error' ]['Code' ]
142167 error_message = e .response ['Error' ]['Message' ]
143- logger .error (f"S3 ClientError : { error_code } - { error_message } " )
168+ logger .error (f"Error : { error_code } - { error_message } " )
144169
145170 if error_code == 'NoSuchKey' :
146- raise Exception (f"File not found: { fulltext_key } . The chat feature will not work with files that were processed prior to v0.3.11. " )
171+ raise Exception (f"File not found: { fulltext_key } " )
147172 elif error_code == 'NoSuchBucket' :
148173 raise Exception (f"Bucket not found: { output_bucket } " )
149174 else :
150175 raise Exception (error_message )
151176
152177 except Exception as e :
153178 logger .error (f"Unexpected error: { str (e )} " )
154- raise Exception (f"Error fetching file : { str (e )} " )
179+ raise Exception (f"Unexpected error : { str (e )} " )
155180
156181 return response_data
0 commit comments