+import os
+import io
+import pdfplumber
+import streamlit as st
+from azure.storage.blob import BlobServiceClient
+from azure.core.credentials import AzureKeyCredential
+from azure.identity import DefaultAzureCredential
+from azure.search.documents import SearchClient
+from azure.search.documents.indexes import SearchIndexClient
+from azure.search.documents.indexes.models import (
+    SimpleField,
+    SearchFieldDataType,
+    VectorSearch,
+    SearchIndex,
+    SearchableField,
+    SearchField,
+    VectorSearchProfile,
+    HnswAlgorithmConfiguration
+)
+from dotenv import load_dotenv
 from openai import AzureOpenAI
-import os
-import streamlit as st
-from dotenv import load_dotenv
-from styling import global_page_style
-
-# load in .env variables
-load_dotenv()
-
-# Configure Azure OpenAI params, using an Azure OpenAI account with a deployment of an embedding model
-azure_endpoint: str = os.getenv('AZURE_OPENAI_BASE')
-azure_openai_api_key: str = os.getenv('AZURE_OPENAI_KEY')
-azure_openai_api_version: str = os.getenv('AZURE_OPENAI_VERSION')
-azure_ada_deployment: str = os.getenv('AZURE_EMBEDDINGS_DEPLOYMENT')
-azure_gpt_deployment: str = os.getenv('AZURE_GPT_DEPLOYMENT')
-
-# Configure Azure AI Search params
-search_endpoint: str = os.getenv('AZURE_SEARCH_ENDPOINT')
-search_key: str = os.getenv('AZURE_SEARCH_ADMIN_KEY')
-
-def chat_on_your_data(query, search_index, messages):
-    messages.append({"role": "user", "content":query})
+import tiktoken
+from styling import global_page_style
+
+# Load environment variables
+load_dotenv()
+
+# Configure Azure OpenAI parameters
+azure_endpoint = os.getenv('AZURE_OPENAI_BASE')
+azure_openai_api_key = os.getenv('AZURE_OPENAI_KEY')
+azure_openai_api_version = os.getenv('AZURE_OPENAI_VERSION')
+azure_ada_deployment = os.getenv('AZURE_EMBEDDINGS_DEPLOYMENT')
+azure_gpt_deployment = os.getenv('AZURE_GPT_DEPLOYMENT')
+
+# Configure Azure AI Search parameters
+search_endpoint = os.getenv('AZURE_SEARCH_ENDPOINT')
+search_key = os.getenv('AZURE_SEARCH_ADMIN_KEY')
+
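+# In addition to the variables above, later functions read AZURE_OPENAI_ENDPOINT,
+# BLOB_CONNECTION_STRING, and BLOB_CONTAINER_NAME (vectorize task) and
+# AZURE_SEARCH_INDEX (retrieve task) directly via os.getenv(), so the .env file
+# should define those keys as well.
+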
+def chat_on_your_data(query, search_index, messages):
+    """
+    Perform retrieval queries over documents from the Azure AI Search index.
+    """
+    messages.append({"role": "user", "content": query})
+
     with st.chat_message("user"):
-        st.markdown(query)
+        st.markdown(query)
+
     with st.spinner('Processing...'):
         client = AzureOpenAI(
             azure_endpoint=azure_endpoint,
@@ -31,8 +54,7 @@ def chat_on_your_data(query, search_index, messages):
         completion = client.chat.completions.create(
             model=azure_gpt_deployment,
             messages=[
-                {"role": "system", "content": "You are an AI assistant that helps people find information. \
-                 Ensure the Markdown responses are correctly formatted before responding."},
+                {"role": "system", "content": "You are an AI assistant that helps people find information. Ensure the Markdown responses are correctly formatted before responding."},
                 {"role": "user", "content": query}
             ],
             max_tokens=800,
@@ -46,7 +68,7 @@ def chat_on_your_data(query, search_index, messages):
4668 "data_sources" : [{
4769 "type" : "azure_search" ,
4870 "parameters" : {
49- "endpoint" : f" { search_endpoint } " ,
71+ "endpoint" : search_endpoint ,
5072 "index_name" : search_index ,
5173 "semantic_configuration" : "default" ,
5274 "query_type" : "vector_simple_hybrid" ,
@@ -55,52 +77,238 @@ def chat_on_your_data(query, search_index, messages):
5577 "role_information" : "You are an AI assistant that helps people find information." ,
5678 "filter" : None ,
5779 "strictness" : 3 ,
58- "top_n_documents" : 5 ,
80+ "top_n_documents" : 1 ,
5981 "authentication" : {
6082 "type" : "api_key" ,
61- "key" : f" { search_key } "
83+ "key" : search_key
6284 },
6385 "embedding_dependency" : {
6486 "type" : "deployment_name" ,
65- "deployment_name" : azure_ada_deployment
87+ "deployment_name" : azure_ada_deployment
6688 }
6789 }
6890 }]
6991 }
7092 )
71- print ( completion )
93+
7294 response_data = completion .to_dict ()
73- ai_response = response_data ['choices' ][0 ]['message' ]['content' ]
74- messages .append ({"role" : "assistant" , "content" :ai_response })
95+ ai_response = response_data ['choices' ][0 ]['message' ]['content' ]
96+ messages .append ({"role" : "assistant" , "content" : ai_response })
97+
7598 with st .chat_message ("assistant" ):
76- st .markdown (ai_response )
77-
78- def main ():
99+ st .markdown (ai_response )
100+
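+# The extra_body["data_sources"] block above follows the Azure OpenAI "On Your Data"
+# pattern: the service queries the given Azure AI Search index with a hybrid
+# vector + keyword search ("vector_simple_hybrid"), embeds the user query with the
+# deployment named under embedding_dependency, and grounds its answer on the
+# top_n_documents retrieved chunks.
+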
+def setup_azure_openai(log_text):
+    """
+    Sets up and returns the Azure OpenAI client.
+    """
+    log_text.write("Setting up Azure OpenAI...")
+    azure_openai = AzureOpenAI(
+        api_key=os.getenv("AZURE_OPENAI_KEY"),
+        api_version=os.getenv('AZURE_OPENAI_VERSION'),
+        azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT')
+    )
+    log_text.write("Azure OpenAI setup complete.")
+    return azure_openai
+
+def connect_to_blob_storage(log_text):
+    """
+    Connects to Azure Blob Storage and returns the container client.
+    """
+    log_text.write("Connecting to Blob Storage...")
+    blob_service_client = BlobServiceClient.from_connection_string(os.getenv("BLOB_CONNECTION_STRING"))
+    container_client = blob_service_client.get_container_client(os.getenv("BLOB_CONTAINER_NAME"))
+    log_text.write("Connected to Blob Storage.")
+    return container_client
+
+def split_text_with_metadata(text, metadata, max_length=800, overlap=75, encoding_name='cl100k_base'):
+    """
+    Splits the text into chunks with metadata.
+    """
+    tokenizer = tiktoken.get_encoding(encoding_name)
+    tokens = tokenizer.encode(text)
+    chunks = []
+    start = 0
+    end = max_length
+
+    while start < len(tokens):
+        chunk = tokens[start:end]
+        chunk_text = tokenizer.decode(chunk)
+        chunk_metadata = metadata.copy()
+        chunk_metadata.update({
+            'start_token': start,
+            'end_token': end,
+            'chunk_length': len(chunk),
+            'chunk_text_preview': chunk_text[:50] + '...'
+        })
+        chunks.append({
+            'text': chunk_text,
+            'metadata': chunk_metadata
+        })
+        start = end - overlap
+        end = start + max_length
+
+    return chunks
+
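+# Example of the chunking behavior above: with the defaults max_length=800 and
+# overlap=75, chunks cover tokens [0, 800), [725, 1525), [1450, 2250), ..., so each
+# chunk repeats the final 75 tokens of the previous chunk to preserve context across
+# chunk boundaries.
+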
+def load_blob_content(blob_client):
+    """
+    Loads and returns the text content of a PDF blob.
+    """
+    blob_name = blob_client.blob_name
+    if not blob_name.lower().endswith('.pdf'):
+        raise ValueError(f"Blob {blob_name} is not a PDF file.")
+
+    blob_data = blob_client.download_blob().readall()
+    pdf_stream = io.BytesIO(blob_data)
+    document_text = ""
+
+    with pdfplumber.open(pdf_stream) as pdf:
+        for page in pdf.pages:
+            # extract_text() returns None for pages with no extractable text
+            document_text += (page.extract_text() or "") + "\n"
+
+    return document_text
+
+def vectorize(log_text):
+    """
+    Orchestrates the vectorization workflow: chunk PDFs, generate embeddings, and index them.
+    """
+    azure_openai = setup_azure_openai(log_text)
+    container_client = connect_to_blob_storage(log_text)
+
+    # Read and chunk documents with metadata
+    log_text.write("Listing blobs in container...")
+    blob_list = container_client.list_blobs()
+    documents = []
+    for blob in blob_list:
+        if not blob.name.lower().endswith('.pdf'):
+            log_text.write(f"Skipping non-PDF blob: {blob.name}")
+            continue
+
+        log_text.write(f"Processing blob: {blob.name}")
+        blob_client = container_client.get_blob_client(blob)
+        try:
+            document = load_blob_content(blob_client)
+            metadata = {"blob_name": blob.name}
+            chunks = split_text_with_metadata(document, metadata)
+            documents.extend(chunks)
+        except Exception as e:
+            log_text.write(f"Failed to process blob {blob.name}: {e}")
+
+    log_text.write("Blobs processed and documents chunked.")
+
+    # Generate embeddings
+    log_text.write("Generating embeddings...")
+    embeddings = []
+    tokenizer = tiktoken.get_encoding("cl100k_base")
+    max_tokens = 8192
+    for i, doc in enumerate(documents):
+        log_text.write(f"Processing chunk {i + 1}/{len(documents)}")
+        log_text.write(f"Chunk text: {doc['text']}\n")
+        tokens = tokenizer.encode(doc["text"])
+        if len(tokens) > max_tokens:
+            log_text.write(f"Skipping document chunk {i + 1} with {len(tokens)} tokens, exceeding max limit of {max_tokens}.")
+            continue
+        response = azure_openai.embeddings.create(input=doc["text"], model=os.getenv("AZURE_EMBEDDINGS_DEPLOYMENT"))
+        embeddings.append({
+            "embedding": response.data[0].embedding,
+            "metadata": doc["metadata"]
+        })
+        log_text.write(f"Embeddings: {response.data[0].embedding}")
+
+    log_text.write("Embeddings generation complete.")
+
+    # Create Search Index
+    log_text.write("Creating search index...")
+    credential = AzureKeyCredential(os.getenv("AZURE_SEARCH_ADMIN_KEY"))
+    search_index_client = SearchIndexClient(endpoint=os.getenv("AZURE_SEARCH_ENDPOINT"), credential=credential)
+    fields = [
+        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
+        SearchableField(name="content", type=SearchFieldDataType.String),
+        SearchableField(name="blob_name", type=SearchFieldDataType.String),
+        SearchField(
+            name="embedding",
+            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
+            searchable=True,
+            vector_search_dimensions=1536,
+            vector_search_profile_name="myHnswProfile"
+        )
+    ]
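+    # Note: vector_search_dimensions=1536 assumes the embedding deployment behind
+    # AZURE_EMBEDDINGS_DEPLOYMENT is a 1536-dimensional model such as
+    # text-embedding-ada-002; a different model would need a matching dimension here.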
+    vector_search = VectorSearch(
+        algorithms=[
+            HnswAlgorithmConfiguration(name="myHnsw")
+        ],
+        profiles=[
+            VectorSearchProfile(
+                name="myHnswProfile",
+                algorithm_configuration_name="myHnsw"
+            )
+        ]
+    )
+    index = SearchIndex(name="documents-index", fields=fields, vector_search=vector_search)
+    search_index_client.create_index(index)
+    log_text.write("Search index created.")
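+    # Assumption: "documents-index" does not already exist; create_index() raises an
+    # error if it does. search_index_client.create_or_update_index(index) is an
+    # alternative when re-running the vectorize task against an existing index.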
+
+    # Upload chunks and embeddings to Azure AI Search
+    log_text.write("Uploading documents to search index...")
+    search_client = SearchClient(endpoint=os.getenv("AZURE_SEARCH_ENDPOINT"), index_name="documents-index", credential=credential)
+    documents_to_upload = []
+
+    for i, doc in enumerate(embeddings):
+        documents_to_upload.append({
+            "id": str(i),
+            "content": documents[i]["text"],
+            "embedding": doc["embedding"],
+            "blob_name": doc["metadata"]["blob_name"]
+        })
+    search_client.upload_documents(documents=documents_to_upload)
+    log_text.success("Documents uploaded to search index.")
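+    # All documents are sent in a single upload_documents() call here; the search
+    # service caps how many documents one batch may contain, so a very large corpus
+    # would likely need to be uploaded in smaller batches.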
+
+def main():
+    """
+    Main program execution function.
+    """
     st.markdown(
-        f'<div style="text-align: center;"><img src="{"https://upload.wikimedia.org/wikipedia/commons/4/44/Microsoft_logo.svg"}" width="{60}"></div>',
-        unsafe_allow_html=True
-    )
-    st.title("Demo - Azure OpenAI & AI Search")
-    # image = Image.open('image_logo2.png')
-    # st.image(image, caption = '')
-    st.write('This demo showcases an innovative way for users to engage with data housed in their Azure AI Search Index by leveraging both \
-        semantic and vector search techniques. Semantic search enhances the querying process by comprehending the meaning and context of \
-        user queries, thereby providing more pertinent results. Vector search, on the other hand, employs numerical representations of \
-        text to identify similar content using cosine similarity. ***For users to effectively utilize this demo, it is essential that they \
-        have previously created their Azure AI Search Index, following the necessary steps to upload and query their data as outlined [here](https://github.com/STRIDES/NIHCloudLabAzure/blob/main/notebooks/GenAI/Azure_Open_AI_README.md).***')
-    if 'messages' not in st.session_state:
-        st.session_state.messages = []
-    index_name = st.text_input(label="Azure AI Search index name:", value="")
-    st.write('-' * 50)
-    if index_name:
-        query = st.chat_input('Input search query here...')
+        f'<div style="text-align: center;"><img src="{"https://upload.wikimedia.org/wikipedia/commons/4/44/Microsoft_logo.svg"}" width="{60}"></div>',
+        unsafe_allow_html=True
+    )
+    st.title("Demo - Azure OpenAI & AI Search")
+
+    task = st.sidebar.radio(
+        'Choose a function below:',
+        ['Vectorize', 'Retrieve']
+    )
+
+    # Task for retrieving documents from Azure AI Search in the Streamlit UI
+    if task == 'Retrieve':
+        st.write('This demo showcases an innovative way for users to engage with data housed in their Azure AI Search Index by \
+            leveraging both semantic and vector search techniques. Semantic search enhances the querying process by comprehending \
+            the meaning and context of user queries, thereby providing more pertinent results. Vector search, on the other hand, employs \
+            numerical representations of text to identify similar content using cosine similarity. ***For users to effectively \
+            utilize this demo, it is essential that they have previously created their Azure AI Search Index by executing the \
+            "Vectorize" task.***')
+
+        if 'messages' not in st.session_state:
+            st.session_state.messages = []
+
+        index_name = os.getenv('AZURE_SEARCH_INDEX')
+
+        st.write('-' * 50)
+        query = st.chat_input('Input search query here...')
         for message in st.session_state.messages:
             with st.chat_message(message["role"]):
                 st.markdown(message['content'])
-        if query:
-            chat_on_your_data(query, index_name, st.session_state.messages)
-
-
-if __name__ == '__main__':
-    global_page_style()
-    main()
+        if query:
+            chat_on_your_data(query, index_name, st.session_state.messages)
+
+    # Task for embedding documents from Azure Blob Storage into the Azure AI Search index in the Streamlit UI
+    elif task == 'Vectorize':
+        st.write('This demo processes PDF files from Azure Blob Storage, generates embeddings, and uploads them to Azure AI Search for indexing. \
+            ***For users to effectively utilize this demo, it is essential that they upload PDF files from the \
+            "/search_documents" directory to an Azure Blob container. Instructions to do this can be found [here](https://learn.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-portal).***')
+        if st.button("Start Process"):
+            log_text = st.empty()
+            vectorize(log_text)
+
+if __name__ == '__main__':
+    global_page_style()
+    main()
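+
+# Launch the UI with Streamlit, e.g.: streamlit run <path_to_this_script>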