
Commit 11efb56

added ai search doc ingest, updated workshop example scripts
1 parent cc4b935 commit 11efb56

File tree

6 files changed: +348 −94 lines changed


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+
notebooks/GenAI/embedding_demos/p1.py
Lines changed: 266 additions & 58 deletions
@@ -1,27 +1,50 @@
+import os
+import io
+import pdfplumber
+import streamlit as st
+from azure.storage.blob import BlobServiceClient
+from azure.core.credentials import AzureKeyCredential
+from azure.identity import DefaultAzureCredential
+from azure.search.documents import SearchClient
+from azure.search.documents.indexes import SearchIndexClient
+from azure.search.documents.indexes.models import (
+    SimpleField,
+    SearchFieldDataType,
+    VectorSearch,
+    SearchIndex,
+    SearchableField,
+    SearchField,
+    VectorSearchProfile,
+    HnswAlgorithmConfiguration
+)
+from dotenv import load_dotenv
 from openai import AzureOpenAI
-import os
-import streamlit as st
-from dotenv import load_dotenv
-from styling import global_page_style
-
-# load in .env variables
-load_dotenv()
-
-# Configure Azure OpenAI params, using an Azure OpenAI account with a deployment of an embedding model
-azure_endpoint: str = os.getenv('AZURE_OPENAI_BASE')
-azure_openai_api_key: str = os.getenv('AZURE_OPENAI_KEY')
-azure_openai_api_version: str = os.getenv('AZURE_OPENAI_VERSION')
-azure_ada_deployment: str = os.getenv('AZURE_EMBEDDINGS_DEPLOYMENT')
-azure_gpt_deployment: str = os.getenv('AZURE_GPT_DEPLOYMENT')
-
-# Configure Azure AI Search params
-search_endpoint: str = os.getenv('AZURE_SEARCH_ENDPOINT')
-search_key: str = os.getenv('AZURE_SEARCH_ADMIN_KEY')
-
-def chat_on_your_data(query, search_index, messages):
-    messages.append({"role": "user", "content":query})
+import tiktoken
+from styling import global_page_style
+
+# Load environment variables
+load_dotenv()
+
+# Configure Azure OpenAI parameters
+azure_endpoint = os.getenv('AZURE_OPENAI_BASE')
+azure_openai_api_key = os.getenv('AZURE_OPENAI_KEY')
+azure_openai_api_version = os.getenv('AZURE_OPENAI_VERSION')
+azure_ada_deployment = os.getenv('AZURE_EMBEDDINGS_DEPLOYMENT')
+azure_gpt_deployment = os.getenv('AZURE_GPT_DEPLOYMENT')
+
+# Configure Azure AI Search parameters
+search_endpoint = os.getenv('AZURE_SEARCH_ENDPOINT')
+search_key = os.getenv('AZURE_SEARCH_ADMIN_KEY')
+
+def chat_on_your_data(query, search_index, messages):
+    """
+    Perform retrieval queries over documents from the Azure AI Search Index.
+    """
+    messages.append({"role": "user", "content": query})
+
     with st.chat_message("user"):
-        st.markdown(query)
+        st.markdown(query)
+
     with st.spinner('Processing...'):
         client = AzureOpenAI(
             azure_endpoint=azure_endpoint,
@@ -31,8 +54,7 @@ def chat_on_your_data(query, search_index, messages):
         completion = client.chat.completions.create(
             model=azure_gpt_deployment,
             messages=[
-                {"role": "system", "content": "You are an AI assistant that helps people find information. \
-                    Ensure the Markdown responses are correctly formatted before responding."},
+                {"role": "system", "content": "You are an AI assistant that helps people find information. Ensure the Markdown responses are correctly formatted before responding."},
                 {"role": "user", "content": query}
             ],
             max_tokens=800,
@@ -46,7 +68,7 @@ def chat_on_your_data(query, search_index, messages):
                 "data_sources": [{
                     "type": "azure_search",
                     "parameters": {
-                        "endpoint": f"{search_endpoint}",
+                        "endpoint": search_endpoint,
                         "index_name": search_index,
                         "semantic_configuration": "default",
                         "query_type": "vector_simple_hybrid",
@@ -55,52 +77,238 @@ def chat_on_your_data(query, search_index, messages):
                         "role_information": "You are an AI assistant that helps people find information.",
                         "filter": None,
                         "strictness": 3,
-                        "top_n_documents": 5,
+                        "top_n_documents": 1,
                         "authentication": {
                             "type": "api_key",
-                            "key": f"{search_key}"
+                            "key": search_key
                         },
                         "embedding_dependency": {
                             "type": "deployment_name",
-                            "deployment_name": azure_ada_deployment
+                            "deployment_name": azure_ada_deployment
                         }
                     }
                 }]
             }
         )
-        print(completion)
+
         response_data = completion.to_dict()
-        ai_response = response_data['choices'][0]['message']['content']
-        messages.append({"role": "assistant", "content":ai_response})
+        ai_response = response_data['choices'][0]['message']['content']
+        messages.append({"role": "assistant", "content": ai_response})
+
     with st.chat_message("assistant"):
-        st.markdown(ai_response)
-
-def main():
+        st.markdown(ai_response)
+
+def setup_azure_openai(log_text):
+    """
+    Sets up Azure OpenAI.
+    """
+    log_text.write("Setting up Azure OpenAI...")
+    azure_openai = AzureOpenAI(
+        api_key=os.getenv("Azure_OPENAI_KEY"),
+        api_version=os.getenv('AZURE_OPENAI_VERSION'),
+        azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT')
+    )
+    log_text.write("Azure OpenAI setup complete.")
+    return azure_openai
+
+def connect_to_blob_storage(log_text, container):
+    """
+    Connects to Azure Blob Storage.
+    """
+    log_text.write("Connecting to Blob Storage...")
+    blob_service_client = BlobServiceClient.from_connection_string(os.getenv("BLOB_CONNECTION_STRING"))
+    container_client = blob_service_client.get_container_client(os.getenv("BLOB_CONTAINER_NAME"))
+    log_text.write("Connected to Blob Storage.")
+    return container_client
+
+def split_text_with_metadata(text, metadata, max_length=800, overlap=75, encoding_name='cl100k_base'):
+    """
+    Splits the text into chunks with metadata.
+    """
+    tokenizer = tiktoken.get_encoding(encoding_name)
+    tokens = tokenizer.encode(text)
+    chunks = []
+    start = 0
+    end = max_length
+
+    while start < len(tokens):
+        chunk = tokens[start:end]
+        chunk_text = tokenizer.decode(chunk)
+        chunk_metadata = metadata.copy()
+        chunk_metadata.update({
+            'start_token': start,
+            'end_token': end,
+            'chunk_length': len(chunk),
+            'chunk_text_preview': chunk_text[:50] + '...'
+        })
+        chunks.append({
+            'text': chunk_text,
+            'metadata': chunk_metadata
+        })
+        start = end - overlap
+        end = start + max_length
+
+    return chunks
+
+def load_blob_content(blob_client):
+    """
+    Loads and returns the content of the PDF blob.
+    """
+    blob_name = blob_client.blob_name
+    if not blob_name.lower().endswith('.pdf'):
+        raise ValueError(f"Blob {blob_name} is not a PDF file.")
+
+    blob_data = blob_client.download_blob().readall()
+    pdf_stream = io.BytesIO(blob_data)
+    document_text = ""
+
+    with pdfplumber.open(pdf_stream) as pdf:
+        for page in pdf.pages:
+            document_text += page.extract_text() + "\n"
+
+    return document_text
+
+def vectorize(log_text):
+    """
+    Main function that orchestrates the vector workflow.
+    """
+    azure_openai = setup_azure_openai(log_text)
+    container_client = connect_to_blob_storage(log_text)
+
+    # Read and chunk documents with metadata
+    log_text.write("Listing blobs in container...")
+    blob_list = container_client.list_blobs()
+    documents = []
+    for blob in blob_list:
+        if not blob.name.lower().endswith('.pdf'):
+            log_text.write(f"Skipping non-PDF blob: {blob.name}")
+            continue
+
+        log_text.write(f"Processing blob: {blob.name}")
+        blob_client = container_client.get_blob_client(blob)
+        try:
+            document = load_blob_content(blob_client)
+            metadata = {"blob_name": blob.name}
+            chunks = split_text_with_metadata(document, metadata)
+            documents.extend(chunks)
+        except Exception as e:
+            log_text.write(f"Failed to process blob {blob.name}: {e}")
+
+    log_text.write("Blobs processed and documents chunked.")
+
+    # Generate embeddings
+    log_text.write("Generating embeddings...")
+    embeddings = []
+    tokenizer = tiktoken.get_encoding("cl100k_base")
+    max_tokens = 8192
+    for i, doc in enumerate(documents):
+        log_text.write(f"Processing chunk {i + 1}/{len(documents)}")
+        log_text.write(f"Chunk text: {doc['text']}\n")
+        tokens = tokenizer.encode(doc["text"])
+        if len(tokens) > max_tokens:
+            log_text.write(f"Skipping document chunk {i + 1} with {len(tokens)} tokens, exceeding max limit of {max_tokens}.")
+            continue
+        response = azure_openai.embeddings.create(input=doc["text"], model=os.getenv("AZURE_EMBEDDINGS_DEPLOYMENT"))
+        embeddings.append({
+            "embedding": response.data[0].embedding,
+            "metadata": doc["metadata"]
+        })
+        log_text.write(f"Embeddings: {response.data[0].embedding}")
+
+    log_text.write("Embeddings generation complete.")
+
+    # Create Search Index
+    log_text.write("Creating search index...")
+    credential = AzureKeyCredential(os.getenv("AZURE_SEARCH_ADMIN_KEY"))
+    search_index_client = SearchIndexClient(endpoint=os.getenv("AZURE_SEARCH_ENDPOINT"), credential=credential)
+    fields = [
+        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
+        SearchableField(name="content", type=SearchFieldDataType.String),
+        SearchableField(name="blob_name", type=SearchFieldDataType.String),
+        SearchField(
+            name="embedding",
+            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
+            searchable=True,
+            vector_search_dimensions=1536,
+            vector_search_profile_name="myHnswProfile"
+        )
+    ]
+    vector_search = VectorSearch(
+        algorithms=[
+            HnswAlgorithmConfiguration(name="myHnsw")
+        ],
+        profiles=[
+            VectorSearchProfile(
+                name="myHnswProfile",
+                algorithm_configuration_name="myHnsw"
+            )
+        ]
+    )
+    index = SearchIndex(name="documents-index", fields=fields, vector_search=vector_search)
+    search_index_client.create_index(index)
+    log_text.write("Search index created.")
+
+    # Upload chunks and embeddings to Azure AI Search
+    log_text.write("Uploading documents to search index...")
+    search_client = SearchClient(endpoint=os.getenv("AZURE_SEARCH_ENDPOINT"), index_name="documents-index", credential=credential)
+    documents_to_upload = []
+
+    for i, doc in enumerate(embeddings):
+        documents_to_upload.append({
+            "id": str(i),
+            "content": documents[i]["text"],
+            "embedding": doc["embedding"],
+            "blob_name": doc["metadata"]["blob_name"]
+        })
+    search_client.upload_documents(documents=documents_to_upload)
+    log_text.success("Documents uploaded to search index.")
+
+def main():
+    """
+    Main program execution function.
+    """
     st.markdown(
-        f'<div style="text-align: center;"><img src="{"https://upload.wikimedia.org/wikipedia/commons/4/44/Microsoft_logo.svg" }" width="{60}"></div>',
-        unsafe_allow_html=True
-    )
-    st.title("Demo - Azure OpenAI & AI Search")
-    # image = Image.open('image_logo2.png')
-    # st.image(image, caption = '')
-    st.write('This demo showcases an innovative way for users to engage with data housed in their Azure AI Search Index by leveraging both \
-        semantic and vector search techniques. Semantic search enhances the querying process by comprehending the meaning and context of \
-        user queries, thereby providing more pertinent results. Vector search, on the other hand, employs numerical representations of \
-        text to identify similar content using cosine similarity. ***For users to effectively utilize this demo, it is essential that they \
-        have previously created their Azure AI Search Index, following the necessary steps to upload and query their data as outlined [here](https://github.com/STRIDES/NIHCloudLabAzure/blob/main/notebooks/GenAI/Azure_Open_AI_README.md).***')
-    if 'messages' not in st.session_state:
-        st.session_state.messages = []
-    index_name = st.text_input(label="Azure AI Search index name:", value="")
-    st.write('-'*50)
-    if index_name:
-        query = st.chat_input('Input search query here...')
+        f'<div style="text-align: center;"><img src="{"https://upload.wikimedia.org/wikipedia/commons/4/44/Microsoft_logo.svg" }" width="{60}"></div>',
+        unsafe_allow_html=True
+    )
+    st.title("Demo - Azure OpenAI & AI Search")
+
+    task = st.sidebar.radio(
+        'Choose a function below:',
+        ['Vectorize', 'Retrieve']
+    )
+
+    # Task for retrieving documents from Azure AI Search in Streamlit UI
+    if task == 'Retrieve':
+        st.write('This demo showcases an innovative way for users to engage with data housed in their Azure AI Search Index by \
+            leveraging both semantic and vector search techniques. Semantic search enhances the querying process by comprehending \
+            the meaning and context of user queries, thereby providing more pertinent results. Vector search, on the other hand, employs \
+            numerical representations of text to identify similar content using cosine similarity. ***For users to effectively \
+            utilize this demo, it is essential that they have previously created their Azure AI Search Index, by executing the \
+            "vectorize" task.***')
+
+        if 'messages' not in st.session_state:
+            st.session_state.messages = []
+
+        index_name = os.getenv('AZURE_SEARCH_INDEX')
+
+        st.write('-'*50)
+        query = st.chat_input('Input search query here...')
         for message in st.session_state.messages:
             with st.chat_message(message["role"]):
                 st.markdown(message['content'])
-    if query:
-        chat_on_your_data(query, index_name, st.session_state.messages)
-
-
-if __name__ == '__main__':
-    global_page_style()
-    main()
+        if query:
+            chat_on_your_data(query, index_name, st.session_state.messages)
+
+    # Task for embedding documents from Azure Blob to Azure AI Search index in Streamlit UI
+    elif task == 'Vectorize':
+        st.write('This demo processes PDF files from Azure Blob Storage, generates embeddings, and uploads them to Azure AI Search for indexing. \
+            ***For users to effectively utilize this demo, it is essential that they uploade PDF files from the \
+            "/search_documents" directory to Azure Blob container. Instructions to do this can be found [here](https://learn.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-portal).***')
+        if st.button("Start Process"):
+            log_text = st.empty()
+            vectorize(log_text)
+
+if __name__ == '__main__':
+    global_page_style()
+    main()
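
The updated p1.py pulls all of its configuration from environment variables loaded with python-dotenv. Below is a minimal pre-flight sketch that checks the variables referenced by the os.getenv() calls in this diff before launching the Streamlit app. The helper itself (check_env.py) is not part of the commit; the variable list is taken directly from the code above, including the differently cased Azure_OPENAI_KEY and the separate AZURE_OPENAI_ENDPOINT read by setup_azure_openai().

# check_env.py -- hypothetical helper, not part of this commit.
# Verifies that the variables p1.py reads via os.getenv() are present.
import os
from dotenv import load_dotenv

REQUIRED_VARS = [
    "AZURE_OPENAI_BASE", "AZURE_OPENAI_KEY", "AZURE_OPENAI_VERSION",
    "AZURE_EMBEDDINGS_DEPLOYMENT", "AZURE_GPT_DEPLOYMENT",
    "AZURE_SEARCH_ENDPOINT", "AZURE_SEARCH_ADMIN_KEY", "AZURE_SEARCH_INDEX",
    "BLOB_CONNECTION_STRING", "BLOB_CONTAINER_NAME",
    # Read only by the new Vectorize path (setup_azure_openai):
    "AZURE_OPENAI_ENDPOINT", "Azure_OPENAI_KEY",
]

def check_env():
    load_dotenv()  # populate os.environ from a local .env file
    missing = [name for name in REQUIRED_VARS if not os.getenv(name)]
    if missing:
        raise SystemExit("Missing environment variables: " + ", ".join(missing))
    print("All required environment variables are set.")

if __name__ == "__main__":
    check_env()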

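The new split_text_with_metadata() helper chunks text by tokens rather than characters: it encodes the document with tiktoken's cl100k_base encoding and slides an 800-token window forward 725 tokens at a time, so consecutive chunks share a 75-token overlap. The sketch below isolates just that windowing logic, with made-up sample text and deliberately small window sizes so the overlap is easy to see; it is an illustration, not code from the commit.

# chunk_demo.py -- illustrative only; mirrors the token windowing used by
# split_text_with_metadata() in p1.py (defaults there: max_length=800, overlap=75).
import tiktoken

def token_windows(text, max_length=20, overlap=5, encoding_name="cl100k_base"):
    """Yield (start, end, chunk_text) tuples over overlapping token windows."""
    tokenizer = tiktoken.get_encoding(encoding_name)
    tokens = tokenizer.encode(text)
    start, end = 0, max_length
    while start < len(tokens):
        chunk = tokens[start:end]
        yield start, end, tokenizer.decode(chunk)
        start = end - overlap        # step back by the overlap...
        end = start + max_length     # ...then advance one full window

sample = "Azure AI Search ingestion demo text. " * 20  # made-up input
for start, end, chunk in token_windows(sample):
    print(f"tokens {start:>3}-{end:<3}: {chunk[:40]!r}...")

Each yielded window corresponds to one chunk that the committed vectorize() flow then embeds and uploads to the documents-index.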