Commit 5667d76

revert changes

1 parent ea2c82a commit 5667d76
1 file changed: +39 −29 lines changed

infra/scripts/index_scripts/create_search_index.py

Lines changed: 39 additions & 29 deletions
@@ -31,8 +31,7 @@
     DataLakeServiceClient,
     FileSystemClient,
 )
-from openai import AzureOpenAI
-from azure.storage.blob import BlobServiceClient
+from azure.ai.projects import AIProjectClient

 # Get Azure Key Vault Client
 key_vault_name = "kv_to-be-replaced" #'nc6262-kv-2fpeafsylfd2e'
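Note: this hunk is the core of the revert. Embeddings stop going through a direct AzureOpenAI client and instead flow through azure.ai.projects.AIProjectClient, while file access falls back to the DataLakeServiceClient already imported at the top of the file. A minimal sketch of the new client wiring, assuming DefaultAzureCredential and a placeholder endpoint (the real values come from Key Vault below):

# Sketch only: the endpoint is a placeholder, and DefaultAzureCredential is an
# assumption about how `credential` is constructed earlier in the script.
from azure.ai.projects import AIProjectClient
from azure.identity import DefaultAzureCredential

credential = DefaultAzureCredential()
project_client = AIProjectClient(
    endpoint="https://<project>.services.ai.azure.com/api/projects/<name>",  # placeholder
    credential=credential,
)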
@@ -62,6 +61,7 @@
 openai_api_version = secret_client.get_secret("AZURE-OPENAI-PREVIEW-API-VERSION").value
 openai_embedding_model = secret_client.get_secret("AZURE-OPENAI-EMBEDDING-MODEL").value
 account_name = secret_client.get_secret("ADLS-ACCOUNT-NAME").value
+ai_project_endpoint = secret_client.get_secret("AZURE-AI-AGENT-ENDPOINT").value

 # Create a search index
 index_client = SearchIndexClient(endpoint=search_endpoint, credential=credential)
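The script now requires the AZURE-AI-AGENT-ENDPOINT secret to exist in the vault; get_secret raises if it is missing. A defensive variant (a sketch, not part of this commit) could fail fast with a clearer message:

# Sketch: fail fast if the deployment has not populated the new secret.
from azure.core.exceptions import ResourceNotFoundError

try:
    ai_project_endpoint = secret_client.get_secret("AZURE-AI-AGENT-ENDPOINT").value
except ResourceNotFoundError:
    raise SystemExit("AZURE-AI-AGENT-ENDPOINT is missing from Key Vault")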
@@ -133,15 +133,22 @@


 # Function: Get Embeddings
-def get_embeddings(text: str, openai_api_base, openai_api_version, azure_token_provider):
+def get_embeddings(text: str, ai_project_endpoint, openai_api_version, credential):
     model_id = openai_embedding_model or "text-embedding-ada-002"
-    client = AzureOpenAI(
+
+    # Create AI Projects client
+    project_client = AIProjectClient(
+        endpoint=ai_project_endpoint,
+        credential=credential,
         api_version=openai_api_version,
-        azure_endpoint=openai_api_base,
-        azure_ad_token_provider=azure_token_provider,
+    )
+
+    # Get the OpenAI client from the AI Projects client
+    openai_client = project_client.get_openai_client(
+        api_version=openai_api_version
     )

-    embedding = client.embeddings.create(input=text, model=model_id).data[0].embedding
+    embedding = openai_client.embeddings.create(input=text, model=model_id).data[0].embedding

     return embedding

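As rewritten, the helper builds a fresh AIProjectClient on every call; hoisting the client to module scope would avoid that per-chunk setup cost. For reference, a hedged sketch of a call site (the input string is illustrative only):

# Illustrative call; text-embedding-ada-002 returns 1536-dimensional vectors.
vector = get_embeddings(
    "example transcript chunk",
    ai_project_endpoint,
    openai_api_version,
    credential,
)
print(len(vector))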
@@ -200,13 +207,16 @@ def chunk_data(text):
 # paths = os.listdir(path_name)


-account_url = f"https://{account_name}.blob.core.windows.net"
-blob_service_client = BlobServiceClient(account_url, credential=credential)
-container_client = blob_service_client.get_container_client(file_system_client_name)
+account_url = f"https://{account_name}.dfs.core.windows.net"

-print(f"Listing blobs under '{directory}' using BlobServiceClient...")
-paths = [blob.name for blob in container_client.list_blobs(name_starts_with=directory)]
+service_client = DataLakeServiceClient(
+    account_url, credential=credential, api_version="2023-01-03"
+)

+file_system_client = service_client.get_file_system_client(file_system_client_name)
+directory_name = directory
+paths = file_system_client.get_paths(path=directory_name)
+print(paths)

 search_client = SearchClient(search_endpoint, index_name, credential)
 # index_client = SearchIndexClient(endpoint=search_endpoint, credential=credential)
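get_paths returns a paged iterator of PathProperties objects, so print(paths) prints the iterator rather than the entries. The revert also drops the .endswith(".json") filter the Blob version applied; a defensive sketch (assuming the standard name and is_directory attributes) would filter before downloading:

# Sketch: skip directories and non-JSON entries up front.
json_paths = [
    p for p in file_system_client.get_paths(path=directory_name)
    if not p.is_directory and p.name.endswith(".json")
]
print([p.name for p in json_paths])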
@@ -219,22 +229,22 @@ def chunk_data(text):
 # Read the CSV file into a Pandas DataFrame
 file_path = csv_file_name
 print(file_path)
-blob_client = container_client.get_blob_client(file_path)
-download_stream = blob_client.download_blob()
-df_metadata = pd.read_csv(download_stream, encoding="utf-8")
+file_client = file_system_client.get_file_client(file_path)
+csv_file = file_client.download_file()
+df_metadata = pd.read_csv(csv_file, encoding="utf-8")

 docs = []
 counter = 0
-for blob_name in paths:
-    if not blob_name.endswith(".json"):
-        continue
-
-    blob_client = container_client.get_blob_client(blob_name)
-    download_stream = blob_client.download_blob()
-    data = json.loads(download_stream.readall())
-    text = data.get("Content", "")
-
-    filename = blob_name.split("/")[-1]
+for path in paths:
+    # file_path = f'Data/{foldername}/meeting_transcripts/' + path
+    # with open(file_path, "r") as file:
+    #     data = json.load(file)
+    file_client = file_system_client.get_file_client(path.name)
+    data_file = file_client.download_file()
+    data = json.load(data_file)
+    text = data["Content"]
+
+    filename = path.name.split("/")[-1]
     document_id = filename.replace(".json", "").replace("convo_", "")
     # print(document_id)
     df_file_metadata = df_metadata[
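json.load(data_file) assumes the object returned by download_file exposes a file-like read(). If the installed azure-storage-file-datalake version does not, an equivalent that relies only on readall() is:

# Equivalent without assuming a file-like read() on the downloader.
data = json.loads(file_client.download_file().readall())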
@@ -258,12 +268,12 @@ def chunk_data(text):

     try:
         v_contentVector = get_embeddings(
-            d["content"], openai_api_base, openai_api_version, token_provider
+            d["content"], ai_project_endpoint, openai_api_version, credential
         )
     except:
         time.sleep(30)
         v_contentVector = get_embeddings(
-            d["content"], openai_api_base, openai_api_version, token_provider
+            d["content"], ai_project_endpoint, openai_api_version, credential
         )

     docs.append(
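The bare except retries exactly once after a 30-second sleep, and it also catches unrelated errors such as KeyboardInterrupt. A bounded-retry variant of the same idea (attempt count and backoff are illustrative, not from the commit):

# Sketch: retry the embedding call a few times with growing backoff.
for attempt in range(3):
    try:
        v_contentVector = get_embeddings(
            d["content"], ai_project_endpoint, openai_api_version, credential
        )
        break
    except Exception:
        time.sleep(30 * (attempt + 1))
else:
    raise RuntimeError("embedding request failed after 3 attempts")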
@@ -274,15 +284,15 @@ def chunk_data(text):
             "chunk_id": d["chunk_id"],
             "client_id": d["client_id"],
             "content": d["content"],
-            "sourceurl": blob_name.split("/")[-1],
+            "sourceurl": path.name.split("/")[-1],
             "contentVector": v_contentVector,
         }
     )

     if counter % 10 == 0:
         result = search_client.upload_documents(documents=docs)
         docs = []
-        print(f"{counter} documents uploaded...")
+        print(f" {str(counter)} uploaded")

 # upload the last batch
 if docs != []:
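upload_documents returns one IndexingResult per document, and the script only prints a counter. An optional success check (a sketch, not in the commit) could surface per-document failures:

# Sketch: report any documents the index rejected.
result = search_client.upload_documents(documents=docs)
failed = [r.key for r in result if not r.succeeded]
if failed:
    print(f"failed to index: {failed}")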
