Skip to content

Commit 7c705a7

Browse files
scripts update test v7
1 parent 15f3c1d commit 7c705a7

File tree

2 files changed

+56
-55
lines changed

2 files changed

+56
-55
lines changed

infra/scripts/index_scripts/03_cu_process_data_text_manual.py

Lines changed: 38 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from azure.search.documents import SearchClient
1111
from azure.search.documents.indexes import SearchIndexClient
1212
from azure.storage.filedatalake import DataLakeServiceClient
13-
from openai import AzureOpenAI
13+
from azure.ai.inference import ChatCompletionsClient, EmbeddingsClient
1414
from content_understanding_client import AzureContentUnderstandingClient
1515
from azure_credential_utils import get_azure_credential
1616

@@ -78,20 +78,39 @@ def get_secrets_from_kv(kv_name, secret_name):
7878
ANALYZER_ID = "ckm-json"
7979
print("Content Understanding client initialized.")
8080

81+
82+
# ---------- Azure AI Foundry (Inference) clients (Managed Identity) ----------
83+
# For Azure OpenAI endpoints, the Inference SDK needs the deployment path in the endpoint URL, plus an explicit api_version and credential scopes.
84+
# Chat deployment name (already read from Key Vault as `deployment`).
85+
chat_endpoint = f"{openai_api_base}/openai/deployments/{deployment}"
86+
chat_client = ChatCompletionsClient(
87+
endpoint=chat_endpoint,
88+
credential=credential,
89+
credential_scopes=["https://cognitiveservices.azure.com/.default"],
90+
api_version=openai_api_version,
91+
)
92+
# Embedding deployment name; assumes the deployment is named as below — change it if yours differs.
93+
embedding_deployment = "text-embedding-ada-002"
94+
embeddings_endpoint = f"{openai_api_base}/openai/deployments/{embedding_deployment}"
95+
embeddings_client = EmbeddingsClient(
96+
endpoint=embeddings_endpoint,
97+
credential=credential,
98+
credential_scopes=["https://cognitiveservices.azure.com/.default"],
99+
api_version=openai_api_version,
100+
)
101+
# -----------------------------------------------------------------------------
102+
103+
81104
# Utility functions
82-
def get_embeddings(text: str, openai_api_base, openai_api_version):
83-
model_id = "text-embedding-ada-002"
84-
token_provider = get_bearer_token_provider(
85-
get_azure_credential(client_id=MANAGED_IDENTITY_CLIENT_ID),
86-
"https://cognitiveservices.azure.com/.default"
87-
)
88-
client = AzureOpenAI(
89-
api_version=openai_api_version,
90-
azure_endpoint=openai_api_base,
91-
azure_ad_token_provider=token_provider
92-
)
93-
embedding = client.embeddings.create(input=text, model=model_id).data[0].embedding
94-
return embedding
105+
def get_embeddings(text: str):
106+
# Uses Azure AI Inference EmbeddingsClient; returns the vector for `text`.
107+
# NOTE: the client's endpoint already includes the Azure OpenAI deployment name, so no model is passed to embed().
108+
try:
109+
resp = embeddings_client.embed(input=[text])
110+
return resp.data[0].embedding
111+
except Exception as e:
112+
print(f"Error getting embeddings: {e}")
113+
raise
95114

96115
# Function: Clean Spaces with Regex -
97116
def clean_spaces_with_regex(text):
@@ -281,8 +300,8 @@ def call_gpt4(topics_str1, client):
281300
Return the topics and their labels in JSON format.Always add 'topics' node and 'label', 'description' attributes in json.
282301
Do not return anything else.
283302
"""
284-
response = client.chat.completions.create(
285-
model=deployment,
303+
# Azure AI Inference chat client: no `model` argument is passed; the deployment is expected to be part of the client's endpoint.
304+
response = client.complete(
286305
messages=[
287306
{"role": "system", "content": "You are a helpful assistant."},
288307
{"role": "user", "content": topic_prompt},
@@ -292,18 +311,9 @@ def call_gpt4(topics_str1, client):
292311
res = response.choices[0].message.content
293312
return json.loads(res.replace("```json", '').replace("```", ''))
294313

295-
token_provider = get_bearer_token_provider(
296-
get_azure_credential(client_id=MANAGED_IDENTITY_CLIENT_ID),
297-
"https://cognitiveservices.azure.com/.default"
298-
)
299-
openai_client = AzureOpenAI(
300-
azure_endpoint=openai_api_base,
301-
azure_ad_token_provider=token_provider,
302-
api_version=openai_api_version,
303-
)
304314
max_tokens = 3096
305315

306-
res = call_gpt4(topics_str, openai_client)
316+
res = call_gpt4(topics_str, chat_client)
307317
for object1 in res['topics']:
308318
cursor.execute("INSERT INTO km_mined_topics (label, description) VALUES (?,?)", (object1['label'], object1['description']))
309319
conn.commit()
@@ -321,8 +331,7 @@ def get_mined_topic_mapping(input_text, list_of_topics):
321331
prompt = f'''You are a data analysis assistant to help find the closest topic for a given text {input_text}
322332
from a list of topics - {list_of_topics}.
323333
ALWAYS only return a topic from list - {list_of_topics}. Do not add any other text.'''
324-
response = openai_client.chat.completions.create(
325-
model=deployment,
334+
response = chat_client.complete(
326335
messages=[
327336
{"role": "system", "content": "You are a helpful assistant."},
328337
{"role": "user", "content": prompt},
@@ -396,4 +405,4 @@ def get_mined_topic_mapping(input_text, list_of_topics):
396405

397406
cursor.close()
398407
conn.close()
399-
print("All steps completed. Connection closed.")
408+
print("All steps completed. Connection closed.")

infra/scripts/index_scripts/04_cu_process_data_new_data.py

Lines changed: 18 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,10 @@
1010
from azure.search.documents import SearchClient
1111
from azure.search.documents.indexes import SearchIndexClient
1212
from azure.storage.filedatalake import DataLakeServiceClient
13-
from openai import AzureOpenAI
13+
# --- REPLACED ---
14+
# from openai import AzureOpenAI
15+
from azure.ai.inference import ChatCompletionsClient, EmbeddingsClient
16+
# ----------------
1417
from content_understanding_client import AzureContentUnderstandingClient
1518
from azure_credential_utils import get_azure_credential
1619
from azure.search.documents.indexes.models import (
@@ -168,19 +171,17 @@ def create_search_index():
168171

169172
# Utility functions
170173
def get_embeddings(text: str, openai_api_base, openai_api_version):
171-
model_id = "text-embedding-ada-002"
172-
token_provider = get_bearer_token_provider(
173-
get_azure_credential(client_id=MANAGED_IDENTITY_CLIENT_ID),
174-
"https://cognitiveservices.azure.com/.default"
174+
embeddings_endpoint = f"{openai_api_base}/openai/deployments/{embedding_model}"
175+
embeddings_client = EmbeddingsClient(
176+
endpoint=embeddings_endpoint,
177+
credential=credential,
178+
credential_scopes=["https://cognitiveservices.azure.com/.default"],
179+
api_version=openai_api_version
175180
)
176-
client = AzureOpenAI(
177-
api_version=openai_api_version,
178-
azure_endpoint=openai_api_base,
179-
azure_ad_token_provider=token_provider
180-
)
181-
embedding = client.embeddings.create(input=text, model=model_id).data[0].embedding
182-
return embedding
183-
181+
response = embeddings_client.embed(input=[text])
182+
return response.data[0].embedding
183+
# --------------------------------------------------------------------------
184+
184185
def clean_spaces_with_regex(text):
185186
cleaned_text = re.sub(r'\s+', ' ', text)
186187
cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text)
@@ -391,9 +392,8 @@ def call_gpt4(topics_str1, client):
391392
Ensure that the topics and labels are accurate, relevant, and easy to understand.
392393
Return the topics and their labels in JSON format.Always add 'topics' node and 'label', 'description' attributes in json.
393394
Do not return anything else.
394-
"""
395-
response = client.chat.completions.create(
396-
model=deployment,
395+
"""
396+
response = client.complete(
397397
messages=[
398398
{"role": "system", "content": "You are a helpful assistant."},
399399
{"role": "user", "content": topic_prompt},
@@ -403,18 +403,10 @@ def call_gpt4(topics_str1, client):
403403
res = response.choices[0].message.content
404404
return json.loads(res.replace("```json", '').replace("```", ''))
405405

406-
token_provider = get_bearer_token_provider(
407-
get_azure_credential(client_id=MANAGED_IDENTITY_CLIENT_ID),
408-
"https://cognitiveservices.azure.com/.default"
409-
)
410-
openai_client = AzureOpenAI(
411-
azure_endpoint=openai_api_base,
412-
azure_ad_token_provider=token_provider,
413-
api_version=openai_api_version,
414-
)
415406
max_tokens = 3096
407+
# Build the chat client for this script. No definition of `chat_client` is
# visible in this change, although ChatCompletionsClient is imported above.
# The endpoint carries the deployment name, so call_gpt4 passes no `model`.
chat_client = ChatCompletionsClient(
    endpoint=f"{openai_api_base}/openai/deployments/{deployment}",
    credential=get_azure_credential(client_id=MANAGED_IDENTITY_CLIENT_ID),
    credential_scopes=["https://cognitiveservices.azure.com/.default"],
    api_version=openai_api_version,
)
# Pass the mined topics string, as the pre-migration code did. An empty
# join would ask the model to label nothing.
res = call_gpt4(topics_str, chat_client)
408+
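# NOTE(review): placeholder left by the migration; the topic mining and mapping logic below is expected to be unchanged. Confirm nothing was elided here.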
416409

417-
res = call_gpt4(topics_str, openai_client)
418410
for object1 in res['topics']:
419411
cursor.execute("INSERT INTO km_mined_topics (label, description) VALUES (?,?)", (object1['label'], object1['description']))
420412
conn.commit()

0 commit comments

Comments
 (0)