diff --git a/app/backend/app.py b/app/backend/app.py
index 1b4563bb98..a4545d5abd 100644
--- a/app/backend/app.py
+++ b/app/backend/app.py
@@ -56,6 +56,8 @@
from approaches.promptmanager import PromptyManager
from approaches.retrievethenread import RetrieveThenReadApproach
from approaches.retrievethenreadvision import RetrieveThenReadVisionApproach
+from approaches.orchestrator_approach import OrchestratorApproach
+from approaches.domain_classifier import DomainClassifier
from chat_history.cosmosdb import chat_history_cosmosdb_bp
from config import (
CONFIG_AGENT_CLIENT,
@@ -793,6 +795,113 @@ async def setup_clients():
prompt_manager=prompt_manager,
)
+ # Environment variables for index names
+ AZURE_SEARCH_COSMIC_INDEX = os.getenv("AZURE_SEARCH_COSMIC_INDEX", "cosmic-index")
+ AZURE_SEARCH_SUBSTRATE_INDEX = os.getenv("AZURE_SEARCH_SUBSTRATE_INDEX", "substrate-index")
+ AZURE_SEARCH_CLASSIFIER_INDEX = os.getenv("AZURE_SEARCH_CLASSIFIER_INDEX", "domain-classifier-index")
+ print(f"š App will use these indexes:")
+ print(f" Cosmic: {AZURE_SEARCH_COSMIC_INDEX}")
+ print(f" Substrate: {AZURE_SEARCH_SUBSTRATE_INDEX}")
+ print(f" Classifier: {AZURE_SEARCH_CLASSIFIER_INDEX}")
+
+ # Set up domain-specific search clients
+ cosmic_search_client = SearchClient(
+ endpoint=AZURE_SEARCH_ENDPOINT,
+ index_name=AZURE_SEARCH_COSMIC_INDEX,
+ credential=azure_credential
+ )
+
+ substrate_search_client = SearchClient(
+ endpoint=AZURE_SEARCH_ENDPOINT,
+ index_name=AZURE_SEARCH_SUBSTRATE_INDEX,
+ credential=azure_credential
+ )
+
+ # Set up domain classifier
+ classifier_search_client = SearchClient(
+ endpoint=AZURE_SEARCH_ENDPOINT,
+ index_name=AZURE_SEARCH_CLASSIFIER_INDEX,
+ credential=azure_credential
+ )
+
+ # Set up the embeddings service (used by the domain classifier for vector search)
+ openai_embeddings_service = setup_embeddings_service(
+ azure_credential=azure_credential,
+ openai_host=OPENAI_HOST,
+ openai_model_name=OPENAI_EMB_MODEL,
+ openai_service=AZURE_OPENAI_SERVICE,
+ openai_custom_url=AZURE_OPENAI_CUSTOM_URL,
+ openai_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
+ openai_dimensions=OPENAI_EMB_DIMENSIONS,
+ openai_api_version=AZURE_OPENAI_API_VERSION,
+ openai_key=clean_key_if_exists(OPENAI_API_KEY),
+ openai_org=OPENAI_ORGANIZATION,
+ disable_vectors=os.getenv("USE_VECTORS", "").lower() == "false",
+ )
+
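+ # The classifier combines vector search over the classifier index with an LLM call to decide the domain(s)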
+ domain_classifier = DomainClassifier(
+ search_client=classifier_search_client,
+ openai_client=openai_client,
+ embeddings_service=openai_embeddings_service,
+ chatgpt_deployment=AZURE_OPENAI_CHATGPT_DEPLOYMENT
+ )
+
+ # Create domain-specific approaches WITHOUT agentic retrieval for now
+ cosmic_approach = ChatReadRetrieveReadApproach(
+ search_client=cosmic_search_client,
+ search_index_name=AZURE_SEARCH_COSMIC_INDEX,
+ agent_model=None, # Disable agentic retrieval
+ agent_deployment=None, # Disable agentic retrieval
+ agent_client=None, # Disable agentic retrieval
+ openai_client=openai_client,
+ auth_helper=auth_helper,
+ chatgpt_model=OPENAI_CHATGPT_MODEL,
+ chatgpt_deployment=AZURE_OPENAI_CHATGPT_DEPLOYMENT,
+ embedding_model=OPENAI_EMB_MODEL,
+ embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
+ embedding_dimensions=OPENAI_EMB_DIMENSIONS,
+ embedding_field=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
+ sourcepage_field=KB_FIELDS_SOURCEPAGE,
+ content_field=KB_FIELDS_CONTENT,
+ query_language=AZURE_SEARCH_QUERY_LANGUAGE,
+ query_speller=AZURE_SEARCH_QUERY_SPELLER,
+ prompt_manager=prompt_manager,
+ reasoning_effort=OPENAI_REASONING_EFFORT,
+ )
+
+ substrate_approach = ChatReadRetrieveReadApproach(
+ search_client=substrate_search_client,
+ search_index_name=AZURE_SEARCH_SUBSTRATE_INDEX,
+ agent_model=None, # Disable agentic retrieval
+ agent_deployment=None, # Disable agentic retrieval
+ agent_client=None, # Disable agentic retrieval
+ openai_client=openai_client,
+ auth_helper=auth_helper,
+ chatgpt_model=OPENAI_CHATGPT_MODEL,
+ chatgpt_deployment=AZURE_OPENAI_CHATGPT_DEPLOYMENT,
+ embedding_model=OPENAI_EMB_MODEL,
+ embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
+ embedding_dimensions=OPENAI_EMB_DIMENSIONS,
+ embedding_field=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
+ sourcepage_field=KB_FIELDS_SOURCEPAGE,
+ content_field=KB_FIELDS_CONTENT,
+ query_language=AZURE_SEARCH_QUERY_LANGUAGE,
+ query_speller=AZURE_SEARCH_QUERY_SPELLER,
+ prompt_manager=prompt_manager,
+ reasoning_effort=OPENAI_REASONING_EFFORT,
+ )
+
+ # Create orchestrator with classifier
+ orchestrator = OrchestratorApproach(
+ cosmic_approach=cosmic_approach,
+ substrate_approach=substrate_approach,
+ domain_classifier=domain_classifier,
+ openai_client=openai_client,
+ prompt_manager=prompt_manager
+ )
+
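+ # Register the orchestrator as the chat approach so incoming chat requests are routed by domain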
+ current_app.config[CONFIG_CHAT_APPROACH] = orchestrator
+
@bp.after_app_serving
async def close_clients():
diff --git a/app/backend/approaches/chatreadretrieveread.py b/app/backend/approaches/chatreadretrieveread.py
index ed87976e3b..389d24bc45 100644
--- a/app/backend/approaches/chatreadretrieveread.py
+++ b/app/backend/approaches/chatreadretrieveread.py
@@ -1,5 +1,5 @@
from collections.abc import Awaitable
-from typing import Any, Optional, Union, cast
+from typing import Any, AsyncGenerator, List, Optional, Union, cast
from azure.search.documents.agent.aio import KnowledgeAgentRetrievalClient
from azure.search.documents.aio import SearchClient
@@ -30,9 +30,11 @@ def __init__(
*,
search_client: SearchClient,
search_index_name: str,
+ cosmic_index_name: Optional[str] = None, # Domain-specific index names; default to search_index_name
+ substrate_index_name: Optional[str] = None,
agent_model: Optional[str],
agent_deployment: Optional[str],
- agent_client: KnowledgeAgentRetrievalClient,
+ agent_client: Optional[KnowledgeAgentRetrievalClient],
auth_helper: AuthenticationHelper,
openai_client: AsyncOpenAI,
chatgpt_model: str,
@@ -47,30 +49,45 @@ def __init__(
query_speller: str,
prompt_manager: PromptManager,
reasoning_effort: Optional[str] = None,
+ domain_classifier: Optional[Any] = None, # Optional classifier used for domain-aware routing
+ openai_host: str = "",
+ vision_endpoint: str = "",
+ vision_token_provider: Optional[Any] = None,
):
- self.search_client = search_client
- self.search_index_name = search_index_name
- self.agent_model = agent_model
- self.agent_deployment = agent_deployment
- self.agent_client = agent_client
- self.openai_client = openai_client
- self.auth_helper = auth_helper
+ # Call parent class __init__ with the correct parameters
+ super().__init__(
+ search_client=search_client,
+ openai_client=openai_client,
+ auth_helper=auth_helper,
+ query_language=query_language,
+ query_speller=query_speller,
+ embedding_deployment=embedding_deployment,
+ embedding_model=embedding_model,
+ embedding_dimensions=embedding_dimensions,
+ embedding_field=embedding_field,
+ openai_host=openai_host,
+ vision_endpoint=vision_endpoint,
+ vision_token_provider=vision_token_provider or (lambda: ""),
+ prompt_manager=prompt_manager,
+ reasoning_effort=reasoning_effort,
+ )
+
+ # Set additional attributes specific to this class
+ self.search_index_name = search_index_name # Default index when no domain is classified
+ self.cosmic_index_name = cosmic_index_name or search_index_name
+ self.substrate_index_name = substrate_index_name or search_index_name
self.chatgpt_model = chatgpt_model
self.chatgpt_deployment = chatgpt_deployment
- self.embedding_deployment = embedding_deployment
- self.embedding_model = embedding_model
- self.embedding_dimensions = embedding_dimensions
- self.embedding_field = embedding_field
self.sourcepage_field = sourcepage_field
self.content_field = content_field
- self.query_language = query_language
- self.query_speller = query_speller
- self.prompt_manager = prompt_manager
- self.query_rewrite_prompt = self.prompt_manager.load_prompt("chat_query_rewrite.prompty")
- self.query_rewrite_tools = self.prompt_manager.load_tools("chat_query_rewrite_tools.json")
- self.answer_prompt = self.prompt_manager.load_prompt("chat_answer_question.prompty")
- self.reasoning_effort = reasoning_effort
+ self.agent_model = agent_model
+ self.agent_deployment = agent_deployment
+ self.agent_client = agent_client
+ self.query_rewrite_prompt = prompt_manager.load_prompt("chat_query_rewrite.prompty")
+ self.query_rewrite_tools = prompt_manager.load_tools("chat_query_rewrite_tools.json")
+ self.answer_prompt = prompt_manager.load_prompt("chat_answer_question.prompty")
self.include_token_usage = True
+ self.domain_classifier = domain_classifier
async def run_until_final_call(
self,
@@ -82,26 +99,104 @@ async def run_until_final_call(
use_agentic_retrieval = True if overrides.get("use_agentic_retrieval") else False
original_user_query = messages[-1]["content"]
+ # Get domain classification if classifier is available
+ domain_info = None
+ domain_message = ""
+ if self.domain_classifier is not None:
+ # Extract history from messages for classification
+ history = []
+ for msg in messages[:-1]:
+ if msg.get("role") and msg.get("content"):
+ history.append({"role": msg["role"], "content": msg["content"]})
+
+ domains, confidence, reasoning = await self.domain_classifier.classify_with_context(
+ original_user_query,
+ history
+ )
+
+ domain_message = self._format_domain_message(domains, confidence, reasoning)
+ domain_info = {
+ "domains": domains,
+ "confidence": confidence,
+ "reasoning": reasoning,
+ "message": domain_message
+ }
+
+ # Store domain info in overrides so search approaches can use it
+ overrides["domain_classification"] = domain_info
+
reasoning_model_support = self.GPT_REASONING_MODELS.get(self.chatgpt_model)
if reasoning_model_support and (not reasoning_model_support.streaming and should_stream):
raise Exception(
f"{self.chatgpt_model} does not support streaming. Please use a different model or disable streaming."
)
- if use_agentic_retrieval:
+
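+ # Fall back to the classic search path when no agent client is configured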
+ if use_agentic_retrieval and self.agent_client is not None:
extra_info = await self.run_agentic_retrieval_approach(messages, overrides, auth_claims)
else:
extra_info = await self.run_search_approach(messages, overrides, auth_claims)
+ # Modify the prompt to include domain context
+ prompt_variables = self.get_system_prompt_variables(overrides.get("prompt_template")) | {
+ "include_follow_up_questions": bool(overrides.get("suggest_followup_questions")),
+ "past_messages": messages[:-1],
+ "user_query": original_user_query,
+ "text_sources": extra_info.data_points.text,
+ }
+
+ # Add domain-specific variables
+ if domain_message:
+ prompt_variables["domain_prefix"] = domain_message + "\n\n"
+ else:
+ prompt_variables["domain_prefix"] = ""
+
+ # If domain is ambiguous, add context for structured response
+ if domain_info and len(domain_info["domains"]) > 1:
+ # Make the domain context more explicit and forceful
+ domains_list = domain_info["domains"]
+ domain_context = (
+ f"The user's question relates to multiple domains: {', '.join(domains_list)}. "
+ f"YOU MUST structure your answer by domain with CLEAR separation. "
+ f"Use the EXACT format below:\n\n"
+ )
+
+ # Add explicit format for each domain
+ for domain in domains_list:
+ domain_context += f"### Under {domain}:\n\n"
+ domain_context += f"(All information related to {domain} from {domain}-tagged sources)\n\n"
+
+ domain_context += (
+ "CRITICAL: You MUST use these exact headings. "
+ "Each domain section MUST be clearly separated with the ### heading. "
+ "Do NOT mix information from different domains in the same section."
+ )
+
+ prompt_variables["domain_context"] = domain_context
+ else:
+ prompt_variables["domain_context"] = ""
+
+ # Debug: Print what variables are being passed
+ print(f"š§ Prompt variables being passed:")
+ print(f" domain_prefix: {repr(prompt_variables.get('domain_prefix', '')[:100])}")
+ print(f" domain_context: {repr(prompt_variables.get('domain_context', ''))}")
+ print(f" Number of text_sources: {len(prompt_variables.get('text_sources', []))}")
+
+ # Also print if sources are tagged
+ sources = prompt_variables.get('text_sources', [])
+ if sources:
+ print(f" First source preview: {sources[0][:100]}...")
+ domain_tagged = any("[Domain:" in str(s) for s in sources)
+ print(f" Sources are domain-tagged: {domain_tagged}")
+
messages = self.prompt_manager.render_prompt(
self.answer_prompt,
- self.get_system_prompt_variables(overrides.get("prompt_template"))
- | {
- "include_follow_up_questions": bool(overrides.get("suggest_followup_questions")),
- "past_messages": messages[:-1],
- "user_query": original_user_query,
- "text_sources": extra_info.data_points.text,
- },
+ prompt_variables
)
+
+ # Debug: Print the rendered prompt
+ print(f"š§ Rendered prompt (first message):")
+ if messages and len(messages) > 0:
+ print(f" {messages[0].get('content', '')[:200]}...")
chat_coroutine = cast(
Union[Awaitable[ChatCompletion], Awaitable[AsyncStream[ChatCompletionChunk]]],
@@ -114,6 +209,7 @@ async def run_until_final_call(
should_stream,
),
)
+
extra_info.thoughts.append(
self.format_thought_step_for_chatcompletion(
title="Prompt to generate answer",
@@ -124,6 +220,15 @@ async def run_until_final_call(
usage=None,
)
)
+
+ # Add domain classification as a thought step
+ if domain_info:
+ extra_info.thoughts.insert(0, ThoughtStep(
+ title="Domain Classification",
+ data=domain_info,
+ properties={"classifier": "DomainClassifier"}
+ ))
+
return (extra_info, chat_coroutine)
async def run_search_approach(
@@ -137,19 +242,60 @@ async def run_search_approach(
top = overrides.get("top", 3)
minimum_search_score = overrides.get("minimum_search_score", 0.0)
minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0)
- search_index_filter = self.build_filter(overrides, auth_claims)
+
+ # Build base filter (security, permissions, etc.)
+ base_search_filter = self.build_filter(overrides, auth_claims)
original_user_query = messages[-1]["content"]
if not isinstance(original_user_query, str):
raise ValueError("The most recent message content must be a string.")
+ # Determine which index to use and build domain-specific filter
+ domain_info = overrides.get("domain_classification")
+ search_index_name = self.search_index_name # Default
+ search_index_filter = base_search_filter # Start with base filter
+
+ if domain_info and domain_info.get("domains"):
+ domains = domain_info["domains"]
+ if len(domains) == 1:
+ domain = domains[0]
+ # Single domain - use specific index
+ if "Cosmic" in domains:
+ search_index_name = self.cosmic_index_name
+ print(f"šÆ Using Cosmic index for search: {search_index_name}")
+
+ # Add category filter if the index contains multiple categories
+ category_filter = "category eq 'Cosmic'"
+ if base_search_filter:
+ search_index_filter = f"({base_search_filter}) and ({category_filter})"
+ else:
+ search_index_filter = category_filter
+
+ elif "Substrate" in domains:
+ search_index_name = self.substrate_index_name
+ print(f"šÆ Using Substrate index for search: {search_index_name}")
+
+ # Add category filter if the index contains multiple categories
+ category_filter = "category eq 'Substrate'"
+ if base_search_filter:
+ search_index_filter = f"({base_search_filter}) and ({category_filter})"
+ else:
+ search_index_filter = category_filter
+ else:
+ # Multiple domains - search with category filter
+ categories_filter = " or ".join([f"category eq '{domain}'" for domain in domains])
+ if base_search_filter:
+ search_index_filter = f"({base_search_filter}) and ({categories_filter})"
+ else:
+ search_index_filter = categories_filter
+ print(f"šÆ Multiple domains detected: {domains}. Using filter: {search_index_filter}")
+
query_messages = self.prompt_manager.render_prompt(
self.query_rewrite_prompt, {"user_query": original_user_query, "past_messages": messages[:-1]}
)
tools: list[ChatCompletionToolParam] = self.query_rewrite_tools
- # STEP 1: Generate an optimized keyword search query based on the chat history and the last question
-
+ # STEP 1: Generate search query
chat_completion = cast(
ChatCompletion,
await self.create_chat_completion(
@@ -157,20 +303,16 @@ async def run_search_approach(
self.chatgpt_model,
messages=query_messages,
overrides=overrides,
- response_token_limit=self.get_response_token_limit(
- self.chatgpt_model, 100
- ), # Setting too low risks malformed JSON, setting too high may affect performance
- temperature=0.0, # Minimize creativity for search query generation
+ response_token_limit=self.get_response_token_limit(self.chatgpt_model, 100),
+ temperature=0.0,
tools=tools,
- reasoning_effort="low", # Minimize reasoning for search query generation
+ reasoning_effort="low",
),
)
query_text = self.get_search_query(chat_completion, original_user_query)
- # STEP 2: Retrieve relevant documents from the search index with the GPT optimized query
-
- # If retrieval mode includes vectors, compute an embedding for the query
+ # STEP 2: Search with domain-aware filter
vectors: list[VectorQuery] = []
if use_vector_search:
vectors.append(await self.compute_text_embedding(query_text))
@@ -178,7 +320,7 @@ async def run_search_approach(
results = await self.search(
top,
query_text,
- search_index_filter,
+ search_index_filter, # Now includes domain filtering
vectors,
use_text_search,
use_vector_search,
@@ -188,9 +330,22 @@ async def run_search_approach(
minimum_reranker_score,
use_query_rewriting,
)
+
+ # Log what filter was used
+ print(f"š Search filter applied: {search_index_filter}")
- # STEP 3: Generate a contextual and content specific answer using the search results and chat history
+ # STEP 3: Process results
text_sources = self.get_sources_content(results, use_semantic_captions, use_image_citation=False)
+
+ # Add domain tagging to sources if domain is classified
+ if domain_info and domain_info.get("domains"):
+ domains = domain_info["domains"]
+ if len(domains) == 1:
+ domain = domains[0]
+ tagged_sources = []
+ for source in text_sources:
+ tagged_sources.append(f"[Domain: {domain}] {source}")
+ text_sources = tagged_sources
extra_info = ExtraInfo(
DataPoints(text=text_sources),
@@ -215,6 +370,8 @@ async def run_search_approach(
"filter": search_index_filter,
"use_vector_search": use_vector_search,
"use_text_search": use_text_search,
+ "search_index": search_index_name,
+ "domain_filter_applied": domain_info is not None,
},
),
ThoughtStep(
@@ -232,23 +389,165 @@ async def run_agentic_retrieval_approach(
auth_claims: dict[str, Any],
):
minimum_reranker_score = overrides.get("minimum_reranker_score", 0)
- search_index_filter = self.build_filter(overrides, auth_claims)
top = overrides.get("top", 3)
max_subqueries = overrides.get("max_subqueries", 10)
results_merge_strategy = overrides.get("results_merge_strategy", "interleaved")
- # 50 is the amount of documents that the reranker can process per query
max_docs_for_reranker = max_subqueries * 50
- response, results = await self.run_agentic_retrieval(
- messages=messages,
- agent_client=self.agent_client,
- search_index_name=self.search_index_name,
- top=top,
- filter_add_on=search_index_filter,
- minimum_reranker_score=minimum_reranker_score,
- max_docs_for_reranker=max_docs_for_reranker,
- results_merge_strategy=results_merge_strategy,
- )
+ # Build base filter (security, permissions, etc.)
+ base_search_filter = self.build_filter(overrides, auth_claims)
+
+ # Determine which indexes to search based on domain classification
+ domain_info = overrides.get("domain_classification")
+ indexes_to_search = []
+
+ if domain_info and domain_info.get("domains"):
+ domains = domain_info["domains"]
+ if "Cosmic" in domains:
+ indexes_to_search.append(self.cosmic_index_name)
+ if "Substrate" in domains:
+ indexes_to_search.append(self.substrate_index_name)
+ else:
+ # No domain classification - search default index
+ indexes_to_search.append(self.search_index_name)
+
+ # If multiple indexes, perform parallel searches
+ if len(indexes_to_search) > 1:
+ print(f"šÆ Searching multiple indexes: {indexes_to_search}")
+
+ # Run searches in parallel for each index
+ import asyncio
+ search_tasks = []
+ domain_mapping = {}
+
+ for index_name in indexes_to_search:
+ # Track which domain each index corresponds to
+ if index_name == self.cosmic_index_name:
+ domain_mapping[index_name] = "Cosmic"
+ elif index_name == self.substrate_index_name:
+ domain_mapping[index_name] = "Substrate"
+ else:
+ domain_mapping[index_name] = "Default"
+
+ # Build index-specific filter
+ search_index_filter = base_search_filter
+ if domain_mapping[index_name] != "Default":
+ # Add category filter for this specific domain
+ category_filter = f"category eq '{domain_mapping[index_name]}'"
+ if base_search_filter:
+ search_index_filter = f"({base_search_filter}) and ({category_filter})"
+ else:
+ search_index_filter = category_filter
+ print(f"š Filter for {domain_mapping[index_name]} index: {search_index_filter}")
+
+ task = self.run_agentic_retrieval(
+ messages=messages,
+ agent_client=self.agent_client,
+ search_index_name=index_name,
+ top=top,
+ filter_add_on=search_index_filter, # Use domain-specific filter
+ minimum_reranker_score=minimum_reranker_score,
+ max_docs_for_reranker=max_docs_for_reranker,
+ results_merge_strategy=results_merge_strategy,
+ )
+ search_tasks.append((task, index_name))
+
+ # Execute searches in parallel
+ search_results_with_index = await asyncio.gather(*[task for task, _ in search_tasks])
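+ # asyncio.gather preserves task order, so result i below corresponds to search_tasks[i]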
+
+ # Merge results from all indexes with domain tagging
+ merged_response = search_results_with_index[0][0] # Use first response as base
+ merged_results = []
+
+ for i, (response, results) in enumerate(search_results_with_index):
+ index_name = search_tasks[i][1]
+ domain = domain_mapping.get(index_name, "Unknown")
+
+ # Tag each result with its source domain
+ for result in results:
+ # Add domain tag to the content
+ if hasattr(result, 'content'):
+ result.content = f"[Domain: {domain}] {result.content}"
+ merged_results.append(result)
+
+ # Merge activity logs from the additional responses (index 0 is already the base)
+ if i > 0 and response.activity and merged_response.activity:
+ merged_response.activity.extend(response.activity)
+
+ # Re-rank and deduplicate merged results
+ # Sort by relevance score and take top N
+ merged_results.sort(key=lambda r: r.score if hasattr(r, 'score') else 0, reverse=True)
+
+ # Remove near-duplicates by comparing the first 200 characters of each result's content
+ unique_results = []
+ seen_content = set()
+ for result in merged_results:
+ content_key = result.content[:200] if hasattr(result, 'content') else str(result)
+ if content_key not in seen_content:
+ unique_results.append(result)
+ seen_content.add(content_key)
+ if len(unique_results) >= top:
+ break
+
+ response, results = merged_response, unique_results[:top]
+ final_search_filter = "Multiple domain-specific filters applied"
+
+ else:
+ # Single index search
+ search_index = indexes_to_search[0]
+
+ # Build filter for single index search
+ search_index_filter = base_search_filter
+ domain_used = "Default"
+
+ if domain_info and domain_info.get("domains"):
+ domains = domain_info["domains"]
+ if len(domains) == 1:
+ domain = domains[0]
+ if search_index == self.cosmic_index_name and domain == "Cosmic":
+ category_filter = "category eq 'Cosmic'"
+ domain_used = "Cosmic"
+ elif search_index == self.substrate_index_name and domain == "Substrate":
+ category_filter = "category eq 'Substrate'"
+ domain_used = "Substrate"
+ else:
+ category_filter = None
+
+ if category_filter:
+ if base_search_filter:
+ search_index_filter = f"({base_search_filter}) and ({category_filter})"
+ else:
+ search_index_filter = category_filter
+ else:
+ # Multiple domains on single index - use OR filter
+ categories_filter = " or ".join([f"category eq '{domain}'" for domain in domains])
+ if base_search_filter:
+ search_index_filter = f"({base_search_filter}) and ({categories_filter})"
+ else:
+ search_index_filter = categories_filter
+ domain_used = "Multiple"
+
+ print(f"šÆ Using single index: {search_index}")
+ print(f"š Filter applied: {search_index_filter}")
+
+ response, results = await self.run_agentic_retrieval(
+ messages=messages,
+ agent_client=self.agent_client,
+ search_index_name=search_index,
+ top=top,
+ filter_add_on=search_index_filter, # Use domain-specific filter
+ minimum_reranker_score=minimum_reranker_score,
+ max_docs_for_reranker=max_docs_for_reranker,
+ results_merge_strategy=results_merge_strategy,
+ )
+
+ # Tag results for single domain if needed
+ if domain_used != "Default" and domain_used != "Multiple":
+ for result in results:
+ if hasattr(result, 'content'):
+ result.content = f"[Domain: {domain_used}] {result.content}"
+
+ final_search_filter = search_index_filter
text_sources = self.get_sources_content(results, use_semantic_captions=False, use_image_citation=False)
@@ -262,7 +561,9 @@ async def run_agentic_retrieval_approach(
"reranker_threshold": minimum_reranker_score,
"max_docs_for_reranker": max_docs_for_reranker,
"results_merge_strategy": results_merge_strategy,
- "filter": search_index_filter,
+ "filter": final_search_filter,
+ "indexes_searched": indexes_to_search,
+ "domain_filter_applied": domain_info is not None,
},
),
ThoughtStep(
@@ -279,3 +580,21 @@ async def run_agentic_retrieval_approach(
],
)
return extra_info
+
+ def _format_domain_message(self, domains: List[str], confidence: float, reasoning: str) -> str:
+ """Format domain classification into user-friendly message"""
+ if len(domains) == 1:
+ domain = domains[0]
+ if confidence >= 0.8:
+ return f"š Based on your question, I believe this is related to the **{domain}** domain."
+ elif confidence >= 0.5:
+ return f"š Based on your question, I think this is related to the **{domain}** domain, though I'm not entirely certain."
+ else:
+ return f"š Based on your question, this might be related to the **{domain}** domain, but I'm not confident about this classification."
+ else:
+ # Multiple domains
+ domains_str = " and ".join(f"**{d}**" for d in domains)
+ if confidence >= 0.8:
+ return f"š Based on your question, this appears to be related to both {domains_str} domains."
+ else:
+ return f"š Based on your question, this might be related to both {domains_str} domains. {reasoning}"
diff --git a/app/backend/approaches/domain_classifier.py b/app/backend/approaches/domain_classifier.py
new file mode 100644
index 0000000000..0d9bc2e88e
--- /dev/null
+++ b/app/backend/approaches/domain_classifier.py
@@ -0,0 +1,183 @@
+from typing import List, Dict, Optional, Tuple
+from azure.search.documents.aio import SearchClient # async SearchClient
+from azure.search.documents.models import VectorizedQuery
+from openai import AsyncOpenAI
+import json
+
+class DomainClassifier:
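+ """Classifies questions into the Cosmic and/or Substrate domain using vector search over a classifier index plus an LLM."""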
+ def __init__(
+ self,
+ search_client: SearchClient,
+ openai_client: AsyncOpenAI,
+ embeddings_service,
+ chatgpt_deployment: str
+ ):
+ self.search_client = search_client
+ self.openai_client = openai_client
+ self.embeddings_service = embeddings_service
+ self.chatgpt_deployment = chatgpt_deployment
+
+ async def classify_with_context(
+ self,
+ question: str,
+ conversation_history: Optional[List[Dict]] = None
+ ) -> Tuple[List[str], float, str]:
+ """
+ Classify question into domains with confidence score
+ Returns: (domains, confidence, reasoning)
+ """
+
+ # Step 1: Vector search for similar domain patterns
+ vector_results = await self._vector_search_domains(question)
+
+ # Step 2: Analyze with LLM using domain knowledge
+ classification = await self._llm_classification(
+ question,
+ vector_results,
+ conversation_history
+ )
+
+ return classification
+
+ async def _vector_search_domains(self, question: str) -> List[Dict]:
+ """Search for similar patterns in domain classifier index"""
+ # Create embedding for the question
+ question_embedding = await self.embeddings_service.create_embeddings([question])
+
+ # Search in domain classifier index
+ vector_query = VectorizedQuery(
+ vector=question_embedding[0],
+ k_nearest_neighbors=2,
+ fields="embedding"
+ )
+
+ results = await self.search_client.search(
+ search_text=question,
+ vector_queries=[vector_query],
+ select=["domain", "keywords", "topics", "sample_questions"],
+ top=2
+ )
+
+ domain_matches = []
+ # Use proper pagination pattern for async search results
+ async for page in results.by_page():
+ async for result in page:
+ domain_matches.append({
+ "domain": result["domain"],
+ "score": result["@search.score"],
+ "keywords": result.get("keywords", ""),
+ "topics": result.get("topics", ""),
+ "sample_questions": result.get("sample_questions", "")
+ })
+
+ # Debug logging: show classifier matches and scores
+ print(f"Domain search results for '{question}':")
+ for match in domain_matches:
+ print(f" - {match['domain']}: score={match['score']:.3f}")
+ print(f" Keywords: {match['keywords'][:100]}...")
+ print(f" Topics: {match['topics'][:100]}...")
+
+ return domain_matches
+
+ async def _llm_classification(
+ self,
+ question: str,
+ vector_results: List[Dict],
+ conversation_history: Optional[List[Dict]] = None
+ ) -> Tuple[List[str], float, str]:
+ """Use LLM to classify based on domain knowledge"""
+
+ # Build context from vector results
+ domain_context = "\n".join([
+ f"Domain: {r['domain']}\n"
+ f"Relevance Score: {r['score']}\n"
+ f"Key Topics: {r['topics'][:100] if r['topics'] else 'N/A'}...\n" # Handle as string
+ f"Keywords: {r['keywords'][:100]}...\n"
+ f"Similar Questions: {r['sample_questions'][:200]}...\n"
+ for r in vector_results
+ ])
+
+ # Include conversation history if available
+ history_context = ""
+ if conversation_history:
+ history_context = "Previous conversation:\n"
+ for msg in conversation_history[-3:]: # Last 3 messages
+ history_context += f"{msg['role']}: {msg['content'][:200]}...\n"
+
+ classification_prompt = f"""You are a domain classification expert for Microsoft's technical documentation.
+
+You need to classify questions into one of these domains:
+
+**COSMIC Domain:**
+- Container platform for performance and diagnostics
+- Keywords: cosmic, container, performance, diagnostics, monitoring, metrics
+- Sample questions: "How do I monitor container performance?", "What are cosmic diagnostics?"
+
+**SUBSTRATE Domain:**
+- Infrastructure platform and cloud services
+- Keywords: substrate, infrastructure, platform, cloud, services, deployment
+- Sample questions: "How to set up substrate infrastructure?", "What is substrate platform?"
+
+**Current Question:** "{question}"
+
+Based on the search results from each domain:
+{domain_context}
+
+{history_context}
+
+Analyze the question and determine:
+1. Which domain(s) this question belongs to
+2. Your confidence level (high/medium/low)
+3. Brief reasoning
+4. If the domain is confusing/ambiguous, explain what aspects belong to which domain
+
+Return JSON:
+{{
+ "domains": ["Cosmic"] or ["Substrate"] or ["Cosmic", "Substrate"],
+ "confidence": "high|medium|low",
+ "reasoning": "Brief explanation of why this belongs to the selected domain(s)",
+ "primary_domain": "Cosmic|Substrate|both",
+ "domain_breakdown": {{
+ "Cosmic": "Aspects of the question related to Cosmic (if any)",
+ "Substrate": "Aspects of the question related to Substrate (if any)"
+ }},
+ "is_ambiguous": true/false,
+ "user_friendly_explanation": "A simple explanation for the user about which domain their question relates to"
+}}"""
+
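+ # Note: response_format={"type": "json_object"} assumes the deployed model and API version support JSON mode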
+ response = await self.openai_client.chat.completions.create(
+ model=self.chatgpt_deployment,
+ messages=[{"role": "system", "content": classification_prompt}],
+ temperature=0,
+ response_format={"type": "json_object"}
+ )
+
+ result = json.loads(response.choices[0].message.content)
+
+ # Convert confidence to numeric value
+ confidence_map = {"high": 0.9, "medium": 0.6, "low": 0.3}
+ numeric_confidence = confidence_map.get(result["confidence"], 0.5)
+
+ # Print classification decision (for debugging)
+ print(f"\nšÆ Classification Decision:")
+ print(f" Question: '{question}'")
+ print(f" Domains: {', '.join(result['domains'])}")
+ print(f" Primary Domain: {result['primary_domain']}")
+ print(f" Confidence: {result['confidence']} ({numeric_confidence:.1%})")
+ print(f" Reasoning: {result['reasoning']}")
+
+ if result.get('is_ambiguous', False):
+ print(f"\nā ļø Domain is ambiguous/confusing:")
+ if result.get('domain_breakdown'):
+ for domain, aspects in result['domain_breakdown'].items():
+ if aspects and aspects != "N/A":
+ print(f" - {domain}: {aspects}")
+
+ # Return user-friendly explanation if available, otherwise use reasoning
+ user_explanation = result.get('user_friendly_explanation', result['reasoning'])
+
+ return (
+ result["domains"],
+ numeric_confidence,
+ user_explanation
+ )
\ No newline at end of file
diff --git a/app/backend/approaches/orchestrator.py b/app/backend/approaches/orchestrator.py
new file mode 100644
index 0000000000..b6446bf029
--- /dev/null
+++ b/app/backend/approaches/orchestrator.py
@@ -0,0 +1,61 @@
+from typing import Any, Optional
+from approaches.chatreadretrieveread import ChatReadRetrieveReadApproach
+from azure.search.documents.aio import SearchClient
+from azure.search.documents.agent.aio import KnowledgeAgentRetrievalClient
+
+class OrchestratorApproach:
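+ """Routes chat requests to the Cosmic or Substrate agent based on an LLM domain decision."""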
+ def __init__(
+ self,
+ cosmic_agent: ChatReadRetrieveReadApproach,
+ substrate_agent: ChatReadRetrieveReadApproach,
+ openai_client,
+ prompt_manager,
+ openai_deployment: str
+ ):
+ self.cosmic_agent = cosmic_agent
+ self.substrate_agent = substrate_agent
+ self.openai_client = openai_client
+ self.prompt_manager = prompt_manager
+ self.openai_deployment = openai_deployment # Chat model deployment used by determine_domain()
+
+ async def determine_domain(self, question: str) -> str:
+ """Use LLM to determine which domain the question belongs to"""
+ prompt = f"""Given this question: "{question}"
+
+ Determine if this question is about:
+ 1. Cosmic - Microsoft's container platform for performance and diagnostics
+ 2. Substrate - Microsoft's infrastructure platform
+
+ Respond with only "cosmic" or "substrate".
+ """
+
+ response = await self.openai_client.chat.completions.create(
+ model=self.openai_deployment,
+ messages=[{"role": "system", "content": prompt}],
+ temperature=0
+ )
+
+ domain = response.choices[0].message.content.strip().lower()
+ return domain if domain in ["cosmic", "substrate"] else "cosmic"
+
+ async def run(
+ self,
+ messages: list[dict],
+ stream: bool = False,
+ session_state: Any = None,
+ context: dict[str, Any] = {}
+ ):
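+ """Classify the latest user question and delegate to the matching domain agent."""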
+ # Extract the latest user question
+ user_question = messages[-1]["content"] if messages else ""
+
+ # Determine which domain to use
+ domain = await self.determine_domain(user_question)
+
+ # Route to appropriate agent
+ if domain == "substrate":
+ # Add domain context to the messages
+ context["overrides"] = context.get("overrides", {})
+ context["overrides"]["include_category"] = "Substrate"
+ return await self.substrate_agent.run(messages, stream, session_state, context)
+ else:
+ context["overrides"] = context.get("overrides", {})
+ context["overrides"]["include_category"] = "Cosmic"
+ return await self.cosmic_agent.run(messages, stream, session_state, context)
\ No newline at end of file
diff --git a/app/backend/approaches/orchestrator_approach.py b/app/backend/approaches/orchestrator_approach.py
new file mode 100644
index 0000000000..1793e66b0a
--- /dev/null
+++ b/app/backend/approaches/orchestrator_approach.py
@@ -0,0 +1,257 @@
+from typing import Any, AsyncGenerator, Union
+from approaches.approach import Approach
+from approaches.domain_classifier import DomainClassifier
+
+class OrchestratorApproach(Approach):
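+ """Classifies the user's question and delegates to the Cosmic and/or Substrate approach, streaming or not."""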
+ def __init__(
+ self,
+ cosmic_approach,
+ substrate_approach,
+ domain_classifier: DomainClassifier,
+ openai_client,
+ prompt_manager
+ ):
+ self.cosmic_approach = cosmic_approach
+ self.substrate_approach = substrate_approach
+ self.domain_classifier = domain_classifier
+ self.openai_client = openai_client
+ self.prompt_manager = prompt_manager
+
+ async def run(
+ self,
+ messages: list[dict],
+ stream: bool = False,
+ session_state: Any = None,
+ context: dict[str, Any] = {}
+ ) -> Union[dict[str, Any], AsyncGenerator[dict[str, Any], None]]:
+ """
+ Main entry point for the orchestrator approach.
+ Returns a dict for non-streaming, or an async generator for streaming.
+ """
+ if stream:
+ return self._run_stream_internal(messages, session_state, context)
+ else:
+ return await self._run_non_stream(messages, session_state, context)
+
+ async def run_stream(
+ self,
+ messages: list[dict],
+ session_state: Any = None,
+ context: dict[str, Any] = {}
+ ) -> AsyncGenerator[dict[str, Any], None]:
+ """
+ Stream implementation - returns an async generator.
+ This matches the pattern used by other approaches.
+ """
+ return self._run_stream_internal(messages, session_state, context)
+
+ async def _run_stream_internal(
+ self,
+ messages: list[dict],
+ session_state: Any = None,
+ context: dict[str, Any] = {}
+ ) -> AsyncGenerator[dict[str, Any], None]:
+ """
+ Internal streaming implementation - this is the actual async generator.
+ """
+ # Check if user has explicitly selected a category
+ user_category = context.get("overrides", {}).get("include_category", "")
+
+ if user_category and user_category != "":
+ # User specified a category, use it directly
+ if user_category == "Cosmic":
+ async for chunk in await self.cosmic_approach.run_stream(messages, session_state, context):
+ yield chunk
+ return
+ elif user_category == "Substrate":
+ async for chunk in await self.substrate_approach.run_stream(messages, session_state, context):
+ yield chunk
+ return
+
+ # Extract the latest user question
+ user_question = messages[-1]["content"] if messages else ""
+
+ # Get domain classification with context
+ domains, confidence, reasoning = await self.domain_classifier.classify_with_context(
+ user_question,
+ messages[:-1] # Pass conversation history
+ )
+
+ # Handle based on classification results
+ if len(domains) == 1:
+ # Single domain detected
+ domain = domains[0]
+ context["overrides"] = context.get("overrides", {})
+ context["overrides"]["include_category"] = domain
+
+ approach = self.cosmic_approach if domain == "Cosmic" else self.substrate_approach
+
+ # Yield classification info first
+ yield {
+ "thoughts": f"Domain Classification: {domain} (Confidence: {confidence})\nReasoning: {reasoning}"
+ }
+
+ # Then stream from the selected approach
+ async for chunk in await approach.run_stream(messages, session_state, context):
+ yield chunk
+
+ elif len(domains) == 2 and confidence < 0.8: # Treat two domains with confidence below 0.8 as ambiguous
+ # Ambiguous - search both domains
+ yield {
+ "answer": f"I found relevant information in both {' and '.join(domains)} domains:\n\n",
+ "thoughts": f"Classification: Multiple domains (Confidence: {confidence})\n{reasoning}"
+ }
+
+ # Run both approaches
+ for domain in domains:
+ yield {"answer": f"\n## {domain}\n"}
+
+ approach = self.cosmic_approach if domain == "Cosmic" else self.substrate_approach
+ domain_context = context.copy()
+ domain_context["overrides"] = domain_context.get("overrides", {}).copy()
+ domain_context["overrides"]["include_category"] = domain
+
+ async for chunk in await approach.run_stream(messages, session_state, domain_context):
+ yield chunk
+
+ if domain != domains[-1]:
+ yield {"answer": "\n\n---\n"}
+
+ else:
+ # Low confidence or no clear domain
+ yield {
+ "answer": f"I'm not sure which domain you're asking about. Your question could relate to:\n\n"
+ f"- **Cosmic**: Microsoft's container platform for performance and diagnostics\n"
+ f"- **Substrate**: Microsoft's infrastructure platform\n\n"
+ f"Could you please specify which one you're interested in, or shall I provide information about both?",
+ "thoughts": f"Classification confidence was low. Domains detected: {', '.join(domains)}",
+ "data_points": [],
+ "citation_lookup": {},
+ "thought_chain": [{
+ "title": "Domain Classification",
+ "description": reasoning,
+ "domains": domains,
+ "confidence": confidence
+ }]
+ }
+
+ async def _run_non_stream(
+ self,
+ messages: list[dict],
+ session_state: Any = None,
+ context: dict[str, Any] = {}
+ ) -> dict[str, Any]:
+ """
+ Non-streaming implementation.
+ """
+ # Check if user has explicitly selected a category
+ user_category = context.get("overrides", {}).get("include_category", "")
+
+ if user_category and user_category != "":
+ # User specified a category, use it directly
+ if user_category == "Cosmic":
+ return await self.cosmic_approach.run(messages, False, session_state, context)
+ elif user_category == "Substrate":
+ return await self.substrate_approach.run(messages, False, session_state, context)
+
+ # Extract the latest user question
+ user_question = messages[-1]["content"] if messages else ""
+
+ # Get domain classification with context
+ domains, confidence, reasoning = await self.domain_classifier.classify_with_context(
+ user_question,
+ messages[:-1] # Pass conversation history
+ )
+
+ # Handle based on classification results
+ if len(domains) == 1:
+ # Single domain detected
+ domain = domains[0]
+ context["overrides"] = context.get("overrides", {})
+ context["overrides"]["include_category"] = domain
+
+ approach = self.cosmic_approach if domain == "Cosmic" else self.substrate_approach
+
+ result = await approach.run(messages, False, session_state, context)
+ # Add classification info to thoughts
+ if isinstance(result, dict):
+ result["thought_chain"] = result.get("thought_chain", [])
+ result["thought_chain"].insert(0, {
+ "title": "Domain Classification",
+ "description": reasoning,
+ "domain": domain,
+ "confidence": confidence
+ })
+ return result
+
+ elif len(domains) == 2 and confidence < 0.8: # Treat two domains with confidence below 0.8 as ambiguous
+ # Ambiguous: search both domains
+ return await self._handle_multi_domain_query(
+ messages, domains, confidence, reasoning, session_state, context
+ )
+ else:
+ # Low confidence or no clear domain
+ return await self._handle_unclear_classification(
+ messages, domains, confidence, reasoning, session_state, context
+ )
+
+ async def _handle_multi_domain_query(
+ self, messages, domains, confidence, reasoning, session_state, context
+ ):
+ """Handle queries that might belong to multiple domains (non-streaming)"""
+
+ results = {
+ "answer": f"I found relevant information in both {' and '.join(domains)} domains:\n\n",
+ "thoughts": f"Classification: Multiple domains (Confidence: {confidence})\n{reasoning}",
+ "data_points": [],
+ "citation_lookup": {},
+ "thought_chain": [{
+ "title": "Domain Classification",
+ "description": reasoning,
+ "domains": domains,
+ "confidence": confidence
+ }]
+ }
+
+ # Run both approaches
+ for domain in domains:
+ results["answer"] += f"\n## {domain}\n"
+
+ approach = self.cosmic_approach if domain == "Cosmic" else self.substrate_approach
+ domain_context = context.copy()
+ domain_context["overrides"] = domain_context.get("overrides", {}).copy()
+ domain_context["overrides"]["include_category"] = domain
+
+ domain_result = await approach.run(messages, False, session_state, domain_context)
+
+ if isinstance(domain_result, dict):
+ results["answer"] += domain_result.get("answer", "")
+ results["data_points"].extend(domain_result.get("data_points", []))
+ results["citation_lookup"].update(domain_result.get("citation_lookup", {}))
+ results["thought_chain"].extend(domain_result.get("thought_chain", []))
+
+ if domain != domains[-1]:
+ results["answer"] += "\n\n---\n"
+
+ return results
+
+ async def _handle_unclear_classification(
+ self, messages, domains, confidence, reasoning, session_state, context
+ ):
+ """Handle cases where the domain classification is unclear (non-streaming)"""
+
+ return {
+ "answer": f"I'm not sure which domain you're asking about. Your question could relate to:\n\n"
+ f"- **Cosmic**: Microsoft's container platform for performance and diagnostics\n"
+ f"- **Substrate**: Microsoft's infrastructure platform\n\n"
+ f"Could you please specify which one you're interested in, or shall I provide information about both?",
+ "thoughts": f"Classification confidence was low. Domains detected: {', '.join(domains)}",
+ "data_points": [],
+ "citation_lookup": {},
+ "thought_chain": [{
+ "title": "Domain Classification",
+ "description": reasoning,
+ "domains": domains,
+ "confidence": confidence
+ }]
+ }
\ No newline at end of file
diff --git a/app/backend/approaches/prompts/chat_answer_question.prompty b/app/backend/approaches/prompts/chat_answer_question.prompty
index 3dcb05ae21..5e3cfb1966 100644
--- a/app/backend/approaches/prompts/chat_answer_question.prompty
+++ b/app/backend/approaches/prompts/chat_answer_question.prompty
@@ -1,38 +1,78 @@
---
name: Chat
-description: Answer a question (with chat history) using solely text sources.
+description: Answer a question (with chat history) using solely text sources. The answer must be accurate and precise, and include original hyperlinks if present in the source documents.
model:
api: chat
+inputs:
+ domain_prefix:
+ type: string
+ default: ""
+ domain_context:
+ type: string
+ default: ""
+ user_query:
+ type: string
+ include_follow_up_questions:
+ type: boolean
+ default: false
+ past_messages:
+ type: array
+ default: []
+ text_sources:
+ type: array
+ default: []
+ injected_prompt:
+ type: string
+ default: ""
+ override_prompt:
+ type: string
+ default: ""
sample:
- user_query: What does a product manager do that a CEO doesn't?
+ user_query: How do I investigate high CPU usage in a Cosmic container?
include_follow_up_questions: true
+ domain_prefix: "š Based on your question, I believe this is related to the **Cosmic** domain."
+ domain_context: ""
past_messages:
- role: user
- content: "What does a CEO do?"
+ content: "How do I detect performance issues in Cosmic?"
- role: assistant
- content: "A CEO, or Chief Executive Officer, is responsible for providing strategic direction and oversight to a company to ensure its long-term success and profitability. They develop and implement strategies and objectives for financial success and growth, provide guidance to the executive team, manage day-to-day operations, ensure compliance with laws and regulations, develop and maintain relationships with stakeholders, monitor industry trends, and represent the company in public events 12. [role_library.pdf#page=1][role_library.pdf#page=3]"
+ content: "You can start by reviewing the [Windows Perf Counter dashboard](https://cosmicmonitoring-b6a0cza8a4ghfnda.scus.grafana.azure.com/d/f5035b83-bb8d-41ac-808b-65635396a8a1/windows-performance-counter?orgId=1) and [Linux Utilization dashboard](https://cosmicmonitoring-b6a0cza8a4ghfnda.scus.grafana.azure.com/d/rYdddlPWk/linux-utilization?orgId=1) to identify abnormal resource usage. For more detailed steps, refer to the [TSG for perf investigation](https://aka.ms/cosmic/tsg/perf)."
text_sources:
- - "role_library.pdf#page=29: The Manager of Product Management will collaborate with internal teams, such as engineering, sales, marketing, and finance, as well as external partners, suppliers, and customers to ensure successful product execution. Responsibilities: Ā· Lead the product management team and provide guidance on product strategy, design, development, and launch. Ā· Develop and implement product life-cycle management processes. Ā· Monitor and analyze industry trends to identify opportunities for new products. Ā· Develop product marketing plans and go-to-market strategies. Ā· Research customer needs and develop customer-centric product roadmaps. Ā· Collaborate with internal teams to ensure product execution and successful launch. Ā· Develop pricing strategies and cost models. Ā· Oversee product portfolio and performance metrics. Ā· Manage product development budget. Ā· Analyze product performance and customer feedback to identify areas for improvement. Qualifications: Ā· Bachelor's degree in business, engineering, or a related field. Ā· At least 5 years of experience in product management. Ā· Proven track record of successful product launches."
- - "role_library.pdf#page=23: Company: Contoso Electronics Location: Anywhere Job Type: Full-Time Salary: Competitive, commensurate with experience Job Summary: The Senior Manager of Product Management will be responsible for leading the product management team at Contoso Electronics. This role includes developing strategies, plans and objectives for the product management team and managing the day-to-day operations. The Senior Manager of Product Management will be responsible for the successful launch of new products and the optimization of existing products. Responsibilities: Ā· Develop and implement product management strategies, plans and objectives to maximize team performance. Ā· Analyze competitive landscape and market trends to develop product strategies. Ā· Lead the product management team in the development of product plans, roadmaps and launch plans. Ā· Monitor the performance of product management team, analyze results and implement corrective action as needed. Ā· Manage the product lifecycle, including product development, launch, and end of life. Ā· Ensure product features and benefits meet customer requirements. Ā· Establish and maintain relationships with key customers, partners, and vendors."
- - "role_library.pdf#page=28: Ā· 7+ years of experience in research and development in the electronics sector. Ā· Proven track record of successfully designing, testing, and optimizing products. Ā· Experience leading a team of researchers and engineers. Ā· Excellent problem-solving and analytical skills. Ā· Ability to work in a fast-paced environment and meet tight deadlines.Ā· Knowledge of industry trends, technologies, and regulations. Ā· Excellent communication and presentation skills. Manager of Product Management Job Title: Manager of Product Management, Contoso Electronics Job Summary: The Manager of Product Management is responsible for overseeing the product management team, driving product development and marketing strategy for Contoso Electronics. This individual will be accountable for the successful launch of new products and the implementation of product life-cycle management processes. The Manager of Product Management will collaborate with internal teams, such as engineering, sales, marketing, and finance, as well as external partners, suppliers, and customers to ensure successful product execution."
+ - "[Domain: Cosmic] [Investigate High CPU Utilization in Container.docx](https://microsoft-my.sharepoint-df.com/personal/chengxili_microsoft_com/_layouts/15/Doc.aspx?sourcedoc=%7B795E6E66-25F1-4BE4-8D82-B9324F777E37%7D&file=Investigate%20High%20CPU%20Utilization%20in%20Container.docx&action=default&mobileredirect=true&EntityRepresentationId=8cb99e06-cf21-4a9e-a626-2af05ad16fd2): Examine [Windows Perf Counter dashboard](https://cosmicmonitoring-b6a0cza8a4ghfnda.scus.grafana.azure.com/d/f5035b83-bb8d-41ac-808b-65635396a8a1/windows-performance-counter?orgId=1) and [Linux Utilization dashboard](https://cosmicmonitoring-b6a0cza8a4ghfnda.scus.grafana.azure.com/d/rYdddlPWk/linux-utilization?orgId=1) to detect abnormal usage. For trigger setup, refer to [trigger configuration document](https://eng.ms/docs/products/efficiency-pack/monitoring/triggers/configure-triggers) and [EP portal](https://epportal.azurewebsites.net/trigger?teamName=Cosmic)."
+ - "[Domain: Cosmic] [FAQs for Cosmic Performance.docx](https://microsoft-my.sharepoint-df.com/personal/chengxili_microsoft_com/_layouts/15/Doc.aspx?sourcedoc=%7BE9D8A58B-995F-4FC4-A178-7ABFB8046320%7D&file=FAQs%20for%20Cosmic%20Performance.docx&action=default&mobileredirect=true&EntityRepresentationId=ba6971e7-adf6-44c0-ad26-8d7fda692eeb): For general troubleshooting, see [aka.ms/cosmic911](https://aka.ms/cosmic911) and [Troubleshooting Guide for Performance Investigation](https://aka.ms/cosmic/tsg/perf). For CPR profiling, refer to [Run CPR Analysis with Debug Container](https://eng.ms/docs/experiences-devices/m365-core/substrate-platform/cosmic-group/partners/troubleshooting/debug-tools/run-cpr-analysis-with-debug-container)."
---
system:
{% if override_prompt %}
{{ override_prompt }}
{% else %}
-Assistant helps the company employees with their healthcare plan questions, and questions about the employee handbook. Be brief in your answers.
+{% if domain_prefix %}
+{{ domain_prefix }}
+
+{% endif %}
+Assistant helps Microsoft engineers and partners with performance diagnostics and container health investigations. Be concise and technically accurate.
+
Answer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question.
+
+{% if domain_context %}
+**CRITICAL FORMATTING REQUIREMENT:**
+{{ domain_context }}
+{% endif %}
+
If the question is not in English, answer in the language used in the question.
-Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf].
+
+Each source has a name followed by a colon and the actual information. Always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [FAQs for Cosmic Performance.docx]. Don't combine sources, list each source separately, for example [FAQs for Cosmic Performance.docx][Investigate High CPU Utilization in Container.docx].
+
+**Always preserve hyperlinks from the sources. Format them using Markdown: [link text](URL). Do not omit or rephrase hyperlinks. If a hyperlink in the source has target="_blank", preserve this attribute by using HTML: <a href="URL" target="_blank">link text</a>.**
+
{{ injected_prompt }}
{% endif %}
{% if include_follow_up_questions %}
Generate 3 very brief follow-up questions that the user would likely ask next.
Enclose the follow-up questions in double angle brackets. Example:
-<<Are there exclusions for prescriptions?>>
-<<Which pharmacies can be ordered from?>>
-<<What is the copay for prescriptions?>>
+<<How do I configure a trigger for high CPU usage?>>
+<<Where can I find the Linux Utilization dashboard?>>
+<<How do I run a CPR analysis with a debug container?>>
Do not repeat questions that have already been asked.
Make sure the last question ends with ">>".
{% endif %}
@@ -49,3 +89,4 @@ Sources:
{% for text_source in text_sources %}
{{ text_source }}
{% endfor %}
+
diff --git a/app/backend/approaches/prompts/chat_query_rewrite_tools.json b/app/backend/approaches/prompts/chat_query_rewrite_tools.json
index cf1743483c..45f3c40c20 100644
--- a/app/backend/approaches/prompts/chat_query_rewrite_tools.json
+++ b/app/backend/approaches/prompts/chat_query_rewrite_tools.json
@@ -8,7 +8,7 @@
"properties": {
"search_query": {
"type": "string",
- "description": "Query string to retrieve documents from azure search eg: 'Health care plan'"
+ "description": "Query string to retrieve documents from azure search eg: 'How to setup Watson on Cosmic'"
}
},
"required": ["search_query"]
diff --git a/app/backend/approaches/retrievethenread.py b/app/backend/approaches/retrievethenread.py
index d59f903b0e..71d25abecb 100644
--- a/app/backend/approaches/retrievethenread.py
+++ b/app/backend/approaches/retrievethenread.py
@@ -25,7 +25,7 @@ def __init__(
search_index_name: str,
agent_model: Optional[str],
agent_deployment: Optional[str],
- agent_client: KnowledgeAgentRetrievalClient,
+ agent_client: Optional[KnowledgeAgentRetrievalClient], # May be None when agentic retrieval is disabled
auth_helper: AuthenticationHelper,
openai_client: AsyncOpenAI,
chatgpt_model: str,
@@ -77,7 +77,7 @@ async def run(
if not isinstance(q, str):
raise ValueError("The most recent message content must be a string.")
- if use_agentic_retrieval:
+ if use_agentic_retrieval and self.agent_client is not None:
extra_info = await self.run_agentic_retrieval_approach(messages, overrides, auth_claims)
else:
extra_info = await self.run_search_approach(messages, overrides, auth_claims)
diff --git a/app/backend/domain-classifier-backup.py b/app/backend/domain-classifier-backup.py
new file mode 100644
index 0000000000..bff4f54c5b
--- /dev/null
+++ b/app/backend/domain-classifier-backup.py
@@ -0,0 +1,651 @@
+import os
+import re
+import json
+import asyncio
+from typing import List, Dict, Optional, Union
+from azure.search.documents import SearchClient
+from azure.search.documents.indexes import SearchIndexClient
+from azure.search.documents.indexes.models import (
+ SearchIndex,
+ SimpleField,
+ SearchableField,
+ SearchField,
+ VectorSearch,
+ HnswAlgorithmConfiguration,
+ VectorSearchProfile,
+ SemanticConfiguration,
+ SemanticPrioritizedFields,
+ SemanticField,
+ SemanticSearch,
+ SearchFieldDataType
+)
+
+class DomainClassifierSetup:
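+ """Helper for creating the domain-classifier search index and the Azure OpenAI client used during setup."""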
+ def __init__(self, search_client: SearchIndexClient, embeddings_service):
+ self.search_client = search_client
+ self.embeddings_service = embeddings_service
+ self._llm_client = None
+
+ def create_domain_classifier_index(self):
+ """Create a specialized index for domain classification"""
+ index_name = "domain-classifier-index"
+
+ # Get the actual embedding dimensions from the environment
+ embedding_dimensions = int(os.getenv("AZURE_OPENAI_EMB_DIMENSIONS", "1536"))
+ print(f"Creating index with embedding dimensions: {embedding_dimensions}")
+
+ fields = [
+ SimpleField(name="id", type=SearchFieldDataType.String, key=True),
+ SearchableField(name="domain", type=SearchFieldDataType.String, facetable=True, filterable=True),
+ SearchableField(name="sample_questions", type=SearchFieldDataType.String),
+ SearchableField(name="keywords", type=SearchFieldDataType.String),
+ SearchableField(name="description", type=SearchFieldDataType.String),
+ SearchableField(name="topics", type=SearchFieldDataType.String),
+ SearchField(
+ name="embedding",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
+ searchable=True,
+ vector_search_dimensions=embedding_dimensions,
+ vector_search_profile_name="myHnswProfile"
+ )
+ ]
+
+ vector_search = VectorSearch(
+ algorithms=[HnswAlgorithmConfiguration(name="myHnsw")],
+ profiles=[VectorSearchProfile(name="myHnswProfile", algorithm_configuration_name="myHnsw")]
+ )
+
+ semantic_config = SemanticConfiguration(
+ name="default",
+ prioritized_fields=SemanticPrioritizedFields(
+ title_field=SemanticField(field_name="domain"),
+ content_fields=[SemanticField(field_name="description")]
+ )
+ )
+
+ index = SearchIndex(
+ name=index_name,
+ fields=fields,
+ vector_search=vector_search,
+ semantic_search=SemanticSearch(configurations=[semantic_config])
+ )
+
+ self.search_client.create_or_update_index(index)
+ return index_name
+
+ def _get_llm_client(self):
+ """Get or create LLM client for chat completions"""
+ if self._llm_client is not None:
+ return self._llm_client
+
+ from openai import AsyncAzureOpenAI
+
+ # Get configuration
+ model_deployment = os.environ.get("AZURE_OPENAI_CHATGPT_DEPLOYMENT", "chat")
+ endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT", "")
+ api_version = os.environ.get("AZURE_OPENAI_API_VERSION") or "2024-06-01"
+
+ print(f"DEBUG: Azure OpenAI Endpoint: {endpoint}")
+ print(f"DEBUG: Model Deployment: {model_deployment}")
+ print(f"DEBUG: API Version: {api_version}")
+
+ if not endpoint:
+ print("Error: AZURE_OPENAI_ENDPOINT is not set")
+ return None
+
+ try:
+ api_key = os.environ.get("AZURE_OPENAI_API_KEY_OVERRIDE")
+
+ if api_key:
+ print("AZURE_OPENAI_API_KEY_OVERRIDE found, using as api_key for Azure OpenAI client")
+ self._llm_client = AsyncAzureOpenAI(
+ api_key=api_key,
+ azure_endpoint=endpoint,
+ api_version=api_version,
+ max_retries=3,
+ timeout=60.0
+ )
+ print("DEBUG: LLM client created with API key")
+ else:
+ from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
+
+ print("Using Azure credential (passwordless authentication) for Azure OpenAI client")
+ azure_credential = DefaultAzureCredential()
+ token_provider = get_bearer_token_provider(
+ azure_credential,
+ "https://cognitiveservices.azure.com/.default"
+ )
+
+ self._llm_client = AsyncAzureOpenAI(
+ azure_endpoint=endpoint,
+ azure_ad_token_provider=token_provider,
+ api_version=api_version,
+ max_retries=3,
+ timeout=60.0
+ )
+ print("DEBUG: LLM client created with Managed Identity")
+
+ except Exception as e:
+ print(f"Error creating LLM client: {e}")
+ import traceback
+ traceback.print_exc()
+ return None
+
+ return self._llm_client
+
+ def _get_model_deployment(self) -> Optional[str]:
+ """Get the chat model deployment name"""
+ return os.environ.get("AZURE_OPENAI_GPT4_DEPLOYMENT",
+ os.environ.get("AZURE_OPENAI_CHATGPT_DEPLOYMENT"))
+
+ async def _call_llm_with_retry(self, prompt: str, system_prompt: str,
+ temperature: float = 0.3, max_tokens: int = 500) -> Optional[str]:
+ """Generic method to call LLM with retry logic"""
+ model_deployment = self._get_model_deployment()
+ if not model_deployment:
+ return None
+
+ client = self._get_llm_client()
+ if not client:
+ return None
+
+ max_retries = 3
+ for attempt in range(max_retries):
+ try:
+ response = await client.chat.completions.create(
+ model=model_deployment,
+ messages=[
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": prompt}
+ ],
+ temperature=temperature,
+ max_tokens=max_tokens,
+ timeout=30
+ )
+ return response.choices[0].message.content.strip()
+ except Exception as e:
+ if attempt < max_retries - 1:
+ print(f"Retry {attempt + 1}/{max_retries} after error: {str(e)}")
+ await asyncio.sleep(2 ** attempt)
+ else:
+ raise e
+ return None
+
+ def _parse_json_response(self, response_text: str, fallback_pattern: str = r'"([^"]+)"') -> List[str]:
+ """Parse JSON response with fallback"""
+ try:
+ import json
+ result = json.loads(response_text)
+ if isinstance(result, list):
+ return [item for item in result if item and isinstance(item, str)]
+ except:
+ # Fallback: extract using regex
+ matches = re.findall(fallback_pattern, response_text)
+ if matches:
+ return matches
+ # Further fallback: split by newlines
+ return [line.strip() for line in response_text.split('\n') if line.strip()]
+ return []
+
+ async def extract_domain_characteristics(self, file_paths: List[str], category: str) -> Dict:
+ """Extract domain characteristics from documents"""
+ all_content = []
+ topics = set()
+
+ # Filter out non-document files
+ valid_extensions = ['.txt', '.md', '.docx', '.pdf', '.doc', '.html', '.csv', '.json',
+ '.pptx', '.xlsx']
+
+ document_paths = [
+ fp for fp in file_paths
+ if os.path.isfile(fp)
+ and not fp.endswith('.md5')
+ and any(fp.lower().endswith(ext) for ext in valid_extensions)
+ ]
+
+ if not document_paths:
+ print(f"No valid documents found for {category}")
+ return self._get_default_characteristics(category)
+
+ print(f"Processing {len(document_paths)} documents for {category}")
+
+ # Read and analyze documents
+ for file_path in document_paths[:10]: # Limit to first 10 documents
+ try:
+ print(f"Processing: {os.path.basename(file_path)}")
+ content = await self.read_file(file_path)
+ if content and len(content.strip()) > 50:
+ all_content.append(content)
+ extracted_topics = await self.extract_topics(content)
+ topics.update(extracted_topics)
+ except Exception as e:
+ print(f"Error processing file {file_path}: {e}")
+ continue
+
+ if not all_content:
+ print(f"No content extracted from {category} documents")
+ return self._get_default_characteristics(category)
+
+ # Generate sample questions and keywords
+ sample_questions = await self.generate_sample_questions(all_content[:5])
+ keywords = await self.extract_keywords(all_content)
+
+ return {
+ "domain": category,
+ "sample_questions": sample_questions,
+ "keywords": keywords,
+ "topics": list(topics)[:20],
+ "description": f"Documents related to {category} - {', '.join(list(topics)[:5])}"
+ }
+
+ def _get_default_characteristics(self, category: str) -> Dict:
+ """Get default characteristics for a category"""
+ return {
+ "domain": category,
+ "sample_questions": [f"What is {category}?", f"How does {category} work?", f"What are the key features of {category}?"],
+ "keywords": [category.lower()],
+ "topics": [category],
+ "description": f"Documents related to {category}"
+ }
+
+ async def populate_classifier_index(self, cosmic_docs: List[str], substrate_docs: List[str]):
+ """Populate the classifier index with domain information"""
+ index_name = "domain-classifier-index"
+
+ # Create a synchronous SearchClient for the index
+ search_client = SearchClient(
+ endpoint=self.search_client._endpoint,
+ index_name=index_name,
+ credential=self.search_client._credential
+ )
+
+ # Extract characteristics for each domain
+ cosmic_chars = await self.extract_domain_characteristics(cosmic_docs, "Cosmic")
+ substrate_chars = await self.extract_domain_characteristics(substrate_docs, "Substrate")
+
+ # Create embeddings for each domain
+ embeddings = await self._create_embeddings_for_domains(cosmic_chars, substrate_chars)
+ if not embeddings:
+ print("Failed to create embeddings")
+ return
+
+ cosmic_embedding, substrate_embedding = embeddings
+
+ # Verify embedding dimensions
+ expected_dims = int(os.getenv("AZURE_OPENAI_EMB_DIMENSIONS", "1536"))
+ print(f"Expected embedding dimensions: {expected_dims}")
+ print(f"Actual cosmic embedding dimensions: {len(cosmic_embedding)}")
+ print(f"Actual substrate embedding dimensions: {len(substrate_embedding)}")
+
+ # Upload to index
+ documents = [
+ self._create_document("cosmic-domain", cosmic_chars, cosmic_embedding),
+ self._create_document("substrate-domain", substrate_chars, substrate_embedding)
+ ]
+
+ try:
+ result = search_client.upload_documents(documents=documents)
+ print(f"\nUploaded {len(documents)} domain classifier documents")
+ for r in result:
+ if hasattr(r, 'succeeded'):
+ print(f"Document {r.key}: {r.succeeded}")
+ else:
+ print(f"Document uploaded: {r}")
+ except Exception as e:
+ print(f"Error uploading documents: {e}")
+ import traceback
+ traceback.print_exc()
+
+ async def _create_embeddings_for_domains(self, cosmic_chars: Dict, substrate_chars: Dict) -> Optional[tuple]:
+ """Create embeddings for domain characteristics"""
+ cosmic_text = json.dumps(cosmic_chars)
+ substrate_text = json.dumps(substrate_chars)
+
+ try:
+ cosmic_response = await self.embeddings_service.create_embeddings([cosmic_text])
+ substrate_response = await self.embeddings_service.create_embeddings([substrate_text])
+
+ # Extract embeddings
+ cosmic_embedding = self._extract_embedding(cosmic_response)
+ substrate_embedding = self._extract_embedding(substrate_response)
+
+ return (cosmic_embedding, substrate_embedding)
+ except Exception as e:
+ print(f"Error creating embeddings: {e}")
+ return None
+
+ def _extract_embedding(self, response) -> List[float]:
+ """Extract embedding vector from response"""
+ if isinstance(response, list):
+ embedding = response[0]
+ else:
+ embedding = response.data[0].embedding
+
+ if not isinstance(embedding, list):
+ embedding = list(embedding)
+
+ return embedding
+
+ def _create_document(self, doc_id: str, characteristics: Dict, embedding: List[float]) -> Dict:
+ """Create a document for indexing"""
+ return {
+ "id": doc_id,
+ "domain": characteristics["domain"],
+ "sample_questions": "\n".join(characteristics.get("sample_questions", [])),
+ "keywords": ", ".join(characteristics.get("keywords", [])),
+ "topics": ", ".join(characteristics.get("topics", [])),
+ "description": characteristics.get("description", ""),
+ "embedding": embedding
+ }
+
+ async def read_file(self, file_path: str) -> str:
+ """Read content from a file using appropriate parser"""
+ file_extension = os.path.splitext(file_path)[1].lower()
+
+ parser_map = {
+ '.docx': lambda: self._read_docx(file_path),
+ '.pdf': lambda: self._read_pdf(file_path),
+ '.html': lambda: self._read_with_parser(file_path, 'htmlparser', 'LocalHTMLParser'),
+ '.csv': lambda: self._read_with_parser(file_path, 'csvparser', 'CsvParser'),
+ '.json': lambda: self._read_with_parser(file_path, 'jsonparser', 'JsonParser'),
+ '.txt': lambda: self._read_with_parser(file_path, 'textparser', 'TextParser'),
+ '.md': lambda: self._read_with_parser(file_path, 'textparser', 'TextParser')
+ }
+
+ reader = parser_map.get(file_extension)
+ if reader:
+ try:
+ return await reader()
+ except Exception as e:
+ print(f"Error reading {file_extension} file {file_path}: {e}")
+ return ""
+ else:
+ # Default text reading
+ try:
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+ return f.read()
+ except Exception as e:
+ print(f"Error reading file {file_path}: {e}")
+ return ""
+
+ async def _read_with_parser(self, file_path: str, module_name: str, class_name: str) -> str:
+ """Generic method to read files with parsers"""
+ try:
+ module = __import__(f'prepdocslib.{module_name}', fromlist=[class_name])
+ parser_class = getattr(module, class_name)
+ parser = parser_class()
+
+ with open(file_path, 'rb') as f:
+ pages = []
+ async for page in parser.parse(f):
+ pages.append(page.text if hasattr(page, 'text') else str(page))
+ return '\n'.join(pages)
+ except Exception as e:
+ print(f"Error using {class_name}: {e}")
+ return ""
+
+ async def _read_docx(self, file_path: str) -> str:
+ """Read DOCX files"""
+ return await self._read_with_parser(file_path, 'docxparser', 'DocxHyperlinkParser')
+
+ async def _read_pdf(self, file_path: str) -> str:
+ """Read PDF files"""
+ use_local_pdf_parser = os.getenv("USE_LOCAL_PDF_PARSER", "false").lower() != "false"
+
+ if use_local_pdf_parser or os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE") is None:
+ return await self._read_with_parser(file_path, 'pdfparser', 'LocalPdfParser')
+ else:
+ # Use Document Intelligence parser
+ try:
+ from prepdocslib.pdfparser import DocumentAnalysisParser
+ from azure.identity import DefaultAzureCredential
+
+ doc_int_service = os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE")
+ credential = DefaultAzureCredential()
+
+ parser = DocumentAnalysisParser(
+ endpoint=f"https://{doc_int_service}.cognitiveservices.azure.com/",
+ credential=credential
+ )
+
+ with open(file_path, 'rb') as f:
+ pages = []
+ async for page in parser.parse(f):
+ pages.append(page.text)
+ return '\n'.join(pages)
+ except Exception as e:
+ print(f"Error using Document Intelligence: {e}")
+ return ""
+
+ async def extract_topics(self, content: str) -> List[str]:
+ """Extract topics from content using LLM"""
+ if not content or len(content.strip()) < 50:
+ return []
+
+ max_content_length = 90000
+ if len(content) > max_content_length:
+ content = content[:max_content_length] + "..."
+
+ prompt = f"""Analyze the following technical document content and extract the main topics, technologies, and themes.
+Focus on specific technical concepts, tools, platforms, and processes mentioned.
+Return 10-15 specific topics as a JSON array of strings.
+Be specific - instead of generic terms like "monitoring", use specific terms like "Azure Monitor" or "performance telemetry".
+
+Document content:
+{content}
+
+Example output format: ["Azure Kubernetes Service", "container orchestration", "microservices architecture", "CI/CD pipelines", "load balancing"]
+"""
+
+ try:
+ response = await self._call_llm_with_retry(
+ prompt,
+ "You are a technical documentation analyst. Extract specific, concrete topics from technical documents.",
+ temperature=0.3,
+ max_tokens=500
+ )
+
+ if response:
+ topics = self._parse_json_response(response)
+ return topics[:15]
+ except Exception as e:
+ print(f"Error extracting topics with LLM: {e}")
+
+ return self._fallback_topic_extraction(content)
+
+ def _fallback_topic_extraction(self, content: str) -> List[str]:
+ """Fallback topic extraction with domain-specific keywords"""
+ topics = []
+
+ domain_keywords = {
+ "cosmic": ["cosmic", "watson", "performance monitoring", "diagnostics", "telemetry",
+ "container platform", "microservices", "kubernetes", "docker", "orchestration"],
+ "substrate": ["substrate", "infrastructure", "cloud platform", "azure", "deployment",
+ "virtual machines", "networking", "storage", "security", "compliance"]
+ }
+
+ content_lower = content.lower()
+
+ # Check for domain-specific keywords
+ for keywords in domain_keywords.values():
+ for keyword in keywords:
+ if keyword in content_lower:
+ topics.append(keyword.title())
+
+ # Extract capitalized phrases
+ import re
+ capitalized_phrases = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b', content)
+ topics.extend(list(set(capitalized_phrases))[:5])
+
+ return list(set(topics))[:10]
+
+ async def generate_sample_questions(self, contents: List[str]) -> List[str]:
+ """Generate sample questions using LLM based on actual document content"""
+ if not contents:
+ return ["What are the key features?", "How do I configure this?", "What are common issues?"]
+
+ combined_content = "\n\n---\n\n".join(contents[:5])
+ max_content_length = 90000
+ if len(combined_content) > max_content_length:
+ combined_content = combined_content[:max_content_length] + "..."
+
+ prompt = f"""Based on the following technical documentation content, generate 7-10 realistic questions that users would ask.
+The questions should be specific to the actual content, mentioning specific features, tools, or processes described in the documents.
+Include a mix of how-to questions, troubleshooting questions, and conceptual questions.
+Return only the questions as a JSON array of strings.
+
+Document content:
+{combined_content}
+
+Example output format: ["How do I configure Watson monitoring for my Cosmic deployment?", "What are the performance tuning options for Substrate?", "How can I troubleshoot connection issues in the platform?"]
+"""
+
+ try:
+ response = await self._call_llm_with_retry(
+ prompt,
+ "You are a technical support expert. Generate realistic questions users would ask about the documented technology.",
+ temperature=0.7,
+ max_tokens=600
+ )
+
+ if response:
+ questions = self._parse_json_response(response, r'"([^"]+\?)"')
+ return [q for q in questions if '?' in q][:10]
+ except Exception as e:
+ print(f"Error generating questions with LLM: {e}")
+
+ return self._generate_fallback_questions(contents)
+
+ def _generate_fallback_questions(self, contents: List[str]) -> List[str]:
+ """Generate context-aware fallback questions"""
+ combined = " ".join(contents[:2])[:1000].lower() if contents else ""
+
+ question_templates = {
+ "cosmic": [
+ "How do I set up Cosmic monitoring for my application?",
+ "What are the performance metrics available in Cosmic?",
+ "How can I troubleshoot Cosmic deployment issues?"
+ ],
+ "substrate": [
+ "How do I configure Substrate infrastructure?",
+ "What are the security best practices for Substrate?",
+ "How can I scale my Substrate deployment?"
+ ],
+ "error": ["How do I diagnose and fix common errors?"],
+ "performance": ["What are the performance optimization techniques?"],
+ "config": ["What are the initial configuration steps?"]
+ }
+
+ questions = []
+ for keyword, template_questions in question_templates.items():
+ if keyword in combined:
+ questions.extend(template_questions)
+
+ if not questions:
+ questions = [
+ "What are the key features?",
+ "How do I get started?",
+ "What are the best practices?"
+ ]
+
+ return questions[:7]
+
+ async def extract_keywords(self, contents: List[str]) -> List[str]:
+ """Extract keywords from contents using LLM"""
+ if not contents:
+ return ["technical", "documentation", "system"]
+
+ combined_content = "\n\n".join(contents[:8])
+ max_content_length = 90000
+ if len(combined_content) > max_content_length:
+ combined_content = combined_content[:max_content_length] + "..."
+
+ prompt = f"""Extract 20-30 important technical keywords and key phrases from the following documentation.
+Focus on:
+- Product names and features
+- Technical terms and technologies
+- Platform-specific concepts
+- Tools and services mentioned
+- Important processes or methodologies
+
+Avoid generic terms. Be specific to the actual content.
+Return only the keywords as a JSON array of strings.
+
+Document content:
+{combined_content}
+
+Example output format: ["Azure Kubernetes Service", "Watson Analytics", "container orchestration", "performance telemetry", "microservice deployment"]
+"""
+
+ try:
+ response = await self._call_llm_with_retry(
+ prompt,
+ "You are a technical keyword extraction expert. Extract specific, meaningful keywords from technical documentation.",
+ temperature=0.3,
+ max_tokens=500
+ )
+
+ if response:
+ keywords = self._parse_json_response(response)
+ return list(set(keywords))[:30]
+ except Exception as e:
+ print(f"Error extracting keywords with LLM: {e}")
+
+ return self._enhanced_keyword_extraction(contents)
+
+ def _enhanced_keyword_extraction(self, contents: List[str]) -> List[str]:
+ """Enhanced fallback keyword extraction"""
+ import re
+
+ keywords = set()
+
+ for content in contents[:3]:
+ # Extract various patterns
+ patterns = [
+ r'\b[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\b', # Capitalized phrases
+ r'"([^"]+)"', # Quoted text
+ r'\b\w+[-]\w+\b|\b\w+\d+\w*\b' # Technical terms
+ ]
+
+ for pattern in patterns:
+ matches = re.findall(pattern, content)
+ if pattern == r'"([^"]+)"':
+ keywords.update([m for m in matches if len(m.split()) <= 3])
+ else:
+ keywords.update(matches)
+
+ # Filter and clean keywords
+ stopwords = self._get_stopwords()
+ cleaned_keywords = [
+ kw.strip() for kw in keywords
+ if len(kw.strip()) > 3 and len(kw.split()) <= 3
+ and kw.lower() not in stopwords
+ ]
+
+ # Sort by length and alphabetically
+ cleaned_keywords.sort(key=lambda x: (-len(x.split()), x))
+
+ return cleaned_keywords[:20]
+
+ def _get_stopwords(self) -> set:
+ """Get common stopwords to filter out"""
+ return {
+ "the", "is", "at", "which", "on", "and", "a", "an", "as", "are",
+ "was", "were", "been", "have", "has", "had", "do", "does", "did",
+ "will", "would", "should", "could", "may", "might", "must", "can",
+ "to", "of", "in", "for", "with", "from", "this", "that", "these",
+ "those", "then", "than", "when", "where", "what", "how", "why"
+ }
+
+ async def __aenter__(self):
+ """Async context manager entry"""
+ return self
+
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
+ """Async context manager exit - cleanup resources"""
+ if self._llm_client:
+ try:
+ await self._llm_client.close()
+ except:
+ pass
+ self._llm_client = None
\ No newline at end of file
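
To make the intent of this backup module concrete: once populated, the domain-classifier-index can be queried at request time with a question embedding to pick the closest domain. Below is a minimal sketch, assuming the same create_embeddings interface used above, the azure-search-documents VectorizedQuery model, and an AZURE_SEARCH_ENDPOINT environment variable; all of these are assumptions for illustration, not part of this file.

    import os

    from azure.identity import DefaultAzureCredential
    from azure.search.documents import SearchClient
    from azure.search.documents.models import VectorizedQuery


    async def classify_question(question: str, embeddings_service) -> str:
        # Embed the incoming question with the same embeddings service used to build the index
        response = await embeddings_service.create_embeddings([question])
        query_vector = response[0] if isinstance(response, list) else response.data[0].embedding

        client = SearchClient(
            endpoint=os.environ["AZURE_SEARCH_ENDPOINT"],  # assumed env var name
            index_name="domain-classifier-index",
            credential=DefaultAzureCredential(),
        )
        results = client.search(
            search_text=question,
            vector_queries=[VectorizedQuery(vector=query_vector, k_nearest_neighbors=1, fields="embedding")],
            select=["domain"],
            top=1,
        )
        for result in results:
            return result["domain"]  # e.g. "Cosmic" or "Substrate"
        return "Unknown"
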
diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py
index f03baac0dc..1b2172c1b6 100644
--- a/app/backend/prepdocs.py
+++ b/app/backend/prepdocs.py
@@ -1,11 +1,14 @@
+#!/usr/bin/env python3
import argparse
import asyncio
+import glob
import logging
import os
from typing import Optional, Union
from azure.core.credentials import AzureKeyCredential
from azure.core.credentials_async import AsyncTokenCredential
+from azure.identity import DefaultAzureCredential
from azure.identity.aio import AzureDeveloperCliCredential, get_bearer_token_provider
from rich.logging import RichHandler
@@ -34,6 +37,10 @@
from prepdocslib.strategy import DocumentAction, SearchInfo, Strategy
from prepdocslib.textparser import TextParser
from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter
+from prepdocslib.docxparser import DocxHyperlinkParser
+from prepdocslib.hyperlinktextsplitter import HyperlinkAwareTextSplitter
+
+from prepdocslib.domain_classifier_setup import DomainClassifierSetup
logger = logging.getLogger("scripts")
@@ -115,7 +122,7 @@ def setup_list_file_strategy(
data_lake_storage_account=datalake_storage_account,
data_lake_filesystem=datalake_filesystem,
data_lake_path=datalake_path,
- credential=adls_gen2_creds,
+ credential=adls_gen2_creds, # Should be adls_gen2_creds
)
elif local_files:
logger.info("Using local files: %s", local_files)
@@ -180,6 +187,10 @@ def setup_file_processors(
content_understanding_endpoint: Union[str, None] = None,
):
sentence_text_splitter = SentenceTextSplitter()
+ # Add hyperlink-aware text splitter
+ hyperlink_text_splitter = HyperlinkAwareTextSplitter(
+ add_target_blank=True, # This ensures target="_blank" is added
+ )
doc_int_parser: Optional[DocumentAnalysisParser] = None
# check if Azure Document Intelligence credentials are provided
@@ -194,10 +205,15 @@ def setup_file_processors(
content_understanding_endpoint=content_understanding_endpoint,
)
+ # Create DOCX parser that preserves hyperlinks
+ docx_hyperlink_parser = DocxHyperlinkParser()
+
pdf_parser: Optional[Parser] = None
if local_pdf_parser or document_intelligence_service is None:
+ print("localpdf")
pdf_parser = LocalPdfParser()
elif document_intelligence_service is not None:
+ print("docintpdf")
pdf_parser = doc_int_parser
else:
logger.warning("No PDF parser available")
@@ -216,17 +232,18 @@ def setup_file_processors(
".md": FileProcessor(TextParser(), sentence_text_splitter),
".txt": FileProcessor(TextParser(), sentence_text_splitter),
".csv": FileProcessor(CsvParser(), sentence_text_splitter),
+ # Use hyperlink-aware parser and splitter for DOCX files
+ ".docx": FileProcessor(docx_hyperlink_parser, hyperlink_text_splitter),
}
# These require either a Python package or Document Intelligence
if pdf_parser is not None:
file_processors.update({".pdf": FileProcessor(pdf_parser, sentence_text_splitter)})
if html_parser is not None:
file_processors.update({".html": FileProcessor(html_parser, sentence_text_splitter)})
- # These file formats require Document Intelligence
+ # These file formats require Document Intelligence (except DOCX which now uses custom parser)
if doc_int_parser is not None:
file_processors.update(
{
- ".docx": FileProcessor(doc_int_parser, sentence_text_splitter),
".pptx": FileProcessor(doc_int_parser, sentence_text_splitter),
".xlsx": FileProcessor(doc_int_parser, sentence_text_splitter),
".png": FileProcessor(doc_int_parser, sentence_text_splitter),
@@ -261,6 +278,74 @@ async def main(strategy: Strategy, setup_index: bool = True):
await strategy.run()
+async def setup_domain_classifier(embeddings_service, search_info):
+ """Setup domain classifier with the provided search info"""
+ # Check if embeddings service is available
+ if not embeddings_service:
+ logger.warning("No embeddings service available. Skipping domain classifier setup.")
+ return
+
+ # Check if data directories exist
+ cosmic_path = "./data/cosmic"
+ substrate_path = "./data/substrate"
+
+ if not os.path.exists(cosmic_path) or not os.path.exists(substrate_path):
+ logger.warning(f"Domain classifier data directories not found. Expected: {cosmic_path} and {substrate_path}")
+ logger.info("Skipping domain classifier setup. Create the directories and add sample documents to enable this feature.")
+ return
+
+ # Import the synchronous DefaultAzureCredential for the SearchIndexClient
+ from azure.identity import DefaultAzureCredential
+ from azure.search.documents.indexes import SearchIndexClient
+
+ # Create a synchronous credential for the SearchIndexClient
+ sync_credential = DefaultAzureCredential()
+
+ search_index_client = None
+ try:
+ # Create a synchronous search index client for the domain classifier
+ search_index_client = SearchIndexClient(
+ endpoint=search_info.endpoint,
+ credential=sync_credential
+ )
+
+ # Initialize classifier setup with the search index client
+ classifier_setup = DomainClassifierSetup(
+ search_client=search_index_client,
+ embeddings_service=embeddings_service,
+ )
+
+ # Use async context manager to ensure proper cleanup
+ async with classifier_setup:
+ # Create the classifier index (synchronous operation)
+ classifier_setup.create_domain_classifier_index()
+
+ # Get document lists
+ cosmic_docs = glob.glob("./data/cosmic/**/*", recursive=True)
+ substrate_docs = glob.glob("./data/substrate/**/*", recursive=True)
+
+ # Filter to only actual files
+ cosmic_docs = [f for f in cosmic_docs if os.path.isfile(f)]
+ substrate_docs = [f for f in substrate_docs if os.path.isfile(f)]
+
+ if not cosmic_docs and not substrate_docs:
+ logger.warning("No documents found in data directories. Add documents to enable domain classification.")
+ return
+
+ # Populate the classifier with domain knowledge
+ await classifier_setup.populate_classifier_index(cosmic_docs, substrate_docs)
+
+ logger.info("Domain classifier index created and populated successfully!")
+
+ finally:
+ # Clean up the synchronous client and credential
+ if search_index_client:
+ search_index_client.close()
+ # Close the sync credential to prevent resource leaks
+ if hasattr(sync_credential, 'close'):
+ sync_credential.close()
+
+
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Prepare documents by extracting content from PDFs, splitting content into sections, uploading to blob storage, and indexing in a search index."
@@ -312,7 +397,35 @@ async def main(strategy: Strategy, setup_index: bool = True):
help="Search service system assigned Identity (Managed identity) (used for integrated vectorization)",
)
+ parser.add_argument(
+ "--skip-domain-classifier",
+ action="store_true",
+ help="Skip domain classifier setup (useful when running multiple times)",
+ )
+
+ parser.add_argument(
+ "--skip-main-processing",
+ action="store_true",
+ help="Skip main document processing and only run domain classifier",
+ )
+
+ parser.add_argument(
+ "--domain-classifier-only",
+ action="store_true",
+ help="Only run domain classifier setup without processing documents",
+ )
+
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
+ parser.add_argument(
+ "--searchindex",
+ required=False,
+ help="Override the search index name (overrides AZURE_SEARCH_INDEX env var)"
+ )
+ parser.add_argument(
+ "--searchagent",
+ required=False,
+ help="Override the search agent name (overrides AZURE_SEARCH_AGENT env var)"
+ )
args = parser.parse_args()
if args.verbose:
@@ -323,6 +436,15 @@ async def main(strategy: Strategy, setup_index: bool = True):
load_azd_env()
+ # Override environment variables with command line arguments if provided
+ if args.searchindex:
+ os.environ["AZURE_SEARCH_INDEX"] = args.searchindex
+ logger.info(f"Using command line override for search index: {args.searchindex}")
+
+ if args.searchagent:
+ os.environ["AZURE_SEARCH_AGENT"] = args.searchagent
+ logger.info(f"Using command line override for search agent: {args.searchagent}")
+
if os.getenv("AZURE_PUBLIC_NETWORK_ACCESS") == "Disabled":
logger.error("AZURE_PUBLIC_NETWORK_ACCESS is set to Disabled. Exiting.")
exit(0)
@@ -440,7 +562,7 @@ async def main(strategy: Strategy, setup_index: bool = True):
azure_credential=azd_credential,
document_intelligence_service=os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE"),
document_intelligence_key=clean_key_if_exists(args.documentintelligencekey),
- local_pdf_parser=os.getenv("USE_LOCAL_PDF_PARSER") == "true",
+ local_pdf_parser=os.getenv("USE_LOCAL_PDF_PARSER") == "false",
local_html_parser=os.getenv("USE_LOCAL_HTML_PARSER") == "true",
search_images=use_gptvision,
use_content_understanding=use_content_understanding,
@@ -469,5 +591,33 @@ async def main(strategy: Strategy, setup_index: bool = True):
content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"),
)
- loop.run_until_complete(main(ingestion_strategy, setup_index=not args.remove and not args.removeall))
+ # If only running domain classifier
+ if args.domain_classifier_only:
+ logger.info("Running domain classifier setup only")
+ try:
+ loop.run_until_complete(setup_domain_classifier(openai_embeddings_service, search_info))
+ except Exception as e:
+ logger.error(f"Failed to setup domain classifier: {e}")
+ import traceback
+ traceback.print_exc()
+ else:
+ # Normal processing
+ # Only run main processing if not skipped
+ if not args.skip_main_processing:
+ loop.run_until_complete(main(ingestion_strategy, setup_index=True))
+
+ # After the main ingestion is complete, set up domain classifier
+ if not args.skip_domain_classifier:
+ try:
+ loop.run_until_complete(setup_domain_classifier(openai_embeddings_service, search_info))
+ except Exception as e:
+ logger.error(f"Failed to setup domain classifier: {e}")
+ import traceback
+ traceback.print_exc()
+
+ # Ensure all async operations are complete before closing the loop
+ pending = asyncio.all_tasks(loop)
+ if pending:
+ loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
+
loop.close()
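
The three new flags interact as follows: --domain-classifier-only short-circuits everything else, while the two --skip-* flags independently disable one step of a normal run. Distilled into a small illustrative helper (not part of the patch):

    def plan_steps(domain_classifier_only: bool, skip_main_processing: bool, skip_domain_classifier: bool) -> list:
        # Mirrors the branch above: classifier-only wins, otherwise each step is opt-out.
        if domain_classifier_only:
            return ["domain_classifier"]
        steps = []
        if not skip_main_processing:
            steps.append("ingestion")
        if not skip_domain_classifier:
            steps.append("domain_classifier")
        return steps

    # plan_steps(False, False, False) -> ["ingestion", "domain_classifier"]
    # plan_steps(True, False, False)  -> ["domain_classifier"]
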
diff --git a/app/backend/prepdocslib/docxparser.py b/app/backend/prepdocslib/docxparser.py
new file mode 100644
index 0000000000..8c3431ded3
--- /dev/null
+++ b/app/backend/prepdocslib/docxparser.py
@@ -0,0 +1,120 @@
+import logging
+import re
+from collections.abc import AsyncGenerator
+from typing import Dict, List, Optional, Union
+from zipfile import ZipFile
+import xml.etree.ElementTree as ET
+from io import BytesIO, BufferedReader
+
+from .page import Page
+from .parser import Parser
+
+logger = logging.getLogger("scripts")
+
+
+class DocxHyperlinkParser(Parser):
+ """
+ Parser that extracts text from DOCX files while preserving hyperlinks
+ """
+
+ def __init__(self):
+ self.hyperlink_pattern = re.compile(r'<a href="[^"]*"[^>]*>([^<]*)</a>')
+
+ async def parse(self, content: Union[bytes, BufferedReader]) -> AsyncGenerator[Page, None]:
+ try:
+ # Convert BufferedReader to bytes if necessary
+ if isinstance(content, BufferedReader):
+ content_bytes = content.read()
+ else:
+ content_bytes = content
+
+ # Parse DOCX file
+ docx_content = self._extract_text_with_hyperlinks(content_bytes)
+
+ if docx_content.strip():
+ yield Page(page_num=0, offset=0, text=docx_content)
+ except Exception as e:
+ logger.warning(f"Error parsing DOCX content: {e}")
+ return
+
+ def _extract_text_with_hyperlinks(self, content: bytes) -> str:
+ """Extract text from DOCX while preserving hyperlinks as HTML markup"""
+ try:
+ with ZipFile(BytesIO(content)) as docx_zip:
+ # Read the main document
+ document_xml = docx_zip.read('word/document.xml')
+
+ # Read relationships to get hyperlink targets
+ relationships = self._parse_relationships(docx_zip)
+
+ # Parse the document XML
+ root = ET.fromstring(document_xml)
+
+ # Define namespaces
+ namespaces = {
+ 'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
+ 'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
+ }
+
+ text_parts = []
+
+ # Extract text and hyperlinks
+ for paragraph in root.findall('.//w:p', namespaces):
+ paragraph_text = self._extract_paragraph_with_links(
+ paragraph, relationships, namespaces
+ )
+ if paragraph_text.strip():
+ text_parts.append(paragraph_text)
+
+ return '\n\n'.join(text_parts)
+
+ except Exception as e:
+ logger.error(f"Error extracting DOCX content: {e}")
+ return ""
+
+ def _parse_relationships(self, docx_zip: ZipFile) -> Dict[str, str]:
+ """Parse relationships file to get hyperlink targets"""
+ relationships = {}
+ try:
+ rels_xml = docx_zip.read('word/_rels/document.xml.rels')
+ root = ET.fromstring(rels_xml)
+
+ for relationship in root.findall('.//{http://schemas.openxmlformats.org/package/2006/relationships}Relationship'):
+ rel_id = relationship.get('Id')
+ target = relationship.get('Target')
+ rel_type = relationship.get('Type')
+
+ # Only store hyperlink relationships
+ if rel_type == 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink':
+ relationships[rel_id] = target
+
+ except Exception as e:
+ logger.warning(f"Could not parse relationships: {e}")
+
+ return relationships
+
+ def _extract_paragraph_with_links(self, paragraph, relationships: Dict[str, str], namespaces: Dict[str, str]) -> str:
+ """Extract text from a paragraph, preserving hyperlinks"""
+ text_parts = []
+
+ # Pre-collect the text runs that live inside hyperlinks so they are not emitted twice
+ hyperlink_text_elems = set()
+ for link in paragraph.findall('.//w:hyperlink', namespaces):
+ hyperlink_text_elems.update(link.findall('.//w:t', namespaces))
+
+ # Process all elements in the paragraph
+ for elem in paragraph.iter():
+ if elem.tag.endswith('hyperlink'):
+ # This is a hyperlink element
+ rel_id = elem.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id')
+ link_text = ''.join(t.text for t in elem.findall('.//w:t', namespaces) if t.text)
+
+ if rel_id and rel_id in relationships:
+ url = relationships[rel_id]
+ text_parts.append(f'<a href="{url}" target="_blank">{link_text}</a>')
+ else:
+ # Fallback to plain text if relationship not found
+ text_parts.append(link_text)
+ elif elem.tag.endswith('}t') and elem.text:
+ # Regular text element - skip it if it sits inside a hyperlink handled above
+ if elem not in hyperlink_text_elems:
+ text_parts.append(elem.text)
+
+ return ''.join(text_parts)
\ No newline at end of file
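
A quick way to exercise the new parser on its own; the path below is a placeholder, and the snippet assumes only the parse()/Page.text interface defined in this file:

    import asyncio

    from prepdocslib.docxparser import DocxHyperlinkParser


    async def dump_docx(path: str) -> str:
        parser = DocxHyperlinkParser()
        pages = []
        with open(path, "rb") as f:
            async for page in parser.parse(f):
                pages.append(page.text)
        return "\n".join(pages)

    # print(asyncio.run(dump_docx("sample.docx")))  # hyperlinks appear as <a href="..." target="_blank"> markup
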
diff --git a/app/backend/prepdocslib/domain_classifier_setup.py b/app/backend/prepdocslib/domain_classifier_setup.py
new file mode 100644
index 0000000000..4ebc1fd33a
--- /dev/null
+++ b/app/backend/prepdocslib/domain_classifier_setup.py
@@ -0,0 +1,799 @@
+import os
+import re
+import json
+import asyncio
+from typing import List, Dict, Optional
+from azure.search.documents import SearchClient
+from azure.search.documents.indexes import SearchIndexClient
+from azure.search.documents.indexes.models import (
+ SearchIndex,
+ SimpleField,
+ SearchableField,
+ SearchField,
+ VectorSearch,
+ HnswAlgorithmConfiguration,
+ VectorSearchProfile,
+ SemanticConfiguration,
+ SemanticPrioritizedFields,
+ SemanticField,
+ SemanticSearch,
+ SearchFieldDataType
+)
+
+class DomainClassifierSetup:
+ def __init__(self, search_client: SearchIndexClient, embeddings_service):
+ self.search_client = search_client
+ self.embeddings_service = embeddings_service
+ self._llm_client = None
+ self._credential = None # Add this to track the credential
+
+ def create_domain_classifier_index(self):
+ """Create a specialized index for domain classification"""
+ index_name = "domain-classifier-index"
+
+ # Get the actual embedding dimensions from the environment
+ embedding_dimensions = int(os.getenv("AZURE_OPENAI_EMB_DIMENSIONS", "1536"))
+ print(f"Creating index with embedding dimensions: {embedding_dimensions}")
+
+ # Check if index already exists
+ try:
+ existing_index = self.search_client.get_index(index_name)
+ # Check if embedding dimensions match
+ existing_embedding_field = next((f for f in existing_index.fields if f.name == "embedding"), None)
+ if existing_embedding_field and hasattr(existing_embedding_field, 'vector_search_dimensions'):
+ if existing_embedding_field.vector_search_dimensions == embedding_dimensions:
+ print(f"Index '{index_name}' already exists with correct dimensions. Using existing index.")
+ return index_name
+ else:
+ print(f"Index '{index_name}' exists with different dimensions ({existing_embedding_field.vector_search_dimensions}). Deleting and recreating...")
+ self.search_client.delete_index(index_name)
+ else:
+ print(f"Index '{index_name}' exists but couldn't verify dimensions. Recreating...")
+ self.search_client.delete_index(index_name)
+ except Exception as e:
+ print(f"Index '{index_name}' doesn't exist. Creating new index...")
+
+ fields = [
+ SimpleField(name="id", type=SearchFieldDataType.String, key=True),
+ SearchableField(name="domain", type=SearchFieldDataType.String, facetable=True, filterable=True),
+ SearchableField(name="sample_questions", type=SearchFieldDataType.String),
+ SearchableField(name="keywords", type=SearchFieldDataType.String),
+ SearchableField(name="description", type=SearchFieldDataType.String),
+ # Change topics to a simple searchable string field instead of collection
+ SearchableField(name="topics", type=SearchFieldDataType.String),
+ SearchField(
+ name="embedding",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
+ searchable=True,
+ vector_search_dimensions=embedding_dimensions,
+ vector_search_profile_name="myHnswProfile"
+ )
+ ]
+
+ vector_search = VectorSearch(
+ algorithms=[HnswAlgorithmConfiguration(name="myHnsw")],
+ profiles=[VectorSearchProfile(name="myHnswProfile", algorithm_configuration_name="myHnsw")]
+ )
+
+ semantic_config = SemanticConfiguration(
+ name="default",
+ prioritized_fields=SemanticPrioritizedFields(
+ title_field=SemanticField(field_name="domain"),
+ content_fields=[SemanticField(field_name="description")]
+ )
+ )
+
+ index = SearchIndex(
+ name=index_name,
+ fields=fields,
+ vector_search=vector_search,
+ semantic_search=SemanticSearch(configurations=[semantic_config])
+ )
+
+ # create_or_update_index is synchronous
+ self.search_client.create_or_update_index(index)
+ return index_name
+
+ def _get_llm_client(self):
+ """Get or create LLM client for chat completions"""
+ if self._llm_client is not None:
+ return self._llm_client
+
+ from openai import AsyncAzureOpenAI
+
+ # Get configuration
+ model_deployment = os.environ.get("AZURE_OPENAI_CHATGPT_DEPLOYMENT", "chat")
+ endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT", "")
+ api_version = os.environ.get("AZURE_OPENAI_API_VERSION") or "2024-06-01"
+
+ print(f"DEBUG: Azure OpenAI Endpoint: {endpoint}")
+ print(f"DEBUG: Model Deployment: {model_deployment}")
+ print(f"DEBUG: API Version: {api_version}")
+
+ if not endpoint:
+ print("Error: AZURE_OPENAI_ENDPOINT is not set")
+ return None
+
+ try:
+ api_key = os.environ.get("AZURE_OPENAI_API_KEY_OVERRIDE")
+
+ if api_key:
+ print("AZURE_OPENAI_API_KEY_OVERRIDE found, using as api_key for Azure OpenAI client")
+ self._llm_client = AsyncAzureOpenAI(
+ api_key=api_key,
+ azure_endpoint=endpoint,
+ api_version=api_version,
+ max_retries=3,
+ timeout=60.0
+ )
+ print("DEBUG: LLM client created with API key")
+ else:
+ from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
+
+ print("Using Azure credential (passwordless authentication) for Azure OpenAI client")
+ self._credential = DefaultAzureCredential() # Store the credential
+ token_provider = get_bearer_token_provider(
+ self._credential,
+ "https://cognitiveservices.azure.com/.default"
+ )
+
+ self._llm_client = AsyncAzureOpenAI(
+ azure_endpoint=endpoint,
+ azure_ad_token_provider=token_provider,
+ api_version=api_version,
+ max_retries=3,
+ timeout=60.0
+ )
+ print("DEBUG: LLM client created with Managed Identity")
+
+ except Exception as e:
+ print(f"Error creating LLM client: {e}")
+ import traceback
+ traceback.print_exc()
+ return None
+
+ return self._llm_client
+
+ async def extract_domain_characteristics(self, file_paths: List[str], category: str) -> Dict:
+ """Extract domain characteristics from documents"""
+ all_content = []
+ topics = set()
+
+ # Filter out non-document files - include all supported formats from prepdocs.py
+ valid_extensions = ['.txt', '.md', '.docx', '.pdf', '.doc', '.html', '.csv', '.json',
+ '.pptx', '.xlsx']
+
+ # Filter to only actual files (not directories) and exclude .md5 files
+ document_paths = [
+ fp for fp in file_paths
+ if os.path.isfile(fp)
+ and not fp.endswith('.md5')
+ and any(fp.lower().endswith(ext) for ext in valid_extensions)
+ ]
+
+ if not document_paths:
+ print(f"No valid documents found for {category}")
+ return {
+ "domain": category,
+ "sample_questions": [f"What is {category}?", f"How does {category} work?"],
+ "keywords": [category.lower()],
+ "topics": [category],
+ "description": f"Documents related to {category}"
+ }
+
+ print(f"Processing {len(document_paths)} documents for {category}")
+
+ # Read and analyze documents
+ for file_path in document_paths[:10]: # Limit to first 10 documents
+ try:
+ print(f"Processing: {os.path.basename(file_path)}")
+ content = await self.read_file(file_path)
+ if content and len(content.strip()) > 50: # Only process non-empty content
+ all_content.append(content)
+
+ # Extract topics using LLM
+ extracted_topics = await self.extract_topics(content)
+ topics.update(extracted_topics)
+ except Exception as e:
+ print(f"Error processing file {file_path}: {e}")
+ continue
+
+ if not all_content:
+ print(f"No content extracted from {category} documents")
+ return {
+ "domain": category,
+ "sample_questions": [f"What is {category}?", f"How does {category} work?"],
+ "keywords": [category.lower()],
+ "topics": [category],
+ "description": f"Documents related to {category}"
+ }
+
+ # Generate sample questions and keywords
+ sample_questions = await self.generate_sample_questions(all_content[:5]) # Use first 5 docs
+ keywords = await self.extract_keywords(all_content)
+
+ return {
+ "domain": category,
+ "sample_questions": sample_questions,
+ "keywords": keywords,
+ "topics": list(topics)[:20], # Limit topics
+ "description": f"Documents related to {category} - {', '.join(list(topics)[:5])}"
+ }
+
+ async def populate_classifier_index(self, cosmic_docs: List[str], substrate_docs: List[str]):
+ """Populate the classifier index with domain information"""
+ index_name = "domain-classifier-index"
+
+ # Create a synchronous SearchClient for the index (matching prepdocs.py pattern)
+ search_endpoint = self.search_client._endpoint
+ credential = self.search_client._credential
+
+ # Use synchronous SearchClient instead of async
+ from azure.search.documents import SearchClient
+ search_client = SearchClient(
+ endpoint=search_endpoint,
+ index_name=index_name,
+ credential=credential
+ )
+
+ # Extract characteristics for each domain
+ cosmic_chars = await self.extract_domain_characteristics(cosmic_docs, "Cosmic")
+ substrate_chars = await self.extract_domain_characteristics(substrate_docs, "Substrate")
+
+ # Create embeddings for each domain
+ cosmic_text = json.dumps(cosmic_chars)
+ substrate_text = json.dumps(substrate_chars)
+
+ # Get embeddings - ensure we get the vector array directly
+ cosmic_embedding_response = await self.embeddings_service.create_embeddings([cosmic_text])
+ substrate_embedding_response = await self.embeddings_service.create_embeddings([substrate_text])
+
+ # Extract the embedding vector from the response
+ cosmic_embedding = cosmic_embedding_response[0] if isinstance(cosmic_embedding_response, list) else cosmic_embedding_response.data[0].embedding
+ substrate_embedding = substrate_embedding_response[0] if isinstance(substrate_embedding_response, list) else substrate_embedding_response.data[0].embedding
+
+ # Ensure embeddings are lists of floats
+ if not isinstance(cosmic_embedding, list):
+ cosmic_embedding = list(cosmic_embedding)
+ if not isinstance(substrate_embedding, list):
+ substrate_embedding = list(substrate_embedding)
+
+ # Verify embedding dimensions match expected dimensions
+ expected_dims = int(os.getenv("AZURE_OPENAI_EMB_DIMENSIONS", "1536"))
+ print(f"Expected embedding dimensions: {expected_dims}")
+ print(f"Actual cosmic embedding dimensions: {len(cosmic_embedding)}")
+ print(f"Actual substrate embedding dimensions: {len(substrate_embedding)}")
+
+ # Upload to index - convert topics list to comma-separated string
+ documents = [
+ {
+ "id": "cosmic-domain",
+ "domain": "Cosmic",
+ "sample_questions": "\n".join(cosmic_chars.get("sample_questions", [])),
+ "keywords": ", ".join(cosmic_chars.get("keywords", [])),
+ "topics": ", ".join(cosmic_chars.get("topics", [])), # Convert list to string
+ "description": cosmic_chars.get("description", ""),
+ "embedding": cosmic_embedding
+ },
+ {
+ "id": "substrate-domain",
+ "domain": "Substrate",
+ "sample_questions": "\n".join(substrate_chars.get("sample_questions", [])),
+ "keywords": ", ".join(substrate_chars.get("keywords", [])),
+ "topics": ", ".join(substrate_chars.get("topics", [])), # Convert list to string
+ "description": substrate_chars.get("description", ""),
+ "embedding": substrate_embedding
+ }
+ ]
+
+ # Upload documents using synchronous client (matching prepdocs.py pattern)
+ try:
+ # Use synchronous upload_documents
+ result = search_client.upload_documents(documents=documents)
+ print(f"\nUploaded {len(documents)} domain classifier documents")
+ for r in result:
+ if hasattr(r, 'succeeded'):
+ print(f"Document {r.key}: {r.succeeded}")
+ else:
+ print(f"Document uploaded: {r}")
+ except Exception as e:
+ print(f"Error uploading documents: {e}")
+ import traceback
+ traceback.print_exc()
+ finally:
+ # Synchronous client doesn't need to be closed explicitly
+ pass
+
+ async def read_file(self, file_path: str) -> str:
+ """Read content from a file, handling different file types using the same approach as prepdocs.py"""
+ try:
+ file_extension = os.path.splitext(file_path)[1].lower()
+
+ if file_extension == '.docx':
+ # Use the same DocxHyperlinkParser as prepdocs.py
+ from prepdocslib.docxparser import DocxHyperlinkParser
+
+ try:
+ # Create parser instance
+ parser = DocxHyperlinkParser()
+
+ # Parse the document
+ with open(file_path, 'rb') as f:
+ pages = []
+ async for page in parser.parse(f):
+ pages.append(page.text if page.text else "")
+
+ # Return combined text from all pages
+ return '\n'.join(pages)
+
+ except Exception as e:
+ print(f"Error parsing DOCX file {file_path}: {e}")
+ # Fallback: try to read as text
+ try:
+ from prepdocslib.textparser import TextParser
+ parser = TextParser()
+ with open(file_path, 'rb') as f:
+ pages = []
+ async for page in parser.parse(f):
+ pages.append(page.text)
+ return '\n'.join(pages)
+ except:
+ return ""
+
+ elif file_extension == '.pdf':
+ # Handle PDF files using the same approach as prepdocs.py
+ try:
+ # Check if local PDF parser should be used
+ use_local_pdf_parser = os.getenv("USE_LOCAL_PDF_PARSER", "false").lower() != "false"
+
+ if use_local_pdf_parser or os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE") is None:
+ from prepdocslib.pdfparser import LocalPdfParser
+ parser = LocalPdfParser()
+ else:
+ # Use Document Intelligence parser if available
+ from prepdocslib.pdfparser import DocumentAnalysisParser
+ from azure.identity import DefaultAzureCredential
+
+ doc_int_service = os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE")
+ credential = DefaultAzureCredential()
+
+ parser = DocumentAnalysisParser(
+ endpoint=f"https://{doc_int_service}.cognitiveservices.azure.com/",
+ credential=credential
+ )
+
+ with open(file_path, 'rb') as f:
+ pages = []
+ async for page in parser.parse(f):
+ pages.append(page.text)
+ return '\n'.join(pages)
+
+ except Exception as e:
+ print(f"Error reading PDF file {file_path}: {e}")
+ return ""
+
+ elif file_extension == '.html':
+ # Handle HTML files
+ try:
+ from prepdocslib.htmlparser import LocalHTMLParser
+ parser = LocalHTMLParser()
+ with open(file_path, 'rb') as f:
+ pages = []
+ async for page in parser.parse(f):
+ pages.append(page.text)
+ return '\n'.join(pages)
+ except Exception as e:
+ print(f"Error reading HTML file {file_path}: {e}")
+ return ""
+
+ elif file_extension == '.csv':
+ # Handle CSV files
+ try:
+ from prepdocslib.csvparser import CsvParser
+ parser = CsvParser()
+ with open(file_path, 'rb') as f:
+ pages = []
+ async for page in parser.parse(f):
+ pages.append(page.text)
+ return '\n'.join(pages)
+ except Exception as e:
+ print(f"Error reading CSV file {file_path}: {e}")
+ return ""
+
+ elif file_extension == '.json':
+ # Handle JSON files
+ try:
+ from prepdocslib.jsonparser import JsonParser
+ parser = JsonParser()
+ with open(file_path, 'rb') as f:
+ pages = []
+ async for page in parser.parse(f):
+ pages.append(page.text)
+ return '\n'.join(pages)
+ except Exception as e:
+ print(f"Error reading JSON file {file_path}: {e}")
+ return ""
+
+ elif file_extension in ['.txt', '.md']:
+ # Handle text and markdown files
+ try:
+ from prepdocslib.textparser import TextParser
+ parser = TextParser()
+ with open(file_path, 'rb') as f:
+ pages = []
+ async for page in parser.parse(f):
+ pages.append(page.text)
+ return '\n'.join(pages)
+ except Exception as e:
+ print(f"Error reading text file {file_path}: {e}")
+ return ""
+
+ else:
+ # Default to text file reading with encoding handling
+ try:
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+ return f.read()
+ except Exception as e:
+ print(f"Error reading file {file_path}: {e}")
+ return ""
+
+ except Exception as e:
+ print(f"Error reading file {file_path}: {e}")
+ return ""
+
+ async def extract_topics(self, content: str) -> List[str]:
+ """Extract topics from content using LLM"""
+ # Skip if content is empty
+ if not content or len(content.strip()) < 50:
+ return []
+
+ # Increased content length for models with larger context windows
+ max_content_length = 90000
+ if len(content) > max_content_length:
+ content = content[:max_content_length] + "..."
+
+ try:
+ # Get the chat model deployment
+ model_deployment = os.environ.get("AZURE_OPENAI_GPT4_DEPLOYMENT",
+ os.environ.get("AZURE_OPENAI_CHATGPT_DEPLOYMENT"))
+
+ if not model_deployment:
+ print("No chat model deployment found, using fallback extraction")
+ return self._fallback_topic_extraction(content)
+
+ # Get LLM client
+ client = self._get_llm_client()
+ if not client:
+ print("Failed to create LLM client, using fallback extraction")
+ return self._fallback_topic_extraction(content)
+
+ prompt = f"""Analyze the following technical document content and extract the main topics, technologies, and themes.
+Focus on specific technical concepts, tools, platforms, and processes mentioned.
+Return 10-15 specific topics as a JSON array of strings.
+Be specific - instead of generic terms like "monitoring", use specific terms like "Azure Monitor" or "performance telemetry".
+
+Document content:
+{content}
+
+Example output format: ["Azure Kubernetes Service", "container orchestration", "microservices architecture", "CI/CD pipelines", "load balancing"]
+"""
+
+ # Add retry logic for connection issues
+ max_retries = 3
+ for attempt in range(max_retries):
+ try:
+ response = await client.chat.completions.create(
+ model=model_deployment,
+ messages=[
+ {"role": "system", "content": "You are a technical documentation analyst. Extract specific, concrete topics from technical documents."},
+ {"role": "user", "content": prompt}
+ ],
+ temperature=0.3,
+ max_tokens=500,
+ timeout=30 # Add timeout
+ )
+ break
+ except Exception as e:
+ error_message = str(e)
+ if attempt < max_retries - 1:
+ print(f"Retry {attempt + 1}/{max_retries} after error: {error_message}")
+ await asyncio.sleep(2 ** attempt) # Exponential backoff
+ else:
+ raise
+
+ # Parse the response
+ topics_text = response.choices[0].message.content.strip()
+ try:
+ import json
+ topics = json.loads(topics_text)
+ if isinstance(topics, list):
+ return [t for t in topics if t and isinstance(t, str)][:15]
+ except:
+ # Fallback: extract from response text
+ import re
+ topics = re.findall(r'"([^"]+)"', topics_text)
+ if not topics:
+ topics = [t.strip() for t in topics_text.split('\n') if t.strip() and not t.strip().startswith('[')]
+ return [t for t in topics if t][:15]
+
+ except Exception as e:
+ print(f"Error extracting topics with LLM: {e}")
+ return self._fallback_topic_extraction(content)
+
+ def _fallback_topic_extraction(self, content: str) -> List[str]:
+ """Fallback topic extraction with domain-specific keywords"""
+ topics = []
+
+ # Domain-specific keyword patterns
+ cosmic_keywords = [
+ "cosmic", "watson", "performance monitoring", "diagnostics", "telemetry",
+ "container platform", "microservices", "kubernetes", "docker", "orchestration"
+ ]
+ substrate_keywords = [
+ "substrate", "infrastructure", "cloud platform", "azure", "deployment",
+ "virtual machines", "networking", "storage", "security", "compliance"
+ ]
+
+ content_lower = content.lower()
+
+ # Check for domain-specific keywords
+ for keyword in cosmic_keywords + substrate_keywords:
+ if keyword in content_lower:
+ topics.append(keyword.title())
+
+ # Extract capitalized multi-word phrases (likely product/feature names)
+ import re
+ capitalized_phrases = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b', content)
+ topics.extend(list(set(capitalized_phrases))[:5])
+
+ return list(set(topics))[:10]
+
+ async def generate_sample_questions(self, contents: List[str]) -> List[str]:
+ """Generate sample questions using LLM based on actual document content"""
+ if not contents:
+ return ["What are the key features?", "How do I configure this?", "What are common issues?"]
+
+ # Combine and truncate contents
+ combined_content = "\n\n---\n\n".join(contents[:5])
+ max_content_length = 90000
+ if len(combined_content) > max_content_length:
+ combined_content = combined_content[:max_content_length] + "..."
+
+ try:
+ # Get the chat model deployment
+ model_deployment = os.environ.get("AZURE_OPENAI_GPT4_DEPLOYMENT",
+ os.environ.get("AZURE_OPENAI_CHATGPT_DEPLOYMENT"))
+
+ if not model_deployment:
+ return self._generate_fallback_questions(contents)
+
+ # Get LLM client
+ client = self._get_llm_client()
+
+ prompt = f"""Based on the following technical documentation content, generate 7-10 realistic questions that users would ask.
+The questions should be specific to the actual content, mentioning specific features, tools, or processes described in the documents.
+Include a mix of how-to questions, troubleshooting questions, and conceptual questions.
+Return only the questions as a JSON array of strings.
+
+Document content:
+{combined_content}
+
+Example output format: ["How do I configure Watson monitoring for my Cosmic deployment?", "What are the performance tuning options for Substrate?", "How can I troubleshoot connection issues in the platform?"]
+"""
+
+ response = await client.chat.completions.create(
+ model=model_deployment,
+ messages=[
+ {"role": "system", "content": "You are a technical support expert. Generate realistic questions users would ask about the documented technology."},
+ {"role": "user", "content": prompt}
+ ],
+ temperature=0.7,
+ max_tokens=600,
+ timeout=30
+ )
+
+ # Parse the response
+ questions_text = response.choices[0].message.content.strip()
+ try:
+ import json
+ questions = json.loads(questions_text)
+ if isinstance(questions, list):
+ return [q for q in questions if q and isinstance(q, str) and '?' in q][:10]
+ except:
+ # Fallback: extract questions from response
+ import re
+ questions = re.findall(r'"([^"]+\?)"', questions_text)
+ if not questions:
+ questions = [line.strip() for line in questions_text.split('\n') if '?' in line]
+ return [q.strip('"\'') for q in questions if q][:10]
+
+ except Exception as e:
+ print(f"Error generating questions with LLM: {e}")
+ return self._generate_fallback_questions(contents)
+
+ def _generate_fallback_questions(self, contents: List[str]) -> List[str]:
+ """Generate context-aware fallback questions"""
+ # Extract key terms from content
+ combined = " ".join(contents[:2])[:1000].lower() if contents else ""
+
+ questions = []
+
+ # Check for specific terms and generate relevant questions
+ if "cosmic" in combined:
+ questions.extend([
+ "How do I set up Cosmic monitoring for my application?",
+ "What are the performance metrics available in Cosmic?",
+ "How can I troubleshoot Cosmic deployment issues?"
+ ])
+
+ if "substrate" in combined:
+ questions.extend([
+ "How do I configure Substrate infrastructure?",
+ "What are the security best practices for Substrate?",
+ "How can I scale my Substrate deployment?"
+ ])
+
+ # Add generic but relevant questions
+ if "error" in combined or "troubleshoot" in combined:
+ questions.append("How do I diagnose and fix common errors?")
+
+ if "performance" in combined:
+ questions.append("What are the performance optimization techniques?")
+
+ if "config" in combined or "setup" in combined:
+ questions.append("What are the initial configuration steps?")
+
+ # Ensure we always return some questions
+ if not questions:
+ questions = [
+ "What are the key features?",
+ "How do I get started?",
+ "What are the best practices?"
+ ]
+
+ return questions[:7]
+
+ async def extract_keywords(self, contents: List[str]) -> List[str]:
+ """Extract keywords from contents using LLM"""
+ if not contents:
+ return ["technical", "documentation", "system"]
+
+ # Combine and truncate contents
+ combined_content = "\n\n".join(contents[:8])
+ max_content_length = 90000
+ if len(combined_content) > max_content_length:
+ combined_content = combined_content[:max_content_length] + "..."
+
+ try:
+ # Get the chat model deployment
+ model_deployment = os.environ.get("AZURE_OPENAI_GPT4_DEPLOYMENT",
+ os.environ.get("AZURE_OPENAI_CHATGPT_DEPLOYMENT"))
+
+ if not model_deployment:
+ return self._enhanced_keyword_extraction(contents)
+
+ # Get LLM client
+ client = self._get_llm_client()
+
+ prompt = f"""Extract 20-30 important technical keywords and key phrases from the following documentation.
+Focus on:
+- Product names and features
+- Technical terms and technologies
+- Platform-specific concepts
+- Tools and services mentioned
+- Important processes or methodologies
+
+Avoid generic terms. Be specific to the actual content.
+Return only the keywords as a JSON array of strings.
+
+Document content:
+{combined_content}
+
+Example output format: ["Azure Kubernetes Service", "Watson Analytics", "container orchestration", "performance telemetry", "microservice deployment"]
+"""
+
+ response = await client.chat.completions.create(
+ model=model_deployment,
+ messages=[
+ {"role": "system", "content": "You are a technical keyword extraction expert. Extract specific, meaningful keywords from technical documentation."},
+ {"role": "user", "content": prompt}
+ ],
+ temperature=0.3,
+ max_tokens=500,
+ timeout=30
+ )
+
+ # Parse the response
+ keywords_text = response.choices[0].message.content.strip()
+ try:
+     import json
+     keywords = json.loads(keywords_text)
+     if isinstance(keywords, list):
+         keywords = list(set([k for k in keywords if k and isinstance(k, str)]))
+         return keywords[:30]
+ except json.JSONDecodeError:
+     pass
+ # Fallback: extract quoted or comma-separated terms from the raw response text
+ import re
+ keywords = re.findall(r'"([^"]+)"', keywords_text)
+ if not keywords:
+     keywords = [k.strip() for k in keywords_text.split(',') if k.strip()]
+ return list(set([k.strip('"\'') for k in keywords if k]))[:30]
+
+ except Exception as e:
+ print(f"Error extracting keywords with LLM: {e}")
+ return self._enhanced_keyword_extraction(contents)
+
+ def _enhanced_keyword_extraction(self, contents: List[str]) -> List[str]:
+ """Enhanced fallback keyword extraction"""
+ import re
+ from collections import Counter
+
+ keywords = set()
+
+ for content in contents[:3]:
+ # Extract capitalized words and phrases
+ capitalized = re.findall(r'\b[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\b', content)
+ keywords.update(capitalized)
+
+ # Extract words in quotes
+ quoted = re.findall(r'"([^"]+)"', content)
+ keywords.update([q for q in quoted if len(q.split()) <= 3])
+
+ # Extract technical terms (words with numbers, hyphens, or specific patterns)
+ technical = re.findall(r'\b\w+[-]\w+\b|\b\w+\d+\w*\b', content)
+ keywords.update(technical)
+
+ # Filter and clean keywords
+ cleaned_keywords = []
+ for kw in keywords:
+ kw = kw.strip()
+ # Keep only meaningful keywords
+ if len(kw) > 3 and len(kw.split()) <= 3 and kw.lower() not in self._get_stopwords():
+ cleaned_keywords.append(kw)
+
+ # Sort by length and alphabetically to get most specific terms first
+ cleaned_keywords.sort(key=lambda x: (-len(x.split()), x))
+
+ return cleaned_keywords[:20]
+
+ def _get_stopwords(self) -> set:
+ """Get common stopwords to filter out"""
+ return {
+ "the", "is", "at", "which", "on", "and", "a", "an", "as", "are",
+ "was", "were", "been", "have", "has", "had", "do", "does", "did",
+ "will", "would", "should", "could", "may", "might", "must", "can",
+ "to", "of", "in", "for", "with", "from", "this", "that", "these",
+ "those", "then", "than", "when", "where", "what", "how", "why"
+ }
+
+ async def __aenter__(self):
+ """Async context manager entry"""
+ return self
+
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
+ """Async context manager exit - cleanup resources"""
+ print("DomainClassifierSetup cleanup started")
+
+ # Close the LLM client first
+ if hasattr(self, '_llm_client') and self._llm_client:
+ try:
+ print("Closing LLM client...")
+ await self._llm_client.close()
+ print("LLM client closed successfully")
+ except Exception as e:
+ print(f"Error closing LLM client: {e}")
+ finally:
+ self._llm_client = None
+
+ # Close the credential if it exists
+ if hasattr(self, '_credential') and self._credential:
+ try:
+ print("Closing Azure credential...")
+ await self._credential.close()
+ print("Azure credential closed successfully")
+ except Exception as e:
+ print(f"Error closing credential: {e}")
+ finally:
+ self._credential = None
+
+ print("DomainClassifierSetup cleanup completed")
\ No newline at end of file
diff --git a/app/backend/prepdocslib/hyperlinktextsplitter.py b/app/backend/prepdocslib/hyperlinktextsplitter.py
new file mode 100644
index 0000000000..bed61e3e48
--- /dev/null
+++ b/app/backend/prepdocslib/hyperlinktextsplitter.py
@@ -0,0 +1,165 @@
+import logging
+import re
+from collections.abc import Generator
+from typing import List
+
+import tiktoken
+
+from .page import Page, SplitPage
+from .textsplitter import SentenceTextSplitter
+
+logger = logging.getLogger("scripts")
+
+
+class HyperlinkAwareTextSplitter(SentenceTextSplitter):
+ """
+ Text splitter that preserves hyperlinks during chunking
+ """
+
+ def __init__(self, max_tokens_per_section: int = 500, add_target_blank: bool = True):
+ super().__init__(max_tokens_per_section)
+ # More comprehensive pattern that handles various link formats
+ self.hyperlink_pattern = re.compile(
+ r'<a\s+(?:[^>]*?\s+)?href\s*=\s*["\']([^"\']*)["\'][^>]*>(.*?)</a>',
+ re.IGNORECASE | re.DOTALL
+ )
+ self.add_target_blank = add_target_blank
+
+ def split_page_by_max_tokens(self, page_num: int, text: str) -> Generator[SplitPage, None, None]:
+ """
+ Split text while preserving hyperlinks
+ """
+ # Optionally transform hyperlinks to add target="_blank"
+ if self.add_target_blank:
+ text = self._add_target_blank_to_links(text)
+
+ # Extract hyperlinks and their positions
+ hyperlinks = list(self.hyperlink_pattern.finditer(text))
+
+ # If no hyperlinks, use the parent method
+ if not hyperlinks:
+ yield from super().split_page_by_max_tokens(page_num, text)
+ return
+
+ tokens = tiktoken.encoding_for_model("text-embedding-ada-002").encode(text)
+ if len(tokens) <= self.max_tokens_per_section:
+ yield SplitPage(page_num=page_num, text=text)
+ return
+
+ # Split text while keeping hyperlinks intact
+ yield from self._split_preserving_hyperlinks(page_num, text, hyperlinks)
+
+ def _add_target_blank_to_links(self, text: str) -> str:
+ """
+ Add target="_blank" and rel="noopener noreferrer" to all hyperlinks
+ """
+ def replace_link(match):
+ full_match = match.group(0)
+
+ # Check if target="_blank" already exists
+ if 'target="_blank"' in full_match or "target='_blank'" in full_match:
+ return full_match
+
+ # Extract the opening tag and closing tag
+ href_match = re.search(r'href="([^"]*)"', full_match)
+ if not href_match:
+ return full_match
+
+ # Insert target="_blank" and rel="noopener noreferrer" after href
+ new_link = full_match.replace(
+ href_match.group(0),
+ f'{href_match.group(0)} target="_blank" rel="noopener noreferrer"'
+ )
+
+ return new_link
+
+ # Use the same pattern as the class-level pattern
+ return self.hyperlink_pattern.sub(replace_link, text)
+
+ def _split_preserving_hyperlinks(self, page_num: int, text: str, hyperlinks: List[re.Match]) -> Generator[SplitPage, None, None]:
+ """
+ Split text ensuring hyperlinks are not broken across chunks
+ """
+ text_length = len(text)
+ start = 0
+
+ while start < text_length:
+ # Calculate the ideal end position
+ ideal_end = min(start + self.max_section_length, text_length)
+
+ # Find the best split position that doesn't break hyperlinks
+ split_end = self._find_safe_split_position(text, start, ideal_end, hyperlinks)
+
+ if split_end <= start:
+ # Fallback: take at least one hyperlink or sentence
+ split_end = self._find_minimum_split(text, start, hyperlinks)
+
+ section_text = text[start:split_end].strip()
+
+ if section_text:
+ yield SplitPage(page_num=page_num, text=section_text)
+
+ # Move to next section with overlap
+ start = max(start + 1, split_end - self.section_overlap)
+
+ def _find_safe_split_position(self, text: str, start: int, ideal_end: int, hyperlinks: List[re.Match]) -> int:
+ """
+ Find a position to split that doesn't break hyperlinks
+ """
+ # Check if the ideal end position is within a hyperlink
+ for hyperlink in hyperlinks:
+ link_start, link_end = hyperlink.span()
+
+ # If the split would occur within a hyperlink, adjust
+ if link_start < ideal_end < link_end:
+ # Try to end before the hyperlink
+ if link_start > start + self.max_section_length // 2:
+ return self._find_sentence_boundary_before(text, link_start, start)
+ # Otherwise, include the entire hyperlink
+ else:
+ return self._find_sentence_boundary_after(text, link_end, len(text))
+
+ # No hyperlink conflict, find sentence boundary
+ return self._find_sentence_boundary_before(text, ideal_end, start)
+
+ def _find_sentence_boundary_before(self, text: str, position: int, min_position: int) -> int:
+ """
+ Find the nearest sentence ending before the given position
+ """
+ for i in range(position - 1, min_position - 1, -1):
+ if text[i] in self.sentence_endings:
+ return i + 1
+
+ # Fallback to word boundary
+ for i in range(position - 1, min_position - 1, -1):
+ if text[i] in self.word_breaks:
+ return i + 1
+
+ return position
+
+ def _find_sentence_boundary_after(self, text: str, position: int, max_position: int) -> int:
+ """
+ Find the nearest sentence ending after the given position
+ """
+ for i in range(position, min(max_position, position + self.sentence_search_limit)):
+ if text[i] in self.sentence_endings:
+ return i + 1
+
+ return min(position + self.max_section_length, max_position)
+
+ def _find_minimum_split(self, text: str, start: int, hyperlinks: List[re.Match]) -> int:
+ """
+ Find minimum viable split position (at least one complete element)
+ """
+ # Find the first complete hyperlink after start
+ for hyperlink in hyperlinks:
+ link_start, link_end = hyperlink.span()
+ if link_start >= start:
+ return link_end
+
+ # Fallback to sentence or word boundary
+ for i in range(start + 1, len(text)):
+ if text[i] in self.sentence_endings:
+ return i + 1
+
+ return min(start + self.max_section_length, len(text))
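+
+
+# Illustrative usage sketch (hypothetical; `html_text` stands in for a page of HTML-bearing text produced during ingestion):
+#
+#   splitter = HyperlinkAwareTextSplitter(max_tokens_per_section=500)
+#   for split_page in splitter.split_page_by_max_tokens(page_num=0, text=html_text):
+#       print(len(split_page.text), split_page.text[:80])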
\ No newline at end of file
diff --git a/app/backend/prepdocslib/listfilestrategy.py b/app/backend/prepdocslib/listfilestrategy.py
index bdceef0754..818f387d1e 100644
--- a/app/backend/prepdocslib/listfilestrategy.py
+++ b/app/backend/prepdocslib/listfilestrategy.py
@@ -100,7 +100,7 @@ def check_md5(self, path: str) -> bool:
if os.path.exists(hash_path):
with open(hash_path, encoding="utf-8") as md5_f:
stored_hash = md5_f.read()
-
+ stored_hash = None  # Ignore the stored hash so every file is re-processed, even when unchanged
if stored_hash and stored_hash.strip() == existing_hash.strip():
logger.info("Skipping %s, no changes detected.", path)
return True
diff --git a/app/backend/requirements.txt b/app/backend/requirements.txt
index dc9a9b32bb..90d1ffc884 100644
--- a/app/backend/requirements.txt
+++ b/app/backend/requirements.txt
@@ -434,3 +434,4 @@ yarl==1.17.2
# via aiohttp
zipp==3.21.0
# via importlib-metadata
+lxml>=4.9.3
\ No newline at end of file
diff --git a/app/check.sh b/app/check.sh
new file mode 100644
index 0000000000..7597b60ae4
--- /dev/null
+++ b/app/check.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+echo "=== Checking for Missing Imports ==="
+
+# Check if prepdocs module is correctly imported
+echo -e "\n--- Checking prepdocs imports ---"
+grep -n "from prepdocs import" /workspaces/azure-search-openai-demo/app/backend/app.py
+
+# Check if the domain classifier and orchestrator files exist
+echo -e "\n--- Checking for approach files ---"
+ls -la /workspaces/azure-search-openai-demo/app/backend/approaches/domain_classifier.py 2>/dev/null || echo "❌ domain_classifier.py not found"
+ls -la /workspaces/azure-search-openai-demo/app/backend/approaches/orchestrator_approach.py 2>/dev/null || echo "❌ orchestrator_approach.py not found"
+
+# Check Python path
+echo -e "\n--- Python Path ---"
+cd /workspaces/azure-search-openai-demo/app/backend
+python3 -c "import sys; print('\n'.join(sys.path))"
+
+echo -e "\n=== Check Complete ==="
\ No newline at end of file
diff --git a/app/diagno.sh b/app/diagno.sh
new file mode 100644
index 0000000000..b688037282
--- /dev/null
+++ b/app/diagno.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Checks the provisioning status of the Azure services used by this deployment
+
+echo "=== Checking Azure Services Status ==="
+
+# Get environment values
+RG=$(azd env get-value AZURE_RESOURCE_GROUP)
+echo "Resource Group: $RG"
+
+# Check web apps
+echo -e "\n=== Web Apps Status ==="
+az webapp list -g $RG --query "[].{name:name, state:state, url:defaultHostName}" -o table
+
+# Check search service
+echo -e "\n=== Search Service ==="
+SEARCH=$(azd env get-value AZURE_SEARCH_SERVICE)
+az search service show -g $RG -n $SEARCH --query "{name:name, status:status, provisioningState:provisioningState}" -o table
+
+# Check OpenAI
+echo -e "\n=== OpenAI Service ==="
+OPENAI=$(azd env get-value AZURE_OPENAI_SERVICE)
+az cognitiveservices account show -g $RG -n $OPENAI --query "{name:name, provisioningState:properties.provisioningState, endpoint:properties.endpoint}" -o table
+
+# Check storage
+echo -e "\n=== Storage Account ==="
+STORAGE=$(azd env get-value AZURE_STORAGE_ACCOUNT)
+az storage account show -g $RG -n $STORAGE --query "{name:name, provisioningState:provisioningState, status:statusOfPrimary}" -o table
+
+echo -e "\n=== Checking Complete ==="
\ No newline at end of file
diff --git a/app/frontend/index.html b/app/frontend/index.html
index 30205db90f..614e8543d6 100644
--- a/app/frontend/index.html
+++ b/app/frontend/index.html
@@ -4,7 +4,7 @@
- <title>Azure OpenAI + AI Search</title>
+ <title>Cosmic TSG Search</title>
diff --git a/app/frontend/package-lock.json b/app/frontend/package-lock.json
index b4ec8fb7a0..6da48b3591 100644
--- a/app/frontend/package-lock.json
+++ b/app/frontend/package-lock.json
@@ -44,7 +44,7 @@
"vite": "^5.4.18"
},
"engines": {
- "node": ">=14.0.0"
+ "node": ">=20.0.0"
}
},
"node_modules/@ampproject/remapping": {
diff --git a/app/frontend/src/components/Answer/Answer.tsx b/app/frontend/src/components/Answer/Answer.tsx
index c9d73a8a76..27617f4ce0 100644
--- a/app/frontend/src/components/Answer/Answer.tsx
+++ b/app/frontend/src/components/Answer/Answer.tsx
@@ -100,7 +100,23 @@ export const Answer = ({
-
+
+ components={{
+     a: ({ children, ...props }) => {
+         // Preserve existing target="_blank" or add it if missing
+         const target = props.target || "_blank";
+         const rel = props.rel || "noopener noreferrer";
+         return (
+             <a {...props} target={target} rel={rel}>
+                 {children}
+             </a>
+         );
+     }
+ }}
+ />
diff --git a/app/frontend/src/components/Answer/AnswerParser.tsx b/app/frontend/src/components/Answer/AnswerParser.tsx
index 3807592f6d..21e1394710 100644
--- a/app/frontend/src/components/Answer/AnswerParser.tsx
+++ b/app/frontend/src/components/Answer/AnswerParser.tsx
@@ -52,6 +52,24 @@ export function parseAnswerToHtml(answer: ChatAppResponse, isStreaming: boolean,
parsedAnswer = truncatedAnswer;
}
+ // Process all anchor tags to ensure target="_blank"
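+ // e.g. '<a href="https://contoso.com/tsg">TSG</a>' becomes
+ // '<a href="https://contoso.com/tsg" target="_blank" rel="noopener noreferrer">TSG</a>'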
+ parsedAnswer = parsedAnswer.replace(/<a\s+([^>]*?)href=["']([^"']+)["']([^>]*?)>/gi, (match, before, href, after) => {
+ // Check if target="_blank" already exists
+ if (match.includes('target="_blank"') || match.includes("target='_blank'")) {
+ return match;
+ }
+ // Add target="_blank" and rel="noopener noreferrer"
+ // Make sure to include any existing attributes
+ const existingRel = match.match(/rel=["']([^"']+)["']/);
+ const relValue = existingRel ? existingRel[1] + " noopener noreferrer" : "noopener noreferrer";
+
+ // Remove existing rel attribute if present
+ let newMatch = match.replace(/rel=["'][^"']*["']/g, "");
+
+ // Add target and rel attributes
+ return newMatch.replace(/(<a[^>]*?)(>)/gi, `$1 target="_blank" rel="${relValue}"$2`);
+ });
+
const parts = parsedAnswer.split(/\[([^\]]+)\]/g);
const fragments: string[] = parts.map((part, index) => {
diff --git a/app/frontend/src/components/Settings/Settings.tsx b/app/frontend/src/components/Settings/Settings.tsx
index a1c4f46632..84a16df791 100644
--- a/app/frontend/src/components/Settings/Settings.tsx
+++ b/app/frontend/src/components/Settings/Settings.tsx
@@ -269,8 +269,9 @@ export const Settings = ({
onChange={(_ev?: React.FormEvent, option?: IDropdownOption) => onChange("includeCategory", option?.key || "")}
aria-labelledby={includeCategoryId}
options={[
- { key: "", text: t("labels.includeCategoryOptions.all") }
- // { key: "example", text: "Example Category" } // Add more categories as needed
+ { key: "", text: t("labels.includeCategoryOptions.all") },
+ { key: "Cosmic", text: "Cosmic" },
+ { key: "Substrate", text: "Substrate" }
]}
onRenderLabel={props => renderLabel(props, includeCategoryId, includeCategoryFieldId, t("helpTexts.includeCategory"))}
/>
diff --git a/app/frontend/src/locales/en/translation.json b/app/frontend/src/locales/en/translation.json
index 8445407290..898e65e29b 100644
--- a/app/frontend/src/locales/en/translation.json
+++ b/app/frontend/src/locales/en/translation.json
@@ -1,6 +1,6 @@
{
- "pageTitle": "Azure OpenAI + AI Search",
- "headerTitle": "Azure OpenAI + AI Search",
+ "pageTitle": "Cosmic Perf TSG",
+ "headerTitle": "Cosmic Perf TSG",
"chat": "Chat",
"qa": "Ask a question",
"login": "Login",
@@ -37,17 +37,17 @@
"chatEmptyStateTitle": "Chat with your data",
"chatEmptyStateSubtitle": "Ask anything or try an example",
"defaultExamples": {
- "1": "What is included in my Northwind Health Plus plan that is not in standard?",
- "2": "What happens in a performance review?",
- "3": "What does a Product Manager do?",
- "placeholder": "Type a new question (e.g. does my plan cover annual eye exams?)"
+ "1": "How to setup Watson on Cosmic?",
+ "2": "How to create an incident for performance issues on Cosmic?",
+ "3": "How to perform CPR profiling tools on Cosmic?",
+ "placeholder": "Type a new question (e.g. How to troubleshoot high CPU, memory, latency and timeout issues on Cosmic?)"
},
"askTitle": "Ask your data",
"gpt4vExamples": {
"1": "Compare the impact of interest rates and GDP in financial markets.",
"2": "What is the expected trend for the S&P 500 index over the next five years? Compare it to the past S&P 500 performance",
"3": "Can you identify any correlation between oil prices and stock market trends?",
- "placeholder": "Example: Does my plan cover annual eye exams?"
+ "placeholder": "Example: How to troubleshoot high CPU, memory, latency and timeout issues on Cosmic?"
},
"generatingAnswer": "Generating answer",
"citationWithColon": "Citation:",
diff --git a/app/frontend/src/locales/nl/translation.json b/app/frontend/src/locales/nl/translation.json
index 3067f856fa..30f7140e92 100644
--- a/app/frontend/src/locales/nl/translation.json
+++ b/app/frontend/src/locales/nl/translation.json
@@ -1,6 +1,6 @@
{
- "pageTitle": "Azure OpenAI + AI Search",
- "headerTitle": "Azure OpenAI + AI Search",
+ "pageTitle": "Cosmic Perf TSG",
+ "headerTitle": "Cosmic Perf TSG",
"chat": "Chat",
"qa": "Stel een vraag",
"login": "Inloggen",
diff --git a/azure-search-openai-demo/app/backend/tests/__init__.py b/azure-search-openai-demo/app/backend/tests/__init__.py
new file mode 100644
index 0000000000..74be5e3fd7
--- /dev/null
+++ b/azure-search-openai-demo/app/backend/tests/__init__.py
@@ -0,0 +1,18 @@
+from unittest import TestCase
+from unittest.mock import patch, MagicMock
+from app.backend.prepdocs import setup_domain_classifier
+
+class TestSetupDomainClassifier(TestCase):
+ @patch('app.backend.prepdocs.AzureSearchClient')
+ @patch('app.backend.prepdocs.AzureDeveloperCliCredential')
+ def test_setup_domain_classifier(self, mock_credential, mock_search_client):
+ mock_token = MagicMock()
+ mock_token.token = 'fake_token'
+ mock_credential.return_value.get_token.return_value = mock_token
+
+ mock_search_client.return_value.create_or_update_index.return_value = None
+
+ result = setup_domain_classifier(mock_credential, mock_search_client)
+
+ self.assertIsNone(result)
+ mock_search_client.return_value.create_or_update_index.assert_called_once()
\ No newline at end of file
diff --git a/azure-search-openai-demo/app/backend/tests/test_prepdocs.py b/azure-search-openai-demo/app/backend/tests/test_prepdocs.py
new file mode 100644
index 0000000000..694e8eee68
--- /dev/null
+++ b/azure-search-openai-demo/app/backend/tests/test_prepdocs.py
@@ -0,0 +1,12 @@
+import pytest
+from unittest.mock import AsyncMock, patch
+from app.backend.prepdocs import setup_domain_classifier
+
+@pytest.mark.asyncio
+async def test_setup_domain_classifier():
+ mock_openai_embeddings_service = AsyncMock()
+ mock_search_index_client = AsyncMock()
+
+ with patch('app.backend.prepdocslib.domain_classifier_setup.create_domain_classifier_index', return_value=None) as mock_create_index:
+ await setup_domain_classifier(mock_openai_embeddings_service, mock_search_index_client)
+ mock_create_index.assert_called_once()
\ No newline at end of file
diff --git a/azure-search-openai-demo/pytest.ini b/azure-search-openai-demo/pytest.ini
new file mode 100644
index 0000000000..5e35e7b6e1
--- /dev/null
+++ b/azure-search-openai-demo/pytest.ini
@@ -0,0 +1,5 @@
+[pytest]
+addopts = --maxfail=1 --disable-warnings -q
+testpaths = app/backend/tests
+python_files = test_*.py
+python_functions = test_*
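+
+# Run with `python -m pytest` from the directory containing this file
+# (assumes the `app` package under that directory is importable)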
\ No newline at end of file
diff --git a/data/Benefit_Options.pdf b/data/Benefit_Options.pdf
deleted file mode 100644
index 6a4c07dc94..0000000000
Binary files a/data/Benefit_Options.pdf and /dev/null differ
diff --git a/data/Contoso_Electronics_Company_Overview.md b/data/Contoso_Electronics_Company_Overview.md
deleted file mode 100644
index 033d7dd84a..0000000000
--- a/data/Contoso_Electronics_Company_Overview.md
+++ /dev/null
@@ -1,48 +0,0 @@
-# Contoso Electronics
-
-*Disclaimer: This content is generated by AI and may not accurately represent factual information about any real entity. Use this information with caution and verify details from reliable sources.*
-
-## History
-
-Contoso Electronics, a pioneering force in the tech industry, was founded in 1985 by visionary entrepreneurs with a passion for innovation. Over the years, the company has played a pivotal role in shaping the landscape of consumer electronics.
-
-| Year | Milestone |
-|------|-----------|
-| 1985 | Company founded with a focus on cutting-edge technology |
-| 1990 | Launched the first-ever handheld personal computer |
-| 2000 | Introduced groundbreaking advancements in AI and robotics |
-| 2015 | Expansion into sustainable and eco-friendly product lines |
-
-## Company Overview
-
-At Contoso Electronics, we take pride in fostering a dynamic and inclusive workplace. Our dedicated team of experts collaborates to create innovative solutions that empower and connect people globally.
-
-### Core Values
-
-- **Innovation:** Constantly pushing the boundaries of technology.
-- **Diversity:** Embracing different perspectives for creative excellence.
-- **Sustainability:** Committed to eco-friendly practices in our products.
-
-## Vacation Perks
-
-We believe in work-life balance and understand the importance of well-deserved breaks. Our vacation perks are designed to help our employees recharge and return with renewed enthusiasm.
-
-| Vacation Tier | Duration | Additional Benefits |
-|---------------|----------|---------------------|
-| Standard | 2 weeks | Health and wellness stipend |
-| Senior | 4 weeks | Travel vouchers for a dream destination |
-| Executive | 6 weeks | Luxury resort getaway with family |
-
-## Employee Recognition
-
-Recognizing the hard work and dedication of our employees is at the core of our culture. Here are some ways we celebrate achievements:
-
-- Monthly "Innovator of the Month" awards
-- Annual gala with awards for outstanding contributions
-- Team-building retreats for high-performing departments
-
-## Join Us!
-
-Contoso Electronics is always on the lookout for talented individuals who share our passion for innovation. If you're ready to be part of a dynamic team shaping the future of technology, check out our [careers page](http://www.contoso.com) for exciting opportunities.
-
-[Learn more about Contoso Electronics!](http://www.contoso.com)
diff --git a/data/Northwind_Health_Plus_Benefits_Details.pdf b/data/Northwind_Health_Plus_Benefits_Details.pdf
deleted file mode 100644
index 97579a4fb5..0000000000
Binary files a/data/Northwind_Health_Plus_Benefits_Details.pdf and /dev/null differ
diff --git a/data/Northwind_Standard_Benefits_Details.pdf b/data/Northwind_Standard_Benefits_Details.pdf
deleted file mode 100644
index 7d50ff8c02..0000000000
Binary files a/data/Northwind_Standard_Benefits_Details.pdf and /dev/null differ
diff --git a/data/PerksPlus.pdf b/data/PerksPlus.pdf
deleted file mode 100644
index 2e167a2a6a..0000000000
Binary files a/data/PerksPlus.pdf and /dev/null differ
diff --git a/data/cosmic/FAQs for Cosmic Performance.docx b/data/cosmic/FAQs for Cosmic Performance.docx
new file mode 100644
index 0000000000..237213f7fe
Binary files /dev/null and b/data/cosmic/FAQs for Cosmic Performance.docx differ
diff --git a/data/cosmic/Investigate High CPU Utilization in Container_0.docx b/data/cosmic/Investigate High CPU Utilization in Container_0.docx
new file mode 100644
index 0000000000..bb9b255322
Binary files /dev/null and b/data/cosmic/Investigate High CPU Utilization in Container_0.docx differ
diff --git a/data/cosmic/Investigate High Memory Utilization in Container_1.docx b/data/cosmic/Investigate High Memory Utilization in Container_1.docx
new file mode 100644
index 0000000000..e29ac16830
Binary files /dev/null and b/data/cosmic/Investigate High Memory Utilization in Container_1.docx differ
diff --git a/data/employee_handbook.pdf b/data/employee_handbook.pdf
deleted file mode 100644
index 878f36f7dd..0000000000
Binary files a/data/employee_handbook.pdf and /dev/null differ
diff --git a/data/role_library.pdf b/data/role_library.pdf
deleted file mode 100644
index ff70c65651..0000000000
Binary files a/data/role_library.pdf and /dev/null differ
diff --git a/data/substrate/FAQs for Exchange Performance.docx b/data/substrate/FAQs for Exchange Performance.docx
new file mode 100644
index 0000000000..7f8160e299
Binary files /dev/null and b/data/substrate/FAQs for Exchange Performance.docx differ
diff --git a/delete_index.py b/delete_index.py
new file mode 100644
index 0000000000..9aa593c732
--- /dev/null
+++ b/delete_index.py
@@ -0,0 +1,25 @@
+import asyncio
+import os
+from azure.identity import AzureDeveloperCliCredential
+from azure.search.documents.indexes.aio import SearchIndexClient
+
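+# Usage (assumes `azd auth login` has been run so AzureDeveloperCliCredential can obtain a token):
+#   AZURE_SEARCH_SERVICE=<service-name> python delete_index.py
+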
+async def delete_search_index():
+ # Load environment variables
+ search_service = os.environ["AZURE_SEARCH_SERVICE"]
+ index_name = "gptkbindex" # or os.environ["AZURE_SEARCH_INDEX"]
+
+ # Setup credentials
+ azd_credential = AzureDeveloperCliCredential()
+
+ # Create search index client
+ endpoint = f"https://{search_service}.search.windows.net/"
+
+ async with SearchIndexClient(endpoint=endpoint, credential=azd_credential) as client:
+ try:
+ await client.delete_index(index_name)
+ print(f"Successfully deleted search index: {index_name}")
+ except Exception as e:
+ print(f"Error deleting index: {e}")
+
+if __name__ == "__main__":
+ asyncio.run(delete_search_index())
\ No newline at end of file
diff --git a/locustfile.py b/locustfile.py
index b41b9bd372..2b527e9279 100644
--- a/locustfile.py
+++ b/locustfile.py
@@ -16,10 +16,10 @@ def ask_question(self):
time.sleep(self.wait_time())
first_question = random.choice(
[
- "What is included in my Northwind Health Plus plan that is not in standard?",
- "What does a Product Manager do?",
- "What happens in a performance review?",
- "Whats your whistleblower policy?",
+ "How to create an incident for performance issues on Cosmic?",
+ "How to connect to a Cosmic Windows node without using jumpbox?",
+ "How to troubleshoot high CPU, memory, latency and timeout issues on Cosmic?",
+ "How to review resource utilization on Cosmic Windows?",
]
)
diff --git a/scripts/prepdocs.sh b/scripts/prepdocs.sh
index c0254755e0..47088e48b2 100755
--- a/scripts/prepdocs.sh
+++ b/scripts/prepdocs.sh
@@ -1,12 +1,135 @@
- #!/bin/sh
+#!/bin/bash
+
+set -e
. ./scripts/load_python_env.sh
-echo 'Running "prepdocs.py"'
+echo '=== Running prepdocs.py with explicit index arguments ==='
additionalArgs=""
if [ $# -gt 0 ]; then
additionalArgs="$@"
fi
-./.venv/bin/python ./app/backend/prepdocs.py './data/*' --verbose $additionalArgs
+# Create sample data directories if they don't exist
+mkdir -p data/cosmic data/substrate
+
+# Create sample cosmic data if it doesn't exist
+if [ ! "$(ls -A data/cosmic/ 2>/dev/null)" ]; then
+ echo "š Creating sample cosmic data..."
+ cat > data/cosmic/cosmic_overview.md << 'EOF'
+# Microsoft Cosmic Platform
+
+## Overview
+Microsoft Cosmic is a container orchestration platform optimized for performance monitoring and diagnostics.
+
+## Key Features
+- Container lifecycle management
+- Real-time performance metrics
+- Advanced diagnostic tools
+- Resource optimization
+- Automated scaling
+
+## Performance Monitoring
+Monitor your containers with:
+- CPU and memory utilization tracking
+- Network throughput analysis
+- Storage I/O metrics
+- Application-level diagnostics
+
+## Common Issues
+- High memory utilization in containers
+- CPU throttling
+- Network latency
+- Storage bottlenecks
+EOF
+fi
+
+# Create sample substrate data if it doesn't exist
+if [ ! "$(ls -A data/substrate/ 2>/dev/null)" ]; then
+ echo "š Creating sample substrate data..."
+ cat > data/substrate/substrate_overview.md << 'EOF'
+# Microsoft Substrate Infrastructure
+
+## Overview
+Microsoft Substrate is an infrastructure platform for cloud services deployment and management.
+
+## Key Features
+- Infrastructure as Code (IaC)
+- Automated deployment pipelines
+- Scalable resource management
+- Azure integration
+
+## Infrastructure Setup
+Deploy infrastructure with:
+- Resource templates
+- Scaling policies
+- Network configuration
+- Security policies
+
+## Best Practices
+- Use declarative infrastructure definitions
+- Implement automated testing
+- Monitor resource utilization
+- Regular security audits
+EOF
+fi
+
+# Process Cosmic documents
+echo ""
+echo "=== Processing Cosmic Documents ==="
+echo "Index: cosmic-index"
+echo "Agent: cosmic-agent"
+echo "Data: ./data/cosmic/*"
+./.venv/bin/python ./app/backend/prepdocs.py './data/cosmic/*' \
+ --searchindex "cosmic-index" \
+ --searchagent "cosmic-agent" \
+ --category "Cosmic" \
+ --verbose \
+ --skip-domain-classifier \
+ $additionalArgs
+
+echo ""
+echo "ā
Cosmic documents processed"
+
+# Process Substrate documents
+echo ""
+echo "=== Processing Substrate Documents ==="
+echo "Index: substrate-index"
+echo "Agent: substrate-agent"
+echo "Data: ./data/substrate/*"
+./.venv/bin/python ./app/backend/prepdocs.py "./data/substrate/*" \
+ --searchindex "substrate-index" \
+ --searchagent "substrate-agent" \
+ --category "Substrate" \
+ --verbose \
+ --skip-domain-classifier \
+ $additionalArgs
+
+echo ""
+echo "ā
Substrate documents processed"
+
+# Set up domain classifier
+echo ""
+echo "=== Setting up Domain Classifier ==="
+echo "Index: domain-classifier-index"
+./.venv/bin/python ./app/backend/prepdocs.py './data/*' \
+ --searchindex "domain-classifier-index" \
+ --domain-classifier-only \
+ --verbose \
+ $additionalArgs
+
+echo ""
+echo "ā
Domain classifier setup complete"
+
+echo ""
+echo "=== ā
Multi-index setup complete! ==="
+echo "Created/populated indexes:"
+echo " - cosmic-index (with cosmic domain data)"
+echo " - substrate-index (with substrate domain data)"
+echo " - domain-classifier-index (with classification training data)"
+echo ""
+echo "Your app should now be able to:"
+echo " 1. Classify questions into Cosmic or Substrate domains"
+echo " 2. Route queries to the appropriate domain index"
+echo " 3. Return domain-specific results"
\ No newline at end of file