Azure-Samples · pamelafox · Feb 18, 2025
diff --git a/app/backend/app.py b/app/backend/app.py
@@ -8,6 +8,7 @@
 from pathlib import Path
 from typing import Any, AsyncGenerator, Dict, Union, cast
 
+import aiohttp
 from azure.cognitiveservices.speech import (
     ResultReason,
     SpeechConfig,
@@ -133,6 +134,16 @@ async def content_file(path: str, auth_claims: Dict[str, Any]):
     if AZURE_ENFORCE_ACCESS_CONTROL is set to true, logged in users can only access files they have access to
     This is also slow and memory hungry.
     """
+    # if the path looks like issue-NNN.html, fetch it from github.com/Azure-samples/azure-search-openai-demo/issues instead
+    if path.startswith("issue-") and path.endswith(".html"):
+        issue_id = path.split("-")[1].split(".")[0]
+        url = f"https://github.com/Azure-Samples/azure-search-openai-demo/issues/{issue_id}"
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url) as response:
+                if response.status != 200:
+                    abort(404)
+                return await response.text()
+
     # Remove page number from path, filename-1.txt -> filename.txt
     # This shouldn't typically be necessary as browsers don't send hash fragments to servers
     if path.find("#page=") > 0:

diff --git a/app/backend/approaches/approach.py b/app/backend/approaches/approach.py
@@ -28,6 +28,14 @@
 from core.authentication import AuthenticationHelper
 
 
+@dataclass
+class AzureAISearch:
+    aisearch_query: str  # query text for the Azure AI Search index
+
+@dataclass
+class GitHubIssueSearch:
+    github_query: str  # search terms sent to the GitHub issue search API
+
 @dataclass
 class Document:
     id: Optional[str]
@@ -204,6 +212,34 @@ async def search(
 
         return qualified_documents
 
+
+    async def search_github_issues(self, github_issue_search: GitHubIssueSearch) -> list[Document]:
+        """Search the project's GitHub issues and return one Document per hit ([] unless HTTP 200)."""
+        # Send the query through `params` so it is URL-encoded instead of spliced into the URL.
+        params = {
+            "q": f"{github_issue_search.github_query} repo:Azure-samples/azure-search-openai-demo type:issue",
+            "per_page": "10",
+        }
+        async with aiohttp.ClientSession() as session:
+            async with session.get("https://api.github.com/search/issues", params=params) as response:
+                if response.status != 200:
+                    return []
+                issues = (await response.json()).get("items", [])
+        documents = []
+        for issue in issues:
+            # GitHub sends a null body for issues without a description; treat it as empty.
+            # The replace chain drops image-markdown punctuation (it also removes every ")").
+            body = (issue.get("body") or "").replace("![", "").replace("](https://", "").replace(")", "")
+            # Citation name built from the issue number, e.g. issue-2358.html
+            sourcefile = f"issue-{issue.get('number')}.html"
+            documents.append(Document(
+                id=issue.get("id"), content=f"# {issue.get('title')}\n\n{body}",
+                sourcepage=sourcefile, sourcefile=sourcefile,
+                embedding=[], image_embedding=[], category=None,
+                oids=[], groups=[], captions=[],
+            ))
+        return documents
+
     def get_sources_content(
         self, results: List[Document], use_semantic_captions: bool, use_image_citation: bool
     ) -> list[str]:

diff --git a/app/backend/approaches/chatapproach.py b/app/backend/approaches/chatapproach.py
@@ -1,11 +1,11 @@
 import json
 import re
 from abc import ABC, abstractmethod
-from typing import Any, AsyncGenerator, Optional
+from typing import Any, AsyncGenerator, List, Optional, Union
 
 from openai.types.chat import ChatCompletion, ChatCompletionMessageParam
 
-from approaches.approach import Approach
+from approaches.approach import Approach, AzureAISearch, GitHubIssueSearch
 
 
 class ChatApproach(Approach, ABC):
@@ -16,23 +16,27 @@ class ChatApproach(Approach, ABC):
     async def run_until_final_call(self, messages, overrides, auth_claims, should_stream) -> tuple:
         pass
 
-    def get_search_query(self, chat_completion: ChatCompletion, user_query: str):
+    def get_search_query(self, chat_completion: ChatCompletion, user_query: str) -> List[Union[AzureAISearch, GitHubIssueSearch]]:
         response_message = chat_completion.choices[0].message
+        search_queries = []  # one entry per recognised tool call; stays empty when none is present
 
         if response_message.tool_calls:
             for tool in response_message.tool_calls:
                 if tool.type != "function":
                     continue
                 function = tool.function
-                if function.name == "search_sources":
+                if function.name == "azure_ai_search_docs":
                     arg = json.loads(function.arguments)
                     search_query = arg.get("search_query", self.NO_RESPONSE)
                     if search_query != self.NO_RESPONSE:
-                        return search_query
-        elif query_text := response_message.content:
-            if query_text.strip() != self.NO_RESPONSE:
-                return query_text
-        return user_query
+                        search_queries.append(AzureAISearch(aisearch_query=search_query))  # query for the Azure AI Search index
+                elif function.name == "github_search_issues":
+                    arg = json.loads(function.arguments)
+                    search_query = arg.get("search_query", self.NO_RESPONSE)
+                    if search_query != self.NO_RESPONSE:
+                        search_queries.append(GitHubIssueSearch(github_query=search_query))  # query for the GitHub issue search
+
+        return search_queries  # NOTE(review): user_query is no longer used as a fallback here - confirm that is intended
 
     def extract_followup_questions(self, content: Optional[str]):
         if content is None:

diff --git a/app/backend/approaches/chatreadretrieveread.py b/app/backend/approaches/chatreadretrieveread.py
@@ -12,7 +12,7 @@
 from openai_messages_token_helper import build_messages, get_token_limit
 
 from approaches.approach import ThoughtStep
-from approaches.chatapproach import ChatApproach
+from approaches.chatapproach import ChatApproach, GitHubIssueSearch
 from approaches.promptmanager import PromptManager
 from core.authentication import AuthenticationHelper
 
@@ -124,33 +124,40 @@ async def run_until_final_call(
             max_tokens=query_response_token_limit,  # Setting too low risks malformed JSON, setting too high may affect performance
             n=1,
             tools=tools,
+            tool_choice="auto",
             seed=seed,
         )
 
-        query_text = self.get_search_query(chat_completion, original_user_query)
-
-        # STEP 2: Retrieve relevant documents from the search index with the GPT optimized query
-
-        # If retrieval mode includes vectors, compute an embedding for the query
-        vectors: list[VectorQuery] = []
-        if use_vector_search:
-            vectors.append(await self.compute_text_embedding(query_text))
-
-        results = await self.search(
-            top,
-            query_text,
-            filter,
-            vectors,
-            use_text_search,
-            use_vector_search,
-            use_semantic_ranker,
-            use_semantic_captions,
-            minimum_search_score,
-            minimum_reranker_score,
-        )
+        search_queries = self.get_search_query(chat_completion, original_user_query)
+        results = []
+
+        for query in search_queries:
+            if isinstance(query, GitHubIssueSearch):
+                # Handle GitHub issue search
+                results.extend(await self.search_github_issues(query))
+            else:
+                # Handle regular AI search query
+
+                vectors: list[VectorQuery] = []
+                if use_vector_search:
+                    vectors.append(await self.compute_text_embedding(query.aisearch_query))
+
+                results.extend(await self.search(
+                    top,
+                    query.aisearch_query,
+                    filter,
+                    vectors,
+                    use_text_search,
+                    use_vector_search,
+                    use_semantic_ranker,
+                    use_semantic_captions,
+                    minimum_search_score,
+                    minimum_reranker_score,
+                ))
 
         # STEP 3: Generate a contextual and content specific answer using the search results and chat history
         text_sources = self.get_sources_content(results, use_semantic_captions, use_image_citation=False)
+
         rendered_answer_prompt = self.prompt_manager.render_prompt(
             self.answer_prompt,
             self.get_system_prompt_variables(overrides.get("prompt_template"))
@@ -186,7 +193,7 @@ async def run_until_final_call(
                 ),
                 ThoughtStep(
                     "Search using generated search query",
-                    query_text,
+                    search_queries,
                     {
                         "use_semantic_captions": use_semantic_captions,
                         "use_semantic_ranker": use_semantic_ranker,
@@ -222,4 +229,5 @@ async def run_until_final_call(
             stream=should_stream,
             seed=seed,
         )
+
         return (extra_info, chat_coroutine)
diff --git a/app/backend/approaches/prompts/chat_query_rewrite.prompty b/app/backend/approaches/prompts/chat_query_rewrite.prompty
@@ -14,31 +14,19 @@ sample:
           content: "The Northwind Health Plus plan includes coverage for emergency services, mental health and substance abuse coverage, and out-of-network services, which are not included in the Northwind Standard plan. [Benefit_Options.pdf#page=3]"
 ---
 system:
-Below is a history of the conversation so far, and a new question asked by the user that needs to be answered by searching in a knowledge base.
-You have access to Azure AI Search index with 100's of documents.
-Generate a search query based on the conversation and the new question.
-Do not include cited source filenames and document names e.g. info.txt or doc.pdf in the search query terms.
-Do not include any text inside [] or <<>> in the search query terms.
-Do not include any special characters like '+'.
-If the question is not in English, translate the question to English before generating the search query.
-If you cannot generate a search query, return just the number 0.
-
-user:
-(EXAMPLE) How did crypto do last year?
+Below is a history of the conversation so far, and a new question asked by the user about the azure-search-openai-demo open source project.
+You have access to an Azure AI Search index containing the project documentation and to the GitHub issue tracker for the project.
 
-assistant:
-Summarize Cryptocurrency Market Dynamics from last year
-
-user:
-(EXAMPLE) What are my health plans?
+Based on the conversation and the new question, suggest the optimal search query for the AI Search index or GitHub issue tracker.
+If the question is not in English, translate the question to English before generating the search query.
 
-assistant:
-Show available health plans
+If you cannot generate a search query for either AI Search or GitHub, return just the number 0.
+If you think that it would help to search both, then recommend both functions be called.
 
 {% for message in past_messages %}
 {{ message["role"] }}:
 {{ message["content"] }}
 {% endfor %}
 
 user:
-Generate search query for: {{ user_query }}
+{{ user_query }}
diff --git a/app/backend/approaches/prompts/chat_query_rewrite_tools.json b/app/backend/approaches/prompts/chat_query_rewrite_tools.json
@@ -1,17 +1,40 @@
-[{
+[
+  {
     "type": "function",
     "function": {
-        "name": "search_sources",
-        "description": "Retrieve sources from the Azure AI Search index",
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "search_query": {
-                    "type": "string",
-                    "description": "Query string to retrieve documents from azure search eg: 'Health care plan'"
-                }
-            },
-            "required": ["search_query"]
-        }
+      "name": "azure_ai_search_docs",
+      "description": "Retrieve sources from the Azure AI Search index. Use this function for questions like 'does the repo support user-based access control?'",
+      "parameters": {
+        "type": "object",
+        "properties": {
+          "search_query": {
+            "type": "string",
+            "description": "Query string to retrieve documents from azure search eg: 'data access control'. Do not include cited source filenames and document names e.g. info.txt or doc.pdf in the search query terms. Do not include any text inside [] or <<>> in the search query terms. Do not include any special characters like '+'."
+          }
+        },
+        "required": ["search_query"],
+        "additionalProperties": false
+      },
+      "strict": true
     }
-}]
+  },
+  {
+    "type": "function",
+    "function": {
+      "name": "github_search_issues",
+      "description": "Retrieve issues from the azure-search-openai-demo issue tracker. Use this function for questions like 'what are the top errors with deployment?'",
+      "parameters": {
+        "type": "object",
+        "properties": {
+          "search_query": {
+            "type": "string",
+            "description": "Query string to retrieve issues from github eg: 'Deployment failure' - should only contain the search terms, does not need 'issue' or 'issues' in the search query."
+          }
+        },
+        "required": ["search_query"],
+        "additionalProperties": false
+      },
+      "strict": true
+    }
+  }
+]
diff --git a/app/frontend/src/components/AnalysisPanel/ThoughtProcess.tsx b/app/frontend/src/components/AnalysisPanel/ThoughtProcess.tsx
@@ -28,7 +28,7 @@ export const ThoughtProcess = ({ thoughts }: Props) => {
                                     </span>
                                 ))}
                         </Stack>
-                        {Array.isArray(t.description) ? (
+                        {Array.isArray(t.description) || typeof t.description === "object" ? (
                             <SyntaxHighlighter language="json" wrapLongLines className={styles.tCodeBlock} style={a11yLight}>
                                 {JSON.stringify(t.description, null, 2)}
                             </SyntaxHighlighter>

diff --git a/app/frontend/src/locales/en/translation.json b/app/frontend/src/locales/en/translation.json
@@ -37,9 +37,9 @@
     "chatEmptyStateTitle": "Chat with your data",
     "chatEmptyStateSubtitle": "Ask anything or try an example",
     "defaultExamples": {
-        "1": "What is included in my Northwind Health Plus plan that is not in standard?",
-        "2": "What happens in a performance review?",
-        "3": "What does a Product Manager do?",
+        "1": "Summarize issues with manageacls.py",
+        "2": "How do I enable user-based access control?",
+        "3": "Summarize the available documentation and reported user issues around manageacls.py",
         "placeholder": "Type a new question (e.g. does my plan cover annual eye exams?)"
     },
     "askTitle": "Ask your data",

diff --git a/app/frontend/src/pages/chat/Chat.tsx b/app/frontend/src/pages/chat/Chat.tsx
@@ -45,7 +45,7 @@ const Chat = () => {
     const [seed, setSeed] = useState<number | null>(null);
     const [minimumRerankerScore, setMinimumRerankerScore] = useState<number>(0);
     const [minimumSearchScore, setMinimumSearchScore] = useState<number>(0);
-    const [retrieveCount, setRetrieveCount] = useState<number>(3);
+    const [retrieveCount, setRetrieveCount] = useState<number>(5);
     const [retrievalMode, setRetrievalMode] = useState<RetrievalMode>(RetrievalMode.Hybrid);
     const [useSemanticRanker, setUseSemanticRanker] = useState<boolean>(true);
     const [shouldStream, setShouldStream] = useState<boolean>(true);

diff --git a/convertdocs.py b/convertdocs.py
@@ -0,0 +1,25 @@
+import os
+
+import pypandoc
+
+
+def convert_md_to_html(directory):
+    """Convert each Markdown file directly inside `directory` to HTML under `<directory>/html`."""
+    # Create the output folder up front; a pre-existing folder is fine
+    target_dir = os.path.join(directory, 'html')
+    os.makedirs(target_dir, exist_ok=True)
+
+    for entry in os.listdir(directory):
+        # Anything whose name does not end in .md is left untouched
+        if not entry.endswith('.md'):
+            continue
+        stem, _ext = os.path.splitext(entry)
+        destination = os.path.join(target_dir, f'{stem}.html')
+        # pypandoc writes the HTML straight to `destination`
+        pypandoc.convert_file(os.path.join(directory, entry), 'html', outputfile=destination)
+        print(f'Converted {entry} to {destination}')
+
+if __name__ == '__main__':
+    # When run as a script, convert the Markdown files in the current working directory
+    directory = '.'
+    convert_md_to_html(directory)
diff --git a/data/README.html b/data/README.html
@@ -0,0 +1,35 @@
+<h1 id="additional-documentation">Additional documentation</h1>
+<p>Consult the main <a href="../README.md">README</a> for general
+information about the project. These are advanced topics that are not
+necessary for a basic deployment.</p>
+<ul>
+<li>Deploying:
+<ul>
+<li><a href="docs/deploy_troubleshooting.md">Troubleshooting
+deployment</a>
+<ul>
+<li><a href="appservice.md">Debugging the app on App Service</a></li>
+</ul></li>
+<li><a href="azd.md">Deploying with azd: deep dive and CI/CD</a></li>
+<li><a href="deploy_existing.md">Deploying with existing Azure
+resources</a></li>
+<li><a href="deploy_lowcost.md">Deploying from a free account</a></li>
+<li><a href="deploy_features.md">Enabling optional features</a>
+<ul>
+<li><a href="docs/deploy_features.md">All features</a></li>
+<li><a href="login_and_acl.md">Login and access control</a></li>
+<li><a href="gpt4v.md">GPT-4 Turbo with Vision</a></li>
+<li><a href="deploy_private.md">Private endpoints</a></li>
+</ul></li>
+<li><a href="sharing_environments.md">Sharing deployment
+environments</a></li>
+</ul></li>
+<li><a href="localdev.md">Local development</a></li>
+<li><a href="customization.md">Customizing the app</a></li>
+<li><a href="docs/evaluation.md">Evaluation</a></li>
+<li><a href="data_ingestion.md">Data ingestion</a></li>
+<li><a href="monitoring.md">Monitoring with Application
+Insights</a></li>
+<li><a href="productionizing.md">Productionizing</a></li>
+<li><a href="other_samples.md">Alternative RAG chat samples</a></li>
+</ul>