Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions app/backend/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from pathlib import Path
from typing import Any, AsyncGenerator, Dict, Union, cast

import aiohttp
from azure.cognitiveservices.speech import (
ResultReason,
SpeechConfig,
Expand Down Expand Up @@ -133,6 +134,16 @@ async def content_file(path: str, auth_claims: Dict[str, Any]):
if AZURE_ENFORCE_ACCESS_CONTROL is set to true, logged in users can only access files they have access to
This is also slow and memory hungry.
"""
# if the path looks like issue-NNN.html, fetch it from github.com/Azure-samples/azure-search-openai-demo/issues instead
if path.startswith("issue-") and path.endswith(".html"):
issue_id = path.split("-")[1].split(".")[0]
url = f"https://github.com/Azure-Samples/azure-search-openai-demo/issues/{issue_id}"
async with aiohttp.ClientSession() as session:
async with session.get(url) as response:
if response.status != 200:
abort(404)
return await response.text()

# Remove page number from path, filename-1.txt -> filename.txt
# This shouldn't typically be necessary as browsers don't send hash fragments to servers
if path.find("#page=") > 0:
Expand Down
36 changes: 36 additions & 0 deletions app/backend/approaches/approach.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@
from core.authentication import AuthenticationHelper


@dataclass
class AzureAISearch:
    """A search request aimed at the Azure AI Search index."""

    # Query text to run against the Azure AI Search index.
    aisearch_query: str


@dataclass
class GitHubIssueSearch:
    """A search request aimed at the project's GitHub issue tracker."""

    # Search terms to send to the GitHub issue search.
    github_query: str

@dataclass
class Document:
id: Optional[str]
Expand Down Expand Up @@ -204,6 +212,34 @@ async def search(

return qualified_documents


async def search_github_issues(self, github_issue_search: GitHubIssueSearch) -> list[Document]:
    """Search the project's GitHub issue tracker and wrap the hits as documents.

    Asks the GitHub issue search API for at most 10 issues in the
    Azure-samples/azure-search-openai-demo repository that match
    ``github_issue_search.github_query``.

    Returns:
        One ``Document`` per issue. The content is the issue title as a
        Markdown heading followed by the body with image markup removed;
        ``sourcepage`` and ``sourcefile`` are both ``issue-<number>.html``.
        An empty list is returned when GitHub does not answer with HTTP 200.
    """
    import re  # function-scope import so the block does not depend on the file header

    # Let the HTTP client encode the query: interpolating it into the URL would
    # let characters such as '&', '#' or '+' in the model-generated query
    # change the meaning of the request.
    params = {
        "q": f"{github_issue_search.github_query} repo:Azure-samples/azure-search-openai-demo type:issue",
        "per_page": "10",
    }
    async with aiohttp.ClientSession() as session:
        async with session.get("https://api.github.com/search/issues", params=params) as response:
            if response.status != 200:
                return []
            issues = (await response.json()).get("items", [])

    documents = []
    for issue in issues:
        # An issue without a description has a null body.
        raw_body = issue.get("body") or ""
        # Drop whole image references ![alt](url) instead of deleting every ')'
        # in the text, which also mangled ordinary prose and links.
        body = re.sub(r"!\[[^\]]*\]\([^)]*\)", "", raw_body)
        # Issue number N is exposed as the citation name issue-N.html.
        sourcefile = f"issue-{issue.get('number')}.html"
        issue_id = issue.get("id")
        documents.append(
            Document(
                # Document.id is declared as Optional[str]; GitHub ids are integers.
                id=None if issue_id is None else str(issue_id),
                content=f"# {issue.get('title')}\n\n{body}",
                sourcepage=sourcefile,
                sourcefile=sourcefile,
                embedding=[],
                image_embedding=[],
                category=None,
                oids=[],
                groups=[],
                captions=[],
            )
        )
    return documents

def get_sources_content(
self, results: List[Document], use_semantic_captions: bool, use_image_citation: bool
) -> list[str]:
Expand Down
22 changes: 13 additions & 9 deletions app/backend/approaches/chatapproach.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import json
import re
from abc import ABC, abstractmethod
from typing import Any, AsyncGenerator, Optional
from typing import Any, AsyncGenerator, List, Optional, Union

from openai.types.chat import ChatCompletion, ChatCompletionMessageParam

from approaches.approach import Approach
from approaches.approach import Approach, AzureAISearch, GitHubIssueSearch


class ChatApproach(Approach, ABC):
Expand All @@ -16,23 +16,27 @@ class ChatApproach(Approach, ABC):
async def run_until_final_call(self, messages, overrides, auth_claims, should_stream) -> tuple:
pass

def get_search_query(
    self, chat_completion: ChatCompletion, user_query: str
) -> List[Union[AzureAISearch, GitHubIssueSearch]]:
    """Turn the tool calls of a query-rewrite completion into search requests.

    Args:
        chat_completion: Completion whose first choice may contain tool calls
            to ``azure_ai_search_docs`` and/or ``github_search_issues``.
        user_query: The user's original question. Not used here; kept so the
            signature stays compatible with existing callers.

    Returns:
        One ``AzureAISearch`` or ``GitHubIssueSearch`` per usable tool call,
        in the order the model emitted them. The list is empty when there are
        no tool calls, or when every call is unusable.
    """
    response_message = chat_completion.choices[0].message
    search_queries: List[Union[AzureAISearch, GitHubIssueSearch]] = []

    for tool in response_message.tool_calls or []:
        if tool.type != "function":
            continue
        function = tool.function
        if function.name not in ("azure_ai_search_docs", "github_search_issues"):
            continue
        try:
            arg = json.loads(function.arguments)
        except json.JSONDecodeError:
            # Arguments cut off by the token limit are not valid JSON; skip
            # this call rather than failing the whole request.
            continue
        search_query = arg.get("search_query", self.NO_RESPONSE)
        if search_query == self.NO_RESPONSE:
            continue
        if function.name == "azure_ai_search_docs":
            search_queries.append(AzureAISearch(aisearch_query=search_query))
        else:
            search_queries.append(GitHubIssueSearch(github_query=search_query))

    return search_queries

def extract_followup_questions(self, content: Optional[str]):
if content is None:
Expand Down
54 changes: 31 additions & 23 deletions app/backend/approaches/chatreadretrieveread.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from openai_messages_token_helper import build_messages, get_token_limit

from approaches.approach import ThoughtStep
from approaches.chatapproach import ChatApproach
from approaches.chatapproach import ChatApproach, GitHubIssueSearch
from approaches.promptmanager import PromptManager
from core.authentication import AuthenticationHelper

Expand Down Expand Up @@ -124,33 +124,40 @@ async def run_until_final_call(
max_tokens=query_response_token_limit, # Setting too low risks malformed JSON, setting too high may affect performance
n=1,
tools=tools,
tool_choice="auto",
seed=seed,
)

query_text = self.get_search_query(chat_completion, original_user_query)

# STEP 2: Retrieve relevant documents from the search index with the GPT optimized query

# If retrieval mode includes vectors, compute an embedding for the query
vectors: list[VectorQuery] = []
if use_vector_search:
vectors.append(await self.compute_text_embedding(query_text))

results = await self.search(
top,
query_text,
filter,
vectors,
use_text_search,
use_vector_search,
use_semantic_ranker,
use_semantic_captions,
minimum_search_score,
minimum_reranker_score,
)
search_queries = self.get_search_query(chat_completion, original_user_query)
results = []

for query in search_queries:
if isinstance(query, GitHubIssueSearch):
# Handle GitHub issue search
results.extend(await self.search_github_issues(query))
else:
# Handle regular AI search query

vectors: list[VectorQuery] = []
if use_vector_search:
vectors.append(await self.compute_text_embedding(query.aisearch_query))

results.extend(await self.search(
top,
query.aisearch_query,
filter,
vectors,
use_text_search,
use_vector_search,
use_semantic_ranker,
use_semantic_captions,
minimum_search_score,
minimum_reranker_score,
))

# STEP 3: Generate a contextual and content specific answer using the search results and chat history
text_sources = self.get_sources_content(results, use_semantic_captions, use_image_citation=False)

rendered_answer_prompt = self.prompt_manager.render_prompt(
self.answer_prompt,
self.get_system_prompt_variables(overrides.get("prompt_template"))
Expand Down Expand Up @@ -186,7 +193,7 @@ async def run_until_final_call(
),
ThoughtStep(
"Search using generated search query",
query_text,
search_queries,
{
"use_semantic_captions": use_semantic_captions,
"use_semantic_ranker": use_semantic_ranker,
Expand Down Expand Up @@ -222,4 +229,5 @@ async def run_until_final_call(
stream=should_stream,
seed=seed,
)

return (extra_info, chat_coroutine)
26 changes: 7 additions & 19 deletions app/backend/approaches/prompts/chat_query_rewrite.prompty
Original file line number Diff line number Diff line change
Expand Up @@ -14,31 +14,19 @@ sample:
content: "The Northwind Health Plus plan includes coverage for emergency services, mental health and substance abuse coverage, and out-of-network services, which are not included in the Northwind Standard plan. [Benefit_Options.pdf#page=3]"
---
system:
Below is a history of the conversation so far, and a new question asked by the user that needs to be answered by searching in a knowledge base.
You have access to Azure AI Search index with 100's of documents.
Generate a search query based on the conversation and the new question.
Do not include cited source filenames and document names e.g. info.txt or doc.pdf in the search query terms.
Do not include any text inside [] or <<>> in the search query terms.
Do not include any special characters like '+'.
If the question is not in English, translate the question to English before generating the search query.
If you cannot generate a search query, return just the number 0.

user:
(EXAMPLE) How did crypto do last year?
Below is a history of the conversation so far, and a new question asked by the user about the azure-search-openai-demo open source project.
You have access to an Azure AI Search index containing the project documentation and to the project's GitHub issue tracker.

assistant:
Summarize Cryptocurrency Market Dynamics from last year

user:
(EXAMPLE) What are my health plans?
Based on the conversation and the new question, suggest the optimal search query for the AI Search index or GitHub issue tracker.
If the question is not in English, translate the question to English before generating the search query.

assistant:
Show available health plans
If you cannot generate a search query for either AI Search or GitHub, return just the number 0.
If you think that it would help to search both, then recommend both functions be called.

{% for message in past_messages %}
{{ message["role"] }}:
{{ message["content"] }}
{% endfor %}

user:
Generate search query for: {{ user_query }}
{{ user_query }}
51 changes: 37 additions & 14 deletions app/backend/approaches/prompts/chat_query_rewrite_tools.json
Original file line number Diff line number Diff line change
@@ -1,17 +1,40 @@
[{
[
{
"type": "function",
"function": {
"name": "search_sources",
"description": "Retrieve sources from the Azure AI Search index",
"parameters": {
"type": "object",
"properties": {
"search_query": {
"type": "string",
"description": "Query string to retrieve documents from azure search eg: 'Health care plan'"
}
},
"required": ["search_query"]
}
"name": "azure_ai_search_docs",
"description": "Retrieve sources from the Azure AI Search index. Use this function for questions like 'does the repo support user-based access control?'",
"parameters": {
"type": "object",
"properties": {
"search_query": {
"type": "string",
"description": "Query string to retrieve documents from azure search eg: 'data access control'. Do not include cited source filenames and document names e.g. info.txt or doc.pdf in the search query terms. Do not include any text inside [] or <<>> in the search query terms. Do not include any special characters like '+'."
}
},
"required": ["search_query"],
"additionalProperties": false
},
"strict": true
}
}]
},
{
"type": "function",
"function": {
"name": "github_search_issues",
"description": "Retrieve issues from the azure-search-openai-demo issue tracker. Use this function for questions like 'what are the top errors with deployment?'",
"parameters": {
"type": "object",
"properties": {
"search_query": {
"type": "string",
"description": "Query string to retrieve issues from github eg: 'Deployment failure' - should only contain the search terms, does not need 'issue' or 'issues' in the search query."
}
},
"required": ["search_query"],
"additionalProperties": false
},
"strict": true
}
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ export const ThoughtProcess = ({ thoughts }: Props) => {
</span>
))}
</Stack>
{Array.isArray(t.description) ? (
{Array.isArray(t.description) || typeof t.description === "object" ? (
<SyntaxHighlighter language="json" wrapLongLines className={styles.tCodeBlock} style={a11yLight}>
{JSON.stringify(t.description, null, 2)}
</SyntaxHighlighter>
Expand Down
6 changes: 3 additions & 3 deletions app/frontend/src/locales/en/translation.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@
"chatEmptyStateTitle": "Chat with your data",
"chatEmptyStateSubtitle": "Ask anything or try an example",
"defaultExamples": {
"1": "What is included in my Northwind Health Plus plan that is not in standard?",
"2": "What happens in a performance review?",
"3": "What does a Product Manager do?",
"1": "summarize issues with manageacls.py?",
"2": "how to enable user-based access control?",
"3": "Summarize the available documentation and reported user issues around manageacls.py",
"placeholder": "Type a new question (e.g. does my plan cover annual eye exams?)"
},
"askTitle": "Ask your data",
Expand Down
2 changes: 1 addition & 1 deletion app/frontend/src/pages/chat/Chat.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ const Chat = () => {
const [seed, setSeed] = useState<number | null>(null);
const [minimumRerankerScore, setMinimumRerankerScore] = useState<number>(0);
const [minimumSearchScore, setMinimumSearchScore] = useState<number>(0);
const [retrieveCount, setRetrieveCount] = useState<number>(3);
const [retrieveCount, setRetrieveCount] = useState<number>(5);
const [retrievalMode, setRetrievalMode] = useState<RetrievalMode>(RetrievalMode.Hybrid);
const [useSemanticRanker, setUseSemanticRanker] = useState<boolean>(true);
const [shouldStream, setShouldStream] = useState<boolean>(true);
Expand Down
25 changes: 25 additions & 0 deletions convertdocs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import os

import pypandoc


def convert_md_to_html(directory):
    """Convert every Markdown file directly inside *directory* to HTML.

    The output is written to ``<directory>/html/<name>.html``; the ``html``
    sub-directory is created if it does not exist. Sub-directories are not
    searched. One progress line is printed per converted file.

    Args:
        directory: Path of the directory holding the ``.md`` files.
    """
    # Ensure the output directory exists
    html_output_dir = os.path.join(directory, 'html')
    os.makedirs(html_output_dir, exist_ok=True)

    # Sorted so the conversion order (and the printed log) is reproducible.
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        # Only regular files: a directory that happens to be named *.md
        # must not be handed to pandoc.
        if not filename.endswith('.md') or not os.path.isfile(filepath):
            continue

        base_filename = os.path.splitext(filename)[0]
        html_output_path = os.path.join(html_output_dir, f'{base_filename}.html')
        pypandoc.convert_file(filepath, 'html', outputfile=html_output_path)
        print(f'Converted {filename} to {html_output_path}')

if __name__ == '__main__':
    # When run as a script, convert the Markdown files in the current directory.
    convert_md_to_html('.')
35 changes: 35 additions & 0 deletions data/README.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
<h1 id="additional-documentation">Additional documentation</h1>
<p>Consult the main <a href="../README.md">README</a> for general
information about the project. These are advanced topics that are not
necessary for a basic deployment.</p>
<ul>
<li>Deploying:
<ul>
<li><a href="docs/deploy_troubleshooting.md">Troubleshooting
deployment</a>
<ul>
<li><a href="appservice.md">Debugging the app on App Service</a></li>
</ul></li>
<li><a href="azd.md">Deploying with azd: deep dive and CI/CD</a></li>
<li><a href="deploy_existing.md">Deploying with existing Azure
resources</a></li>
<li><a href="deploy_lowcost.md">Deploying from a free account</a></li>
<li><a href="deploy_features.md">Enabling optional features</a>
<ul>
<li><a href="docs/deploy_features.md">All features</a></li>
<li><a href="login_and_acl.md">Login and access control</a></li>
<li><a href="gpt4v.md">GPT-4 Turbo with Vision</a></li>
<li><a href="deploy_private.md">Private endpoints</a></li>
</ul></li>
<li><a href="sharing_environments.md">Sharing deployment
environments</a></li>
</ul></li>
<li><a href="localdev.md">Local development</a></li>
<li><a href="customization.md">Customizing the app</a></li>
<li><a href="docs/evaluation.md">Evaluation</a></li>
<li><a href="data_ingestion.md">Data ingestion</a></li>
<li><a href="monitoring.md">Monitoring with Application
Insights</a></li>
<li><a href="productionizing.md">Productionizing</a></li>
<li><a href="other_samples.md">Alternative RAG chat samples</a></li>
</ul>
Loading
Loading