Skip to content

Commit 08a0ec6

Browse files
committed
testing functionality and RAG
1 parent 9b49efe commit 08a0ec6

23 files changed

+1857
-73
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,3 +192,7 @@ desktop.ini
192192
.vscode/
193193
.idea/
194194
*.sublime-*
195+
196+
tests/test_application.py
197+
tests/test_ollama_ocr.py
198+
tests/test_response_accuracy.py

app/backend/app.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@
8484
CONFIG_USER_BLOB_MANAGER,
8585
CONFIG_USER_UPLOAD_ENABLED,
8686
CONFIG_VECTOR_SEARCH_ENABLED,
87+
OCR_ON_INGEST,
8788
)
8889
from core.authentication import AuthenticationHelper
8990
from core.sessionhelper import create_session_id
@@ -98,11 +99,14 @@
9899
setup_openai_client,
99100
setup_search_info,
100101
)
102+
from services.ocr_service import OCRService
101103
from prepdocslib.blobmanager import AdlsBlobManager, BlobManager
102104
from prepdocslib.embeddings import ImageEmbeddings
103105
from prepdocslib.filestrategy import UploadUserFileStrategy
104106
from prepdocslib.listfilestrategy import File
105107

108+
logger = logging.getLogger(__name__)
109+
106110
bp = Blueprint("routes", __name__, static_folder="static")
107111
# Fix Windows registry issue with mimetypes
108112
mimetypes.add_type("application/javascript", ".js")
@@ -699,13 +703,25 @@ async def setup_clients():
699703
vision_endpoint=AZURE_VISION_ENDPOINT,
700704
use_multimodal=USE_MULTIMODAL,
701705
)
706+
707+
# Initialize OCR service for runtime uploads if enabled
708+
ocr_service = None
709+
if OCR_ON_INGEST:
710+
ocr_candidate = OCRService()
711+
if ocr_candidate.is_enabled():
712+
ocr_service = ocr_candidate
713+
logger.info("OCR service enabled for runtime user uploads")
714+
else:
715+
logger.warning("OCR_ON_INGEST is enabled but no OCR provider is configured; skipping OCR for runtime uploads.")
716+
702717
ingester = UploadUserFileStrategy(
703718
search_info=search_info,
704719
file_processors=file_processors,
705720
embeddings=text_embeddings_service,
706721
image_embeddings=image_embeddings_service,
707722
search_field_name_embedding=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
708723
blob_manager=user_blob_manager,
724+
ocr_service=ocr_service,
709725
)
710726
current_app.config[CONFIG_INGESTER] = ingester
711727

app/backend/approaches/chatreadretrieveread.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,16 @@ async def run_without_streaming(
134134
if overrides.get("suggest_followup_questions"):
135135
content, followup_questions = self.extract_followup_questions(content)
136136
extra_info.followup_questions = followup_questions
137+
138+
# Filter citations to only include those actually used in the answer
139+
from services.citation_filter import filter_citations_by_answer
140+
if extra_info.data_points.citations and content:
141+
filtered_citations = filter_citations_by_answer(
142+
extra_info.data_points.citations,
143+
content
144+
)
145+
extra_info.data_points.citations = filtered_citations
146+
137147
# Assume last thought is for generating answer
138148
if self.include_token_usage and extra_info.thoughts and chat_completion_response.usage:
139149
extra_info.thoughts[-1].update_token_usage(chat_completion_response.usage)
@@ -278,7 +288,9 @@ async def run_search_approach(
278288
self, messages: list[ChatCompletionMessageParam], overrides: dict[str, Any], auth_claims: dict[str, Any]
279289
):
280290
# Phase 1B scaffolding: allow a simple 'mode' switch with safe defaults
281-
mode = overrides.get("mode", "rag") # rag | web | hybrid
291+
# Default to hybrid mode only when web search is enabled and a Serper API key is configured; otherwise use rag
292+
default_mode = "hybrid" if ENABLE_WEB_SEARCH and SERPER_API_KEY else "rag"
293+
mode = overrides.get("mode", default_mode) # rag | web | hybrid
282294

283295
# Hybrid mode: merge RAG + Web results
284296
if mode == "hybrid":

app/backend/approaches/prompts/ask_answer_question.prompty

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,23 @@ system:
1515
{{ override_prompt }}
1616
{% else %}
1717
Assistant helps structural engineering company employees with their questions about technical documents, design standards, project specifications, and engineering practices. Be brief and technically accurate in your answers.
18-
Answer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below.
18+
19+
You have access to two types of sources:
20+
1. **Corpus sources** (document files like .pdf, .txt) - These are from the company's internal knowledge base. Prioritize these when they contain relevant information.
21+
2. **Web sources** (URLs starting with http:// or https://) - These are from web search. Use these when corpus sources don't contain the answer.
22+
23+
**Answering rules:**
24+
- **CRITICAL**: Only use and cite documents that are directly relevant to answering the specific question asked. If the question asks about a specific document or topic, ONLY use information from documents related to that topic.
25+
- If corpus sources contain the answer, use them and cite them with [filename.pdf#page=N] format.
26+
- If corpus sources don't contain the answer but web sources do, use web sources and cite them with [URL] format.
27+
- Only say "I don't know" if neither corpus nor web sources contain enough information to answer the question.
28+
- **CRITICAL**: Only cite documents that you actually use in your answer. If multiple documents are retrieved but only one is relevant to the question, ONLY use and cite that one document. Do not include information or citations from irrelevant documents, even if they were retrieved.
29+
- If a question asks about a specific document (e.g., "code review documents"), only use information from that specific document type, not other unrelated documents.
30+
- Always cite your sources. Use square brackets to reference sources, for example [document.pdf#page=1] for corpus or [https://example.com] for web sources.
31+
- Don't combine sources, list each source separately, for example [doc1.pdf#page=1][doc2.pdf#page=2] or [https://site1.com][https://site2.com].
32+
1933
You CANNOT ask clarifying questions to the user, since the user will have no way to reply.
2034
If the question is not in English, answer in the language used in the question.
21-
Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf].
2235
{% if image_sources %}
2336
Each image source has the document file name in the top left corner of the image with coordinates (10,10) pixels with format <filename.ext#page=N>,
2437
and the image figure name is right-aligned in the top right corner of the image.
@@ -27,7 +40,10 @@ Each text source starts in a new line and has the file name followed by colon an
2740
Always include the source document filename for each fact you use in the response in the format: [document_name.ext#page=N].
2841
If you are referencing an image, add the image filename in the format: [document_name.ext#page=N(image_name.png)].
2942
{% endif %}
30-
Possible citations for current question: {% for citation in citations %} [{{ citation }}] {% endfor %}
43+
Available sources for this question (you may not need all of them): {% for citation in citations %} [{{ citation }}] {% endfor %}
44+
45+
**IMPORTANT**: Only cite documents that you actually use in your answer. Do not cite documents that don't contain relevant information for the question. If a document is retrieved but doesn't help answer the question, do not include it in your citations.
46+
3147
{{ injected_prompt }}
3248
{% endif %}
3349

app/backend/approaches/prompts/chat_answer_question.prompty

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,23 @@ system:
2121
{{ override_prompt }}
2222
{% else %}
2323
Assistant helps structural engineering company employees with their questions about technical documents, design standards, project specifications, and engineering practices. Be brief and technically accurate in your answers.
24-
Answer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below.
24+
25+
You have access to two types of sources:
26+
1. **Corpus sources** (document files like .pdf, .txt) - These are from the company's internal knowledge base. Prioritize these when they contain relevant information.
27+
2. **Web sources** (URLs starting with http:// or https://) - These are from web search. Use these when corpus sources don't contain the answer.
28+
29+
**Answering rules:**
30+
- **CRITICAL**: Only use and cite documents that are directly relevant to answering the specific question asked. If the question asks about a specific document or topic, ONLY use information from documents related to that topic.
31+
- If corpus sources contain the answer, use them and cite them with [filename.pdf#page=N] format.
32+
- If corpus sources don't contain the answer but web sources do, use web sources and cite them with [URL] format.
33+
- Only say "I don't know" if neither corpus nor web sources contain enough information to answer the question.
34+
- **CRITICAL**: Only cite documents that you actually use in your answer. If multiple documents are retrieved but only one is relevant to the question, ONLY use and cite that one document. Do not include information or citations from irrelevant documents, even if they were retrieved.
35+
- If a question asks about a specific document (e.g., "code review documents"), only use information from that specific document type, not other unrelated documents.
36+
- Always cite your sources. Use square brackets to reference sources, for example [document.pdf#page=1] for corpus or [https://example.com] for web sources.
37+
- Don't combine sources, list each source separately, for example [doc1.pdf#page=1][doc2.pdf#page=2] or [https://site1.com][https://site2.com].
38+
2539
If asking a clarifying question to the user would help, ask the question.
2640
If the question is not in English, answer in the language used in the question.
27-
Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf].
2841
{% if image_sources %}
2942
Each image source has the document file name in the top left corner of the image with coordinates (10,10) pixels with format <filename.ext#page=N>,
3043
and the image figure name is right-aligned in the top right corner of the image.
@@ -33,7 +46,10 @@ Each text source starts in a new line and has the file name followed by colon an
3346
Always include the source document filename for each fact you use in the response in the format: [document_name.ext#page=N].
3447
If you are referencing an image, add the image filename in the format: [document_name.ext#page=N(image_name.png)].
3548
{% endif %}
36-
Possible citations for current question: {% for citation in citations %} [{{ citation }}] {% endfor %}
49+
Available sources for this question (you may not need all of them): {% for citation in citations %} [{{ citation }}] {% endfor %}
50+
51+
**IMPORTANT**: Only cite documents that you actually use in your answer. Do not cite documents that don't contain relevant information for the question. If a document is retrieved but doesn't help answer the question, do not include it in your citations.
52+
3753
{{ injected_prompt }}
3854
{% endif %}
3955

app/backend/approaches/retrievethenread.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,17 @@ async def run(
118118
response_token_limit=self.get_response_token_limit(self.chatgpt_model, 1024),
119119
),
120120
)
121+
answer_content = chat_completion.choices[0].message.content
122+
123+
# Filter citations to only include those actually used in the answer
124+
from services.citation_filter import filter_citations_by_answer
125+
if extra_info.data_points.citations and answer_content:
126+
filtered_citations = filter_citations_by_answer(
127+
extra_info.data_points.citations,
128+
answer_content
129+
)
130+
extra_info.data_points.citations = filtered_citations
131+
121132
extra_info.thoughts.append(
122133
self.format_thought_step_for_chatcompletion(
123134
title="Prompt to generate answer",
@@ -130,7 +141,7 @@ async def run(
130141
)
131142
return {
132143
"message": {
133-
"content": chat_completion.choices[0].message.content,
144+
"content": answer_content,
134145
"role": chat_completion.choices[0].message.role,
135146
},
136147
"context": {

app/backend/config.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,11 @@
4646
REDIS_URL = os.getenv("REDIS_URL") # Optional Redis cache URL
4747

4848
# OCR Configuration
49-
OCR_PROVIDER = os.getenv("OCR_PROVIDER", "none").lower() # deepseek, azure_document_intelligence, none
49+
OCR_PROVIDER = os.getenv("OCR_PROVIDER", "none").lower() # ollama, azure_document_intelligence, none
5050
OCR_ON_INGEST = os.getenv("OCR_ON_INGEST", "false").lower() == "true" # Run OCR during document ingestion
51-
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
52-
DEEPSEEK_BASE_URL = os.getenv("DEEPSEEK_BASE_URL", "https://api.deepseek.com/v1")
53-
DEEPSEEK_OCR_MODEL = os.getenv("DEEPSEEK_OCR_MODEL", "deepseek-ocr")
51+
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434/v1")
52+
OLLAMA_OCR_MODEL = os.getenv("OLLAMA_OCR_MODEL", "llava:7b") # Must be a vision-capable model (llava, bakllava, etc.)
53+
OLLAMA_TIMEOUT = int(os.getenv("OLLAMA_TIMEOUT", "120"))
5454
AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
5555
AZURE_DOCUMENT_INTELLIGENCE_KEY = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")
5656
AZURE_DOCUMENT_INTELLIGENCE_MODEL = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_MODEL", "prebuilt-read")

app/backend/core/keyvault_config.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,7 @@ async def load_web_search_secrets(self) -> Dict[str, Optional[str]]:
6868
Dictionary with web search API keys
6969
"""
7070
return await self.reader.get_secrets({
71-
"SERPER_API_KEY": "SERPER_API_KEY",
72-
"DEEPSEEK_API_KEY": "DEEPSEEK_API_KEY"
71+
"SERPER_API_KEY": "SERPER_API_KEY"
7372
})
7473

7574
async def load_all_secrets(self) -> Dict[str, Optional[str]]:

app/backend/load_azd_env.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,9 @@ def load_azd_env():
4545
else:
4646
logger.info("Loading azd env from %s, which may override existing environment variables", env_file_path)
4747
load_dotenv(env_file_path, override=True)
48+
49+
# Also load the local .env file as a fallback (for variables not set in the azd env)
50+
local_env_path = os.path.join(os.path.dirname(__file__), ".env")
51+
if os.path.exists(local_env_path) and local_env_path != env_file_path:
52+
logger.info("Also loading local .env from %s (as fallback for missing variables)", local_env_path)
53+
load_dotenv(local_env_path, override=False) # Don't override azd vars, but fill in missing ones

app/backend/prepdocs.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@
4444
from prepdocslib.strategy import DocumentAction, SearchInfo, Strategy
4545
from prepdocslib.textparser import TextParser
4646
from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter
47+
from services.ocr_service import OCRService
48+
from config import OCR_ON_INGEST
4749

4850
logger = logging.getLogger("scripts")
4951

@@ -650,6 +652,14 @@ async def main(strategy: Strategy, setup_index: bool = True):
650652
use_multimodal=use_multimodal,
651653
)
652654

655+
ocr_service = None
656+
if OCR_ON_INGEST:
657+
ocr_candidate = OCRService()
658+
if ocr_candidate.is_enabled():
659+
ocr_service = ocr_candidate
660+
else:
661+
logger.warning("OCR_ON_INGEST is enabled but no OCR provider is configured; skipping OCR.")
662+
653663
ingestion_strategy = FileStrategy(
654664
search_info=search_info,
655665
list_file_strategy=list_file_strategy,
@@ -665,6 +675,8 @@ async def main(strategy: Strategy, setup_index: bool = True):
665675
category=args.category,
666676
use_content_understanding=use_content_understanding,
667677
content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"),
678+
ocr_service=ocr_service,
679+
ocr_on_ingest=ocr_service is not None and OCR_ON_INGEST,
668680
)
669681

670682
try:

0 commit comments

Comments
 (0)