Azure-Samples
diff --git a/.gitignore b/.gitignore
Lines changed: 4 additions & 0 deletions
diff --git a/app/backend/app.py b/app/backend/app.py
Lines changed: 16 additions & 0 deletions
diff --git a/app/backend/approaches/chatreadretrieveread.py b/app/backend/approaches/chatreadretrieveread.py
Lines changed: 13 additions & 1 deletion
diff --git a/app/backend/approaches/prompts/ask_answer_question.prompty b/app/backend/approaches/prompts/ask_answer_question.prompty
Lines changed: 19 additions & 3 deletions
diff --git a/app/backend/approaches/prompts/chat_answer_question.prompty b/app/backend/approaches/prompts/chat_answer_question.prompty
Lines changed: 19 additions & 3 deletions
diff --git a/app/backend/approaches/retrievethenread.py b/app/backend/approaches/retrievethenread.py
Lines changed: 12 additions & 1 deletion
diff --git a/app/backend/config.py b/app/backend/config.py
Lines changed: 4 additions & 4 deletions
diff --git a/app/backend/core/keyvault_config.py b/app/backend/core/keyvault_config.py
Lines changed: 1 addition & 2 deletions
diff --git a/app/backend/load_azd_env.py b/app/backend/load_azd_env.py
Lines changed: 6 additions & 0 deletions
diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py
Lines changed: 12 additions & 0 deletions
@@ -192,3 +192,7 @@ desktop.ini
 .vscode/
 .idea/
 *.sublime-*
+
+tests/test_application.py
+tests/test_ollama_ocr.py
+tests/test_response_accuracy.py
@@ -84,6 +84,7 @@
     CONFIG_USER_BLOB_MANAGER,
     CONFIG_USER_UPLOAD_ENABLED,
     CONFIG_VECTOR_SEARCH_ENABLED,
+    OCR_ON_INGEST,
 )
 from core.authentication import AuthenticationHelper
 from core.sessionhelper import create_session_id
@@ -98,11 +99,14 @@
     setup_openai_client,
     setup_search_info,
 )
+from services.ocr_service import OCRService
 from prepdocslib.blobmanager import AdlsBlobManager, BlobManager
 from prepdocslib.embeddings import ImageEmbeddings
 from prepdocslib.filestrategy import UploadUserFileStrategy
 from prepdocslib.listfilestrategy import File
 
+logger = logging.getLogger(__name__)
+
 bp = Blueprint("routes", __name__, static_folder="static")
 # Fix Windows registry issue with mimetypes
 mimetypes.add_type("application/javascript", ".js")
@@ -699,13 +703,25 @@ async def setup_clients():
             vision_endpoint=AZURE_VISION_ENDPOINT,
             use_multimodal=USE_MULTIMODAL,
         )
+        
+        # Initialize OCR service for runtime uploads if enabled
+        ocr_service = None
+        if OCR_ON_INGEST:
+            ocr_candidate = OCRService()
+            if ocr_candidate.is_enabled():
+                ocr_service = ocr_candidate
+                logger.info("OCR service enabled for runtime user uploads")
+            else:
+                logger.warning("OCR_ON_INGEST is enabled but no OCR provider is configured; skipping OCR for runtime uploads.")
+        
         ingester = UploadUserFileStrategy(
             search_info=search_info,
             file_processors=file_processors,
             embeddings=text_embeddings_service,
             image_embeddings=image_embeddings_service,
             search_field_name_embedding=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
             blob_manager=user_blob_manager,
+            ocr_service=ocr_service,
         )
         current_app.config[CONFIG_INGESTER] = ingester
 
 
@@ -134,6 +134,16 @@ async def run_without_streaming(
         if overrides.get("suggest_followup_questions"):
             content, followup_questions = self.extract_followup_questions(content)
             extra_info.followup_questions = followup_questions
+        
+        # Filter citations to only include those actually used in the answer
+        from services.citation_filter import filter_citations_by_answer
+        if extra_info.data_points.citations and content:
+            filtered_citations = filter_citations_by_answer(
+                extra_info.data_points.citations,
+                content
+            )
+            extra_info.data_points.citations = filtered_citations
+        
         # Assume last thought is for generating answer
         if self.include_token_usage and extra_info.thoughts and chat_completion_response.usage:
             extra_info.thoughts[-1].update_token_usage(chat_completion_response.usage)
@@ -278,7 +288,9 @@ async def run_search_approach(
         self, messages: list[ChatCompletionMessageParam], overrides: dict[str, Any], auth_claims: dict[str, Any]
     ):
         # Phase 1B scaffolding: allow a simple 'mode' switch with safe defaults
-        mode = overrides.get("mode", "rag")  # rag | web | hybrid
+        # Default to hybrid mode when web search is enabled and a Serper API key is set; otherwise use rag
+        default_mode = "hybrid" if ENABLE_WEB_SEARCH and SERPER_API_KEY else "rag"
+        mode = overrides.get("mode", default_mode)  # rag | web | hybrid
 
         # Hybrid mode: merge RAG + Web results
         if mode == "hybrid":
 
@@ -15,10 +15,23 @@ system:
 {{ override_prompt }}
 {% else %}
 Assistant helps structural engineering company employees with their questions about technical documents, design standards, project specifications, and engineering practices. Be brief and technically accurate in your answers.
-Answer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below.
+
+You have access to two types of sources:
+1. **Corpus sources** (document files like .pdf, .txt) - These are from the company's internal knowledge base. Prioritize these when they contain relevant information.
+2. **Web sources** (URLs starting with http:// or https://) - These are from web search. Use these when corpus sources don't contain the answer.
+
+**Answering rules:**
+- **CRITICAL**: Only use and cite documents that are directly relevant to answering the specific question asked. If the question asks about a specific document or topic, ONLY use information from documents related to that topic.
+- If corpus sources contain the answer, use them and cite them with [filename.pdf#page=N] format.
+- If corpus sources don't contain the answer but web sources do, use web sources and cite them with [URL] format.
+- Only say "I don't know" if neither corpus nor web sources contain enough information to answer the question.
+- **CRITICAL**: Only cite documents that you actually use in your answer. If multiple documents are retrieved but only one is relevant to the question, ONLY use and cite that one document. Do not include information or citations from irrelevant documents, even if they were retrieved.
+- If a question asks about a specific document or document type (e.g., "code review documents"), only use information from documents of that type, not from unrelated documents.
+- Always cite your sources. Use square brackets to reference sources, for example [document.pdf#page=1] for corpus or [https://example.com] for web sources.
+- Don't combine sources; list each source separately, for example [doc1.pdf#page=1][doc2.pdf#page=2] or [https://site1.com][https://site2.com].
+
 You CANNOT ask clarifying questions to the user, since the user will have no way to reply.
 If the question is not in English, answer in the language used in the question.
-Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf].
 {% if image_sources %}
 Each image source has the document file name in the top left corner of the image with coordinates (10,10) pixels with format <filename.ext#page=N>,
 and the image figure name is right-aligned in the top right corner of the image.
@@ -27,7 +40,10 @@ Each text source starts in a new line and has the file name followed by colon an
 Always include the source document filename for each fact you use in the response in the format: [document_name.ext#page=N].
 If you are referencing an image, add the image filename in the format: [document_name.ext#page=N(image_name.png)].
 {% endif %}
-Possible citations for current question: {% for citation in citations %} [{{ citation }}] {% endfor %}
+Available sources for this question (you may not need all of them): {% for citation in citations %} [{{ citation }}] {% endfor %}
+
+**IMPORTANT**: Only cite documents that you actually use in your answer. Do not cite documents that don't contain relevant information for the question. If a document is retrieved but doesn't help answer the question, do not include it in your citations.
+
 {{ injected_prompt }}
 {% endif %}
 
 
@@ -21,10 +21,23 @@ system:
 {{ override_prompt }}
 {% else %}
 Assistant helps structural engineering company employees with their questions about technical documents, design standards, project specifications, and engineering practices. Be brief and technically accurate in your answers.
-Answer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below.
+
+You have access to two types of sources:
+1. **Corpus sources** (document files like .pdf, .txt) - These are from the company's internal knowledge base. Prioritize these when they contain relevant information.
+2. **Web sources** (URLs starting with http:// or https://) - These are from web search. Use these when corpus sources don't contain the answer.
+
+**Answering rules:**
+- **CRITICAL**: Only use and cite documents that are directly relevant to answering the specific question asked. If the question asks about a specific document or topic, ONLY use information from documents related to that topic.
+- If corpus sources contain the answer, use them and cite them with [filename.pdf#page=N] format.
+- If corpus sources don't contain the answer but web sources do, use web sources and cite them with [URL] format.
+- Only say "I don't know" if neither corpus nor web sources contain enough information to answer the question.
+- **CRITICAL**: Only cite documents that you actually use in your answer. If multiple documents are retrieved but only one is relevant to the question, ONLY use and cite that one document. Do not include information or citations from irrelevant documents, even if they were retrieved.
+- If a question asks about a specific document or document type (e.g., "code review documents"), only use information from documents of that type, not from unrelated documents.
+- Always cite your sources. Use square brackets to reference sources, for example [document.pdf#page=1] for corpus or [https://example.com] for web sources.
+- Don't combine sources; list each source separately, for example [doc1.pdf#page=1][doc2.pdf#page=2] or [https://site1.com][https://site2.com].
+
 If asking a clarifying question to the user would help, ask the question.
 If the question is not in English, answer in the language used in the question.
-Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf].
 {% if image_sources %}
 Each image source has the document file name in the top left corner of the image with coordinates (10,10) pixels with format <filename.ext#page=N>,
 and the image figure name is right-aligned in the top right corner of the image.
@@ -33,7 +46,10 @@ Each text source starts in a new line and has the file name followed by colon an
 Always include the source document filename for each fact you use in the response in the format: [document_name.ext#page=N].
 If you are referencing an image, add the image filename in the format: [document_name.ext#page=N(image_name.png)].
 {% endif %}
-Possible citations for current question: {% for citation in citations %} [{{ citation }}] {% endfor %}
+Available sources for this question (you may not need all of them): {% for citation in citations %} [{{ citation }}] {% endfor %}
+
+**IMPORTANT**: Only cite documents that you actually use in your answer. Do not cite documents that don't contain relevant information for the question. If a document is retrieved but doesn't help answer the question, do not include it in your citations.
+
 {{ injected_prompt }}
 {% endif %}
 
 
@@ -118,6 +118,17 @@ async def run(
                 response_token_limit=self.get_response_token_limit(self.chatgpt_model, 1024),
             ),
         )
+        answer_content = chat_completion.choices[0].message.content
+        
+        # Filter citations to only include those actually used in the answer
+        from services.citation_filter import filter_citations_by_answer
+        if extra_info.data_points.citations and answer_content:
+            filtered_citations = filter_citations_by_answer(
+                extra_info.data_points.citations,
+                answer_content
+            )
+            extra_info.data_points.citations = filtered_citations
+        
         extra_info.thoughts.append(
             self.format_thought_step_for_chatcompletion(
                 title="Prompt to generate answer",
@@ -130,7 +141,7 @@ async def run(
         )
         return {
             "message": {
-                "content": chat_completion.choices[0].message.content,
+                "content": answer_content,
                 "role": chat_completion.choices[0].message.role,
             },
             "context": {
 
@@ -46,11 +46,11 @@
 REDIS_URL = os.getenv("REDIS_URL")  # Optional Redis cache URL
 
 # OCR Configuration
-OCR_PROVIDER = os.getenv("OCR_PROVIDER", "none").lower()  # deepseek, azure_document_intelligence, none
+OCR_PROVIDER = os.getenv("OCR_PROVIDER", "none").lower()  # ollama, azure_document_intelligence, none
 OCR_ON_INGEST = os.getenv("OCR_ON_INGEST", "false").lower() == "true"  # Run OCR during document ingestion
-DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
-DEEPSEEK_BASE_URL = os.getenv("DEEPSEEK_BASE_URL", "https://api.deepseek.com/v1")
-DEEPSEEK_OCR_MODEL = os.getenv("DEEPSEEK_OCR_MODEL", "deepseek-ocr")
+OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434/v1")
+OLLAMA_OCR_MODEL = os.getenv("OLLAMA_OCR_MODEL", "llava:7b")  # Must be a vision-capable model (llava, bakllava, etc.)
+OLLAMA_TIMEOUT = int(os.getenv("OLLAMA_TIMEOUT", "120"))
 AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
 AZURE_DOCUMENT_INTELLIGENCE_KEY = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")
 AZURE_DOCUMENT_INTELLIGENCE_MODEL = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_MODEL", "prebuilt-read")
 
@@ -68,8 +68,7 @@ async def load_web_search_secrets(self) -> Dict[str, Optional[str]]:
             Dictionary with web search API keys
         """
         return await self.reader.get_secrets({
-            "SERPER_API_KEY": "SERPER_API_KEY",
-            "DEEPSEEK_API_KEY": "DEEPSEEK_API_KEY"
+            "SERPER_API_KEY": "SERPER_API_KEY"
         })
 
     async def load_all_secrets(self) -> Dict[str, Optional[str]]:
 
@@ -45,3 +45,9 @@ def load_azd_env():
     else:
         logger.info("Loading azd env from %s, which may override existing environment variables", env_file_path)
         load_dotenv(env_file_path, override=True)
+    
+    # Also load from local .env file as fallback (for variables not in azd env)
+    local_env_path = os.path.join(os.path.dirname(__file__), ".env")
+    if os.path.exists(local_env_path) and local_env_path != env_file_path:
+        logger.info("Also loading local .env from %s (as fallback for missing variables)", local_env_path)
+        load_dotenv(local_env_path, override=False)  # Don't override azd vars, but fill in missing ones
@@ -44,6 +44,8 @@
 from prepdocslib.strategy import DocumentAction, SearchInfo, Strategy
 from prepdocslib.textparser import TextParser
 from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter
+from services.ocr_service import OCRService
+from config import OCR_ON_INGEST
 
 logger = logging.getLogger("scripts")
 
@@ -650,6 +652,14 @@ async def main(strategy: Strategy, setup_index: bool = True):
             use_multimodal=use_multimodal,
         )
 
+        ocr_service = None
+        if OCR_ON_INGEST:
+            ocr_candidate = OCRService()
+            if ocr_candidate.is_enabled():
+                ocr_service = ocr_candidate
+            else:
+                logger.warning("OCR_ON_INGEST is enabled but no OCR provider is configured; skipping OCR.")
+
         ingestion_strategy = FileStrategy(
             search_info=search_info,
             list_file_strategy=list_file_strategy,
@@ -665,6 +675,8 @@ async def main(strategy: Strategy, setup_index: bool = True):
             category=args.category,
             use_content_understanding=use_content_understanding,
             content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"),
+            ocr_service=ocr_service,
+            ocr_on_ingest=ocr_service is not None and OCR_ON_INGEST,
         )
 
     try: