clstaudt
diff --git a/‎DEVELOPMENT_INSTRUCTIONS.md
Lines changed: 4 additions & 0 deletions b/‎DEVELOPMENT_INSTRUCTIONS.md
Lines changed: 4 additions & 0 deletions
diff --git a/‎app.py
Lines changed: 107 additions & 21 deletions b/‎app.py
Lines changed: 107 additions & 21 deletions
@@ -1,3 +1,7 @@
+
+- Make sure you never destroy existing functionality when adding new functionality, unless it is a replacement or the old functionality is no longer needed.
 - Never forget to update the conda environment config file when you update the requirements.txt
 - Make sure there are concise and up to date docstrings that document usage.
 - Debug information belongs into the command line logs, not in the app UI/UX.
+- Always develop a generic solution, do not use content from specific examples in the code
+- Never include content from example documents in the source code. Never leak content from provided examples into test code!
@@ -218,7 +218,7 @@ def render_document_upload(chat_manager):
             with st.spinner("Processing PDF..."):
                 extracted_text = PDFProcessor.extract_text(uploaded_file)
 
-            if extracted_text:
+            if extracted_text and extracted_text.strip():
                 # Update current chat with document
                 chat = chat_manager.get_current_chat()
                 chat.update({
@@ -232,10 +232,34 @@ def render_document_upload(chat_manager):
                 st.info(f"Extracted {len(extracted_text.split()):,} words")
                 st.rerun()
             else:
-                st.error("Could not extract text from PDF")
+                st.error("❌ **Document Processing Failed**")
+                st.error("Could not extract readable text from this PDF. This could be due to:")
+                st.markdown("""
+                - The PDF contains only images or scanned content
+                - The PDF is corrupted or password-protected
+                - The PDF format is not supported
+                
+                **Please try:**
+                - A different PDF document with selectable text
+                - Converting scanned PDFs to text-searchable format first
+                - Ensuring the PDF is not password-protected
+                """)
+                st.info("💡 The chat interface will remain disabled until a valid document is uploaded.")
         except Exception as e:
+            st.error("❌ **Document Processing Failed**")
             st.error(f"Error processing file: {e}")
-            st.info("💡 If this file was uploaded before, try clicking '🗑️ Clear Upload' and upload again")
+            st.markdown("""
+            **This error occurred while trying to process your PDF. Common causes:**
+            - File corruption or invalid PDF format
+            - Insufficient memory for large files
+            - Network issues during upload
+            
+            **Please try:**
+            - Uploading a different PDF file
+            - Clicking '🗑️ Clear Upload' and trying again
+            - Ensuring the file is a valid PDF document
+            """)
+            st.info("💡 The chat interface will remain disabled until a valid document is uploaded.")
 
 def render_chat_interface(chat_manager):
     """Render the main chat interface"""
@@ -261,8 +285,9 @@ def render_chat_interface(chat_manager):
         with st.chat_message(message["role"]):
             st.markdown(message["content"])
 
-    # Chat input
-    if chat.get("document_text"):
+    # Chat input - only show if document text is valid
+    document_text = chat.get("document_text", "")
+    if document_text and document_text.strip():
         if prompt := st.chat_input("Ask about your document..."):
             if not st.session_state.selected_model:
                 st.warning("Please select a model first")
@@ -285,13 +310,52 @@ def render_chat_interface(chat_manager):
 
                 except Exception as e:
                     st.error(f"Error generating response: {e}")
+    else:
+        # Show message when document processing failed
+        st.warning("⚠️ **Chat Disabled**: No valid document content available. Please upload a PDF document with readable text to start chatting.")
+        st.info("The document may have failed to process, or the extracted text may be empty. Try uploading a different PDF file.")
+
+
 
 def generate_ai_response(prompt, document_text):
     """Generate AI response using Ollama with reasoning support"""
-    system_prompt = f"""Answer questions based on this document:
+    
+    # Check if document text is empty or None
+    if not document_text or not document_text.strip():
+        return "I apologize, but I cannot answer your question because the document could not be processed or contains no readable text. Please try uploading a different PDF document."
+    
+    system_prompt = f"""You are a document analysis assistant. You MUST ONLY answer questions that can be directly supported with citations from the provided document.
 
+DOCUMENT CONTENT:
 {document_text}
 
+CRITICAL INSTRUCTIONS:
+1. ALWAYS respond in the same language as the user's question, regardless of the document language
+2. You may reason about and analyze the information in the document, but ALL reasoning must be grounded in content that can be cited from the document
+3. Every factual claim in your answer MUST be supported by at least one verbatim citation from the document
+4. You may draw logical conclusions and make inferences, but only based on information explicitly present in the document
+5. NEVER use your training data, general knowledge, or external information - base all reasoning solely on the document content
+
+RESPONSE LOGIC - CHOOSE ONE PATH:
+PATH A - ANSWER WITH CITATIONS:
+- If you can find information in the document to answer the question, provide a complete answer
+- Support every factual claim with exact citations from the document
+- Use the required citation format shown below
+
+PATH B - DECLINE TO ANSWER:
+- If you cannot find sufficient information in the document to answer the question, decline to answer
+- Explain that the information is not available in the provided document
+- Do NOT include any citations when declining to answer
+- Do NOT reference any specific text from the document when declining
+
+CRITICAL: Never mix these paths. Either answer with full citations OR decline without any citations. Never decline while providing citations - this is contradictory.
+
+MANDATORY CITATION REQUIREMENT:
+- Every factual claim in your answer MUST be backed by a verbatim citation from the document
+- You may reason and analyze, but the underlying facts must be cited exactly as they appear in the document
+- If you cannot provide verbatim citations to support your reasoning, do NOT provide the answer
+- Citations must be exact quotes from the document, not paraphrases or interpretations
+
 CRITICAL CITATION FORMAT:
 You MUST use citations in this EXACT format for text highlighting to work:
 
@@ -302,23 +366,45 @@ def generate_ai_response(prompt, document_text):
 [1] "exact quote from document"
 [2] "another exact quote from document"
 
-EXAMPLE:
+EXAMPLE OF PATH A - ANSWER WITH CITATIONS:
 Question: Does he have experience in the medical field?
-Answer: Yes, Christian Staudt has experience in the medical field. [1] [2]
+Answer: Yes, the document shows he has experience in medical applications. [1]
+
+[1] "project development for AI applications: medical data mining & AI"
+
+EXAMPLE OF PATH B - DECLINE TO ANSWER:
+Question: What is his favorite programming language?
+Answer: I cannot answer this question based on the information provided in the document. The document does not contain information about programming language preferences.
 
-[1] "project development for AI applications: medical data mining & AI, AI for renewable energy control"
-[2] "developing a prototype for data-driven measurement of global marketing campaign performance across channels"
+INVALID EXAMPLE (DO NOT DO THIS):
+Question: What are his hobbies?
+Answer: I cannot answer this question based on the document. [1]
+[1] "some text from document"
+^ This is WRONG - never decline while providing citations!
 
-RULES:
+CITATION RULES:
 - Citations MUST start at the beginning of a line
 - Citations MUST use the format [number] "quote" 
-- Use exact quotes from the document, not paraphrases
+- Use exact quotes from the document in their ORIGINAL language - NEVER translate citations
 - Each citation on its own line
 - Do NOT use colons, "Exact quote:", or other text before the quote
-- IMPORTANT: Quote only the SPECIFIC text that directly answers the question, not entire sentences or paragraphs
-- For time/date questions, quote only the relevant time/date, not the entire schedule line
-- For specific facts, quote only the relevant fact, not surrounding context
-- Keep quotes focused and precise to ensure accurate highlighting"""
+- IMPORTANT: Quote meaningful phrases with context, not isolated words or numbers
+- Always include descriptive context around numbers, percentages, or measurements
+- Avoid quoting standalone numbers - always include the surrounding descriptive words
+- Keep quotes focused but meaningful - aim for 3-8 words that capture the complete idea
+- Prioritize phrases that directly answer the user's question with sufficient context for highlighting
+
+LANGUAGE RULES:
+- Respond to the user in the same language as their question
+- Your explanatory text, reasoning, and analysis should be in the user's language
+- Citations must remain in the original document language - do NOT translate them
+- Example: If user asks in English about a German document, respond in English but keep German citations
+
+STRICT RULES: 
+- If you cannot provide citations from the document for your answer, you MUST decline to answer
+- Do NOT provide any information from your training data
+- NEVER mix declining to answer with providing citations - this is contradictory
+- Either answer with citations OR decline without citations - never both"""
 
     messages = [
         {"role": "system", "content": system_prompt},
@@ -398,10 +484,8 @@ def generate_ai_response(prompt, document_text):
                     answer_placeholder.markdown(answer_content)
 
         # Return the final answer (without reasoning tags) for storage
-        if reasoning_started:
-            return answer_content
-        else:
-            return full_response
+        final_answer = answer_content if reasoning_started else full_response
+        return final_answer
 
     except Exception as e:
         st.error(f"Error during streaming: {e}")
@@ -448,7 +532,9 @@ def main():
 
     # Main content
     chat = chat_manager.get_current_chat()
-    if not chat.get("document_text"):
+    document_text = chat.get("document_text", "")
+    
+    if not document_text or not document_text.strip():
         render_document_upload(chat_manager)
     else:
         render_chat_interface(chat_manager)