Skip to content

Commit f0ecc64

Browse files
committed
prompt & highlighting optimized
1 parent 752a7f0 commit f0ecc64

File tree

4 files changed

+348
-73
lines changed

4 files changed

+348
-73
lines changed

DEVELOPMENT_INSTRUCTIONS.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
2+
- Make sure you never destroy existing functionality when adding new functionality, unless it is a replacement or the old functionality is no longer needed.
13
- Never forget to update the conda environment config file when you update requirements.txt.
24
- Make sure there are concise, up-to-date docstrings that document usage.
35
- Debug information belongs in the command-line logs, not in the app UI/UX.
6+
- Always develop a generic solution; do not use content from specific examples in the code.
7+
- Never include content from example documents in the source code. Never leak content from provided examples into test code!

app.py

Lines changed: 107 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ def render_document_upload(chat_manager):
218218
with st.spinner("Processing PDF..."):
219219
extracted_text = PDFProcessor.extract_text(uploaded_file)
220220

221-
if extracted_text:
221+
if extracted_text and extracted_text.strip():
222222
# Update current chat with document
223223
chat = chat_manager.get_current_chat()
224224
chat.update({
@@ -232,10 +232,34 @@ def render_document_upload(chat_manager):
232232
st.info(f"Extracted {len(extracted_text.split()):,} words")
233233
st.rerun()
234234
else:
235-
st.error("Could not extract text from PDF")
235+
st.error("❌ **Document Processing Failed**")
236+
st.error("Could not extract readable text from this PDF. This could be due to:")
237+
st.markdown("""
238+
- The PDF contains only images or scanned content
239+
- The PDF is corrupted or password-protected
240+
- The PDF format is not supported
241+
242+
**Please try:**
243+
- A different PDF document with selectable text
244+
- Converting scanned PDFs to text-searchable format first
245+
- Ensuring the PDF is not password-protected
246+
""")
247+
st.info("💡 The chat interface will remain disabled until a valid document is uploaded.")
236248
except Exception as e:
249+
st.error("❌ **Document Processing Failed**")
237250
st.error(f"Error processing file: {e}")
238-
st.info("💡 If this file was uploaded before, try clicking '🗑️ Clear Upload' and upload again")
251+
st.markdown("""
252+
**This error occurred while trying to process your PDF. Common causes:**
253+
- File corruption or invalid PDF format
254+
- Insufficient memory for large files
255+
- Network issues during upload
256+
257+
**Please try:**
258+
- Uploading a different PDF file
259+
- Clicking '🗑️ Clear Upload' and trying again
260+
- Ensuring the file is a valid PDF document
261+
""")
262+
st.info("💡 The chat interface will remain disabled until a valid document is uploaded.")
239263

240264
def render_chat_interface(chat_manager):
241265
"""Render the main chat interface"""
@@ -261,8 +285,9 @@ def render_chat_interface(chat_manager):
261285
with st.chat_message(message["role"]):
262286
st.markdown(message["content"])
263287

264-
# Chat input
265-
if chat.get("document_text"):
288+
# Chat input - only show if document text is valid
289+
document_text = chat.get("document_text", "")
290+
if document_text and document_text.strip():
266291
if prompt := st.chat_input("Ask about your document..."):
267292
if not st.session_state.selected_model:
268293
st.warning("Please select a model first")
@@ -285,13 +310,52 @@ def render_chat_interface(chat_manager):
285310

286311
except Exception as e:
287312
st.error(f"Error generating response: {e}")
313+
else:
314+
# Show message when document processing failed
315+
st.warning("⚠️ **Chat Disabled**: No valid document content available. Please upload a PDF document with readable text to start chatting.")
316+
st.info("The document may have failed to process, or the extracted text may be empty. Try uploading a different PDF file.")
317+
318+
288319

289320
def generate_ai_response(prompt, document_text):
290321
"""Generate AI response using Ollama with reasoning support"""
291-
system_prompt = f"""Answer questions based on this document:
322+
323+
# Check if document text is empty or None
324+
if not document_text or not document_text.strip():
325+
return "I apologize, but I cannot answer your question because the document could not be processed or contains no readable text. Please try uploading a different PDF document."
326+
327+
system_prompt = f"""You are a document analysis assistant. You MUST ONLY answer questions that can be directly supported with citations from the provided document.
292328
329+
DOCUMENT CONTENT:
293330
{document_text}
294331
332+
CRITICAL INSTRUCTIONS:
333+
1. ALWAYS respond in the same language as the user's question, regardless of the document language
334+
2. You may reason about and analyze the information in the document, but ALL reasoning must be grounded in content that can be cited from the document
335+
3. Every factual claim in your answer MUST be supported by at least one verbatim citation from the document
336+
4. You may draw logical conclusions and make inferences, but only based on information explicitly present in the document
337+
5. NEVER use your training data, general knowledge, or external information - base all reasoning solely on the document content
338+
339+
RESPONSE LOGIC - CHOOSE ONE PATH:
340+
PATH A - ANSWER WITH CITATIONS:
341+
- If you can find information in the document to answer the question, provide a complete answer
342+
- Support every factual claim with exact citations from the document
343+
- Use the required citation format shown below
344+
345+
PATH B - DECLINE TO ANSWER:
346+
- If you cannot find sufficient information in the document to answer the question, decline to answer
347+
- Explain that the information is not available in the provided document
348+
- Do NOT include any citations when declining to answer
349+
- Do NOT reference any specific text from the document when declining
350+
351+
CRITICAL: Never mix these paths. Either answer with full citations OR decline without any citations. Never decline while providing citations - this is contradictory.
352+
353+
MANDATORY CITATION REQUIREMENT:
354+
- Every factual claim in your answer MUST be backed by a verbatim citation from the document
355+
- You may reason and analyze, but the underlying facts must be cited exactly as they appear in the document
356+
- If you cannot provide verbatim citations to support your reasoning, do NOT provide the answer
357+
- Citations must be exact quotes from the document, not paraphrases or interpretations
358+
295359
CRITICAL CITATION FORMAT:
296360
You MUST use citations in this EXACT format for text highlighting to work:
297361
@@ -302,23 +366,45 @@ def generate_ai_response(prompt, document_text):
302366
[1] "exact quote from document"
303367
[2] "another exact quote from document"
304368
305-
EXAMPLE:
369+
EXAMPLE OF PATH A - ANSWER WITH CITATIONS:
306370
Question: Does he have experience in the medical field?
307-
Answer: Yes, Christian Staudt has experience in the medical field. [1] [2]
371+
Answer: Yes, the document shows he has experience in medical applications. [1]
372+
373+
[1] "project development for AI applications: medical data mining & AI"
374+
375+
EXAMPLE OF PATH B - DECLINE TO ANSWER:
376+
Question: What is his favorite programming language?
377+
Answer: I cannot answer this question based on the information provided in the document. The document does not contain information about programming language preferences.
308378
309-
[1] "project development for AI applications: medical data mining & AI, AI for renewable energy control"
310-
[2] "developing a prototype for data-driven measurement of global marketing campaign performance across channels"
379+
INVALID EXAMPLE (DO NOT DO THIS):
380+
Question: What are his hobbies?
381+
Answer: I cannot answer this question based on the document. [1]
382+
[1] "some text from document"
383+
^ This is WRONG - never decline while providing citations!
311384
312-
RULES:
385+
CITATION RULES:
313386
- Citations MUST start at the beginning of a line
314387
- Citations MUST use the format [number] "quote"
315-
- Use exact quotes from the document, not paraphrases
388+
- Use exact quotes from the document in their ORIGINAL language - NEVER translate citations
316389
- Each citation on its own line
317390
- Do NOT use colons, "Exact quote:", or other text before the quote
318-
- IMPORTANT: Quote only the SPECIFIC text that directly answers the question, not entire sentences or paragraphs
319-
- For time/date questions, quote only the relevant time/date, not the entire schedule line
320-
- For specific facts, quote only the relevant fact, not surrounding context
321-
- Keep quotes focused and precise to ensure accurate highlighting"""
391+
- IMPORTANT: Quote meaningful phrases with context, not isolated words or numbers
392+
- Always include descriptive context around numbers, percentages, or measurements
393+
- Avoid quoting standalone numbers - always include the surrounding descriptive words
394+
- Keep quotes focused but meaningful - aim for 3-8 words that capture the complete idea
395+
- Prioritize phrases that directly answer the user's question with sufficient context for highlighting
396+
397+
LANGUAGE RULES:
398+
- Respond to the user in the same language as their question
399+
- Your explanatory text, reasoning, and analysis should be in the user's language
400+
- Citations must remain in the original document language - do NOT translate them
401+
- Example: If user asks in English about a German document, respond in English but keep German citations
402+
403+
STRICT RULES:
404+
- If you cannot provide citations from the document for your answer, you MUST decline to answer
405+
- Do NOT provide any information from your training data
406+
- NEVER mix declining to answer with providing citations - this is contradictory
407+
- Either answer with citations OR decline without citations - never both"""
322408

323409
messages = [
324410
{"role": "system", "content": system_prompt},
@@ -398,10 +484,8 @@ def generate_ai_response(prompt, document_text):
398484
answer_placeholder.markdown(answer_content)
399485

400486
# Return the final answer (without reasoning tags) for storage
401-
if reasoning_started:
402-
return answer_content
403-
else:
404-
return full_response
487+
final_answer = answer_content if reasoning_started else full_response
488+
return final_answer
405489

406490
except Exception as e:
407491
st.error(f"Error during streaming: {e}")
@@ -448,7 +532,9 @@ def main():
448532

449533
# Main content
450534
chat = chat_manager.get_current_chat()
451-
if not chat.get("document_text"):
535+
document_text = chat.get("document_text", "")
536+
537+
if not document_text or not document_text.strip():
452538
render_document_upload(chat_manager)
453539
else:
454540
render_chat_interface(chat_manager)

0 commit comments

Comments
 (0)