citation improvements

clstaudt · clstaudt · commit 752a7f00ac50 · 2025-05-26T15:53:32.000+02:00
diff --git a/DEVELOPMENT_INSTRUCTIONS.md b/DEVELOPMENT_INSTRUCTIONS.md
@@ -1,4 +1,3 @@
 - Never forget to update the conda environment config file when you update the requirements.txt
 - Make sure there are concise and up to date docstrings that document usage.
 - Debug information belongs into the command line logs, not in the app UI/UX.
-- 
diff --git a/app.py b/app.py
@@ -281,7 +281,7 @@ def render_chat_interface(chat_manager):
                     chat_manager.add_message("assistant", response)
                     
                     # Show citations
-                    show_citations(response, chat)
+                    show_citations(response, chat, prompt)
                         
                 except Exception as e:
                     st.error(f"Error generating response: {e}")
@@ -314,7 +314,11 @@ def generate_ai_response(prompt, document_text):
 - Citations MUST use the format [number] "quote" 
 - Use exact quotes from the document, not paraphrases
 - Each citation on its own line
-- Do NOT use colons, "Exact quote:", or other text before the quote"""
+- Do NOT use colons, "Exact quote:", or other text before the quote
+- IMPORTANT: Quote only the SPECIFIC text that directly answers the question, not entire sentences or paragraphs
+- For time/date questions, quote only the relevant time/date, not the entire schedule line
+- For specific facts, quote only the relevant fact, not surrounding context
+- Keep quotes focused and precise to ensure accurate highlighting"""
     
     messages = [
         {"role": "system", "content": system_prompt},
@@ -403,13 +407,13 @@ def generate_ai_response(prompt, document_text):
         st.error(f"Error during streaming: {e}")
         return ""
 
-def show_citations(response, chat):
+def show_citations(response, chat, user_question=""):
     """Show citation-based references"""
     if chat.get("document_content"):
         try:
             pdf_processor = EnhancedPDFProcessor(chat["document_content"])
             pdf_processor.display_citation_based_references(
-                response, chat["document_text"]
+                response, chat["document_text"], user_question
             )
         except Exception as e:
             st.warning(f"Could not show citations: {e}")
diff --git a/ragnarok/enhanced_pdf_processor.py b/ragnarok/enhanced_pdf_processor.py
@@ -92,10 +92,11 @@ def display_citation_based_references(
         self,
         ai_response: str,
         original_text: str,
+        user_question: str = "",
     ) -> int:
         """Display highlighted document for citations found in AI response"""
         # Extract quotes from AI response
-        citation_quotes = self._extract_quotes_from_ai_response(ai_response)
+        citation_quotes = self._extract_quotes_from_ai_response(ai_response, user_question)
 
         # Log debug information instead of showing in UI
         if not citation_quotes:
@@ -165,7 +166,7 @@ def display_citation_based_references(
             st.caption("💬 No citations found in response")
             return 0
 
-    def _extract_quotes_from_ai_response(self, ai_response: str) -> Dict[int, str]:
+    def _extract_quotes_from_ai_response(self, ai_response: str, user_question: str = "") -> Dict[int, str]:
         """Extract numbered quotes from AI response using multiple patterns"""
         citation_quotes = {}
 
@@ -176,7 +177,9 @@ def _extract_quotes_from_ai_response(self, ai_response: str) -> Dict[int, str]:
         for match in matches1:
             citation_num = int(match[0])
             quote_text = match[1].strip()
-            citation_quotes[citation_num] = quote_text
+            # Try to extract more focused quotes for long citations
+            focused_quote = self._extract_focused_quote(quote_text, ai_response, user_question)
+            citation_quotes[citation_num] = focused_quote
 
         # Pattern 2: [1]: "exact quote" - legacy format with colon
         if not citation_quotes:
@@ -186,15 +189,17 @@ def _extract_quotes_from_ai_response(self, ai_response: str) -> Dict[int, str]:
             for match in matches2:
                 citation_num = int(match[0])
                 quote_text = match[1].strip()
-                citation_quotes[citation_num] = quote_text
+                focused_quote = self._extract_focused_quote(quote_text, ai_response, user_question)
+                citation_quotes[citation_num] = focused_quote
 
         # Pattern 3: [Exact quote: "text"] - current problematic format
         if not citation_quotes:
             pattern3 = r'\[Exact quote:\s*"([^"]+)"\]'
             matches3 = re.findall(pattern3, ai_response, re.IGNORECASE)
             
             for i, quote_text in enumerate(matches3, 1):
-                citation_quotes[i] = quote_text.strip()
+                focused_quote = self._extract_focused_quote(quote_text.strip(), ai_response, user_question)
+                citation_quotes[i] = focused_quote
 
         # Pattern 3b: "text" in brackets without "Exact quote:" prefix
         if not citation_quotes:
@@ -203,7 +208,8 @@ def _extract_quotes_from_ai_response(self, ai_response: str) -> Dict[int, str]:
             
             for i, quote_text in enumerate(matches3b, 1):
                 if len(quote_text.strip()) > 15:  # Only substantial quotes
-                    citation_quotes[i] = quote_text.strip()
+                    focused_quote = self._extract_focused_quote(quote_text.strip(), ai_response, user_question)
+                    citation_quotes[i] = focused_quote
 
         # Pattern 4: Any text in double quotes as fallback
         if not citation_quotes:
@@ -214,14 +220,95 @@ def _extract_quotes_from_ai_response(self, ai_response: str) -> Dict[int, str]:
                 # Only use if it looks like a substantial quote
                 cleaned = quote_text.strip()
                 if len(cleaned) > 15 and not cleaned.startswith('http'):
-                    citation_quotes[i] = cleaned
+                    focused_quote = self._extract_focused_quote(cleaned, ai_response, user_question)
+                    citation_quotes[i] = focused_quote
 
         return citation_quotes
 
+    def _extract_focused_quote(self, quote_text: str, ai_response: str, user_question: str = "") -> str:
+        """Return the most relevant part of a long quote.
+
+        Quotes of at most 10 words are returned unchanged. For longer quotes,
+        topic keywords are picked by regex hits on ``user_question`` plus
+        ``ai_response``; 3/5/7/10-word windows of the quote are scored (+1 per
+        keyword found as a substring, +2 for an HH:MM time, +2 for a date) and
+        the first best-scoring window is returned if it is shorter than 70% of
+        the quote. Otherwise the text around the first HH:MM time that fits in
+        10 words is returned, then the first 15 words plus "...", and finally
+        the unchanged quote.
+
+        Args:
+            quote_text: Quote extracted from the AI response.
+            ai_response: Full AI response text, searched for topic words.
+            user_question: The user's question, searched for topic words.
+        """
+        words = quote_text.split()
+        if len(words) <= 10:
+            return quote_text
+
+        # Question and answer are searched together to decide the topic.
+        combined_text = f"{user_question} {ai_response}"
+        topic_rules = [
+            (r'\barrive\b|\barrival\b', ['arrive', 'arrival', 'ankunft']),
+            (r'\bdepart\b|\bdeparture\b', ['depart', 'departure', 'abfahrt']),
+            (r'\btime\b|\bwhen\b', ['time', 'uhrzeit']),
+            (r'\bdate\b', ['date']),
+            (r'\bprice\b|\bcost\b', ['price', 'cost', 'euro', '€']),
+        ]
+        question_keywords = [
+            keyword
+            for pattern, keywords in topic_rules
+            if re.search(pattern, combined_text, re.IGNORECASE)
+            for keyword in keywords
+        ]
+
+        if question_keywords:
+            best_segment = quote_text  # fallback
+            best_score = 0
+
+            for segment_size in (3, 5, 7, 10):
+                # len(words) > 10 here, so every size yields at least one window.
+                for i in range(len(words) - segment_size + 1):
+                    segment = " ".join(words[i:i + segment_size])
+                    lowered = segment.lower()
+                    score = sum(1 for keyword in question_keywords if keyword in lowered)
+                    if re.search(r'\d{1,2}:\d{2}', segment):  # Time pattern
+                        score += 2
+                    if re.search(r'\d{1,2}[/.-]\d{1,2}[/.-]\d{2,4}', segment):  # Date pattern
+                        score += 2
+                    # Strict ">" keeps the earliest, shortest window on ties.
+                    if score > best_score:
+                        best_score = score
+                        best_segment = segment
+
+            if best_score > 0 and len(best_segment.split()) < len(words) * 0.7:
+                return best_segment
+
+        # Fallback: a time with nearby context. finditer gives each match its
+        # own position (str.find would always land on the first occurrence).
+        time_pattern = r'\d{1,2}:\d{2}(?:\s*-\s*\d{1,2}/\d{1,2}/\d{4})?'
+        for match in re.finditer(time_pattern, quote_text):
+            start = max(0, match.start() - 20)
+            end = min(len(quote_text), match.end() + 20)
+            # Shrink to whitespace so the window never cuts a word in half.
+            while 0 < start < match.start() and not quote_text[start - 1].isspace():
+                start += 1
+            while match.end() < end < len(quote_text) and not quote_text[end].isspace():
+                end -= 1
+            context = quote_text[start:end].strip()
+            if len(context.split()) <= 10:
+                return context
+
+        # NOTE(review): the "..." is not document text; confirm that callers
+        # searching the PDF for this string tolerate it.
+        if len(words) > 15:
+            return " ".join(words[:15]) + "..."
+        return quote_text
+
     def _create_highlighted_pdf(
         self, search_terms: List[str]
     ) -> Tuple[bytes, Optional[int]]:
-        """Create highlighted PDF with simple highlighting"""
+        """Create highlighted PDF with smart highlighting"""
         highlighted_doc = fitz.open(stream=self.pdf_bytes, filetype="pdf")
         first_highlight_page = None
 
@@ -241,18 +328,60 @@ def _create_highlighted_pdf(
                             if first_highlight_page is None:
                                 first_highlight_page = page_num + 1
                     else:
-                        # Try to find partial matches for longer quotes
+                        # For long quotes, try smart highlighting
                         if len(term.split()) >= 5:
-                            self._highlight_partial_matches(page, term)
-                            if first_highlight_page is None:
+                            found = self._smart_highlight_long_quote(page, term)
+                            if found and first_highlight_page is None:
                                 first_highlight_page = page_num + 1
 
             return highlighted_doc.tobytes(), first_highlight_page
 
         finally:
             highlighted_doc.close()
 
-    def _highlight_partial_matches(self, page, term: str):
+    def _smart_highlight_long_quote(self, page, term: str) -> bool:
+        """Highlight the key facts of a long quote on ``page``.
+
+        Times (HH:MM), dates and standalone numbers with two or more digits are
+        taken from ``term`` and every occurrence on the page is highlighted in
+        green. If none of them occurs on the page, the call falls back to
+        ``_highlight_partial_matches``.
+
+        Args:
+            page: Page object providing ``search_for`` and ``add_highlight_annot``.
+            term: Quote text to pick the key facts from.
+
+        Returns:
+            True if anything was highlighted, otherwise False.
+        """
+        time_pattern = r'\b\d{1,2}:\d{2}\b'
+        date_pattern = r'\b\d{1,2}[/.-]\d{1,2}[/.-]\d{2,4}\b'
+        times = re.findall(time_pattern, term)
+        dates = re.findall(date_pattern, term)
+
+        # Blank out times and dates before collecting numbers; otherwise
+        # "10:30" also yields "10" and "30", which match all over the page.
+        remainder = re.sub(date_pattern, " ", re.sub(time_pattern, " ", term))
+        # Single digits are too common to be useful as key information.
+        numbers = [n for n in re.findall(r'\b\d+\b', remainder) if len(n) >= 2]
+
+        found_any = False
+        # dict.fromkeys drops repeated patterns (keeping order) so the same
+        # text is not annotated twice.
+        for pattern in dict.fromkeys(times + dates + numbers):
+            for inst in page.search_for(pattern, quads=True):
+                highlight = page.add_highlight_annot(inst)
+                highlight.set_colors(stroke=(0, 1, 0))  # Green for key info
+                highlight.update()
+                found_any = True
+
+        if found_any:
+            return True
+
+        # No key fact on this page: fall back to partial matching.
+        return self._highlight_partial_matches(page, term)
+
+    def _highlight_partial_matches(self, page, term: str) -> bool:
         """Find and highlight partial matches for longer quotes"""
         words = term.split()
 
@@ -269,7 +398,9 @@ def _highlight_partial_matches(self, page, term: str):
                             stroke=(1, 0.8, 0)
                         )  # Orange for partial matches
                         highlight.update()
-                    return  # Found something, stop here
+                    return True  # Found something, stop here
+        
+        return False  # Nothing found
 
     def __del__(self):
         """Clean up document resources"""
@@ -284,7 +415,7 @@ def process_pdf_with_highlighting(pdf_bytes: bytes) -> EnhancedPDFProcessor:
 
 
 def highlight_ai_referenced_text(
-    pdf_processor: EnhancedPDFProcessor, ai_response: str, original_text: str
+    pdf_processor: EnhancedPDFProcessor, ai_response: str, original_text: str, user_question: str = ""
 ):
     """Legacy function for backward compatibility"""
-    return pdf_processor.display_citation_based_references(ai_response, original_text)
+    return pdf_processor.display_citation_based_references(ai_response, original_text, user_question)