Skip to content

Commit 752a7f0

Browse files
committed
citation improvements
1 parent c784366 commit 752a7f0

File tree

3 files changed

+154
-20
lines changed

3 files changed

+154
-20
lines changed

DEVELOPMENT_INSTRUCTIONS.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
11
- Never forget to update the conda environment config file when you update the requirements.txt
22
- Make sure there are concise and up-to-date docstrings that document usage.
33
- Debug information belongs in the command-line logs, not in the app UI/UX.
4-
-

app.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,7 @@ def render_chat_interface(chat_manager):
281281
chat_manager.add_message("assistant", response)
282282

283283
# Show citations
284-
show_citations(response, chat)
284+
show_citations(response, chat, prompt)
285285

286286
except Exception as e:
287287
st.error(f"Error generating response: {e}")
@@ -314,7 +314,11 @@ def generate_ai_response(prompt, document_text):
314314
- Citations MUST use the format [number] "quote"
315315
- Use exact quotes from the document, not paraphrases
316316
- Each citation on its own line
317-
- Do NOT use colons, "Exact quote:", or other text before the quote"""
317+
- Do NOT use colons, "Exact quote:", or other text before the quote
318+
- IMPORTANT: Quote only the SPECIFIC text that directly answers the question, not entire sentences or paragraphs
319+
- For time/date questions, quote only the relevant time/date, not the entire schedule line
320+
- For specific facts, quote only the relevant fact, not surrounding context
321+
- Keep quotes focused and precise to ensure accurate highlighting"""
318322

319323
messages = [
320324
{"role": "system", "content": system_prompt},
@@ -403,13 +407,13 @@ def generate_ai_response(prompt, document_text):
403407
st.error(f"Error during streaming: {e}")
404408
return ""
405409

406-
def show_citations(response, chat):
410+
def show_citations(response, chat, user_question=""):
407411
"""Show citation-based references"""
408412
if chat.get("document_content"):
409413
try:
410414
pdf_processor = EnhancedPDFProcessor(chat["document_content"])
411415
pdf_processor.display_citation_based_references(
412-
response, chat["document_text"]
416+
response, chat["document_text"], user_question
413417
)
414418
except Exception as e:
415419
st.warning(f"Could not show citations: {e}")

ragnarok/enhanced_pdf_processor.py

Lines changed: 146 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -92,10 +92,11 @@ def display_citation_based_references(
9292
self,
9393
ai_response: str,
9494
original_text: str,
95+
user_question: str = "",
9596
) -> int:
9697
"""Display highlighted document for citations found in AI response"""
9798
# Extract quotes from AI response
98-
citation_quotes = self._extract_quotes_from_ai_response(ai_response)
99+
citation_quotes = self._extract_quotes_from_ai_response(ai_response, user_question)
99100

100101
# Log debug information instead of showing in UI
101102
if not citation_quotes:
@@ -165,7 +166,7 @@ def display_citation_based_references(
165166
st.caption("💬 No citations found in response")
166167
return 0
167168

168-
def _extract_quotes_from_ai_response(self, ai_response: str) -> Dict[int, str]:
169+
def _extract_quotes_from_ai_response(self, ai_response: str, user_question: str = "") -> Dict[int, str]:
169170
"""Extract numbered quotes from AI response using multiple patterns"""
170171
citation_quotes = {}
171172

@@ -176,7 +177,9 @@ def _extract_quotes_from_ai_response(self, ai_response: str) -> Dict[int, str]:
176177
for match in matches1:
177178
citation_num = int(match[0])
178179
quote_text = match[1].strip()
179-
citation_quotes[citation_num] = quote_text
180+
# Try to extract more focused quotes for long citations
181+
focused_quote = self._extract_focused_quote(quote_text, ai_response, user_question)
182+
citation_quotes[citation_num] = focused_quote
180183

181184
# Pattern 2: [1]: "exact quote" - legacy format with colon
182185
if not citation_quotes:
@@ -186,15 +189,17 @@ def _extract_quotes_from_ai_response(self, ai_response: str) -> Dict[int, str]:
186189
for match in matches2:
187190
citation_num = int(match[0])
188191
quote_text = match[1].strip()
189-
citation_quotes[citation_num] = quote_text
192+
focused_quote = self._extract_focused_quote(quote_text, ai_response, user_question)
193+
citation_quotes[citation_num] = focused_quote
190194

191195
# Pattern 3: [Exact quote: "text"] - current problematic format
192196
if not citation_quotes:
193197
pattern3 = r'\[Exact quote:\s*"([^"]+)"\]'
194198
matches3 = re.findall(pattern3, ai_response, re.IGNORECASE)
195199

196200
for i, quote_text in enumerate(matches3, 1):
197-
citation_quotes[i] = quote_text.strip()
201+
focused_quote = self._extract_focused_quote(quote_text.strip(), ai_response, user_question)
202+
citation_quotes[i] = focused_quote
198203

199204
# Pattern 3b: "text" in brackets without "Exact quote:" prefix
200205
if not citation_quotes:
@@ -203,7 +208,8 @@ def _extract_quotes_from_ai_response(self, ai_response: str) -> Dict[int, str]:
203208

204209
for i, quote_text in enumerate(matches3b, 1):
205210
if len(quote_text.strip()) > 15: # Only substantial quotes
206-
citation_quotes[i] = quote_text.strip()
211+
focused_quote = self._extract_focused_quote(quote_text.strip(), ai_response, user_question)
212+
citation_quotes[i] = focused_quote
207213

208214
# Pattern 4: Any text in double quotes as fallback
209215
if not citation_quotes:
@@ -214,14 +220,95 @@ def _extract_quotes_from_ai_response(self, ai_response: str) -> Dict[int, str]:
214220
# Only use if it looks like a substantial quote
215221
cleaned = quote_text.strip()
216222
if len(cleaned) > 15 and not cleaned.startswith('http'):
217-
citation_quotes[i] = cleaned
223+
focused_quote = self._extract_focused_quote(cleaned, ai_response, user_question)
224+
citation_quotes[i] = focused_quote
218225

219226
return citation_quotes
220227

228+
def _extract_focused_quote(self, quote_text: str, ai_response: str, user_question: str = "") -> str:
229+
"""Extract the most relevant part of a long quote based on the question context"""
230+
# If quote is short enough, return as-is
231+
if len(quote_text.split()) <= 10:
232+
return quote_text
233+
234+
# Try to identify what the user is asking about from both the question and AI response
235+
question_keywords = []
236+
237+
# Analyze the user question first (more reliable)
238+
combined_text = f"{user_question} {ai_response}"
239+
240+
# Look for common question patterns
241+
if re.search(r'\barrive\b|\barrival\b', combined_text, re.IGNORECASE):
242+
question_keywords.extend(['arrive', 'arrival', 'ankunft'])
243+
if re.search(r'\bdepart\b|\bdeparture\b', combined_text, re.IGNORECASE):
244+
question_keywords.extend(['depart', 'departure', 'abfahrt'])
245+
if re.search(r'\btime\b|\bwhen\b', combined_text, re.IGNORECASE):
246+
question_keywords.extend(['time', 'uhrzeit'])
247+
if re.search(r'\bdate\b', combined_text, re.IGNORECASE):
248+
question_keywords.extend(['date'])
249+
if re.search(r'\bprice\b|\bcost\b', combined_text, re.IGNORECASE):
250+
question_keywords.extend(['price', 'cost', 'euro', '€'])
251+
252+
# If we have question keywords, try to find the most relevant part
253+
if question_keywords:
254+
words = quote_text.split()
255+
best_segment = quote_text # fallback
256+
best_score = 0
257+
258+
# Try different segment sizes
259+
for segment_size in [3, 5, 7, 10]:
260+
if segment_size >= len(words):
261+
continue
262+
263+
for i in range(len(words) - segment_size + 1):
264+
segment = " ".join(words[i:i + segment_size])
265+
266+
# Score this segment based on keyword matches
267+
score = 0
268+
for keyword in question_keywords:
269+
if keyword.lower() in segment.lower():
270+
score += 1
271+
272+
# Also look for time/date patterns
273+
if re.search(r'\d{1,2}:\d{2}', segment): # Time pattern
274+
score += 2
275+
if re.search(r'\d{1,2}[/.-]\d{1,2}[/.-]\d{2,4}', segment): # Date pattern
276+
score += 2
277+
278+
if score > best_score:
279+
best_score = score
280+
best_segment = segment
281+
282+
# If we found a good focused segment, use it
283+
if best_score > 0 and len(best_segment.split()) < len(words) * 0.7:
284+
return best_segment
285+
286+
# If no good focused segment found, try to extract key information
287+
# Look for time patterns
288+
time_matches = re.findall(r'\d{1,2}:\d{2}(?:\s*-\s*\d{1,2}/\d{1,2}/\d{4})?', quote_text)
289+
if time_matches:
290+
# Return the time with some context
291+
for time_match in time_matches:
292+
time_pos = quote_text.find(time_match)
293+
if time_pos != -1:
294+
# Get some words around the time
295+
start = max(0, time_pos - 20)
296+
end = min(len(quote_text), time_pos + len(time_match) + 20)
297+
context = quote_text[start:end].strip()
298+
if len(context.split()) <= 10:
299+
return context
300+
301+
# If still too long, just take the first part
302+
words = quote_text.split()
303+
if len(words) > 15:
304+
return " ".join(words[:15]) + "..."
305+
306+
return quote_text
307+
221308
def _create_highlighted_pdf(
222309
self, search_terms: List[str]
223310
) -> Tuple[bytes, Optional[int]]:
224-
"""Create highlighted PDF with simple highlighting"""
311+
"""Create highlighted PDF with smart highlighting"""
225312
highlighted_doc = fitz.open(stream=self.pdf_bytes, filetype="pdf")
226313
first_highlight_page = None
227314

@@ -241,18 +328,60 @@ def _create_highlighted_pdf(
241328
if first_highlight_page is None:
242329
first_highlight_page = page_num + 1
243330
else:
244-
# Try to find partial matches for longer quotes
331+
# For long quotes, try smart highlighting
245332
if len(term.split()) >= 5:
246-
self._highlight_partial_matches(page, term)
247-
if first_highlight_page is None:
333+
found = self._smart_highlight_long_quote(page, term)
334+
if found and first_highlight_page is None:
248335
first_highlight_page = page_num + 1
249336

250337
return highlighted_doc.tobytes(), first_highlight_page
251338

252339
finally:
253340
highlighted_doc.close()
254341

255-
def _highlight_partial_matches(self, page, term: str):
342+
def _smart_highlight_long_quote(self, page, term: str) -> bool:
343+
"""Smart highlighting for long quotes - tries to find key parts"""
344+
words = term.split()
345+
found_any = False
346+
347+
# Extract key information patterns (times, dates, numbers, important words)
348+
key_patterns = []
349+
350+
# Look for time patterns (HH:MM)
351+
time_pattern = r'\b\d{1,2}:\d{2}\b'
352+
times = re.findall(time_pattern, term)
353+
key_patterns.extend(times)
354+
355+
# Look for date patterns (DD/MM/YYYY or similar)
356+
date_pattern = r'\b\d{1,2}[/.-]\d{1,2}[/.-]\d{2,4}\b'
357+
dates = re.findall(date_pattern, term)
358+
key_patterns.extend(dates)
359+
360+
# Look for numbers that might be important
361+
number_pattern = r'\b\d+\b'
362+
numbers = re.findall(number_pattern, term)
363+
# Only include numbers that are likely important (not too common)
364+
important_numbers = [n for n in numbers if len(n) >= 2]
365+
key_patterns.extend(important_numbers)
366+
367+
# Try to highlight key patterns first
368+
for pattern in key_patterns:
369+
instances = page.search_for(pattern, quads=True)
370+
if instances:
371+
for inst in instances:
372+
highlight = page.add_highlight_annot(inst)
373+
highlight.set_colors(stroke=(0, 1, 0)) # Green for key info
374+
highlight.update()
375+
found_any = True
376+
377+
# If we found key patterns, we're done
378+
if found_any:
379+
return True
380+
381+
# Otherwise, fall back to partial matching
382+
return self._highlight_partial_matches(page, term)
383+
384+
def _highlight_partial_matches(self, page, term: str) -> bool:
256385
"""Find and highlight partial matches for longer quotes"""
257386
words = term.split()
258387

@@ -269,7 +398,9 @@ def _highlight_partial_matches(self, page, term: str):
269398
stroke=(1, 0.8, 0)
270399
) # Orange for partial matches
271400
highlight.update()
272-
return # Found something, stop here
401+
return True # Found something, stop here
402+
403+
return False # Nothing found
273404

274405
def __del__(self):
275406
"""Clean up document resources"""
@@ -284,7 +415,7 @@ def process_pdf_with_highlighting(pdf_bytes: bytes) -> EnhancedPDFProcessor:
284415

285416

286417
def highlight_ai_referenced_text(
287-
pdf_processor: EnhancedPDFProcessor, ai_response: str, original_text: str
418+
pdf_processor: EnhancedPDFProcessor, ai_response: str, original_text: str, user_question: str = ""
288419
):
289420
"""Legacy function for backward compatibility"""
290-
return pdf_processor.display_citation_based_references(ai_response, original_text)
421+
return pdf_processor.display_citation_based_references(ai_response, original_text, user_question)

0 commit comments

Comments
 (0)