@@ -92,10 +92,11 @@ def display_citation_based_references(
92
92
self ,
93
93
ai_response : str ,
94
94
original_text : str ,
95
+ user_question : str = "" ,
95
96
) -> int :
96
97
"""Display highlighted document for citations found in AI response"""
97
98
# Extract quotes from AI response
98
- citation_quotes = self ._extract_quotes_from_ai_response (ai_response )
99
+ citation_quotes = self ._extract_quotes_from_ai_response (ai_response , user_question )
99
100
100
101
# Log debug information instead of showing in UI
101
102
if not citation_quotes :
@@ -165,7 +166,7 @@ def display_citation_based_references(
165
166
st .caption ("💬 No citations found in response" )
166
167
return 0
167
168
168
- def _extract_quotes_from_ai_response (self , ai_response : str ) -> Dict [int , str ]:
169
+ def _extract_quotes_from_ai_response (self , ai_response : str , user_question : str = "" ) -> Dict [int , str ]:
169
170
"""Extract numbered quotes from AI response using multiple patterns"""
170
171
citation_quotes = {}
171
172
@@ -176,7 +177,9 @@ def _extract_quotes_from_ai_response(self, ai_response: str) -> Dict[int, str]:
176
177
for match in matches1 :
177
178
citation_num = int (match [0 ])
178
179
quote_text = match [1 ].strip ()
179
- citation_quotes [citation_num ] = quote_text
180
+ # Try to extract more focused quotes for long citations
181
+ focused_quote = self ._extract_focused_quote (quote_text , ai_response , user_question )
182
+ citation_quotes [citation_num ] = focused_quote
180
183
181
184
# Pattern 2: [1]: "exact quote" - legacy format with colon
182
185
if not citation_quotes :
@@ -186,15 +189,17 @@ def _extract_quotes_from_ai_response(self, ai_response: str) -> Dict[int, str]:
186
189
for match in matches2 :
187
190
citation_num = int (match [0 ])
188
191
quote_text = match [1 ].strip ()
189
- citation_quotes [citation_num ] = quote_text
192
+ focused_quote = self ._extract_focused_quote (quote_text , ai_response , user_question )
193
+ citation_quotes [citation_num ] = focused_quote
190
194
191
195
# Pattern 3: [Exact quote: "text"] - current problematic format
192
196
if not citation_quotes :
193
197
pattern3 = r'\[Exact quote:\s*"([^"]+)"\]'
194
198
matches3 = re .findall (pattern3 , ai_response , re .IGNORECASE )
195
199
196
200
for i , quote_text in enumerate (matches3 , 1 ):
197
- citation_quotes [i ] = quote_text .strip ()
201
+ focused_quote = self ._extract_focused_quote (quote_text .strip (), ai_response , user_question )
202
+ citation_quotes [i ] = focused_quote
198
203
199
204
# Pattern 3b: "text" in brackets without "Exact quote:" prefix
200
205
if not citation_quotes :
@@ -203,7 +208,8 @@ def _extract_quotes_from_ai_response(self, ai_response: str) -> Dict[int, str]:
203
208
204
209
for i , quote_text in enumerate (matches3b , 1 ):
205
210
if len (quote_text .strip ()) > 15 : # Only substantial quotes
206
- citation_quotes [i ] = quote_text .strip ()
211
+ focused_quote = self ._extract_focused_quote (quote_text .strip (), ai_response , user_question )
212
+ citation_quotes [i ] = focused_quote
207
213
208
214
# Pattern 4: Any text in double quotes as fallback
209
215
if not citation_quotes :
@@ -214,14 +220,95 @@ def _extract_quotes_from_ai_response(self, ai_response: str) -> Dict[int, str]:
214
220
# Only use if it looks like a substantial quote
215
221
cleaned = quote_text .strip ()
216
222
if len (cleaned ) > 15 and not cleaned .startswith ('http' ):
217
- citation_quotes [i ] = cleaned
223
+ focused_quote = self ._extract_focused_quote (cleaned , ai_response , user_question )
224
+ citation_quotes [i ] = focused_quote
218
225
219
226
return citation_quotes
220
227
228
+ def _extract_focused_quote (self , quote_text : str , ai_response : str , user_question : str = "" ) -> str :
229
+ """Extract the most relevant part of a long quote based on the question context"""
230
+ # If quote is short enough, return as-is
231
+ if len (quote_text .split ()) <= 10 :
232
+ return quote_text
233
+
234
+ # Try to identify what the user is asking about from both the question and AI response
235
+ question_keywords = []
236
+
237
+ # Analyze the user question first (more reliable)
238
+ combined_text = f"{ user_question } { ai_response } "
239
+
240
+ # Look for common question patterns
241
+ if re .search (r'\barrive\b|\barrival\b' , combined_text , re .IGNORECASE ):
242
+ question_keywords .extend (['arrive' , 'arrival' , 'ankunft' ])
243
+ if re .search (r'\bdepart\b|\bdeparture\b' , combined_text , re .IGNORECASE ):
244
+ question_keywords .extend (['depart' , 'departure' , 'abfahrt' ])
245
+ if re .search (r'\btime\b|\bwhen\b' , combined_text , re .IGNORECASE ):
246
+ question_keywords .extend (['time' , 'uhrzeit' ])
247
+ if re .search (r'\bdate\b' , combined_text , re .IGNORECASE ):
248
+ question_keywords .extend (['date' ])
249
+ if re .search (r'\bprice\b|\bcost\b' , combined_text , re .IGNORECASE ):
250
+ question_keywords .extend (['price' , 'cost' , 'euro' , '€' ])
251
+
252
+ # If we have question keywords, try to find the most relevant part
253
+ if question_keywords :
254
+ words = quote_text .split ()
255
+ best_segment = quote_text # fallback
256
+ best_score = 0
257
+
258
+ # Try different segment sizes
259
+ for segment_size in [3 , 5 , 7 , 10 ]:
260
+ if segment_size >= len (words ):
261
+ continue
262
+
263
+ for i in range (len (words ) - segment_size + 1 ):
264
+ segment = " " .join (words [i :i + segment_size ])
265
+
266
+ # Score this segment based on keyword matches
267
+ score = 0
268
+ for keyword in question_keywords :
269
+ if keyword .lower () in segment .lower ():
270
+ score += 1
271
+
272
+ # Also look for time/date patterns
273
+ if re .search (r'\d{1,2}:\d{2}' , segment ): # Time pattern
274
+ score += 2
275
+ if re .search (r'\d{1,2}[/.-]\d{1,2}[/.-]\d{2,4}' , segment ): # Date pattern
276
+ score += 2
277
+
278
+ if score > best_score :
279
+ best_score = score
280
+ best_segment = segment
281
+
282
+ # If we found a good focused segment, use it
283
+ if best_score > 0 and len (best_segment .split ()) < len (words ) * 0.7 :
284
+ return best_segment
285
+
286
+ # If no good focused segment found, try to extract key information
287
+ # Look for time patterns
288
+ time_matches = re .findall (r'\d{1,2}:\d{2}(?:\s*-\s*\d{1,2}/\d{1,2}/\d{4})?' , quote_text )
289
+ if time_matches :
290
+ # Return the time with some context
291
+ for time_match in time_matches :
292
+ time_pos = quote_text .find (time_match )
293
+ if time_pos != - 1 :
294
+ # Get some words around the time
295
+ start = max (0 , time_pos - 20 )
296
+ end = min (len (quote_text ), time_pos + len (time_match ) + 20 )
297
+ context = quote_text [start :end ].strip ()
298
+ if len (context .split ()) <= 10 :
299
+ return context
300
+
301
+ # If still too long, just take the first part
302
+ words = quote_text .split ()
303
+ if len (words ) > 15 :
304
+ return " " .join (words [:15 ]) + "..."
305
+
306
+ return quote_text
307
+
221
308
def _create_highlighted_pdf (
222
309
self , search_terms : List [str ]
223
310
) -> Tuple [bytes , Optional [int ]]:
224
- """Create highlighted PDF with simple highlighting"""
311
+ """Create highlighted PDF with smart highlighting"""
225
312
highlighted_doc = fitz .open (stream = self .pdf_bytes , filetype = "pdf" )
226
313
first_highlight_page = None
227
314
@@ -241,18 +328,60 @@ def _create_highlighted_pdf(
241
328
if first_highlight_page is None :
242
329
first_highlight_page = page_num + 1
243
330
else :
244
- # Try to find partial matches for longer quotes
331
+ # For long quotes, try smart highlighting
245
332
if len (term .split ()) >= 5 :
246
- self ._highlight_partial_matches (page , term )
247
- if first_highlight_page is None :
333
+ found = self ._smart_highlight_long_quote (page , term )
334
+ if found and first_highlight_page is None :
248
335
first_highlight_page = page_num + 1
249
336
250
337
return highlighted_doc .tobytes (), first_highlight_page
251
338
252
339
finally :
253
340
highlighted_doc .close ()
254
341
255
- def _highlight_partial_matches (self , page , term : str ):
342
+ def _smart_highlight_long_quote (self , page , term : str ) -> bool :
343
+ """Smart highlighting for long quotes - tries to find key parts"""
344
+ words = term .split ()
345
+ found_any = False
346
+
347
+ # Extract key information patterns (times, dates, numbers, important words)
348
+ key_patterns = []
349
+
350
+ # Look for time patterns (HH:MM)
351
+ time_pattern = r'\b\d{1,2}:\d{2}\b'
352
+ times = re .findall (time_pattern , term )
353
+ key_patterns .extend (times )
354
+
355
+ # Look for date patterns (DD/MM/YYYY or similar)
356
+ date_pattern = r'\b\d{1,2}[/.-]\d{1,2}[/.-]\d{2,4}\b'
357
+ dates = re .findall (date_pattern , term )
358
+ key_patterns .extend (dates )
359
+
360
+ # Look for numbers that might be important
361
+ number_pattern = r'\b\d+\b'
362
+ numbers = re .findall (number_pattern , term )
363
+ # Only include numbers that are likely important (not too common)
364
+ important_numbers = [n for n in numbers if len (n ) >= 2 ]
365
+ key_patterns .extend (important_numbers )
366
+
367
+ # Try to highlight key patterns first
368
+ for pattern in key_patterns :
369
+ instances = page .search_for (pattern , quads = True )
370
+ if instances :
371
+ for inst in instances :
372
+ highlight = page .add_highlight_annot (inst )
373
+ highlight .set_colors (stroke = (0 , 1 , 0 )) # Green for key info
374
+ highlight .update ()
375
+ found_any = True
376
+
377
+ # If we found key patterns, we're done
378
+ if found_any :
379
+ return True
380
+
381
+ # Otherwise, fall back to partial matching
382
+ return self ._highlight_partial_matches (page , term )
383
+
384
+ def _highlight_partial_matches (self , page , term : str ) -> bool :
256
385
"""Find and highlight partial matches for longer quotes"""
257
386
words = term .split ()
258
387
@@ -269,7 +398,9 @@ def _highlight_partial_matches(self, page, term: str):
269
398
stroke = (1 , 0.8 , 0 )
270
399
) # Orange for partial matches
271
400
highlight .update ()
272
- return # Found something, stop here
401
+ return True # Found something, stop here
402
+
403
+ return False # Nothing found
273
404
274
405
def __del__ (self ):
275
406
"""Clean up document resources"""
@@ -284,7 +415,7 @@ def process_pdf_with_highlighting(pdf_bytes: bytes) -> EnhancedPDFProcessor:
284
415
285
416
286
417
def highlight_ai_referenced_text(
    pdf_processor: EnhancedPDFProcessor,
    ai_response: str,
    original_text: str,
    user_question: str = "",
):
    """Legacy entry point kept for backward compatibility.

    Forwards all arguments unchanged to
    ``pdf_processor.display_citation_based_references`` and returns whatever
    that method returns.
    """
    result = pdf_processor.display_citation_based_references(
        ai_response, original_text, user_question
    )
    return result
0 commit comments