@@ -325,7 +325,7 @@ def display_highlighted_snippets_below_message(self, ai_response: str, original_
325
325
pdf_viewer (
326
326
input = highlighted_pdf_bytes ,
327
327
width = "100%" ,
328
- height = 600 , # Slightly smaller height for inline display
328
+ height = 1200 ,
329
329
render_text = True ,
330
330
key = f"evidence_pdf_{ hash (ai_response )} " # Unique key per response
331
331
)
@@ -370,12 +370,11 @@ def display_citation_based_references(self, ai_response: str, original_text: str
370
370
st .session_state .chats [chat_id ]['highlight_terms' ] = all_quotes
371
371
372
372
# Display the highlighted PDF directly
373
- st .markdown ("### 🎯 **Highlighted Document:**" )
374
373
from streamlit_pdf_viewer import pdf_viewer
375
374
pdf_viewer (
376
375
input = highlighted_pdf_bytes ,
377
376
width = "100%" ,
378
- height = 600 ,
377
+ height = 1200 ,
379
378
render_text = True ,
380
379
key = f"inline_highlighted_pdf_{ hash (ai_response )} "
381
380
)
@@ -486,7 +485,7 @@ def _create_robust_highlighted_pdf(self, search_terms: List[str]) -> bytes:
486
485
page_text = page .get_text ()
487
486
488
487
for i , term in enumerate (search_terms ):
489
- # Try exact search first
488
+ # Strategy 1: Try exact search first
490
489
instances = page .search_for (term , quads = True )
491
490
492
491
if instances :
@@ -496,27 +495,90 @@ def _create_robust_highlighted_pdf(self, search_terms: List[str]) -> bytes:
496
495
highlight .update ()
497
496
total_highlights += 1
498
497
else :
499
- # Try fuzzy matching
500
- if self ._fuzzy_text_match (term , page_text ):
501
- # Search for significant words
502
- words = term .split ()
503
- significant_words = [w for w in words if len (w ) > 3 ]
504
-
505
- highlighted_any = False
506
- for word in significant_words [:5 ]: # Limit to first 5 significant words
507
- word_instances = page .search_for (word , quads = True )
508
- for inst in word_instances :
509
- highlight = page .add_highlight_annot (inst )
510
- highlight .set_colors (stroke = (1 , 0.8 , 0 )) # Orange for word highlights
511
- highlight .update ()
512
- total_highlights += 1
513
- highlighted_any = True
498
+ # Strategy 2: Try to find the most distinctive parts of the quote
499
+ highlighted_parts = self ._find_and_highlight_distinctive_parts (page , term )
500
+ total_highlights += highlighted_parts
514
501
515
502
return highlighted_doc .tobytes ()
516
503
517
504
finally :
518
505
highlighted_doc .close ()
519
506
507
+ def _find_and_highlight_distinctive_parts (self , page , term : str ) -> int :
508
+ """Find and highlight the most distinctive/important parts of a quote"""
509
+ highlighted_count = 0
510
+ words = term .split ()
511
+
512
+ if len (words ) < 5 : # Only work with substantial quotes
513
+ return 0
514
+
515
+ # Strategy 1: Look for longer phrases (minimum 5 consecutive words)
516
+ for phrase_length in range (min (len (words ), 10 ), 4 , - 1 ): # 10 words down to 5 words
517
+ for start_idx in range (len (words ) - phrase_length + 1 ):
518
+ phrase = ' ' .join (words [start_idx :start_idx + phrase_length ])
519
+
520
+ instances = page .search_for (phrase , quads = True )
521
+ if instances :
522
+ for inst in instances :
523
+ highlight = page .add_highlight_annot (inst )
524
+ highlight .set_colors (stroke = (1 , 0.9 , 0 )) # Light yellow for phrase matches
525
+ highlight .update ()
526
+ highlighted_count += len (instances )
527
+ return highlighted_count # Found a substantial phrase, stop here
528
+
529
+ # Strategy 2: Only if no 5+ word phrases found, look for very specific distinctive phrases
530
+ # But be much more conservative
531
+ distinctive_phrases = self ._extract_very_specific_phrases (term )
532
+
533
+ for phrase in distinctive_phrases :
534
+ if len (phrase .split ()) >= 4 : # Only highlight phrases with 4+ words
535
+ instances = page .search_for (phrase , quads = True )
536
+ if instances :
537
+ for inst in instances :
538
+ highlight = page .add_highlight_annot (inst )
539
+ highlight .set_colors (stroke = (1 , 0.8 , 0 )) # Orange for specific phrases
540
+ highlight .update ()
541
+ highlighted_count += len (instances )
542
+ if highlighted_count > 0 :
543
+ break # Found something substantial, stop
544
+
545
+ return highlighted_count
546
+
547
+ def _extract_very_specific_phrases (self , text : str ) -> list :
548
+ """Extract only very specific and substantial phrases, avoiding single word matches"""
549
+ distinctive_phrases = []
550
+ words = text .split ()
551
+
552
+ # Only look for longer sequences that are likely to be unique/specific
553
+ for length in range (min (len (words ), 8 ), 3 , - 1 ): # 8 words down to 4 words
554
+ for i in range (len (words ) - length + 1 ):
555
+ phrase = ' ' .join (words [i :i + length ])
556
+
557
+ # Only include if it's substantial and likely unique
558
+ if self ._is_substantial_phrase (phrase ):
559
+ distinctive_phrases .append (phrase )
560
+
561
+ return distinctive_phrases [:3 ] # Limit to top 3 most promising phrases
562
+
563
+ def _is_substantial_phrase (self , phrase : str ) -> bool :
564
+ """Check if a phrase is substantial enough to be worth highlighting"""
565
+ words = phrase .split ()
566
+
567
+ # Must be at least 4 words
568
+ if len (words ) < 4 :
569
+ return False
570
+
571
+ # Look for indicators of substantial/specific content
572
+ indicators = [
573
+ any (len (word ) > 8 for word in words ), # Contains long technical words
574
+ any (word [0 ].isupper () and len (word ) > 3 for word in words ), # Contains proper nouns
575
+ '"' in phrase or '(' in phrase or ')' in phrase , # Contains specific formatting
576
+ any (char in phrase for char in [':' , '—' , '/' , '-' ]), # Contains specific punctuation
577
+ ]
578
+
579
+ # Require at least 2 indicators of specificity
580
+ return sum (indicators ) >= 2
581
+
520
582
def get_highlighted_pdf_bytes (self ) -> bytes :
521
583
"""Get the highlighted PDF bytes for display"""
522
584
chat_id = st .session_state .get ('current_chat_id' )
0 commit comments