@@ -376,35 +376,14 @@ def display_citation_based_references(
376
376
all_quotes = list (citation_quotes .values ())
377
377
378
378
# Show found citations with improved display
379
- st .caption (f"✅ Found { len (citation_quotes )} citation(s)" )
379
+ st .caption (f"Found { len (citation_quotes )} citation(s)" )
380
380
with st .expander ("Found Citations" , expanded = False ):
381
381
for num , quote in citation_quotes .items ():
382
382
# Clean display without technical details
383
383
st .markdown (f"**[{ num } ]** \" { quote } \" " )
384
384
if len (quote ) > 200 : # Add some spacing for very long quotes
385
385
st .markdown ("---" )
386
386
387
- # IMPROVED: Pre-check if quotes are likely to be found before creating highlighted PDF
388
- likely_matches = []
389
- for quote in all_quotes :
390
- # Simple check if quote text appears in document (case-insensitive)
391
- if quote .lower () in original_text .lower ():
392
- likely_matches .append (quote )
393
- else :
394
- # Check if key phrases from the quote appear
395
- words = quote .split ()
396
- if len (words ) >= 3 :
397
- for i in range (len (words ) - 2 ):
398
- phrase = " " .join (words [i :i + 3 ])
399
- if len (phrase ) > 10 and phrase .lower () in original_text .lower ():
400
- likely_matches .append (quote )
401
- break
402
-
403
- if likely_matches :
404
- st .info (f"🎯 { len (likely_matches )} of { len (all_quotes )} citations likely to be highlighted in document" )
405
- else :
406
- st .warning ("⚠️ Citations may not be found exactly as written in the document. The highlighter will try to find related content." )
407
-
408
387
highlighted_pdf_bytes , first_highlight_page = self ._create_highlighted_pdf (
409
388
all_quotes
410
389
)
@@ -435,9 +414,6 @@ def display_citation_based_references(
435
414
436
415
if first_highlight_page :
437
416
viewer_params ["scroll_to_page" ] = first_highlight_page
438
- st .success (f"📄 Scrolled to page { first_highlight_page } with first highlight" )
439
- else :
440
- st .info ("📄 No highlights found - showing original document" )
441
417
442
418
pdf_viewer (** viewer_params )
443
419
@@ -641,59 +617,54 @@ def _create_highlighted_pdf(
641
617
highlighted_doc = fitz .open (stream = self .pdf_bytes , filetype = "pdf" )
642
618
first_highlight_page = None
643
619
644
- # PERFORMANCE: Show progress for user feedback during highlighting
620
+ # PERFORMANCE: Show simple spinner during highlighting
645
621
progress_placeholder = st .empty ()
646
- total_operations = len (search_terms ) * highlighted_doc .page_count
647
- current_operation = 0
648
622
649
623
try :
650
- for term_idx , term in enumerate (search_terms ):
651
- with progress_placeholder .container ():
652
- st .info (f"🔍 Highlighting citation { term_idx + 1 } /{ len (search_terms )} : \" { term [:50 ]} { '...' if len (term ) > 50 else '' } \" " )
653
-
654
- # PERFORMANCE: Early termination if we already found highlights
655
- found_highlight_for_term = False
656
-
657
- for page_num in range (highlighted_doc .page_count ):
658
- current_operation += 1
659
-
660
- # PERFORMANCE: Skip remaining pages if we found good highlights for this term
661
- if found_highlight_for_term and first_highlight_page is not None :
662
- continue
624
+ with progress_placeholder .container ():
625
+ with st .spinner ("Highlighting citations in document..." ):
626
+ for term_idx , term in enumerate (search_terms ):
627
+ # PERFORMANCE: Track whether this term has already been highlighted so later pages can be skipped
628
+ found_highlight_for_term = False
663
629
664
- page = highlighted_doc [page_num ]
665
-
666
- # Try exact search first for the complete term
667
- instances = page .search_for (term , quads = True )
668
-
669
- if instances :
670
- # Found exact match - highlight it
671
- for inst in instances :
672
- highlight = page .add_highlight_annot (inst )
673
- highlight .set_colors (stroke = (1 , 1 , 0 )) # Yellow highlight
674
- highlight .update ()
675
- if first_highlight_page is None :
676
- first_highlight_page = page_num + 1
677
- found_highlight_for_term = True
678
- else :
679
- # PERFORMANCE: Only try smart highlighting if no exact match and term is substantial
680
- if len (term .split ()) >= 5 : # Increased threshold to reduce unnecessary processing
681
- found = self ._smart_highlight_long_quote_fast (page , term )
682
- if found :
683
- found_highlight_for_term = True
684
- if first_highlight_page is None :
685
- first_highlight_page = page_num + 1
686
- elif len (term .split ()) >= 3 : # For shorter terms, just try case-insensitive
687
- # For short terms, try case-insensitive search only
688
- instances_case_insensitive = page .search_for (term , quads = True , flags = fitz .TEXT_DEHYPHENATE | fitz .TEXT_PRESERVE_WHITESPACE )
689
- if instances_case_insensitive :
690
- for inst in instances_case_insensitive :
630
+ for page_num in range (highlighted_doc .page_count ):
631
+ # PERFORMANCE: Skip remaining pages if we found good highlights for this term
632
+ if found_highlight_for_term and first_highlight_page is not None :
633
+ continue
634
+
635
+ page = highlighted_doc [page_num ]
636
+
637
+ # Try exact search first for the complete term
638
+ instances = page .search_for (term , quads = True )
639
+
640
+ if instances :
641
+ # Found exact match - highlight it
642
+ for inst in instances :
691
643
highlight = page .add_highlight_annot (inst )
692
- highlight .set_colors (stroke = (1 , 0.8 , 0 )) # Orange for case-insensitive matches
644
+ highlight .set_colors (stroke = (1 , 1 , 0 )) # Yellow highlight
693
645
highlight .update ()
646
+ if first_highlight_page is None :
647
+ first_highlight_page = page_num + 1
694
648
found_highlight_for_term = True
695
- if first_highlight_page is None :
696
- first_highlight_page = page_num + 1
649
+ else :
650
+ # PERFORMANCE: Only try smart highlighting if no exact match and term is substantial
651
+ if len (term .split ()) >= 5 : # Increased threshold to reduce unnecessary processing
652
+ found = self ._smart_highlight_long_quote_fast (page , term )
653
+ if found :
654
+ found_highlight_for_term = True
655
+ if first_highlight_page is None :
656
+ first_highlight_page = page_num + 1
657
+ elif len (term .split ()) >= 3 : # For shorter terms, just try case-insensitive
658
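+ # For short terms, retry the search with explicit text flags only (NOTE(review): search_for is reportedly case-insensitive by default, so this retry changes text-extraction flags rather than case handling - confirm)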
659
+ instances_case_insensitive = page .search_for (term , quads = True , flags = fitz .TEXT_DEHYPHENATE | fitz .TEXT_PRESERVE_WHITESPACE )
660
+ if instances_case_insensitive :
661
+ for inst in instances_case_insensitive :
662
+ highlight = page .add_highlight_annot (inst )
663
+ highlight .set_colors (stroke = (1 , 0.8 , 0 )) # Orange for case-insensitive matches
664
+ highlight .update ()
665
+ found_highlight_for_term = True
666
+ if first_highlight_page is None :
667
+ first_highlight_page = page_num + 1
697
668
698
669
# Clear progress indicator
699
670
progress_placeholder .empty ()
0 commit comments