Skip to content

Commit 606ec66

Browse files
committed
Refactor citation display and highlighting process in EnhancedPDFProcessor
- Removed redundant pre-check for likely matches before highlighting.
- Simplified citation display message.
- Improved user feedback during highlighting with a spinner.
- Streamlined the highlighting logic for better performance and clarity.
1 parent d24c721 commit 606ec66

File tree

1 file changed

+42
-71
lines changed

1 file changed

+42
-71
lines changed

ragnarok/enhanced_pdf_processor.py

Lines changed: 42 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -376,35 +376,14 @@ def display_citation_based_references(
376376
all_quotes = list(citation_quotes.values())
377377

378378
# Show found citations with improved display
379-
st.caption(f"Found {len(citation_quotes)} citation(s)")
379+
st.caption(f"Found {len(citation_quotes)} citation(s)")
380380
with st.expander("Found Citations", expanded=False):
381381
for num, quote in citation_quotes.items():
382382
# Clean display without technical details
383383
st.markdown(f"**[{num}]** \"{quote}\"")
384384
if len(quote) > 200: # Add some spacing for very long quotes
385385
st.markdown("---")
386386

387-
# IMPROVED: Pre-check if quotes are likely to be found before creating highlighted PDF
388-
likely_matches = []
389-
for quote in all_quotes:
390-
# Simple check if quote text appears in document (case-insensitive)
391-
if quote.lower() in original_text.lower():
392-
likely_matches.append(quote)
393-
else:
394-
# Check if key phrases from the quote appear
395-
words = quote.split()
396-
if len(words) >= 3:
397-
for i in range(len(words) - 2):
398-
phrase = " ".join(words[i:i+3])
399-
if len(phrase) > 10 and phrase.lower() in original_text.lower():
400-
likely_matches.append(quote)
401-
break
402-
403-
if likely_matches:
404-
st.info(f"🎯 {len(likely_matches)} of {len(all_quotes)} citations likely to be highlighted in document")
405-
else:
406-
st.warning("⚠️ Citations may not be found exactly as written in the document. The highlighter will try to find related content.")
407-
408387
highlighted_pdf_bytes, first_highlight_page = self._create_highlighted_pdf(
409388
all_quotes
410389
)
@@ -435,9 +414,6 @@ def display_citation_based_references(
435414

436415
if first_highlight_page:
437416
viewer_params["scroll_to_page"] = first_highlight_page
438-
st.success(f"📄 Scrolled to page {first_highlight_page} with first highlight")
439-
else:
440-
st.info("📄 No highlights found - showing original document")
441417

442418
pdf_viewer(**viewer_params)
443419

@@ -641,59 +617,54 @@ def _create_highlighted_pdf(
641617
highlighted_doc = fitz.open(stream=self.pdf_bytes, filetype="pdf")
642618
first_highlight_page = None
643619

644-
# PERFORMANCE: Show progress for user feedback during highlighting
620+
# PERFORMANCE: Show simple spinner during highlighting
645621
progress_placeholder = st.empty()
646-
total_operations = len(search_terms) * highlighted_doc.page_count
647-
current_operation = 0
648622

649623
try:
650-
for term_idx, term in enumerate(search_terms):
651-
with progress_placeholder.container():
652-
st.info(f"🔍 Highlighting citation {term_idx + 1}/{len(search_terms)}: \"{term[:50]}{'...' if len(term) > 50 else ''}\"")
653-
654-
# PERFORMANCE: Early termination if we already found highlights
655-
found_highlight_for_term = False
656-
657-
for page_num in range(highlighted_doc.page_count):
658-
current_operation += 1
659-
660-
# PERFORMANCE: Skip remaining pages if we found good highlights for this term
661-
if found_highlight_for_term and first_highlight_page is not None:
662-
continue
624+
with progress_placeholder.container():
625+
with st.spinner("Highlighting citations in document..."):
626+
for term_idx, term in enumerate(search_terms):
627+
# PERFORMANCE: Early termination if we already found highlights
628+
found_highlight_for_term = False
663629

664-
page = highlighted_doc[page_num]
665-
666-
# Try exact search first for the complete term
667-
instances = page.search_for(term, quads=True)
668-
669-
if instances:
670-
# Found exact match - highlight it
671-
for inst in instances:
672-
highlight = page.add_highlight_annot(inst)
673-
highlight.set_colors(stroke=(1, 1, 0)) # Yellow highlight
674-
highlight.update()
675-
if first_highlight_page is None:
676-
first_highlight_page = page_num + 1
677-
found_highlight_for_term = True
678-
else:
679-
# PERFORMANCE: Only try smart highlighting if no exact match and term is substantial
680-
if len(term.split()) >= 5: # Increased threshold to reduce unnecessary processing
681-
found = self._smart_highlight_long_quote_fast(page, term)
682-
if found:
683-
found_highlight_for_term = True
684-
if first_highlight_page is None:
685-
first_highlight_page = page_num + 1
686-
elif len(term.split()) >= 3: # For shorter terms, just try case-insensitive
687-
# For short terms, try case-insensitive search only
688-
instances_case_insensitive = page.search_for(term, quads=True, flags=fitz.TEXT_DEHYPHENATE | fitz.TEXT_PRESERVE_WHITESPACE)
689-
if instances_case_insensitive:
690-
for inst in instances_case_insensitive:
630+
for page_num in range(highlighted_doc.page_count):
631+
# PERFORMANCE: Skip remaining pages if we found good highlights for this term
632+
if found_highlight_for_term and first_highlight_page is not None:
633+
continue
634+
635+
page = highlighted_doc[page_num]
636+
637+
# Try exact search first for the complete term
638+
instances = page.search_for(term, quads=True)
639+
640+
if instances:
641+
# Found exact match - highlight it
642+
for inst in instances:
691643
highlight = page.add_highlight_annot(inst)
692-
highlight.set_colors(stroke=(1, 0.8, 0)) # Orange for case-insensitive matches
644+
highlight.set_colors(stroke=(1, 1, 0)) # Yellow highlight
693645
highlight.update()
646+
if first_highlight_page is None:
647+
first_highlight_page = page_num + 1
694648
found_highlight_for_term = True
695-
if first_highlight_page is None:
696-
first_highlight_page = page_num + 1
649+
else:
650+
# PERFORMANCE: Only try smart highlighting if no exact match and term is substantial
651+
if len(term.split()) >= 5: # Increased threshold to reduce unnecessary processing
652+
found = self._smart_highlight_long_quote_fast(page, term)
653+
if found:
654+
found_highlight_for_term = True
655+
if first_highlight_page is None:
656+
first_highlight_page = page_num + 1
657+
elif len(term.split()) >= 3: # For shorter terms, just try case-insensitive
658+
# For short terms, try case-insensitive search only
659+
instances_case_insensitive = page.search_for(term, quads=True, flags=fitz.TEXT_DEHYPHENATE | fitz.TEXT_PRESERVE_WHITESPACE)
660+
if instances_case_insensitive:
661+
for inst in instances_case_insensitive:
662+
highlight = page.add_highlight_annot(inst)
663+
highlight.set_colors(stroke=(1, 0.8, 0)) # Orange for case-insensitive matches
664+
highlight.update()
665+
found_highlight_for_term = True
666+
if first_highlight_page is None:
667+
first_highlight_page = page_num + 1
697668

698669
# Clear progress indicator
699670
progress_placeholder.empty()

0 commit comments

Comments
 (0)