Skip to content

Commit a1cd411

Browse files
committed
improved highlighting
1 parent 453325e commit a1cd411

File tree

2 files changed

+81
-19
lines changed

2 files changed

+81
-19
lines changed
Binary file not shown.

enhanced_pdf_processor.py

Lines changed: 81 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -325,7 +325,7 @@ def display_highlighted_snippets_below_message(self, ai_response: str, original_
325325
pdf_viewer(
326326
input=highlighted_pdf_bytes,
327327
width="100%",
328-
height=600, # Slightly smaller height for inline display
328+
height=1200,
329329
render_text=True,
330330
key=f"evidence_pdf_{hash(ai_response)}" # Unique key per response
331331
)
@@ -370,12 +370,11 @@ def display_citation_based_references(self, ai_response: str, original_text: str
370370
st.session_state.chats[chat_id]['highlight_terms'] = all_quotes
371371

372372
# Display the highlighted PDF directly
373-
st.markdown("### 🎯 **Highlighted Document:**")
374373
from streamlit_pdf_viewer import pdf_viewer
375374
pdf_viewer(
376375
input=highlighted_pdf_bytes,
377376
width="100%",
378-
height=600,
377+
height=1200,
379378
render_text=True,
380379
key=f"inline_highlighted_pdf_{hash(ai_response)}"
381380
)
@@ -486,7 +485,7 @@ def _create_robust_highlighted_pdf(self, search_terms: List[str]) -> bytes:
486485
page_text = page.get_text()
487486

488487
for i, term in enumerate(search_terms):
489-
# Try exact search first
488+
# Strategy 1: Try exact search first
490489
instances = page.search_for(term, quads=True)
491490

492491
if instances:
@@ -496,27 +495,90 @@ def _create_robust_highlighted_pdf(self, search_terms: List[str]) -> bytes:
496495
highlight.update()
497496
total_highlights += 1
498497
else:
499-
# Try fuzzy matching
500-
if self._fuzzy_text_match(term, page_text):
501-
# Search for significant words
502-
words = term.split()
503-
significant_words = [w for w in words if len(w) > 3]
504-
505-
highlighted_any = False
506-
for word in significant_words[:5]: # Limit to first 5 significant words
507-
word_instances = page.search_for(word, quads=True)
508-
for inst in word_instances:
509-
highlight = page.add_highlight_annot(inst)
510-
highlight.set_colors(stroke=(1, 0.8, 0)) # Orange for word highlights
511-
highlight.update()
512-
total_highlights += 1
513-
highlighted_any = True
498+
# Strategy 2: Try to find the most distinctive parts of the quote
499+
highlighted_parts = self._find_and_highlight_distinctive_parts(page, term)
500+
total_highlights += highlighted_parts
514501

515502
return highlighted_doc.tobytes()
516503

517504
finally:
518505
highlighted_doc.close()
519506

507+
def _find_and_highlight_distinctive_parts(self, page, term: str) -> int:
508+
"""Find and highlight the most distinctive/important parts of a quote"""
509+
highlighted_count = 0
510+
words = term.split()
511+
512+
if len(words) < 5: # Only work with substantial quotes
513+
return 0
514+
515+
# Strategy 1: Look for longer phrases (minimum 5 consecutive words)
516+
for phrase_length in range(min(len(words), 10), 4, -1): # 10 words down to 5 words
517+
for start_idx in range(len(words) - phrase_length + 1):
518+
phrase = ' '.join(words[start_idx:start_idx + phrase_length])
519+
520+
instances = page.search_for(phrase, quads=True)
521+
if instances:
522+
for inst in instances:
523+
highlight = page.add_highlight_annot(inst)
524+
highlight.set_colors(stroke=(1, 0.9, 0)) # Light yellow for phrase matches
525+
highlight.update()
526+
highlighted_count += len(instances)
527+
return highlighted_count # Found a substantial phrase, stop here
528+
529+
# Strategy 2: Only if no 5+ word phrases found, look for very specific distinctive phrases
530+
# But be much more conservative
531+
distinctive_phrases = self._extract_very_specific_phrases(term)
532+
533+
for phrase in distinctive_phrases:
534+
if len(phrase.split()) >= 4: # Only highlight phrases with 4+ words
535+
instances = page.search_for(phrase, quads=True)
536+
if instances:
537+
for inst in instances:
538+
highlight = page.add_highlight_annot(inst)
539+
highlight.set_colors(stroke=(1, 0.8, 0)) # Orange for specific phrases
540+
highlight.update()
541+
highlighted_count += len(instances)
542+
if highlighted_count > 0:
543+
break # Found something substantial, stop
544+
545+
return highlighted_count
546+
547+
def _extract_very_specific_phrases(self, text: str) -> list:
548+
"""Extract only very specific and substantial phrases, avoiding single word matches"""
549+
distinctive_phrases = []
550+
words = text.split()
551+
552+
# Only look for longer sequences that are likely to be unique/specific
553+
for length in range(min(len(words), 8), 3, -1): # 8 words down to 4 words
554+
for i in range(len(words) - length + 1):
555+
phrase = ' '.join(words[i:i + length])
556+
557+
# Only include if it's substantial and likely unique
558+
if self._is_substantial_phrase(phrase):
559+
distinctive_phrases.append(phrase)
560+
561+
return distinctive_phrases[:3] # Limit to top 3 most promising phrases
562+
563+
def _is_substantial_phrase(self, phrase: str) -> bool:
564+
"""Check if a phrase is substantial enough to be worth highlighting"""
565+
words = phrase.split()
566+
567+
# Must be at least 4 words
568+
if len(words) < 4:
569+
return False
570+
571+
# Look for indicators of substantial/specific content
572+
indicators = [
573+
any(len(word) > 8 for word in words), # Contains long technical words
574+
any(word[0].isupper() and len(word) > 3 for word in words), # Contains proper nouns
575+
'"' in phrase or '(' in phrase or ')' in phrase, # Contains specific formatting
576+
any(char in phrase for char in [':', '—', '/', '-']), # Contains specific punctuation
577+
]
578+
579+
# Require at least 2 indicators of specificity
580+
return sum(indicators) >= 2
581+
520582
def get_highlighted_pdf_bytes(self) -> bytes:
521583
"""Get the highlighted PDF bytes for display"""
522584
chat_id = st.session_state.get('current_chat_id')

0 commit comments

Comments
 (0)