Skip to content

Commit 665c7a9

Browse files
committed
optimize highlights
1 parent f0ecc64 commit 665c7a9

File tree

1 file changed

+37
-11
lines changed

1 file changed

+37
-11
lines changed

ragnarok/enhanced_pdf_processor.py

Lines changed: 37 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -177,9 +177,13 @@ def _extract_quotes_from_ai_response(self, ai_response: str, user_question: str
177177
for match in matches1:
178178
citation_num = int(match[0])
179179
quote_text = match[1].strip()
180-
# Try to extract more focused quotes for long citations
181-
focused_quote = self._extract_focused_quote(quote_text, ai_response, user_question)
182-
citation_quotes[citation_num] = focused_quote
180+
# For citation highlighting, preserve the full quote text
181+
# Only use focused extraction for very long quotes (>20 words)
182+
if len(quote_text.split()) > 20:
183+
focused_quote = self._extract_focused_quote(quote_text, ai_response, user_question)
184+
citation_quotes[citation_num] = focused_quote
185+
else:
186+
citation_quotes[citation_num] = quote_text
183187

184188
# Pattern 2: [1]: "exact quote" - legacy format with colon (anywhere in line)
185189
if not citation_quotes:
@@ -189,17 +193,28 @@ def _extract_quotes_from_ai_response(self, ai_response: str, user_question: str
189193
for match in matches2:
190194
citation_num = int(match[0])
191195
quote_text = match[1].strip()
192-
focused_quote = self._extract_focused_quote(quote_text, ai_response, user_question)
193-
citation_quotes[citation_num] = focused_quote
196+
# For citation highlighting, preserve the full quote text
197+
# Only use focused extraction for very long quotes (>20 words)
198+
if len(quote_text.split()) > 20:
199+
focused_quote = self._extract_focused_quote(quote_text, ai_response, user_question)
200+
citation_quotes[citation_num] = focused_quote
201+
else:
202+
citation_quotes[citation_num] = quote_text
194203

195204
# Pattern 3: [Exact quote: "text"] - current problematic format
196205
if not citation_quotes:
197206
pattern3 = r'\[Exact quote:\s*"([^"]+)"\]'
198207
matches3 = re.findall(pattern3, ai_response, re.IGNORECASE)
199208

200209
for i, quote_text in enumerate(matches3, 1):
201-
focused_quote = self._extract_focused_quote(quote_text.strip(), ai_response, user_question)
202-
citation_quotes[i] = focused_quote
210+
quote_text = quote_text.strip()
211+
# For citation highlighting, preserve the full quote text
212+
# Only use focused extraction for very long quotes (>20 words)
213+
if len(quote_text.split()) > 20:
214+
focused_quote = self._extract_focused_quote(quote_text, ai_response, user_question)
215+
citation_quotes[i] = focused_quote
216+
else:
217+
citation_quotes[i] = quote_text
203218

204219
# Pattern 3b: "text" in brackets without "Exact quote:" prefix
205220
if not citation_quotes:
@@ -208,8 +223,14 @@ def _extract_quotes_from_ai_response(self, ai_response: str, user_question: str
208223

209224
for i, quote_text in enumerate(matches3b, 1):
210225
if len(quote_text.strip()) > 15: # Only substantial quotes
211-
focused_quote = self._extract_focused_quote(quote_text.strip(), ai_response, user_question)
212-
citation_quotes[i] = focused_quote
226+
quote_text = quote_text.strip()
227+
# For citation highlighting, preserve the full quote text
228+
# Only use focused extraction for very long quotes (>20 words)
229+
if len(quote_text.split()) > 20:
230+
focused_quote = self._extract_focused_quote(quote_text, ai_response, user_question)
231+
citation_quotes[i] = focused_quote
232+
else:
233+
citation_quotes[i] = quote_text
213234

214235
# Pattern 4: Any text in double quotes as fallback
215236
if not citation_quotes:
@@ -220,8 +241,13 @@ def _extract_quotes_from_ai_response(self, ai_response: str, user_question: str
220241
# Only use if it looks like a substantial quote
221242
cleaned = quote_text.strip()
222243
if len(cleaned) > 15 and not cleaned.startswith('http'):
223-
focused_quote = self._extract_focused_quote(cleaned, ai_response, user_question)
224-
citation_quotes[i] = focused_quote
244+
# For citation highlighting, preserve the full quote text
245+
# Only use focused extraction for very long quotes (>20 words)
246+
if len(cleaned.split()) > 20:
247+
focused_quote = self._extract_focused_quote(cleaned, ai_response, user_question)
248+
citation_quotes[i] = focused_quote
249+
else:
250+
citation_quotes[i] = cleaned
225251

226252
return citation_quotes
227253

0 commit comments

Comments
 (0)