Skip to content

Commit 187d2ec

Browse files
author
Daniele Briggi
committed
refactor(search): embed query once per search
1 parent 012b3e7 commit 187d2ec

File tree

9 files changed

+532
-196
lines changed

9 files changed

+532
-196
lines changed

src/sqlite_rag/database.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,6 @@ def _create_schema(conn: sqlite3.Connection, settings: Settings):
8888
"""
8989
)
9090

91-
# TODO: remove sequence
9291
cursor.execute(
9392
"""
9493
CREATE TABLE IF NOT EXISTS sentences (

src/sqlite_rag/engine.py

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import json
2+
import re
23
import sqlite3
34
from pathlib import Path
45
from typing import List
@@ -123,11 +124,38 @@ def free_context(self) -> None:
123124

124125
cursor.execute("SELECT llm_context_free();")
125126

126-
def search(
127-
self, semantic_query: str, fts_query, top_k: int = 10
127+
def search(self, query, top_k: int = 10) -> list[DocumentResult]:
128+
"""Semantic search and full-text search sorted with Reciprocal Rank Fusion
129+
with top matching sentences to highlight."""
130+
semantic_query = query
131+
if self._settings.use_prompt_templates:
132+
semantic_query = self._settings.prompt_template_retrieval_query.format(
133+
content=query
134+
)
135+
136+
# Clean up and split into words
137+
# '*' is used to match while typing
138+
fts_query = " ".join(re.findall(r"\b\w+\b", query.lower())) + "*"
139+
140+
query_embedding = self.generate_embedding(semantic_query)
141+
142+
results = self.search_documents(query_embedding, fts_query, top_k=top_k)
143+
144+
# Refine chunks with top sentences
145+
for result in results:
146+
result.sentences = self.search_sentences(
147+
query_embedding, result.chunk_id, top_k=self._settings.top_k_sentences
148+
)
149+
150+
return results
151+
152+
def search_documents(
153+
self, query_embedding: bytes, fts_query: str, top_k: int
128154
) -> list[DocumentResult]:
129155
"""Semantic search and full-text search sorted with Reciprocal Rank Fusion."""
130-
query_embedding = self.generate_embedding(semantic_query)
156+
# invalid query
157+
if query_embedding == b"" or fts_query.strip() == "":
158+
return []
131159

132160
vector_scan_type = (
133161
"vector_quantize_scan"
@@ -180,7 +208,7 @@ def search(
180208
documents.content as document_content,
181209
documents.metadata,
182210
chunks.id AS chunk_id,
183-
chunks.content AS snippet,
211+
chunks.content AS chunk_content,
184212
vec_rank,
185213
fts_rank,
186214
combined_rank,
@@ -212,7 +240,7 @@ def search(
212240
metadata=json.loads(row["metadata"]) if row["metadata"] else {},
213241
),
214242
chunk_id=row["chunk_id"],
215-
snippet=row["snippet"],
243+
chunk_content=row["chunk_content"],
216244
vec_rank=row["vec_rank"],
217245
fts_rank=row["fts_rank"],
218246
combined_rank=row["combined_rank"],
@@ -225,10 +253,9 @@ def search(
225253
return results
226254

227255
def search_sentences(
228-
self, query: str, chunk_id: int, top_k: int
256+
self, query_embedding: bytes, chunk_id: int, top_k: int
229257
) -> List[SentenceResult]:
230-
query_embedding = self.generate_embedding(query)
231-
258+
"""Semantic search for sentences within a chunk."""
232259
vector_scan_type = (
233260
"vector_quantize_scan_stream"
234261
if self._settings.quantize_scan

src/sqlite_rag/formatters.py

Lines changed: 7 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@
66

77
import typer
88

9-
from sqlite_rag.models.sentence_result import SentenceResult
10-
119
from .models.document_result import DocumentResult
1210

1311

@@ -82,81 +80,6 @@ def _format_uri_display(self, uri: str, icon: str, max_width: int = 75) -> str:
8280
uri_display = f"{icon} ...{uri[-available_width:]}"
8381
return uri_display
8482

85-
def _build_sentence_preview(
86-
self,
87-
chunk_content: str,
88-
sentences: List[SentenceResult],
89-
max_chars: int = 400,
90-
) -> str:
91-
"""Build preview from top 3 ranked sentences with [...] for gaps.
92-
93-
Args:
94-
chunk_content: The full chunk text
95-
sentences: List of SentenceResult objects (should already be sorted by rank)
96-
max_chars: Maximum total characters for preview
97-
98-
Returns:
99-
Preview string with top sentences and [...] separators.
100-
Falls back to truncated chunk_content if sentences have no offsets.
101-
"""
102-
103-
# Take top 3 sentences (they should already be sorted by rank/distance)
104-
top_sentences = sentences[:3] if sentences else []
105-
106-
if not top_sentences:
107-
# Fallback: no sentences, return truncated chunk content
108-
return chunk_content[:max_chars]
109-
110-
# Filter sentences that have offset information
111-
sentences_with_offsets = [
112-
s
113-
for s in top_sentences
114-
if s.start_offset is not None and s.end_offset is not None
115-
]
116-
117-
if not sentences_with_offsets:
118-
# Fallback: sentences exist but no offset information, return truncated chunk content
119-
return chunk_content[:max_chars]
120-
121-
# Sort by start_offset to maintain document order
122-
sentences_with_offsets.sort(
123-
key=lambda s: s.start_offset if s.start_offset is not None else -1
124-
)
125-
126-
preview_parts = []
127-
total_chars = 0
128-
prev_end_offset = None
129-
130-
for sentence in sentences_with_offsets:
131-
# Extract sentence text using offsets
132-
sentence_text = chunk_content[
133-
sentence.start_offset : sentence.end_offset
134-
].strip()
135-
136-
# Calculate remaining budget including potential separator
137-
separator_len = len(" [...] ") if preview_parts else 0
138-
remaining = max_chars - total_chars - separator_len
139-
140-
if remaining <= 0:
141-
break
142-
143-
# Truncate sentence if needed
144-
if len(sentence_text) > remaining:
145-
sentence_text = sentence_text[: remaining - 3] + "..."
146-
147-
# Check if there's a gap > 10 chars from previous sentence
148-
if prev_end_offset is not None and sentence.start_offset is not None:
149-
gap_size = sentence.start_offset - prev_end_offset
150-
if gap_size > 10:
151-
preview_parts.append("[...]")
152-
total_chars += len(" [...] ")
153-
154-
preview_parts.append(sentence_text)
155-
total_chars += len(sentence_text)
156-
prev_end_offset = sentence.end_offset
157-
158-
return " ".join(preview_parts)
159-
16083

16184
class BoxedFormatter(SearchResultFormatter):
16285
"""Base class for boxed result formatters."""
@@ -176,11 +99,8 @@ def _format_single_result(self, doc: DocumentResult, idx: int) -> None:
17699
"""Format a single result with box layout."""
177100
icon = self._get_file_icon(doc.document.uri or "")
178101

179-
# Use sentence-based preview if sentences are available
180-
if doc.sentences:
181-
snippet_text = self._build_sentence_preview(doc.snippet, doc.sentences)
182-
else:
183-
snippet_text = doc.snippet
102+
# Get snippet from DocumentResult (handles sentence-based preview automatically)
103+
snippet_text = doc.get_preview(max_chars=400)
184104

185105
snippet_lines = self._clean_and_wrap_snippet(
186106
snippet_text, width=75, max_length=400
@@ -250,11 +170,8 @@ def _format_single_result(self, doc: DocumentResult, idx: int) -> None:
250170
"""Format a single result with box layout including sentence summary."""
251171
icon = self._get_file_icon(doc.document.uri or "")
252172

253-
# Use sentence-based preview if sentences are available
254-
if doc.sentences:
255-
snippet_text = self._build_sentence_preview(doc.snippet, doc.sentences)
256-
else:
257-
snippet_text = doc.snippet
173+
# Get snippet from DocumentResult (handles sentence-based preview automatically)
174+
snippet_text = doc.get_preview(max_chars=400)
258175

259176
snippet_lines = self._clean_and_wrap_snippet(
260177
snippet_text, width=75, max_length=400
@@ -305,7 +222,7 @@ def _format_single_result(self, doc: DocumentResult, idx: int) -> None:
305222
sentence.start_offset is not None
306223
and sentence.end_offset is not None
307224
):
308-
sentence_text = doc.snippet[
225+
sentence_text = doc.chunk_content[
309226
sentence.start_offset : sentence.end_offset
310227
].strip()
311228
# Truncate and clean for display
@@ -364,13 +281,8 @@ def _print_table_header(self) -> None:
364281

365282
def _print_table_row(self, idx: int, doc: DocumentResult) -> None:
366283
"""Print a single table row."""
367-
# Use sentence-based preview if sentences are available
368-
if doc.sentences:
369-
snippet = self._build_sentence_preview(
370-
doc.snippet, doc.sentences, max_chars=52
371-
)
372-
else:
373-
snippet = doc.snippet
284+
# Get snippet from DocumentResult (handles sentence-based preview automatically)
285+
snippet = doc.get_preview(max_chars=52)
374286

375287
# Clean snippet display
376288
snippet = snippet.replace("\n", " ").replace("\r", "")

src/sqlite_rag/models/document_result.py

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ class DocumentResult:
99
document: Document
1010

1111
chunk_id: int
12-
snippet: str
12+
chunk_content: str
1313

1414
combined_rank: float
1515
vec_rank: float | None = None
@@ -20,3 +20,67 @@ class DocumentResult:
2020

2121
# highlight sentences
2222
sentences: list[SentenceResult] = field(default_factory=list)
23+
24+
def get_preview(
    self, top_k_sentences: int = 3, max_chars: int = 400, gap: str = "[...]"
) -> str:
    """Build a preview from the top ranked sentences, marking gaps.

    Args:
        top_k_sentences: Number of top sentences to include in the preview.
        max_chars: Maximum total characters for the preview.
        gap: Marker inserted where non-adjacent sentences are joined.

    Returns:
        Preview string with top sentences separated by the gap marker.
        Falls back to truncated chunk_content if there are no sentences
        or the sentences carry no offset information.
    """
    top_sentences = self.sentences[:top_k_sentences] if self.sentences else []

    if not top_sentences:
        # Fallback: no sentences, return truncated chunk content
        return self.chunk_content[:max_chars]

    # Keep only sentences that carry offset information
    sentences_with_offsets = [
        s
        for s in top_sentences
        if s.start_offset is not None and s.end_offset is not None
    ]

    if not sentences_with_offsets:
        # Fallback: sentences exist but no offsets, return truncated content
        return self.chunk_content[:max_chars]

    # Sort by start_offset to maintain document order
    # (offsets are guaranteed non-None by the filter above)
    sentences_with_offsets.sort(key=lambda s: s.start_offset)

    preview_parts = []
    total_chars = 0
    prev_end_offset = None

    for sentence in sentences_with_offsets:
        sentence_text = self.chunk_content[
            sentence.start_offset : sentence.end_offset
        ].strip()

        # Budget for the joining space plus the gap marker; derived from
        # `gap` so a custom marker is accounted for correctly.
        separator_len = (len(gap) + 1) if preview_parts else 0
        remaining = max_chars - total_chars - separator_len

        if remaining <= 0:
            break

        # Insert the gap marker when sentences are not adjacent
        # (more than 10 characters apart in the source chunk).
        if prev_end_offset is not None and sentence.start_offset is not None:
            gap_size = sentence.start_offset - prev_end_offset
            if gap_size > 10:
                preview_parts.append(gap)
                total_chars += len(gap)

        preview_parts.append(sentence_text)
        total_chars += len(sentence_text)
        prev_end_offset = sentence.end_offset

    preview = " ".join(preview_parts)

    # Hard cap: the budget above is approximate, so enforce max_chars here.
    return preview[: max_chars - 3] + "..." if len(preview) > max_chars else preview

src/sqlite_rag/sqliterag.py

Lines changed: 1 addition & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import re
21
import sqlite3
32
from dataclasses import asdict
43
from pathlib import Path
@@ -317,25 +316,7 @@ def search(
317316
if new_context:
318317
self._engine.create_new_context()
319318

320-
semantic_query = query
321-
if self._settings.use_prompt_templates:
322-
semantic_query = self._settings.prompt_template_retrieval_query.format(
323-
content=query
324-
)
325-
326-
# Clean up and split into words
327-
# '*' is used to match while typing
328-
fts_query = " ".join(re.findall(r"\b\w+\b", query.lower())) + "*"
329-
330-
results = self._engine.search(semantic_query, fts_query, top_k=top_k)
331-
332-
# Refine chunks with top sentences
333-
for result in results:
334-
result.sentences = self._engine.search_sentences(
335-
semantic_query, result.chunk_id, top_k=self._settings.top_k_sentences
336-
)
337-
338-
return results
319+
return self._engine.search(query, top_k=top_k)
339320

340321
def get_settings(self) -> dict:
341322
"""Get settings and more useful information"""

0 commit comments

Comments
 (0)