Skip to content

Commit 8dbae68

Browse files
author
Daniele Briggi
committed
feat(sentences): extract sentence content from sql.
Avoid fetching the entire chunk just to extract the sentence content
1 parent db93d7c commit 8dbae68

File tree

7 files changed

+456
-208
lines changed

7 files changed

+456
-208
lines changed

src/sqlite_rag/database.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -76,19 +76,19 @@ def _create_schema(conn: sqlite3.Connection, settings: Settings):
7676
)
7777

7878
# TODO: this table is not ready for sqlite-sync because its id column uses AUTOINCREMENT
79-
cursor.execute(
79+
cursor.executescript(
8080
"""
8181
CREATE TABLE IF NOT EXISTS chunks (
8282
id INTEGER PRIMARY KEY AUTOINCREMENT,
8383
document_id TEXT,
8484
content TEXT,
85-
embedding BLOB,
86-
FOREIGN KEY (document_id) REFERENCES documents (id) ON DELETE CASCADE
85+
embedding BLOB
8786
);
87+
CREATE INDEX IF NOT EXISTS idx_chunks_document_id ON chunks (document_id);
8888
"""
8989
)
9090

91-
cursor.execute(
91+
cursor.executescript(
9292
"""
9393
CREATE TABLE IF NOT EXISTS sentences (
9494
id TEXT PRIMARY KEY,
@@ -97,7 +97,8 @@ def _create_schema(conn: sqlite3.Connection, settings: Settings):
9797
embedding BLOB,
9898
start_offset INTEGER,
9999
end_offset INTEGER
100-
)
100+
);
101+
CREATE INDEX IF NOT EXISTS idx_sentences_chunk_id ON sentences (chunk_id);
101102
"""
102103
)
103104

src/sqlite_rag/engine.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -270,22 +270,27 @@ def search_sentences(
270270
SELECT
271271
v.rowid AS sentence_id,
272272
row_number() OVER (ORDER BY v.distance) AS rank_number,
273-
v.distance,
274-
sentences.start_offset as sentence_start_offset,
275-
sentences.end_offset as sentence_end_offset
273+
v.distance
276274
FROM {vector_scan_type}('sentences', 'embedding', :query_embedding) AS v
277275
JOIN sentences ON sentences.rowid = v.rowid
278276
WHERE sentences.chunk_id = :chunk_id
279-
ORDER BY rank_number ASC
280277
LIMIT :top_k
281278
)
282279
SELECT
283280
sentence_id,
284-
sentence_start_offset,
285-
sentence_end_offset,
281+
-- Extract the sentence text directly from the chunk content
282+
COALESCE(
283+
substr(chunks.content, sentences.start_offset + 1, sentences.end_offset - sentences.start_offset),
284+
""
285+
) AS content,
286+
sentences.start_offset AS sentence_start_offset,
287+
sentences.end_offset AS sentence_end_offset,
286288
rank_number,
287289
distance
288290
FROM vec_matches
291+
JOIN sentences ON sentences.rowid = vec_matches.sentence_id
292+
JOIN chunks ON chunks.id = sentences.chunk_id
293+
ORDER BY rank_number ASC
289294
""", # nosec B608
290295
{
291296
"query_embedding": query_embedding,
@@ -301,6 +306,7 @@ def search_sentences(
301306
SentenceResult(
302307
id=row["sentence_id"],
303308
chunk_id=chunk_id,
309+
content=row["content"].strip(),
304310
rank=row["rank_number"],
305311
distance=row["distance"],
306312
start_offset=row["sentence_start_offset"],

src/sqlite_rag/formatters.py

Lines changed: 81 additions & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,19 @@
22
"""Output formatters for CLI search results."""
33

44
from abc import ABC, abstractmethod
5-
from typing import List, Optional
5+
from typing import List
66

77
import typer
88

99
from .models.document_result import DocumentResult
1010

11+
# Display constants
12+
BOX_CONTENT_WIDTH = 75
13+
BOX_TOTAL_WIDTH = 77
14+
SNIPPET_MAX_LENGTH = 400
15+
SENTENCE_PREVIEW_LENGTH = 50
16+
MAX_SENTENCES_DISPLAY = 5
17+
1118

1219
class SearchResultFormatter(ABC):
1320
"""Base class for search result formatters."""
@@ -40,7 +47,10 @@ def _get_file_icon(self, uri: str) -> str:
4047
return "📄"
4148

4249
def _clean_and_wrap_snippet(
43-
self, snippet: str, width: int = 75, max_length: int = 400
50+
self,
51+
snippet: str,
52+
width: int = BOX_CONTENT_WIDTH,
53+
max_length: int = SNIPPET_MAX_LENGTH,
4454
) -> List[str]:
4555
"""Clean snippet and wrap to specified width with max length limit."""
4656
# Clean the snippet
@@ -69,7 +79,9 @@ def _clean_and_wrap_snippet(
6979

7080
return lines
7181

72-
def _format_uri_display(self, uri: str, icon: str, max_width: int = 75) -> str:
82+
def _format_uri_display(
83+
self, uri: str, icon: str, max_width: int = BOX_CONTENT_WIDTH
84+
) -> str:
7385
"""Format URI for display with icon and truncation."""
7486
if not uri:
7587
return ""
@@ -82,7 +94,15 @@ def _format_uri_display(self, uri: str, icon: str, max_width: int = 75) -> str:
8294

8395

8496
class BoxedFormatter(SearchResultFormatter):
85-
"""Base class for boxed result formatters."""
97+
"""Boxed formatter for search results with optional debug information."""
98+
99+
def __init__(self, show_debug: bool = False):
100+
"""Initialize formatter.
101+
102+
Args:
103+
show_debug: Whether to show debug information and sentence details
104+
"""
105+
self.show_debug = show_debug
86106

87107
def format_results(self, results: List[DocumentResult], query: str) -> None:
88108
if not results:
@@ -98,56 +118,39 @@ def format_results(self, results: List[DocumentResult], query: str) -> None:
98118
def _format_single_result(self, doc: DocumentResult, idx: int) -> None:
99119
"""Format a single result with box layout."""
100120
icon = self._get_file_icon(doc.document.uri or "")
121+
snippet_text = doc.get_preview(max_chars=SNIPPET_MAX_LENGTH)
122+
snippet_lines = self._clean_and_wrap_snippet(snippet_text)
101123

102-
# Get snippet from DocumentResult (handles sentence-based preview automatically)
103-
snippet_text = doc.get_preview(max_chars=400)
104-
105-
snippet_lines = self._clean_and_wrap_snippet(
106-
snippet_text, width=75, max_length=400
107-
)
108-
109-
# Draw the result box header
110-
header = f"┌─ Result #{idx} " + "─" * (67 - len(str(idx)))
124+
# Draw box header
125+
header = f"┌─ Result #{idx} " + "─" * (BOX_TOTAL_WIDTH - 10 - len(str(idx)))
111126
typer.echo(header)
112127

113-
# Display URI if available
128+
# Display URI and debug info
114129
if doc.document.uri:
115-
uri_display = self._format_uri_display(doc.document.uri, icon, 75)
116-
typer.echo(f"│ {uri_display:<75}│")
130+
uri_display = self._format_uri_display(doc.document.uri, icon)
131+
typer.echo(f"│ {uri_display:<{BOX_CONTENT_WIDTH}}│")
117132

118-
# Add debug info if needed
119-
debug_line = self._get_debug_line(doc)
120-
if debug_line:
121-
typer.echo(debug_line)
133+
if self.show_debug:
134+
self._print_debug_line(doc)
122135

123-
typer.echo("├" + "─" * 77 + "┤")
124-
elif self._should_show_debug():
125-
debug_line = self._get_debug_line(doc)
126-
if debug_line:
127-
typer.echo(debug_line)
128-
typer.echo("├" + "─" * 77 + "┤")
136+
typer.echo("├" + "─" * BOX_TOTAL_WIDTH + "┤")
137+
elif self.show_debug:
138+
self._print_debug_line(doc)
139+
typer.echo("├" + "─" * BOX_TOTAL_WIDTH + "┤")
129140

130141
# Display snippet
131142
for line in snippet_lines:
132-
typer.echo(f"│ {line:<75} │")
143+
typer.echo(f"│ {line:<{BOX_CONTENT_WIDTH}} │")
133144

134-
typer.echo("└" + "─" * 77 + "┘")
135-
typer.echo()
145+
# Display sentence details in debug mode
146+
if self.show_debug and doc.sentences:
147+
self._print_sentence_details(doc)
136148

137-
def _get_debug_line(self, doc: DocumentResult) -> Optional[str]:
138-
"""Get debug information line. Override in subclasses."""
139-
return None
140-
141-
def _should_show_debug(self) -> bool:
142-
"""Whether to show debug information. Override in subclasses."""
143-
return False
144-
145-
146-
class BoxedDebugFormatter(BoxedFormatter):
147-
"""Modern detailed formatter with debug information in boxes."""
149+
typer.echo("└" + "─" * BOX_TOTAL_WIDTH + "┘")
150+
typer.echo()
148151

149-
def _get_debug_line(self, doc: DocumentResult) -> str:
150-
"""Format debug metrics line."""
152+
def _print_debug_line(self, doc: DocumentResult) -> None:
153+
"""Print debug metrics line."""
151154
combined = (
152155
f"{doc.combined_rank:.5f}" if doc.combined_rank is not None else "N/A"
153156
)
@@ -161,88 +164,36 @@ def _get_debug_line(self, doc: DocumentResult) -> str:
161164
if doc.fts_rank is not None
162165
else "N/A"
163166
)
164-
return f"│ Combined: {combined} │ Vector: {vec_info} │ FTS: {fts_info}"
165-
166-
def _should_show_debug(self) -> bool:
167-
return True
168-
169-
def _format_single_result(self, doc: DocumentResult, idx: int) -> None:
170-
"""Format a single result with box layout including sentence summary."""
171-
icon = self._get_file_icon(doc.document.uri or "")
172-
173-
# Get snippet from DocumentResult (handles sentence-based preview automatically)
174-
snippet_text = doc.get_preview(max_chars=400)
175-
176-
snippet_lines = self._clean_and_wrap_snippet(
177-
snippet_text, width=75, max_length=400
178-
)
167+
debug_line = f"│ Combined: {combined} │ Vector: {vec_info} │ FTS: {fts_info}"
168+
typer.echo(debug_line)
179169

180-
# Draw the result box header
181-
header = f"┌─ Result #{idx} " + "─" * (67 - len(str(idx)))
182-
typer.echo(header)
170+
def _print_sentence_details(self, doc: DocumentResult) -> None:
171+
"""Print sentence-level details."""
172+
typer.echo("├" + "─" * BOX_TOTAL_WIDTH + "┤")
173+
typer.echo(f"│ Sentences:{' ' * (BOX_CONTENT_WIDTH - 10)}│")
183174

184-
# Display URI if available
185-
if doc.document.uri:
186-
uri_display = self._format_uri_display(doc.document.uri, icon, 75)
187-
typer.echo(f"│ {uri_display:<75}│")
188-
189-
# Add debug info
190-
debug_line = self._get_debug_line(doc)
191-
if debug_line:
192-
typer.echo(debug_line)
193-
194-
typer.echo("├" + "─" * 77 + "┤")
195-
elif self._should_show_debug():
196-
debug_line = self._get_debug_line(doc)
197-
if debug_line:
198-
typer.echo(debug_line)
199-
typer.echo("├" + "─" * 77 + "┤")
200-
201-
# Display snippet preview
202-
for line in snippet_lines:
203-
typer.echo(f"│ {line:<75} │")
204-
205-
# Display sentence details if available
206-
if doc.sentences:
207-
typer.echo("├" + "─" * 77 + "┤")
208-
typer.echo(
209-
"│ Sentences: │"
175+
for sentence in doc.sentences[:MAX_SENTENCES_DISPLAY]:
176+
distance_str = (
177+
f"{sentence.distance:.6f}" if sentence.distance is not None else "N/A"
210178
)
211-
212-
for sentence in doc.sentences[:5]: # Show max 5 sentences
213-
distance_str = (
214-
f"{sentence.distance:.6f}"
215-
if sentence.distance is not None
216-
else "N/A"
217-
)
218-
rank_str = f"#{sentence.rank}" if sentence.rank is not None else "N/A"
219-
220-
# Extract sentence preview (first 50 chars)
221-
if (
222-
sentence.start_offset is not None
223-
and sentence.end_offset is not None
224-
):
225-
sentence_text = doc.chunk_content[
226-
sentence.start_offset : sentence.end_offset
227-
].strip()
228-
# Truncate and clean for display
229-
sentence_preview = sentence_text.replace("\n", " ").replace(
230-
"\r", ""
179+
rank_str = f"#{sentence.rank}" if sentence.rank is not None else "N/A"
180+
181+
# Extract sentence preview
182+
if sentence.start_offset is not None and sentence.end_offset is not None:
183+
sentence_text = doc.chunk_content[
184+
sentence.start_offset : sentence.end_offset
185+
].strip()
186+
sentence_preview = sentence_text.replace("\n", " ").replace("\r", "")
187+
if len(sentence_preview) > SENTENCE_PREVIEW_LENGTH:
188+
sentence_preview = (
189+
sentence_preview[: SENTENCE_PREVIEW_LENGTH - 3] + "..."
231190
)
232-
if len(sentence_preview) > 50:
233-
sentence_preview = sentence_preview[:47] + "..."
234-
else:
235-
sentence_preview = "[No offset info]"
236-
237-
# Format sentence line
238-
sentence_line = (
239-
f"│ {rank_str:>3} ({distance_str}) | {sentence_preview}"
240-
)
241-
# Pad to 78 chars and add closing border
242-
typer.echo(sentence_line.ljust(78) + " │")
243-
244-
typer.echo("└" + "─" * 77 + "┘")
245-
typer.echo()
191+
else:
192+
sentence_preview = "[No offset info]"
193+
194+
# Format and print sentence line
195+
sentence_line = f"│ {rank_str:>3} ({distance_str}) | {sentence_preview}"
196+
typer.echo(sentence_line.ljust(BOX_TOTAL_WIDTH + 1) + " │")
246197

247198

248199
class TableDebugFormatter(SearchResultFormatter):
@@ -312,10 +263,15 @@ def _print_table_row(self, idx: int, doc: DocumentResult) -> None:
312263
def get_formatter(
313264
debug: bool = False, table_view: bool = False
314265
) -> SearchResultFormatter:
315-
"""Factory function to get the appropriate formatter."""
266+
"""Factory function to get the appropriate formatter.
267+
268+
Args:
269+
debug: Show debug information and sentence details
270+
table_view: Use table format instead of boxed format
271+
272+
Returns:
273+
SearchResultFormatter instance
274+
"""
316275
if table_view:
317276
return TableDebugFormatter()
318-
elif debug:
319-
return BoxedDebugFormatter()
320-
else:
321-
return BoxedFormatter()
277+
return BoxedFormatter(show_debug=debug)

0 commit comments

Comments
 (0)