Skip to content

Commit e7b82f8

Browse files
author
Daniele Briggi
committed
feat(sentences): introduce sentences to improve results preview
refact(settings): extensions options are generated by a setting method chore(settings): - default chunk_size equals to the model context window - increase FTS weight
1 parent 01a860c commit e7b82f8

16 files changed

+560
-43
lines changed

src/sqlite_rag/cli.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -446,6 +446,11 @@ def search(
446446
"--debug",
447447
help="Print extra debug information with modern formatting",
448448
),
449+
debug2: bool = typer.Option(
450+
False,
451+
"--debug2",
452+
help="Print debug format with sentence-level details and snippet context",
453+
),
449454
peek: bool = typer.Option(
450455
False, "--peek", help="Print debug information using compact table format"
451456
),
@@ -462,7 +467,7 @@ def search(
462467
results = results[:limit]
463468

464469
# Get the appropriate formatter and display results
465-
formatter = get_formatter(debug=debug, table_view=peek)
470+
formatter = get_formatter(debug=debug, debug2=debug2, table_view=peek)
466471
formatter.format_results(results, query)
467472

468473
typer.echo(f"{search_time:.3f} seconds")

src/sqlite_rag/database.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,16 +88,39 @@ def _create_schema(conn: sqlite3.Connection, settings: Settings):
8888
"""
8989
)
9090

91+
# TODO: remove sequence
92+
cursor.execute(
93+
"""
94+
CREATE TABLE IF NOT EXISTS sentences (
95+
id TEXT PRIMARY KEY,
96+
chunk_id INTEGER,
97+
content TEXT,
98+
embedding BLOB,
99+
sequence INTEGER,
100+
start_offset INTEGER,
101+
end_offset INTEGER
102+
)
103+
"""
104+
)
105+
91106
cursor.execute(
92107
"""
93108
CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(content, content='chunks', content_rowid='id');
94109
"""
95110
)
96111

97112
cursor.execute(
98-
f"""
99-
SELECT vector_init('chunks', 'embedding', 'type={settings.vector_type},dimension={settings.embedding_dim},{settings.other_vector_options}');
100-
"""
113+
"""
114+
SELECT vector_init('chunks', 'embedding', ?);
115+
""",
116+
(settings.get_vector_init_options(),),
117+
)
118+
# TODO: same configuration as chunks (or different options?)
119+
cursor.execute(
120+
"""
121+
SELECT vector_init('sentences', 'embedding', ?);
122+
""",
123+
(settings.get_vector_init_options(),),
101124
)
102125

103126
conn.commit()

src/sqlite_rag/engine.py

Lines changed: 99 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
import json
2-
import re
32
import sqlite3
43
from pathlib import Path
4+
from typing import List
55

66
from sqlite_rag.logger import Logger
77
from sqlite_rag.models.document_result import DocumentResult
8+
from sqlite_rag.models.sentence_result import SentenceResult
9+
from sqlite_rag.sentence_splitter import SentenceSplitter
810

911
from .chunker import Chunker
1012
from .models.document import Document
@@ -15,10 +17,17 @@ class Engine:
1517
# Considered a good default to normalize the score for RRF
1618
DEFAULT_RRF_K = 60
1719

18-
def __init__(self, conn: sqlite3.Connection, settings: Settings, chunker: Chunker):
20+
def __init__(
21+
self,
22+
conn: sqlite3.Connection,
23+
settings: Settings,
24+
chunker: Chunker,
25+
sentence_chunker: SentenceSplitter,
26+
):
1927
self._conn = conn
2028
self._settings = settings
2129
self._chunker = chunker
30+
self._sentence_chunker = sentence_chunker
2231
self._logger = Logger()
2332

2433
def load_model(self):
@@ -30,7 +39,7 @@ def load_model(self):
3039

3140
self._conn.execute(
3241
"SELECT llm_model_load(?, ?);",
33-
(self._settings.model_path, self._settings.model_options),
42+
(self._settings.model_path, self._settings.other_model_options),
3443
)
3544

3645
def process(self, document: Document) -> Document:
@@ -46,6 +55,11 @@ def process(self, document: Document) -> Document:
4655
chunk.title = document.get_title()
4756
chunk.embedding = self.generate_embedding(chunk.get_embedding_text())
4857

58+
sentences = self._sentence_chunker.split(chunk)
59+
for sentence in sentences:
60+
sentence.embedding = self.generate_embedding(sentence.content)
61+
chunk.sentences = sentences
62+
4963
document.chunks = chunks
5064

5165
return document
@@ -72,6 +86,7 @@ def quantize(self) -> None:
7286
cursor = self._conn.cursor()
7387

7488
cursor.execute("SELECT vector_quantize('chunks', 'embedding');")
89+
cursor.execute("SELECT vector_quantize('sentences', 'embedding');")
7590

7691
self._conn.commit()
7792
self._logger.debug("Quantization completed.")
@@ -81,21 +96,25 @@ def quantize_preload(self) -> None:
8196
cursor = self._conn.cursor()
8297

8398
cursor.execute("SELECT vector_quantize_preload('chunks', 'embedding');")
99+
cursor.execute("SELECT vector_quantize_preload('sentences', 'embedding');")
84100

85101
def quantize_cleanup(self) -> None:
86102
"""Clean up internal structures related to a previously quantized table/column."""
87103
cursor = self._conn.cursor()
88104

89105
cursor.execute("SELECT vector_quantize_cleanup('chunks', 'embedding');")
106+
cursor.execute("SELECT vector_quantize_cleanup('sentences', 'embedding');")
90107

91108
self._conn.commit()
92109

93110
def create_new_context(self) -> None:
94-
""""""
111+
"""Create a new LLM context with optional runtime overrides."""
95112
cursor = self._conn.cursor()
113+
context_options = self._settings.get_embeddings_context_options()
96114

97115
cursor.execute(
98-
"SELECT llm_context_create(?);", (self._settings.model_context_options,)
116+
"SELECT llm_context_create(?);",
117+
(context_options,),
99118
)
100119

101120
def free_context(self) -> None:
@@ -104,13 +123,11 @@ def free_context(self) -> None:
104123

105124
cursor.execute("SELECT llm_context_free();")
106125

107-
def search(self, query: str, top_k: int = 10) -> list[DocumentResult]:
126+
def search(
127+
self, semantic_query: str, fts_query, top_k: int = 10
128+
) -> list[DocumentResult]:
108129
"""Semantic search and full-text search sorted with Reciprocal Rank Fusion."""
109-
query_embedding = self.generate_embedding(query)
110-
111-
# Clean up and split into words
112-
# '*' is used to match while typing
113-
query = " ".join(re.findall(r"\b\w+\b", query.lower())) + "*"
130+
query_embedding = self.generate_embedding(semantic_query)
114131

115132
vector_scan_type = (
116133
"vector_quantize_scan"
@@ -119,8 +136,7 @@ def search(self, query: str, top_k: int = 10) -> list[DocumentResult]:
119136
)
120137

121138
cursor = self._conn.cursor()
122-
# TODO: understand how to sort results depending on the distance metric
123-
# Eg, for cosine distance, higher is better (closer to 1)
139+
124140
cursor.execute(
125141
f"""
126142
-- sqlite-vector KNN vector search results
@@ -163,6 +179,7 @@ def search(self, query: str, top_k: int = 10) -> list[DocumentResult]:
163179
documents.uri,
164180
documents.content as document_content,
165181
documents.metadata,
182+
chunks.id AS chunk_id,
166183
chunks.content AS snippet,
167184
vec_rank,
168185
fts_rank,
@@ -176,7 +193,7 @@ def search(self, query: str, top_k: int = 10) -> list[DocumentResult]:
176193
;
177194
""", # nosec B608
178195
{
179-
"query": query,
196+
"query": fts_query,
180197
"query_embedding": query_embedding,
181198
"k": top_k,
182199
"rrf_k": Engine.DEFAULT_RRF_K,
@@ -186,14 +203,15 @@ def search(self, query: str, top_k: int = 10) -> list[DocumentResult]:
186203
)
187204

188205
rows = cursor.fetchall()
189-
return [
206+
results = [
190207
DocumentResult(
191208
document=Document(
192209
id=row["id"],
193210
uri=row["uri"],
194211
content=row["document_content"],
195212
metadata=json.loads(row["metadata"]) if row["metadata"] else {},
196213
),
214+
chunk_id=row["chunk_id"],
197215
snippet=row["snippet"],
198216
vec_rank=row["vec_rank"],
199217
fts_rank=row["fts_rank"],
@@ -204,6 +222,72 @@ def search(self, query: str, top_k: int = 10) -> list[DocumentResult]:
204222
for row in rows
205223
]
206224

225+
return results
226+
227+
def search_sentences(
228+
self, query: str, chunk_id: int, k: int
229+
) -> List[SentenceResult]:
230+
query_embedding = self.generate_embedding(query)
231+
232+
vector_scan_type = (
233+
"vector_quantize_scan_stream"
234+
if self._settings.quantize_scan
235+
else "vector_full_scan_stream"
236+
)
237+
238+
cursor = self._conn.cursor()
239+
240+
cursor.execute(
241+
f"""
242+
WITH vec_matches AS (
243+
SELECT
244+
v.rowid AS sentence_id,
245+
row_number() OVER (ORDER BY v.distance) AS rank_number,
246+
v.distance,
247+
sentences.content as sentence_content,
248+
sentences.sequence as sentence_sequence,
249+
sentences.start_offset as sentence_start_offset,
250+
sentences.end_offset as sentence_end_offset
251+
FROM {vector_scan_type}('sentences', 'embedding', :query_embedding) AS v
252+
JOIN sentences ON sentences.rowid = v.rowid
253+
WHERE sentences.chunk_id = :chunk_id
254+
LIMIT :k
255+
)
256+
SELECT
257+
sentence_id,
258+
sentence_content,
259+
sentence_sequence,
260+
sentence_start_offset,
261+
sentence_end_offset,
262+
rank_number,
263+
distance
264+
FROM vec_matches
265+
ORDER BY rank_number ASC
266+
""", # nosec B608
267+
{
268+
"query_embedding": query_embedding,
269+
"k": k,
270+
"chunk_id": chunk_id,
271+
},
272+
)
273+
274+
rows = cursor.fetchall()
275+
sentences = []
276+
for row in rows:
277+
sentences.append(
278+
SentenceResult(
279+
id=row["sentence_id"],
280+
chunk_id=chunk_id,
281+
sequence=row["sentence_sequence"],
282+
rank=row["rank_number"],
283+
distance=row["distance"],
284+
start_offset=row["sentence_start_offset"],
285+
end_offset=row["sentence_end_offset"],
286+
)
287+
)
288+
289+
return sentences[:k]
290+
207291
def versions(self) -> dict:
208292
"""Get versions of the loaded extensions."""
209293
cursor = self._conn.cursor()

0 commit comments

Comments (0)