11import json
2- import re
32import sqlite3
43from pathlib import Path
4+ from typing import List
55
66from sqlite_rag .logger import Logger
77from sqlite_rag .models .document_result import DocumentResult
8+ from sqlite_rag .models .sentence_result import SentenceResult
9+ from sqlite_rag .sentence_splitter import SentenceSplitter
810
911from .chunker import Chunker
1012from .models .document import Document
@@ -15,10 +17,17 @@ class Engine:
1517 # Considered a good default to normalize the score for RRF
1618 DEFAULT_RRF_K = 60
1719
def __init__(
    self,
    conn: sqlite3.Connection,
    settings: Settings,
    chunker: Chunker,
    sentence_chunker: SentenceSplitter,
) -> None:
    """Store the collaborators the engine works with.

    Args:
        conn: SQLite connection on which the engine executes its statements.
        settings: Configuration object read by the engine's methods.
        chunker: Document chunker kept for later use by the engine.
        sentence_chunker: Splitter whose ``split(chunk)`` yields the
            sentences of a chunk.
    """
    self._conn = conn
    self._settings = settings
    self._chunker = chunker
    self._sentence_chunker = sentence_chunker
    self._logger = Logger()
2332
2433 def load_model (self ):
@@ -30,7 +39,7 @@ def load_model(self):
3039
3140 self ._conn .execute (
3241 "SELECT llm_model_load(?, ?);" ,
33- (self ._settings .model_path , self ._settings .model_options ),
42+ (self ._settings .model_path , self ._settings .other_model_options ),
3443 )
3544
3645 def process (self , document : Document ) -> Document :
@@ -46,6 +55,11 @@ def process(self, document: Document) -> Document:
4655 chunk .title = document .get_title ()
4756 chunk .embedding = self .generate_embedding (chunk .get_embedding_text ())
4857
58+ sentences = self ._sentence_chunker .split (chunk )
59+ for sentence in sentences :
60+ sentence .embedding = self .generate_embedding (sentence .content )
61+ chunk .sentences = sentences
62+
4963 document .chunks = chunks
5064
5165 return document
@@ -72,6 +86,7 @@ def quantize(self) -> None:
7286 cursor = self ._conn .cursor ()
7387
7488 cursor .execute ("SELECT vector_quantize('chunks', 'embedding');" )
89+ cursor .execute ("SELECT vector_quantize('sentences', 'embedding');" )
7590
7691 self ._conn .commit ()
7792 self ._logger .debug ("Quantization completed." )
@@ -81,21 +96,25 @@ def quantize_preload(self) -> None:
8196 cursor = self ._conn .cursor ()
8297
8398 cursor .execute ("SELECT vector_quantize_preload('chunks', 'embedding');" )
99+ cursor .execute ("SELECT vector_quantize_preload('sentences', 'embedding');" )
84100
def quantize_cleanup(self) -> None:
    """Clean up internal structures related to previously quantized columns.

    Runs ``vector_quantize_cleanup`` for the ``embedding`` column of the
    ``chunks`` table and then of the ``sentences`` table, and commits.
    """
    cursor = self._conn.cursor()

    # Literal statements (no string building) so the SQL stays auditable.
    for statement in (
        "SELECT vector_quantize_cleanup('chunks', 'embedding');",
        "SELECT vector_quantize_cleanup('sentences', 'embedding');",
    ):
        cursor.execute(statement)

    self._conn.commit()
92109
def create_new_context(self) -> None:
    """Create a new LLM context with optional runtime overrides.

    The options passed to ``llm_context_create`` are whatever
    ``self._settings.get_embeddings_context_options()`` returns.
    """
    context_options = self._settings.get_embeddings_context_options()

    cursor = self._conn.cursor()
    cursor.execute(
        "SELECT llm_context_create(?);",
        (context_options,),
    )
100119
101120 def free_context (self ) -> None :
@@ -104,13 +123,11 @@ def free_context(self) -> None:
104123
105124 cursor .execute ("SELECT llm_context_free();" )
106125
107- def search (self , query : str , top_k : int = 10 ) -> list [DocumentResult ]:
126+ def search (
127+ self , semantic_query : str , fts_query , top_k : int = 10
128+ ) -> list [DocumentResult ]:
108129 """Semantic search and full-text search sorted with Reciprocal Rank Fusion."""
109- query_embedding = self .generate_embedding (query )
110-
111- # Clean up and split into words
112- # '*' is used to match while typing
113- query = " " .join (re .findall (r"\b\w+\b" , query .lower ())) + "*"
130+ query_embedding = self .generate_embedding (semantic_query )
114131
115132 vector_scan_type = (
116133 "vector_quantize_scan"
@@ -119,8 +136,7 @@ def search(self, query: str, top_k: int = 10) -> list[DocumentResult]:
119136 )
120137
121138 cursor = self ._conn .cursor ()
122- # TODO: understand how to sort results depending on the distance metric
123- # Eg, for cosine distance, higher is better (closer to 1)
139+
124140 cursor .execute (
125141 f"""
126142 -- sqlite-vector KNN vector search results
@@ -163,6 +179,7 @@ def search(self, query: str, top_k: int = 10) -> list[DocumentResult]:
163179 documents.uri,
164180 documents.content as document_content,
165181 documents.metadata,
182+ chunks.id AS chunk_id,
166183 chunks.content AS snippet,
167184 vec_rank,
168185 fts_rank,
@@ -176,7 +193,7 @@ def search(self, query: str, top_k: int = 10) -> list[DocumentResult]:
176193 ;
177194 """ , # nosec B608
178195 {
179- "query" : query ,
196+ "query" : fts_query ,
180197 "query_embedding" : query_embedding ,
181198 "k" : top_k ,
182199 "rrf_k" : Engine .DEFAULT_RRF_K ,
@@ -186,14 +203,15 @@ def search(self, query: str, top_k: int = 10) -> list[DocumentResult]:
186203 )
187204
188205 rows = cursor .fetchall ()
189- return [
206+ results = [
190207 DocumentResult (
191208 document = Document (
192209 id = row ["id" ],
193210 uri = row ["uri" ],
194211 content = row ["document_content" ],
195212 metadata = json .loads (row ["metadata" ]) if row ["metadata" ] else {},
196213 ),
214+ chunk_id = row ["chunk_id" ],
197215 snippet = row ["snippet" ],
198216 vec_rank = row ["vec_rank" ],
199217 fts_rank = row ["fts_rank" ],
@@ -204,6 +222,72 @@ def search(self, query: str, top_k: int = 10) -> list[DocumentResult]:
204222 for row in rows
205223 ]
206224
225+ return results
226+
def search_sentences(
    self, query: str, chunk_id: int, k: int
) -> List[SentenceResult]:
    """Rank the sentences of one chunk by vector distance to ``query``.

    Args:
        query: Text that is embedded with ``self.generate_embedding``.
        chunk_id: Only ``sentences`` rows with this ``chunk_id`` are ranked.
        k: Maximum number of sentences to return.

    Returns:
        At most ``k`` results ordered by ascending distance (rank 1 first).
        An empty list when ``k`` is not positive.
    """
    # A non-positive LIMIT means "no limit" (negative) or nothing (zero) in
    # SQLite; return early instead of computing an embedding for no result.
    if k <= 0:
        return []

    query_embedding = self.generate_embedding(query)

    # The streaming scan variants are used so the chunk filter and LIMIT
    # below can be applied in SQL.
    vector_scan_type = (
        "vector_quantize_scan_stream"
        if self._settings.quantize_scan
        else "vector_full_scan_stream"
    )

    cursor = self._conn.cursor()

    # NOTE(review): sentence_content is selected but not forwarded to
    # SentenceResult -- confirm whether the result model should carry it.
    cursor.execute(
        f"""
        WITH vec_matches AS (
            SELECT
                v.rowid AS sentence_id,
                row_number() OVER (ORDER BY v.distance) AS rank_number,
                v.distance,
                sentences.content as sentence_content,
                sentences.sequence as sentence_sequence,
                sentences.start_offset as sentence_start_offset,
                sentences.end_offset as sentence_end_offset
            FROM {vector_scan_type}('sentences', 'embedding', :query_embedding) AS v
                JOIN sentences ON sentences.rowid = v.rowid
            WHERE sentences.chunk_id = :chunk_id
            ORDER BY v.distance
            LIMIT :k
        )
        SELECT
            sentence_id,
            sentence_content,
            sentence_sequence,
            sentence_start_offset,
            sentence_end_offset,
            rank_number,
            distance
        FROM vec_matches
        ORDER BY rank_number ASC
        """,  # nosec B608
        {
            "query_embedding": query_embedding,
            "k": k,
            "chunk_id": chunk_id,
        },
    )

    # The SQL LIMIT already caps the row count at k, so no slicing is needed.
    return [
        SentenceResult(
            id=row["sentence_id"],
            chunk_id=chunk_id,
            sequence=row["sentence_sequence"],
            rank=row["rank_number"],
            distance=row["distance"],
            start_offset=row["sentence_start_offset"],
            end_offset=row["sentence_end_offset"],
        )
        for row in cursor.fetchall()
    ]
290+
207291 def versions (self ) -> dict :
208292 """Get versions of the loaded extensions."""
209293 cursor = self ._conn .cursor ()
0 commit comments