1313
//---- CONFIGURATION ----
// Base URL of the sqlite-ai server (see "Get embedding from sqlite-ai-server" below).
const sqliteAIBaseUrl = "https://aiserver.vital-rhino.eks.euc1.ryujaz.sqlite.cloud";
// Path of the embeddings API on that server.
const sqliteAIAPI = "/v1/ai/embeddings";
const topKSentences = 3; // Number of top sentences to include in preview
const maxChars = 400; // Maximum total characters for preview
const gap = "[...]"; // Gap indicator string
//-----------------------
1821
// Search text supplied by the caller in the request parameters.
const query = request.params.query;

// Number of top results to return. The value is parsed as a base-10 integer;
// a missing, non-numeric, zero or negative value falls back to the default
// of 5 so that it is always safe to use in SQL LIMIT clauses and in slice().
const requestedLimit = Number.parseInt(request.params.limit, 10);
const limit = requestedLimit > 0 ? requestedLimit : 5;
2124
2225// Get embedding from sqlite-ai-server
2326const data = { "text" : query } ;
@@ -41,6 +44,7 @@ const query_fts = (query.toLowerCase().match(/\b\w+\b/g) || []).join(" ") + "*";
4144
// Vector configuration must match the embedding parameters used during database generation.
// Both embedding columns are initialised with the same settings: 'chunks' is
// initialised before the main search query below, and 'sentences' is scanned
// later by the per-result sentence query.
await connection.sql("SELECT vector_init('chunks', 'embedding', 'type=INT8,dimension=768,distance=cosine')");
await connection.sql("SELECT vector_init('sentences', 'embedding', 'type=INT8,dimension=768,distance=cosine')");
4448
4549const res = await connection . sql (
4650 `
@@ -82,9 +86,9 @@ const res = await connection.sql(
8286 SELECT
8387 documents.id,
8488 documents.uri,
85- documents.content as document_content,
8689 documents.metadata,
87- chunks.content AS snippet,
90+ chunks.id AS chunk_id,
91+ chunks.content AS chunk_content,
8892 vec_rank,
8993 fts_rank,
9094 combined_rank,
@@ -95,24 +99,100 @@ const res = await connection.sql(
9599 JOIN documents ON documents.id = chunks.document_id
96100 ORDER BY combined_rank DESC
97101 ;
98- ` , query_embedding , limit , query_fts , limit )
102+ ` , query_embedding , limit , query_fts , limit
103+ ) ;
99104
// The results from the query may contain multiple resulting chunks per document.
// We want to return one result per document. The query orders its rows by
// combined_rank DESC, so the first row seen for a document id is that
// document's top-ranked chunk; any later row for the same document is dropped.
const seenDocuments = new Set();
const uniqueDocumentRows = [];
for (const row of res) {
  if (seenDocuments.has(row.id)) {
    continue;
  }
  seenDocuments.add(row.id);
  uniqueDocumentRows.push(row);
}
// Keep at most `limit` documents.
const topResults = uniqueDocumentRows.slice(0, limit);
112+
// ----- Fetch top sentences for each top result -----
// For every selected result, look up the sentences of its chunk, ranked by
// vector distance to the query embedding, and attach them as `result.sentences`.
// At most `topKSentences` rows are returned per result.
// NOTE(review): the queries run one at a time; whether the connection can run
// several statements concurrently is not visible here, so this stays sequential.
for (const result of topResults) {
  result.sentences = await connection.sql(
    `WITH vec_matches AS (
      SELECT
        v.rowid AS sentence_id,
        row_number() OVER (ORDER BY v.distance) AS rank_number,
        v.distance
      FROM vector_quantize_scan_stream('sentences', 'embedding', ?) AS v
        JOIN sentences ON sentences.rowid = v.rowid
      WHERE sentences.chunk_id = ?
      LIMIT ?
    )
    SELECT
      sentence_id,
      -- Extract the sentence text from its parent chunk's content.
      -- substr() is 1-based, hence start_offset + 1 (offsets are presumably 0-based).
      -- The empty-string fallback uses single quotes: in SQLite a double-quoted
      -- token is an identifier, not a string literal.
      COALESCE(
        substr(chunks.content, sentences.start_offset + 1, sentences.end_offset - sentences.start_offset),
        ''
      ) AS content,
      sentences.start_offset AS sentence_start_offset,
      sentences.end_offset AS sentence_end_offset,
      rank_number,
      distance
    FROM vec_matches
      JOIN sentences ON sentences.rowid = vec_matches.sentence_id
      JOIN chunks ON chunks.id = sentences.chunk_id
    ORDER BY rank_number ASC
    ;
    `, query_embedding, result.chunk_id, topKSentences
  );
}
145+
// ----- Build snippets from sentences -----

/**
 * Builds the preview snippet for a single search result.
 *
 * Takes up to `topKSentences` sentences, puts them in the order of their
 * start offsets, joins them with single spaces and inserts the `gap` marker
 * between two sentences that are more than 10 characters apart. A preview
 * longer than `maxChars` is cut to `maxChars - 3` characters plus "...".
 * When there is no usable sentence text, the first `maxChars` characters of
 * the chunk content are returned instead.
 *
 * @param {Array<{content: string, sentence_start_offset: ?number, sentence_end_offset: ?number}>} sentences
 *   Sentence rows attached to the result; may be missing or empty.
 * @param {string} chunkContent - Content of the result's chunk, used as fallback.
 * @returns {string} The snippet text.
 */
function buildSnippet(sentences, chunkContent) {
  // Gaps of this many characters or fewer do not get a gap marker.
  const minGapChars = 10;

  // Sentences with empty text (the query's COALESCE fallback) are ignored so
  // they neither add stray spaces nor produce an empty snippet.
  const topSentences = (sentences || [])
    .slice(0, topKSentences)
    .filter((sentence) => Boolean(sentence.content));

  if (topSentences.length === 0) {
    // Fallback: no sentences, return truncated chunk content
    return (chunkContent || "").substring(0, maxChars);
  }

  // Sort by start offset to maintain document order; a missing offset sorts first.
  topSentences.sort(
    (a, b) => (a.sentence_start_offset ?? -1) - (b.sentence_start_offset ?? -1)
  );

  const previewParts = [];
  let prevEndOffset = null;

  for (const sentence of topSentences) {
    // Mark skipped text between this sentence and the previous one.
    const hasOffsets = prevEndOffset != null && sentence.sentence_start_offset != null;
    if (hasOffsets && sentence.sentence_start_offset - prevEndOffset > minGapChars) {
      previewParts.push(gap);
    }

    previewParts.push(sentence.content);
    prevEndOffset = sentence.sentence_end_offset;
  }

  const preview = previewParts.join(" ");
  return preview.length > maxChars
    ? `${preview.substring(0, maxChars - 3)}...`
    : preview;
}

for (const item of topResults) {
  item.snippet = buildSnippet(item.sentences, item.chunk_content);
}
110190
111191// ----- URLs for results -----
112192// Customize this section based on how URLs should be constructed for your documents.
113193// This example uses 'base_url' from metadata and 'slug' if available, otherwise derives from URI.
114194// ----------------------------
115- const resultsWithUrls = topResults
195+ const finalResults = topResults
116196 . map ( item => {
117197 const metadata = JSON . parse ( item . metadata ) ;
118198 const baseUrl = metadata . base_url ;
@@ -133,7 +213,7 @@ const resultsWithUrls = topResults
133213 id : item . id ,
134214 url : fullUrl ,
135215 title : metadata . extracted ?. title || metadata . generated ?. title ,
136- snippet : item . snippet ,
216+ snippet : item . snippet
137217 } ;
138218 } ) ;
139219
@@ -143,6 +223,6 @@ return {
143223 * @type {Array<{id: number, url: string, title: string, snippet: string}> }
144224 * The search results with constructed URLs, titles, and snippets.
145225 */
146- search : resultsWithUrls
226+ search : finalResults
147227 }
148228}
0 commit comments