Skip to content

Commit e056f5b

Browse files
author
Daniele Briggi
committed
feat(edg-fn): support to preview with sentences
1 parent 615cfd5 commit e056f5b

File tree

1 file changed

+94
-14
lines changed

1 file changed

+94
-14
lines changed

search_edge_function_template/aisearch-docs.js

Lines changed: 94 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,14 @@
1313

1414
//---- CONFIGURATION ----
1515
const sqliteAIBaseUrl = "https://aiserver.vital-rhino.eks.euc1.ryujaz.sqlite.cloud";
16-
const sqliteAIAPI = "/v1/ai/embeddings"
16+
const sqliteAIAPI = "/v1/ai/embeddings";
17+
const topKSentences = 3; // Number of top sentences to include in preview
18+
const maxChars = 400; // Maximum total characters for preview
19+
const gap = "[...]"; // Gap indicator string
1720
//-----------------------
1821

1922
const query = request.params.query;
20-
const limit = parseInt(request.params.limit) || 10; // Number of top results to return
23+
const limit = parseInt(request.params.limit) || 5; // Number of top results to return
2124

2225
// Get embedding from sqlite-ai-server
2326
const data = {"text": query };
@@ -41,6 +44,7 @@ const query_fts = (query.toLowerCase().match(/\b\w+\b/g) || []).join(" ") + "*";
4144

4245
// Vector configuration must match the embedding parameters used during database generation
4346
await connection.sql("SELECT vector_init('chunks', 'embedding', 'type=INT8,dimension=768,distance=cosine')");
47+
await connection.sql("SELECT vector_init('sentences', 'embedding', 'type=INT8,dimension=768,distance=cosine')");
4448

4549
const res = await connection.sql(
4650
`
@@ -82,9 +86,9 @@ const res = await connection.sql(
8286
SELECT
8387
documents.id,
8488
documents.uri,
85-
documents.content as document_content,
8689
documents.metadata,
87-
chunks.content AS snippet,
90+
chunks.id AS chunk_id,
91+
chunks.content AS chunk_content,
8892
vec_rank,
8993
fts_rank,
9094
combined_rank,
@@ -95,24 +99,100 @@ const res = await connection.sql(
9599
JOIN documents ON documents.id = chunks.document_id
96100
ORDER BY combined_rank DESC
97101
;
98-
`, query_embedding, limit, query_fts, limit)
102+
`, query_embedding, limit, query_fts, limit
103+
);
99104

100105
// The results from the query may contain multiple resulting chunks per document.
101106
// We want to return one result per document, so we will group by document id and take
102107
// the top-ranked chunk; its best-matching sentences are then used to build the snippet.
103-
const documentsChunk = new Map();
104-
res.forEach(item => {
105-
if (!documentsChunk.has(item.id) || item.combined_rank > documentsChunk.get(item.id).combined_rank) {
106-
documentsChunk.set(item.id, item);
108+
const seenDocuments = new Set();
109+
const topResults = res
110+
.filter(item => !seenDocuments.has(item.id) && seenDocuments.add(item.id))
111+
.slice(0, limit);
112+
113+
// ----- Fetch top sentences for each top result -----
114+
for (const result of topResults) {
115+
result.sentences = await connection.sql(
116+
`WITH vec_matches AS (
117+
SELECT
118+
v.rowid AS sentence_id,
119+
row_number() OVER (ORDER BY v.distance) AS rank_number,
120+
v.distance
121+
FROM vector_quantize_scan_stream('sentences', 'embedding', ?) AS v
122+
JOIN sentences ON sentences.rowid = v.rowid
123+
WHERE sentences.chunk_id = ?
124+
LIMIT ?
125+
)
126+
SELECT
127+
sentence_id,
128+
-- Extract the sentence text from the chunk content using its start/end offsets
129+
COALESCE(
130+
substr(chunks.content, sentences.start_offset + 1, sentences.end_offset - sentences.start_offset),
131+
""
132+
) AS content,
133+
sentences.start_offset AS sentence_start_offset,
134+
sentences.end_offset AS sentence_end_offset,
135+
rank_number,
136+
distance
137+
FROM vec_matches
138+
JOIN sentences ON sentences.rowid = vec_matches.sentence_id
139+
JOIN chunks ON chunks.id = sentences.chunk_id
140+
ORDER BY rank_number ASC
141+
;
142+
`, query_embedding, result.chunk_id, topKSentences
143+
);
144+
}
145+
146+
// ----- Build snippets from sentences -----
147+
for (const item of topResults) {
148+
const topSentences = item.sentences ? item.sentences.slice(0, topKSentences) : [];
149+
let snippet = "";
150+
151+
if (topSentences.length === 0) {
152+
// Fallback: no sentences, return truncated chunk content
153+
const chunkContent = item.chunk_content || "";
154+
snippet = chunkContent.substring(0, maxChars);
155+
} else {
156+
// Sort by start_offset to maintain document order
157+
topSentences.sort((a, b) => {
158+
const offsetA = a.sentence_start_offset !== null ? a.sentence_start_offset : -1;
159+
const offsetB = b.sentence_start_offset !== null ? b.sentence_start_offset : -1;
160+
return offsetA - offsetB;
161+
});
162+
163+
const previewParts = [];
164+
let totalChars = 0;
165+
let prevEndOffset = null;
166+
167+
for (const sentence of topSentences) {
168+
const sentenceText = sentence.content;
169+
170+
// Check for gap between sentences
171+
if (prevEndOffset !== null && sentence.sentence_start_offset !== null) {
172+
const gapSize = sentence.sentence_start_offset - prevEndOffset;
173+
if (gapSize > 10) {
174+
previewParts.push(gap);
175+
totalChars += gap.length;
176+
}
177+
}
178+
179+
previewParts.push(sentenceText);
180+
totalChars += sentenceText.length;
181+
prevEndOffset = sentence.sentence_end_offset;
182+
}
183+
184+
const preview = previewParts.join(" ");
185+
snippet = preview.length > maxChars ? preview.substring(0, maxChars - 3) + "..." : preview;
107186
}
108-
});
109-
const topResults = Array.from(documentsChunk.values()).slice(0, limit);
187+
188+
item.snippet = snippet;
189+
}
110190

111191
// ----- URLs for results -----
112192
// Customize this section based on how URLs should be constructed for your documents.
113193
// This example uses 'base_url' from metadata and 'slug' if available, otherwise derives from URI.
114194
// ----------------------------
115-
const resultsWithUrls = topResults
195+
const finalResults = topResults
116196
.map(item => {
117197
const metadata = JSON.parse(item.metadata);
118198
const baseUrl = metadata.base_url;
@@ -133,7 +213,7 @@ const resultsWithUrls = topResults
133213
id: item.id,
134214
url: fullUrl,
135215
title: metadata.extracted?.title || metadata.generated?.title,
136-
snippet: item.snippet,
216+
snippet: item.snippet
137217
};
138218
});
139219

@@ -143,6 +223,6 @@ return {
143223
* @type {Array<{id: number, url: string, title: string, snippet: string}>}
144224
* The search results with constructed URLs, titles, and snippets.
145225
*/
146-
search: resultsWithUrls
226+
search: finalResults
147227
}
148228
}

0 commit comments

Comments
 (0)