Skip to content

Commit a1003a5

Browse files
author
Daniele Briggi
committed
feat: support selecting/excluding files by extension
chore(edge-fn): update
1 parent 180a1e7 commit a1003a5

File tree

3 files changed

+79
-31
lines changed

3 files changed

+79
-31
lines changed

.github/workflows/test.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,5 @@ jobs:
1818
base_url: https://docs.sqlitecloud.io/docs/
1919
database_name: aisearch-action-test.sqlite
2020
# only a few files for testing
21-
source_files: docs/sqlite-cloud/sdks/php
21+
source_files: docs/sqlite-cloud/sdks/js
22+
only_extensions: "md"

action.yaml

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,15 @@ inputs:
1515
source_files:
1616
description: The path of the files, by default it will parse every file recursively starting from the working directory.
1717
required: false
18-
default: $(pwd)
18+
default: "./"
19+
only_extensions:
20+
description: Comma-separated list of file extensions to include (e.g., "md,txt,html"). If not set, all supported file types are included.
21+
required: false
22+
default: ""
23+
exclude_extensions:
24+
description: Comma-separated list of file extensions to exclude (e.g., "js,jsx"). If not set, no file types are excluded.
25+
required: false
26+
default: ""
1927
hf_model_id:
2028
description: The Hugging Face model ID to use for generating embeddings.
2129
required: false
@@ -79,7 +87,9 @@ runs:
7987
sqlite-rag add \
8088
--recursive "${{ inputs.source_files }}" \
8189
--metadata '{"base_url": "${{ inputs.base_url }}"}' \
82-
--relative-paths
90+
--relative-paths \
91+
--only "${{ inputs.only_extensions }}" \
92+
--exclude "${{ inputs.exclude_extensions }}"
8393
shell: bash
8494

8595
- name: Upload the database to SQLite Cloud

search_edge_function_template/aisearch-docs.js

Lines changed: 65 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@ const sqliteAIBaseUrl = "https://aiserver.vital-rhino.eks.euc1.ryujaz.sqlite.clo
1616
const sqliteAIAPI = "/v1/ai/embeddings"
1717
//-----------------------
1818

19-
const requestid = request.params.requestid;
2019
const query = request.params.query;
20+
const limit = parseInt(request.params.limit) || 10; // Number of top results to return
2121

22-
// get embedding from sqlite-ai-server
22+
// Get embedding from sqlite-ai-server
2323
const data = {"text": query };
2424
const response = await fetch(sqliteAIBaseUrl + sqliteAIAPI, {
2525
method: "POST",
@@ -36,26 +36,21 @@ if (!response.ok) {
3636
const result = await response.json();
3737
const query_embedding = result.data.embedding;
3838

39-
// clean query for full-text search
39+
// Clean query for full-text search
4040
const query_fts = (query.toLowerCase().match(/\b\w+\b/g) || []).join(" ") + "*";
4141

42-
// --- TEST ---
43-
//const test_embedding = await connection.sql('SELECT embedding FROM chunks LIMIT 1;');
44-
//const query_embedding = test_embedding[0].embedding;
45-
// ------------
46-
4742
// Vector configuration must match the embedding parameters used during database generation
4843
await connection.sql("SELECT vector_init('chunks', 'embedding', 'type=INT8,dimension=768,distance=cosine')");
4944

5045
const res = await connection.sql(
5146
`
52-
-- sqlite-vector KNN vector search results
47+
-- sqlite-vector KNN vector search results
5348
WITH vec_matches AS (
5449
SELECT
5550
v.rowid AS chunk_id,
5651
row_number() OVER (ORDER BY v.distance) AS rank_number,
5752
v.distance
58-
FROM vector_quantize_scan('chunks', 'embedding', ?, 10) AS v
53+
FROM vector_quantize_scan('chunks', 'embedding', ?, ?) AS v
5954
),
6055
-- Full-text search results
6156
fts_matches AS (
@@ -65,7 +60,7 @@ const res = await connection.sql(
6560
rank AS score
6661
FROM chunks_fts
6762
WHERE chunks_fts MATCH ?
68-
LIMIT 10
63+
LIMIT ?
6964
),
7065
-- combine FTS5 + vector search results with RRF
7166
matches AS (
@@ -84,28 +79,70 @@ const res = await connection.sql(
8479
FULL OUTER JOIN fts_matches
8580
ON vec_matches.chunk_id = fts_matches.chunk_id
8681
)
87-
SELECT
88-
documents.id,
89-
documents.uri,
90-
documents.content as document_content,
91-
documents.metadata,
92-
chunks.content AS snippet,
93-
vec_rank,
94-
fts_rank,
95-
combined_rank,
96-
vec_distance,
97-
fts_score
98-
FROM matches
99-
JOIN chunks ON chunks.id = matches.chunk_id
100-
JOIN documents ON documents.id = chunks.document_id
82+
SELECT
83+
documents.id,
84+
documents.uri,
85+
documents.content as document_content,
86+
documents.metadata,
87+
chunks.content AS snippet,
88+
vec_rank,
89+
fts_rank,
90+
combined_rank,
91+
vec_distance,
92+
fts_score
93+
FROM matches
94+
JOIN chunks ON chunks.id = matches.chunk_id
95+
JOIN documents ON documents.id = chunks.document_id
10196
ORDER BY combined_rank DESC
10297
;
103-
`, query_embedding, query_fts)
98+
`, query_embedding, limit, query_fts, limit)
99+
100+
// The results from the query may contain multiple resulting chunks per document.
101+
// We want to return one result per document, so we will group by document id and take
102+
// the top-ranked chunk as a snippet.
103+
const documentsChunk = new Map();
104+
res.forEach(item => {
105+
if (!documentsChunk.has(item.id) || item.combined_rank > documentsChunk.get(item.id).combined_rank) {
106+
documentsChunk.set(item.id, item);
107+
}
108+
});
109+
const topResults = Array.from(documentsChunk.values()).slice(0, limit);
104110

111+
// ----- URLs for results -----
112+
// Customize this section based on how URLs should be constructed for your documents.
113+
// This example appends the metadata 'slug' to the metadata 'base_url' when a slug is available; otherwise it appends the lowercased URI with its .md/.mdx extension removed.
114+
// ----------------------------
115+
const resultsWithUrls = topResults
116+
.map(item => {
117+
const metadata = JSON.parse(item.metadata);
118+
const baseUrl = metadata.base_url;
119+
const slug = metadata.extracted?.slug;
120+
const uri = item.uri;
121+
122+
let fullUrl;
123+
if (slug) {
124+
fullUrl = `${baseUrl}${slug}`;
125+
} else {
126+
const uriWithoutExtension = uri
127+
.toLowerCase()
128+
.replace(/\.(mdx?|md)$/i, '');
129+
fullUrl = `${baseUrl}${uriWithoutExtension}`;
130+
}
131+
132+
return {
133+
id: item.id,
134+
url: fullUrl,
135+
title: metadata.extracted?.title || metadata.generated?.title,
136+
snippet: item.snippet,
137+
};
138+
});
105139

106140
return {
107141
data: {
108-
search: res,
109-
requestid: requestid
142+
/**
143+
* @type {Array<{id: number, url: string, title: string, snippet: string}>}
144+
* The search results with constructed URLs, titles, and snippets.
145+
*/
146+
search: resultsWithUrls
110147
}
111148
}

0 commit comments

Comments
 (0)