Skip to content

Commit d0d4541

Browse files
authored
Merge pull request #303 from guillesd/queriable_search_index
add queriable search index for ducklake docs
2 parents f559013 + 546ce10 commit d0d4541

File tree

7 files changed

+358
-8
lines changed

7 files changed

+358
-8
lines changed

.github/workflows/lint.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,4 +30,4 @@ jobs:
3030
with:
3131
options: "--check --skip-string-normalization --verbose --diff --color"
3232
src: "./scripts"
33-
version: "24.8.0"
33+
version: "26.3.1"

_plugins/search_index.rb

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
include Jekyll
22

3-
# this file is used to run the external python script to generate the search index
4-
# we only want to run this script once, so we use the `:after_init` hook to run the script
3+
# this file is used to run the external python scripts to generate search indexes
4+
# we only want to run these scripts once, so we use the `:after_init` hook
55

66
Jekyll::Hooks.register :site, :after_init do |page|
77
tag = 'Search index:'
@@ -14,4 +14,12 @@
1414
Jekyll.logger.error(tag, "Failed to generate index")
1515
end
1616

17+
Jekyll.logger.info(tag, "Generating DuckDB search index")
18+
19+
if system "python3 scripts/generate_search_index.py --validate"
20+
Jekyll.logger.info(tag, "DuckDB search index generated")
21+
else
22+
Jekyll.logger.error(tag, "Failed to generate DuckDB search index")
23+
end
24+
1725
end

data/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
search_data.json
22
duckdb-releases.csv
3+
docs-search.duckdb

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
duckdb
12
marko # for generating search index
23
PyYAML
34
python-frontmatter

scripts/generate_search.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import json
55
from textwrap import shorten
66

7-
87
SKIP_TYPES = [marko.block.HTMLBlock, marko.inline.Image, marko.inline.InlineHTML]
98

109

scripts/generate_search_index.py

Lines changed: 343 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,343 @@
1+
"""
2+
Generate a DuckDB search index for the DuckLake documentation.
3+
4+
Parses all markdown files under docs/stable/ and docs/preview/,
5+
chunks them (per-H2 section), and builds a DuckDB file with a
6+
pre-built FTS index.
7+
8+
Usage:
9+
python scripts/generate_search_index.py [--output PATH] [--validate]
10+
"""
11+
12+
import argparse
13+
import os
14+
import re
15+
import sys
16+
17+
import duckdb
18+
import frontmatter
19+
20+
# ---------------------------------------------------------------------------
21+
# Anchor / slug generation (matches Kramdown defaults)
22+
# ---------------------------------------------------------------------------
23+
24+
25+
def slugify(text):
    """Turn heading text into its Kramdown-style anchor slug."""
    lowered = text.lower()
    cleaned = re.sub(r'[^\w\s-]', '', lowered)   # drop punctuation, keep - and _
    hyphenated = re.sub(r'\s+', '-', cleaned)    # collapse whitespace runs into hyphens
    return hyphenated.strip('-')
32+
33+
34+
# ---------------------------------------------------------------------------
35+
# Breadcrumb from file path
36+
# ---------------------------------------------------------------------------
37+
38+
# Path segments whose breadcrumb label is a fixed acronym/brand spelling,
# overriding the default `str.title()` prettification in breadcrumb_segment.
BREADCRUMB_RENAMES = {
    'sql': 'SQL',
    'duckdb': 'DuckDB',
    'ducklake': 'DuckLake',
    'fts': 'FTS',
    'api': 'API',
}
45+
46+
47+
def breadcrumb_segment(segment):
    """Return the human-readable breadcrumb label for one path segment."""
    try:
        # Known acronyms/brands get their fixed spelling.
        return BREADCRUMB_RENAMES[segment]
    except KeyError:
        return segment.replace('_', ' ').title()
52+
53+
54+
def build_breadcrumb(filepath, version):
    """Turn a docs file path into a ' > ' separated breadcrumb string.

    e.g. docs/stable/duckdb/usage/connecting.md -> DuckDB > Usage > Connecting
    """
    trimmed = filepath.removeprefix(f'docs/{version}/').removesuffix('.md')
    segments = trimmed.split('/')
    # Landing pages add nothing beyond their directory name.
    if segments and segments[-1] in ('index', 'overview'):
        segments.pop()
    return ' > '.join(map(breadcrumb_segment, segments))
64+
65+
66+
# ---------------------------------------------------------------------------
67+
# Chunking helpers
68+
# ---------------------------------------------------------------------------
69+
70+
71+
def page_slug(filepath, version):
    """Reduce a docs file path to its version-relative slug (no extension)."""
    without_prefix = filepath.removeprefix(f'docs/{version}/')
    return without_prefix.removesuffix('.md')
74+
75+
76+
def make_unique_anchor(anchor, seen_anchors):
    """Return a page-unique anchor, numbering repeats like Kramdown does."""
    count = seen_anchors.get(anchor)
    if count is None:
        # First occurrence keeps the plain anchor.
        seen_anchors[anchor] = 0
        return anchor
    seen_anchors[anchor] = count + 1
    return f'{anchor}-{count + 1}'
83+
84+
85+
# ---------------------------------------------------------------------------
86+
# Chunking
87+
# ---------------------------------------------------------------------------
88+
89+
90+
def chunk_page(filepath, version, title, body):
    """Break a docs page into search chunks, one per `##` heading.

    Content ahead of the first H2 — or the whole page when it has no H2
    headings — becomes an un-anchored chunk pointing at the page itself.
    """
    base_url = '/' + filepath.removesuffix('.md')
    slug = page_slug(filepath, version)
    breadcrumb = build_breadcrumb(filepath, version)
    seen_anchors = {}

    def make(section, anchor, text):
        # One search record; anchor is None for the page-level chunk.
        frag = f'#{anchor}' if anchor else ''
        return {
            'chunk_id': f'{version}/{slug}{frag}',
            'page_title': title,
            'section': section,
            'breadcrumb': breadcrumb,
            'url': base_url + frag,
            'version': version,
            'text': text,
        }

    pieces = re.split(r'^(##)\s+(.+)$', body, flags=re.MULTILINE)

    # No H2 headings at all: the entire page is a single chunk.
    if len(pieces) == 1:
        return [make(None, None, body.strip())]

    chunks = []

    # Intro chunk: whatever precedes the first H2, if non-empty.
    intro = pieces[0].strip()
    if intro:
        chunks.append(make(None, None, intro))

    # After pieces[0], re.split yields (marker, heading, content) triples.
    for offset in range(1, len(pieces), 3):
        heading = pieces[offset + 1].strip()
        content = pieces[offset + 2].strip() if offset + 2 < len(pieces) else ''
        anchor = make_unique_anchor(slugify(heading), seen_anchors)
        text = f'## {heading}\n\n{content}' if content else f'## {heading}'
        chunks.append(make(heading, anchor, text))

    return chunks
156+
157+
158+
# ---------------------------------------------------------------------------
159+
# File processing
160+
# ---------------------------------------------------------------------------
161+
162+
163+
def process_file(filepath, version):
    """Parse one markdown file and return its search chunks.

    Pages whose front matter has no `title` key yield no chunks.

    Returns a list of chunk dicts (possibly empty).
    """
    # Explicit encoding: the platform default (e.g. cp1252 on Windows) would
    # mis-decode non-ASCII characters in the docs.
    with open(filepath, 'r', encoding='utf-8') as f:
        post = frontmatter.load(f)

    title = post.get('title', '')
    if not title:
        return []

    body = post.content
    return chunk_page(filepath, version, title, body)
174+
175+
176+
def collect_chunks(docs_dir, version):
    """Walk one docs version directory and gather chunks from every .md file."""
    collected = []
    for root, _dirs, files in os.walk(docs_dir):
        markdown_files = sorted(f for f in files if f.endswith('.md'))
        for fname in markdown_files:
            full_path = os.path.join(root, fname)
            rel_path = os.path.relpath(full_path, '.').replace(os.sep, '/')
            try:
                collected.extend(process_file(rel_path, version))
            except Exception as e:
                # Best-effort: one broken page must not abort the whole build.
                print(f' Warning: failed to process {rel_path}: {e}')
    return collected
190+
191+
192+
# ---------------------------------------------------------------------------
193+
# DuckDB index building
194+
# ---------------------------------------------------------------------------
195+
196+
197+
def build_duckdb(chunks, output_path):
    """Create the DuckDB file with the docs_chunks table and an FTS index.

    Any existing file at output_path is removed first so stale rows never
    survive a rebuild.
    """
    if os.path.exists(output_path):
        os.remove(output_path)

    con = duckdb.connect(output_path)
    # try/finally so the database handle is released even when a statement
    # fails — the original leaked the connection (and the .duckdb file lock)
    # on any execute error.
    try:
        con.execute("""
            CREATE TABLE docs_chunks (
                chunk_id VARCHAR PRIMARY KEY,
                page_title VARCHAR NOT NULL,
                section VARCHAR,
                breadcrumb VARCHAR,
                url VARCHAR NOT NULL,
                version VARCHAR NOT NULL,
                text TEXT NOT NULL
            )
        """)

        con.executemany(
            """
            INSERT INTO docs_chunks (chunk_id, page_title, section, breadcrumb, url, version, text)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            [
                (
                    c['chunk_id'],
                    c['page_title'],
                    c['section'],
                    c['breadcrumb'],
                    c['url'],
                    c['version'],
                    c['text'],
                )
                for c in chunks
            ],
        )

        row_count = con.execute('SELECT count(*) FROM docs_chunks').fetchone()[0]
        print(f'Inserted {row_count} chunks')

        # Pre-build the FTS index so clients only need to query, not index.
        con.execute('INSTALL fts')
        con.execute('LOAD fts')
        con.execute("""
            PRAGMA create_fts_index(
                'docs_chunks',
                'chunk_id',
                'page_title', 'section', 'text',
                stemmer = 'porter',
                stopwords = 'english',
                ignore = '(\\.|[^a-zA-Z0-9_])+',
                lower = 1,
                overwrite = 1
            )
        """)
        print('FTS index built')
    finally:
        con.close()
256+
257+
258+
# ---------------------------------------------------------------------------
259+
# Validation
260+
# ---------------------------------------------------------------------------
261+
262+
263+
def validate(output_path):
    """Run a smoke-test FTS query against the generated index.

    Returns True when the fixed query yields at least one scored result,
    False otherwise.
    """
    con = duckdb.connect(output_path, read_only=True)
    # try/finally so the read-only handle is released even if the query
    # raises — the original leaked the connection on error.
    try:
        con.execute('LOAD fts')
        results = con.execute("""
            SELECT chunk_id, page_title, score
            FROM (
                SELECT *,
                    fts_main_docs_chunks.match_bm25(chunk_id, 'attach catalog') AS score
                FROM docs_chunks
            )
            WHERE score IS NOT NULL
            ORDER BY score DESC
            LIMIT 5
        """).fetchall()
    finally:
        con.close()

    if not results:
        print("VALIDATION FAILED: no results for 'attach catalog'")
        return False

    print("Validation passed — top 5 results for 'attach catalog':")
    for chunk_id, page_title, score in results:
        print(f' {score:8.4f} {chunk_id} ({page_title})')
    return True
290+
291+
292+
# ---------------------------------------------------------------------------
293+
# Main
294+
# ---------------------------------------------------------------------------
295+
296+
297+
def main():
    """CLI entry point: collect chunks, build the index, optionally validate."""
    parser = argparse.ArgumentParser(description='Build DuckLake docs search index')
    parser.add_argument(
        '--output',
        default='data/docs-search.duckdb',
        help='Output .duckdb file path (default: data/docs-search.duckdb)',
    )
    parser.add_argument(
        '--validate',
        action='store_true',
        help='Run a smoke-test FTS query after building',
    )
    args = parser.parse_args()

    all_chunks = []
    for docs_dir, version in (('docs/stable', 'stable'), ('docs/preview', 'preview')):
        if not os.path.isdir(docs_dir):
            print(f'Skipping {version}: {docs_dir} not found')
            continue
        print(f'Processing {version}...')
        version_chunks = collect_chunks(docs_dir, version)
        print(f' {len(version_chunks)} chunks from {version}')
        all_chunks.extend(version_chunks)

    if not all_chunks:
        print('No chunks found — nothing to build')
        sys.exit(1)

    print(f'\nTotal: {len(all_chunks)} chunks')
    print(f'Building {args.output}...')
    build_duckdb(all_chunks, args.output)

    if args.validate:
        print()
        if not validate(args.output):
            sys.exit(1)

    print(f'\nDone. Output: {args.output}')
340+
341+
342+
# Only build the index when executed as a script, not on import.
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)