Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,4 @@ jobs:
with:
options: "--check --skip-string-normalization --verbose --diff --color"
src: "./scripts"
version: "24.8.0"
version: "26.3.1"
12 changes: 10 additions & 2 deletions _plugins/search_index.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
include Jekyll

# this file is used to run the external python script to generate the search index
# we only want to run this script once, so we use the `:after_init` hook to run the script
# this file is used to run the external python scripts to generate search indexes
# we only want to run these scripts once, so we use the `:after_init` hook

Jekyll::Hooks.register :site, :after_init do |page|
tag = 'Search index:'
Expand All @@ -14,4 +14,12 @@
Jekyll.logger.error(tag, "Failed to generate index")
end

Jekyll.logger.info(tag, "Generating DuckDB search index")

if system "python3 scripts/generate_search_index.py --validate"
Jekyll.logger.info(tag, "DuckDB search index generated")
else
Jekyll.logger.error(tag, "Failed to generate DuckDB search index")
end

end
1 change: 1 addition & 0 deletions data/.gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
search_data.json
duckdb-releases.csv
docs-search.duckdb
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
duckdb
marko # for generating search index
PyYAML
python-frontmatter
1 change: 0 additions & 1 deletion scripts/generate_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import json
from textwrap import shorten


SKIP_TYPES = [marko.block.HTMLBlock, marko.inline.Image, marko.inline.InlineHTML]


Expand Down
343 changes: 343 additions & 0 deletions scripts/generate_search_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,343 @@
"""
Generate a DuckDB search index for the DuckLake documentation.

Parses all markdown files under docs/stable/ and docs/preview/,
chunks them (per-H2 section), and builds a DuckDB file with a
pre-built FTS index.

Usage:
python scripts/generate_search_index.py [--output PATH] [--validate]
"""

import argparse
import os
import re
import sys

import duckdb
import frontmatter

# ---------------------------------------------------------------------------
# Anchor / slug generation (matches Kramdown defaults)
# ---------------------------------------------------------------------------


def slugify(text):
    """Turn heading text into an anchor slug.

    The text is lowercased, every character that is not a word character,
    whitespace, or a hyphen is dropped, each run of whitespace collapses to
    a single hyphen, and leading/trailing hyphens are trimmed. Intended to
    mirror the header IDs Kramdown generates.
    """
    cleaned = re.sub(r'[^\w\s-]', '', text.lower())  # keep word chars, whitespace, '-'
    hyphenated = re.sub(r'[\s]+', '-', cleaned)  # whitespace runs -> one hyphen
    return hyphenated.strip('-')


# ---------------------------------------------------------------------------
# Breadcrumb from file path
# ---------------------------------------------------------------------------

BREADCRUMB_RENAMES = {
    'sql': 'SQL',
    'duckdb': 'DuckDB',
    'ducklake': 'DuckLake',
    'fts': 'FTS',
    'api': 'API',
}


def breadcrumb_segment(segment):
    """Return the display form of one path segment for a breadcrumb.

    Segments listed in BREADCRUMB_RENAMES use their fixed spelling; any
    other segment has underscores turned into spaces and is title-cased.
    """
    try:
        return BREADCRUMB_RENAMES[segment]
    except KeyError:
        return segment.replace('_', ' ').title()


def build_breadcrumb(filepath, version):
    """Derive a ' > '-separated breadcrumb from a docs file path.

    The leading ``docs/<version>/`` and trailing ``.md`` are removed, a final
    ``index`` or ``overview`` segment is dropped, and every remaining segment
    is prettified with ``breadcrumb_segment``.

    e.g. docs/stable/duckdb/usage/connecting.md -> DuckDB > Usage > Connecting
    """
    prefix = f'docs/{version}/'
    segments = filepath.removeprefix(prefix).removesuffix('.md').split('/')
    # Landing pages take the breadcrumb of their containing directory.
    if segments and segments[-1] in ('index', 'overview'):
        segments.pop()
    return ' > '.join(map(breadcrumb_segment, segments))


# ---------------------------------------------------------------------------
# Chunking helpers
# ---------------------------------------------------------------------------


def page_slug(filepath, version):
    """Return the page slug: the path without ``docs/<version>/`` and ``.md``."""
    prefix = f'docs/{version}/'
    without_prefix = filepath.removeprefix(prefix)
    return without_prefix.removesuffix('.md')


def make_unique_anchor(anchor, seen_anchors):
    """Return an anchor that is unique within one page.

    ``seen_anchors`` maps each base anchor to the number of repeats seen so
    far and is updated in place. The first occurrence is returned unchanged;
    the n-th repeat is returned as ``<anchor>-<n>``.
    """
    if anchor in seen_anchors:
        repeat = seen_anchors[anchor] + 1
        seen_anchors[anchor] = repeat
        return f'{anchor}-{repeat}'
    seen_anchors[anchor] = 0
    return anchor


# ---------------------------------------------------------------------------
# Chunking
# ---------------------------------------------------------------------------


# A closed fenced code block: an opening run of 3+ backticks or tildes, then
# everything up to a closing line made of the same character, at least as long.
_FENCED_BLOCK_RE = re.compile(
    r'^[ \t]*(?P<fence>(?P<ch>[`~])(?P=ch){2,})[^`~\n]*\n'
    r'.*?'
    r'^[ \t]*(?P=fence)(?P=ch)*[ \t\r]*$',
    re.MULTILINE | re.DOTALL,
)
# An H2 heading: '##' at the start of a line, then spaces/tabs on the SAME
# line, then the heading text. ('###' does not match: '#' is not a blank.)
_H2_HEADING_RE = re.compile(r'^##[ \t]+(.+)$', re.MULTILINE)


def chunk_page(filepath, version, title, body):
    """Split a page into an optional intro chunk plus one chunk per ## heading.

    Lines that look like ``## ...`` inside a closed fenced code block are not
    treated as headings, and the heading text must sit on the same line as
    the ``##`` marker.

    Args:
        filepath: Docs-relative markdown path; without its ``.md`` suffix and
            with a leading ``/`` it becomes the chunk URL.
        version: Docs version name, used in chunk ids and stored per chunk.
        title: Page title stored on every chunk.
        body: Markdown content of the page (front matter already removed).

    Returns:
        A list of chunk dicts with the keys ``chunk_id``, ``page_title``,
        ``section``, ``breadcrumb``, ``url``, ``version`` and ``text``.
        A page without H2 headings yields exactly one chunk holding the
        whole stripped body. Otherwise the non-empty text before the first
        heading becomes an intro chunk (``section`` is None) and every
        heading becomes a chunk whose id and URL end in ``#<anchor>``.
    """
    base_url = '/' + filepath.removesuffix('.md')
    slug = page_slug(filepath, version)
    breadcrumb = build_breadcrumb(filepath, version)
    seen_anchors = {}

    def make_chunk(chunk_id, section, url, text):
        # All chunks of a page share title, breadcrumb and version.
        return {
            'chunk_id': chunk_id,
            'page_title': title,
            'section': section,
            'breadcrumb': breadcrumb,
            'url': url,
            'version': version,
            'text': text,
        }

    # Ignore heading-like lines that fall inside fenced code blocks.
    fenced_spans = [m.span() for m in _FENCED_BLOCK_RE.finditer(body)]
    headings = [
        m
        for m in _H2_HEADING_RE.finditer(body)
        if not any(start <= m.start() < end for start, end in fenced_spans)
    ]

    # If there are no H2 headings, treat the whole page as one chunk
    if not headings:
        return [make_chunk(f'{version}/{slug}', None, base_url, body.strip())]

    chunks = []

    # Intro chunk (content before first H2)
    intro = body[: headings[0].start()].strip()
    if intro:
        chunks.append(make_chunk(f'{version}/{slug}', None, base_url, intro))

    # H2 sections: each runs from the end of its heading line to the start
    # of the next heading (or the end of the page).
    section_ends = [m.start() for m in headings[1:]] + [len(body)]
    for match, section_end in zip(headings, section_ends):
        heading = match.group(1).strip()
        content = body[match.end():section_end].strip()

        anchor = make_unique_anchor(slugify(heading), seen_anchors)
        text = f'## {heading}\n\n{content}' if content else f'## {heading}'

        chunks.append(
            make_chunk(
                f'{version}/{slug}#{anchor}',
                heading,
                f'{base_url}#{anchor}',
                text,
            )
        )

    return chunks


# ---------------------------------------------------------------------------
# File processing
# ---------------------------------------------------------------------------


def process_file(filepath, version):
    """Parse a single markdown file and return its chunks.

    Args:
        filepath: Path of the markdown file; also passed on to
            ``chunk_page`` to derive URLs and ids.
        version: Docs version name passed on to ``chunk_page``.

    Returns:
        The list produced by ``chunk_page``, or an empty list when the front
        matter has no (or an empty) ``title``.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's locale default encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        post = frontmatter.load(f)

    title = post.get('title', '')
    if not title:
        # Untitled pages are left out of the index.
        return []

    return chunk_page(filepath, version, title, post.content)


def collect_chunks(docs_dir, version):
    """Gather the chunks of every ``.md`` file found under ``docs_dir``.

    Files within each directory are visited in sorted name order. A file
    whose processing raises is reported with a printed warning and skipped.
    """
    collected = []
    for dirpath, _subdirs, filenames in os.walk(docs_dir):
        markdown_names = sorted(n for n in filenames if n.endswith('.md'))
        for name in markdown_names:
            full_path = os.path.join(dirpath, name)
            # Path relative to the working directory, with forward slashes.
            rel_path = os.path.relpath(full_path, '.').replace(os.sep, '/')
            try:
                collected.extend(process_file(rel_path, version))
            except Exception as e:
                print(f' Warning: failed to process {rel_path}: {e}')
    return collected


# ---------------------------------------------------------------------------
# DuckDB index building
# ---------------------------------------------------------------------------


def build_duckdb(chunks, output_path):
    """Create the DuckDB file with the docs_chunks table and its FTS index.

    Any existing file at ``output_path`` is deleted first. The connection is
    closed even when one of the statements raises.

    Args:
        chunks: Iterable of chunk dicts with the keys ``chunk_id``,
            ``page_title``, ``section``, ``breadcrumb``, ``url``,
            ``version`` and ``text``.
        output_path: Path of the ``.duckdb`` file to (re)create.
    """
    # Materialize the rows up front so a malformed chunk fails before the
    # previous index file is removed.
    rows = [
        (
            c['chunk_id'],
            c['page_title'],
            c['section'],
            c['breadcrumb'],
            c['url'],
            c['version'],
            c['text'],
        )
        for c in chunks
    ]

    if os.path.exists(output_path):
        os.remove(output_path)

    con = duckdb.connect(output_path)
    try:
        con.execute("""
CREATE TABLE docs_chunks (
chunk_id VARCHAR PRIMARY KEY,
page_title VARCHAR NOT NULL,
section VARCHAR,
breadcrumb VARCHAR,
url VARCHAR NOT NULL,
version VARCHAR NOT NULL,
text TEXT NOT NULL
)
""")

        # Skip the bulk insert entirely when there is nothing to insert.
        if rows:
            con.executemany(
                """
INSERT INTO docs_chunks (chunk_id, page_title, section, breadcrumb, url, version, text)
VALUES (?, ?, ?, ?, ?, ?, ?)
""",
                rows,
            )

        row_count = con.execute('SELECT count(*) FROM docs_chunks').fetchone()[0]
        print(f'Inserted {row_count} chunks')

        # Build FTS index
        con.execute('INSTALL fts')
        con.execute('LOAD fts')
        con.execute("""
PRAGMA create_fts_index(
'docs_chunks',
'chunk_id',
'page_title', 'section', 'text',
stemmer = 'porter',
stopwords = 'english',
ignore = '(\\.|[^a-zA-Z0-9_])+',
lower = 1,
overwrite = 1
)
""")
        print('FTS index built')
    finally:
        # Release the file handle whether or not the build succeeded.
        con.close()


# ---------------------------------------------------------------------------
# Validation
# ---------------------------------------------------------------------------


def validate(output_path):
    """Run a smoke-test FTS query against the generated index.

    Opens ``output_path`` read-only, loads the fts extension and asks for
    the five best BM25 matches for 'attach catalog'. The connection is
    closed even when loading the extension or running the query raises.

    Returns:
        True when the query returns at least one row (the rows are printed),
        False when it returns none.
    """
    con = duckdb.connect(output_path, read_only=True)
    try:
        con.execute('LOAD fts')

        results = con.execute("""
SELECT chunk_id, page_title, score
FROM (
SELECT *,
fts_main_docs_chunks.match_bm25(chunk_id, 'attach catalog') AS score
FROM docs_chunks
)
WHERE score IS NOT NULL
ORDER BY score DESC
LIMIT 5
""").fetchall()
    finally:
        con.close()

    if not results:
        print("VALIDATION FAILED: no results for 'attach catalog'")
        return False

    print("Validation passed — top 5 results for 'attach catalog':")
    for chunk_id, page_title, score in results:
        print(f' {score:8.4f} {chunk_id} ({page_title})')
    return True


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------


def main():
    """Command-line entry point: collect chunks, build the index, optionally validate.

    Reads ``--output`` and ``--validate`` from the command line, gathers
    chunks from ``docs/stable`` and ``docs/preview`` (skipping directories
    that do not exist), and writes the DuckDB file. Exits with status 1 when
    no chunks were found or when the requested validation fails.
    """
    cli = argparse.ArgumentParser(description='Build DuckLake docs search index')
    cli.add_argument(
        '--output',
        default='data/docs-search.duckdb',
        help='Output .duckdb file path (default: data/docs-search.duckdb)',
    )
    cli.add_argument(
        '--validate',
        action='store_true',
        help='Run a smoke-test FTS query after building',
    )
    options = cli.parse_args()

    # (directory, version name) pairs, processed in this order.
    sources = (
        ('docs/stable', 'stable'),
        ('docs/preview', 'preview'),
    )

    gathered = []
    for docs_dir, version in sources:
        if os.path.isdir(docs_dir):
            print(f'Processing {version}...')
            found = collect_chunks(docs_dir, version)
            print(f' {len(found)} chunks from {version}')
            gathered.extend(found)
        else:
            print(f'Skipping {version}: {docs_dir} not found')

    if not gathered:
        print('No chunks found — nothing to build')
        sys.exit(1)

    print(f'\nTotal: {len(gathered)} chunks')
    print(f'Building {options.output}...')
    build_duckdb(gathered, options.output)

    if options.validate:
        print()
        passed = validate(options.output)
        if not passed:
            sys.exit(1)

    print(f'\nDone. Output: {options.output}')


if __name__ == '__main__':
    main()
Loading
Loading