59 changes: 3 additions & 56 deletions .github/workflows/populate_search_engine.yml
@@ -34,59 +34,6 @@ jobs:
           HF_IE_URL: ${{ secrets.HF_IE_URL }}
           HF_IE_TOKEN: ${{ secrets.HF_IE_TOKEN }}
           MEILISEARCH_KEY: ${{ secrets.MEILISEARCH_KEY }}
-        run: uv run doc-builder populate-search-engine
-  # gradio-job:
-  #   runs-on: ubuntu-latest
-  #   steps:
-  #     - name: Checkout doc-builder
-  #       uses: actions/checkout@v4
-
-  #     - name: Install uv
-  #       uses: astral-sh/setup-uv@v4
-  #       with:
-  #         version: "latest"
-
-  #     - name: Set up Python 3.10
-  #       run: uv python install 3.10
-
-  #     - name: Install doc-builder
-  #       run: uv sync --extra dev
-
-  #     - name: Add gradio docs to meilisearch
-  #       env:
-  #         HF_IE_URL: ${{ secrets.HF_IE_URL }}
-  #         HF_IE_TOKEN: ${{ secrets.HF_IE_TOKEN }}
-  #         MEILISEARCH_KEY: ${{ secrets.MEILISEARCH_KEY }}
-  #       run: uv run doc-builder add-gradio-docs
-
-  # cleanup-job:
-  #   needs: [process-docs, gradio-job]
-  #   runs-on: ubuntu-latest
-  #   if: always() # This ensures that the cleanup job runs regardless of the result
-  #   steps:
-  #     - name: Checkout doc-builder
-  #       uses: actions/checkout@v4
-
-  #     - name: Install uv
-  #       uses: astral-sh/setup-uv@v4
-  #       with:
-  #         version: "latest"
-
-  #     - name: Set up Python 3.10
-  #       run: uv python install 3.10
-
-  #     - name: Install doc-builder
-  #       run: uv sync --extra dev
-
-  #     - name: Success Cleanup
-  #       if: needs.process-docs.result == 'success' # Runs if job succeeded
-  #       env:
-  #         MEILISEARCH_KEY: ${{ secrets.MEILISEARCH_KEY }}
-  #       run: uv run doc-builder meilisearch-clean --swap
-
-  #     - name: Failure Cleanup
-  #       if: needs.process-docs.result == 'failure' # Runs if job failed
-  #       env:
-  #         MEILISEARCH_KEY: ${{ secrets.MEILISEARCH_KEY }}
-  #       run: uv run doc-builder meilisearch-clean
-
+          MEILISEARCH_URL: ${{ secrets.MEILISEARCH_URL }}
+          HF_TOKEN: ${{ secrets.HF_EMBED_DATASETS_TOKEN }}
+        run: uv run doc-builder populate-search-engine --incremental
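A note on the new `--incremental` mode: the workflow now passes `MEILISEARCH_URL` plus an `HF_TOKEN` for the tracker dataset, which implies the populate step diffs freshly built chunk IDs against the tracker and only embeds what changed. A minimal sketch of that flow; `incremental_update` is a hypothetical name, and only `save_tracker` and `MEILI_INDEX` are confirmed by this PR:

```python
# Hypothetical sketch of an incremental update: embed only new chunks, delete
# stale ones, then persist the new ID set to the HF Hub tracker for the next run.
import meilisearch

from doc_builder.build_embeddings import MEILI_INDEX
from doc_builder.embeddings_tracker import save_tracker


def incremental_update(client: meilisearch.Client, new_ids: set[str], tracked_ids: set[str]):
    to_add = new_ids - tracked_ids      # chunks that still need embedding + indexing
    to_delete = tracked_ids - new_ids   # chunks whose source pages disappeared
    if to_delete:
        client.index(MEILI_INDEX).delete_documents(list(to_delete))
    # ... embed and upload only the `to_add` chunks here ...
    save_tracker(sorted(new_ids))       # record the new state for the next run
    return to_add, to_delete
```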
3 changes: 2 additions & 1 deletion migrations/clear_meili_index.py
@@ -18,12 +18,13 @@
 def main():
     parser = argparse.ArgumentParser(description="Delete all documents from a Meilisearch index")
     parser.add_argument("--meilisearch_key", type=str, required=True, help="Meilisearch API key")
+    parser.add_argument("--meilisearch_url", type=str, required=True, help="Meilisearch URL")
     parser.add_argument("--temp", action="store_true", help="Clear the temp index instead of the main index")
     args = parser.parse_args()

     index_name = MEILI_INDEX_TEMP if args.temp else MEILI_INDEX

-    client = meilisearch.Client("https://edge.meilisearch.com", args.meilisearch_key)
+    client = meilisearch.Client(args.meilisearch_url, args.meilisearch_key)
     clear_embedding_db(client, index_name)
     print(f"[meilisearch] successfully cleared all documents from {index_name}")

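`clear_embedding_db` itself is not part of this diff; against the meilisearch Python client it plausibly reduces to a one-liner like the following (an assumption, not the repo's actual implementation):

```python
import meilisearch


def clear_embedding_db(client: meilisearch.Client, index_name: str) -> None:
    # Delete every document but keep the index and its settings intact.
    task = client.index(index_name).delete_all_documents()
    client.wait_for_task(task.task_uid)  # block until Meilisearch finishes the task
```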
3 changes: 2 additions & 1 deletion migrations/create_meili_index.py
@@ -18,12 +18,13 @@
 def main():
     parser = argparse.ArgumentParser(description="Create a Meilisearch index for docs semantic search")
     parser.add_argument("--meilisearch_key", type=str, required=True, help="Meilisearch API key")
+    parser.add_argument("--meilisearch_url", type=str, required=True, help="Meilisearch URL")
    parser.add_argument("--temp", action="store_true", help="Create the temp index instead of the main index")
     args = parser.parse_args()

     index_name = MEILI_INDEX_TEMP if args.temp else MEILI_INDEX

-    client = meilisearch.Client("https://edge.meilisearch.com", args.meilisearch_key)
+    client = meilisearch.Client(args.meilisearch_url, args.meilisearch_key)
     create_embedding_db(client, index_name)
     update_db_settings(client, index_name)
     print(f"[meilisearch] successfully created {index_name}")
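Likewise, `create_embedding_db` and `update_db_settings` live elsewhere in the repo. A rough sketch of their likely shape with recent meilisearch-python clients; the primary key, embedder name, and vector dimension here are guesses, not values from this PR:

```python
import meilisearch


def create_embedding_db(client: meilisearch.Client, index_name: str) -> None:
    # Create the index up front so settings can be applied before documents land.
    task = client.create_index(index_name, {"primaryKey": "id"})  # "id" is assumed
    client.wait_for_task(task.task_uid)


def update_db_settings(client: meilisearch.Client, index_name: str) -> None:
    # Register a user-provided embedder so documents can carry precomputed vectors.
    task = client.index(index_name).update_settings(
        {"embedders": {"default": {"source": "userProvided", "dimensions": 768}}}
    )
    client.wait_for_task(task.task_uid)
```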
50 changes: 50 additions & 0 deletions migrations/export_meili_ids_to_hf.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+"""
+One-time migration script: export all existing Meilisearch document IDs to the
+HF Hub tracker dataset (hf-doc-build/doc-builder-embeddings-tracker).
+
+This bootstraps the tracker so that subsequent `populate-search-engine
+--incremental` runs can diff against it instead of re-indexing everything.
+
+Usage:
+    uv run python migrations/export_meili_ids_to_hf.py \
+        --meilisearch_key <key> \
+        --meilisearch_url <url> \
+        [--hf_token <token>]  # falls back to HF_TOKEN env var
+"""
+
+import argparse
+
+import meilisearch
+
+from doc_builder.build_embeddings import MEILI_INDEX
+from doc_builder.embeddings_tracker import save_tracker
+from doc_builder.meilisearch_helper import get_all_document_ids
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Export all Meilisearch document IDs to the HF Hub tracker dataset.")
+    parser.add_argument("--meilisearch_key", type=str, required=True, help="Meilisearch API key")
+    parser.add_argument("--meilisearch_url", type=str, required=True, help="Meilisearch URL")
+    parser.add_argument(
+        "--hf_token",
+        type=str,
+        required=False,
+        default=None,
+        help="HuggingFace token with write access (falls back to HF_TOKEN env var)",
+    )
+    args = parser.parse_args()
+
+    client = meilisearch.Client(args.meilisearch_url, args.meilisearch_key)
+
+    print(f"Fetching all document IDs from Meilisearch index '{MEILI_INDEX}'...")
+    ids = get_all_document_ids(client, MEILI_INDEX)
+    print(f"Found {len(ids)} documents in '{MEILI_INDEX}'")
+
+    print("Pushing ID list to HF Hub tracker...")
+    save_tracker(ids, hf_token=args.hf_token)
+    print("Done. The tracker is now ready for incremental updates.")
+
+
+if __name__ == "__main__":
+    main()
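`get_all_document_ids` and `save_tracker` are imported from modules outside this diff. Under current meilisearch and huggingface_hub client APIs they could look roughly like this; the pagination batch size, tracker filename, and JSON layout are assumptions:

```python
import json

import meilisearch
from huggingface_hub import HfApi

TRACKER_REPO = "hf-doc-build/doc-builder-embeddings-tracker"  # named in the docstring above


def get_all_document_ids(client: meilisearch.Client, index_name: str, batch_size: int = 1000) -> list[str]:
    # Page through the index, fetching only the primary key to keep payloads small.
    ids, offset = [], 0
    while True:
        batch = client.index(index_name).get_documents(
            {"fields": ["id"], "limit": batch_size, "offset": offset}
        )
        ids.extend(doc.id for doc in batch.results)
        offset += batch_size
        if offset >= batch.total:
            return ids


def save_tracker(ids: list[str], hf_token: str | None = None) -> None:
    # Upload the ID list as a single JSON file in the tracker dataset repo.
    HfApi(token=hf_token).upload_file(
        path_or_fileobj=json.dumps(sorted(ids)).encode(),
        path_in_repo="ids.json",  # assumed filename
        repo_id=TRACKER_REPO,
        repo_type="dataset",
    )
```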
3 changes: 2 additions & 1 deletion migrations/swap_meili_indexes.py
@@ -17,9 +17,10 @@
 def main():
     parser = argparse.ArgumentParser(description="Swap main and temp Meilisearch indexes")
     parser.add_argument("--meilisearch_key", type=str, required=True, help="Meilisearch API key")
+    parser.add_argument("--meilisearch_url", type=str, required=True, help="Meilisearch URL")
     args = parser.parse_args()

-    client = meilisearch.Client("https://edge.meilisearch.com", args.meilisearch_key)
+    client = meilisearch.Client(args.meilisearch_url, args.meilisearch_key)
     swap_indexes(client, MEILI_INDEX, MEILI_INDEX_TEMP)
     print(f"[meilisearch] successfully swapped {MEILI_INDEX} and {MEILI_INDEX_TEMP}")

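`swap_indexes` wraps Meilisearch's atomic index swap, so searches never hit a half-built index. A plausible thin wrapper (a sketch, not the repo's actual code):

```python
import meilisearch


def swap_indexes(client: meilisearch.Client, index_a: str, index_b: str) -> None:
    # The swap is a single atomic operation on the Meilisearch side.
    task = client.swap_indexes([{"indexes": [index_a, index_b]}])
    client.wait_for_task(task.task_uid)
```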
15 changes: 8 additions & 7 deletions src/doc_builder/build_embeddings.py
@@ -43,8 +43,8 @@
     "text source_page_url source_page_title library embedding heading1 heading2 heading3 heading4 heading5 page",
 )

-MEILI_INDEX = "docs-semantic-search-v2"
-MEILI_INDEX_TEMP = "docs-semantic-search-v2-temp"
+MEILI_INDEX = "docs-semantic-search"
+MEILI_INDEX_TEMP = "docs-semantic-search-temp"

 _re_md_anchor = re.compile(r"\[\[(.*)]]")
 _re_non_alphaneumeric = re.compile(r"[^a-z0-9\s]+", re.IGNORECASE)
@@ -778,6 +778,7 @@ def build_embeddings(
     hf_ie_url,
     hf_ie_token,
     meilisearch_key,
+    meilisearch_url,
     version="main",
     version_tag="main",
     language="en",
@@ -832,17 +833,17 @@
     embeddings = call_embedding_inference(chunks, hf_ie_url, hf_ie_token, is_python_module)

     # Step 3: push embeddings to vector database (meilisearch)
-    client = meilisearch.Client("https://edge.meilisearch.com", meilisearch_key)
+    client = meilisearch.Client(meilisearch_url, meilisearch_key)
     ITEMS_PER_CHUNK = 5000  # a value that was found experimentally
     for chunk_embeddings in tqdm(chunk_list(embeddings, ITEMS_PER_CHUNK), desc="Uploading data to meilisearch"):
         add_embeddings_to_db(client, MEILI_INDEX_TEMP, chunk_embeddings)


-def clean_meilisearch(meilisearch_key: str, swap: bool):
+def clean_meilisearch(meilisearch_key: str, swap: bool, meilisearch_url: str):
     """
     Swap & delete temp index.
     """
-    client = meilisearch.Client("https://edge.meilisearch.com", meilisearch_key)
+    client = meilisearch.Client(meilisearch_url, meilisearch_key)
     if swap:
         swap_indexes(client, MEILI_INDEX, MEILI_INDEX_TEMP)
         delete_embedding_db(client, MEILI_INDEX_TEMP)
@@ -851,7 +852,7 @@ def clean_meilisearch(meilisearch_key: str, swap: bool):
         print("[meilisearch] successfully swapped & deleted temp index.")


-def add_gradio_docs(hf_ie_url: str, hf_ie_token: str, meilisearch_key: str):
+def add_gradio_docs(hf_ie_url: str, hf_ie_token: str, meilisearch_key: str, meilisearch_url: str):
     """Add Gradio documentation to embeddings."""
     # Step 1: download the documentation
     url = "https://huggingface.co/datasets/gradio/docs/resolve/main/docs.json"
@@ -894,7 +895,7 @@ def add_gradio_docs(hf_ie_url: str, hf_ie_token: str, meilisearch_key: str):
         embeddings.extend(batch_embeddings)

     # Step 3: push embeddings to vector database (meilisearch)
-    client = meilisearch.Client("https://edge.meilisearch.com", meilisearch_key)
+    client = meilisearch.Client(meilisearch_url, meilisearch_key)
     ITEMS_PER_CHUNK = 5000  # a value that was found experimentally
     for chunk_embeddings in tqdm(chunk_list(embeddings, ITEMS_PER_CHUNK), desc="Uploading gradio docs to meilisearch"):
         add_embeddings_to_db(client, MEILI_INDEX_TEMP, chunk_embeddings)
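For context, `chunk_list` and `add_embeddings_to_db` are small helpers defined elsewhere in the package; plausible minimal versions (assumptions, shown only to make the upload loops above self-contained):

```python
from typing import Iterator

import meilisearch


def chunk_list(items: list, n: int) -> Iterator[list]:
    # Yield successive n-sized slices; the last slice may be shorter.
    for i in range(0, len(items), n):
        yield items[i : i + n]


def add_embeddings_to_db(client: meilisearch.Client, index_name: str, docs: list[dict]) -> None:
    # Upload one batch of documents (text fields plus embedding) to the index.
    client.index(index_name).add_documents(docs)
```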