diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index f4f57dd4..ac791939 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -23,6 +23,9 @@ jobs: - name: Install dependencies run: npm ci + - name: Install Python dependencies + run: | + python3 -m pip install --user gitingest - name: Build website run: npm run build diff --git a/.github/workflows/test-deploy.yml b/.github/workflows/test-deploy.yml index 9dbe4618..c29f563f 100644 --- a/.github/workflows/test-deploy.yml +++ b/.github/workflows/test-deploy.yml @@ -23,5 +23,7 @@ jobs: - name: Install dependencies run: npm ci + - name: Install Python dependencies + run: python3 -m pip install --user gitingest - name: Test build website run: npm run build diff --git a/docusaurus.config.js b/docusaurus.config.js index 8c6b0bad..7af0e245 100644 --- a/docusaurus.config.js +++ b/docusaurus.config.js @@ -287,7 +287,7 @@ const config = { 'docusaurus-plugin-llms', { generateLLMsTxt: true, - generateLLMsFullTxt: true, + generateLLMsFullTxt: false, // Disabled. We're currently using gitingest to generate a more detailed llms-full.txt file. For details, see /scripts/README.md. 
docsDir: 'docs', version: 'latest', title: 'ScalarDB Documentation', diff --git a/package.json b/package.json index 417bd753..1334f1f2 100644 --- a/package.json +++ b/package.json @@ -5,13 +5,14 @@ "scripts": { "docusaurus": "docusaurus", "start": "docusaurus start", - "build": "docusaurus build 2>&1 | tee brokenLinks.log && node scripts/filter-broken-link-warnings.js && node scripts/generate-glossary-json.js", + "build": "docusaurus build 2>&1 | tee brokenLinks.log && node scripts/filter-broken-link-warnings.js && node scripts/generate-glossary-json.js && npm run generate-llms-full", "swizzle": "docusaurus swizzle", "deploy": "docusaurus deploy", "clear": "docusaurus clear", "serve": "docusaurus serve", "write-translations": "docusaurus write-translations", - "write-heading-ids": "docusaurus write-heading-ids" + "write-heading-ids": "docusaurus write-heading-ids", + "generate-llms-full": "python3 scripts/generate-llms-full.py" }, "dependencies": { "@docusaurus/core": "^3.7.0", diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 00000000..f6ffb323 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,64 @@ +# Create `llms-full.txt` by Using the `generate-llms-full.py` Script + +The `generate-llms-full.py` script generates an `llms-full.txt` file when the Docusaurus site is built. + +> [!CAUTION] +> +> If this script stops working, it's because [gitingest](https://github.com/coderamp-labs/gitingest) is either down or has limited its API usage. If that happens, we'll need to find another way or host gitingest ourselves and provide it with an API key from an AI language model provider (OpenAI, Claude, etc.) to generate the `llms-full.txt` file. + +## Why do we need this script? + +Although the `docusaurus-plugin-llms` plugin can generate an `llms-full.txt` file, that file doesn't include front-matter metadata. Currently, this seems to be the expected behavior for the `llms.txt` standard. 
+ +However, we need to be able to tell AI language models when our documentation applies to only specific editions. That information is already specified in `tags` in the front-matter properties of each Markdown file. + +By using [gitingest](https://github.com/coderamp-labs/gitingest), we can generate an `llms-full.txt` file that includes front-matter data as well as a directory tree. This gives AI language models better context about our documentation, particularly front-matter metadata (like edition tags) and documentation navigation. + +## Usage + +The `generate-llms-full` script runs when the Docusaurus site is built: + +```shell +npm run generate-llms-full +``` + +You should rarely have to run the following Python script directly, unless you want to do testing: + +```shell +python scripts/generate-llms-full.py +``` + +### Requirements + +- Python 3.8+ +- gitingest package + +> [!NOTE] +> +> For local development, install gitingest manually by using `pip install --user gitingest` or `pipx install gitingest`. For GitHub Actions, gitingest is automatically installed in the workflows for building and deploying the docs site at `.github/workflows/deploy.yml` and `.github/workflows/test-deploy.yml`. + +### What the `generate-llms-full.py` script does + +1. Uses gitingest to analyze the repository, starting from the repository root. +2. Includes only `.mdx` documentation files (`docs/*.mdx`, `docs/**/*.mdx`, `src/components/en-us/*.mdx`, and `src/components/en-us/**/*.mdx`). +3. Focuses on the latest version of English documentation. +4. Excludes build artifacts, node_modules, and other irrelevant files. +5. Generates a comprehensive AI-friendly text digest. +6. Adds a custom header for ScalarDB documentation context. +7. Outputs to `build/llms-full.txt`. 
+ +### Configuration + +The script uses these settings: + +- **Include:** `docs/*.mdx`, `docs/**/*.mdx`, `src/components/en-us/*.mdx`, `src/components/en-us/**/*.mdx` (only latest English docs) +- **Exclude:** `node_modules/*`, `.git/*`, `build/*`, `*.log`, `.next/*`, `dist/*`, `.docusaurus/*` +- **Max file size:** 100 KB (100,000 bytes) per file + +### Benefits over `docusaurus-plugin-llms` + +- Better repository understanding and context +- More comprehensive file inclusion +- Optimized format for AI language model consumption +- Active maintenance and updates +- Superior pattern matching and filtering diff --git a/scripts/generate-llms-full.py b/scripts/generate-llms-full.py new file mode 100644 index 00000000..48327a97 --- /dev/null +++ b/scripts/generate-llms-full.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +""" +Generate llms-full.txt by using gitingest instead of docusaurus-plugin-llms +""" + +import asyncio +import sys +import textwrap +from pathlib import Path + +try: + from gitingest import ingest_async +except ImportError: + print("❌ gitingest not found. Please install it first:") + print(" pip install --user gitingest") + print(" # or") + print(" pipx install gitingest") + print("") + print("For GitHub Actions, this should be installed automatically in the workflow.") + sys.exit(1) + + +async def generate_llms_full(): + """Generate llms-full.txt by using gitingest.""" + try: + print("Generating llms-full.txt by using gitingest...") + + # Current repository path + repo_path = Path(__file__).parent.parent + build_dir = repo_path / "build" + build_dir.mkdir(exist_ok=True) + + # Configure the gitingest parameters. + include_patterns = { + "docs/*.mdx", "docs/**/*.mdx", "src/components/en-us/*.mdx", "src/components/en-us/**/*.mdx" + } + + exclude_patterns = { + "node_modules/*", ".git/*", "build/*", + "*.log", ".next/*", "dist/*", ".docusaurus/*" + } + + # Generate content by using gitingest. 
+ summary, tree, content = await ingest_async( + str(repo_path), + max_file_size=100000, # 100 KB max file size + include_patterns=include_patterns, + exclude_patterns=exclude_patterns, + include_gitignored=False + ) + + # Create a header that matches your current format. + header = textwrap.dedent("""\ + # ScalarDB Documentation - Full Repository Context + # Generated by using GitIngest for AI/LLM consumption + # Cloud-native universal transaction manager + # Website: https://scalardb.scalar-labs.com + + """) + + # Combine all sections. + full_content = header + summary + "\n\n" + tree + "\n\n" + content + + # Write to the build directory. + output_path = build_dir / "llms-full.txt" + with open(output_path, 'w', encoding='utf-8') as f: + f.write(full_content) + + print(f"✅ llms-full.txt generated successfully at {output_path}") + print(f"📊 Summary: {len(full_content)} characters, estimated tokens: {len(full_content.split())}") + + except Exception as error: + print(f"❌ Error generating llms-full.txt: {error}") + sys.exit(1) + +if __name__ == "__main__": + asyncio.run(generate_llms_full())