diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8d684a6ec..ee8c29b9c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,6 +26,13 @@ repos: - repo: local hooks: + - id: generate-cli-docs + name: generate CLI documentation + entry: python dev/generate_cli_docs.py + language: system + files: ^(python/cocoindex/cli\.py|dev/generate_cli_docs\.py)$ + pass_filenames: false + - id: maturin-develop name: maturin develop entry: maturin develop -E all,dev diff --git a/dev/README.md b/dev/README.md new file mode 100644 index 000000000..0efd539d0 --- /dev/null +++ b/dev/README.md @@ -0,0 +1,37 @@ +# Development Scripts + +This directory contains development and maintenance scripts for the CocoIndex project. + +## Scripts + +### `generate_cli_docs.py` + +Automatically generates CLI documentation from the CocoIndex Click commands. + +**Usage:** + +```bash +python dev/generate_cli_docs.py +``` + +**What it does:** + +- Extracts help messages from all Click commands in `python/cocoindex/cli.py` +- Generates comprehensive Markdown documentation with properly formatted tables +- Saves the output to `docs/docs/core/cli-commands.md` for direct import into CLI documentation +- Only updates the file if content has changed (avoids unnecessary git diffs) +- Automatically escapes HTML-like tags to prevent MDX parsing issues +- Wraps URLs with placeholders in code blocks for proper rendering + +**Integration:** + +- Runs automatically as a pre-commit hook when `python/cocoindex/cli.py` or `dev/generate_cli_docs.py` is modified +- The generated documentation is directly imported into `docs/docs/core/cli.mdx` via MDX import +- Provides a seamless single-page CLI documentation experience without separate reference pages + +**Dependencies:** + +- `md-click` package for extracting Click help information +- `cocoindex` package must be importable (the CLI module) + +This ensures that CLI documentation is always kept in sync with the actual command-line interface. 
diff --git a/dev/generate_cli_docs.py b/dev/generate_cli_docs.py new file mode 100644 index 000000000..c989c7753 --- /dev/null +++ b/dev/generate_cli_docs.py @@ -0,0 +1,303 @@ +#!/usr/bin/env python3 +""" +Script to generate CLI documentation from CocoIndex Click commands. + +This script uses md-click as the foundation but generates enhanced markdown +documentation that's suitable for inclusion in the CocoIndex documentation site. +""" + +import sys +import os +from pathlib import Path +import re +from typing import Dict, List, Any + +# Add the cocoindex python directory to the path +project_root = Path(__file__).parent.parent +python_path = project_root / "python" +sys.path.insert(0, str(python_path)) + +try: + import md_click + from cocoindex.cli import cli +except ImportError as e: + print(f"Error importing required modules: {e}") + print("Make sure to run this script from the project root and install dependencies") + sys.exit(1) + + +def clean_usage_line(usage: str) -> str: + """Clean up the usage line to remove 'cli' and make it generic, and remove the 'Usage:' prefix.""" + # Replace 'cli' with 'cocoindex' in usage lines and remove 'Usage:' prefix + cleaned = usage.replace("Usage: cli ", "cocoindex ") + # Handle case where it might be "Usage: cocoindex" already + if cleaned.startswith("Usage: cocoindex "): + cleaned = cleaned.replace("Usage: cocoindex ", "cocoindex ") + return cleaned + + +def escape_html_tags(text: str) -> str: + """Escape HTML-like tags in text to prevent MDX parsing issues, but preserve them in code blocks.""" + import re + + # Handle special cases where URLs with placeholders should be wrapped in code blocks + text = re.sub(r"http://localhost:<([^>]+)>", r"`http://localhost:<\1>`", text) + text = re.sub(r"https://([^<\s]+)<([^>]+)>", r"`https://\1<\2>`", text) + + # Handle comma-separated URL examples specifically (e.g., "https://site1.com,http://localhost:3000") + text = re.sub(r"(?", ">")) + else: + # Odd indices are code blocks, 
preserve as-is + result.append(part) + + return "".join(result) + + +def format_options_section(help_text: str) -> str: + """Extract and format the options section.""" + lines = help_text.split("\n") + options_start = None + commands_start = None + + for i, line in enumerate(lines): + if line.strip() == "Options:": + options_start = i + elif line.strip() == "Commands:": + commands_start = i + break + + if options_start is None: + return "" + + # Extract options section + end_idx = commands_start if commands_start else len(lines) + options_lines = lines[options_start + 1 : end_idx] # Skip "Options:" header + + # Parse options - each option starts with exactly 2 spaces and a dash + formatted_options = [] + current_option = None + current_description = [] + + for line in options_lines: + if not line.strip(): # Empty line + continue + + # Check if this is a new option line (starts with exactly 2 spaces then -) + if line.startswith(" -") and not line.startswith(" "): + # Save previous option if exists + if current_option is not None: + desc = " ".join(current_description).strip() + desc = escape_html_tags(desc) # Escape HTML tags for MDX compatibility + formatted_options.append(f"| `{current_option}` | {desc} |") + + # Remove the leading 2 spaces + content = line[2:] + + # Find the position where we have multiple consecutive spaces (start of description) + match = re.search(r"\s{2,}", content) + if match: + # Split at the first occurrence of multiple spaces + option_part = content[: match.start()] + desc_part = content[match.end() :] + current_option = option_part.strip() + current_description = [desc_part.strip()] if desc_part.strip() else [] + else: + # No description on this line, just the option + current_option = content.strip() + current_description = [] + else: + # Continuation line (starts with more than 2 spaces) + if current_option is not None and line.strip(): + current_description.append(line.strip()) + + # Add last option + if current_option is not None: + 
desc = " ".join(current_description).strip() + desc = escape_html_tags(desc) # Escape HTML tags for MDX compatibility + formatted_options.append(f"| `{current_option}` | {desc} |") + + if formatted_options: + header = "| Option | Description |\n|--------|-------------|" + return f"{header}\n" + "\n".join(formatted_options) + "\n" + + return "" + + +def format_commands_section(help_text: str) -> str: + """Extract and format the commands section.""" + lines = help_text.split("\n") + commands_start = None + + for i, line in enumerate(lines): + if line.strip() == "Commands:": + commands_start = i + break + + if commands_start is None: + return "" + + # Extract commands section + commands_lines = lines[commands_start + 1 :] + + # Parse commands - each command starts with 2 spaces then the command name + formatted_commands = [] + + for line in commands_lines: + if not line.strip(): # Empty line + continue + + # Check if this is a command line (starts with 2 spaces + command name) + match = re.match(r"^ (\w+)\s{2,}(.+)$", line) + if match: + command = match.group(1) + description = match.group(2).strip() + # Truncate long descriptions + if len(description) > 80: + description = description[:77] + "..." 
+ formatted_commands.append(f"| `{command}` | {description} |") + + if formatted_commands: + header = "| Command | Description |\n|---------|-------------|" + return f"{header}\n" + "\n".join(formatted_commands) + "\n" + + return "" + + +def extract_description(help_text: str) -> str: + """Extract the main description from help text.""" + lines = help_text.split("\n") + + # Find the description between usage and options/commands + description_lines = [] + in_description = False + + for line in lines: + if line.startswith("Usage:"): + in_description = True + continue + elif line.strip() in ["Options:", "Commands:"]: + break + elif in_description and line.strip(): + description_lines.append(line.strip()) + + description = "\n\n".join(description_lines) if description_lines else "" + return escape_html_tags(description) # Escape HTML tags for MDX compatibility + + +def generate_command_docs(docs: List[Dict[str, Any]]) -> str: + """Generate markdown documentation for all commands.""" + + # Separate main CLI from subcommands + main_cli = None + subcommands = [] + + for doc in docs: + parent = doc.get("parent", "") + if not parent: + main_cli = doc + else: + subcommands.append(doc) + + markdown_content = [] + + # Add top-level heading to satisfy MD041 linting rule + markdown_content.append("# CLI Commands") + markdown_content.append("") + + # Generate only the command details section (remove redundant headers) + for doc in sorted(subcommands, key=lambda x: x["command"].name): + command_name = doc["command"].name + help_text = doc["help"] + usage = clean_usage_line(doc["usage"]) + description = extract_description(help_text) + + markdown_content.append(f"## `{command_name}`") + markdown_content.append("") + + if description: + markdown_content.append(description) + markdown_content.append("") + + # Add usage + markdown_content.append("**Usage:**") + markdown_content.append("") + markdown_content.append(f"```bash") + markdown_content.append(usage) + 
markdown_content.append("```") + markdown_content.append("") + + # Add options if any + options_section = format_options_section(help_text) + if options_section: + markdown_content.append("**Options:**") + markdown_content.append("") + markdown_content.append(options_section) + + markdown_content.append("---") + markdown_content.append("") + + return "\n".join(markdown_content) + + +def main(): + """Generate CLI documentation and save to file.""" + print("Generating CocoIndex CLI documentation...") + + try: + # Generate documentation using md-click + docs_generator = md_click.main.recursive_help(cli) + docs = list(docs_generator) + + print(f"Found {len(docs)} CLI commands to document") + + # Generate markdown content + markdown_content = generate_command_docs(docs) + + # Determine output path + docs_dir = project_root / "docs" / "docs" / "core" + output_file = docs_dir / "cli-commands.md" + + # Ensure directory exists + docs_dir.mkdir(parents=True, exist_ok=True) + + # Write the generated documentation + content_changed = True + if output_file.exists(): + with open(output_file, "r", encoding="utf-8") as f: + existing_content = f.read() + content_changed = existing_content != markdown_content + + if content_changed: + with open(output_file, "w", encoding="utf-8") as f: + f.write(markdown_content) + + print(f"CLI documentation generated successfully at: {output_file}") + print( + f"Generated {len(markdown_content.splitlines())} lines of documentation" + ) + else: + print(f"CLI documentation is up to date at: {output_file}") + + except Exception as e: + print(f"Error generating documentation: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/docs/docs/core/cli-commands.md b/docs/docs/core/cli-commands.md new file mode 100644 index 000000000..a058dad74 --- /dev/null +++ b/docs/docs/core/cli-commands.md @@ -0,0 +1,210 @@ +# CLI Commands + +## `drop` + +Drop the backend setup for flows. 
+ +Modes of operation: + +1. Drop all flows defined in an app: `cocoindex drop <APP_TARGET>` + +2. Drop specific named flows: `cocoindex drop <APP_TARGET> [FLOW_NAME...]` + +**Usage:** + +```bash +cocoindex drop [OPTIONS] [APP_TARGET] [FLOW_NAME]... +``` + +**Options:** + +| Option | Description | +|--------|-------------| +| `-f, --force` | Force drop without confirmation prompts. | +| `--help` | Show this message and exit. | + +--- + +## `evaluate` + +Evaluate the flow and dump flow outputs to files. + +Instead of updating the index, it dumps what should be indexed to files. + +Mainly used for evaluation purpose. + +APP_FLOW_SPECIFIER: Specifies the application and optionally the target flow. + +Can be one of the following formats: + +- path/to/your_app.py + +- an_installed.module_name + +- path/to/your_app.py:SpecificFlowName + +- an_installed.module_name:SpecificFlowName + +:SpecificFlowName can be omitted only if the application defines a single + +flow. + +**Usage:** + +```bash +cocoindex evaluate [OPTIONS] APP_FLOW_SPECIFIER +``` + +**Options:** + +| Option | Description | +|--------|-------------| +| `-o, --output-dir TEXT` | The directory to dump the output to. | +| `--cache / --no-cache` | Use already-cached intermediate data if available. [default: cache] | +| `--help` | Show this message and exit. | + +--- + +## `ls` + +List all flows. + +If APP_TARGET (path/to/app.py or a module) is provided, lists flows defined + +in the app and their backend setup status. + +If APP_TARGET is omitted, lists all flows that have a persisted setup in the + +backend. + +**Usage:** + +```bash +cocoindex ls [OPTIONS] [APP_TARGET] +``` + +**Options:** + +| Option | Description | +|--------|-------------| +| `--help` | Show this message and exit. | + +--- + +## `server` + +Start a HTTP server providing REST APIs. + +It will allow tools like CocoInsight to access the server. + +APP_TARGET: path/to/app.py or installed_module. 
+ +**Usage:** + +```bash +cocoindex server [OPTIONS] APP_TARGET +``` + +**Options:** + +| Option | Description | +|--------|-------------| +| `-a, --address TEXT` | The address to bind the server to, in the format of IP:PORT. If unspecified, the address specified in COCOINDEX_SERVER_ADDRESS will be used. | +| `-c, --cors-origin TEXT` | The origins of the clients (e.g. CocoInsight UI) to allow CORS from. Multiple origins can be specified as a comma-separated list. e.g. `https://cocoindex.io,http://localhost:3000`. Origins specified in COCOINDEX_SERVER_CORS_ORIGINS will also be included. | +| `-ci, --cors-cocoindex` | Allow `https://cocoindex.io` to access the server. | +| `-cl, --cors-local INTEGER` | Allow `http://localhost:<port>` to access the server. | +| `-L, --live-update` | Continuously watch changes from data sources and apply to the target index. | +| `--setup` | Automatically setup backends for the flow if it's not setup yet. | +| `--reexport` | Reexport to targets even if there's no change. | +| `-f, --force` | Force setup without confirmation prompts. | +| `-q, --quiet` | Avoid printing anything to the standard output, e.g. statistics. | +| `-r, --reload` | Enable auto-reload on code changes. | +| `--help` | Show this message and exit. | + +--- + +## `setup` + +Check and apply backend setup changes for flows, including the internal + +storage and target (to export to). + +APP_TARGET: path/to/app.py or installed_module. + +**Usage:** + +```bash +cocoindex setup [OPTIONS] APP_TARGET +``` + +**Options:** + +| Option | Description | +|--------|-------------| +| `-f, --force` | Force setup without confirmation prompts. | +| `--help` | Show this message and exit. | + +--- + +## `show` + +Show the flow spec and schema. + +APP_FLOW_SPECIFIER: Specifies the application and optionally the target + +flow. 
Can be one of the following formats: + +- path/to/your_app.py + +- an_installed.module_name + +- path/to/your_app.py:SpecificFlowName + +- an_installed.module_name:SpecificFlowName + +:SpecificFlowName can be omitted only if the application defines a single + +flow. + +**Usage:** + +```bash +cocoindex show [OPTIONS] APP_FLOW_SPECIFIER +``` + +**Options:** + +| Option | Description | +|--------|-------------| +| `--color / --no-color` | Enable or disable colored output. | +| `--verbose` | Show verbose output with full details. | +| `--help` | Show this message and exit. | + +--- + +## `update` + +Update the index to reflect the latest data from data sources. + +APP_FLOW_SPECIFIER: path/to/app.py, module, path/to/app.py:FlowName, or + +module:FlowName. If :FlowName is omitted, updates all flows. + +**Usage:** + +```bash +cocoindex update [OPTIONS] APP_FLOW_SPECIFIER +``` + +**Options:** + +| Option | Description | +|--------|-------------| +| `-L, --live` | Continuously watch changes from data sources and apply to the target index. | +| `--reexport` | Reexport to targets even if there's no change. | +| `--setup` | Automatically setup backends for the flow if it's not setup yet. | +| `-f, --force` | Force setup without confirmation prompts. | +| `-q, --quiet` | Avoid printing anything to the standard output, e.g. statistics. | +| `--help` | Show this message and exit. | + +--- diff --git a/docs/docs/core/cli.mdx b/docs/docs/core/cli.mdx index cdcc0977d..265ab9a70 100644 --- a/docs/docs/core/cli.mdx +++ b/docs/docs/core/cli.mdx @@ -53,23 +53,6 @@ CocoIndex CLI supports the following global options: * `--version`: Show the CocoIndex version and exit. * `--help`: Show the main help message and exit. -## Subcommands - -The following subcommands are available: - -| Subcommand | Description | -| ---------- | ----------- | -| `ls` | List all flows present in the given file/module. Or list all persisted flows under the current app namespace if no file/module specified. 
| -| `show` | Show the spec and schema for a specific flow. | -| `setup` | Check and apply backend setup changes for flows, including the internal storage and target (to export). | -| `drop` | Drop the backend setup for specified flows. | -| `update` | Update the index defined by the flow. | -| `evaluate` | Evaluate the flow and dump flow outputs to files. Instead of updating the index, it dumps what should be indexed to files. Mainly used for evaluation purpose. | -| `server` | Start a HTTP server providing REST APIs. It will allow tools like CocoInsight to access the server. | - -Use `--help` to see the full list of subcommands, and `subcommand --help` to see the usage of a specific one. - -```sh -cocoindex --help # Show all subcommands -cocoindex show --help # Show usage of "show" subcommand -``` +import CliCommands from './cli-commands.md'; + +<CliCommands />