diff --git a/.pre-commit-hooks.yaml b/.pre-commit-hooks.yaml
index af5db240..32d7303c 100644
--- a/.pre-commit-hooks.yaml
+++ b/.pre-commit-hooks.yaml
@@ -1,7 +1,16 @@
 - id: datapilot_run_dbt_checks
   name: datapilot run dbt checks
-  description: datapilot run dbt checks
+  description: Run DataPilot dbt project health checks on changed files
   entry: datapilot_run_dbt_checks
   language: python
   types_or: [yaml, sql]
   require_serial: true
+  # Optional arguments that can be passed to the hook:
+  #   --config-path: Path to configuration file
+  #   --token: API token for authentication
+  #   --instance-name: Tenant/instance name
+  #   --backend-url: Backend URL (defaults to https://api.myaltimate.com)
+  #   --config-name: Name of config to use from API
+  #   --manifest-path: Path to DBT manifest file (defaults to ./target/manifest.json)
+  #   --catalog-path: Path to DBT catalog file (defaults to ./target/catalog.json)
+  #   --base-path: Base path of the dbt project (defaults to current directory)
diff --git a/README.md b/README.md
index 8cc7882e..dc8b4b53 100644
--- a/README.md
+++ b/README.md
@@ -42,6 +42,40 @@ The [--config-path] is an optional argument. You can provide a yaml file with ov
 
 Note: The dbt docs generate requires an active database connection and may take a long time for projects with large number of models.
 
+### Pre-commit Hook Integration
+
+DataPilot provides a pre-commit hook that automatically runs health checks on changed files before each commit. This ensures code quality and catches issues early in the development process.
+
+#### Quick Setup
+
+1. Install pre-commit:
+```bash
+pip install pre-commit
+```
+
+2. Add to your `.pre-commit-config.yaml`:
+```yaml
+repos:
+  - repo: https://github.com/AltimateAI/datapilot-cli
+    rev: v0.0.27  # Always use a specific version tag
+    hooks:
+      - id: datapilot_run_dbt_checks
+        args: [
+          "--config-path", "./datapilot-config.yaml",
+          "--token", "${DATAPILOT_TOKEN}",
+          "--instance-name", "${DATAPILOT_INSTANCE}",
+          "--manifest-path", "./target/manifest.json",
+          "--catalog-path", "./target/catalog.json"
+        ]
+```
+
+3. Install the hook:
+```bash
+pre-commit install
+```
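+
+Optionally, verify the setup by running the hook once against the whole repository (this assumes dbt has already produced the manifest and catalog files referenced above):
+
+```bash
+pre-commit run datapilot_run_dbt_checks --all-files
+```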
+
+For detailed setup instructions, see the [Pre-commit Hook Setup Guide](docs/pre-commit-setup.md).
+
 ### Checks
 
 The following checks are available:
diff --git a/docs/hooks.rst b/docs/hooks.rst
index 9a86c002..0e1d03fa 100644
--- a/docs/hooks.rst
+++ b/docs/hooks.rst
@@ -11,37 +11,171 @@ To use the DataPilot pre-commit hook, follow these steps:
 
 1. Install the `pre-commit` package if you haven't already:
 
-```
-pip install pre-commit
-```
+.. code-block:: shell
+
+   pip install pre-commit
 
 2. Add the following configuration to your .pre-commit-config.yaml file in the root of your repository:
 
-```
+.. code-block:: yaml
+
 repos:
-  - repo: https://github.com/AltimateAI/datapilot-cli
-    rev:
-    hooks:
-      - id: datapilot_run_dbt_checks
-        args: ["--config-path", "path/to/your/config/file"]
-```
+  - repo: https://github.com/AltimateAI/datapilot-cli
+    rev: v0.0.27  # Use a specific version tag, not 'main'
+    hooks:
+      - id: datapilot_run_dbt_checks
+        args: [
+          "--config-path", "./datapilot-config.yaml",
+          "--token", "${DATAPILOT_TOKEN}",
+          "--instance-name", "${DATAPILOT_INSTANCE}"
+        ]
+
+Configuration Options
+---------------------
+
+The DataPilot pre-commit hook supports several configuration options:
+
+**Required Configuration:**
+
+- ``rev``: Always use a specific version tag (e.g., ``v0.0.27``) instead of ``main`` for production stability
+
+**Optional Arguments:**
+
+- ``--config-path``: Path to your DataPilot configuration file
+- ``--token``: Your API token for authentication (can use environment variables)
+- ``--instance-name``: Your tenant/instance name (can use environment variables)
+- ``--backend-url``: Backend URL (defaults to https://api.myaltimate.com)
+- ``--config-name``: Name of config to use from API
+- ``--base-path``: Base path of the dbt project (defaults to current directory)
+- ``--manifest-path``: Path to the DBT manifest file (defaults to {base_path}/target/manifest.json)
+- ``--catalog-path``: Path to the DBT catalog file (defaults to {base_path}/target/catalog.json)
 
-Replace with the desired revision of the DataPilot repository and "path/to/your/config/file" with the path to your configuration file.
+**Environment Variables:**
+
+You can use environment variables for sensitive information:
+
+.. code-block:: yaml
+
+   repos:
+     - repo: https://github.com/AltimateAI/datapilot-cli
+       rev: v0.0.27
+       hooks:
+         - id: datapilot_run_dbt_checks
+           args: [
+             "--config-path", "./datapilot-config.yaml",
+             "--token", "${DATAPILOT_TOKEN}",
+             "--instance-name", "${DATAPILOT_INSTANCE}",
+             "--manifest-path", "./target/manifest.json",
+             "--catalog-path", "./target/catalog.json"
+           ]
+
+**Configuration File Example:**
+
+Create a ``datapilot-config.yaml`` file in your project root:
+
+.. code-block:: yaml
+
+   # DataPilot Configuration
+   disabled_insights:
+     - "hard_coded_references"
+     - "duplicate_sources"
+
+   # Custom settings for your project
+   project_settings:
+     max_fanout: 10
+     require_tests: true
 
 3. Install the pre-commit hook:
 
-```
-pre-commit install
-```
+.. code-block:: shell
+
+   pre-commit install
 
 Usage
 -----
 
-Once the hook is installed, it will run automatically before each commit. If any issues are detected, the commit will be aborted, and you will be prompted to fix the issues before retrying the commit.
+Once the hook is installed, it will run automatically before each commit. The hook will:
+
+1. **Validate Configuration**: Check that your config file exists and is valid
+2. **Authenticate**: Use your provided token and instance name to authenticate
+3. **Load DBT Artifacts**: Load manifest and catalog files for analysis
+4. **Analyze Changes**: Only analyze files that have changed in the commit
+5. **Report Issues**: Display any issues found and prevent the commit if problems are detected
+
+**Required DBT Artifacts:**
+
+The pre-commit hook requires DBT manifest and catalog files to function properly:
+
+- **Manifest File**: Generated by running ``dbt compile`` or ``dbt run``. Default location: ``./target/manifest.json``
+- **Catalog File**: Generated by running ``dbt docs generate``. Default location: ``./target/catalog.json``
 
-If you want to manually run all pre-commit hooks on a repository, run `pre-commit run --all-files`. To run individual hooks use `pre-commit run `.
+**Note**: The catalog file is optional but recommended for comprehensive analysis. If not available, the hook will continue without catalog information.
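+
+If either artifact is missing or stale, you can usually regenerate both from the dbt project root before committing (assuming a working dbt profile):
+
+.. code-block:: shell
+
+   dbt compile        # writes ./target/manifest.json
+   dbt docs generate  # writes ./target/catalog.json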
+
+**Manual Execution:**
+
+To manually run all pre-commit hooks on a repository:
+
+.. code-block:: shell
+
+   pre-commit run --all-files
+
+To run individual hooks:
+
+.. code-block:: shell
+
+   pre-commit run datapilot_run_dbt_checks
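+
+You can also limit a manual run to specific files with pre-commit's ``--files`` option (the model path below is only an example):
+
+.. code-block:: shell
+
+   pre-commit run datapilot_run_dbt_checks --files models/staging/stg_orders.sql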
+
+**Troubleshooting:**
+
+- **Authentication Issues**: Ensure your token and instance name are correctly set
+- **Empty Config Files**: The hook will fail if your config file is empty or invalid
+- **Missing Manifest File**: Ensure you have run ``dbt compile`` or ``dbt run`` to generate the manifest.json file
+- **Missing Catalog File**: Run ``dbt docs generate`` to create the catalog.json file (optional but recommended)
+- **No Changes**: If no relevant files have changed, the hook will skip execution
+- **Network Issues**: Ensure you have access to the DataPilot API
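+
+If you need to bypass the hook for an urgent commit, pre-commit's standard ``SKIP`` mechanism applies here as well:
+
+.. code-block:: shell
+
+   SKIP=datapilot_run_dbt_checks git commit -m "your commit message"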
+
+Best Practices
+--------------
+
+1. **Use Version Tags**: Always specify a version tag in the ``rev`` field, never use ``main``
+2. **Environment Variables**: Use environment variables for sensitive information like tokens
+3. **Configuration Files**: Create a dedicated config file for your project settings
+4. **Regular Updates**: Update to new versions when they become available (see the ``pre-commit autoupdate`` example below)
+5. **Team Coordination**: Ensure all team members use the same configuration
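+
+For example, running ``pre-commit autoupdate`` periodically will bump the pinned ``rev`` entries in ``.pre-commit-config.yaml`` to the latest tagged releases:
+
+.. code-block:: shell
+
+   pre-commit autoupdate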
+
+Example Complete Setup
+----------------------
+
+Here's a complete example of a ``.pre-commit-config.yaml`` file:
+
+.. code-block:: yaml
+
+   # .pre-commit-config.yaml
+   exclude: '^(\.tox|ci/templates|\.bumpversion\.cfg)(/|$)'
+
+   repos:
+     - repo: https://github.com/astral-sh/ruff-pre-commit
+       rev: v0.1.14
+       hooks:
+         - id: ruff
+           args: [--fix, --exit-non-zero-on-fix, --show-fixes]
+
+     - repo: https://github.com/psf/black
+       rev: 23.12.1
+       hooks:
+         - id: black
+
+     - repo: https://github.com/AltimateAI/datapilot-cli
+       rev: v0.0.27
+       hooks:
+         - id: datapilot_run_dbt_checks
+           args: [
+             "--config-path", "./datapilot-config.yaml",
+             "--token", "${DATAPILOT_TOKEN}",
+             "--instance-name", "${DATAPILOT_INSTANCE}",
+             "--manifest-path", "./target/manifest.json",
+             "--catalog-path", "./target/catalog.json"
+           ]
 
 Feedback and Contributions
 --------------------------
diff --git a/src/datapilot/core/platforms/dbt/hooks/executor_hook.py b/src/datapilot/core/platforms/dbt/hooks/executor_hook.py
index ff5d7d78..712694e1 100644
--- a/src/datapilot/core/platforms/dbt/hooks/executor_hook.py
+++ b/src/datapilot/core/platforms/dbt/hooks/executor_hook.py
@@ -1,85 +1,312 @@
 import argparse
+import sys
 import time
+from pathlib import Path
 from typing import Optional
 from typing import Sequence
+from typing import Tuple
 
+from datapilot.clients.altimate.utils import get_all_dbt_configs
 from datapilot.config.config import load_config
 from datapilot.core.platforms.dbt.constants import MODEL
 from datapilot.core.platforms.dbt.constants import PROJECT
 from datapilot.core.platforms.dbt.executor import DBTInsightGenerator
 from datapilot.core.platforms.dbt.formatting import generate_model_insights_table
 from datapilot.core.platforms.dbt.formatting import generate_project_insights_table
+from datapilot.core.platforms.dbt.utils import load_catalog
+from datapilot.core.platforms.dbt.utils import load_manifest
 from datapilot.utils.formatting.utils import tabulate_data
-from datapilot.utils.utils import generate_partial_manifest_catalog
 
 
-def main(argv: Optional[Sequence[str]] = None):
-    start_time = time.time()
+def get_config(name: str, matching_configs: list) -> dict:
+    """Extract config from matching configs by name."""
+    if matching_configs:
+        print(f"Using config from API: {name} Config ID: {matching_configs[0]['id']}", file=sys.stderr)
+        return matching_configs[0].get("config", {})
+    else:
+        print(f"No config found with name: {name}", file=sys.stderr)
+        print("Pre-commit hook failed: Config not found.", file=sys.stderr)
+        sys.exit(1)
+
+
+def validate_config_file(config_path: str) -> bool:
+    """Validate that the config file exists and is not empty."""
+    if not Path(config_path).exists():
+        print(f"Error: Config file '{config_path}' does not exist.", file=sys.stderr)
+        return False
+
+    try:
+        config = load_config(config_path)
+        if not config:
+            print(f"Error: Config file '{config_path}' is empty or invalid.", file=sys.stderr)
+            return False
+        return True
+    except Exception as e:
+        print(f"Error: Failed to load config file '{config_path}': {e}", file=sys.stderr)
+        return False
+
+
+def setup_argument_parser() -> argparse.ArgumentParser:
+    """Set up and return the argument parser."""
     parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--config-path",
-        nargs="*",
-        help="Path of the config file to be used for the insight generation",
-    )
+    parser.add_argument("--config-path", nargs="*", help="Path of the config file to be used for the insight generation")
+    parser.add_argument("--base-path", nargs="*", help="Base path of the dbt project")
+    parser.add_argument("--token", help="Your API token for authentication.")
+    parser.add_argument("--instance-name", help="Your tenant ID.")
+    parser.add_argument("--backend-url", help="Altimate's Backend URL", default="https://api.myaltimate.com")
+    parser.add_argument("--config-name", help="Name of the DBT config to use from the API")
+    parser.add_argument("--manifest-path", help="Path to the DBT manifest file (defaults to ./target/manifest.json)")
+    parser.add_argument("--catalog-path", help="Path to the DBT catalog file (defaults to ./target/catalog.json)")
+    return parser
 
-    parser.add_argument(
-        "--base-path",
-        nargs="*",
-        help="Base path of the dbt project",
-    )
-    args = parser.parse_known_args(argv)
-    # print(f"args: {args}", file=sys.__stdout__)
-    config = {}
-    if hasattr(args[0], "config_path") and args[0].config_path:
-        # print(f"Using config file: {args[0].config_path[0]}")
-        config = load_config(args[0].config_path[0])
-
-    base_path = "./"
-    if hasattr(args[0], "base_path") and args[0].base_path:
-        base_path = args[0].base_path[0]
-
-    changed_files = args[1]
-    # print(f"Changed files: {changed_files}")
-
-    if not changed_files:
-        # print("No changed files detected - test. Exiting...")
-        return
-
-    # print(f"Changed files: {changed_files}", file=sys.__stdout__)
-    selected_models, manifest, catalog = generate_partial_manifest_catalog(changed_files, base_path=base_path)
-    # print("se1ected models", selected_models, file=sys.__stdout__)
 
+def extract_arguments(args) -> Tuple[str, str, str, str, str, str, str, str]:
+    """Extract and return common arguments from parsed args."""
+    config_name = getattr(args, "config_name", None)
+    token = getattr(args, "token", None)
+    instance_name = getattr(args, "instance_name", None)
+    backend_url = getattr(args, "backend_url", "https://api.myaltimate.com")
+
+    # Extract config_path and base_path (they are lists)
+    config_path = args.config_path[0] if args.config_path else None
+    base_path = args.base_path[0] if args.base_path else "./"
+
+    # Extract manifest and catalog paths
+    manifest_path = getattr(args, "manifest_path", None)
+    catalog_path = getattr(args, "catalog_path", None)
+
+    return config_name, token, instance_name, backend_url, config_path, base_path, manifest_path, catalog_path
+
+
+def load_config_from_file(config_path: str) -> dict:
+    """Load configuration from a file."""
+    print(f"Loading config from file: {config_path}", file=sys.stderr)
+    if not validate_config_file(config_path):
+        print("Pre-commit hook failed: Invalid config file.", file=sys.stderr)
+        sys.exit(1)
+    config = load_config(config_path)
+    print("Config loaded successfully from file", file=sys.stderr)
+    return config
+
+
+def load_config_from_api(config_name: str, token: str, instance_name: str, backend_url: str) -> dict:
+    """Load configuration from API."""
+    print(f"Fetching config '{config_name}' from API...", file=sys.stderr)
+    try:
+        configs = get_all_dbt_configs(token, instance_name, backend_url)
+        if configs and "items" in configs:
+            matching_configs = [c for c in configs["items"] if c["name"] == config_name]
+            return get_config(config_name, matching_configs)
+        else:
+            print("Failed to fetch configs from API", file=sys.stderr)
+            print("Pre-commit hook failed: Unable to fetch configs.", file=sys.stderr)
+            sys.exit(1)
+    except Exception as e:
+        print(f"Error fetching config from API: {e}", file=sys.stderr)
+        print("Pre-commit hook failed: API error.", file=sys.stderr)
+        sys.exit(1)
+
+
+def get_configuration(config_name: str, token: str, instance_name: str, backend_url: str, config_path: str) -> dict:
+    """Get configuration from file, API, or use defaults."""
+    if config_path:
+        return load_config_from_file(config_path)
+    elif config_name and token and instance_name:
+        return load_config_from_api(config_name, token, instance_name, backend_url)
+    else:
+        print("No config provided. Using default configuration.", file=sys.stderr)
+        return {}
+
+
+def get_file_paths(base_path: str, manifest_path: str, catalog_path: str) -> Tuple[str, str]:
+    """Determine manifest and catalog file paths."""
+    if not manifest_path:
+        manifest_path = str(Path(base_path) / "target" / "manifest.json")
+        print(f"Using default manifest path: {manifest_path}", file=sys.stderr)
+    else:
+        print(f"Using provided manifest path: {manifest_path}", file=sys.stderr)
+
+    if not catalog_path:
+        catalog_path = str(Path(base_path) / "target" / "catalog.json")
+        print(f"Using default catalog path: {catalog_path}", file=sys.stderr)
+    else:
+        print(f"Using provided catalog path: {catalog_path}", file=sys.stderr)
+
+    return manifest_path, catalog_path
+
+
+def load_manifest_file(manifest_path: str):
+    """Load and validate manifest file."""
+    print("Loading manifest file...", file=sys.stderr)
+    try:
+        manifest = load_manifest(manifest_path)
+        node_count = get_node_count(manifest)
+        print(f"Manifest loaded successfully with {node_count} nodes", file=sys.stderr)
+        return manifest
+    except Exception as e:
+        print(f"Error loading manifest from {manifest_path}: {e}", file=sys.stderr)
+        print("Pre-commit hook failed: Unable to load manifest file.", file=sys.stderr)
+        sys.exit(1)
+
+
+def load_catalog_file(catalog_path: str):
+    """Load catalog file if it exists."""
+    if not Path(catalog_path).exists():
+        print(f"Catalog file not found at {catalog_path}, continuing without catalog", file=sys.stderr)
+        return None
+
+    print("Loading catalog file...", file=sys.stderr)
+    try:
+        catalog = load_catalog(catalog_path)
+        node_count = get_node_count(catalog)
+        print(f"Catalog loaded successfully with {node_count} nodes", file=sys.stderr)
+        return catalog
+    except Exception as e:
+        print(f"Warning: Error loading catalog from {catalog_path}: {e}", file=sys.stderr)
+        print("Continuing without catalog...", file=sys.stderr)
+        return None
+
+
+def get_node_count(obj) -> int:
+    """Get the number of nodes from manifest or catalog object."""
+    if hasattr(obj, "nodes"):
+        return len(obj.nodes)
+    elif hasattr(obj, "get") and callable(obj.get):
+        return len(obj.get("nodes", {}))
+    else:
+        return 0
+
+
+def process_changed_files(changed_files: list) -> list:
+    """Process changed files for selective model testing."""
+    if changed_files:
+        print(f"Processing {len(changed_files)} changed files for selective testing...", file=sys.stderr)
+        print(f"Changed files: {', '.join(changed_files)}", file=sys.stderr)
+        return changed_files
+    else:
+        print("No changed files detected. Running checks on all models.", file=sys.stderr)
+        return []
+
+
+def run_insight_generation(manifest, catalog, config, selected_models, token, instance_name, backend_url):
+    """Run the insight generation process."""
+    print("Initializing DBT Insight Generator...", file=sys.stderr)
     insight_generator = DBTInsightGenerator(
         manifest=manifest,
         catalog=catalog,
         config=config,
-        selected_model_ids=selected_models,
+        selected_models=selected_models,
+        token=token,
+        instance_name=instance_name,
+        backend_url=backend_url,
     )
-    reports = insight_generator.run()
-    if reports:
-        model_report = generate_model_insights_table(reports[MODEL])
-        if len(model_report) > 0:
-            print("--" * 50)
-            print("Model Insights")
-            print("--" * 50)
-            for model_id, report in model_report.items():
-                print(f"Model: {model_id}")
-                print(f"File path: {report['path']}")
-                print(tabulate_data(report["table"], headers="keys"))
-                print("\n")
-
-        project_report = generate_project_insights_table(reports[PROJECT])
-        if len(project_report) > 0:
-            print("--" * 50)
-            print("Project Insights")
-            print("--" * 50)
-            print(tabulate_data(project_report, headers="keys"))
-
-        exit(1)
+
+    print("Running insight generation...", file=sys.stderr)
+    return insight_generator.run()
+
+
+def print_model_insights(model_report: dict):
+    """Print model insights in a formatted table."""
+    print("--" * 50, file=sys.stderr)
+    print("Model Insights", file=sys.stderr)
+    print("--" * 50, file=sys.stderr)
+    for model_id, report in model_report.items():
+        print(f"Model: {model_id}", file=sys.stderr)
+        print(f"File path: {report['path']}", file=sys.stderr)
+        print(tabulate_data(report["table"], headers="keys"), file=sys.stderr)
+        print("\n", file=sys.stderr)
+
+
+def print_project_insights(project_report: dict):
+    """Print project insights in a formatted table."""
+    print("--" * 50, file=sys.stderr)
+    print("Project Insights", file=sys.stderr)
+    print("--" * 50, file=sys.stderr)
+    print(tabulate_data(project_report, headers="keys"), file=sys.stderr)
+
+
+def process_reports(reports: dict) -> bool:
+    """Process and display reports, return True if issues found."""
+    if not reports:
+        print("No insights generated. All checks passed!", file=sys.stderr)
+        return False
+
+    print("Insights generated successfully. Analyzing results...", file=sys.stderr)
+    model_report = generate_model_insights_table(reports[MODEL])
+    project_report = generate_project_insights_table(reports[PROJECT])
+
+    has_model_issues = len(model_report) > 0
+    has_project_issues = len(project_report) > 0
+
+    if has_model_issues:
+        print_model_insights(model_report)
+
+    if has_project_issues:
+        print_project_insights(project_report)
+
+    if has_model_issues or has_project_issues:
+        print("\nPre-commit hook failed: DataPilot found issues that need to be addressed.", file=sys.stderr)
+        return True
+    else:
+        print("All checks passed! No issues found.", file=sys.stderr)
+        return False
+
+
+def log_warnings(token: str, instance_name: str):
+    """Log warnings for missing authentication parameters."""
+    if not token:
+        print("Warning: No API token provided. Using default configuration.", file=sys.stderr)
+        print("To specify a token, use: --token 'your-token'", file=sys.stderr)
+
+    if not instance_name:
+        print("Warning: No instance name provided. Using default configuration.", file=sys.stderr)
+        print("To specify an instance, use: --instance-name 'your-instance'", file=sys.stderr)
+
+
+def main(argv: Optional[Sequence[str]] = None):
+    start_time = time.time()
+    print("Starting DataPilot pre-commit hook...", file=sys.stderr)
+
+    # Parse arguments
+    parser = setup_argument_parser()
+    args = parser.parse_known_args(argv)
+
+    # Extract arguments
+    config_name, token, instance_name, backend_url, config_path, base_path, manifest_path, catalog_path = extract_arguments(args[0])
+
+    # Get configuration
+    config = get_configuration(config_name, token, instance_name, backend_url, config_path)
+
+    # Log warnings for missing auth parameters
+    log_warnings(token, instance_name)
+
+    # Determine file paths
+    manifest_path, catalog_path = get_file_paths(base_path, manifest_path, catalog_path)
+
+    # Load manifest and catalog
+    manifest = load_manifest_file(manifest_path)
+    catalog = load_catalog_file(catalog_path)
+
+    # Process changed files
+    selected_models = process_changed_files(args[1])
+
+    try:
+        # Run insight generation
+        reports = run_insight_generation(manifest, catalog, config, selected_models, token, instance_name, backend_url)
+
+        # Process results
+        has_issues = process_reports(reports)
+        if has_issues:
+            sys.exit(1)
+
+    except Exception as e:
+        print(f"Error running DataPilot checks: {e}", file=sys.stderr)
+        print("Pre-commit hook failed due to an error.", file=sys.stderr)
+        sys.exit(1)
 
     end_time = time.time()
     total_time = end_time - start_time
-    print(f"Total time taken: {round(total_time, 2)} seconds")
+    print(f"DataPilot checks completed successfully in {round(total_time, 2)} seconds", file=sys.stderr)
 
 
 if __name__ == "__main__":