From e51712b07ef06c0e70913eb03b2bb088466be618 Mon Sep 17 00:00:00 2001 From: Andy Xie Date: Mon, 23 Mar 2026 22:56:12 +0800 Subject: [PATCH 1/4] Integrate known failure pattern classification into RCA skill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add optional error classification step that matches job errors against a curated YAML of regex-based failure patterns. This gives Claude verified ground-truth categories instead of guessing, using a standardized 8-category taxonomy (platform_failure, connectivity_failure, authentication_failure, resource_failure, timeout_failure, etc.). The known_failed.yaml is fetched at runtime from a configurable URL or local path — nothing is vendored. Configure via: - CLI: --known-failures-url or --known-failures-file - Env: KNOWN_FAILED_YAML_URL or KNOWN_FAILED_YAML in .claude/settings.local.json Classification is fully optional — the pipeline works without it. Co-Authored-By: Claude Opus 4.6 (1M context) --- skills/root-cause-analysis/README.md | 7 +- skills/root-cause-analysis/SKILL.md | 8 +- .../schemas/summary.schema.json | 10 +- .../root-cause-analysis/scripts/classify.py | 168 ++++++++++++++++++ skills/root-cause-analysis/scripts/cli.py | 40 +++++ .../tests/test_classify.py | 157 ++++++++++++++++ 6 files changed, 384 insertions(+), 6 deletions(-) create mode 100644 skills/root-cause-analysis/scripts/classify.py create mode 100644 skills/root-cause-analysis/tests/test_classify.py diff --git a/skills/root-cause-analysis/README.md b/skills/root-cause-analysis/README.md index b911300..1487963 100644 --- a/skills/root-cause-analysis/README.md +++ b/skills/root-cause-analysis/README.md @@ -46,7 +46,8 @@ Add the following environment variables to your Claude Code settings file: "SPLUNK_INDEX": "your_splunk_index", "SPLUNK_OCP_APP_INDEX": "your_ocp_app_index", "SPLUNK_OCP_INFRA_INDEX": "your_ocp_infra_index", - "SPLUNK_VERIFY_SSL": "false" + "SPLUNK_VERIFY_SSL": "false", + 
"KNOWN_FAILED_YAML_URL": "https://api.github.com/repos/your-org/your-repo/contents/path/to/known_failed.yaml" } } ``` @@ -60,6 +61,7 @@ Update the values: - `SPLUNK_USERNAME` / `SPLUNK_PASSWORD` - Your Splunk credentials - `SPLUNK_INDEX` - Default index for AAP logs - `SPLUNK_OCP_APP_INDEX` / `SPLUNK_OCP_INFRA_INDEX` - OCP log indices +- `KNOWN_FAILED_YAML_URL` - URL to `known_failed.yaml` for error classification (e.g., GitHub API content URL). Uses `GITHUB_TOKEN` for authentication if the URL points to `api.github.com`. The file is cached locally after first fetch. Alternatively, set `KNOWN_FAILED_YAML` to a local file path. Can also be passed via `--known-failures-url` or `--known-failures-file` CLI flags. ### 3. Configure SSH for auto-fetch (optional) @@ -204,7 +206,7 @@ All steps are executed automatically by the `cli.py analyze` command: ### Step 5: Analyze and Generate Summary (Claude) -**Input files**: Read outputs from steps 1-4 (`step1_job_context.json`, `step3_correlation.json`, `step4_github_fetch_history.json`, and if needed, `step2_splunk_logs.json`). +**Input files**: Read outputs from steps 1-4 (`step1_job_context.json`, `step3_correlation.json`, `step4_github_fetch_history.json`, `classification.json`, and if needed, `step2_splunk_logs.json`). 
**Analysis Guidelines**: - **Configuration Analysis**: Variable precedence (role defaults → common.yaml → platform/account.yaml → platform/catalog/env.yaml), check for conflicts, missing variables, secrets references @@ -236,6 +238,7 @@ Analysis results are saved to `.analysis//`: | `step2_splunk_logs.json` | Correlated Splunk pod logs | Python | | `step3_correlation.json` | Unified timeline with correlation proof | Python | | `step4_github_fetch_history.json` | GitHub fetch results (configs and workload code) | Python (Claude updates for MCP verification) | +| `classification.json` | Known failure pattern matches | Python | | `step5_analysis_summary.json` | Root cause summary with recommendations | Claude | ## Correlation Methods diff --git a/skills/root-cause-analysis/SKILL.md b/skills/root-cause-analysis/SKILL.md index 263a2d3..de35b33 100644 --- a/skills/root-cause-analysis/SKILL.md +++ b/skills/root-cause-analysis/SKILL.md @@ -44,7 +44,7 @@ The `cli.py analyze` command automatically runs all steps: - **Step 3**: Correlate → Merge AAP and Splunk events into unified timeline - **Step 4**: Fetch GitHub files → Parse job metadata, fetch AgnosticV configs and AgnosticD workload code (requires `GITHUB_TOKEN` to be configured) -**Outputs**: `.analysis//step1_job_context.json`, `step2_splunk_logs.json`, `step3_correlation.json`, `step4_github_fetch_history.json` +**Outputs**: `.analysis//step1_job_context.json`, `step2_splunk_logs.json`, `step3_correlation.json`, `step4_github_fetch_history.json`, `classification.json` This skill automatically searches for job logs in the configured `JOB_LOGS_DIR`. @@ -77,7 +77,8 @@ python3 -m venv .venv 1. **REQUIRED**: `step1_job_context.json` - Job metadata and failed task details 2. **REQUIRED**: `step3_correlation.json` - Correlated timeline with relevant pod logs (DO NOT read step2 unless needed) 3. **REQUIRED**: `step4_github_fetch_history.json` - Configuration and code context -4. 
**CONDITIONAL**: `step2_splunk_logs.json` - Only read if step3 indicates errors needing deeper investigation +4. **REQUIRED**: `classification.json` - Known failure pattern matches (if present). Use these verified categories instead of guessing. If a match exists, use its `error_category` as the root cause category. If no matches, flag as novel/unclassified failure. +5. **CONDITIONAL**: `step2_splunk_logs.json` - Only read if step3 indicates errors needing deeper investigation **Output**: `.analysis//step5_analysis_summary.json` @@ -96,7 +97,7 @@ python3 -m venv .venv ### Summary Requirements -1. **Root Cause**: Category (`configuration|infrastructure|workload_bug|credential|resource|dependency`), summary, confidence +1. **Root Cause**: Category — prefer `classification.json` categories when matched (`platform_failure|connectivity_failure|authentication_failure|resource_failure|timeout_failure|automation_failure|infrastructure_failure`). Fall back to (`configuration|infrastructure|workload_bug|credential|resource|dependency`) only for novel/unclassified errors. Include summary and confidence. 2. **Evidence**: Supporting evidence from AAP logs, Splunk logs, and GitHub configs/code - **REQUIRED**: When `source` is `agnosticv_config` or `agnosticd_code`, **MUST** include `github_path` in format `owner/repo:path/to/file.yml:line` - Extract GitHub paths from step4: @@ -180,6 +181,7 @@ See `schemas/summary.schema.json` for complete structure. 
Example: | 2 | `step2_splunk_logs.json` | Python | | 3 | `step3_correlation.json` | Python | | 4 | `step4_github_fetch_history.json` | Python (Optional Claude updates for MCP verification) | +| — | `classification.json` | Python (known failure pattern matching) | | 5 | `step5_analysis_summary.json` | Claude | All files in `.analysis//` diff --git a/skills/root-cause-analysis/schemas/summary.schema.json b/skills/root-cause-analysis/schemas/summary.schema.json index df6f5c3..dc4e879 100644 --- a/skills/root-cause-analysis/schemas/summary.schema.json +++ b/skills/root-cause-analysis/schemas/summary.schema.json @@ -31,7 +31,15 @@ "resource", "cloud_api", "secrets", - "unknown" + "unknown", + "platform_failure", + "connectivity_failure", + "authentication_failure", + "resource_failure", + "timeout_failure", + "automation_failure", + "infrastructure_failure", + "general_failure" ] }, "confidence": { diff --git a/skills/root-cause-analysis/scripts/classify.py b/skills/root-cause-analysis/scripts/classify.py new file mode 100644 index 0000000..345e7c7 --- /dev/null +++ b/skills/root-cause-analysis/scripts/classify.py @@ -0,0 +1,168 @@ +"""Classify error messages against known failure patterns. + +Loads a curated YAML file of regex-based error patterns and matches them +against error messages from RCA steps 1 and 3. The YAML file can be +provided via URL, local file path, or CLI flags. + +Configuration (in .claude/settings.local.json env block): + KNOWN_FAILED_YAML_URL — URL to fetch the YAML file (cached locally) + KNOWN_FAILED_YAML — local file path (fallback) +""" + +import os +import re +import tempfile +from pathlib import Path + +import requests +import yaml + +# Cache dir for downloaded known_failed.yaml +_CACHE_DIR = Path(tempfile.gettempdir()) / "rhdp-rca" +_CACHE_FILE = _CACHE_DIR / "known_failed.yaml" + + +def fetch_known_failures_from_url(url: str) -> list[dict]: + """Fetch known failure patterns YAML from a URL. + + Caches the file locally. 
Returns the parsed failures list. + """ + _CACHE_DIR.mkdir(parents=True, exist_ok=True) + + headers = {} + github_token = os.environ.get("GITHUB_TOKEN", "") + if github_token and "api.github.com" in url: + headers["Authorization"] = f"token {github_token}" + headers["Accept"] = "application/vnd.github.v3.raw" + + try: + resp = requests.get(url, headers=headers, timeout=15) + resp.raise_for_status() + _CACHE_FILE.write_text(resp.text) + return _parse_yaml_content(resp.text) + except (requests.RequestException, yaml.YAMLError) as e: + # Fall back to cache if fetch fails + if _CACHE_FILE.exists(): + return load_known_failures(_CACHE_FILE) + print(f" Warning: Failed to fetch known failure patterns: {e}") + return [] + + +def load_known_failures(yaml_path: str | Path) -> list[dict]: + """Load known failure patterns from a local YAML file.""" + path = Path(yaml_path) + if not path.exists(): + return [] + try: + with open(path) as f: + return _parse_yaml_content(f.read()) + except (yaml.YAMLError, OSError): + return [] + + +def _parse_yaml_content(content: str) -> list[dict]: + """Parse YAML content and extract the failures list.""" + data = yaml.safe_load(content) + if not data: + return [] + return data.get("failures", []) + + +def classify_error(error_message: str, known_failures: list[dict]) -> dict | None: + """Match an error message against known failure patterns. + + Returns a dict with classification info on match, or None. 
+ """ + if not error_message or not known_failures: + return None + + error_message = error_message.strip() + + for failure in known_failures: + pattern = failure.get("error_string", "") + if not pattern: + continue + try: + if re.search(pattern, error_message, re.IGNORECASE | re.DOTALL): + return { + "error_category": failure.get("category", "general_failure"), + "matched_pattern": pattern, + "failure_description": failure.get("description", ""), + } + except re.error: + continue + + return None + + +def classify_job_errors( + job_context: dict, correlation: dict, known_failures: list[dict] +) -> list[dict]: + """Classify all error messages found in step1 and step3 outputs. + + Returns a list of classification results (one per matched error). + """ + results: list[dict] = [] + seen_messages: set[str] = set() + + # Collect error messages from step1 failed tasks + for task in job_context.get("failed_tasks", []): + msg = task.get("error_message", "") + if msg and msg not in seen_messages: + seen_messages.add(msg) + match = classify_error(msg, known_failures) + if match: + match["source"] = "aap_failed_task" + match["task"] = task.get("task", "") + results.append(match) + + # Collect error messages from step3 timeline events + # Timeline events store messages in details.message (for aap_job) or + # details.message (for splunk_ocp), not at the top level. + for event in correlation.get("timeline_events", []): + details = event.get("details", {}) + msg = details.get("message", "") or details.get("error_message", "") + if msg and msg not in seen_messages: + seen_messages.add(msg) + match = classify_error(msg, known_failures) + if match: + match["source"] = "correlation_timeline" + results.append(match) + + return results + + +def resolve_known_failures( + url: str | None = None, local_path: str | None = None +) -> list[dict]: + """Resolve and load known failure patterns. 
+ + Args: + url: URL to fetch YAML from (overrides env var) + local_path: Local file path (overrides env var) + + Priority: + 1. Explicit url/local_path arguments (from CLI flags) + 2. KNOWN_FAILED_YAML_URL env var — fetch from URL (cached locally) + 3. KNOWN_FAILED_YAML env var — read from local file path + 4. Returns empty list if none configured + """ + # CLI flag: URL + if url: + return fetch_known_failures_from_url(url) + + # CLI flag: local path + if local_path: + return load_known_failures(local_path) + + # Env var: URL + env_url = os.environ.get("KNOWN_FAILED_YAML_URL", "") + if env_url: + return fetch_known_failures_from_url(env_url) + + # Env var: local path + env_path = os.environ.get("KNOWN_FAILED_YAML", "") + if env_path: + return load_known_failures(env_path) + + return [] diff --git a/skills/root-cause-analysis/scripts/cli.py b/skills/root-cause-analysis/scripts/cli.py index 91bd46c..8ba4602 100644 --- a/skills/root-cause-analysis/scripts/cli.py +++ b/skills/root-cause-analysis/scripts/cli.py @@ -11,6 +11,7 @@ if __name__ == "__main__" and __package__ is None: # Running directly as scripts/cli.py - add parent to path sys.path.insert(0, str(Path(__file__).parent.parent)) + from scripts.classify import classify_job_errors, resolve_known_failures from scripts.config import Config from scripts.correlator import build_correlation_timeline, fetch_correlated_logs from scripts.job_parser import parse_job_log @@ -18,6 +19,7 @@ from scripts.step4_fetch_github import GitHubClient, Step4Analyzer else: # Running as module (-m scripts.cli) + from .classify import classify_job_errors, resolve_known_failures from .config import Config from .correlator import build_correlation_timeline, fetch_correlated_logs from .job_parser import parse_job_log @@ -204,6 +206,36 @@ def cmd_analyze(args: argparse.Namespace, config: Config) -> int: print(f" Error fetching GitHub files: {e}") return 1 + # Classify errors against known failure patterns (optional) + print("\n[Classify] 
Matching errors against known failure patterns...") + known_failures = resolve_known_failures( + url=getattr(args, "known_failures_url", None), + local_path=getattr(args, "known_failures_file", None), + ) + classification_path = analysis_dir / "classification.json" + if known_failures: + classifications = classify_job_errors(job_context, correlation, known_failures) + classification_result = { + "patterns_loaded": len(known_failures), + "matches": classifications, + } + with open(classification_path, "w") as f: + json.dump(classification_result, f, indent=2) + if classifications: + print(f" Matched {len(classifications)} error(s) against known patterns") + for c in classifications: + print(f" - {c['error_category']}: {c['failure_description']}") + else: + print(" No matches — errors may be novel/unclassified") + print(f" Output: {classification_path}") + else: + print(" Skipped: no known failure patterns configured (optional)") + print( + " Hint: Use --known-failures-file or --known-failures-url ," + " or set KNOWN_FAILED_YAML_URL / KNOWN_FAILED_YAML" + " in .claude/settings.local.json env block" + ) + # Print summary print("\n" + "=" * 60) print("Analysis Complete") @@ -356,6 +388,14 @@ def main(): action="store_true", help="Fetch job log from remote server via SSH if not found locally", ) + analyze_parser.add_argument( + "--known-failures-url", + help="URL to fetch known_failed.yaml from (overrides KNOWN_FAILED_YAML_URL env var)", + ) + analyze_parser.add_argument( + "--known-failures-file", + help="Local path to known_failed.yaml (overrides KNOWN_FAILED_YAML env var)", + ) # parse command parse_parser = subparsers.add_parser("parse", help="Parse job log only (Step 1)") diff --git a/skills/root-cause-analysis/tests/test_classify.py b/skills/root-cause-analysis/tests/test_classify.py new file mode 100644 index 0000000..86118b5 --- /dev/null +++ b/skills/root-cause-analysis/tests/test_classify.py @@ -0,0 +1,157 @@ +"""Tests for classify.py — known failure pattern 
matching.""" + +import tempfile +from pathlib import Path + +import yaml + +from scripts.classify import classify_error, classify_job_errors, load_known_failures + + +SAMPLE_FAILURES = [ + { + "error_string": "Shared connection to.*compute.amazonaws.com closed", + "description": "Unable to reach bastion host", + "category": "connectivity_failure", + }, + { + "error_string": "Bootstrap failed to complete: timed out waiting for the condition", + "description": "OpenShift Installer failed due to cloud timeout", + "category": "timeout_failure", + }, + { + "error_string": "MODULE FAILURE", + "description": "Ansible module failure", + "category": "automation_failure", + }, +] + + +def _write_yaml(tmp_dir: Path, failures: list[dict]) -> Path: + path = tmp_dir / "known_failed.yaml" + with open(path, "w") as f: + yaml.dump({"failures": failures}, f) + return path + + +def test_load_known_failures_valid(): + with tempfile.TemporaryDirectory() as tmp: + path = _write_yaml(Path(tmp), SAMPLE_FAILURES) + failures = load_known_failures(path) + assert len(failures) == 3 + + +def test_load_known_failures_missing_file(): + failures = load_known_failures("/nonexistent/path.yaml") + assert failures == [] + + +def test_load_known_failures_empty_file(): + with tempfile.TemporaryDirectory() as tmp: + path = Path(tmp) / "empty.yaml" + path.write_text("") + failures = load_known_failures(path) + assert failures == [] + + +def test_classify_error_match(): + result = classify_error( + "Shared connection to ec2-1-2-3-4.compute.amazonaws.com closed", + SAMPLE_FAILURES, + ) + assert result is not None + assert result["error_category"] == "connectivity_failure" + assert result["failure_description"] == "Unable to reach bastion host" + + +def test_classify_error_no_match(): + result = classify_error("Some completely unknown error", SAMPLE_FAILURES) + assert result is None + + +def test_classify_error_empty_message(): + result = classify_error("", SAMPLE_FAILURES) + assert result is None + + +def 
test_classify_error_empty_patterns(): + result = classify_error("MODULE FAILURE", []) + assert result is None + + +def test_classify_error_invalid_regex_skipped(): + bad_patterns = [ + {"error_string": "[invalid(regex", "description": "bad", "category": "x"}, + {"error_string": "MODULE FAILURE", "description": "ok", "category": "automation_failure"}, + ] + result = classify_error("MODULE FAILURE", bad_patterns) + assert result is not None + assert result["error_category"] == "automation_failure" + + +def test_classify_job_errors_reads_details_message(): + """Timeline events store messages in details.message, not top-level message.""" + job_context = { + "failed_tasks": [ + { + "task": "Connect to bastion", + "error_message": "Shared connection to ec2-1-2-3.compute.amazonaws.com closed", + }, + ] + } + # Matches real build_correlation_timeline() output structure: + # aap_job events have details.error_message + # splunk_ocp events have details.message + correlation = { + "timeline_events": [ + { + "source": "aap_job", + "event_type": "task_failed", + "summary": "Task 'Run module' failed", + "details": {"task": "Run module", "error_message": "MODULE FAILURE"}, + }, + { + "source": "splunk_ocp", + "event_type": "pod_error", + "summary": "Error in pod 'installer-xyz'", + "details": { + "pod_name": "installer-xyz", + "message": "Bootstrap failed to complete: timed out waiting for the condition", + }, + }, + ] + } + results = classify_job_errors(job_context, correlation, SAMPLE_FAILURES) + assert len(results) == 3 + categories = {r["error_category"] for r in results} + assert categories == {"connectivity_failure", "automation_failure", "timeout_failure"} + + +def test_classify_job_errors_deduplicates(): + """Same error in step1 and step3 should only appear once.""" + job_context = { + "failed_tasks": [ + {"task": "Run module", "error_message": "MODULE FAILURE"}, + ] + } + correlation = { + "timeline_events": [ + { + "source": "aap_job", + "details": {"error_message": 
"MODULE FAILURE"}, + }, + ] + } + results = classify_job_errors(job_context, correlation, SAMPLE_FAILURES) + assert len(results) == 1 + assert results[0]["error_category"] == "automation_failure" + + +def test_classify_job_errors_empty_inputs(): + """No errors should produce empty results, not crash.""" + results = classify_job_errors({}, {}, SAMPLE_FAILURES) + assert results == [] + results = classify_job_errors({"failed_tasks": []}, {"timeline_events": []}, SAMPLE_FAILURES) + assert results == [] + results = classify_job_errors({"failed_tasks": []}, {"timeline_events": []}, []) + assert results == [] From 63b1bd775c6552ede01a974a3ff0f44fb980f261 Mon Sep 17 00:00:00 2001 From: Andy Xie Date: Fri, 10 Apr 2026 17:44:51 +0800 Subject: [PATCH 2/4] Fix CI: lazy-import requests and add PyYAML to project deps The `requests` import in classify.py was top-level, causing ModuleNotFoundError in CI since it's not in pyproject.toml deps. Move it to a lazy import inside fetch_known_failures_from_url() where it's actually needed. Add PyYAML to pyproject.toml since classify.py directly uses it. Fix ruff I001 import sorting in test_classify.py. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- pyproject.toml | 1 + skills/root-cause-analysis/scripts/classify.py | 3 ++- skills/root-cause-analysis/tests/test_classify.py | 1 - 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 383e2e8..23231dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,7 @@ classifiers = [ ] dependencies = [ "python-dotenv>=1.0.0", + "PyYAML>=6.0", ] [project.optional-dependencies] diff --git a/skills/root-cause-analysis/scripts/classify.py b/skills/root-cause-analysis/scripts/classify.py index 345e7c7..4ee7bfc 100644 --- a/skills/root-cause-analysis/scripts/classify.py +++ b/skills/root-cause-analysis/scripts/classify.py @@ -14,7 +14,6 @@ import tempfile from pathlib import Path -import requests import yaml # Cache dir for downloaded known_failed.yaml @@ -29,6 +28,8 @@ def fetch_known_failures_from_url(url: str) -> list[dict]: """ _CACHE_DIR.mkdir(parents=True, exist_ok=True) + import requests + headers = {} github_token = os.environ.get("GITHUB_TOKEN", "") if github_token and "api.github.com" in url: diff --git a/skills/root-cause-analysis/tests/test_classify.py b/skills/root-cause-analysis/tests/test_classify.py index 86118b5..6defdca 100644 --- a/skills/root-cause-analysis/tests/test_classify.py +++ b/skills/root-cause-analysis/tests/test_classify.py @@ -7,7 +7,6 @@ from scripts.classify import classify_error, classify_job_errors, load_known_failures - SAMPLE_FAILURES = [ { "error_string": "Shared connection to.*compute.amazonaws.com closed", From b6fc32e7a37e29a57b6afd6bf527200fa22a0ded Mon Sep 17 00:00:00 2001 From: Andy Xie Date: Fri, 10 Apr 2026 17:46:59 +0800 Subject: [PATCH 3/4] Fix ruff format: collapse resolve_known_failures signature to one line Co-Authored-By: Claude Opus 4.6 (1M context) --- skills/root-cause-analysis/scripts/classify.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/skills/root-cause-analysis/scripts/classify.py 
b/skills/root-cause-analysis/scripts/classify.py index 4ee7bfc..f67d596 100644 --- a/skills/root-cause-analysis/scripts/classify.py +++ b/skills/root-cause-analysis/scripts/classify.py @@ -133,9 +133,7 @@ def classify_job_errors( return results -def resolve_known_failures( - url: str | None = None, local_path: str | None = None -) -> list[dict]: +def resolve_known_failures(url: str | None = None, local_path: str | None = None) -> list[dict]: """Resolve and load known failure patterns. Args: From 6715cee179109547cf03a2ad06dbd6b88ff6919f Mon Sep 17 00:00:00 2001 From: Andy Xie Date: Sat, 11 Apr 2026 01:09:53 +0800 Subject: [PATCH 4/4] Address CodeRabbit review: YAML validation, stable classification.json, schema-aligned categories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Validate YAML shape in _parse_yaml_content: reject non-dict top-level, non-list failures, and non-dict entries - Always emit classification.json even when no patterns are configured, so Step 5 has a stable pipeline contract - Fix SKILL.md fallback categories to match summary.schema.json enum (workload_bug → application_bug, credential → secrets) Co-Authored-By: Claude Opus 4.6 (1M context) --- skills/root-cause-analysis/SKILL.md | 2 +- skills/root-cause-analysis/scripts/classify.py | 7 +++++-- skills/root-cause-analysis/scripts/cli.py | 17 ++++++++++------- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/skills/root-cause-analysis/SKILL.md b/skills/root-cause-analysis/SKILL.md index de35b33..0d993db 100644 --- a/skills/root-cause-analysis/SKILL.md +++ b/skills/root-cause-analysis/SKILL.md @@ -97,7 +97,7 @@ python3 -m venv .venv ### Summary Requirements -1. **Root Cause**: Category — prefer `classification.json` categories when matched (`platform_failure|connectivity_failure|authentication_failure|resource_failure|timeout_failure|automation_failure|infrastructure_failure`). 
Fall back to (`configuration|infrastructure|workload_bug|credential|resource|dependency`) only for novel/unclassified errors. Include summary and confidence. +1. **Root Cause**: Category — prefer `classification.json` categories when matched (`platform_failure|connectivity_failure|authentication_failure|resource_failure|timeout_failure|automation_failure|infrastructure_failure|general_failure`). Fall back to (`configuration|infrastructure|application_bug|secrets|resource|dependency`) only for novel/unclassified errors. Include summary and confidence. 2. **Evidence**: Supporting evidence from AAP logs, Splunk logs, and GitHub configs/code - **REQUIRED**: When `source` is `agnosticv_config` or `agnosticd_code`, **MUST** include `github_path` in format `owner/repo:path/to/file.yml:line` - Extract GitHub paths from step4: diff --git a/skills/root-cause-analysis/scripts/classify.py b/skills/root-cause-analysis/scripts/classify.py index f67d596..8c3859c 100644 --- a/skills/root-cause-analysis/scripts/classify.py +++ b/skills/root-cause-analysis/scripts/classify.py @@ -64,9 +64,12 @@ def load_known_failures(yaml_path: str | Path) -> list[dict]: def _parse_yaml_content(content: str) -> list[dict]: """Parse YAML content and extract the failures list.""" data = yaml.safe_load(content) - if not data: + if not isinstance(data, dict): return [] - return data.get("failures", []) + failures = data.get("failures", []) + if not isinstance(failures, list): + return [] + return [f for f in failures if isinstance(f, dict)] def classify_error(error_message: str, known_failures: list[dict]) -> dict | None: diff --git a/skills/root-cause-analysis/scripts/cli.py b/skills/root-cause-analysis/scripts/cli.py index 8ba4602..bdb9164 100644 --- a/skills/root-cause-analysis/scripts/cli.py +++ b/skills/root-cause-analysis/scripts/cli.py @@ -213,28 +213,31 @@ def cmd_analyze(args: argparse.Namespace, config: Config) -> int: local_path=getattr(args, "known_failures_file", None), ) classification_path = 
analysis_dir / "classification.json" + classification_result: dict = { + "patterns_loaded": len(known_failures), + "matches": [], + } if known_failures: classifications = classify_job_errors(job_context, correlation, known_failures) - classification_result = { - "patterns_loaded": len(known_failures), - "matches": classifications, - } - with open(classification_path, "w") as f: - json.dump(classification_result, f, indent=2) + classification_result["matches"] = classifications if classifications: print(f" Matched {len(classifications)} error(s) against known patterns") for c in classifications: print(f" - {c['error_category']}: {c['failure_description']}") else: print(" No matches — errors may be novel/unclassified") - print(f" Output: {classification_path}") else: + classification_result["skipped"] = True + classification_result["reason"] = "no known failure patterns configured" print(" Skipped: no known failure patterns configured (optional)") print( " Hint: Use --known-failures-file or --known-failures-url ," " or set KNOWN_FAILED_YAML_URL / KNOWN_FAILED_YAML" " in .claude/settings.local.json env block" ) + with open(classification_path, "w") as f: + json.dump(classification_result, f, indent=2) + print(f" Output: {classification_path}") # Print summary print("\n" + "=" * 60)