From e51712b07ef06c0e70913eb03b2bb088466be618 Mon Sep 17 00:00:00 2001
From: Andy Xie <xiehandi@gmail.com>
Date: Mon, 23 Mar 2026 22:56:12 +0800
Subject: [PATCH 1/4] Integrate known failure pattern classification into RCA
 skill
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add optional error classification step that matches job errors against
a curated YAML of regex-based failure patterns. This gives Claude
verified ground-truth categories instead of guessing, using a
standardized 8-category taxonomy (platform_failure, connectivity_failure,
authentication_failure, resource_failure, timeout_failure, etc.).

The known_failed.yaml is fetched at runtime from a configurable URL
or local path — nothing is vendored. Configure via:
- CLI: --known-failures-url <url> or --known-failures-file <path>
- Env: KNOWN_FAILED_YAML_URL or KNOWN_FAILED_YAML in
  .claude/settings.local.json

Classification is fully optional — the pipeline works without it.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 skills/root-cause-analysis/README.md          |   7 +-
 skills/root-cause-analysis/SKILL.md           |   8 +-
 .../schemas/summary.schema.json               |  10 +-
 .../root-cause-analysis/scripts/classify.py   | 168 ++++++++++++++++++
 skills/root-cause-analysis/scripts/cli.py     |  40 +++++
 .../tests/test_classify.py                    | 157 ++++++++++++++++
 6 files changed, 384 insertions(+), 6 deletions(-)
 create mode 100644 skills/root-cause-analysis/scripts/classify.py
 create mode 100644 skills/root-cause-analysis/tests/test_classify.py
diff --git a/skills/root-cause-analysis/README.md b/skills/root-cause-analysis/README.md
index b911300..1487963 100644
--- a/skills/root-cause-analysis/README.md
+++ b/skills/root-cause-analysis/README.md
@@ -46,7 +46,8 @@ Add the following environment variables to your Claude Code settings file:
     "SPLUNK_INDEX": "your_splunk_index",
     "SPLUNK_OCP_APP_INDEX": "your_ocp_app_index",
     "SPLUNK_OCP_INFRA_INDEX": "your_ocp_infra_index",
-    "SPLUNK_VERIFY_SSL": "false"
+    "SPLUNK_VERIFY_SSL": "false",
+    "KNOWN_FAILED_YAML_URL": "https://api.github.com/repos/your-org/your-repo/contents/path/to/known_failed.yaml"
   }
 }
 ```
@@ -60,6 +61,7 @@ Update the values:
 - `SPLUNK_USERNAME` / `SPLUNK_PASSWORD` - Your Splunk credentials
 - `SPLUNK_INDEX` - Default index for AAP logs
 - `SPLUNK_OCP_APP_INDEX` / `SPLUNK_OCP_INFRA_INDEX` - OCP log indices
+- `KNOWN_FAILED_YAML_URL` - URL to `known_failed.yaml` for error classification (e.g., GitHub API content URL). Uses `GITHUB_TOKEN` for authentication if the URL contains `api.github.com`. The file is re-fetched on every run; the last successful download is cached locally and used as a fallback if the fetch fails. Alternatively, set `KNOWN_FAILED_YAML` to a local file path (used only when no URL is configured). Both can also be passed via the `--known-failures-url` or `--known-failures-file` CLI flags, which take precedence over the environment variables.
 
 ### 3. Configure SSH for auto-fetch (optional)
 
@@ -204,7 +206,7 @@ All steps are executed automatically by the `cli.py analyze` command:
 
 ### Step 5: Analyze and Generate Summary (Claude)
 
-**Input files**: Read outputs from steps 1-4 (`step1_job_context.json`, `step3_correlation.json`, `step4_github_fetch_history.json`, and if needed, `step2_splunk_logs.json`).
+**Input files**: Read outputs from steps 1-4 (`step1_job_context.json`, `step3_correlation.json`, `step4_github_fetch_history.json`, `classification.json`, and if needed, `step2_splunk_logs.json`).
 
 **Analysis Guidelines**:
 - **Configuration Analysis**: Variable precedence (role defaults → common.yaml → platform/account.yaml → platform/catalog/env.yaml), check for conflicts, missing variables, secrets references
@@ -236,6 +238,7 @@ Analysis results are saved to `.analysis/<job-id>/`:
 | `step2_splunk_logs.json` | Correlated Splunk pod logs | Python |
 | `step3_correlation.json` | Unified timeline with correlation proof | Python |
 | `step4_github_fetch_history.json` | GitHub fetch results (configs and workload code) | Python (Claude updates for MCP verification) |
+| `classification.json` | Known failure pattern matches | Python |
 | `step5_analysis_summary.json` | Root cause summary with recommendations | Claude |
 
 ## Correlation Methods
diff --git a/skills/root-cause-analysis/SKILL.md b/skills/root-cause-analysis/SKILL.md
index 263a2d3..de35b33 100644
--- a/skills/root-cause-analysis/SKILL.md
+++ b/skills/root-cause-analysis/SKILL.md
@@ -44,7 +44,7 @@ The `cli.py analyze` command automatically runs all steps:
 - **Step 3**: Correlate → Merge AAP and Splunk events into unified timeline
 - **Step 4**: Fetch GitHub files → Parse job metadata, fetch AgnosticV configs and AgnosticD workload code (requires `GITHUB_TOKEN` to be configured)
 
-**Outputs**: `.analysis/<job-id>/step1_job_context.json`, `step2_splunk_logs.json`, `step3_correlation.json`, `step4_github_fetch_history.json`
+**Outputs**: `.analysis/<job-id>/step1_job_context.json`, `step2_splunk_logs.json`, `step3_correlation.json`, `step4_github_fetch_history.json`, `classification.json`
 
 This skill automatically searches for job logs in the configured `JOB_LOGS_DIR`.
 
@@ -77,7 +77,8 @@ python3 -m venv .venv
 1. **REQUIRED**: `step1_job_context.json` - Job metadata and failed task details
 2. **REQUIRED**: `step3_correlation.json` - Correlated timeline with relevant pod logs (DO NOT read step2 unless needed)
 3. **REQUIRED**: `step4_github_fetch_history.json` - Configuration and code context
-4. **CONDITIONAL**: `step2_splunk_logs.json` - Only read if step3 indicates errors needing deeper investigation
+4. **REQUIRED**: `classification.json` - Known failure pattern matches (if present). Use these verified categories instead of guessing. If a match exists, use its `error_category` as the root cause category. If no matches, flag as novel/unclassified failure.
+5. **CONDITIONAL**: `step2_splunk_logs.json` - Only read if step3 indicates errors needing deeper investigation
 
 **Output**: `.analysis/<job-id>/step5_analysis_summary.json` 
 
@@ -96,7 +97,7 @@ python3 -m venv .venv
 
 ### Summary Requirements
 
-1. **Root Cause**: Category (`configuration|infrastructure|workload_bug|credential|resource|dependency`), summary, confidence
+1. **Root Cause**: Category — prefer `classification.json` categories when matched (`platform_failure|connectivity_failure|authentication_failure|resource_failure|timeout_failure|automation_failure|infrastructure_failure`). Fall back to (`configuration|infrastructure|workload_bug|credential|resource|dependency`) only for novel/unclassified errors. Include summary and confidence.
 2. **Evidence**: Supporting evidence from AAP logs, Splunk logs, and GitHub configs/code
    - **REQUIRED**: When `source` is `agnosticv_config` or `agnosticd_code`, **MUST** include `github_path` in format `owner/repo:path/to/file.yml:line`
    - Extract GitHub paths from step4:
@@ -180,6 +181,7 @@ See `schemas/summary.schema.json` for complete structure. Example:
 | 2 | `step2_splunk_logs.json` | Python |
 | 3 | `step3_correlation.json` | Python |
 | 4 | `step4_github_fetch_history.json` | Python (Optional Claude updates for MCP verification) |
+| — | `classification.json` | Python (known failure pattern matching) |
 | 5 | `step5_analysis_summary.json` | Claude |
 
 All files in `.analysis/<job-id>/`
diff --git a/skills/root-cause-analysis/schemas/summary.schema.json b/skills/root-cause-analysis/schemas/summary.schema.json
index df6f5c3..dc4e879 100644
--- a/skills/root-cause-analysis/schemas/summary.schema.json
+++ b/skills/root-cause-analysis/schemas/summary.schema.json
@@ -31,7 +31,15 @@
             "resource",
             "cloud_api",
             "secrets",
-            "unknown"
+            "unknown",
+            "platform_failure",
+            "connectivity_failure",
+            "authentication_failure",
+            "resource_failure",
+            "timeout_failure",
+            "automation_failure",
+            "infrastructure_failure",
+            "general_failure"
           ]
         },
         "confidence": {
diff --git a/skills/root-cause-analysis/scripts/classify.py b/skills/root-cause-analysis/scripts/classify.py
new file mode 100644
index 0000000..345e7c7
--- /dev/null
+++ b/skills/root-cause-analysis/scripts/classify.py
@@ -0,0 +1,168 @@
+"""Classify error messages against known failure patterns.
+
+Loads a curated YAML file of regex-based error patterns and matches them
+against error messages from RCA steps 1 and 3. The YAML file can be
+provided via URL, local file path, or CLI flags.
+
+Configuration (in .claude/settings.local.json env block):
+  KNOWN_FAILED_YAML_URL — URL to fetch the YAML file (cached locally)
+  KNOWN_FAILED_YAML     — local file path (fallback)
+"""
+
+import os
+import re
+import tempfile
+from pathlib import Path
+
+import requests
+import yaml
+
+# Cache dir for downloaded known_failed.yaml
+_CACHE_DIR = Path(tempfile.gettempdir()) / "rhdp-rca"
+_CACHE_FILE = _CACHE_DIR / "known_failed.yaml"
+
+
+def fetch_known_failures_from_url(url: str) -> list[dict]:
+    """Fetch known failure patterns YAML from a URL.
+
+    Caches the file locally. Returns the parsed failures list.
+    """
+    _CACHE_DIR.mkdir(parents=True, exist_ok=True)
+
+    headers = {}
+    github_token = os.environ.get("GITHUB_TOKEN", "")
+    if github_token and "api.github.com" in url:
+        headers["Authorization"] = f"token {github_token}"
+        headers["Accept"] = "application/vnd.github.v3.raw"
+
+    try:
+        resp = requests.get(url, headers=headers, timeout=15)
+        resp.raise_for_status()
+        _CACHE_FILE.write_text(resp.text)
+        return _parse_yaml_content(resp.text)
+    except (requests.RequestException, yaml.YAMLError) as e:
+        # Fall back to cache if fetch fails
+        if _CACHE_FILE.exists():
+            return load_known_failures(_CACHE_FILE)
+        print(f"  Warning: Failed to fetch known failure patterns: {e}")
+        return []
+
+
+def load_known_failures(yaml_path: str | Path) -> list[dict]:
+    """Load known failure patterns from a local YAML file."""
+    path = Path(yaml_path)
+    if not path.exists():
+        return []
+    try:
+        with open(path) as f:
+            return _parse_yaml_content(f.read())
+    except (yaml.YAMLError, OSError):
+        return []
+
+
+def _parse_yaml_content(content: str) -> list[dict]:
+    """Parse YAML content and extract the failures list."""
+    data = yaml.safe_load(content)
+    if not data:
+        return []
+    return data.get("failures", [])
+
+
+def classify_error(error_message: str, known_failures: list[dict]) -> dict | None:
+    """Match an error message against known failure patterns.
+
+    Returns a dict with classification info on match, or None.
+    """
+    if not error_message or not known_failures:
+        return None
+
+    error_message = error_message.strip()
+
+    for failure in known_failures:
+        pattern = failure.get("error_string", "")
+        if not pattern:
+            continue
+        try:
+            if re.search(pattern, error_message, re.IGNORECASE | re.DOTALL):
+                return {
+                    "error_category": failure.get("category", "general_failure"),
+                    "matched_pattern": pattern,
+                    "failure_description": failure.get("description", ""),
+                }
+        except re.error:
+            continue
+
+    return None
+
+
+def classify_job_errors(
+    job_context: dict, correlation: dict, known_failures: list[dict]
+) -> list[dict]:
+    """Classify all error messages found in step1 and step3 outputs.
+
+    Returns a list of classification results (one per matched error).
+    """
+    results: list[dict] = []
+    seen_messages: set[str] = set()
+
+    # Collect error messages from step1 failed tasks
+    for task in job_context.get("failed_tasks", []):
+        msg = task.get("error_message", "")
+        if msg and msg not in seen_messages:
+            seen_messages.add(msg)
+            match = classify_error(msg, known_failures)
+            if match:
+                match["source"] = "aap_failed_task"
+                match["task"] = task.get("task", "")
+                results.append(match)
+
+    # Collect error messages from step3 timeline events
+    # Timeline events store messages in details.error_message (for aap_job) or
+    # details.message (for splunk_ocp), not at the top level.
+    for event in correlation.get("timeline_events", []):
+        details = event.get("details", {})
+        msg = details.get("message", "") or details.get("error_message", "")
+        if msg and msg not in seen_messages:
+            seen_messages.add(msg)
+            match = classify_error(msg, known_failures)
+            if match:
+                match["source"] = "correlation_timeline"
+                results.append(match)
+
+    return results
+
+
+def resolve_known_failures(
+    url: str | None = None, local_path: str | None = None
+) -> list[dict]:
+    """Resolve and load known failure patterns.
+
+    Args:
+        url: URL to fetch YAML from (overrides env var)
+        local_path: Local file path (overrides env var)
+
+    Priority:
+    1. Explicit url/local_path arguments (from CLI flags)
+    2. KNOWN_FAILED_YAML_URL env var — fetch from URL (cached locally)
+    3. KNOWN_FAILED_YAML env var — read from local file path
+    4. Returns empty list if none configured
+    """
+    # CLI flag: URL
+    if url:
+        return fetch_known_failures_from_url(url)
+
+    # CLI flag: local path
+    if local_path:
+        return load_known_failures(local_path)
+
+    # Env var: URL
+    env_url = os.environ.get("KNOWN_FAILED_YAML_URL", "")
+    if env_url:
+        return fetch_known_failures_from_url(env_url)
+
+    # Env var: local path
+    env_path = os.environ.get("KNOWN_FAILED_YAML", "")
+    if env_path:
+        return load_known_failures(env_path)
+
+    return []
diff --git a/skills/root-cause-analysis/scripts/cli.py b/skills/root-cause-analysis/scripts/cli.py
index 91bd46c..8ba4602 100644
--- a/skills/root-cause-analysis/scripts/cli.py
+++ b/skills/root-cause-analysis/scripts/cli.py
@@ -11,6 +11,7 @@
 if __name__ == "__main__" and __package__ is None:
     # Running directly as scripts/cli.py - add parent to path
     sys.path.insert(0, str(Path(__file__).parent.parent))
+    from scripts.classify import classify_job_errors, resolve_known_failures
     from scripts.config import Config
     from scripts.correlator import build_correlation_timeline, fetch_correlated_logs
     from scripts.job_parser import parse_job_log
@@ -18,6 +19,7 @@
     from scripts.step4_fetch_github import GitHubClient, Step4Analyzer
 else:
     # Running as module (-m scripts.cli)
+    from .classify import classify_job_errors, resolve_known_failures
     from .config import Config
     from .correlator import build_correlation_timeline, fetch_correlated_logs
     from .job_parser import parse_job_log
@@ -204,6 +206,36 @@ def cmd_analyze(args: argparse.Namespace, config: Config) -> int:
             print(f"  Error fetching GitHub files: {e}")
             return 1
 
+    # Classify errors against known failure patterns (optional)
+    print("\n[Classify] Matching errors against known failure patterns...")
+    known_failures = resolve_known_failures(
+        url=getattr(args, "known_failures_url", None),
+        local_path=getattr(args, "known_failures_file", None),
+    )
+    classification_path = analysis_dir / "classification.json"
+    if known_failures:
+        classifications = classify_job_errors(job_context, correlation, known_failures)
+        classification_result = {
+            "patterns_loaded": len(known_failures),
+            "matches": classifications,
+        }
+        with open(classification_path, "w") as f:
+            json.dump(classification_result, f, indent=2)
+        if classifications:
+            print(f"  Matched {len(classifications)} error(s) against known patterns")
+            for c in classifications:
+                print(f"    - {c['error_category']}: {c['failure_description']}")
+        else:
+            print("  No matches — errors may be novel/unclassified")
+        print(f"  Output: {classification_path}")
+    else:
+        print("  Skipped: no known failure patterns configured (optional)")
+        print(
+            "  Hint: Use --known-failures-file <path> or --known-failures-url <url>,"
+            " or set KNOWN_FAILED_YAML_URL / KNOWN_FAILED_YAML"
+            " in .claude/settings.local.json env block"
+        )
+
     # Print summary
     print("\n" + "=" * 60)
     print("Analysis Complete")
@@ -356,6 +388,14 @@ def main():
         action="store_true",
         help="Fetch job log from remote server via SSH if not found locally",
     )
+    analyze_parser.add_argument(
+        "--known-failures-url",
+        help="URL to fetch known_failed.yaml from (overrides KNOWN_FAILED_YAML_URL env var)",
+    )
+    analyze_parser.add_argument(
+        "--known-failures-file",
+        help="Local path to known_failed.yaml (overrides KNOWN_FAILED_YAML env var)",
+    )
 
     # parse command
     parse_parser = subparsers.add_parser("parse", help="Parse job log only (Step 1)")
diff --git a/skills/root-cause-analysis/tests/test_classify.py b/skills/root-cause-analysis/tests/test_classify.py
new file mode 100644
index 0000000..86118b5
--- /dev/null
+++ b/skills/root-cause-analysis/tests/test_classify.py
@@ -0,0 +1,157 @@
+"""Tests for classify.py — known failure pattern matching."""
+
+import tempfile
+from pathlib import Path
+
+import yaml
+
+from scripts.classify import classify_error, classify_job_errors, load_known_failures
+
+
+SAMPLE_FAILURES = [
+    {
+        "error_string": "Shared connection to.*compute.amazonaws.com closed",
+        "description": "Unable to reach bastion host",
+        "category": "connectivity_failure",
+    },
+    {
+        "error_string": "Bootstrap failed to complete: timed out waiting for the condition",
+        "description": "OpenShift Installer failed due to cloud timeout",
+        "category": "timeout_failure",
+    },
+    {
+        "error_string": "MODULE FAILURE",
+        "description": "Ansible module failure",
+        "category": "automation_failure",
+    },
+]
+
+
+def _write_yaml(tmp_dir: Path, failures: list[dict]) -> Path:
+    path = tmp_dir / "known_failed.yaml"
+    with open(path, "w") as f:
+        yaml.dump({"failures": failures}, f)
+    return path
+
+
+def test_load_known_failures_valid():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = _write_yaml(Path(tmp), SAMPLE_FAILURES)
+        failures = load_known_failures(path)
+        assert len(failures) == 3
+
+
+def test_load_known_failures_missing_file():
+    failures = load_known_failures("/nonexistent/path.yaml")
+    assert failures == []
+
+
+def test_load_known_failures_empty_file():
+    with tempfile.TemporaryDirectory() as tmp:
+        path = Path(tmp) / "empty.yaml"
+        path.write_text("")
+        failures = load_known_failures(path)
+        assert failures == []
+
+
+def test_classify_error_match():
+    result = classify_error(
+        "Shared connection to ec2-1-2-3-4.compute.amazonaws.com closed",
+        SAMPLE_FAILURES,
+    )
+    assert result is not None
+    assert result["error_category"] == "connectivity_failure"
+    assert result["failure_description"] == "Unable to reach bastion host"
+
+
+def test_classify_error_no_match():
+    result = classify_error("Some completely unknown error", SAMPLE_FAILURES)
+    assert result is None
+
+
+def test_classify_error_empty_message():
+    result = classify_error("", SAMPLE_FAILURES)
+    assert result is None
+
+
+def test_classify_error_empty_patterns():
+    result = classify_error("MODULE FAILURE", [])
+    assert result is None
+
+
+def test_classify_error_invalid_regex_skipped():
+    bad_patterns = [
+        {"error_string": "[invalid(regex", "description": "bad", "category": "x"},
+        {"error_string": "MODULE FAILURE", "description": "ok", "category": "automation_failure"},
+    ]
+    result = classify_error("MODULE FAILURE", bad_patterns)
+    assert result is not None
+    assert result["error_category"] == "automation_failure"
+
+
+def test_classify_job_errors_reads_details_message():
+    """Timeline events store messages in details.message, not top-level message."""
+    job_context = {
+        "failed_tasks": [
+            {
+                "task": "Connect to bastion",
+                "error_message": "Shared connection to ec2-1-2-3.compute.amazonaws.com closed",
+            },
+        ]
+    }
+    # Matches real build_correlation_timeline() output structure:
+    # aap_job events have details.error_message
+    # splunk_ocp events have details.message
+    correlation = {
+        "timeline_events": [
+            {
+                "source": "aap_job",
+                "event_type": "task_failed",
+                "summary": "Task 'Run module' failed",
+                "details": {"task": "Run module", "error_message": "MODULE FAILURE"},
+            },
+            {
+                "source": "splunk_ocp",
+                "event_type": "pod_error",
+                "summary": "Error in pod 'installer-xyz'",
+                "details": {
+                    "pod_name": "installer-xyz",
+                    "message": "Bootstrap failed to complete: timed out waiting for the condition",
+                },
+            },
+        ]
+    }
+    results = classify_job_errors(job_context, correlation, SAMPLE_FAILURES)
+    assert len(results) == 3
+    categories = {r["error_category"] for r in results}
+    assert categories == {"connectivity_failure", "automation_failure", "timeout_failure"}
+
+
+def test_classify_job_errors_deduplicates():
+    """Same error in step1 and step3 should only appear once."""
+    job_context = {
+        "failed_tasks": [
+            {"task": "Run module", "error_message": "MODULE FAILURE"},
+        ]
+    }
+    correlation = {
+        "timeline_events": [
+            {
+                "source": "aap_job",
+                "details": {"error_message": "MODULE FAILURE"},
+            },
+        ]
+    }
+    results = classify_job_errors(job_context, correlation, SAMPLE_FAILURES)
+    assert len(results) == 1
+    assert results[0]["error_category"] == "automation_failure"
+
+
+def test_classify_job_errors_empty_inputs():
+    """No errors should produce empty results, not crash."""
+    results = classify_job_errors({}, {}, SAMPLE_FAILURES)
+    assert results == []
+    results = classify_job_errors({"failed_tasks": []}, {"timeline_events": []}, SAMPLE_FAILURES)
+    assert results == []
+    results = classify_job_errors({"failed_tasks": []}, {"timeline_events": []}, [])
+    assert results == []

From 63b1bd775c6552ede01a974a3ff0f44fb980f261 Mon Sep 17 00:00:00 2001
From: Andy Xie <anxie@redhat.com>
Date: Fri, 10 Apr 2026 17:44:51 +0800
Subject: [PATCH 2/4] Fix CI: lazy-import requests and add PyYAML to project
 deps

The `requests` import in classify.py was top-level, causing
ModuleNotFoundError in CI since it's not in pyproject.toml deps.
Move it to a lazy import inside fetch_known_failures_from_url()
where it's actually needed. Add PyYAML to pyproject.toml since
classify.py directly uses it. Fix ruff I001 import sorting in
test_classify.py.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 pyproject.toml                                    | 1 +
 skills/root-cause-analysis/scripts/classify.py    | 3 ++-
 skills/root-cause-analysis/tests/test_classify.py | 1 -
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 383e2e8..23231dd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,6 +22,7 @@ classifiers = [
 ]
 dependencies = [
     "python-dotenv>=1.0.0",
+    "PyYAML>=6.0",
 ]
 
 [project.optional-dependencies]
diff --git a/skills/root-cause-analysis/scripts/classify.py b/skills/root-cause-analysis/scripts/classify.py
index 345e7c7..4ee7bfc 100644
--- a/skills/root-cause-analysis/scripts/classify.py
+++ b/skills/root-cause-analysis/scripts/classify.py
@@ -14,7 +14,6 @@
 import tempfile
 from pathlib import Path
 
-import requests
 import yaml
 
 # Cache dir for downloaded known_failed.yaml
@@ -29,6 +28,8 @@ def fetch_known_failures_from_url(url: str) -> list[dict]:
     """
     _CACHE_DIR.mkdir(parents=True, exist_ok=True)
 
+    import requests
+
     headers = {}
     github_token = os.environ.get("GITHUB_TOKEN", "")
     if github_token and "api.github.com" in url:
diff --git a/skills/root-cause-analysis/tests/test_classify.py b/skills/root-cause-analysis/tests/test_classify.py
index 86118b5..6defdca 100644
--- a/skills/root-cause-analysis/tests/test_classify.py
+++ b/skills/root-cause-analysis/tests/test_classify.py
@@ -7,7 +7,6 @@
 
 from scripts.classify import classify_error, classify_job_errors, load_known_failures
 
-
 SAMPLE_FAILURES = [
     {
         "error_string": "Shared connection to.*compute.amazonaws.com closed",

From b6fc32e7a37e29a57b6afd6bf527200fa22a0ded Mon Sep 17 00:00:00 2001
From: Andy Xie <anxie@redhat.com>
Date: Fri, 10 Apr 2026 17:46:59 +0800
Subject: [PATCH 3/4] Fix ruff format: collapse resolve_known_failures
 signature to one line

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 skills/root-cause-analysis/scripts/classify.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/skills/root-cause-analysis/scripts/classify.py b/skills/root-cause-analysis/scripts/classify.py
index 4ee7bfc..f67d596 100644
--- a/skills/root-cause-analysis/scripts/classify.py
+++ b/skills/root-cause-analysis/scripts/classify.py
@@ -133,9 +133,7 @@ def classify_job_errors(
     return results
 
 
-def resolve_known_failures(
-    url: str | None = None, local_path: str | None = None
-) -> list[dict]:
+def resolve_known_failures(url: str | None = None, local_path: str | None = None) -> list[dict]:
     """Resolve and load known failure patterns.
 
     Args:

From 6715cee179109547cf03a2ad06dbd6b88ff6919f Mon Sep 17 00:00:00 2001
From: Andy Xie <anxie@redhat.com>
Date: Sat, 11 Apr 2026 01:09:53 +0800
Subject: [PATCH 4/4] Address CodeRabbit review: YAML validation, stable
 classification.json, schema-aligned categories
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Validate YAML shape in _parse_yaml_content: reject non-dict top-level,
  non-list failures, and non-dict entries
- Always emit classification.json even when no patterns are configured,
  so Step 5 has a stable pipeline contract
- Fix SKILL.md fallback categories to match summary.schema.json enum
  (workload_bug → application_bug, credential → secrets)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 skills/root-cause-analysis/SKILL.md            |  2 +-
 skills/root-cause-analysis/scripts/classify.py |  7 +++++--
 skills/root-cause-analysis/scripts/cli.py      | 17 ++++++++++-------
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/skills/root-cause-analysis/SKILL.md b/skills/root-cause-analysis/SKILL.md
index de35b33..0d993db 100644
--- a/skills/root-cause-analysis/SKILL.md
+++ b/skills/root-cause-analysis/SKILL.md
@@ -97,7 +97,7 @@ python3 -m venv .venv
 
 ### Summary Requirements
 
-1. **Root Cause**: Category — prefer `classification.json` categories when matched (`platform_failure|connectivity_failure|authentication_failure|resource_failure|timeout_failure|automation_failure|infrastructure_failure`). Fall back to (`configuration|infrastructure|workload_bug|credential|resource|dependency`) only for novel/unclassified errors. Include summary and confidence.
+1. **Root Cause**: Category — prefer `classification.json` categories when matched (`platform_failure|connectivity_failure|authentication_failure|resource_failure|timeout_failure|automation_failure|infrastructure_failure|general_failure`). Fall back to (`configuration|infrastructure|application_bug|secrets|resource|dependency`) only for novel/unclassified errors. Include summary and confidence.
 2. **Evidence**: Supporting evidence from AAP logs, Splunk logs, and GitHub configs/code
    - **REQUIRED**: When `source` is `agnosticv_config` or `agnosticd_code`, **MUST** include `github_path` in format `owner/repo:path/to/file.yml:line`
    - Extract GitHub paths from step4:
diff --git a/skills/root-cause-analysis/scripts/classify.py b/skills/root-cause-analysis/scripts/classify.py
index f67d596..8c3859c 100644
--- a/skills/root-cause-analysis/scripts/classify.py
+++ b/skills/root-cause-analysis/scripts/classify.py
@@ -64,9 +64,12 @@ def load_known_failures(yaml_path: str | Path) -> list[dict]:
 def _parse_yaml_content(content: str) -> list[dict]:
     """Parse YAML content and extract the failures list."""
     data = yaml.safe_load(content)
-    if not data:
+    if not isinstance(data, dict):
         return []
-    return data.get("failures", [])
+    failures = data.get("failures", [])
+    if not isinstance(failures, list):
+        return []
+    return [f for f in failures if isinstance(f, dict)]
 
 
 def classify_error(error_message: str, known_failures: list[dict]) -> dict | None:
diff --git a/skills/root-cause-analysis/scripts/cli.py b/skills/root-cause-analysis/scripts/cli.py
index 8ba4602..bdb9164 100644
--- a/skills/root-cause-analysis/scripts/cli.py
+++ b/skills/root-cause-analysis/scripts/cli.py
@@ -213,28 +213,31 @@ def cmd_analyze(args: argparse.Namespace, config: Config) -> int:
         local_path=getattr(args, "known_failures_file", None),
     )
     classification_path = analysis_dir / "classification.json"
+    classification_result: dict = {
+        "patterns_loaded": len(known_failures),
+        "matches": [],
+    }
     if known_failures:
         classifications = classify_job_errors(job_context, correlation, known_failures)
-        classification_result = {
-            "patterns_loaded": len(known_failures),
-            "matches": classifications,
-        }
-        with open(classification_path, "w") as f:
-            json.dump(classification_result, f, indent=2)
+        classification_result["matches"] = classifications
         if classifications:
             print(f"  Matched {len(classifications)} error(s) against known patterns")
             for c in classifications:
                 print(f"    - {c['error_category']}: {c['failure_description']}")
         else:
             print("  No matches — errors may be novel/unclassified")
-        print(f"  Output: {classification_path}")
     else:
+        classification_result["skipped"] = True
+        classification_result["reason"] = "no known failure patterns configured"
         print("  Skipped: no known failure patterns configured (optional)")
         print(
             "  Hint: Use --known-failures-file <path> or --known-failures-url <url>,"
             " or set KNOWN_FAILED_YAML_URL / KNOWN_FAILED_YAML"
             " in .claude/settings.local.json env block"
         )
+    with open(classification_path, "w") as f:
+        json.dump(classification_result, f, indent=2)
+    print(f"  Output: {classification_path}")
 
     # Print summary
     print("\n" + "=" * 60)