1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ classifiers = [
]
dependencies = [
"python-dotenv>=1.0.0",
"PyYAML>=6.0",
]

[project.optional-dependencies]
Expand Down
7 changes: 5 additions & 2 deletions skills/root-cause-analysis/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ Add the following environment variables to your Claude Code settings file:
"SPLUNK_INDEX": "your_splunk_index",
"SPLUNK_OCP_APP_INDEX": "your_ocp_app_index",
"SPLUNK_OCP_INFRA_INDEX": "your_ocp_infra_index",
"SPLUNK_VERIFY_SSL": "false"
"SPLUNK_VERIFY_SSL": "false",
"KNOWN_FAILED_YAML_URL": "https://api.github.com/repos/your-org/your-repo/contents/path/to/known_failed.yaml"
}
}
```
Expand All @@ -60,6 +61,7 @@ Update the values:
- `SPLUNK_USERNAME` / `SPLUNK_PASSWORD` - Your Splunk credentials
- `SPLUNK_INDEX` - Default index for AAP logs
- `SPLUNK_OCP_APP_INDEX` / `SPLUNK_OCP_INFRA_INDEX` - OCP log indices
- `KNOWN_FAILED_YAML_URL` - URL to `known_failed.yaml` for error classification (e.g., GitHub API content URL). Uses `GITHUB_TOKEN` for authentication if the URL points to `api.github.com`. The file is cached locally after first fetch. Alternatively, set `KNOWN_FAILED_YAML` to a local file path. Can also be passed via `--known-failures-url` or `--known-failures-file` CLI flags.
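
For reference, a minimal `known_failed.yaml` sketch. The structure (a top-level `failures` list with `error_string`, `category`, and `description` per entry) matches what `scripts/classify.py` parses; the concrete patterns and descriptions below are illustrative only:

```yaml
failures:
  - error_string: "connection (refused|timed out)"
    category: connectivity_failure
    description: "Network connectivity problem reaching a managed host"
  - error_string: "quota.*exceeded"
    category: resource_failure
    description: "Cloud provider quota exhausted"
```

Each `error_string` is treated as a regular expression and matched case-insensitively against error messages from the job log and correlated timeline.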

### 3. Configure SSH for auto-fetch (optional)

Expand Down Expand Up @@ -204,7 +206,7 @@ All steps are executed automatically by the `cli.py analyze` command:

### Step 5: Analyze and Generate Summary (Claude)

**Input files**: Read outputs from steps 1-4 (`step1_job_context.json`, `step3_correlation.json`, `step4_github_fetch_history.json`, and if needed, `step2_splunk_logs.json`).
**Input files**: Read outputs from steps 1-4 (`step1_job_context.json`, `step3_correlation.json`, `step4_github_fetch_history.json`, `classification.json`, and if needed, `step2_splunk_logs.json`).

**Analysis Guidelines**:
- **Configuration Analysis**: Variable precedence (role defaults → common.yaml → platform/account.yaml → platform/catalog/env.yaml), check for conflicts, missing variables, secrets references
Expand Down Expand Up @@ -236,6 +238,7 @@ Analysis results are saved to `.analysis/<job-id>/`:
| `step2_splunk_logs.json` | Correlated Splunk pod logs | Python |
| `step3_correlation.json` | Unified timeline with correlation proof | Python |
| `step4_github_fetch_history.json` | GitHub fetch results (configs and workload code) | Python (Claude updates for MCP verification) |
| `classification.json` | Known failure pattern matches | Python |
| `step5_analysis_summary.json` | Root cause summary with recommendations | Claude |
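
A sketch of what `classification.json` may contain — the field names (`patterns_loaded`, `matches`, and the per-match keys) follow what the Python step writes; all values here are hypothetical:

```json
{
  "patterns_loaded": 42,
  "matches": [
    {
      "error_category": "connectivity_failure",
      "matched_pattern": "connection (refused|timed out)",
      "failure_description": "Network connectivity problem",
      "source": "aap_failed_task",
      "task": "Wait for host to become reachable"
    }
  ]
}
```

When no known-failure patterns are configured, the file instead carries `"skipped": true` and a `reason` string.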

## Correlation Methods
Expand Down
8 changes: 5 additions & 3 deletions skills/root-cause-analysis/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ The `cli.py analyze` command automatically runs all steps:
- **Step 3**: Correlate → Merge AAP and Splunk events into unified timeline
- **Step 4**: Fetch GitHub files → Parse job metadata, fetch AgnosticV configs and AgnosticD workload code (requires `GITHUB_TOKEN` to be configured)

**Outputs**: `.analysis/<job-id>/step1_job_context.json`, `step2_splunk_logs.json`, `step3_correlation.json`, `step4_github_fetch_history.json`
**Outputs**: `.analysis/<job-id>/step1_job_context.json`, `step2_splunk_logs.json`, `step3_correlation.json`, `step4_github_fetch_history.json`, `classification.json`

This skill automatically searches for job logs in the configured `JOB_LOGS_DIR`.

Expand Down Expand Up @@ -77,7 +77,8 @@ python3 -m venv .venv
1. **REQUIRED**: `step1_job_context.json` - Job metadata and failed task details
2. **REQUIRED**: `step3_correlation.json` - Correlated timeline with relevant pod logs (DO NOT read step2 unless needed)
3. **REQUIRED**: `step4_github_fetch_history.json` - Configuration and code context
4. **CONDITIONAL**: `step2_splunk_logs.json` - Only read if step3 indicates errors needing deeper investigation
4. **REQUIRED**: `classification.json` - Known failure pattern matches (if present). Use these verified categories instead of guessing. If a match exists, use its `error_category` as the root cause category. If no matches, flag as novel/unclassified failure.
⚠️ Potential issue | 🟠 Major

Clarify classification.json as unconditionally required input.

Line 80 is internally contradictory (REQUIRED vs “if present”). scripts/cli.py always writes classification.json, so Step 5 guidance should be deterministic.

Proposed doc fix
-4. **REQUIRED**: `classification.json` - Known failure pattern matches (if present). Use these verified categories instead of guessing. If a match exists, use its `error_category` as the root cause category. If no matches, flag as novel/unclassified failure.
+4. **REQUIRED**: `classification.json` - Known failure pattern matching result. Always read this file. If a match exists, use its `error_category` as the root cause category. If no matches, treat as novel/unclassified failure.

As per coding guidelines, “Focus on major issues impacting performance, readability, maintainability and security. Avoid nitpicks and avoid verbosity.”


5. **CONDITIONAL**: `step2_splunk_logs.json` - Only read if step3 indicates errors needing deeper investigation

**Output**: `.analysis/<job-id>/step5_analysis_summary.json`

Expand All @@ -96,7 +97,7 @@ python3 -m venv .venv

### Summary Requirements

1. **Root Cause**: Category (`configuration|infrastructure|workload_bug|credential|resource|dependency`), summary, confidence
1. **Root Cause**: Category — prefer `classification.json` categories when matched (`platform_failure|connectivity_failure|authentication_failure|resource_failure|timeout_failure|automation_failure|infrastructure_failure`). Fall back to (`configuration|infrastructure|application_bug|secrets|resource|dependency`) only for novel/unclassified errors. Include summary and confidence.

⚠️ Potential issue | 🟠 Major

Include general_failure in preferred classification categories.

Line 100’s preferred list omits general_failure, but scripts/classify.py can emit general_failure as the default category for a matched pattern entry. That mismatch can force unnecessary fallback handling and weaken category consistency.

Proposed doc fix
-1. **Root Cause**: Category — prefer `classification.json` categories when matched (`platform_failure|connectivity_failure|authentication_failure|resource_failure|timeout_failure|automation_failure|infrastructure_failure`). Fall back to (`configuration|infrastructure|application_bug|secrets|resource|dependency`) only for novel/unclassified errors. Include summary and confidence.
+1. **Root Cause**: Category — prefer `classification.json` categories when matched (`platform_failure|connectivity_failure|authentication_failure|resource_failure|timeout_failure|automation_failure|infrastructure_failure|general_failure`). Fall back to (`configuration|infrastructure|application_bug|secrets|resource|dependency`) only for novel/unclassified errors. Include summary and confidence.

As per coding guidelines, “Focus on major issues impacting performance, readability, maintainability and security. Avoid nitpicks and avoid verbosity.”


2. **Evidence**: Supporting evidence from AAP logs, Splunk logs, and GitHub configs/code
- **REQUIRED**: When `source` is `agnosticv_config` or `agnosticd_code`, **MUST** include `github_path` in format `owner/repo:path/to/file.yml:line`
- Extract GitHub paths from step4:
Expand Down Expand Up @@ -180,6 +181,7 @@ See `schemas/summary.schema.json` for complete structure. Example:
| 2 | `step2_splunk_logs.json` | Python |
| 3 | `step3_correlation.json` | Python |
| 4 | `step4_github_fetch_history.json` | Python (Optional Claude updates for MCP verification) |
| — | `classification.json` | Python (known failure pattern matching) |
| 5 | `step5_analysis_summary.json` | Claude |

All files in `.analysis/<job-id>/`
10 changes: 9 additions & 1 deletion skills/root-cause-analysis/schemas/summary.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,15 @@
"resource",
"cloud_api",
"secrets",
"unknown"
"unknown",
"platform_failure",
"connectivity_failure",
"authentication_failure",
"resource_failure",
"timeout_failure",
"automation_failure",
"infrastructure_failure",
"general_failure"
]
},
"confidence": {
Expand Down
170 changes: 170 additions & 0 deletions skills/root-cause-analysis/scripts/classify.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
"""Classify error messages against known failure patterns.

Loads a curated YAML file of regex-based error patterns and matches them
against error messages from RCA steps 1 and 3. The YAML file can be
provided via URL, local file path, or CLI flags.

Configuration (in .claude/settings.local.json env block):
KNOWN_FAILED_YAML_URL — URL to fetch the YAML file (cached locally)
KNOWN_FAILED_YAML — local file path (fallback)
"""

import os
import re
import tempfile
from pathlib import Path

import yaml

# Cache dir for downloaded known_failed.yaml
_CACHE_DIR = Path(tempfile.gettempdir()) / "rhdp-rca"
_CACHE_FILE = _CACHE_DIR / "known_failed.yaml"


def fetch_known_failures_from_url(url: str) -> list[dict]:
    """Fetch known failure patterns YAML from a URL.

    Caches the file locally. Returns the parsed failures list.
    """
    _CACHE_DIR.mkdir(parents=True, exist_ok=True)

    import requests

    headers = {}
    github_token = os.environ.get("GITHUB_TOKEN", "")
    if github_token and "api.github.com" in url:
        headers["Authorization"] = f"token {github_token}"
        headers["Accept"] = "application/vnd.github.v3.raw"

    try:
        resp = requests.get(url, headers=headers, timeout=15)
        resp.raise_for_status()
        _CACHE_FILE.write_text(resp.text)
        return _parse_yaml_content(resp.text)
    except (requests.RequestException, yaml.YAMLError) as e:
        # Fall back to cache if fetch fails
        if _CACHE_FILE.exists():
            return load_known_failures(_CACHE_FILE)
        print(f" Warning: Failed to fetch known failure patterns: {e}")
        return []
Comment on lines +19 to +49

@coderabbitai coderabbitai bot Apr 10, 2026


⚠️ Potential issue | 🟠 Major

Cache the YAML per source, not in one global file.

_CACHE_FILE is shared across every configured URL. If URL A is cached and URL B later fails to fetch, this code silently classifies B with A’s patterns and returns the wrong “verified” category set. Use a URL-derived cache key, or persist the source URL alongside the cache before falling back.

🧰 Tools
🪛 GitHub Actions: CI

[error] 31-31: mypy: Library stubs not installed for "requests" [import-untyped]


Contributor Author


Valid point — the single global cache file could cause cross-contamination if multiple URLs are used. In practice this is low risk since the configuration pattern is one URL per environment, but it is a real correctness issue. Tracking as a follow-up improvement. For now the single-URL assumption holds for all current usage.


@PalmPalm7, understood — the single-URL-per-environment assumption makes the risk negligible for current usage. Would you like me to open a GitHub issue to track the per-URL cache key improvement so it doesn't get lost?



def load_known_failures(yaml_path: str | Path) -> list[dict]:
    """Load known failure patterns from a local YAML file."""
    path = Path(yaml_path)
    if not path.exists():
        return []
    try:
        with open(path) as f:
            return _parse_yaml_content(f.read())
    except (yaml.YAMLError, OSError):
        return []


def _parse_yaml_content(content: str) -> list[dict]:
    """Parse YAML content and extract the failures list."""
    data = yaml.safe_load(content)
    if not isinstance(data, dict):
        return []
    failures = data.get("failures", [])
    if not isinstance(failures, list):
        return []
    return [f for f in failures if isinstance(f, dict)]


def classify_error(error_message: str, known_failures: list[dict]) -> dict | None:
    """Match an error message against known failure patterns.

    Returns a dict with classification info on match, or None.
    """
    if not error_message or not known_failures:
        return None

    error_message = error_message.strip()

    for failure in known_failures:
        pattern = failure.get("error_string", "")
        if not pattern:
            continue
        try:
            if re.search(pattern, error_message, re.IGNORECASE | re.DOTALL):
                return {
                    "error_category": failure.get("category", "general_failure"),
                    "matched_pattern": pattern,
                    "failure_description": failure.get("description", ""),
                }
        except re.error:
            continue

    return None
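As a standalone illustration of the matching semantics (first pattern wins; case-insensitive with `DOTALL` so patterns can span multi-line messages; invalid regexes are skipped), here is the same loop exercised against two hypothetical pattern entries:

```python
import re

# Hypothetical pattern entries shaped like known_failed.yaml records.
known_failures = [
    {"error_string": r"quota.*exceeded", "category": "resource_failure",
     "description": "Cloud quota exhausted"},
    {"error_string": r"connection (refused|timed out)", "category": "connectivity_failure",
     "description": "Network connectivity problem"},
]

def classify_error(error_message, known_failures):
    """First matching pattern wins; invalid regexes are skipped."""
    if not error_message or not known_failures:
        return None
    error_message = error_message.strip()
    for failure in known_failures:
        pattern = failure.get("error_string", "")
        if not pattern:
            continue
        try:
            if re.search(pattern, error_message, re.IGNORECASE | re.DOTALL):
                return {
                    "error_category": failure.get("category", "general_failure"),
                    "matched_pattern": pattern,
                    "failure_description": failure.get("description", ""),
                }
        except re.error:
            continue
    return None

print(classify_error("fatal: Connection refused by host", known_failures)["error_category"])
# connectivity_failure
```

An unmatched message simply returns `None`, which downstream steps treat as a novel/unclassified failure.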


def classify_job_errors(
    job_context: dict, correlation: dict, known_failures: list[dict]
) -> list[dict]:
    """Classify all error messages found in step1 and step3 outputs.

    Returns a list of classification results (one per matched error).
    """
    results: list[dict] = []
    seen_messages: set[str] = set()

    # Collect error messages from step1 failed tasks
    for task in job_context.get("failed_tasks", []):
        msg = task.get("error_message", "")
        if msg and msg not in seen_messages:
            seen_messages.add(msg)
            match = classify_error(msg, known_failures)
            if match:
                match["source"] = "aap_failed_task"
                match["task"] = task.get("task", "")
                results.append(match)

    # Collect error messages from step3 timeline events.
    # Timeline events store messages in details.message or
    # details.error_message, not at the top level.
    for event in correlation.get("timeline_events", []):
        details = event.get("details", {})
        msg = details.get("message", "") or details.get("error_message", "")
        if msg and msg not in seen_messages:
            seen_messages.add(msg)
            match = classify_error(msg, known_failures)
            if match:
                match["source"] = "correlation_timeline"
                results.append(match)

    return results


def resolve_known_failures(url: str | None = None, local_path: str | None = None) -> list[dict]:
    """Resolve and load known failure patterns.

    Args:
        url: URL to fetch YAML from (overrides env var)
        local_path: Local file path (overrides env var)

    Priority:
        1. Explicit url/local_path arguments (from CLI flags)
        2. KNOWN_FAILED_YAML_URL env var — fetch from URL (cached locally)
        3. KNOWN_FAILED_YAML env var — read from local file path
        4. Returns empty list if none configured
    """
    # CLI flag: URL
    if url:
        return fetch_known_failures_from_url(url)

    # CLI flag: local path
    if local_path:
        return load_known_failures(local_path)

    # Env var: URL
    env_url = os.environ.get("KNOWN_FAILED_YAML_URL", "")
    if env_url:
        return fetch_known_failures_from_url(env_url)

    # Env var: local path
    env_path = os.environ.get("KNOWN_FAILED_YAML", "")
    if env_path:
        return load_known_failures(env_path)

    return []
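The resolution order can be sketched on its own. `resolve_source` below is a hypothetical mirror of the precedence in `resolve_known_failures` — CLI flags beat environment variables, and at each level a URL source is checked before a local path:

```python
import os

def resolve_source(url=None, local_path=None):
    """Hypothetical mirror of the precedence in resolve_known_failures."""
    if url:                                   # 1. CLI flag: URL
        return ("url", url)
    if local_path:                            # 2. CLI flag: local path
        return ("file", local_path)
    env_url = os.environ.get("KNOWN_FAILED_YAML_URL", "")
    if env_url:                               # 3. Env var: URL
        return ("url", env_url)
    env_path = os.environ.get("KNOWN_FAILED_YAML", "")
    if env_path:                              # 4. Env var: local path
        return ("file", env_path)
    return ("none", "")                       # 5. Nothing configured

# A CLI flag wins even when the env var is set:
os.environ["KNOWN_FAILED_YAML_URL"] = "https://example.com/known_failed.yaml"
print(resolve_source(local_path="/tmp/known_failed.yaml"))  # ('file', '/tmp/known_failed.yaml')
```

This ordering lets a one-off `--known-failures-file` run override a fleet-wide `KNOWN_FAILED_YAML_URL` setting without editing configuration.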
43 changes: 43 additions & 0 deletions skills/root-cause-analysis/scripts/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,15 @@
if __name__ == "__main__" and __package__ is None:
    # Running directly as scripts/cli.py - add parent to path
    sys.path.insert(0, str(Path(__file__).parent.parent))
    from scripts.classify import classify_job_errors, resolve_known_failures
    from scripts.config import Config
    from scripts.correlator import build_correlation_timeline, fetch_correlated_logs
    from scripts.job_parser import parse_job_log
    from scripts.log_fetcher import fetch_job_log
    from scripts.step4_fetch_github import GitHubClient, Step4Analyzer
else:
    # Running as module (-m scripts.cli)
    from .classify import classify_job_errors, resolve_known_failures
    from .config import Config
    from .correlator import build_correlation_timeline, fetch_correlated_logs
    from .job_parser import parse_job_log
Expand Down Expand Up @@ -204,6 +206,39 @@ def cmd_analyze(args: argparse.Namespace, config: Config) -> int:
        print(f" Error fetching GitHub files: {e}")
        return 1

    # Classify errors against known failure patterns (optional)
    print("\n[Classify] Matching errors against known failure patterns...")
    known_failures = resolve_known_failures(
        url=getattr(args, "known_failures_url", None),
        local_path=getattr(args, "known_failures_file", None),
    )
    classification_path = analysis_dir / "classification.json"
    classification_result: dict = {
        "patterns_loaded": len(known_failures),
        "matches": [],
    }
    if known_failures:
        classifications = classify_job_errors(job_context, correlation, known_failures)
        classification_result["matches"] = classifications
        if classifications:
            print(f" Matched {len(classifications)} error(s) against known patterns")
            for c in classifications:
                print(f" - {c['error_category']}: {c['failure_description']}")
        else:
            print(" No matches — errors may be novel/unclassified")
    else:
        classification_result["skipped"] = True
        classification_result["reason"] = "no known failure patterns configured"
        print(" Skipped: no known failure patterns configured (optional)")
        print(
            " Hint: Use --known-failures-file <path> or --known-failures-url <url>,"
            " or set KNOWN_FAILED_YAML_URL / KNOWN_FAILED_YAML"
            " in .claude/settings.local.json env block"
        )
    with open(classification_path, "w") as f:
        json.dump(classification_result, f, indent=2)
    print(f" Output: {classification_path}")

    # Print summary
    print("\n" + "=" * 60)
    print("Analysis Complete")
Expand Down Expand Up @@ -356,6 +391,14 @@ def main():
action="store_true",
help="Fetch job log from remote server via SSH if not found locally",
)
    analyze_parser.add_argument(
        "--known-failures-url",
        help="URL to fetch known_failed.yaml from (overrides KNOWN_FAILED_YAML_URL env var)",
    )
    analyze_parser.add_argument(
        "--known-failures-file",
        help="Local path to known_failed.yaml (overrides KNOWN_FAILED_YAML env var)",
    )

# parse command
parse_parser = subparsers.add_parser("parse", help="Parse job log only (Step 1)")
Expand Down