Skip to content

Commit 3dd796e

Browse files
Shreyanand and claude authored
Enhance RCA preflight checks with interactive setup and graceful degr… (#3)
* Enhance RCA preflight checks with interactive setup and graceful degradation - Add env_vars and configurable metadata to all setup checks so Claude can walk users through interactive configuration of missing settings - Make Splunk optional: warn and skip steps 2-3 instead of hard failure - Add SSH config validation (_ssh_host_exists) and ssh_setup_needed flag for interactive SSH config creation - Add MLFlow SSH tunnel auto-start: kill stale tunnels, start fresh, verify reachability with MLFlow-specific response validation - Add JUMPBOX_URI to MLFlow check for tunnel configuration - Fix settings.example.json key: "environment" → "env" to match Claude Code's expected format - Add settings.example.json template - Categorize checks: JOB_LOGS_DIR required, MLFlow recommended, SSH/Splunk/GitHub optional Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 9483e4a commit 3dd796e

9 files changed

Lines changed: 750 additions & 30 deletions

File tree

.claude/settings.example.json

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
{
2+
"env": {
3+
"JOB_LOGS_DIR": "<path-to-local-job-logs>",
4+
"JUMPBOX_URI": "<username>@<jumpbox-host> -p <port>",
5+
"REMOTE_HOST": "<ssh-host-alias>",
6+
"REMOTE_DIR": "<remote-log-directory>",
7+
"SPLUNK_HOST": "<your-splunk-host-url>",
8+
"SPLUNK_USERNAME": "<your-splunk-username>",
9+
"SPLUNK_PASSWORD": "<your-splunk-password>",
10+
"SPLUNK_INDEX": "<your-splunk-index>",
11+
"SPLUNK_OCP_APP_INDEX": "<splunk-ocp-app-index>",
12+
"SPLUNK_OCP_INFRA_INDEX": "<splunk-ocp-infra-index>",
13+
"SPLUNK_VERIFY_SSL": "false",
14+
"GITHUB_TOKEN": "<your-github-token>",
15+
"MLFLOW_PORT": "<localhost-port>",
16+
"MLFLOW_EXPERIMENT_NAME": "<your-experiment-name>",
17+
"MLFLOW_CLAUDE_TRACING_ENABLED": "true",
18+
"MLFLOW_TAG_USER": "<your-username>"
19+
},
20+
"hooks": {
21+
"SessionStart": [
22+
{
23+
"hooks": [
24+
{
25+
"type": "command",
26+
"command": "bash .claude/hooks/session-start.sh"
27+
}
28+
]
29+
}
30+
],
31+
"Stop": [
32+
{
33+
"hooks": [
34+
{
35+
"type": "command",
36+
"command": "python -c \"from mlflow.claude_code.hooks import stop_hook_handler; stop_hook_handler()\""
37+
}
38+
]
39+
}
40+
]
41+
}
42+
}

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,13 +148,15 @@ Step 5 [Claude] Analyze and summarize root cause
148148
- Troubleshoot Kubernetes/OpenShift problems
149149

150150
**[View detailed documentation →](./skills/root-cause-analysis/README.md)**
151+
151152
---
153+
152154
```json
153155
{
154156
"env": {
155-
"REMOTE_HOST":"<remote-host>",
157+
"REMOTE_HOST": "<remote-host>",
156158
"REMOTE_DIR": "<remote-dir>",
157-
"DEFAULT_LOCAL_DIR":"Path.home() / "aiops_extracted_logs"",
159+
"DEFAULT_LOCAL_DIR": "<path-to-local-extracted-logs>",
158160
"JOB_LOGS_DIR": "/path/to/your/extracted_logs",
159161
"SPLUNK_HOST": "<your-remote-splunk>",
160162
"SPLUNK_USERNAME": "your-username",

skills/feedback-capture/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@ The Feedback Capture skill is designed to:
1313
## Components
1414

1515
- **`SKILL.md`**: Defines the agent's behavior, including the steps to ask for feedback, determining the category, and the command to run the formatting script.
16-
- **`scripts/formatting.py`**: Default feedback script. Always writes to `feedback.txt`, and also logs to MLflow if available.
16+
- **`scripts/formatting.py`**: Default feedback script. Always writes to `feedback.json`, and also logs to MLflow if available.
1717
- **Input**: Category, Feedback text, Context summary, Skill name.
18-
- **Output**: Appends a formatted entry to `feedback.txt` (always). If MLflow is installed and configured, also logs a trace to MLflow.
18+
- **Output**: Appends a formatted entry to `feedback.json` (always). If MLflow is installed and configured, also logs a trace to MLflow.
1919
- **`scripts/mlflow_feedback.py`**: Standalone MLflow-only feedback logging. Requires MLflow to be installed and configured. Used by `formatting.py` internally when MLflow is available.
2020

2121
## Usage
@@ -24,7 +24,7 @@ This skill is typically triggered when an interaction completes or when the user
2424

2525
### How it works
2626

27-
1. `formatting.py` **always** saves feedback to `feedback.txt` (file-based, no dependencies)
27+
1. `formatting.py` **always** saves feedback to `feedback.json` (file-based, no dependencies)
2828
2. If MLflow is installed and configured, it **also** logs a trace to MLflow automatically
2929
3. No setup required for basic usage — MLflow tracing is a bonus when available
3030

skills/feedback-capture/scripts/formatting.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def main():
7878
feedback_entries = json.load(f)
7979
if not isinstance(feedback_entries, list):
8080
feedback_entries = []
81-
except (json.JSONDecodeError, Exception) as e:
81+
except Exception as e:
8282
print(f"Warning: Could not read existing feedback.json: {e}")
8383
feedback_entries = []
8484

skills/feedback-capture/scripts/mlflow_feedback.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,18 +86,18 @@ def main() -> int:
8686
"summary": args.context, # MLflow uses "summary" field
8787
"chat_history_file": chat_history_json_filename,
8888
"user": os.environ.get("MLFLOW_TAG_USER", os.environ.get("USER", "unknown")),
89-
"source": "feedback-capture"
89+
"source": "feedback-capture",
9090
}
9191

9292
# Load existing feedback entries or create new list
9393
feedback_entries = []
9494
if feedback_json_filepath.exists():
9595
try:
96-
with open(feedback_json_filepath, "r") as f:
96+
with open(feedback_json_filepath) as f:
9797
feedback_entries = json.load(f)
9898
if not isinstance(feedback_entries, list):
9999
feedback_entries = []
100-
except (json.JSONDecodeError, Exception) as e:
100+
except Exception as e:
101101
print(f"Warning: Could not read existing feedback.json: {e}")
102102
feedback_entries = []
103103

skills/feedback-capture/scripts/utils.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,9 @@ def get_chat_history_jsonl_path(session_id=None):
5252
return None
5353

5454

55-
def upload_feedback_to_jumpbox(feedback_file: Path, chat_history_file: Path, session_id: str | None = None) -> bool:
55+
def upload_feedback_to_jumpbox(
56+
feedback_file: Path, chat_history_file: Path, session_id: str | None = None
57+
) -> bool:
5658
"""Upload feedback.json and chat history to Jumpbox."""
5759
jumpbox_uri = os.environ.get("JUMPBOX_URI", "")
5860

@@ -101,7 +103,7 @@ def upload_feedback_to_jumpbox(feedback_file: Path, chat_history_file: Path, ses
101103
scp_cmd = ["scp"]
102104
if ssh_port:
103105
scp_cmd.extend(["-P", ssh_port])
104-
106+
105107
dest_filename = f"feedback_{session_id}.json" if session_id else feedback_file.name
106108
scp_cmd.extend([str(feedback_file), f"{ssh_target}:/tmp/feedback/{dest_filename}"])
107109

skills/root-cause-analysis/SKILL.md

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,64 @@ Investigate failed jobs by correlating Ansible Automation Platform (AAP) job log
1818
When a user asks to analyze a failed job, execute these steps automatically.
1919
The skill's base path is provided when this skill is invoked. Run scripts relative to this folder.
2020

21-
### Setup (run once per session if .venv doesn't exist)
21+
### Preflight Check (run before first analysis)
2222

2323
```bash
24-
# Create virtual environment and install dependencies
24+
# Create virtual environment and install dependencies (if .venv doesn't exist)
2525
python3 -m venv .venv && .venv/bin/pip install -q -r requirements.txt
26+
27+
# Check all prerequisites (use --json for structured output)
28+
.venv/bin/python scripts/cli.py setup --json
2629
```
2730

31+
Review the JSON output. Some settings are required, others are optional:
32+
33+
**Required** (skill will not proceed without these):
34+
- **JOB_LOGS_DIR** -- Local directory for job log files
35+
- **JUMPBOX_URI** -- SSH jumpbox connection for uploading analysis results and feedback
36+
37+
**Recommended** (analysis works without these but functionality is reduced):
38+
- **MLFlow** -- Tracing configuration for recording analysis runs (MLFLOW_PORT, MLFLOW_EXPERIMENT_NAME, MLFLOW_TAG_USER)
39+
40+
**Optional** (skill runs with reduced functionality when missing):
41+
- **SSH / REMOTE_HOST** not configured: `--fetch` flag won't work (user must provide logs in JOB_LOGS_DIR manually)
42+
- **Splunk** not configured: Steps 2-3 (log correlation) will be skipped
43+
- **GitHub token** not configured: Step 4 (config fetching) will be skipped
44+
45+
#### Interactive Setup for Missing Configs
46+
47+
If any checks have `"status": "missing"` and `"configurable": true`, offer to help the user configure them:
48+
49+
1. List the missing configurable items grouped by check name
50+
2. Ask: "Would you like me to help configure these? I'll walk you through each one."
51+
3. If yes, for each missing check with `"configurable": true`:
52+
- Show the check name and each `env_vars[].prompt` to explain what's needed
53+
- If the env var has a `"default"`, mention it (user can press enter to accept)
54+
- If the env var has `"optional": true`, let the user know they can skip it
55+
- Ask the user for the value
56+
- **SSH special handling**: If the SSH check has `"ssh_setup_needed": true`:
57+
- Ask the user for their SSH host alias name
58+
- Check if that alias already exists in `~/.ssh/config` -- if so, use it as `REMOTE_HOST`
59+
- If it doesn't exist, ask: do you want to create a new SSH config entry? If yes, ask for: hostname, username, port (default 22), and optional identity file path
60+
- Read `~/.ssh/config`, append the new `Host` block, and write it back
61+
- Then set `REMOTE_HOST` to the alias name
62+
4. After collecting all values, read the project's `.claude/settings.json` file
63+
5. Merge the new values into the `"env"` block (create it if it doesn't exist)
64+
6. Write the updated settings file
65+
7. Tell the user to **restart the Claude Code session** for env vars to take effect
66+
8. **Important**: Write secrets (tokens, passwords) to `.claude/settings.json` -- ensure this file is in `.gitignore`
67+
68+
If checks show non-configurable errors (e.g., venv issues, rsync not found), provide the fix command instead.
69+
70+
#### MLFlow Server Startup
71+
72+
The `MLFlow server` preflight check automatically handles server connectivity:
73+
- If the server is unreachable and `JUMPBOX_URI` is configured, it starts an SSH tunnel automatically
74+
- If the tunnel is already running, it skips startup
75+
- If the tunnel fails, it reports the error but the skill can still proceed (MLFlow is recommended, not required)
76+
77+
If any **required** checks (JOB_LOGS_DIR, JUMPBOX_URI) are still missing after the setup flow, do **not** proceed to analysis -- tell the user what's still needed. If MLFlow is missing, warn that tracing won't be recorded but proceed. If all required checks pass (recommended/optional items may remain missing), proceed to analysis.
78+
2879
### Step 1-4: Run the analysis CLI
2980

3081
Always use `--fetch` when analyzing by job ID. This automatically downloads the log from the remote server if it's not already present locally, and skips fetching if the log is already there.

skills/root-cause-analysis/scripts/cli.py

Lines changed: 53 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from scripts.correlator import build_correlation_timeline, fetch_correlated_logs
1717
from scripts.job_parser import parse_job_log
1818
from scripts.log_fetcher import fetch_job_log
19+
from scripts.setup import print_checks, run_checks
1920
from scripts.step4_fetch_github import GitHubClient, Step4Analyzer
2021
from scripts.tracing import HAS_MLFLOW, SpanType, mlflow, trace
2122
else:
@@ -24,6 +25,7 @@
2425
from .correlator import build_correlation_timeline, fetch_correlated_logs
2526
from .job_parser import parse_job_log
2627
from .log_fetcher import fetch_job_log
28+
from .setup import print_checks, run_checks
2729
from .step4_fetch_github import GitHubClient, Step4Analyzer
2830
from .tracing import HAS_MLFLOW, SpanType, mlflow, trace
2931

@@ -116,6 +118,7 @@ def upload_analysis_to_jumpbox(analysis_dir: Path, config: Config) -> bool:
116118
print(f" Error uploading to Jumpbox: {e}")
117119
return False
118120

121+
119122
def get_step_name(step: int) -> str:
120123
"""Get descriptive name for step."""
121124
names = {
@@ -219,13 +222,13 @@ def cmd_analyze(args: argparse.Namespace, config: Config, span=None) -> int:
219222
return 1
220223

221224
# Validate Splunk config
222-
errors = config.validate_splunk()
223-
if errors:
224-
error_message = f"Splunk configuration invalid: {', '.join(errors)}"
225-
print(f"Error: {error_message}")
226-
if span:
227-
span.set_outputs({"error": error_message})
228-
return 1
225+
splunk_errors = config.validate_splunk()
226+
if splunk_errors:
227+
print(f"Warning: Splunk configuration invalid: {', '.join(splunk_errors)}")
228+
print(
229+
" Step 2 (Splunk log fetch) will be skipped. Set SPLUNK_HOST/SPLUNK_USERNAME/SPLUNK_PASSWORD in .claude/settings.json to enable."
230+
)
231+
skip_splunk = bool(splunk_errors)
229232

230233
# GitHub token validation will be done at Step 4 (where it's actually needed)
231234
github_errors = config.validate_github()
@@ -256,17 +259,31 @@ def cmd_analyze(args: argparse.Namespace, config: Config, span=None) -> int:
256259

257260
# Step 2: Fetch Splunk logs
258261
print("\n[Step 2] Fetching Splunk logs...")
259-
try:
260-
splunk_logs = fetch_correlated_logs(config, job_context)
261-
step2_path = save_step(analysis_dir, 2, splunk_logs)
262-
print(f" OCP logs: {len(splunk_logs.get('ocp_logs', []))}")
263-
print(f" Error logs: {len(splunk_logs.get('error_logs', []))}")
264-
print(f" Pods found: {len(splunk_logs.get('pods_found', []))}")
265-
print(f" Output: {step2_path}")
266-
except Exception as e:
267-
print(f" Error fetching Splunk logs: {e}")
268-
splunk_logs = {"ocp_logs": [], "error_logs": [], "pods_found": [], "errors": [str(e)]}
262+
if skip_splunk:
263+
print(" Skipped: Splunk not configured")
264+
splunk_logs = {
265+
"ocp_logs": [],
266+
"error_logs": [],
267+
"pods_found": [],
268+
"skipped": True,
269+
"reason": "Splunk not configured",
270+
}
269271
save_step(analysis_dir, 2, splunk_logs)
272+
else:
273+
try:
274+
splunk_logs = fetch_correlated_logs(config, job_context)
275+
step2_path = save_step(analysis_dir, 2, splunk_logs)
276+
ocp_logs = splunk_logs.get("ocp_logs", [])
277+
error_logs = splunk_logs.get("error_logs", [])
278+
pods_found = splunk_logs.get("pods_found", [])
279+
print(f" OCP logs: {len(ocp_logs) if isinstance(ocp_logs, list) else 0}")
280+
print(f" Error logs: {len(error_logs) if isinstance(error_logs, list) else 0}")
281+
print(f" Pods found: {len(pods_found) if isinstance(pods_found, list) else 0}")
282+
print(f" Output: {step2_path}")
283+
except Exception as e:
284+
print(f" Error fetching Splunk logs: {e}")
285+
splunk_logs = {"ocp_logs": [], "error_logs": [], "pods_found": [], "errors": [str(e)]}
286+
save_step(analysis_dir, 2, splunk_logs)
270287

271288
# Step 3: Build correlation
272289
print("\n[Step 3] Building correlation timeline...")
@@ -447,6 +464,20 @@ def cmd_query(args: argparse.Namespace, config: Config, span=None) -> int:
447464
return 0
448465

449466

467+
def cmd_setup(args: argparse.Namespace, config: Config, span=None) -> int:
468+
"""Run preflight checks for all prerequisites."""
469+
base_dir = Path(__file__).parent.parent
470+
repo_root = base_dir.parent.parent # skills/ -> repo root
471+
results = run_checks(base_dir, repo_root)
472+
473+
if getattr(args, "json", False):
474+
print(json.dumps(results, indent=2))
475+
return 0 if all(r["status"] == "ok" for r in results) else 1
476+
477+
issues = print_checks(results)
478+
return 0 if issues == 0 else 1
479+
480+
450481
def cmd_status(args: argparse.Namespace, config: Config, span=None) -> int:
451482
"""Show analysis status for a job."""
452483
analysis_dir = config.analysis_dir / args.job_id
@@ -526,6 +557,10 @@ def main() -> int:
526557
)
527558
query_parser.add_argument("--output", "-o", help="Output file (default: print summary)")
528559

560+
# setup command
561+
setup_parser = subparsers.add_parser("setup", help="Check prerequisites and configuration")
562+
setup_parser.add_argument("--json", action="store_true", help="Output results as JSON")
563+
529564
# status command
530565
status_parser = subparsers.add_parser("status", help="Show analysis status")
531566
status_parser.add_argument("job_id", help="Job ID to check")
@@ -559,6 +594,7 @@ def main() -> int:
559594
"analyze": cmd_analyze,
560595
"parse": cmd_parse,
561596
"query": cmd_query,
597+
"setup": cmd_setup,
562598
"status": cmd_status,
563599
}
564600

0 commit comments

Comments (0)