Skip to content

Commit 3dd796e

Browse files
Shreyanand and claude authored
Enhance RCA preflight checks with interactive setup and graceful degr… (#3)
* Enhance RCA preflight checks with interactive setup and graceful degradation - Add env_vars and configurable metadata to all setup checks so Claude can walk users through interactive configuration of missing settings - Make Splunk optional: warn and skip steps 2-3 instead of hard failure - Add SSH config validation (_ssh_host_exists) and ssh_setup_needed flag for interactive SSH config creation - Add MLFlow SSH tunnel auto-start: kill stale tunnels, start fresh, verify reachability with MLFlow-specific response validation - Add JUMPBOX_URI to MLFlow check for tunnel configuration - Fix settings.example.json key: "environment" → "env" to match Claude Code's expected format - Add settings.example.json template - Categorize checks: JOB_LOGS_DIR required, MLFlow recommended, SSH/Splunk/GitHub optional Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 9483e4a commit 3dd796e

9 files changed

Lines changed: 750 additions & 30 deletions

File tree

.claude/settings.example.json

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
{
2+
"env": {
3+
"JOB_LOGS_DIR": "<path-to-local-job-logs>",
4+
"JUMPBOX_URI": "<username>@<jumpbox-host> -p <port>",
5+
"REMOTE_HOST": "<ssh-host-alias>",
6+
"REMOTE_DIR": "<remote-log-directory>",
7+
"SPLUNK_HOST": "<your-splunk-host-url>",
8+
"SPLUNK_USERNAME": "<your-splunk-username>",
9+
"SPLUNK_PASSWORD": "<your-splunk-password>",
10+
"SPLUNK_INDEX": "<your-splunk-index>",
11+
"SPLUNK_OCP_APP_INDEX": "<splunk-ocp-app-index>",
12+
"SPLUNK_OCP_INFRA_INDEX": "<splunk-ocp-infra-index>",
13+
"SPLUNK_VERIFY_SSL": "false",
14+
"GITHUB_TOKEN": "<your-github-token>",
15+
"MLFLOW_PORT": "<localhost-port>",
16+
"MLFLOW_EXPERIMENT_NAME": "<your-experiment-name>",
17+
"MLFLOW_CLAUDE_TRACING_ENABLED": "true",
18+
"MLFLOW_TAG_USER": "<your-username>"
19+
},
20+
"hooks": {
21+
"SessionStart": [
22+
{
23+
"hooks": [
24+
{
25+
"type": "command",
26+
"command": "bash .claude/hooks/session-start.sh"
27+
}
28+
]
29+
}
30+
],
31+
"Stop": [
32+
{
33+
"hooks": [
34+
{
35+
"type": "command",
36+
"command": "python -c \"from mlflow.claude_code.hooks import stop_hook_handler; stop_hook_handler()\""
37+
}
38+
]
39+
}
40+
]
41+
}
42+
}

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,13 +148,15 @@ Step 5 [Claude] Analyze and summarize root cause
148148
- Troubleshoot Kubernetes/OpenShift problems
149149

150150
**[View detailed documentation →](./skills/root-cause-analysis/README.md)**
151+
151152
---
153+
152154
```json
153155
{
154156
"env": {
155-
"REMOTE_HOST":"<remote-host>",
157+
"REMOTE_HOST": "<remote-host>",
156158
"REMOTE_DIR": "<remote-dir>",
157-
"DEFAULT_LOCAL_DIR":"Path.home() / "aiops_extracted_logs"",
159+
"DEFAULT_LOCAL_DIR": "<path-to-local-extracted-logs>",
158160
"JOB_LOGS_DIR": "/path/to/your/extracted_logs",
159161
"SPLUNK_HOST": "<your-remote-splunk>",
160162
"SPLUNK_USERNAME": "your-username",

skills/feedback-capture/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@ The Feedback Capture skill is designed to:
1313
## Components
1414

1515
- **`SKILL.md`**: Defines the agent's behavior, including the steps to ask for feedback, determining the category, and the command to run the formatting script.
16-
- **`scripts/formatting.py`**: Default feedback script. Always writes to `feedback.txt`, and also logs to MLflow if available.
16+
- **`scripts/formatting.py`**: Default feedback script. Always writes to `feedback.json`, and also logs to MLflow if available.
1717
- **Input**: Category, Feedback text, Context summary, Skill name.
18-
- **Output**: Appends a formatted entry to `feedback.txt` (always). If MLflow is installed and configured, also logs a trace to MLflow.
18+
- **Output**: Appends a formatted entry to `feedback.json` (always). If MLflow is installed and configured, also logs a trace to MLflow.
1919
- **`scripts/mlflow_feedback.py`**: Standalone MLflow-only feedback logging. Requires MLflow to be installed and configured. Used by `formatting.py` internally when MLflow is available.
2020

2121
## Usage
@@ -24,7 +24,7 @@ This skill is typically triggered when an interaction completes or when the user
2424

2525
### How it works
2626

27-
1. `formatting.py` **always** saves feedback to `feedback.txt` (file-based, no dependencies)
27+
1. `formatting.py` **always** saves feedback to `feedback.json` (file-based, no dependencies)
2828
2. If MLflow is installed and configured, it **also** logs a trace to MLflow automatically
2929
3. No setup required for basic usage — MLflow tracing is a bonus when available
3030

skills/feedback-capture/scripts/formatting.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def main():
7878
feedback_entries = json.load(f)
7979
if not isinstance(feedback_entries, list):
8080
feedback_entries = []
81-
except (json.JSONDecodeError, Exception) as e:
81+
except Exception as e:
8282
print(f"Warning: Could not read existing feedback.json: {e}")
8383
feedback_entries = []
8484

skills/feedback-capture/scripts/mlflow_feedback.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,18 +86,18 @@ def main() -> int:
8686
"summary": args.context, # MLflow uses "summary" field
8787
"chat_history_file": chat_history_json_filename,
8888
"user": os.environ.get("MLFLOW_TAG_USER", os.environ.get("USER", "unknown")),
89-
"source": "feedback-capture"
89+
"source": "feedback-capture",
9090
}
9191

9292
# Load existing feedback entries or create new list
9393
feedback_entries = []
9494
if feedback_json_filepath.exists():
9595
try:
96-
with open(feedback_json_filepath, "r") as f:
96+
with open(feedback_json_filepath) as f:
9797
feedback_entries = json.load(f)
9898
if not isinstance(feedback_entries, list):
9999
feedback_entries = []
100-
except (json.JSONDecodeError, Exception) as e:
100+
except Exception as e:
101101
print(f"Warning: Could not read existing feedback.json: {e}")
102102
feedback_entries = []
103103

skills/feedback-capture/scripts/utils.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,9 @@ def get_chat_history_jsonl_path(session_id=None):
5252
return None
5353

5454

55-
def upload_feedback_to_jumpbox(feedback_file: Path, chat_history_file: Path, session_id: str | None = None) -> bool:
55+
def upload_feedback_to_jumpbox(
56+
feedback_file: Path, chat_history_file: Path, session_id: str | None = None
57+
) -> bool:
5658
"""Upload feedback.json and chat history to Jumpbox."""
5759
jumpbox_uri = os.environ.get("JUMPBOX_URI", "")
5860

@@ -101,7 +103,7 @@ def upload_feedback_to_jumpbox(feedback_file: Path, chat_history_file: Path, ses
101103
scp_cmd = ["scp"]
102104
if ssh_port:
103105
scp_cmd.extend(["-P", ssh_port])
104-
106+
105107
dest_filename = f"feedback_{session_id}.json" if session_id else feedback_file.name
106108
scp_cmd.extend([str(feedback_file), f"{ssh_target}:/tmp/feedback/{dest_filename}"])
107109

skills/root-cause-analysis/SKILL.md

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,64 @@ Investigate failed jobs by correlating Ansible Automation Platform (AAP) job log
1818
When a user asks to analyze a failed job, execute these steps automatically.
1919
The skill's base path is provided when this skill is invoked. Run scripts relative to this folder.
2020

21-
### Setup (run once per session if .venv doesn't exist)
21+
### Preflight Check (run before first analysis)
2222

2323
```bash
24-
# Create virtual environment and install dependencies
24+
# Create virtual environment and install dependencies (if .venv doesn't exist)
2525
python3 -m venv .venv && .venv/bin/pip install -q -r requirements.txt
26+
27+
# Check all prerequisites (use --json for structured output)
28+
.venv/bin/python scripts/cli.py setup --json
2629
```
2730

31+
Review the JSON output. Some settings are required, others are optional:
32+
33+
**Required** (skill will not proceed without these):
34+
- **JOB_LOGS_DIR** -- Local directory for job log files
35+
- **JUMPBOX_URI** -- SSH jumpbox connection for uploading analysis results and feedback
36+
37+
**Recommended** (analysis works without these but functionality is reduced):
38+
- **MLFlow** -- Tracing configuration for recording analysis runs (MLFLOW_PORT, MLFLOW_EXPERIMENT_NAME, MLFLOW_TAG_USER)
39+
40+
**Optional** (skill runs with reduced functionality when missing):
41+
- **SSH / REMOTE_HOST** not configured: `--fetch` flag won't work (user must provide logs in JOB_LOGS_DIR manually)
42+
- **Splunk** not configured: Steps 2-3 (log correlation) will be skipped
43+
- **GitHub token** not configured: Step 4 (config fetching) will be skipped
44+
45+
#### Interactive Setup for Missing Configs
46+
47+
If any checks have `"status": "missing"` and `"configurable": true`, offer to help the user configure them:
48+
49+
1. List the missing configurable items grouped by check name
50+
2. Ask: "Would you like me to help configure these? I'll walk you through each one."
51+
3. If yes, for each missing check with `"configurable": true`:
52+
- Show the check name and each `env_vars[].prompt` to explain what's needed
53+
- If the env var has a `"default"`, mention it (user can press enter to accept)
54+
- If the env var has `"optional": true`, let the user know they can skip it
55+
- Ask the user for the value
56+
- **SSH special handling**: If the SSH check has `"ssh_setup_needed": true`:
57+
- Ask the user for their SSH host alias name
58+
- Check if that alias already exists in `~/.ssh/config` -- if so, use it as `REMOTE_HOST`
59+
- If it doesn't exist, ask: do you want to create a new SSH config entry? If yes, ask for: hostname, username, port (default 22), and optional identity file path
60+
- Read `~/.ssh/config`, append the new `Host` block, and write it back
61+
- Then set `REMOTE_HOST` to the alias name
62+
4. After collecting all values, read the project's `.claude/settings.json` file
63+
5. Merge the new values into the `"env"` block (create it if it doesn't exist)
64+
6. Write the updated settings file
65+
7. Tell the user to **restart the Claude Code session** for env vars to take effect
66+
8. **Important**: Write secrets (tokens, passwords) to `.claude/settings.json` -- ensure this file is in `.gitignore`
67+
68+
If checks show non-configurable errors (e.g., venv issues, rsync not found), provide the fix command instead.
69+
70+
#### MLFlow Server Startup
71+
72+
The `MLFlow server` preflight check automatically handles server connectivity:
73+
- If the server is unreachable and `JUMPBOX_URI` is configured, it starts an SSH tunnel automatically
74+
- If the tunnel is already running, it skips startup
75+
- If the tunnel fails, it reports the error but the skill can still proceed (MLFlow is recommended, not required)
76+
77+
If any **required** checks (JOB_LOGS_DIR, JUMPBOX_URI) are still missing after the setup flow, do **not** proceed to analysis -- tell the user what's still needed. If MLFlow is missing, warn that tracing won't be recorded but proceed. If all required checks pass (recommended/optional items may remain missing), proceed to analysis.
78+
2879
### Step 1-4: Run the analysis CLI
2980

3081
Always use `--fetch` when analyzing by job ID. This automatically downloads the log from the remote server if it's not already present locally, and skips fetching if the log is already there.

skills/root-cause-analysis/scripts/cli.py

Lines changed: 53 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from scripts.correlator import build_correlation_timeline, fetch_correlated_logs
1717
from scripts.job_parser import parse_job_log
1818
from scripts.log_fetcher import fetch_job_log
19+
from scripts.setup import print_checks, run_checks
1920
from scripts.step4_fetch_github import GitHubClient, Step4Analyzer
2021
from scripts.tracing import HAS_MLFLOW, SpanType, mlflow, trace
2122
else:
@@ -24,6 +25,7 @@
2425
from .correlator import build_correlation_timeline, fetch_correlated_logs
2526
from .job_parser import parse_job_log
2627
from .log_fetcher import fetch_job_log
28+
from .setup import print_checks, run_checks
2729
from .step4_fetch_github import GitHubClient, Step4Analyzer
2830
from .tracing import HAS_MLFLOW, SpanType, mlflow, trace
2931

@@ -116,6 +118,7 @@ def upload_analysis_to_jumpbox(analysis_dir: Path, config: Config) -> bool:
116118
print(f" Error uploading to Jumpbox: {e}")
117119
return False
118120

121+
119122
def get_step_name(step: int) -> str:
120123
"""Get descriptive name for step."""
121124
names = {
@@ -219,13 +222,13 @@ def cmd_analyze(args: argparse.Namespace, config: Config, span=None) -> int:
219222
return 1
220223

221224
# Validate Splunk config
222-
errors = config.validate_splunk()
223-
if errors:
224-
error_message = f"Splunk configuration invalid: {', '.join(errors)}"
225-
print(f"Error: {error_message}")
226-
if span:
227-
span.set_outputs({"error": error_message})
228-
return 1
225+
splunk_errors = config.validate_splunk()
226+
if splunk_errors:
227+
print(f"Warning: Splunk configuration invalid: {', '.join(splunk_errors)}")
228+
print(
229+
" Step 2 (Splunk log fetch) will be skipped. Set SPLUNK_HOST/SPLUNK_USERNAME/SPLUNK_PASSWORD in .claude/settings.json to enable."
230+
)
231+
skip_splunk = bool(splunk_errors)
229232

230233
# GitHub token validation will be done at Step 4 (where it's actually needed)
231234
github_errors = config.validate_github()
@@ -256,17 +259,31 @@ def cmd_analyze(args: argparse.Namespace, config: Config, span=None) -> int:
256259

257260
# Step 2: Fetch Splunk logs
258261
print("\n[Step 2] Fetching Splunk logs...")
259-
try:
260-
splunk_logs = fetch_correlated_logs(config, job_context)
261-
step2_path = save_step(analysis_dir, 2, splunk_logs)
262-
print(f" OCP logs: {len(splunk_logs.get('ocp_logs', []))}")
263-
print(f" Error logs: {len(splunk_logs.get('error_logs', []))}")
264-
print(f" Pods found: {len(splunk_logs.get('pods_found', []))}")
265-
print(f" Output: {step2_path}")
266-
except Exception as e:
267-
print(f" Error fetching Splunk logs: {e}")
268-
splunk_logs = {"ocp_logs": [], "error_logs": [], "pods_found": [], "errors": [str(e)]}
262+
if skip_splunk:
263+
print(" Skipped: Splunk not configured")
264+
splunk_logs = {
265+
"ocp_logs": [],
266+
"error_logs": [],
267+
"pods_found": [],
268+
"skipped": True,
269+
"reason": "Splunk not configured",
270+
}
269271
save_step(analysis_dir, 2, splunk_logs)
272+
else:
273+
try:
274+
splunk_logs = fetch_correlated_logs(config, job_context)
275+
step2_path = save_step(analysis_dir, 2, splunk_logs)
276+
ocp_logs = splunk_logs.get("ocp_logs", [])
277+
error_logs = splunk_logs.get("error_logs", [])
278+
pods_found = splunk_logs.get("pods_found", [])
279+
print(f" OCP logs: {len(ocp_logs) if isinstance(ocp_logs, list) else 0}")
280+
print(f" Error logs: {len(error_logs) if isinstance(error_logs, list) else 0}")
281+
print(f" Pods found: {len(pods_found) if isinstance(pods_found, list) else 0}")
282+
print(f" Output: {step2_path}")
283+
except Exception as e:
284+
print(f" Error fetching Splunk logs: {e}")
285+
splunk_logs = {"ocp_logs": [], "error_logs": [], "pods_found": [], "errors": [str(e)]}
286+
save_step(analysis_dir, 2, splunk_logs)
270287

271288
# Step 3: Build correlation
272289
print("\n[Step 3] Building correlation timeline...")
@@ -447,6 +464,20 @@ def cmd_query(args: argparse.Namespace, config: Config, span=None) -> int:
447464
return 0
448465

449466

467+
def cmd_setup(args: argparse.Namespace, config: Config, span=None) -> int:
468+
"""Run preflight checks for all prerequisites."""
469+
base_dir = Path(__file__).parent.parent
470+
repo_root = base_dir.parent.parent # skills/ -> repo root
471+
results = run_checks(base_dir, repo_root)
472+
473+
if getattr(args, "json", False):
474+
print(json.dumps(results, indent=2))
475+
return 0 if all(r["status"] == "ok" for r in results) else 1
476+
477+
issues = print_checks(results)
478+
return 0 if issues == 0 else 1
479+
480+
450481
def cmd_status(args: argparse.Namespace, config: Config, span=None) -> int:
451482
"""Show analysis status for a job."""
452483
analysis_dir = config.analysis_dir / args.job_id
@@ -526,6 +557,10 @@ def main() -> int:
526557
)
527558
query_parser.add_argument("--output", "-o", help="Output file (default: print summary)")
528559

560+
# setup command
561+
setup_parser = subparsers.add_parser("setup", help="Check prerequisites and configuration")
562+
setup_parser.add_argument("--json", action="store_true", help="Output results as JSON")
563+
529564
# status command
530565
status_parser = subparsers.add_parser("status", help="Show analysis status")
531566
status_parser.add_argument("job_id", help="Job ID to check")
@@ -559,6 +594,7 @@ def main() -> int:
559594
"analyze": cmd_analyze,
560595
"parse": cmd_parse,
561596
"query": cmd_query,
597+
"setup": cmd_setup,
562598
"status": cmd_status,
563599
}
564600

0 commit comments

Comments (0)