diff --git a/conftest.py b/conftest.py index 58b74e80e..a67ea7912 100644 --- a/conftest.py +++ b/conftest.py @@ -31,6 +31,12 @@ def pytest_addoption(parser): default=False, help="Skip running after_test commands for test cases (useful for debugging test failures)", ) + parser.addoption( + "--only-setup", + action="store_true", + default=False, + help="Only run before_test setup commands, skip the actual test execution", + ) def pytest_configure(config): diff --git a/holmes/plugins/runbooks/high_latency_investigation.md b/holmes/plugins/runbooks/high_latency_investigation.md new file mode 100644 index 000000000..810206aad --- /dev/null +++ b/holmes/plugins/runbooks/high_latency_investigation.md @@ -0,0 +1,308 @@ +# Performance Investigation Runbook + +## Overview +This runbook guides the investigation of performance issues by analyzing both metrics and distributed traces to identify root causes. It works with any metric and trace attribute naming convention. + +## Prerequisites +- Prometheus or compatible metrics system +- Tempo or compatible distributed tracing system +- Service instrumentation with trace context + +## Investigation Steps + +### 1. Discover Available Metrics and Labels + +First, discover what metrics and labels are available: + +``` +# Use prometheus/metrics toolset +list_available_metrics( + name_filter="duration|latency|time", + type_filter="histogram" +) +``` + +### 2. Identify Affected Services and Operations + +Find which operations have high values: + +``` +# Use prometheus/advanced-latency toolset +find_top_metric_values( + metric_name="${your_latency_metric}", + group_by_label="${endpoint_label}", + top_n=10, + percentile=0.95, + time_range="30m" +) +``` + +### 3. Analyze Metric Distribution + +Understand if the latency is consistent or has specific patterns: + +``` +analyze_metric_distribution( + metric_name="${your_latency_metric}", + label_filters={"${service_label}": "${affected_service}"}, + time_range="1h" +) +``` + +Look for: +- Bimodal distributions (suggesting two different code paths) +- Long tail latencies (small percentage of very slow requests) +- Consistent high latency (systemic issue) + +### 4. Break Down by Available Dimensions + +Analyze by the labels available in your metrics: + +``` +analyze_metric_by_dimensions( + metric_name="${your_latency_metric}", + group_by=["${discovered_labels}"], # Use labels discovered in step 1 + filters={"${service_label}": "${affected_service}"}, + percentiles=[0.5, 0.95, 0.99], + time_range="1h" +) +``` + +Key patterns to identify based on your available labels: +- Specific operations or endpoints +- Different request types or methods +- Error conditions +- Client or user segments + +### 5. Discover Trace Attributes and Find Slow Traces + +First discover available span attributes: + +``` +# Use tempo toolset +fetch_tempo_tags( + start_datetime="-1h", + end_datetime="now" +) +``` + +Then find example slow traces: + +``` +# Use grafana/tempo toolset +fetch_tempo_traces( + service_name="${affected_service}", + min_duration="${threshold_duration}", + start_datetime="-30m", + limit=10 +) +``` + +### 6. Analyze Trace Breakdown + +For each slow trace, identify where time is spent: + +``` +analyze_trace_latency_breakdown( + trace_id="${trace_id}", + include_dependencies=true +) +``` + +Look for: +- Long-running spans +- Sequential operations that could be parallelized +- External service calls with high latency +- Database queries taking excessive time + +### 7. 
Analyze Span Attributes + +Group traces by discovered attributes to find patterns: + +``` +analyze_span_attributes( + service_name="${affected_service}", + group_by_attributes=["${discovered_attributes}"], # Use attributes from step 5 + min_duration="500ms", + aggregation="p95", + time_range="1h" +) +``` + +This helps identify patterns based on your actual span attributes: +- Specific operations or endpoints +- User or tenant segments +- External dependencies +- Error conditions + +### 8. Analyze Operation Patterns + +Analyze operations within traces: + +``` +analyze_span_operations( + service_name="${affected_service}", + operation_type_attribute="${operation_attribute}", # e.g., 'db.system', 'rpc.method' + min_duration="100ms", + group_by_attributes=["${relevant_attributes}"], + time_range="1h" +) +``` + +Look for: +- N+1 query problems +- Missing indexes +- Lock contention +- Slow aggregation queries + +### 9. Correlate with Resource Metrics + +Identify resource metrics to correlate: + +``` +# First find available resource metrics +list_available_metrics( + name_filter="cpu|memory|disk|network|connection", + type_filter="gauge" +) + +# Then correlate +correlate_metrics( + primary_metric="${your_latency_metric}", + correlation_metrics=["${discovered_resource_metrics}"], + label_filters={"${service_label}": "${affected_service}"}, + time_range="1h" +) +``` + +### 10. Compare with Historical Baseline + +Determine if this is a new issue or degradation: + +``` +compare_metric_periods( + metric_name="${your_latency_metric}", + current_period="1h", + comparison_period="24h", + group_by=["${relevant_labels}"], + threshold_percent=20 +) +``` + +### 11. Trace Service Dependencies + +Understand the full request flow and identify bottlenecks: + +``` +trace_service_dependencies( + root_service="${affected_service}", + latency_threshold="100ms", + time_range="1h" +) +``` + +### 12. Check for Anomalies + +Detect unusual patterns in metrics: + +``` +detect_metric_anomalies( + metric_name="${your_latency_metric}", + sensitivity=3, + lookback_window="7d", + group_by=["${relevant_labels}"] +) +``` + +And in traces: + +``` +detect_trace_anomalies( + service_name="${affected_service}", + baseline_window="24h", + sensitivity=3, + anomaly_types=["latency", "errors", "span_count"] +) +``` + +## Common Root Causes and Solutions + +### 1. Database Issues +**Symptoms**: High database query duration in traces +**Solutions**: +- Add missing indexes +- Optimize queries +- Implement caching +- Use read replicas for read-heavy workloads + +### 2. N+1 Query Problems +**Symptoms**: Multiple similar database queries in a single trace +**Solutions**: +- Implement eager loading +- Use batch queries +- Add caching layer + +### 3. External Service Latency +**Symptoms**: High latency in spans calling external services +**Solutions**: +- Implement circuit breakers +- Add timeouts +- Use asynchronous processing +- Cache external service responses + +### 4. Resource Constraints +**Symptoms**: High CPU/memory correlation with latency +**Solutions**: +- Scale horizontally (add more pods/instances) +- Scale vertically (increase resource limits) +- Optimize code for efficiency +- Implement rate limiting + +### 5. Inefficient Code Paths +**Symptoms**: Specific request patterns much slower +**Solutions**: +- Profile and optimize hot paths +- Implement caching +- Parallelize independent operations +- Use more efficient algorithms + +### 6. 
Network Issues +**Symptoms**: Intermittent high latency, timeouts +**Solutions**: +- Check network connectivity +- Verify DNS resolution times +- Review firewall/proxy configurations +- Consider service mesh overhead + +### 7. Configuration Issues +**Symptoms**: Sudden latency increase after deployment +**Solutions**: +- Review recent configuration changes +- Check timeout settings +- Verify connection pool sizes +- Review retry configurations + +## Escalation Criteria + +Escalate to senior engineers or SRE team if: +- Latency affects > 10% of requests +- P95 latency exceeds SLO by 2x +- Issue persists after initial mitigation attempts +- Multiple services are affected simultaneously +- Data loss or corruption is suspected + +## Monitoring and Alerting + +Set up alerts for: +- P95 latency exceeding threshold +- Sudden latency spike (> 50% increase) +- Error rate correlation with latency +- Resource utilization above 80% + +## Post-Incident Actions + +1. Document root cause and timeline +2. Update runbook with new findings +3. Implement additional monitoring if gaps found +4. Consider architectural improvements +5. Share learnings with the team diff --git a/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py b/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py index c1c41daa8..9894899f8 100644 --- a/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +++ b/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py @@ -22,7 +22,10 @@ query_tempo_trace_by_id, query_tempo_traces, ) -from holmes.plugins.toolsets.grafana.trace_parser import format_traces_list +from holmes.plugins.toolsets.grafana.trace_parser import ( + format_traces_list, + build_span_hierarchy, +) from holmes.plugins.toolsets.logging_utils.logging_api import ( DEFAULT_TIME_SPAN_SECONDS, ) @@ -47,6 +50,8 @@ class GrafanaTempoLabelsConfig(BaseModel): class GrafanaTempoConfig(GrafanaConfig): labels: GrafanaTempoLabelsConfig = GrafanaTempoLabelsConfig() + enable_comparative_sample: bool = False + enable_simple_comparison: bool = True class BaseGrafanaTempoToolset(BaseGrafanaToolset): @@ -77,7 +82,7 @@ class GetTempoTraces(Tool): def __init__(self, toolset: BaseGrafanaTempoToolset): super().__init__( name="fetch_tempo_traces", - description="""Lists Tempo traces. At least one of `service_name`, `pod_name` or `deployment_name` argument is required.""", + description="""Lists Tempo traces. At least one of `service_name`, `pod_name` or `deployment_name` argument is required. You should usually call fetch_traces_comparative_sample before calling this tool to first get an overview.""", parameters={ "min_duration": ToolParameter( description="The minimum duration of traces to fetch, e.g., '5s' for 5 seconds.", @@ -121,7 +126,7 @@ def __init__(self, toolset: BaseGrafanaTempoToolset): ), "limit": ToolParameter( description="Maximum number of traces to return. 
Defaults to 50", - type="string", + type="integer", required=False, ), "sort": ToolParameter( @@ -247,7 +252,7 @@ def _invoke(self, params: Dict) -> StructuredToolResult: ) except requests.exceptions.RequestException as e: raise Exception( - f"Failed to retrieve trace by ID after retries: {e} \n for URL: {url}" + f"Failed to retrieve tags after retries: {e} \n for URL: {url}" ) def get_parameterized_one_liner(self, params: Dict) -> str: @@ -291,6 +296,1196 @@ def get_parameterized_one_liner(self, params: Dict) -> str: return f"{toolset_name_for_one_liner(self._toolset.name)}: Fetched Tempo Trace (trace_id={params.get('trace_id')})" +class AnalyzeTracesByAttributes(Tool): + def __init__(self, toolset: BaseGrafanaTempoToolset): + super().__init__( + name="analyze_traces_by_attributes", + description="Analyzes traces grouped by specified span attributes to find patterns in performance or errors.", + parameters={ + "service_name": ToolParameter( + description="Service to analyze traces for", + type="string", + required=False, + ), + "group_by_attributes": ToolParameter( + description="Span attributes to group analysis by (discovered from your traces)", + type="array", + required=True, + ), + "min_duration": ToolParameter( + description="Minimum duration to include (e.g., '100ms', '1s')", + type="string", + required=False, + ), + "start_datetime": ToolParameter( + description="Start time for analysis (RFC3339 or relative)", + type="string", + required=False, + ), + "end_datetime": ToolParameter( + description="End time for analysis (RFC3339 or relative)", + type="string", + required=False, + ), + "limit": ToolParameter( + description="Maximum number of traces to analyze", + type="integer", + required=False, + ), + }, + ) + self._toolset = toolset + + def _invoke(self, params: Dict) -> StructuredToolResult: + try: + # Build query with flexible attributes + group_by = params.get("group_by_attributes", []) + service_name = params.get("service_name") + min_duration = params.get("min_duration", "100ms") + + start, end = process_timestamps_to_int( + params.get("start_datetime"), + params.get("end_datetime"), + default_time_span_seconds=DEFAULT_TIME_SPAN_SECONDS, + ) + + # Build TraceQL query + filters = [] + if service_name: + filters.append(f'resource.service.name="{service_name}"') + filters.append(f"duration>{min_duration}") + + query = " && ".join(filters) + query = f"{{{query}}}" + + base_url = get_base_url(self._toolset.grafana_config) + traces_summary = query_tempo_traces( + base_url=base_url, + api_key=self._toolset.grafana_config.api_key, + headers=self._toolset.grafana_config.headers, + query=query, + start=start, + end=end, + limit=params.get("limit", 50), + ) + + # Group traces by specified attributes + grouped_analysis = {} + traces = traces_summary.get("traces", []) + + # For each trace, fetch full details to get attributes + for trace_summary in traces[ + : params.get("limit", 50) + ]: # Limit to avoid too many API calls + trace_id = trace_summary.get("traceID") + if not trace_id: + continue + + try: + # Fetch raw trace data to get span attributes + url = f"{base_url}/api/traces/{trace_id}" + response = requests.get( + url, + headers=build_headers( + api_key=self._toolset.grafana_config.api_key, + additional_headers=self._toolset.grafana_config.headers, + ), + timeout=5, + ) + response.raise_for_status() + trace_raw = response.json() + + # Extract attributes from all spans in the trace + attr_values = {} + for attr in group_by: + attr_values[attr] = "unknown" + + # Search through 
batches and spans for attributes + for batch in trace_raw.get("batches", []): + # Check resource attributes first (e.g., service.name, k8s.pod.name) + for resource_attr in batch.get("resource", {}).get( + "attributes", [] + ): + attr_key = resource_attr.get("key", "") + if attr_key in group_by: + attr_value = ( + list(resource_attr.get("value", {}).values())[0] + if resource_attr.get("value") + else "unknown" + ) + attr_values[attr_key] = str(attr_value) + + for scope_spans in batch.get("scopeSpans", []): + for span_data in scope_spans.get("spans", []): + # Check span attributes + for span_attr in span_data.get("attributes", []): + attr_key = span_attr.get("key", "") + if attr_key in group_by: + # Extract the value from the attribute + attr_value = ( + list(span_attr.get("value", {}).values())[0] + if span_attr.get("value") + else "unknown" + ) + attr_values[attr_key] = str(attr_value) + + # Build the grouping key from extracted attributes + group_key = ", ".join( + [ + f"{attr}={attr_values.get(attr, 'unknown')}" + for attr in group_by + ] + ) + + if group_key not in grouped_analysis: + grouped_analysis[group_key] = { + "count": 0, + "total_duration_ms": 0, + "avg_duration_ms": 0, + "min_duration_ms": float("inf"), + "max_duration_ms": 0, + } + + duration_ms = trace_summary.get("durationMs", 0) + grouped_analysis[group_key]["count"] += 1 + grouped_analysis[group_key]["total_duration_ms"] += duration_ms + grouped_analysis[group_key]["min_duration_ms"] = min( + grouped_analysis[group_key]["min_duration_ms"], duration_ms + ) + grouped_analysis[group_key]["max_duration_ms"] = max( + grouped_analysis[group_key]["max_duration_ms"], duration_ms + ) + + except Exception: + # If we can't fetch the trace, skip it + continue + + # Calculate averages + for key in grouped_analysis: + if grouped_analysis[key]["count"] > 0: + grouped_analysis[key]["avg_duration_ms"] = round( + grouped_analysis[key]["total_duration_ms"] + / grouped_analysis[key]["count"], + 2, + ) + grouped_analysis[key]["min_duration_ms"] = round( + grouped_analysis[key]["min_duration_ms"], 2 + ) + grouped_analysis[key]["max_duration_ms"] = round( + grouped_analysis[key]["max_duration_ms"], 2 + ) + grouped_analysis[key]["total_duration_ms"] = round( + grouped_analysis[key]["total_duration_ms"], 2 + ) + + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data=yaml.dump(grouped_analysis), + params=params, + ) + + except Exception as e: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Error analyzing traces: {str(e)}", + params=params, + ) + + def get_parameterized_one_liner(self, params: Dict) -> str: + return f"{toolset_name_for_one_liner(self._toolset.name)}: Analyze traces by attributes" + + +class FindSlowOperations(Tool): + def __init__(self, toolset: BaseGrafanaTempoToolset): + super().__init__( + name="find_slow_operations", + description="Identifies slow operations within traces based on span durations and attributes.", + parameters={ + "service_name": ToolParameter( + description="Service to analyze", + type="string", + required=False, + ), + "operation_attribute": ToolParameter( + description="Span attribute that identifies operation type", + type="string", + required=False, + ), + "min_duration": ToolParameter( + description="Minimum duration to consider slow", + type="string", + required=True, + ), + "group_by": ToolParameter( + description="Additional attributes to group by", + type="array", + required=False, + ), + "start_datetime": ToolParameter( + description="Start time for 
search", + type="string", + required=False, + ), + "end_datetime": ToolParameter( + description="End time for search", + type="string", + required=False, + ), + }, + ) + self._toolset = toolset + + def _invoke(self, params: Dict) -> StructuredToolResult: + try: + min_duration = get_param_or_raise(params, "min_duration") + service_name = params.get("service_name") + + start, end = process_timestamps_to_int( + params.get("start_datetime"), + params.get("end_datetime"), + default_time_span_seconds=DEFAULT_TIME_SPAN_SECONDS, + ) + + # Build query for slow operations + filters = [f"duration>{min_duration}"] + if service_name: + filters.append(f'resource.service.name="{service_name}"') + + query = " && ".join(filters) + query = f"{{{query}}}" + + base_url = get_base_url(self._toolset.grafana_config) + traces = query_tempo_traces( + base_url=base_url, + api_key=self._toolset.grafana_config.api_key, + headers=self._toolset.grafana_config.headers, + query=query, + start=start, + end=end, + limit=50, + ) + + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data=format_traces_list(traces), + params=params, + ) + + except Exception as e: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Error finding slow operations: {str(e)}", + params=params, + ) + + def get_parameterized_one_liner(self, params: Dict) -> str: + return f"{toolset_name_for_one_liner(self._toolset.name)}: Find slow operations" + + +class CompareTracePeriods(Tool): + def __init__(self, toolset: BaseGrafanaTempoToolset): + super().__init__( + name="compare_trace_periods", + description="Compares trace patterns between two time periods to identify changes in performance or behavior.", + parameters={ + "service_name": ToolParameter( + description="Service to compare", + type="string", + required=True, + ), + "baseline_start": ToolParameter( + description="Baseline period start time", + type="string", + required=True, + ), + "baseline_end": ToolParameter( + description="Baseline period end time", + type="string", + required=True, + ), + "comparison_start": ToolParameter( + description="Comparison period start time", + type="string", + required=True, + ), + "comparison_end": ToolParameter( + description="Comparison period end time", + type="string", + required=True, + ), + "attributes_to_compare": ToolParameter( + description="Span attributes to compare", + type="array", + required=False, + ), + }, + ) + self._toolset = toolset + + def _invoke(self, params: Dict) -> StructuredToolResult: + try: + service_name = get_param_or_raise(params, "service_name") + + # Get baseline traces + baseline_start, baseline_end = process_timestamps_to_int( + params.get("baseline_start"), + params.get("baseline_end"), + default_time_span_seconds=3600, + ) + + comparison_start, comparison_end = process_timestamps_to_int( + params.get("comparison_start"), + params.get("comparison_end"), + default_time_span_seconds=3600, + ) + + query = f'{{resource.service.name="{service_name}"}}' + base_url = get_base_url(self._toolset.grafana_config) + + # Fetch baseline traces + baseline_traces = query_tempo_traces( + base_url=base_url, + api_key=self._toolset.grafana_config.api_key, + headers=self._toolset.grafana_config.headers, + query=query, + start=baseline_start, + end=baseline_end, + limit=100, + ) + + # Fetch comparison traces + comparison_traces = query_tempo_traces( + base_url=base_url, + api_key=self._toolset.grafana_config.api_key, + headers=self._toolset.grafana_config.headers, + query=query, + start=comparison_start, + 
end=comparison_end, + limit=100, + ) + + # Compare the two sets + comparison_result = { + "baseline_count": len(baseline_traces.get("traces", [])), + "comparison_count": len(comparison_traces.get("traces", [])), + "baseline_period": f"{baseline_start} to {baseline_end}", + "comparison_period": f"{comparison_start} to {comparison_end}", + } + + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data=yaml.dump(comparison_result), + params=params, + ) + + except Exception as e: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Error comparing periods: {str(e)}", + params=params, + ) + + def get_parameterized_one_liner(self, params: Dict) -> str: + return ( + f"{toolset_name_for_one_liner(self._toolset.name)}: Compare trace periods" + ) + + +class ListServices(Tool): + def __init__(self, toolset: BaseGrafanaTempoToolset): + super().__init__( + name="list_services", + description="Lists all services that have traces in Tempo, optionally filtered by namespace", + parameters={ + "namespace": ToolParameter( + description="Filter services by Kubernetes namespace", + type="string", + required=False, + ), + "start_datetime": ToolParameter( + description="Start time for search (RFC3339 or relative)", + type="string", + required=False, + ), + "end_datetime": ToolParameter( + description="End time for search (RFC3339 or relative)", + type="string", + required=False, + ), + }, + ) + self._toolset = toolset + + def _invoke(self, params: Dict) -> StructuredToolResult: + try: + start, end = process_timestamps_to_int( + params.get("start_datetime"), + params.get("end_datetime"), + default_time_span_seconds=DEFAULT_TIME_SPAN_SECONDS, + ) + + base_url = get_base_url(self._toolset.grafana_config) + + # Get all service names + services_url = f"{base_url}/api/v2/search/tag/service.name/values?start={start}&end={end}" + + response = requests.get( + services_url, + headers=build_headers( + api_key=self._toolset.grafana_config.api_key, + additional_headers=self._toolset.grafana_config.headers, + ), + timeout=10, + ) + response.raise_for_status() + services_data = response.json() + services = services_data.get("tagValues", []) + + # If namespace filter provided, get traces for each service and filter + if params.get("namespace"): + namespace = params["namespace"] + filtered_services = [] + + for service in services: + # Check if this service has traces in the specified namespace + query = f'{{resource.service.name="{service}" && resource.k8s.namespace.name="{namespace}"}}' + traces = query_tempo_traces( + base_url=base_url, + api_key=self._toolset.grafana_config.api_key, + headers=self._toolset.grafana_config.headers, + query=query, + start=start, + end=end, + limit=1, # Just check if any exist + ) + + if traces.get("traces"): + filtered_services.append(service) + + services = filtered_services + + # Get basic stats for each service + service_stats = [] + for service in services: + query = f'{{resource.service.name="{service}"}}' + if params.get("namespace"): + query = f'{{resource.service.name="{service}" && resource.k8s.namespace.name="{params["namespace"]}"}}' + + # Get a sample of traces for basic stats + traces = query_tempo_traces( + base_url=base_url, + api_key=self._toolset.grafana_config.api_key, + headers=self._toolset.grafana_config.headers, + query=query, + start=start, + end=end, + limit=100, + ) + + trace_list = traces.get("traces", []) + if trace_list: + durations = [ + t.get("durationMs", 0) + for t in trace_list + if t.get("durationMs", 0) > 0 + ] + if 
durations: + service_stats.append( + { + "service_name": service, + "trace_count_sample": len(durations), + "avg_duration_ms": round( + sum(durations) / len(durations), 2 + ), + "min_duration_ms": round(min(durations), 2), + "max_duration_ms": round(max(durations), 2), + } + ) + + # Sort by average duration (slowest first) + service_stats.sort(key=lambda x: x["avg_duration_ms"], reverse=True) + + result = { + "total_services": len(services), + "services": service_stats if service_stats else services, + } + + if params.get("namespace"): + result["namespace_filter"] = params["namespace"] + + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data=yaml.dump(result, default_flow_style=False, sort_keys=False), + params=params, + ) + + except Exception as e: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Error listing services: {str(e)}", + params=params, + ) + + def get_parameterized_one_liner(self, params: Dict) -> str: + return f"{toolset_name_for_one_liner(self._toolset.name)}: List services" + + +class FetchTracesComparativeSample(Tool): + def __init__(self, toolset: BaseGrafanaTempoToolset): + super().__init__( + name="fetch_traces_comparative_sample", + description="""Fetches statistics and representative samples of fast, slow, and typical traces for performance analysis. + +Important: call this tool first when investigating performance issues via traces. This tool provides comprehensive analysis for identifying patterns. + +Examples: +- For service latency: service_name="payment" (matches "payment-service" too) +- For namespace issues: namespace="production" +- Combined: service_name="auth", namespace="staging\"""", + parameters={ + "service_name": ToolParameter( + description="Service to analyze (partial match supported, e.g., 'payment' matches 'payment-service')", + type="string", + required=False, + ), + "namespace": ToolParameter( + description="Kubernetes namespace to filter traces (e.g., 'production', 'staging')", + type="string", + required=False, + ), + "base_query": ToolParameter( + description="Custom TraceQL filter. If not provided, service_name and/or namespace will be used", + type="string", + required=False, + ), + "sample_size": ToolParameter( + description="Number of traces to fetch from each category (fast/slow/typical). 
Default 5", + type="integer", + required=False, + ), + "start_datetime": ToolParameter( + description="Start time for analysis (RFC3339 or relative)", + type="string", + required=False, + ), + "end_datetime": ToolParameter( + description="End time for analysis (RFC3339 or relative)", + type="string", + required=False, + ), + }, + ) + self._toolset = toolset + + def _invoke(self, params: Dict) -> StructuredToolResult: + try: + # Build base query from parameters + if params.get("base_query"): + base_query = params["base_query"] + else: + filters = [] + + # Add service filter (with smart matching) + if params.get("service_name"): + service = params["service_name"] + filters.append(f'resource.service.name=~".*{service}.*"') + + # Add namespace filter + if params.get("namespace"): + namespace = params["namespace"] + filters.append(f'resource.k8s.namespace.name="{namespace}"') + + if not filters: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error="Either base_query, service_name, or namespace is required", + params=params, + ) + + base_query = " && ".join(filters) + + sample_size = params.get("sample_size", 5) + + start, end = process_timestamps_to_int( + params.get("start_datetime"), + params.get("end_datetime"), + default_time_span_seconds=DEFAULT_TIME_SPAN_SECONDS, + ) + + base_url = get_base_url(self._toolset.grafana_config) + + # Step 1: Get overall trace statistics + stats_query = f"{{{base_query}}}" + all_traces_summary = query_tempo_traces( + base_url=base_url, + api_key=self._toolset.grafana_config.api_key, + headers=self._toolset.grafana_config.headers, + query=stats_query, + start=start, + end=end, + limit=1000, # Get enough for good statistics + ) + + traces = all_traces_summary.get("traces", []) + if len(traces) == 0: + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data="No traces found matching the query", + params=params, + ) + + # Calculate statistics + durations = [ + t.get("durationMs", 0) for t in traces if t.get("durationMs", 0) > 0 + ] + durations.sort() + + if len(durations) == 0: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error="No traces with valid duration found", + params=params, + ) + + stats = { + "total_traces_analyzed": len(durations), + "avg_duration_ms": round(sum(durations) / len(durations), 2), + "min_duration_ms": round(durations[0], 2), + "max_duration_ms": round(durations[-1], 2), + "p50_duration_ms": round(durations[len(durations) // 2], 2), + "p90_duration_ms": round( + durations[min(int(len(durations) * 0.9), len(durations) - 1)], 2 + ), + "p99_duration_ms": round( + durations[min(int(len(durations) * 0.99), len(durations) - 1)], 2 + ), + } + + # Step 2: Get slowest traces (sorted by duration descending) + slow_traces_data = [] + # Sort traces by duration descending and take top N + sorted_slow = sorted( + traces, key=lambda x: x.get("durationMs", 0), reverse=True + )[:sample_size] + + for trace_summary in sorted_slow: + trace_id = trace_summary.get("traceID") + if not trace_id: + continue + + # Fetch full trace details + try: + url = f"{base_url}/api/traces/{trace_id}" + response = requests.get( + url, + headers=build_headers( + api_key=self._toolset.grafana_config.api_key, + additional_headers=self._toolset.grafana_config.headers, + ), + timeout=5, + ) + response.raise_for_status() + trace_raw = response.json() + + # Extract key attributes from the trace + trace_attributes = self._extract_trace_attributes(trace_raw) + + # Build span hierarchy for analysis + root_spans = 
build_span_hierarchy(trace_raw) + slowest_spans = self._find_slowest_spans(root_spans, 3) + + slow_traces_data.append( + { + "traceID": trace_id, + "durationMs": round(trace_summary.get("durationMs", 0), 2), + "rootServiceName": trace_summary.get( + "rootServiceName", "unknown" + ), + "attributes": trace_attributes, + "slowestSpans": slowest_spans, + "spanCount": self._count_spans(root_spans), + } + ) + except Exception: + continue + + # Step 3: Get fastest traces (but not trivially fast) + fast_traces_data = [] + # Filter out very fast traces (likely health checks) + min_duration_threshold = ( + stats["p50_duration_ms"] * 0.1 + ) # At least 10% of median + meaningful_fast = [ + t for t in traces if t.get("durationMs", 0) >= min_duration_threshold + ] + sorted_fast = sorted(meaningful_fast, key=lambda x: x.get("durationMs", 0))[ + :sample_size + ] + + for trace_summary in sorted_fast: + trace_id = trace_summary.get("traceID") + if not trace_id: + continue + + try: + url = f"{base_url}/api/traces/{trace_id}" + response = requests.get( + url, + headers=build_headers( + api_key=self._toolset.grafana_config.api_key, + additional_headers=self._toolset.grafana_config.headers, + ), + timeout=5, + ) + response.raise_for_status() + trace_raw = response.json() + + trace_attributes = self._extract_trace_attributes(trace_raw) + root_spans = build_span_hierarchy(trace_raw) + + fast_traces_data.append( + { + "traceID": trace_id, + "durationMs": round(trace_summary.get("durationMs", 0), 2), + "rootServiceName": trace_summary.get( + "rootServiceName", "unknown" + ), + "attributes": trace_attributes, + "spanCount": self._count_spans(root_spans), + } + ) + except Exception: + continue + + # Step 4: Get typical traces (around median) + typical_traces_data = [] + median = stats["p50_duration_ms"] + # Find traces within 20% of median + typical_traces = [ + t + for t in traces + if median * 0.8 <= t.get("durationMs", 0) <= median * 1.2 + ][:sample_size] + + for trace_summary in typical_traces: + trace_id = trace_summary.get("traceID") + if not trace_id: + continue + + try: + url = f"{base_url}/api/traces/{trace_id}" + response = requests.get( + url, + headers=build_headers( + api_key=self._toolset.grafana_config.api_key, + additional_headers=self._toolset.grafana_config.headers, + ), + timeout=5, + ) + response.raise_for_status() + trace_raw = response.json() + + trace_attributes = self._extract_trace_attributes(trace_raw) + root_spans = build_span_hierarchy(trace_raw) + + typical_traces_data.append( + { + "traceID": trace_id, + "durationMs": round(trace_summary.get("durationMs", 0), 2), + "rootServiceName": trace_summary.get( + "rootServiceName", "unknown" + ), + "attributes": trace_attributes, + "spanCount": self._count_spans(root_spans), + } + ) + except Exception: + continue + + # Step 5: Analyze patterns + analysis_insights = self._generate_insights( + slow_traces_data, fast_traces_data, typical_traces_data + ) + + # Format output + result = { + "statistics": stats, + "slow_traces": slow_traces_data, + "fast_traces": fast_traces_data, + "typical_traces": typical_traces_data, + "pattern_analysis": analysis_insights, + } + + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data=yaml.dump(result, default_flow_style=False, sort_keys=False), + params=params, + ) + + except Exception as e: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Error analyzing traces: {str(e)}", + params=params, + ) + + def _extract_trace_attributes(self, trace_raw: Dict) -> Dict[str, Any]: + 
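+        # Assumption about the wire format: OTLP JSON encodes each attribute value as a
+        # single-key mapping such as {"stringValue": ...} or {"intValue": ...}; the
+        # extraction below simply takes that first value as-is, whatever its concrete type.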
"""Extract ALL attributes from trace for analysis""" + attributes = {} + + for batch in trace_raw.get("batches", []): + # Extract all resource attributes + for attr in batch.get("resource", {}).get("attributes", []): + key = attr.get("key", "") + if key: + value = ( + list(attr.get("value", {}).values())[0] + if attr.get("value") + else None + ) + if value is not None: + attributes[key] = value + + # Extract all span attributes (from first span of each type we haven't seen) + for scope_spans in batch.get("scopeSpans", []): + for span_data in scope_spans.get("spans", []): + for attr in span_data.get("attributes", []): + key = attr.get("key", "") + if ( + key and key not in attributes + ): # Only add if we haven't seen this key yet + value = ( + list(attr.get("value", {}).values())[0] + if attr.get("value") + else None + ) + if value is not None: + attributes[key] = value + + return attributes + + def _find_slowest_spans(self, root_spans, limit=3): + """Find the slowest spans in the trace""" + all_spans = [] + + def collect_spans(span): + all_spans.append(span) + for child in span.children: + collect_spans(child) + + for root in root_spans: + collect_spans(root) + + # Sort by duration and get top N + sorted_spans = sorted(all_spans, key=lambda s: s.duration_ms, reverse=True)[ + :limit + ] + + result = [] + for span in sorted_spans: + span_info = { + "operation": span.name, + "serviceName": span.service_name, + "durationMs": round(span.duration_ms, 2), + } + + # Add relevant attributes + if span.attributes.get("db.statement"): + span_info["dbStatement"] = span.attributes["db.statement"][:100] + "..." + if span.attributes.get("http.route"): + span_info["httpRoute"] = span.attributes["http.route"] + + result.append(span_info) + + return result + + def _count_spans(self, root_spans): + """Count total number of spans in trace""" + count = 0 + + def count_recursive(span): + nonlocal count + count += 1 + for child in span.children: + count_recursive(child) + + for root in root_spans: + count_recursive(root) + + return count + + def _generate_insights(self, slow_traces, fast_traces, typical_traces): + """Generate insights by comparing trace groups""" + insights = { + "common_patterns_in_slow_traces": [], + "common_patterns_in_fast_traces": [], + "key_differences": [], + } + + # Analyze attribute patterns + slow_attrs = {} + fast_attrs = {} + + # Collect attribute frequencies in slow traces + for trace in slow_traces: + for key, value in trace.get("attributes", {}).items(): + if key not in slow_attrs: + slow_attrs[key] = {} + slow_attrs[key][str(value)] = slow_attrs[key].get(str(value), 0) + 1 + + # Collect attribute frequencies in fast traces + for trace in fast_traces: + for key, value in trace.get("attributes", {}).items(): + if key not in fast_attrs: + fast_attrs[key] = {} + fast_attrs[key][str(value)] = fast_attrs[key].get(str(value), 0) + 1 + + # Find patterns unique to slow traces + for key, values in slow_attrs.items(): + if len(slow_traces) > 0: + for value, count in values.items(): + ratio = count / len(slow_traces) + if ratio >= 0.8: # Present in 80%+ of slow traces + fast_count = fast_attrs.get(key, {}).get(value, 0) + fast_ratio = ( + fast_count / len(fast_traces) if len(fast_traces) > 0 else 0 + ) + if fast_ratio < 0.2: # But in less than 20% of fast traces + insights["common_patterns_in_slow_traces"].append( + f"{key}={value} appears in {int(ratio*100)}% of slow traces but only {int(fast_ratio*100)}% of fast traces" + ) + + # Check span count differences + if slow_traces and 
fast_traces: + avg_slow_spans = sum(t.get("spanCount", 0) for t in slow_traces) / len( + slow_traces + ) + avg_fast_spans = sum(t.get("spanCount", 0) for t in fast_traces) / len( + fast_traces + ) + if avg_slow_spans > avg_fast_spans * 1.5: + insights["key_differences"].append( + f"Slow traces have {avg_slow_spans:.1f} spans on average vs {avg_fast_spans:.1f} for fast traces" + ) + + # Check for missing attributes + slow_keys = set() + for trace in slow_traces: + slow_keys.update(trace.get("attributes", {}).keys()) + + fast_keys = set() + for trace in fast_traces: + fast_keys.update(trace.get("attributes", {}).keys()) + + only_in_slow = slow_keys - fast_keys + if only_in_slow: + insights["key_differences"].append( + f"Attributes only in slow traces: {', '.join(only_in_slow)}" + ) + + return insights + + def get_parameterized_one_liner(self, params: Dict) -> str: + return f"{toolset_name_for_one_liner(self._toolset.name)}: Comparative trace analysis" + + +class FetchTracesSimpleComparison(Tool): + def __init__(self, toolset: BaseGrafanaTempoToolset): + super().__init__( + name="fetch_traces_comparative_sample", + description="""Fetches statistics and representative samples of fast, slow, and typical traces for performance analysis. + +Important: call this tool first when investigating performance issues via traces. This tool provides comprehensive analysis for identifying patterns. + +Examples: +- For service latency: service_name="payment" (matches "payment-service" too) +- For namespace issues: namespace="production" +- Combined: service_name="auth", namespace="staging\"""", + parameters={ + "service_name": ToolParameter( + description="Service to analyze (partial match supported)", + type="string", + required=False, + ), + "namespace": ToolParameter( + description="Kubernetes namespace to filter traces", + type="string", + required=False, + ), + "base_query": ToolParameter( + description="Custom TraceQL filter", + type="string", + required=False, + ), + "sample_count": ToolParameter( + description="Number of traces to fetch from each category (fastest/slowest). 
Default 3", + type="integer", + required=False, + ), + "start_datetime": ToolParameter( + description="Start time for analysis (RFC3339 or relative)", + type="string", + required=False, + ), + "end_datetime": ToolParameter( + description="End time for analysis (RFC3339 or relative)", + type="string", + required=False, + ), + }, + ) + self._toolset = toolset + + def _invoke(self, params: Dict) -> StructuredToolResult: + try: + # Build query (same as before) + if params.get("base_query"): + base_query = params["base_query"] + else: + filters = [] + if params.get("service_name"): + service = params["service_name"] + filters.append(f'resource.service.name=~".*{service}.*"') + if params.get("namespace"): + namespace = params["namespace"] + filters.append(f'resource.k8s.namespace.name="{namespace}"') + + if not filters: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error="Either base_query, service_name, or namespace is required", + params=params, + ) + base_query = " && ".join(filters) + + sample_count = params.get("sample_count", 3) + + start, end = process_timestamps_to_int( + params.get("start_datetime"), + params.get("end_datetime"), + default_time_span_seconds=DEFAULT_TIME_SPAN_SECONDS, + ) + + base_url = get_base_url(self._toolset.grafana_config) + + # Step 1: Get all trace summaries + stats_query = f"{{{base_query}}}" + all_traces_response = query_tempo_traces( + base_url=base_url, + api_key=self._toolset.grafana_config.api_key, + headers=self._toolset.grafana_config.headers, + query=stats_query, + start=start, + end=end, + limit=1000, + ) + + traces = all_traces_response.get("traces", []) + if not traces: + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data="No traces found matching the query", + params=params, + ) + + # Step 2: Sort traces by duration + sorted_traces = sorted(traces, key=lambda x: x.get("durationMs", 0)) + + # Step 3: Calculate basic statistics + durations = [t.get("durationMs", 0) for t in sorted_traces] + stats = { + "trace_count": len(durations), + "min_ms": durations[0], + "p25_ms": durations[len(durations) // 4] + if len(durations) >= 4 + else durations[0], + "p50_ms": durations[len(durations) // 2], + "p75_ms": durations[3 * len(durations) // 4] + if len(durations) >= 4 + else durations[-1], + "p90_ms": durations[int(len(durations) * 0.9)] + if len(durations) >= 10 + else durations[-1], + "p99_ms": durations[int(len(durations) * 0.99)] + if len(durations) >= 100 + else durations[-1], + "max_ms": durations[-1], + } + + # Step 4: Select representative traces to fetch + fastest_indices = list(range(min(sample_count, len(sorted_traces)))) + slowest_indices = list( + range(max(0, len(sorted_traces) - sample_count), len(sorted_traces)) + ) + + # Add median trace + median_idx = len(sorted_traces) // 2 + + # Step 5: Fetch full trace details + def fetch_full_trace(trace_summary): + trace_id = trace_summary.get("traceID") + if not trace_id: + return None + + try: + url = f"{base_url}/api/traces/{trace_id}" + response = requests.get( + url, + headers=build_headers( + api_key=self._toolset.grafana_config.api_key, + additional_headers=self._toolset.grafana_config.headers, + ), + timeout=5, + ) + response.raise_for_status() + return { + "traceID": trace_id, + "durationMs": trace_summary.get("durationMs", 0), + "rootServiceName": trace_summary.get( + "rootServiceName", "unknown" + ), + "traceData": response.json(), # Raw trace data + } + except Exception: + return { + "traceID": trace_id, + "durationMs": trace_summary.get("durationMs", 
0), + "error": "Failed to fetch full trace", + } + + # Fetch the selected traces + result = { + "statistics": stats, + "all_trace_durations_ms": durations, # All durations for distribution analysis + "fastest_traces": [ + fetch_full_trace(sorted_traces[i]) for i in fastest_indices + ], + "median_trace": fetch_full_trace(sorted_traces[median_idx]), + "slowest_traces": [ + fetch_full_trace(sorted_traces[i]) for i in slowest_indices + ], + } + + # Return as YAML for readability + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data=yaml.dump(result, default_flow_style=False, sort_keys=False), + params=params, + ) + + except Exception as e: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Error fetching traces: {str(e)}", + params=params, + ) + + def get_parameterized_one_liner(self, params: Dict) -> str: + return ( + f"{toolset_name_for_one_liner(self._toolset.name)}: Simple trace comparison" + ) + + class GrafanaTempoToolset(BaseGrafanaTempoToolset): def __init__(self): super().__init__( @@ -298,9 +1493,35 @@ def __init__(self): description="Fetches kubernetes traces from Tempo", icon_url="https://grafana.com/static/assets/img/blog/tempo.png", docs_url="https://docs.robusta.dev/master/configuration/holmesgpt/toolsets/grafanatempo.html", - tools=[GetTempoTraces(self), GetTempoTraceById(self), GetTempoTags(self)], + tools=[], # Will be populated in prerequisites_callable ) template_file_path = os.path.abspath( os.path.join(os.path.dirname(__file__), "toolset_grafana_tempo.jinja2") ) self._load_llm_instructions(jinja_template=f"file://{template_file_path}") + + def prerequisites_callable(self, config: dict[str, Any]) -> tuple[bool, str]: + # Call parent to validate config + success, msg = super().prerequisites_callable(config) + if not success: + return success, msg + + # Build tools list based on config + tools = [ + ListServices(self), + GetTempoTraces(self), + GetTempoTraceById(self), + GetTempoTags(self), + ] + + # Add comparison tools conditionally + if self.grafana_config.enable_comparative_sample: + tools.append(FetchTracesComparativeSample(self)) + + if self.grafana_config.enable_simple_comparison: + tools.append(FetchTracesSimpleComparison(self)) + + # Update the tools list + self.tools = tools + + return True, "" diff --git a/holmes/plugins/toolsets/grafana/trace_parser.py b/holmes/plugins/toolsets/grafana/trace_parser.py index 1910c090b..2913be6e4 100644 --- a/holmes/plugins/toolsets/grafana/trace_parser.py +++ b/holmes/plugins/toolsets/grafana/trace_parser.py @@ -187,7 +187,7 @@ def format_traces_list(trace_data: Dict) -> str: else "\n" ) trace_str += f"\tstartTime={unix_nano_to_rfc3339(int(trace.get('startTimeUnixNano')))}" - trace_str += f" rootServiceName={trace.get('trootServiceName')}" + trace_str += f" rootServiceName={trace.get('rootServiceName')}" trace_str += f" rootTraceName={trace.get('rootTraceName')}" traces_str.append(trace_str) return "\n".join(traces_str) diff --git a/holmes/plugins/toolsets/kubernetes.yaml b/holmes/plugins/toolsets/kubernetes.yaml index 81e61412b..eabaf3c35 100644 --- a/holmes/plugins/toolsets/kubernetes.yaml +++ b/holmes/plugins/toolsets/kubernetes.yaml @@ -16,34 +16,70 @@ toolsets: for example when a user asks - 'describe pod xyz-123' - 'show service xyz-123 in namespace my-ns' - command: "kubectl describe {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %}" + You are NEVER EVER allowed to get kubernetes secrets! 
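+      # The check below is a literal, shell-level match on the rendered {{ kind }} value:
+      # when the kind is "secret" or "secrets" the tool prints a refusal and exits 1
+      # before kubectl is invoked; any other kind falls through to the describe command.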
+ command: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kubectl describe {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %} - name: "kubectl_get_by_name" - description: "Run `kubectl get --show-labels`" - command: "kubectl get --show-labels -o wide {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %}" + description: "Run `kubectl get --show-labels`. You are NEVER EVER allowed to get kubernetes secrets!" + command: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kubectl get --show-labels -o wide {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %} - name: "kubectl_get_by_kind_in_namespace" - description: "Run `kubectl get -n --show-labels` to get all resources of a given type in namespace" - command: "kubectl get --show-labels -o wide {{ kind }} -n {{namespace}}" + description: "Run `kubectl get -n --show-labels` to get all resources of a given type in namespace. You are NEVER EVER allowed to get kubernetes secrets!" + command: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kubectl get --show-labels -o wide {{ kind }} -n {{namespace}} - name: "kubectl_get_by_kind_in_cluster" - description: "Run `kubectl get -A --show-labels` to get all resources of a given type in the cluster" - command: "kubectl get -A --show-labels -o wide {{ kind }}" + description: "Run `kubectl get -A --show-labels` to get all resources of a given type in the cluster. You are NEVER EVER allowed to get kubernetes secrets!" + command: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kubectl get -A --show-labels -o wide {{ kind }} - name: "kubectl_find_resource" - description: "Run `kubectl get {{ kind }} -A --show-labels | grep {{ keyword }}` to find a resource where you know a substring of the name, IP, namespace, or labels" - command: "kubectl get -A --show-labels -o wide {{ kind }} | grep {{ keyword }}" + description: "Run `kubectl get {{ kind }} -A --show-labels | grep {{ keyword }}` to find a resource where you know a substring of the name, IP, namespace, or labels. You are NEVER EVER allowed to get kubernetes secrets!" + command: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kubectl get -A --show-labels -o wide {{ kind }} | grep {{ keyword }} - name: "kubectl_get_yaml" - description: "Run `kubectl get -o yaml` on a single Kubernetes resource" - command: "kubectl get -o yaml {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %}" + description: "Run `kubectl get -o yaml` on a single Kubernetes resource. You are NEVER EVER allowed to get kubernetes secrets!" + command: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kubectl get -o yaml {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %} - name: "kubectl_events" - description: "Retrieve the events for a specific Kubernetes resource. `resource_type` can be any kubernetes resource type: 'pod', 'service', 'deployment', 'job', 'node', etc." 
- command: "kubectl events --for {{resource_type}}/{{ resource_name }}{% if namespace %} -n {{ namespace }}{% endif %}" + description: "Retrieve the events for a specific Kubernetes resource. `resource_type` can be any kubernetes resource type: 'pod', 'service', 'deployment', 'job', 'node', etc. You are NEVER EVER allowed to get kubernetes secrets!" + command: | + if [ "{{resource_type}}" = "secret" ] || [ "{{resource_type}}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kubectl events --for {{resource_type}}/{{ resource_name }}{% if namespace %} -n {{ namespace }}{% endif %} - name: "kubectl_memory_requests_all_namespaces" - description: "Fetch and display memory requests for all pods across all namespaces in MiB, summing requests across multiple containers where applicable and handling binary, decimal, and millibyte units correctly." + description: "Fetch and display memory requests for all pods across all namespaces in MiB, summing requests across multiple containers where applicable and handling binary, decimal, and millibyte units correctly. You are NEVER EVER allowed to get kubernetes secrets!" command: | kubectl get pods --all-namespaces -o custom-columns="NAMESPACE:.metadata.namespace,NAME:.metadata.name,MEMORY_REQUEST:.spec.containers[*].resources.requests.memory" --no-headers | \ awk ' @@ -85,7 +121,7 @@ toolsets: }' | sort -k3 -nr - name: "kubectl_memory_requests_namespace" - description: "Fetch and display memory requests for all pods in a specified namespace in MiB, summing requests across multiple containers where applicable and handling binary, decimal, and millibyte units correctly." + description: "Fetch and display memory requests for all pods in a specified namespace in MiB, summing requests across multiple containers where applicable and handling binary, decimal, and millibyte units correctly. You are NEVER EVER allowed to get kubernetes secrets!" command: | kubectl get pods -n {{ namespace }} -o custom-columns="NAMESPACE:.metadata.namespace,NAME:.metadata.name,MEMORY_REQUEST:.spec.containers[*].resources.requests.memory" --no-headers | \ awk ' @@ -129,8 +165,13 @@ toolsets: - name: "kubernetes_jq_query" user_description: "Query Kubernetes Resources: kubectl get {{kind}} --all-namespaces -o json | jq -r {{jq_expr}}" description: > - Use kubectl to get json for all resources of a specific kind pipe the results to jq to filter them. Do not worry about escaping the jq_expr it will be done by the system on an unescaped expression that you give. e.g. give an expression like .items[] | .spec.containers[].image | select(test("^gcr.io/") | not) - command: kubectl get {{ kind }} --all-namespaces -o json | jq -r {{ jq_expr }} + Use kubectl to get json for all resources of a specific kind pipe the results to jq to filter them. Do not worry about escaping the jq_expr it will be done by the system on an unescaped expression that you give. e.g. give an expression like .items[] | .spec.containers[].image | select(test("^gcr.io/") | not). You are NEVER EVER allowed to get kubernetes secrets! + command: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kubectl get {{ kind }} --all-namespaces -o json | jq -r {{ jq_expr }} - name: "kubernetes_count" user_description: "Count Kubernetes Resources: kubectl get {{kind}} --all-namespaces -o json | jq -c -r {{ jq_expr }}" @@ -140,7 +181,13 @@ toolsets: Use select() to filter objects before extracting properties, e.g. 
.items[] | select(.metadata.namespace == "test-1") | .metadata.name Do not worry about escaping the jq_expr it will be done by the system on an unescaped expression that you give. e.g. give an expression like .items[] | select(.spec.containers[].image | test("^gcr.io/") | not) | .metadata.name + You are NEVER EVER allowed to get kubernetes secrets! script: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + echo "Command executed: kubectl get {{ kind }} --all-namespaces -o json | jq -c -r {{ jq_expr }}" echo "---" @@ -239,10 +286,20 @@ toolsets: tools: - name: "kubectl_lineage_children" description: "Get all children/dependents of a Kubernetes resource, recursively, including their status" - command: "kubectl lineage {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %}" + command: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kubectl lineage {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %} - name: "kubectl_lineage_parents" description: "Get all parents/dependencies of a Kubernetes resource, recursively, including their status" - command: "kubectl lineage {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %} -D" + command: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kubectl lineage {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %} -D kubernetes/kube-lineage-extras: # To make this work, build kube-lineage from source description: "Fetches children/dependents and parents/dependencies resources using kube-lineage" @@ -255,7 +312,17 @@ toolsets: tools: - name: "kubectl_lineage_children" description: "Get all children/dependents of a Kubernetes resource, recursively, including their status" - command: "kube-lineage {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %}" + command: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kube-lineage {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %} - name: "kubectl_lineage_parents" description: "Get all parents/dependencies of a Kubernetes resource, recursively, including their status" - command: "kube-lineage {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %} -D" + command: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kube-lineage {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %} -D diff --git a/holmes/plugins/toolsets/prometheus/prometheus.py b/holmes/plugins/toolsets/prometheus/prometheus.py index ace6d3faf..90a7bfc5f 100644 --- a/holmes/plugins/toolsets/prometheus/prometheus.py +++ b/holmes/plugins/toolsets/prometheus/prometheus.py @@ -816,6 +816,399 @@ def get_parameterized_one_liner(self, params) -> str: return f"{toolset_name_for_one_liner(self.toolset.name)}: Query ({description})" +class AnalyzeMetricByDimensions(BasePrometheusTool): + def __init__(self, toolset: "PrometheusToolset"): + super().__init__( + name="analyze_metric_by_dimensions", + description="Analyzes any metric broken down by its available label dimensions. 
Automatically discovers available labels from the metric.", + parameters={ + "metric_name": ToolParameter( + description="The metric name to analyze", + type="string", + required=True, + ), + "group_by": ToolParameter( + description="Labels to group by (will be validated against available labels)", + type="array", + required=False, + ), + "filters": ToolParameter( + description="Label filters to apply as key-value pairs", + type="object", + required=False, + ), + "percentiles": ToolParameter( + description="For histogram/summary metrics - percentiles to calculate", + type="array", + required=False, + ), + "time_range": ToolParameter( + description="Time range for analysis (e.g., '5m', '1h', '24h')", + type="string", + required=False, + ), + "aggregation": ToolParameter( + description="Aggregation method (avg, sum, max, min, p50, p95, p99)", + type="string", + required=False, + ), + }, + toolset=toolset, + ) + + def _invoke(self, params: Any) -> StructuredToolResult: + if not self.toolset.config or not self.toolset.config.prometheus_url: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error="Prometheus is not configured. Prometheus URL is missing", + params=params, + ) + + try: + metric_name = get_param_or_raise(params, "metric_name") + group_by = params.get("group_by", []) + filters = params.get("filters", {}) + time_range = params.get("time_range", "1h") + aggregation = params.get("aggregation", "avg") + + # Build the base query with filters + filter_str = "" + if filters: + filter_items = [f'{k}="{v}"' for k, v in filters.items()] + filter_str = "{" + ",".join(filter_items) + "}" + + # Build the query based on aggregation type + if aggregation in ["p50", "p95", "p99"]: + percentile = float(aggregation[1:]) / 100 + query = f"histogram_quantile({percentile}, sum(rate({metric_name}_bucket{filter_str}[{time_range}])) by (le" + if group_by: + query += f", {', '.join(group_by)}" + query += "))" + elif group_by: + query = f'{aggregation}(rate({metric_name}{filter_str}[{time_range}])) by ({", ".join(group_by)})' + else: + query = f"{aggregation}(rate({metric_name}{filter_str}[{time_range}]))" + + url = urljoin(self.toolset.config.prometheus_url, "api/v1/query") + payload = {"query": query} + + response = requests.post( + url=url, + headers=self.toolset.config.headers, + auth=self.toolset.config.get_auth(), + data=payload, + timeout=60, + verify=self.toolset.config.prometheus_ssl_enabled, + ) + + if response.status_code == 200: + data = response.json() + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data=json.dumps(data.get("data"), indent=2), + params=params, + ) + else: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Query failed with status {response.status_code}: {response.text}", + params=params, + ) + + except Exception as e: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Error analyzing metric: {str(e)}", + params=params, + ) + + def get_parameterized_one_liner(self, params) -> str: + metric_name = params.get("metric_name", "") + return f"{toolset_name_for_one_liner(self.toolset.name)}: Analyze {metric_name} by dimensions" + + +class FindTopMetricValues(BasePrometheusTool): + def __init__(self, toolset: "PrometheusToolset"): + super().__init__( + name="find_top_metric_values", + description="Finds the highest values for any metric, grouped by labels. 
Useful for identifying outliers or slowest operations.", + parameters={ + "metric_name": ToolParameter( + description="The metric to analyze", + type="string", + required=True, + ), + "group_by_label": ToolParameter( + description="Label to group results by", + type="string", + required=True, + ), + "top_n": ToolParameter( + description="Number of top entries to return", + type="integer", + required=False, + ), + "percentile": ToolParameter( + description="For histogram/summary metrics - percentile to use (e.g., 0.95)", + type="number", + required=False, + ), + "time_range": ToolParameter( + description="Time range for analysis", + type="string", + required=False, + ), + }, + toolset=toolset, + ) + + def _invoke(self, params: Any) -> StructuredToolResult: + if not self.toolset.config or not self.toolset.config.prometheus_url: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error="Prometheus is not configured. Prometheus URL is missing", + params=params, + ) + + try: + metric_name = get_param_or_raise(params, "metric_name") + group_by_label = get_param_or_raise(params, "group_by_label") + top_n = params.get("top_n", 10) + percentile = params.get("percentile", 0.95) + time_range = params.get("time_range", "1h") + + # Check if it's a histogram metric + if "_bucket" in metric_name or percentile: + query = f"topk({top_n}, histogram_quantile({percentile}, sum(rate({metric_name}_bucket[{time_range}])) by (le, {group_by_label})))" + else: + query = f"topk({top_n}, avg(rate({metric_name}[{time_range}])) by ({group_by_label}))" + + url = urljoin(self.toolset.config.prometheus_url, "api/v1/query") + payload = {"query": query} + + response = requests.post( + url=url, + headers=self.toolset.config.headers, + auth=self.toolset.config.get_auth(), + data=payload, + timeout=60, + verify=self.toolset.config.prometheus_ssl_enabled, + ) + + if response.status_code == 200: + data = response.json() + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data=json.dumps(data.get("data"), indent=2), + params=params, + ) + else: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Query failed with status {response.status_code}: {response.text}", + params=params, + ) + + except Exception as e: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Error finding top values: {str(e)}", + params=params, + ) + + def get_parameterized_one_liner(self, params) -> str: + metric_name = params.get("metric_name", "") + return f"{toolset_name_for_one_liner(self.toolset.name)}: Find top values for {metric_name}" + + +class CompareMetricPeriods(BasePrometheusTool): + def __init__(self, toolset: "PrometheusToolset"): + super().__init__( + name="compare_metric_periods", + description="Compares a metric between two time periods to identify changes or degradations.", + parameters={ + "metric_name": ToolParameter( + description="The metric to compare", + type="string", + required=True, + ), + "current_period": ToolParameter( + description="Current time period (e.g., '1h')", + type="string", + required=False, + ), + "comparison_offset": ToolParameter( + description="How far back to compare (e.g., '24h' for yesterday)", + type="string", + required=False, + ), + "group_by": ToolParameter( + description="Labels to group comparison by", + type="array", + required=False, + ), + }, + toolset=toolset, + ) + + def _invoke(self, params: Any) -> StructuredToolResult: + if not self.toolset.config or not self.toolset.config.prometheus_url: + return StructuredToolResult( + 
status=ToolResultStatus.ERROR, + error="Prometheus is not configured. Prometheus URL is missing", + params=params, + ) + + try: + metric_name = get_param_or_raise(params, "metric_name") + current_period = params.get("current_period", "1h") + comparison_offset = params.get("comparison_offset", "24h") + group_by = params.get("group_by", []) + + # Build group by clause + group_clause = "" + if group_by: + group_clause = f' by ({", ".join(group_by)})' + + # Query comparing current vs offset period + query = f""" + (avg(rate({metric_name}[{current_period}])){group_clause} - + avg(rate({metric_name}[{current_period}] offset {comparison_offset})){group_clause}) / + avg(rate({metric_name}[{current_period}] offset {comparison_offset})){group_clause} * 100 + """ + + url = urljoin(self.toolset.config.prometheus_url, "api/v1/query") + payload = {"query": query} + + response = requests.post( + url=url, + headers=self.toolset.config.headers, + auth=self.toolset.config.get_auth(), + data=payload, + timeout=60, + verify=self.toolset.config.prometheus_ssl_enabled, + ) + + if response.status_code == 200: + data = response.json() + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data=json.dumps(data.get("data"), indent=2), + params=params, + ) + else: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Query failed with status {response.status_code}: {response.text}", + params=params, + ) + + except Exception as e: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Error comparing periods: {str(e)}", + params=params, + ) + + def get_parameterized_one_liner(self, params) -> str: + metric_name = params.get("metric_name", "") + return f"{toolset_name_for_one_liner(self.toolset.name)}: Compare {metric_name} periods" + + +class DetectMetricAnomalies(BasePrometheusTool): + def __init__(self, toolset: "PrometheusToolset"): + super().__init__( + name="detect_metric_anomalies", + description="Detects anomalous patterns in metrics using statistical analysis. Identifies spikes and deviations from normal.", + parameters={ + "metric_name": ToolParameter( + description="The metric to analyze", + type="string", + required=True, + ), + "sensitivity": ToolParameter( + description="Standard deviations for anomaly threshold (2-4 typical)", + type="number", + required=False, + ), + "lookback_window": ToolParameter( + description="Historical window for baseline (e.g., '7d')", + type="string", + required=False, + ), + "group_by": ToolParameter( + description="Labels to detect anomalies by", + type="array", + required=False, + ), + }, + toolset=toolset, + ) + + def _invoke(self, params: Any) -> StructuredToolResult: + if not self.toolset.config or not self.toolset.config.prometheus_url: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error="Prometheus is not configured. 
Prometheus URL is missing", + params=params, + ) + + try: + metric_name = get_param_or_raise(params, "metric_name") + sensitivity = params.get("sensitivity", 3) + lookback_window = params.get("lookback_window", "1h") + group_by = params.get("group_by", []) + + # Build group by clause + group_clause = "" + if group_by: + group_clause = f' by ({", ".join(group_by)})' + + # Z-score based anomaly detection query + query = f""" + (rate({metric_name}[5m]){group_clause} - + avg_over_time(rate({metric_name}[5m])[{lookback_window}:]){group_clause}) / + stddev_over_time(rate({metric_name}[5m])[{lookback_window}:]){group_clause} > {sensitivity} + """ + + url = urljoin(self.toolset.config.prometheus_url, "api/v1/query") + payload = {"query": query} + + response = requests.post( + url=url, + headers=self.toolset.config.headers, + auth=self.toolset.config.get_auth(), + data=payload, + timeout=60, + verify=self.toolset.config.prometheus_ssl_enabled, + ) + + if response.status_code == 200: + data = response.json() + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data=json.dumps(data.get("data"), indent=2), + params=params, + ) + else: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Query failed with status {response.status_code}: {response.text}", + params=params, + ) + + except Exception as e: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Error detecting anomalies: {str(e)}", + params=params, + ) + + def get_parameterized_one_liner(self, params) -> str: + metric_name = params.get("metric_name", "") + return f"{toolset_name_for_one_liner(self.toolset.name)}: Detect anomalies in {metric_name}" + + class PrometheusToolset(Toolset): config: Optional[Union[PrometheusConfig, AMPConfig]] = None @@ -831,6 +1224,10 @@ def __init__(self): ListAvailableMetrics(toolset=self), ExecuteInstantQuery(toolset=self), ExecuteRangeQuery(toolset=self), + AnalyzeMetricByDimensions(toolset=self), + FindTopMetricValues(toolset=self), + CompareMetricPeriods(toolset=self), + DetectMetricAnomalies(toolset=self), ], tags=[ ToolsetTag.CORE, diff --git a/pyproject.toml b/pyproject.toml index ee8dc9051..f550a2804 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -119,7 +119,8 @@ markers = [ "toolset-limitation: Tests that cannot be solved no matter how smart the model, unless we improve the underlying toolsets themselves", "ask-for-clarification: Tests where Holmes should ask the user for clarification", "database: Tests involving database interactions", - "datadog: DataDog toolset" + "datadog: DataDog toolset", + "traces: Tests where the ai is expected to find the solution using the traces" ] addopts = [ diff --git a/tests/llm/fixtures/shared/tempo.yaml b/tests/llm/fixtures/shared/tempo.yaml new file mode 100644 index 000000000..5dcc9a81a --- /dev/null +++ b/tests/llm/fixtures/shared/tempo.yaml @@ -0,0 +1,114 @@ +# Shared Tempo deployment configuration for test fixtures +# Apply with: kubectl apply -f tempo.yaml -n +# Note: Namespace must be created separately +--- +# Tempo deployment - lightweight, in-memory storage +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tempo +spec: + replicas: 1 + selector: + matchLabels: + app: tempo + template: + metadata: + labels: + app: tempo + spec: + containers: + - name: tempo + image: grafana/tempo:2.3.0 + args: + - -config.file=/etc/tempo/tempo-config.yaml + ports: + - containerPort: 3200 + name: http + - containerPort: 4317 + name: otlp-grpc + - containerPort: 4318 + name: otlp-http + volumeMounts: + - name: 
config + mountPath: /etc/tempo + resources: + requests: + memory: "64Mi" + cpu: "10m" + limits: + memory: "256Mi" + volumes: + - name: config + configMap: + name: tempo-config +--- +apiVersion: v1 +kind: Service +metadata: + name: tempo +spec: + selector: + app: tempo + ports: + - name: http + port: 3200 + targetPort: 3200 + - name: otlp-grpc + port: 4317 + targetPort: 4317 + - name: otlp-http + port: 4318 + targetPort: 4318 +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: tempo-config +data: + tempo-config.yaml: | + server: + http_listen_port: 3200 + + distributor: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + + ingester: + max_block_duration: 5m + trace_idle_period: 30s + max_block_bytes: 10_000_000 + + compactor: + compaction: + block_retention: 17520h # 2 years (730 days * 24 hours) + + storage: + trace: + backend: local + local: + path: /tmp/tempo/traces + wal: + path: /tmp/tempo/wal + pool: + max_workers: 10 + queue_depth: 100 + + querier: + frontend_worker: + frontend_address: 127.0.0.1:9095 + max_concurrent_queries: 10 + + query_frontend: + max_batch_size: 5 + + overrides: + max_traces_per_user: 10000 + ingestion_rate_limit_bytes: 15000000 + ingestion_burst_size_bytes: 20000000 + max_bytes_per_trace: 5000000 diff --git a/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/postgres.yaml b/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/postgres.yaml new file mode 100644 index 000000000..4debc1ce4 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/postgres.yaml @@ -0,0 +1,139 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: postgres-init +data: + seed.sql: | + -- Connect to the default database first + \c postgres; + + -- Create shipping_rates table + CREATE TABLE shipping_rates ( + id SERIAL PRIMARY KEY, + zone_id VARCHAR(50) NOT NULL, + promo_code VARCHAR(50), + rate_per_kg DECIMAL(10,2) NOT NULL, + discount_percent DECIMAL(5,2) DEFAULT 0, + active BOOLEAN DEFAULT true, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ); + + -- Create index ONLY on zone_id (missing compound index is the problem!) 
+ CREATE INDEX idx_zone_id ON shipping_rates(zone_id); + + -- Insert base rates for different zones + INSERT INTO shipping_rates (zone_id, rate_per_kg, discount_percent, active) VALUES + ('us-west-1', 5.00, 0, true), + ('us-west-2', 5.50, 0, true), + ('us-east-1', 6.00, 0, true), + ('us-east-2', 6.50, 0, true), + ('eu-west-1', 8.00, 0, true), + ('eu-central-1', 8.50, 0, true), + ('ap-south-1', 9.00, 0, true), + ('ap-northeast-1', 9.50, 0, true); + + -- Insert promo code rates (these will cause slow queries) + -- Generate many promo codes to make table scanning expensive + DO $$ + DECLARE + zone_list text[] := ARRAY['us-west-1', 'us-west-2', 'us-east-1', 'us-east-2', + 'eu-west-1', 'eu-central-1', 'ap-south-1', 'ap-northeast-1']; + promo_list text[] := ARRAY['SAVE10', 'WELCOME20', 'HOLIDAY15', 'SPECIAL25', + 'FLASH30', 'MEMBER10', 'FIRST15', 'RETURN20', + 'SUMMER10', 'WINTER15', 'SPRING20', 'FALL25']; + zone text; + promo text; + i int; + BEGIN + -- Insert specific promo codes + FOREACH zone IN ARRAY zone_list LOOP + FOREACH promo IN ARRAY promo_list LOOP + INSERT INTO shipping_rates (zone_id, promo_code, rate_per_kg, discount_percent, active) + VALUES (zone, promo, 5.00 + random() * 5, 10 + random() * 20, true); + END LOOP; + END LOOP; + + -- Add many more rows to make table scanning slow (50,000+ rows) + FOR i IN 1..50000 LOOP + INSERT INTO shipping_rates (zone_id, promo_code, rate_per_kg, discount_percent, active) + VALUES ( + zone_list[1 + floor(random() * 8)], + 'PROMO' || i, + 5.00 + random() * 10, + random() * 30, + random() > 0.2 + ); + END LOOP; + END $$; + + -- Analyze table for query planner + ANALYZE shipping_rates; + + -- Show the problem: query without promo_code uses index + EXPLAIN (ANALYZE, BUFFERS) + SELECT rate_per_kg, discount_percent + FROM shipping_rates + WHERE zone_id = 'us-west-1' AND active = true + LIMIT 1; + + -- Show the problem: query with promo_code does table scan + EXPLAIN (ANALYZE, BUFFERS) + SELECT rate_per_kg, discount_percent + FROM shipping_rates + WHERE zone_id = 'us-west-1' AND promo_code = 'SAVE10' AND active = true + LIMIT 1; + + -- The fix would be: CREATE INDEX idx_shipping_compound ON shipping_rates(zone_id, promo_code, active); + -- But we don't create it - that's for Holmes to discover! 
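+
+      -- Illustrative note, an assumption about typical PostgreSQL output rather than plans
+      -- captured from this seed: the two EXPLAIN statements above should show the asymmetry
+      -- Holmes is expected to discover, roughly
+      --   zone_id-only query -> Index Scan using idx_zone_id on shipping_rates
+      --   promo_code query   -> Seq Scan on shipping_rates (Filter: ... promo_code = ... AND active)
+      -- Exact plan nodes vary with PostgreSQL version and table statistics.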
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: postgres +spec: + replicas: 1 + selector: + matchLabels: + app: postgres + template: + metadata: + labels: + app: postgres + spec: + containers: + - name: postgres + image: postgres:15-alpine + env: + - name: POSTGRES_USER + value: postgres + - name: POSTGRES_PASSWORD + value: postgres + - name: POSTGRES_DB + value: shipping + ports: + - containerPort: 5432 + volumeMounts: + - name: init + mountPath: /docker-entrypoint-initdb.d + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "512Mi" + cpu: "500m" + volumes: + - name: init + configMap: + name: postgres-init +--- +apiVersion: v1 +kind: Service +metadata: + name: postgres +spec: + selector: + app: postgres + ports: + - port: 5432 + targetPort: 5432 diff --git a/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/services.yaml b/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/services.yaml new file mode 100644 index 000000000..aaf547853 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/services.yaml @@ -0,0 +1,306 @@ +# Checkout Service +apiVersion: v1 +kind: Secret +metadata: + name: checkout-app +type: Opaque +stringData: + app.py: | + import os + import time + import random + import requests + from flask import Flask, request, jsonify + from opentelemetry import trace + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor + from opentelemetry.sdk.resources import Resource + from opentelemetry.instrumentation.flask import FlaskInstrumentor + from opentelemetry.instrumentation.requests import RequestsInstrumentor + + # Configure OpenTelemetry + resource = Resource.create({"service.name": "checkout-service"}) + provider = TracerProvider(resource=resource) + trace.set_tracer_provider(provider) + + otlp_exporter = OTLPSpanExporter( + endpoint="tempo.app-113.svc.cluster.local:4317", + insecure=True + ) + provider.add_span_processor(BatchSpanProcessor(otlp_exporter)) + + app = Flask(__name__) + FlaskInstrumentor().instrument_app(app) + RequestsInstrumentor().instrument() + + tracer = trace.get_tracer(__name__) + + @app.route('/health') + def health(): + return 'OK' + + @app.route('/checkout', methods=['POST']) + def checkout(): + with tracer.start_as_current_span("process_checkout") as span: + data = request.json or {} + + # Extract parameters + user_id = data.get('user_id', 'guest') + zone_id = data.get('zone_id', 'us-west-1') + promo_code = data.get('promo_code') + items = data.get('items', []) + + # Add span attributes + span.set_attribute("user.id", user_id) + span.set_attribute("zone.id", zone_id) + span.set_attribute("items.count", len(items)) + if promo_code: + span.set_attribute("promo.code", promo_code) + + # Calculate shipping + with tracer.start_as_current_span("calculate_shipping"): + shipping_url = "http://shipping-calculator.app-113.svc.cluster.local:8081/calculate" + shipping_data = { + "zone_id": zone_id, + "promo_code": promo_code, + "weight": sum(item.get('weight', 1.0) for item in items) + } + + try: + response = requests.post(shipping_url, json=shipping_data, timeout=30) + shipping_cost = response.json().get('cost', 10.0) + except Exception as e: + span.record_exception(e) + shipping_cost = 10.0 + + # Calculate total + subtotal = sum(item.get('price', 0) for item in items) + total = subtotal + shipping_cost + + return jsonify({ + 
"order_id": f"ord-{random.randint(1000, 9999)}", + "subtotal": subtotal, + "shipping": shipping_cost, + "total": total + }) + + if __name__ == '__main__': + app.run(host='0.0.0.0', port=8080) +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: checkout +spec: + replicas: 1 + selector: + matchLabels: + app: checkout + template: + metadata: + labels: + app: checkout + spec: + containers: + - name: checkout + image: python:3.11-slim + command: ["/bin/bash", "-c"] + args: + - | + pip install flask requests opentelemetry-api opentelemetry-sdk \ + opentelemetry-instrumentation-flask opentelemetry-instrumentation-requests \ + opentelemetry-exporter-otlp-proto-grpc && \ + python /app/app.py + volumeMounts: + - name: app + mountPath: /app + ports: + - containerPort: 8080 + env: + - name: PYTHONUNBUFFERED + value: "1" + resources: + requests: + memory: "128Mi" + cpu: "50m" + limits: + memory: "256Mi" + cpu: "200m" + volumes: + - name: app + secret: + secretName: checkout-app +--- +apiVersion: v1 +kind: Service +metadata: + name: checkout +spec: + selector: + app: checkout + ports: + - port: 8080 + targetPort: 8080 +--- +# Shipping Calculator Service +apiVersion: v1 +kind: Secret +metadata: + name: shipping-calculator-app +type: Opaque +stringData: + app.py: | + import os + import time + import psycopg2 + from flask import Flask, request, jsonify + from opentelemetry import trace + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor + from opentelemetry.sdk.resources import Resource + from opentelemetry.instrumentation.flask import FlaskInstrumentor + from opentelemetry.instrumentation.psycopg2 import Psycopg2Instrumentor + + # Configure OpenTelemetry + resource = Resource.create({"service.name": "shipping-calculator"}) + provider = TracerProvider(resource=resource) + trace.set_tracer_provider(provider) + + otlp_exporter = OTLPSpanExporter( + endpoint="tempo.app-113.svc.cluster.local:4317", + insecure=True + ) + provider.add_span_processor(BatchSpanProcessor(otlp_exporter)) + + app = Flask(__name__) + FlaskInstrumentor().instrument_app(app) + Psycopg2Instrumentor().instrument() + + tracer = trace.get_tracer(__name__) + + def get_db_connection(): + return psycopg2.connect( + host="postgres.app-113.svc.cluster.local", + database="shipping", + user="postgres", + password="postgres" + ) + + @app.route('/health') + def health(): + return 'OK' + + @app.route('/calculate', methods=['POST']) + def calculate(): + with tracer.start_as_current_span("calculate_shipping_rate") as span: + data = request.json or {} + zone_id = data.get('zone_id', 'us-west-1') + promo_code = data.get('promo_code') + weight = data.get('weight', 1.0) + + span.set_attribute("zone.id", zone_id) + span.set_attribute("weight", weight) + if promo_code: + span.set_attribute("promo.code", promo_code) + + with tracer.start_as_current_span("database_query") as db_span: + conn = get_db_connection() + cursor = conn.cursor() + + try: + if promo_code: + # This query will be slow - no index on (zone_id, promo_code, active) + query = """ + SELECT rate_per_kg, discount_percent + FROM shipping_rates + WHERE zone_id = %s AND promo_code = %s AND active = true + LIMIT 1 + """ + db_span.set_attribute("db.statement", query) + cursor.execute(query, (zone_id, promo_code)) + else: + # This query will be fast - uses index on zone_id + query = """ + SELECT rate_per_kg, discount_percent + FROM 
shipping_rates + WHERE zone_id = %s AND active = true + LIMIT 1 + """ + db_span.set_attribute("db.statement", query) + cursor.execute(query, (zone_id,)) + + result = cursor.fetchone() + if result: + rate, discount = result + cost = weight * rate * (1 - discount/100) + else: + cost = weight * 5.0 # Default rate + + finally: + cursor.close() + conn.close() + + return jsonify({"cost": round(cost, 2)}) + + if __name__ == '__main__': + # Wait for Postgres to be ready + time.sleep(10) + app.run(host='0.0.0.0', port=8081) +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: shipping-calculator +spec: + replicas: 1 + selector: + matchLabels: + app: shipping-calculator + template: + metadata: + labels: + app: shipping-calculator + spec: + containers: + - name: shipping-calculator + image: python:3.11-slim + command: ["/bin/bash", "-c"] + args: + - | + apt-get update && apt-get install -y libpq-dev gcc && \ + pip install flask psycopg2 opentelemetry-api opentelemetry-sdk \ + opentelemetry-instrumentation-flask opentelemetry-instrumentation-psycopg2 \ + opentelemetry-exporter-otlp-proto-grpc && \ + python /app/app.py + volumeMounts: + - name: app + mountPath: /app + ports: + - containerPort: 8081 + env: + - name: PYTHONUNBUFFERED + value: "1" + resources: + requests: + memory: "128Mi" + cpu: "50m" + limits: + memory: "256Mi" + cpu: "200m" + volumes: + - name: app + secret: + secretName: shipping-calculator-app +--- +apiVersion: v1 +kind: Service +metadata: + name: shipping-calculator +spec: + selector: + app: shipping-calculator + ports: + - port: 8081 + targetPort: 8081 diff --git a/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/test_case.yaml new file mode 100644 index 000000000..a40a6b961 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/test_case.yaml @@ -0,0 +1,68 @@ +user_prompt: +- "The checkout service in namespace app-113 is experiencing high latency. Investigate using tracing data." +- "The checkout service in namespace app-113 is experiencing high latency. 
Investigate all traces over 0.1s in duration" + +expected_output: + - The checkout service has intermittent high latency affecting approximately 30% of requests + - Root cause is missing database index on the shipping_rates table in Postgres + - Slow queries occur when promo_code parameter is included in the request + - Fast queries use index on zone_id, but queries with promo_code cause full table scan + - Solution is to create compound index on (zone_id, promo_code, active) + +tags: + - database + - kubernetes + - medium + +port_forwards: + - namespace: app-113 + service: tempo + local_port: 3200 + remote_port: 3200 + +before_test: | + # Create namespace + kubectl create namespace app-113 || true + + # Deploy Tempo from shared config + kubectl apply -f ../../shared/tempo.yaml -n app-113 + echo "deployed tempo" + + # Deploy Postgres with initial schema + kubectl apply -f postgres.yaml -n app-113 + kubectl wait --for=condition=ready pod -l app=postgres -n app-113 --timeout=60s + echo "database pod ready" + + # Wait for database initialization to complete (init script runs automatically) + sleep 10 + + # Deploy microservices + kubectl apply -f services.yaml -n app-113 + kubectl wait --for=condition=ready pod -l app=checkout -n app-113 --timeout=60s + kubectl wait --for=condition=ready pod -l app=shipping-calculator -n app-113 --timeout=60s + echo "microservices ready" + + # Wait for Tempo to be ready + kubectl wait --for=condition=ready pod -l app=tempo -n app-113 --timeout=60s + echo "Waiting for Tempo to be fully ready (internal initialization)..." + sleep 25 # Tempo needs ~18s after pod ready for /ready endpoint to return 200 + echo "Tempo ready" + + # Run traffic generator job for 2 minutes of heavy load + kubectl apply -f traffic-generator-job.yaml -n app-113 + kubectl wait --for=condition=complete job/traffic-generator -n app-113 --timeout=120s + echo "Done generating traffic" + + # Verify traces are in Tempo using kubectl exec + echo "Checking if traces were received by Tempo..." 
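+  # Equivalent manual check (a sketch only, assuming the localhost:3200 port-forward declared
+  # under port_forwards is already active; the kubectl exec call below is what the setup
+  # actually relies on):
+  #   curl -s 'http://localhost:3200/api/search?limit=1' | grep -c '"traces"'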
+ TRACE_COUNT=$(kubectl exec -n app-113 deployment/tempo -- curl -s 'http://localhost:3200/api/search?limit=1' 2>/dev/null | grep -o '"traces"' | wc -l || echo "0") + + if [ "$TRACE_COUNT" -eq "0" ]; then + echo "WARNING: No traces found in Tempo (this might be timing related)" + # Don't fail the test for this + else + echo "SUCCESS: Found traces in Tempo" + fi + +after_test: | + kubectl delete namespace app-113 || true diff --git a/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/toolsets.yaml b/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/toolsets.yaml new file mode 100644 index 000000000..7298e0687 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/toolsets.yaml @@ -0,0 +1,10 @@ +toolsets: + kubernetes/core: + enabled: true + kubernetes/logs: + enabled: true + grafana/tempo: + enabled: true + config: + url: http://localhost:3200 + healthcheck: "ready" diff --git a/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/traffic-generator-job.yaml b/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/traffic-generator-job.yaml new file mode 100644 index 000000000..b1451de79 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/traffic-generator-job.yaml @@ -0,0 +1,159 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: traffic-generator +data: + generator.py: | + import time + import random + import requests + import concurrent.futures + from datetime import datetime + + CHECKOUT_URL = "http://checkout.app-113.svc.cluster.local:8080/checkout" + ZONES = ['us-west-1', 'us-west-2', 'us-east-1', 'us-east-2', + 'eu-west-1', 'eu-central-1', 'ap-south-1', 'ap-northeast-1'] + PROMO_CODES = ['SAVE10', 'WELCOME20', 'HOLIDAY15', 'SPECIAL25', + 'FLASH30', 'MEMBER10', 'FIRST15', 'RETURN20', + 'SUMMER10', 'WINTER15', 'SPRING20', 'FALL25'] + + def generate_request(): + """Generate a single checkout request""" + # 30% chance to include promo code (will be slow) + include_promo = random.random() < 0.3 + + data = { + "user_id": f"user-{random.randint(1000, 9999)}", + "zone_id": random.choice(ZONES), + "items": [ + { + "id": f"item-{i}", + "price": round(random.uniform(10, 200), 2), + "weight": round(random.uniform(0.1, 5.0), 2) + } + for i in range(random.randint(1, 5)) + ] + } + + if include_promo: + data["promo_code"] = random.choice(PROMO_CODES) + + try: + response = requests.post(CHECKOUT_URL, json=data, timeout=30) + latency = response.elapsed.total_seconds() + status = "success" if response.status_code == 200 else "error" + has_promo = "with_promo" if include_promo else "no_promo" + print(f"{datetime.now().isoformat()} - {status} - {has_promo} - {latency:.2f}s") + return latency + except Exception as e: + print(f"{datetime.now().isoformat()} - error - {str(e)}") + return None + + def run_load_test(): + """Run concurrent requests with a limit of 2000 total requests""" + print(f"Starting traffic generation at {datetime.now().isoformat()}") + print("Generating moderate load (max 2000 requests)...") + + start_time = time.time() + max_requests = 2000 # Limit total requests + max_duration = 120 # 2 minutes max + request_count = 0 + slow_requests = 0 + fast_requests = 0 + + # Use thread pool for concurrent requests (reduced workers) + with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: + futures = [] + + while request_count < max_requests and (time.time() - start_time) < max_duration: + # Submit fewer requests at a time to avoid overwhelming Tempo + requests_to_submit = 
min(3, max_requests - request_count - len(futures)) + for _ in range(requests_to_submit): + future = executor.submit(generate_request) + futures.append(future) + + # Process completed futures + done_futures = [] + for future in futures: + if future.done(): + latency = future.result() + if latency: + request_count += 1 + if latency > 2.0: # Consider > 2s as slow + slow_requests += 1 + else: + fast_requests += 1 + done_futures.append(future) + + # Remove completed futures + for future in done_futures: + futures.remove(future) + + # Longer sleep to reduce load on Tempo + time.sleep(0.3) + + # Wait for remaining futures to complete + for future in concurrent.futures.as_completed(futures): + latency = future.result() + if latency: + request_count += 1 + if latency > 2.0: + slow_requests += 1 + else: + fast_requests += 1 + + elapsed = time.time() - start_time + print(f"\nTraffic generation completed in {elapsed:.1f} seconds") + print(f"Total requests: {request_count}") + print(f"Fast requests (<2s): {fast_requests} ({fast_requests*100/max(request_count,1):.1f}%)") + print(f"Slow requests (>2s): {slow_requests} ({slow_requests*100/max(request_count,1):.1f}%)") + print(f"Average RPS: {request_count/elapsed:.1f}") + + if __name__ == "__main__": + # Wait for services to be ready + print("Waiting for services to be ready...") + time.sleep(10) + + # Warm up with a few requests + print("Warming up services...") + for _ in range(5): + generate_request() + time.sleep(1) + + # Run the main load test + run_load_test() + + print("\nTraffic generation completed successfully!") +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: traffic-generator +spec: + backoffLimit: 1 + activeDeadlineSeconds: 300 + template: + spec: + restartPolicy: Never + containers: + - name: generator + image: python:3.11-slim + command: ["/bin/bash", "-c"] + args: + - | + pip install requests && \ + python /app/generator.py + volumeMounts: + - name: script + mountPath: /app + resources: + requests: + memory: "128Mi" + cpu: "200m" + limits: + memory: "256Mi" + cpu: "500m" + volumes: + - name: script + configMap: + name: traffic-generator diff --git a/tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/checkout-service.yaml b/tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/checkout-service.yaml new file mode 100644 index 000000000..c7eadcffe --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/checkout-service.yaml @@ -0,0 +1,169 @@ +# Simplified Checkout Service with Dummy SQL +apiVersion: v1 +kind: Secret +metadata: + name: checkout-app +type: Opaque +stringData: + app.py: | + import os + import time + import random + from flask import Flask, request, jsonify + from opentelemetry import trace + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor + from opentelemetry.sdk.resources import Resource + from opentelemetry.instrumentation.flask import FlaskInstrumentor + + # Configure OpenTelemetry + resource = Resource.create({"service.name": "checkout-service"}) + provider = TracerProvider(resource=resource) + trace.set_tracer_provider(provider) + + otlp_exporter = OTLPSpanExporter( + endpoint="tempo.app-114a.svc.cluster.local:4317", + insecure=True + ) + provider.add_span_processor(BatchSpanProcessor(otlp_exporter)) + + app = Flask(__name__) + FlaskInstrumentor().instrument_app(app) + + 
tracer = trace.get_tracer(__name__) + + @app.route('/health') + def health(): + return 'OK' + + @app.route('/checkout', methods=['POST']) + def checkout(): + with tracer.start_as_current_span("process_checkout") as span: + data = request.json or {} + + # Log the incoming request (without revealing the data) + print(f"[CHECKOUT] Processing checkout request for user {data.get('user_id', 'guest')}", flush=True) + + # Extract parameters + user_id = data.get('user_id', 'guest') + zone_id = data.get('zone_id', 'us-west-1') + promo_code = data.get('promo_code') + items = data.get('items', []) + + # Add span attributes + span.set_attribute("user.id", user_id) + span.set_attribute("zone.id", zone_id) + span.set_attribute("items.count", len(items)) + if promo_code: + span.set_attribute("promo.code", promo_code) + + # Simulate database query for shipping calculation + with tracer.start_as_current_span("database_query") as db_span: + db_span.set_attribute("db.system", "postgresql") + db_span.set_attribute("db.operation", "SELECT") + + if promo_code: + # Simulate slow query with promo_code + query = "SELECT rate_per_kg, discount_percent FROM shipping_rates WHERE zone_id = ? AND promo_code = ? AND active = true" + db_span.set_attribute("db.statement", query) + # print(f"[DB] Executing shipping rate query", flush=True) + sleep_time = random.uniform(1.5, 3.5) + time.sleep(sleep_time) # Simulate slow query + shipping_rate = 4.5 + discount = 15.0 + else: + # Simulate fast query without promo_code + query = "SELECT rate_per_kg, discount_percent FROM shipping_rates WHERE zone_id = ? AND active = true" + db_span.set_attribute("db.statement", query) + # print(f"[DB] Executing shipping rate query", flush=True) + sleep_time = random.uniform(0.1, 0.2) + time.sleep(sleep_time) # Simulate fast query + shipping_rate = 5.0 + discount = 0.0 + + # Calculate shipping cost + total_weight = sum(item.get('weight', 1.0) for item in items) + shipping_cost = total_weight * shipping_rate * (1 - discount/100) + + # Calculate total + subtotal = sum(item.get('price', 0) for item in items) + total = subtotal + shipping_cost + + response = { + "order_id": f"ord-{random.randint(1000, 9999)}", + "subtotal": subtotal, + "shipping": round(shipping_cost, 2), + "total": round(total, 2) + } + + print(f"[CHECKOUT] Completed checkout request", flush=True) + return jsonify(response) + + if __name__ == '__main__': + print("[CHECKOUT] Starting checkout service on port 8080", flush=True) + app.run(host='0.0.0.0', port=8080) +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: checkout +spec: + replicas: 1 + selector: + matchLabels: + app: checkout + template: + metadata: + labels: + app: checkout + spec: + containers: + - name: checkout + image: python:3.11-slim + command: ["/bin/bash", "-c"] + args: + - | + pip install flask opentelemetry-api opentelemetry-sdk \ + opentelemetry-instrumentation-flask \ + opentelemetry-exporter-otlp-proto-grpc && \ + python /app/app.py + volumeMounts: + - name: app + mountPath: /app + ports: + - containerPort: 8080 + env: + - name: PYTHONUNBUFFERED + value: "1" + startupProbe: + httpGet: + path: /health + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 3 + successThreshold: 1 + failureThreshold: 24 + resources: + requests: + memory: "128Mi" + cpu: "50m" + limits: + memory: "256Mi" + cpu: "200m" + volumes: + - name: app + secret: + secretName: checkout-app +--- +apiVersion: v1 +kind: Service +metadata: + name: checkout +spec: + selector: + app: checkout + ports: + - port: 8080 + 
targetPort: 8080 diff --git a/tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/test_case.yaml new file mode 100644 index 000000000..978daf667 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/test_case.yaml @@ -0,0 +1,131 @@ +user_prompt: +- "The checkout service in namespace app-114a is experiencing high latency. Investigate why." + +expected_output: + - The answer must explicitly state that queries that include a promo code are slow or all slow requests include promo code. + - Including a query that mentions promo code is not sufficient. + +tags: + - kubernetes + - hard + - chain-of-causation + - traces + +port_forwards: + - namespace: app-114a + service: tempo + local_port: 3214 + remote_port: 3200 + +before_test: | + echo "🚀 Setting up test 114a - Creating namespace app-114a" + kubectl create namespace app-114a || true + echo "✅ Namespace app-114a created successfully!" + + echo "📦 Deploying Tempo from shared config" + kubectl apply -f ../../shared/tempo.yaml -n app-114a + + echo "⏳ Waiting for Tempo pod to be ready" + kubectl wait --for=condition=ready pod -l app=tempo -n app-114a --timeout=60s + + echo "⏰ Waiting for Tempo to be fully ready (checking every 5s, timeout 60s)" + TEMPO_READY=false + for i in {1..12}; do + if kubectl exec -n app-114a deployment/tempo -- wget -q -O - http://localhost:3200/ready 2>/dev/null; then + echo "✅ Tempo is ready!" + TEMPO_READY=true + break + else + echo "⏳ Attempt $i/12: Tempo not ready yet, waiting 5s..." + sleep 5 + fi + done + + if [ "$TEMPO_READY" = false ]; then + echo "❌ Tempo failed to become ready after 60 seconds" + exit 1 + fi + + echo "✅ Tempo deployment complete!" + + echo "🛒 Deploying checkout service" + kubectl apply -f checkout-service.yaml -n app-114a + + echo "⏳ Waiting for checkout pod to be ready" + kubectl wait --for=condition=ready pod -l app=checkout -n app-114a --timeout=60s + + echo "🔍 Checking checkout deployment status" + kubectl get pods -n app-114a -l app=checkout + + echo "🚦 Deploying traffic generator" + kubectl apply -f traffic-generator.yaml -n app-114a + + echo "⏳ Waiting for traffic generator pod to be ready" + kubectl wait --for=condition=ready pod -l app=traffic-generator -n app-114a --timeout=60s + + echo "🔍 Checking all pods status" + kubectl get pods -n app-114a + + echo "⏰ Waiting for traffic generator to produce logs (checking every 3s, timeout 60s)" + PROMO_LOG_FOUND=false + for i in {1..20}; do + if kubectl logs -n app-114a -l app=traffic-generator --tail=100 2>/dev/null | grep -q "WITH promo_code"; then + echo "✅ Found traffic generator log WITH promo_code after $((i*3)) seconds" + PROMO_LOG_FOUND=true + break + else + echo "⏳ Attempt $i/20: No promo_code log yet, waiting 3s..." 
+ sleep 3 + fi + done + + if [ "$PROMO_LOG_FOUND" = false ]; then + echo "❌ Missing traffic generator log WITH promo_code after 60 seconds" + exit 1 + fi + + if kubectl logs -n app-114a -l app=traffic-generator --tail=100 | grep -q "WITHOUT promo_code"; then + echo "✅ Found traffic generator log WITHOUT promo_code" + else + echo "❌ Missing traffic generator log WITHOUT promo_code" + exit 1 + fi + + if kubectl logs -n app-114a -l app=checkout --tail=100 | grep -q "Processing checkout request"; then + echo "✅ Found checkout request log" + else + echo "❌ Missing checkout request log" + exit 1 + fi + + # Commented out traffic generator trace checks as it no longer sends traces + # echo "🔍 Querying Tempo for traces from traffic generator" + # TRAFFIC_GEN_TRACES=$(curl -s "http://localhost:3200/api/search?tags=service.name%3Dtraffic-generator&limit=10" 2>/dev/null | grep -o '"traceID"' | wc -l || echo "0") + # echo "Found $TRAFFIC_GEN_TRACES traces from traffic-generator" + + echo "🔍 Querying Tempo for traces from checkout service" + CHECKOUT_TRACES=$(kubectl run -n app-114a tempo-query --rm -i --restart=Never --image=curlimages/curl -- -s "http://tempo:3200/api/search?tags=service.name%3Dcheckout-service&limit=10" 2>/dev/null | grep -o '"traceID"' | wc -l || echo "0") + echo "Found $CHECKOUT_TRACES traces from checkout-service" + + # Commented out traffic generator trace check + # if [ "$TRAFFIC_GEN_TRACES" -gt "0" ]; then + # echo "✅ Found traces from traffic-generator" + # else + # echo "❌ No traces found from traffic-generator" + # exit 1 + # fi + + if [ "$CHECKOUT_TRACES" -gt "0" ]; then + echo "✅ Found traces from checkout-service" + else + echo "❌ No traces found from checkout-service" + exit 1 + fi + + # Delete Traffic generator so the ai won't cheat + kubectl delete -f traffic-generator.yaml -n app-114a + + echo "✅ Test setup complete!" 
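+
+  # Optional manual spot-check (illustrative only, never executed by this setup): with the
+  # localhost:3214 port-forward active, slow checkout traces can be listed with e.g.
+  #   curl -s 'http://localhost:3214/api/search?tags=service.name%3Dcheckout-service&minDuration=1s&limit=5'
+  # Slow traces should carry the promo.code span attribute set by the checkout service.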
+ +after_test: | + kubectl delete namespace app-114a || true diff --git a/tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/toolsets.yaml b/tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/toolsets.yaml new file mode 100644 index 000000000..3facea342 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/toolsets.yaml @@ -0,0 +1,12 @@ +toolsets: + kubernetes/core: + enabled: true + kubernetes/logs: + enabled: true + grafana/tempo: + enabled: true + config: + url: http://localhost:3214 + healthcheck: "ready" + enable_comparative_sample: true + enable_simple_comparison: false diff --git a/tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/traffic-generator.yaml b/tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/traffic-generator.yaml new file mode 100644 index 000000000..675d0eaaf --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/traffic-generator.yaml @@ -0,0 +1,157 @@ +# Traffic Generator Deployment +apiVersion: v1 +kind: Secret +metadata: + name: traffic-generator-app +type: Opaque +stringData: + app.py: | + import time + import random + import requests + from datetime import datetime + # from opentelemetry import trace + # from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + # from opentelemetry.sdk.trace import TracerProvider + # from opentelemetry.sdk.trace.export import BatchSpanProcessor + # from opentelemetry.sdk.resources import Resource + # from opentelemetry.instrumentation.requests import RequestsInstrumentor + + # # Configure OpenTelemetry + # resource = Resource.create({"service.name": "traffic-generator"}) + # provider = TracerProvider(resource=resource) + # trace.set_tracer_provider(provider) + + # otlp_exporter = OTLPSpanExporter( + # endpoint="tempo.app-114a.svc.cluster.local:4317", + # insecure=True + # ) + # provider.add_span_processor(BatchSpanProcessor(otlp_exporter)) + + # # Instrument requests library + # RequestsInstrumentor().instrument() + + # tracer = trace.get_tracer(__name__) + + CHECKOUT_URL = "http://checkout.app-114a.svc.cluster.local:8080/checkout" + ZONES = ['us-west-1', 'us-west-2', 'us-east-1', 'us-east-2'] + PROMO_CODES = ['SAVE10', 'WELCOME20', 'HOLIDAY15', 'SPECIAL25'] + + def generate_traffic(): + """Continuously generate traffic to checkout service""" + print("[TRAFFIC-GEN] Starting traffic generator", flush=True) + request_count = 0 + + while True: + request_count += 1 + + # 30% chance to include promo code + include_promo = random.random() < 0.3 + + # Build request data + data = { + "user_id": f"user-{random.randint(1000, 9999)}", + "zone_id": random.choice(ZONES), + "items": [ + { + "id": f"item-{i}", + "price": round(random.uniform(10, 200), 2), + "weight": round(random.uniform(0.5, 5.0), 2) + } + for i in range(random.randint(1, 3)) + ] + } + + if include_promo: + data["promo_code"] = random.choice(PROMO_CODES) + + # Log the request + promo_status = "WITH" if include_promo else "WITHOUT" + print(f"[TRAFFIC-GEN] Request #{request_count}: Sending request {promo_status} promo_code", flush=True) + + # Make request with tracing + # with tracer.start_as_current_span("checkout_request") as span: + # span.set_attribute("request.number", request_count) + # span.set_attribute("has.promo_code", include_promo) + # if include_promo: + # span.set_attribute("promo.code", data.get("promo_code")) + + try: + start_time = time.time() + 
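+              # (Illustrative expectation only, not asserted anywhere: given the simulated DB
+              #  delays in checkout-service, requests sent WITH a promo code should log
+              #  latencies of roughly 1.5-3.5s, while requests WITHOUT one stay near 0.1-0.2s.)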
response = requests.post(CHECKOUT_URL, json=data, timeout=10) + latency = time.time() - start_time + + # span.set_attribute("http.status_code", response.status_code) + # span.set_attribute("response.latency", latency) + + status = "success" if response.status_code == 200 else f"error({response.status_code})" + print(f"[TRAFFIC-GEN] Request #{request_count}: Response status={status}, latency={latency:.2f}s", flush=True) + + except Exception as e: + # span.record_exception(e) + # span.set_status(trace.Status(trace.StatusCode.ERROR, str(e))) + print(f"[TRAFFIC-GEN] Request #{request_count}: Error - {str(e)}", flush=True) + + # Wait 100ms to 200ms second before next request + sleep_time = random.uniform(0.1, 0.2) + time.sleep(sleep_time) + + if __name__ == '__main__': + print("[TRAFFIC-GEN] Starting...", flush=True) + + # Start generating traffic + generate_traffic() +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: traffic-generator +spec: + replicas: 1 + selector: + matchLabels: + app: traffic-generator + template: + metadata: + labels: + app: traffic-generator + spec: + containers: + - name: traffic-generator + image: python:3.11-slim + command: ["/bin/bash", "-c"] + args: + - | + pip install requests && \ + # pip install opentelemetry-api opentelemetry-sdk \ + # opentelemetry-instrumentation-requests \ + # opentelemetry-exporter-otlp-proto-grpc && \ + touch /tmp/ready && \ + python /app/app.py + volumeMounts: + - name: app + mountPath: /app + env: + - name: PYTHONUNBUFFERED + value: "1" + startupProbe: + exec: + command: + - cat + - /tmp/ready + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + successThreshold: 1 + failureThreshold: 12 + resources: + requests: + memory: "64Mi" + cpu: "25m" + limits: + memory: "128Mi" + cpu: "100m" + volumes: + - name: app + secret: + secretName: traffic-generator-app diff --git a/tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/checkout-service.yaml b/tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/checkout-service.yaml new file mode 100644 index 000000000..8c6a0f088 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/checkout-service.yaml @@ -0,0 +1,169 @@ +# Simplified Checkout Service with Dummy SQL +apiVersion: v1 +kind: Secret +metadata: + name: checkout-app +type: Opaque +stringData: + app.py: | + import os + import time + import random + from flask import Flask, request, jsonify + from opentelemetry import trace + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor + from opentelemetry.sdk.resources import Resource + from opentelemetry.instrumentation.flask import FlaskInstrumentor + + # Configure OpenTelemetry + resource = Resource.create({"service.name": "checkout-service"}) + provider = TracerProvider(resource=resource) + trace.set_tracer_provider(provider) + + otlp_exporter = OTLPSpanExporter( + endpoint="tempo.app-114b.svc.cluster.local:4317", + insecure=True + ) + provider.add_span_processor(BatchSpanProcessor(otlp_exporter)) + + app = Flask(__name__) + FlaskInstrumentor().instrument_app(app) + + tracer = trace.get_tracer(__name__) + + @app.route('/health') + def health(): + return 'OK' + + @app.route('/checkout', methods=['POST']) + def checkout(): + with tracer.start_as_current_span("process_checkout") as span: + data = request.json or {} + + # Log the incoming request (without 
revealing the data) + print(f"[CHECKOUT] Processing checkout request for user {data.get('user_id', 'guest')}", flush=True) + + # Extract parameters + user_id = data.get('user_id', 'guest') + zone_id = data.get('zone_id', 'us-west-1') + promo_code = data.get('promo_code') + items = data.get('items', []) + + # Add span attributes + span.set_attribute("user.id", user_id) + span.set_attribute("zone.id", zone_id) + span.set_attribute("items.count", len(items)) + if promo_code: + span.set_attribute("promo.code", promo_code) + + # Simulate database query for shipping calculation + with tracer.start_as_current_span("database_query") as db_span: + db_span.set_attribute("db.system", "postgresql") + db_span.set_attribute("db.operation", "SELECT") + + if promo_code: + # Simulate slow query with promo_code + query = "SELECT rate_per_kg, discount_percent FROM shipping_rates WHERE zone_id = ? AND promo_code = ? AND active = true" + db_span.set_attribute("db.statement", query) + # print(f"[DB] Executing shipping rate query", flush=True) + sleep_time = random.uniform(1.5, 3.5) + time.sleep(sleep_time) # Simulate slow query + shipping_rate = 4.5 + discount = 15.0 + else: + # Simulate fast query without promo_code + query = "SELECT rate_per_kg, discount_percent FROM shipping_rates WHERE zone_id = ? AND active = true" + db_span.set_attribute("db.statement", query) + # print(f"[DB] Executing shipping rate query", flush=True) + sleep_time = random.uniform(0.1, 0.2) + time.sleep(sleep_time) # Simulate fast query + shipping_rate = 5.0 + discount = 0.0 + + # Calculate shipping cost + total_weight = sum(item.get('weight', 1.0) for item in items) + shipping_cost = total_weight * shipping_rate * (1 - discount/100) + + # Calculate total + subtotal = sum(item.get('price', 0) for item in items) + total = subtotal + shipping_cost + + response = { + "order_id": f"ord-{random.randint(1000, 9999)}", + "subtotal": subtotal, + "shipping": round(shipping_cost, 2), + "total": round(total, 2) + } + + print(f"[CHECKOUT] Completed checkout request", flush=True) + return jsonify(response) + + if __name__ == '__main__': + print("[CHECKOUT] Starting checkout service on port 8080", flush=True) + app.run(host='0.0.0.0', port=8080) +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: checkout +spec: + replicas: 1 + selector: + matchLabels: + app: checkout + template: + metadata: + labels: + app: checkout + spec: + containers: + - name: checkout + image: python:3.11-slim + command: ["/bin/bash", "-c"] + args: + - | + pip install flask opentelemetry-api opentelemetry-sdk \ + opentelemetry-instrumentation-flask \ + opentelemetry-exporter-otlp-proto-grpc && \ + python /app/app.py + volumeMounts: + - name: app + mountPath: /app + ports: + - containerPort: 8080 + env: + - name: PYTHONUNBUFFERED + value: "1" + startupProbe: + httpGet: + path: /health + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 3 + successThreshold: 1 + failureThreshold: 24 + resources: + requests: + memory: "128Mi" + cpu: "50m" + limits: + memory: "256Mi" + cpu: "200m" + volumes: + - name: app + secret: + secretName: checkout-app +--- +apiVersion: v1 +kind: Service +metadata: + name: checkout +spec: + selector: + app: checkout + ports: + - port: 8080 + targetPort: 8080 diff --git a/tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/test_case.yaml new file mode 100644 index 000000000..ed24c676c --- /dev/null +++ 
b/tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/test_case.yaml @@ -0,0 +1,131 @@ +user_prompt: +- "The checkout service in namespace app-114b is experiencing high latency. Investigate why." + +expected_output: + - The answer must explicitly state that queries that include a promo code are slow or all slow requests include promo code. + - Including a query that mentions promo code is not sufficient. + +tags: + - kubernetes + - hard + - chain-of-causation + - traces + +port_forwards: + - namespace: app-114b + service: tempo + local_port: 3215 + remote_port: 3200 + +before_test: | + echo "🚀 Setting up test 114 - Creating namespace app-114b" + kubectl create namespace app-114b || true + echo "✅ Namespace app-114b created successfully!" + + echo "📦 Deploying Tempo from shared config" + kubectl apply -f ../../shared/tempo.yaml -n app-114b + + echo "⏳ Waiting for Tempo pod to be ready" + kubectl wait --for=condition=ready pod -l app=tempo -n app-114b --timeout=60s + + echo "⏰ Waiting for Tempo to be fully ready (checking every 5s, timeout 60s)" + TEMPO_READY=false + for i in {1..12}; do + if kubectl exec -n app-114b deployment/tempo -- wget -q -O - http://localhost:3200/ready 2>/dev/null; then + echo "✅ Tempo is ready!" + TEMPO_READY=true + break + else + echo "⏳ Attempt $i/12: Tempo not ready yet, waiting 5s..." + sleep 5 + fi + done + + if [ "$TEMPO_READY" = false ]; then + echo "❌ Tempo failed to become ready after 60 seconds" + exit 1 + fi + + echo "✅ Tempo deployment complete!" + + echo "🛒 Deploying checkout service" + kubectl apply -f checkout-service.yaml -n app-114b + + echo "⏳ Waiting for checkout pod to be ready" + kubectl wait --for=condition=ready pod -l app=checkout -n app-114b --timeout=60s + + echo "🔍 Checking checkout deployment status" + kubectl get pods -n app-114b -l app=checkout + + echo "🚦 Deploying traffic generator" + kubectl apply -f traffic-generator.yaml -n app-114b + + echo "⏳ Waiting for traffic generator pod to be ready" + kubectl wait --for=condition=ready pod -l app=traffic-generator -n app-114b --timeout=60s + + echo "🔍 Checking all pods status" + kubectl get pods -n app-114b + + echo "⏰ Waiting for traffic generator to produce logs (checking every 3s, timeout 60s)" + PROMO_LOG_FOUND=false + for i in {1..20}; do + if kubectl logs -n app-114b -l app=traffic-generator --tail=100 2>/dev/null | grep -q "WITH promo_code"; then + echo "✅ Found traffic generator log WITH promo_code after $((i*3)) seconds" + PROMO_LOG_FOUND=true + break + else + echo "⏳ Attempt $i/20: No promo_code log yet, waiting 3s..." 
+ sleep 3 + fi + done + + if [ "$PROMO_LOG_FOUND" = false ]; then + echo "❌ Missing traffic generator log WITH promo_code after 60 seconds" + exit 1 + fi + + if kubectl logs -n app-114b -l app=traffic-generator --tail=100 | grep -q "WITHOUT promo_code"; then + echo "✅ Found traffic generator log WITHOUT promo_code" + else + echo "❌ Missing traffic generator log WITHOUT promo_code" + exit 1 + fi + + if kubectl logs -n app-114b -l app=checkout --tail=100 | grep -q "Processing checkout request"; then + echo "✅ Found checkout request log" + else + echo "❌ Missing checkout request log" + exit 1 + fi + + # Commented out traffic generator trace checks as it no longer sends traces + # echo "🔍 Querying Tempo for traces from traffic generator" + # TRAFFIC_GEN_TRACES=$(curl -s "http://localhost:3200/api/search?tags=service.name%3Dtraffic-generator&limit=10" 2>/dev/null | grep -o '"traceID"' | wc -l || echo "0") + # echo "Found $TRAFFIC_GEN_TRACES traces from traffic-generator" + + echo "🔍 Querying Tempo for traces from checkout service" + CHECKOUT_TRACES=$(kubectl run -n app-114b tempo-query --rm -i --restart=Never --image=curlimages/curl -- -s "http://tempo:3200/api/search?tags=service.name%3Dcheckout-service&limit=10" 2>/dev/null | grep -o '"traceID"' | wc -l || echo "0") + echo "Found $CHECKOUT_TRACES traces from checkout-service" + + # Commented out traffic generator trace check + # if [ "$TRAFFIC_GEN_TRACES" -gt "0" ]; then + # echo "✅ Found traces from traffic-generator" + # else + # echo "❌ No traces found from traffic-generator" + # exit 1 + # fi + + if [ "$CHECKOUT_TRACES" -gt "0" ]; then + echo "✅ Found traces from checkout-service" + else + echo "❌ No traces found from checkout-service" + exit 1 + fi + + # Delete Traffic generator so the ai won't cheat + kubectl delete -f traffic-generator.yaml -n app-114b + + echo "✅ Test setup complete!" 
+ +after_test: | + kubectl delete namespace app-114b || true diff --git a/tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/toolsets.yaml b/tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/toolsets.yaml new file mode 100644 index 000000000..05d06ab94 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/toolsets.yaml @@ -0,0 +1,12 @@ +toolsets: + kubernetes/core: + enabled: true + kubernetes/logs: + enabled: true + grafana/tempo: + enabled: true + config: + url: http://localhost:3215 + healthcheck: "ready" + enable_comparative_sample: false + enable_simple_comparison: true diff --git a/tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/traffic-generator.yaml b/tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/traffic-generator.yaml new file mode 100644 index 000000000..98c6443fa --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/traffic-generator.yaml @@ -0,0 +1,157 @@ +# Traffic Generator Deployment +apiVersion: v1 +kind: Secret +metadata: + name: traffic-generator-app +type: Opaque +stringData: + app.py: | + import time + import random + import requests + from datetime import datetime + # from opentelemetry import trace + # from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + # from opentelemetry.sdk.trace import TracerProvider + # from opentelemetry.sdk.trace.export import BatchSpanProcessor + # from opentelemetry.sdk.resources import Resource + # from opentelemetry.instrumentation.requests import RequestsInstrumentor + + # # Configure OpenTelemetry + # resource = Resource.create({"service.name": "traffic-generator"}) + # provider = TracerProvider(resource=resource) + # trace.set_tracer_provider(provider) + + # otlp_exporter = OTLPSpanExporter( + # endpoint="tempo.app-114b.svc.cluster.local:4317", + # insecure=True + # ) + # provider.add_span_processor(BatchSpanProcessor(otlp_exporter)) + + # # Instrument requests library + # RequestsInstrumentor().instrument() + + # tracer = trace.get_tracer(__name__) + + CHECKOUT_URL = "http://checkout.app-114b.svc.cluster.local:8080/checkout" + ZONES = ['us-west-1', 'us-west-2', 'us-east-1', 'us-east-2'] + PROMO_CODES = ['SAVE10', 'WELCOME20', 'HOLIDAY15', 'SPECIAL25'] + + def generate_traffic(): + """Continuously generate traffic to checkout service""" + print("[TRAFFIC-GEN] Starting traffic generator", flush=True) + request_count = 0 + + while True: + request_count += 1 + + # 30% chance to include promo code + include_promo = random.random() < 0.3 + + # Build request data + data = { + "user_id": f"user-{random.randint(1000, 9999)}", + "zone_id": random.choice(ZONES), + "items": [ + { + "id": f"item-{i}", + "price": round(random.uniform(10, 200), 2), + "weight": round(random.uniform(0.5, 5.0), 2) + } + for i in range(random.randint(1, 3)) + ] + } + + if include_promo: + data["promo_code"] = random.choice(PROMO_CODES) + + # Log the request + promo_status = "WITH" if include_promo else "WITHOUT" + print(f"[TRAFFIC-GEN] Request #{request_count}: Sending request {promo_status} promo_code", flush=True) + + # Make request with tracing + # with tracer.start_as_current_span("checkout_request") as span: + # span.set_attribute("request.number", request_count) + # span.set_attribute("has.promo_code", include_promo) + # if include_promo: + # span.set_attribute("promo.code", data.get("promo_code")) + + try: + start_time = time.time() + response = 
requests.post(CHECKOUT_URL, json=data, timeout=10) + latency = time.time() - start_time + + # span.set_attribute("http.status_code", response.status_code) + # span.set_attribute("response.latency", latency) + + status = "success" if response.status_code == 200 else f"error({response.status_code})" + print(f"[TRAFFIC-GEN] Request #{request_count}: Response status={status}, latency={latency:.2f}s", flush=True) + + except Exception as e: + # span.record_exception(e) + # span.set_status(trace.Status(trace.StatusCode.ERROR, str(e))) + print(f"[TRAFFIC-GEN] Request #{request_count}: Error - {str(e)}", flush=True) + + # Wait 100ms to 200ms second before next request + sleep_time = random.uniform(0.1, 0.2) + time.sleep(sleep_time) + + if __name__ == '__main__': + print("[TRAFFIC-GEN] Starting...", flush=True) + + # Start generating traffic + generate_traffic() +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: traffic-generator +spec: + replicas: 1 + selector: + matchLabels: + app: traffic-generator + template: + metadata: + labels: + app: traffic-generator + spec: + containers: + - name: traffic-generator + image: python:3.11-slim + command: ["/bin/bash", "-c"] + args: + - | + pip install requests && \ + # pip install opentelemetry-api opentelemetry-sdk \ + # opentelemetry-instrumentation-requests \ + # opentelemetry-exporter-otlp-proto-grpc && \ + touch /tmp/ready && \ + python /app/app.py + volumeMounts: + - name: app + mountPath: /app + env: + - name: PYTHONUNBUFFERED + value: "1" + startupProbe: + exec: + command: + - cat + - /tmp/ready + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + successThreshold: 1 + failureThreshold: 12 + resources: + requests: + memory: "64Mi" + cpu: "25m" + limits: + memory: "128Mi" + cpu: "100m" + volumes: + - name: app + secret: + secretName: traffic-generator-app diff --git a/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/api_service.yaml b/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/api_service.yaml new file mode 100644 index 000000000..8872dc2f1 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/api_service.yaml @@ -0,0 +1,32 @@ +apiVersion: v1 +kind: Service +metadata: + name: api-service + namespace: app-200 +spec: + selector: + app: api-service + ports: + - port: 8080 + targetPort: 8080 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: api-service + namespace: app-200 +spec: + replicas: 1 + selector: + matchLabels: + app: api-service + template: + metadata: + labels: + app: api-service + spec: + containers: + - name: api + image: nginx:alpine + ports: + - containerPort: 8080 diff --git a/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/prometheus_mock.yaml b/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/prometheus_mock.yaml new file mode 100644 index 000000000..d77c9045f --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/prometheus_mock.yaml @@ -0,0 +1,324 @@ +apiVersion: v1 +kind: Service +metadata: + name: prometheus-mock + namespace: app-200 +spec: + selector: + app: prometheus-mock + ports: + - port: 9090 + targetPort: 9090 +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-mock-server + namespace: app-200 +data: + server.py: | + from flask import Flask, request, jsonify + import time + import random + + app = Flask(__name__) + + # Mock metrics data with varying latency by dimensions + def generate_metrics_data(): + current_time = int(time.time()) + + # Different latency patterns 
by endpoint and user agent + patterns = [ + # High latency pattern - recommendations endpoint with mobile UA + { + "endpoint": "/api/v2/recommendations", + "user_agent": "MobileApp/2.0", + "region": "eu-west", + "p95_latency": 3.2, + "p50_latency": 1.8, + "request_rate": 50, + }, + { + "endpoint": "/api/v2/recommendations", + "user_agent": "MobileApp/2.1", + "region": "eu-west", + "p95_latency": 3.1, + "p50_latency": 1.7, + "request_rate": 45, + }, + { + "endpoint": "/api/v2/recommendations", + "user_agent": "MobileApp/2.2", + "region": "eu-west", + "p95_latency": 3.3, + "p50_latency": 1.9, + "request_rate": 48, + }, + # Normal latency - same endpoint, different UA + { + "endpoint": "/api/v2/recommendations", + "user_agent": "WebBrowser", + "region": "us-east", + "p95_latency": 0.5, + "p50_latency": 0.2, + "request_rate": 200, + }, + # Normal latency - other endpoints + { + "endpoint": "/api/v2/products", + "user_agent": "MobileApp/2.0", + "region": "eu-west", + "p95_latency": 0.3, + "p50_latency": 0.15, + "request_rate": 100, + }, + { + "endpoint": "/api/v2/users", + "user_agent": "WebBrowser", + "region": "us-east", + "p95_latency": 0.2, + "p50_latency": 0.1, + "request_rate": 150, + }, + ] + + return patterns + + @app.route('/api/v1/metadata', methods=['GET']) + def metadata(): + """Return available metrics metadata""" + return jsonify({ + "status": "success", + "data": { + "http_request_duration_seconds": { + "type": "histogram", + "help": "HTTP request latency", + "labels": ["endpoint", "method", "status_code", "user_agent", "region"] + }, + "http_requests_total": { + "type": "counter", + "help": "Total HTTP requests", + "labels": ["endpoint", "method", "status_code", "user_agent", "region"] + }, + "instance_cpu_usage": { + "type": "gauge", + "help": "CPU usage percentage", + "labels": ["instance", "region"] + } + } + }) + + @app.route('/api/v1/query', methods=['POST']) + def query(): + """Handle instant queries""" + query_str = request.form.get('query', '') + current_time = int(time.time()) + + # Mock responses based on query patterns + if 'http_request_duration_seconds' in query_str: + if 'histogram_quantile' in query_str: + # Return p95 latencies + patterns = generate_metrics_data() + result = [] + + for pattern in patterns: + if 'by' in query_str: # Grouped query + metric = { + "endpoint": pattern["endpoint"], + "user_agent": pattern["user_agent"], + "region": pattern["region"] + } + value = pattern["p95_latency"] if '0.95' in query_str else pattern["p50_latency"] + else: + metric = {} + value = 0.8 # Overall average + + result.append({ + "metric": metric, + "value": [current_time, str(value)] + }) + + return jsonify({ + "status": "success", + "data": { + "resultType": "vector", + "result": result + } + }) + + elif 'topk' in query_str: + # Return top slow endpoints + patterns = generate_metrics_data() + # Sort by latency and return top entries + sorted_patterns = sorted(patterns, key=lambda x: x["p95_latency"], reverse=True) + result = [] + + for i, pattern in enumerate(sorted_patterns[:5]): + result.append({ + "metric": { + "endpoint": pattern["endpoint"], + "user_agent": pattern["user_agent"], + "region": pattern["region"] + }, + "value": [current_time, str(pattern["p95_latency"])] + }) + + return jsonify({ + "status": "success", + "data": { + "resultType": "vector", + "result": result + } + }) + + elif 'instance_cpu_usage' in query_str: + # Return CPU metrics correlating with high latency + result = [ + { + "metric": {"instance": "recommender-eu-1", "region": "eu-west"}, + 
"value": [current_time, "0.95"] + }, + { + "metric": {"instance": "recommender-eu-2", "region": "eu-west"}, + "value": [current_time, "0.93"] + }, + { + "metric": {"instance": "recommender-us-1", "region": "us-east"}, + "value": [current_time, "0.45"] + } + ] + + return jsonify({ + "status": "success", + "data": { + "resultType": "vector", + "result": result + } + }) + + elif 'offset' in query_str: + # Historical comparison - return lower values + patterns = generate_metrics_data() + result = [] + + for pattern in patterns: + # Historical data shows 50% lower latency + historical_value = pattern["p95_latency"] * 0.5 if pattern["endpoint"] == "/api/v2/recommendations" else pattern["p95_latency"] + result.append({ + "metric": { + "endpoint": pattern["endpoint"], + "user_agent": pattern["user_agent"] + }, + "value": [current_time - 86400, str(historical_value)] + }) + + return jsonify({ + "status": "success", + "data": { + "resultType": "vector", + "result": result + } + }) + + elif 'stddev_over_time' in query_str: + # Anomaly detection - flag the high latency pattern + result = [ + { + "metric": { + "endpoint": "/api/v2/recommendations", + "user_agent": "MobileApp/2.0", + "region": "eu-west" + }, + "value": [current_time, "5.2"] # High z-score indicating anomaly + } + ] + + return jsonify({ + "status": "success", + "data": { + "resultType": "vector", + "result": result + } + }) + + # Default empty response + return jsonify({ + "status": "success", + "data": { + "resultType": "vector", + "result": [] + } + }) + + @app.route('/api/v1/query_range', methods=['POST']) + def query_range(): + """Handle range queries""" + # Similar to instant query but with time series data + return query() + + @app.route('/api/v1/series', methods=['GET']) + def series(): + """Return series metadata""" + return jsonify({ + "status": "success", + "data": [ + { + "__name__": "http_request_duration_seconds_bucket", + "endpoint": "/api/v2/recommendations", + "user_agent": "MobileApp/2.0", + "region": "eu-west", + "le": "0.5" + }, + { + "__name__": "http_request_duration_seconds_bucket", + "endpoint": "/api/v2/recommendations", + "user_agent": "MobileApp/2.0", + "region": "eu-west", + "le": "1.0" + } + ] + }) + + @app.route('/api/v1/labels', methods=['GET']) + def labels(): + """Return available labels""" + return jsonify({ + "status": "success", + "data": ["endpoint", "method", "status_code", "user_agent", "region", "instance"] + }) + + if __name__ == '__main__': + app.run(host='0.0.0.0', port=9090) +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus-mock + namespace: app-200 +spec: + replicas: 1 + selector: + matchLabels: + app: prometheus-mock + template: + metadata: + labels: + app: prometheus-mock + spec: + containers: + - name: server + image: python:3.9-slim + command: ["sh", "-c"] + args: + - | + pip install flask + python /app/server.py + ports: + - containerPort: 9090 + volumeMounts: + - name: server-code + mountPath: /app + volumes: + - name: server-code + configMap: + name: prometheus-mock-server diff --git a/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/tempo_mock.yaml b/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/tempo_mock.yaml new file mode 100644 index 000000000..dfce117a5 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/tempo_mock.yaml @@ -0,0 +1,255 @@ +apiVersion: v1 +kind: Service +metadata: + name: tempo-mock + namespace: app-200 +spec: + selector: + app: tempo-mock + ports: + - port: 3100 + targetPort: 3100 +--- 
+apiVersion: v1 +kind: ConfigMap +metadata: + name: tempo-mock-server + namespace: app-200 +data: + server.py: | + from flask import Flask, request, jsonify + import time + import uuid + import random + + app = Flask(__name__) + + def generate_trace_data(): + """Generate realistic trace data with patterns matching the latency issue""" + traces = [] + + # High latency traces for mobile app EU users + for i in range(10): + trace_id = str(uuid.uuid4()).replace('-', '') + traces.append({ + "traceID": trace_id, + "rootServiceName": "api-service", + "rootTraceName": "POST /api/v2/recommendations", + "startTimeUnixNano": str(int(time.time() - 300) * 1000000000), + "durationMs": 3200 + random.randint(-200, 200), # ~3.2s + "spanSet": { + "spans": [ + { + "spanID": f"span{i}1", + "operationName": "POST /api/v2/recommendations", + "duration": 3200000000, # nanoseconds + "attributes": { + "http.method": "POST", + "http.url": "/api/v2/recommendations", + "http.user_agent": f"MobileApp/2.{i % 3}", + "http.status_code": "200", + "user.region": "eu-west", + "customer.tier": "premium" + } + }, + { + "spanID": f"span{i}2", + "operationName": "recommendation-service.get-item", + "duration": 500000000, # 500ms per call + "attributes": { + "db.system": "redis", + "db.operation": "GET", + "service.name": "recommendation-service" + } + }, + { + "spanID": f"span{i}3", + "operationName": "recommendation-service.get-item", + "duration": 500000000, # N+1 pattern + "attributes": { + "db.system": "redis", + "db.operation": "GET", + "service.name": "recommendation-service" + } + }, + { + "spanID": f"span{i}4", + "operationName": "recommendation-service.get-item", + "duration": 500000000, # N+1 pattern + "attributes": { + "db.system": "redis", + "db.operation": "GET", + "service.name": "recommendation-service" + } + } + ] + } + }) + + # Normal latency traces for web users + for i in range(20): + trace_id = str(uuid.uuid4()).replace('-', '') + traces.append({ + "traceID": trace_id, + "rootServiceName": "api-service", + "rootTraceName": "POST /api/v2/recommendations", + "startTimeUnixNano": str(int(time.time() - 300) * 1000000000), + "durationMs": 500 + random.randint(-50, 50), # ~500ms + "spanSet": { + "spans": [ + { + "spanID": f"web{i}1", + "operationName": "POST /api/v2/recommendations", + "duration": 500000000, + "attributes": { + "http.method": "POST", + "http.url": "/api/v2/recommendations", + "http.user_agent": "Mozilla/5.0", + "http.status_code": "200", + "user.region": "us-east", + "customer.tier": "standard" + } + }, + { + "spanID": f"web{i}2", + "operationName": "recommendation-service.batch-get", + "duration": 300000000, # Single batch call + "attributes": { + "db.system": "redis", + "db.operation": "MGET", + "service.name": "recommendation-service" + } + } + ] + } + }) + + return traces + + @app.route('/api/search', methods=['GET']) + def search(): + """Search traces endpoint""" + query = request.args.get('q', '') + limit = int(request.args.get('limit', 20)) + + traces = generate_trace_data() + + # Filter based on query + if 'duration>2s' in query or 'duration>2000ms' in query: + # Return only slow traces + traces = [t for t in traces if t['durationMs'] > 2000] + elif 'duration>500ms' in query: + # Return medium and slow traces + traces = [t for t in traces if t['durationMs'] > 500] + + if 'service.name="api-service"' in query: + # Filter by service (all our traces are from api-service) + pass + + # Return limited results + traces = traces[:limit] + + return jsonify({ + "traces": traces + }) + + 
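+    # The route below loosely mirrors Tempo's trace-by-ID endpoint
+    # (GET /api/traces/<trace_id>) and returns a single canned slow trace so
+    # fetch_tempo_trace_by_id has deterministic data; it is a mock, not the
+    # full Tempo response schema.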
+    @app.route('/api/traces/<trace_id>', methods=['GET'])
+    def get_trace(trace_id):
+        """Get specific trace by ID"""
+        # Generate a detailed trace
+        return jsonify({
+            "traceID": trace_id,
+            "rootServiceName": "api-service",
+            "rootTraceName": "POST /api/v2/recommendations",
+            "startTimeUnixNano": str(int(time.time() - 300) * 1000000000),
+            "durationMs": 3200,
+            "spanSet": {
+                "spans": [
+                    {
+                        "spanID": "root",
+                        "operationName": "POST /api/v2/recommendations",
+                        "startTimeUnixNano": str(int(time.time() - 300) * 1000000000),
+                        "endTimeUnixNano": str(int(time.time() - 297) * 1000000000),
+                        "duration": 3200000000,
+                        "attributes": {
+                            "http.method": "POST",
+                            "http.url": "/api/v2/recommendations",
+                            "http.user_agent": "MobileApp/2.0",
+                            "http.status_code": "200",
+                            "user.region": "eu-west",
+                            "customer.tier": "premium",
+                            "user.id": "user123",
+                            "trace.id": trace_id
+                        }
+                    }
+                ]
+            }
+        })
+
+    @app.route('/api/v2/search/tags', methods=['GET'])
+    def tags():
+        """Return available span tags/attributes"""
+        return jsonify({
+            "scopes": [
+                {
+                    "name": "span",
+                    "tags": [
+                        "http.method",
+                        "http.url",
+                        "http.user_agent",
+                        "http.status_code",
+                        "user.region",
+                        "customer.tier",
+                        "user.id",
+                        "service.name",
+                        "db.system",
+                        "db.operation"
+                    ]
+                },
+                {
+                    "name": "resource",
+                    "tags": [
+                        "service.name",
+                        "k8s.pod.name",
+                        "k8s.namespace.name",
+                        "k8s.deployment.name"
+                    ]
+                }
+            ]
+        })
+
+    if __name__ == '__main__':
+        app.run(host='0.0.0.0', port=3100)
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: tempo-mock
+  namespace: app-200
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: tempo-mock
+  template:
+    metadata:
+      labels:
+        app: tempo-mock
+    spec:
+      containers:
+      - name: server
+        image: python:3.9-slim
+        command: ["sh", "-c"]
+        args:
+        - |
+          pip install flask
+          python /app/server.py
+        ports:
+        - containerPort: 3100
+        volumeMounts:
+        - name: server-code
+          mountPath: /app
+      volumes:
+      - name: server-code
+        configMap:
+          name: tempo-mock-server
diff --git a/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/test_case.yaml
new file mode 100644
index 000000000..892277c7d
--- /dev/null
+++ b/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/test_case.yaml
@@ -0,0 +1,66 @@
+tags:
+  - network
+  - traces
+  - easy
+
+before_test: |
+  # Create namespace
+  kubectl create namespace app-200
+
+  # Deploy a mock Prometheus with latency metrics
+  kubectl apply -f ./prometheus_mock.yaml -n app-200
+
+  # Deploy a mock Tempo with trace data
+  kubectl apply -f ./tempo_mock.yaml -n app-200
+
+  # Deploy an application with varying latency patterns
+  kubectl apply -f ./api_service.yaml -n app-200
+
+  # Wait for services to be ready
+  kubectl wait --for=condition=ready pod -l app=prometheus-mock -n app-200 --timeout=60s
+  kubectl wait --for=condition=ready pod -l app=tempo-mock -n app-200 --timeout=60s
+  kubectl wait --for=condition=ready pod -l app=api-service -n app-200 --timeout=60s
+
+  # Set up port forwarding with unique ports
+  kubectl port-forward -n app-200 service/prometheus-mock 20090:9090 &
+  kubectl port-forward -n app-200 service/tempo-mock 20100:3100 &
+  sleep 5
+
+after_test: |
+  # Kill port-forward processes
+  pkill -f "kubectl port-forward.*app-200.*20090:9090" || true
+  pkill -f "kubectl port-forward.*app-200.*20100:3100" || true
+
+  # Delete namespace
+  kubectl delete namespace app-200 --ignore-not-found
+
+user_prompt: |
+  I'm seeing high latency in my API service.
Some users are reporting slow responses + but not all. Can you investigate what's causing this and identify which specific + subset of requests is affected? + +expected_output: | + The high latency issue is affecting a specific subset of requests: + + **Affected Traffic Pattern:** + - Endpoint: `/api/v2/recommendations` + - User Agent: Mobile app versions 2.0-2.3 + - Geographic Region: EU region users + - Time Pattern: Latency spikes during peak hours (10am-2pm UTC) + + **Root Cause Analysis:** + - The `/api/v2/recommendations` endpoint makes calls to an external recommendation service + - EU users are routed to an EU-based instance that has resource constraints + - Mobile app versions 2.0-2.3 use an inefficient API calling pattern (N+1 queries) + - The combination causes p95 latency to exceed 3 seconds during peak load + + **Evidence:** + - Prometheus metrics show p95 latency of 3.2s for the specific endpoint/user-agent combination + - Only 15% of total traffic is affected (matching the mobile app + EU user segment) + - Tempo traces show multiple sequential calls to the recommendation service + - Resource metrics correlate with high latency periods (CPU at 95% on EU instances) + + **Recommendations:** + 1. Immediate: Scale up EU recommendation service instances + 2. Short-term: Implement request batching to reduce N+1 queries + 3. Long-term: Update mobile app to use more efficient API patterns diff --git a/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/toolsets.yaml b/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/toolsets.yaml new file mode 100644 index 000000000..a7d1e1e9c --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/toolsets.yaml @@ -0,0 +1,15 @@ +# Configure toolsets for this test +toolsets: + prometheus/metrics: + enabled: true + config: + prometheus_url: http://localhost:20090 + prometheus_ssl_enabled: false + tool_calls_return_data: true + + grafana/tempo: + enabled: true + config: + url: http://localhost:20100 + api_key: "" + grafana_datasource_uid: "tempo" diff --git a/tests/llm/utils/commands.py b/tests/llm/utils/commands.py index 0e5ed468d..aa9e09a30 100644 --- a/tests/llm/utils/commands.py +++ b/tests/llm/utils/commands.py @@ -2,6 +2,7 @@ import logging import os import subprocess +import sys import time from contextlib import contextmanager from typing import Dict, Optional @@ -70,6 +71,14 @@ def _invoke_command(command: str, cwd: str) -> str: output = f"{result.stdout}\n{result.stderr}" logging.debug(f"** `{command}`:\n{output}") logging.debug(f"Ran `{command}` in {cwd} with exit code {result.returncode}") + + # Show output if SHOW_SETUP_OUTPUT is set + if os.environ.get("SHOW_SETUP_OUTPUT", "").lower() in ("true", "1"): + if result.stdout: + sys.stderr.write(f"[SETUP OUTPUT] {result.stdout}\n") + if result.stderr: + sys.stderr.write(f"[SETUP STDERR] {result.stderr}\n") + return output except subprocess.CalledProcessError as e: truncated_command = _truncate_script(command) diff --git a/tests/plugins/toolsets/grafana/test_tempo_advanced_tools.py b/tests/plugins/toolsets/grafana/test_tempo_advanced_tools.py new file mode 100644 index 000000000..42c6af2c1 --- /dev/null +++ b/tests/plugins/toolsets/grafana/test_tempo_advanced_tools.py @@ -0,0 +1,359 @@ +from unittest.mock import patch + +import pytest +import yaml + +from holmes.core.tools import ToolResultStatus +from holmes.plugins.toolsets.grafana.toolset_grafana_tempo import ( + AnalyzeTracesByAttributes, + CompareTracePeriods, + FindSlowOperations, + 
GrafanaTempoConfig, + GrafanaTempoToolset, +) + + +@pytest.fixture +def tempo_toolset(): + """Create a GrafanaTempoToolset with mock config""" + toolset = GrafanaTempoToolset() + toolset._grafana_config = GrafanaTempoConfig( + api_key="test-api-key", + url="http://grafana:3000", + grafana_datasource_uid="tempo-uid", + ) + return toolset + + +class TestAnalyzeTracesByAttributes: + def test_analyze_traces_basic(self, tempo_toolset): + tool = AnalyzeTracesByAttributes(toolset=tempo_toolset) + + # Mock trace data + mock_traces = [ + {"traceID": "trace1", "duration": 1500}, + {"traceID": "trace2", "duration": 2000}, + {"traceID": "trace3", "duration": 1800}, + ] + + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.query_tempo_traces", + return_value=mock_traces, + ) as mock_query: + result = tool._invoke( + { + "service_name": "api-service", + "group_by_attributes": ["http.method", "http.status_code"], + "min_duration": "500ms", + "start_datetime": "-1h", + } + ) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify query was called with correct parameters + mock_query.assert_called_once() + call_args = mock_query.call_args + assert 'resource.service.name="api-service"' in call_args[1]["query"] + assert "duration>500ms" in call_args[1]["query"] + + # Check that result contains grouped analysis + result_data = yaml.safe_load(result.data) + assert isinstance(result_data, dict) + + def test_analyze_without_service_filter(self, tempo_toolset): + tool = AnalyzeTracesByAttributes(toolset=tempo_toolset) + + mock_traces = [] + + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.query_tempo_traces", + return_value=mock_traces, + ) as mock_query: + result = tool._invoke( + { + "group_by_attributes": ["user.id", "tenant.id"], + "min_duration": "1s", + } + ) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify service filter was not included + call_args = mock_query.call_args + assert "resource.service.name" not in call_args[1]["query"] + assert "duration>1s" in call_args[1]["query"] + + def test_custom_limit(self, tempo_toolset): + tool = AnalyzeTracesByAttributes(toolset=tempo_toolset) + + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.query_tempo_traces", + return_value=[], + ) as mock_query: + result = tool._invoke( + { + "group_by_attributes": ["endpoint"], + "limit": 500, + } + ) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify custom limit was used + call_args = mock_query.call_args + assert call_args[1]["limit"] == 500 + + def test_error_handling(self, tempo_toolset): + tool = AnalyzeTracesByAttributes(toolset=tempo_toolset) + + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.query_tempo_traces", + side_effect=Exception("API error"), + ): + result = tool._invoke( + { + "group_by_attributes": ["test"], + } + ) + + assert result.status == ToolResultStatus.ERROR + assert "API error" in result.error + + +class TestFindSlowOperations: + def test_find_slow_operations(self, tempo_toolset): + tool = FindSlowOperations(toolset=tempo_toolset) + + mock_traces = [ + {"traceID": "slow1", "duration": 5000}, + {"traceID": "slow2", "duration": 6000}, + ] + + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.query_tempo_traces", + return_value=mock_traces, + ) as mock_query: + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.format_traces_list", + return_value="formatted_traces", + ) as mock_format: + result = tool._invoke( + { + "service_name": 
"backend", + "min_duration": "2s", + "start_datetime": "-30m", + } + ) + + assert result.status == ToolResultStatus.SUCCESS + assert result.data == "formatted_traces" + + # Verify query construction + call_args = mock_query.call_args + assert "duration>2s" in call_args[1]["query"] + assert 'resource.service.name="backend"' in call_args[1]["query"] + + # Verify formatting was called + mock_format.assert_called_once_with(mock_traces) + + def test_find_slow_operations_without_service(self, tempo_toolset): + tool = FindSlowOperations(toolset=tempo_toolset) + + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.query_tempo_traces", + return_value=[], + ) as mock_query: + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.format_traces_list", + return_value="formatted_traces", + ): + result = tool._invoke( + { + "min_duration": "500ms", + } + ) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify only duration filter was applied + call_args = mock_query.call_args + query = call_args[1]["query"] + assert "duration>500ms" in query + assert "resource.service.name" not in query + + def test_missing_required_parameter(self, tempo_toolset): + tool = FindSlowOperations(toolset=tempo_toolset) + + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.get_param_or_raise", + side_effect=ValueError("min_duration is required"), + ): + result = tool._invoke({}) + + assert result.status == ToolResultStatus.ERROR + assert "min_duration is required" in result.error + + +class TestCompareTracePeriods: + def test_compare_periods(self, tempo_toolset): + tool = CompareTracePeriods(toolset=tempo_toolset) + + baseline_traces = [ + {"traceID": "b1", "duration": 1000}, + {"traceID": "b2", "duration": 1200}, + ] + + comparison_traces = [ + {"traceID": "c1", "duration": 1500}, + {"traceID": "c2", "duration": 1600}, + {"traceID": "c3", "duration": 1700}, + ] + + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.process_timestamps_to_int", + side_effect=[(1234567800, 1234567860), (1234567900, 1234567960)], + ): + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.query_tempo_traces", + side_effect=[baseline_traces, comparison_traces], + ) as mock_query: + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.get_base_url", + return_value="http://grafana:3000", + ): + result = tool._invoke( + { + "service_name": "api", + "baseline_start": "-25h", + "baseline_end": "-24h", + "comparison_start": "-1h", + "comparison_end": "now", + } + ) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify two queries were made + assert mock_query.call_count == 2 + + # Check result contains comparison data + result_data = yaml.safe_load(result.data) + assert result_data["baseline_count"] == 2 + assert result_data["comparison_count"] == 3 + assert "baseline_period" in result_data + assert "comparison_period" in result_data + + def test_compare_with_attributes(self, tempo_toolset): + tool = CompareTracePeriods(toolset=tempo_toolset) + + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.process_timestamps_to_int", + side_effect=[(1234567800, 1234567860), (1234567900, 1234567960)], + ): + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.get_base_url", + return_value="http://grafana:3000", + ): + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.query_tempo_traces", + return_value=[], + ) as mock_query: + result = tool._invoke( + { + "service_name": "frontend", + 
"baseline_start": "-48h", + "baseline_end": "-47h", + "comparison_start": "-2h", + "comparison_end": "-1h", + "attributes_to_compare": [ + "http.method", + "http.status_code", + ], + } + ) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify both queries used same service filter + calls = mock_query.call_args_list + for call in calls: + assert 'resource.service.name="frontend"' in call[1]["query"] + + def test_missing_service_name(self, tempo_toolset): + tool = CompareTracePeriods(toolset=tempo_toolset) + + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.get_param_or_raise", + side_effect=ValueError("service_name is required"), + ): + result = tool._invoke( + { + "baseline_start": "-2h", + "baseline_end": "-1h", + "comparison_start": "-1h", + "comparison_end": "now", + } + ) + + assert result.status == ToolResultStatus.ERROR + assert "service_name is required" in result.error + + +class TestToolIntegration: + """Test that tools are properly integrated into the toolset""" + + def test_tools_in_toolset(self): + toolset = GrafanaTempoToolset() + tool_names = [tool.name for tool in toolset.tools] + + # Original tools + assert "fetch_tempo_traces" in tool_names + assert "fetch_tempo_trace_by_id" in tool_names + assert "fetch_tempo_tags" in tool_names + + # New advanced tools + assert "analyze_traces_by_attributes" in tool_names + assert "find_slow_operations" in tool_names + assert "compare_trace_periods" in tool_names + + def test_tool_one_liners(self, tempo_toolset): + # Test that each tool generates appropriate one-liner descriptions + tools = [ + AnalyzeTracesByAttributes(toolset=tempo_toolset), + FindSlowOperations(toolset=tempo_toolset), + CompareTracePeriods(toolset=tempo_toolset), + ] + + for tool in tools: + one_liner = tool.get_parameterized_one_liner({}) + assert "Grafana" in one_liner or "grafana" in one_liner + + +class TestTimeProcessing: + """Test time processing utilities""" + + def test_process_timestamps(self, tempo_toolset): + tool = FindSlowOperations(toolset=tempo_toolset) + + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.process_timestamps_to_int", + return_value=(1234567890, 1234567900), + ) as mock_process: + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.query_tempo_traces", + return_value=[], + ): + tool._invoke( + { + "min_duration": "1s", + "start_datetime": "-1h", + "end_datetime": "now", + } + ) + + # Verify time processing was called + mock_process.assert_called_once() + call_args = mock_process.call_args + assert call_args[0][0] == "-1h" + assert call_args[0][1] == "now" diff --git a/tests/plugins/toolsets/prometheus/test_advanced_tools.py b/tests/plugins/toolsets/prometheus/test_advanced_tools.py new file mode 100644 index 000000000..07e8d0493 --- /dev/null +++ b/tests/plugins/toolsets/prometheus/test_advanced_tools.py @@ -0,0 +1,314 @@ +import json +from unittest.mock import MagicMock, patch + +import pytest + +from holmes.core.tools import ToolResultStatus +from holmes.plugins.toolsets.prometheus.prometheus import ( + AnalyzeMetricByDimensions, + CompareMetricPeriods, + DetectMetricAnomalies, + FindTopMetricValues, + PrometheusConfig, + PrometheusToolset, +) + + +@pytest.fixture +def prometheus_toolset(): + """Create a PrometheusToolset with mock config""" + toolset = PrometheusToolset() + toolset.config = PrometheusConfig( + prometheus_url="http://prometheus:9090/", + prometheus_ssl_enabled=False, + ) + return toolset + + +class TestAnalyzeMetricByDimensions: + def 
test_basic_metric_analysis(self, prometheus_toolset): + tool = AnalyzeMetricByDimensions(toolset=prometheus_toolset) + + # Mock successful response + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "data": { + "result": [ + { + "metric": {"endpoint": "/api/users", "method": "GET"}, + "value": [1234567890, "0.95"], + }, + { + "metric": {"endpoint": "/api/products", "method": "POST"}, + "value": [1234567890, "1.2"], + }, + ] + } + } + + with patch("requests.post", return_value=mock_response) as mock_post: + result = tool._invoke( + { + "metric_name": "http_request_duration_seconds", + "group_by": ["endpoint", "method"], + "filters": {"service": "api"}, + "time_range": "1h", + } + ) + + assert result.status == ToolResultStatus.SUCCESS + assert "result" in json.loads(result.data) + + # Verify the query was constructed correctly + mock_post.assert_called_once() + call_args = mock_post.call_args + assert "query" in call_args[1]["data"] + query = call_args[1]["data"]["query"] + assert "http_request_duration_seconds" in query + assert 'service="api"' in query + assert "endpoint" in query + assert "method" in query + + def test_histogram_percentile_aggregation(self, prometheus_toolset): + tool = AnalyzeMetricByDimensions(toolset=prometheus_toolset) + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = {"data": {"result": []}} + + with patch("requests.post", return_value=mock_response) as mock_post: + result = tool._invoke( + { + "metric_name": "http_request_duration_seconds", + "aggregation": "p95", + "time_range": "5m", + } + ) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify histogram_quantile was used + call_args = mock_post.call_args + query = call_args[1]["data"]["query"] + assert "histogram_quantile(0.95" in query + assert "_bucket" in query + + def test_missing_prometheus_url(self, prometheus_toolset): + tool = AnalyzeMetricByDimensions(toolset=prometheus_toolset) + prometheus_toolset.config = None + + result = tool._invoke({"metric_name": "test_metric"}) + + assert result.status == ToolResultStatus.ERROR + assert "Prometheus is not configured" in result.error + + +class TestFindTopMetricValues: + def test_find_top_values(self, prometheus_toolset): + tool = FindTopMetricValues(toolset=prometheus_toolset) + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "data": { + "result": [ + {"metric": {"endpoint": "/slow"}, "value": [1234567890, "2.5"]}, + {"metric": {"endpoint": "/slower"}, "value": [1234567890, "3.1"]}, + ] + } + } + + with patch("requests.post", return_value=mock_response) as mock_post: + result = tool._invoke( + { + "metric_name": "request_duration", + "group_by_label": "endpoint", + "top_n": 5, + "time_range": "30m", + } + ) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify topk was used + call_args = mock_post.call_args + query = call_args[1]["data"]["query"] + assert "topk(5" in query + assert "endpoint" in query + + def test_histogram_metric_top_values(self, prometheus_toolset): + tool = FindTopMetricValues(toolset=prometheus_toolset) + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = {"data": {"result": []}} + + with patch("requests.post", return_value=mock_response) as mock_post: + result = tool._invoke( + { + "metric_name": "latency_histogram", + "group_by_label": "service", + "percentile": 0.99, + "top_n": 10, + } + ) + + assert 
result.status == ToolResultStatus.SUCCESS + + # Verify histogram_quantile was used for percentile + call_args = mock_post.call_args + query = call_args[1]["data"]["query"] + assert "histogram_quantile(0.99" in query + assert "topk(10" in query + + +class TestCompareMetricPeriods: + def test_compare_periods(self, prometheus_toolset): + tool = CompareMetricPeriods(toolset=prometheus_toolset) + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "data": { + "result": [ + {"metric": {"endpoint": "/api"}, "value": [1234567890, "15.5"]} + ] + } + } + + with patch("requests.post", return_value=mock_response) as mock_post: + result = tool._invoke( + { + "metric_name": "errors_total", + "current_period": "1h", + "comparison_offset": "24h", + "group_by": ["endpoint"], + } + ) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify offset comparison query + call_args = mock_post.call_args + query = call_args[1]["data"]["query"] + assert "offset 24h" in query + assert "errors_total" in query + assert "endpoint" in query + # Should calculate percentage change + assert "*" in query and "100" in query + + def test_compare_without_grouping(self, prometheus_toolset): + tool = CompareMetricPeriods(toolset=prometheus_toolset) + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = {"data": {"result": []}} + + with patch("requests.post", return_value=mock_response) as mock_post: + result = tool._invoke( + { + "metric_name": "cpu_usage", + "current_period": "5m", + "comparison_offset": "1h", + } + ) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify no grouping clause + call_args = mock_post.call_args + query = call_args[1]["data"]["query"] + assert "by (" not in query + + +class TestDetectMetricAnomalies: + def test_anomaly_detection(self, prometheus_toolset): + tool = DetectMetricAnomalies(toolset=prometheus_toolset) + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "data": { + "result": [{"metric": {"pod": "pod-1"}, "value": [1234567890, "5.2"]}] + } + } + + with patch("requests.post", return_value=mock_response) as mock_post: + result = tool._invoke( + { + "metric_name": "response_time", + "sensitivity": 2.5, + "lookback_window": "6h", + "group_by": ["pod"], + } + ) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify z-score calculation + call_args = mock_post.call_args + query = call_args[1]["data"]["query"] + assert "stddev_over_time" in query + assert "avg_over_time" in query + assert "response_time" in query + assert "> 2.5" in query + assert "6h" in query + + def test_default_sensitivity(self, prometheus_toolset): + tool = DetectMetricAnomalies(toolset=prometheus_toolset) + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = {"data": {"result": []}} + + with patch("requests.post", return_value=mock_response) as mock_post: + result = tool._invoke({"metric_name": "error_rate"}) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify default sensitivity of 3 + call_args = mock_post.call_args + query = call_args[1]["data"]["query"] + assert "> 3" in query + + def test_query_failure(self, prometheus_toolset): + tool = DetectMetricAnomalies(toolset=prometheus_toolset) + + mock_response = MagicMock() + mock_response.status_code = 400 + mock_response.text = "Bad query" + + with patch("requests.post", return_value=mock_response): + result = 
tool._invoke({"metric_name": "test_metric"}) + + assert result.status == ToolResultStatus.ERROR + assert "400" in result.error + assert "Bad query" in result.error + + +class TestToolIntegration: + """Test that tools are properly integrated into the toolset""" + + def test_tools_in_toolset(self): + toolset = PrometheusToolset() + tool_names = [tool.name for tool in toolset.tools] + + assert "analyze_metric_by_dimensions" in tool_names + assert "find_top_metric_values" in tool_names + assert "compare_metric_periods" in tool_names + assert "detect_metric_anomalies" in tool_names + + def test_tool_one_liners(self, prometheus_toolset): + # Test that each tool generates appropriate one-liner descriptions + tools = [ + AnalyzeMetricByDimensions(toolset=prometheus_toolset), + FindTopMetricValues(toolset=prometheus_toolset), + CompareMetricPeriods(toolset=prometheus_toolset), + DetectMetricAnomalies(toolset=prometheus_toolset), + ] + + for tool in tools: + one_liner = tool.get_parameterized_one_liner({"metric_name": "test_metric"}) + assert "Prometheus" in one_liner + assert "test_metric" in one_liner