|
| 1 | +package tools |
| 2 | + |
| 3 | +import ( |
| 4 | + "context" |
| 5 | + "fmt" |
| 6 | + "math" |
| 7 | + "sync" |
| 8 | + "time" |
| 9 | + |
| 10 | + "observability-hub/internal/telemetry" |
| 11 | +) |
| 12 | + |
// InvestigateIncidentInput represents the input for the investigate_incident tool.
type InvestigateIncidentInput struct {
	Service string `json:"service"`         // service name to investigate e.g. "proxy", "collectors" (required)
	Hours   int    `json:"hours,omitempty"` // lookback window in hours (default 1, max 168)
	Since   string `json:"since,omitempty"` // ISO 8601 / RFC3339 start time e.g. "2026-03-06T17:00:00Z" — overrides hours when set
}
| 19 | + |
// IncidentReport is the structured output of an investigation.
type IncidentReport struct {
	Service  string `json:"service"`         // service that was investigated
	WindowHr int    `json:"window_hours"`    // effective lookback window in hours (after defaulting/clamping)
	Since    string `json:"since,omitempty"` // the caller's since input, echoed back when provided
	Healthy  bool   `json:"healthy"`         // true when no error logs and no error traces were found

	// Raw backend payloads; populated only when the corresponding query
	// succeeded and returned at least one error entry.
	ErrorLogs   interface{} `json:"error_logs,omitempty"`   // log query result containing error lines
	ErrorTraces interface{} `json:"error_traces,omitempty"` // trace search result containing error spans
	Metrics     interface{} `json:"metrics,omitempty"`      // 5xx error-rate metric, fetched only when unhealthy
	// ErrorSummary is a plain-text recap of what was found (see buildSummary).
	ErrorSummary string `json:"error_summary,omitempty"`
}
| 32 | + |
// InvestigateIncidentHandler orchestrates metrics, logs, and traces to produce an incident report.
// The three query functions are injected as dependencies so backends (and test doubles) can be swapped.
type InvestigateIncidentHandler struct {
	// queryMetrics runs a metrics query string and returns the raw backend result.
	queryMetrics func(ctx context.Context, query string) (interface{}, error)
	// queryLogs runs a log query, returning up to limit entries from the last hours hours.
	queryLogs func(ctx context.Context, query string, limit int, hours int) (interface{}, error)
	// queryTraces searches traces by traceID or query over the last hours hours, up to limit results.
	queryTraces func(ctx context.Context, traceID string, query string, hours int, limit int) (interface{}, error)
}
| 39 | + |
| 40 | +// NewInvestigateIncidentHandler creates a new investigate_incident handler. |
| 41 | +func NewInvestigateIncidentHandler( |
| 42 | + queryMetrics func(ctx context.Context, query string) (interface{}, error), |
| 43 | + queryLogs func(ctx context.Context, query string, limit int, hours int) (interface{}, error), |
| 44 | + queryTraces func(ctx context.Context, traceID string, query string, hours int, limit int) (interface{}, error), |
| 45 | +) *InvestigateIncidentHandler { |
| 46 | + return &InvestigateIncidentHandler{ |
| 47 | + queryMetrics: queryMetrics, |
| 48 | + queryLogs: queryLogs, |
| 49 | + queryTraces: queryTraces, |
| 50 | + } |
| 51 | +} |
| 52 | + |
| 53 | +// Execute runs the investigate_incident tool. |
| 54 | +// It checks for errors in logs and traces in parallel, then fetches supporting |
| 55 | +// metrics if issues are found, and returns a structured incident report. |
| 56 | +func (h *InvestigateIncidentHandler) Execute(ctx context.Context, input InvestigateIncidentInput) (interface{}, error) { |
| 57 | + if input.Service == "" { |
| 58 | + return nil, fmt.Errorf("service is required") |
| 59 | + } |
| 60 | + |
| 61 | + // Resolve hours: since overrides hours when set |
| 62 | + if input.Since != "" { |
| 63 | + t, err := time.Parse(time.RFC3339, input.Since) |
| 64 | + if err != nil { |
| 65 | + return nil, fmt.Errorf("invalid since format, expected RFC3339 e.g. 2026-03-06T17:00:00Z: %w", err) |
| 66 | + } |
| 67 | + computed := int(math.Ceil(time.Since(t).Hours())) |
| 68 | + if computed <= 0 { |
| 69 | + return nil, fmt.Errorf("since must be in the past") |
| 70 | + } |
| 71 | + input.Hours = computed |
| 72 | + } |
| 73 | + if input.Hours <= 0 { |
| 74 | + input.Hours = 1 |
| 75 | + } |
| 76 | + if input.Hours > 168 { |
| 77 | + input.Hours = 168 |
| 78 | + } |
| 79 | + |
| 80 | + telemetry.Info("investigating incident", "service", input.Service, "hours", input.Hours, "since", input.Since) |
| 81 | + |
| 82 | + // Step 1: Check for errors in logs and traces in parallel |
| 83 | + type result struct { |
| 84 | + data interface{} |
| 85 | + err error |
| 86 | + } |
| 87 | + |
| 88 | + logsCh := make(chan result, 1) |
| 89 | + tracesCh := make(chan result, 1) |
| 90 | + |
| 91 | + logQuery := fmt.Sprintf(`{service="%s"} |~ "(?i)error"`, input.Service) |
| 92 | + traceQuery := fmt.Sprintf(`{resource.service.name="%s"} && status=error`, input.Service) |
| 93 | + |
| 94 | + var wg sync.WaitGroup |
| 95 | + wg.Add(2) |
| 96 | + |
| 97 | + go func() { |
| 98 | + defer wg.Done() |
| 99 | + data, err := h.queryLogs(ctx, logQuery, 20, input.Hours) |
| 100 | + logsCh <- result{data, err} |
| 101 | + }() |
| 102 | + |
| 103 | + go func() { |
| 104 | + defer wg.Done() |
| 105 | + data, err := h.queryTraces(ctx, "", traceQuery, input.Hours, 10) |
| 106 | + tracesCh <- result{data, err} |
| 107 | + }() |
| 108 | + |
| 109 | + wg.Wait() |
| 110 | + logsResult := <-logsCh |
| 111 | + tracesResult := <-tracesCh |
| 112 | + |
| 113 | + report := IncidentReport{ |
| 114 | + Service: input.Service, |
| 115 | + WindowHr: input.Hours, |
| 116 | + Since: input.Since, |
| 117 | + Healthy: true, |
| 118 | + } |
| 119 | + |
| 120 | + hasErrors := false |
| 121 | + |
| 122 | + if logsResult.err == nil && hasLogEntries(logsResult.data) { |
| 123 | + report.ErrorLogs = logsResult.data |
| 124 | + hasErrors = true |
| 125 | + } |
| 126 | + if tracesResult.err == nil && hasTraceEntries(tracesResult.data) { |
| 127 | + report.ErrorTraces = tracesResult.data |
| 128 | + hasErrors = true |
| 129 | + } |
| 130 | + |
| 131 | + if !hasErrors { |
| 132 | + telemetry.Info("incident investigation complete: no errors found", "service", input.Service) |
| 133 | + return report, nil |
| 134 | + } |
| 135 | + |
| 136 | + // Step 2: Errors found — fetch supporting metrics |
| 137 | + report.Healthy = false |
| 138 | + errorRateQuery := fmt.Sprintf(`sum(rate(http_requests_total{service="%s",status=~"5.."}[5m])) / sum(rate(http_requests_total{service="%s"}[5m]))`, input.Service, input.Service) |
| 139 | + metricsData, err := h.queryMetrics(ctx, errorRateQuery) |
| 140 | + if err == nil { |
| 141 | + report.Metrics = metricsData |
| 142 | + } |
| 143 | + |
| 144 | + report.ErrorSummary = buildSummary(report) |
| 145 | + telemetry.Info("incident investigation complete: errors detected", "service", input.Service) |
| 146 | + return report, nil |
| 147 | +} |
| 148 | + |
// hasLogEntries returns true if the log result contains at least one log line.
// It expects the Loki-style shape {"data": {"result": [...]}} and answers
// false for any other shape.
func hasLogEntries(data interface{}) bool {
	if top, ok := data.(map[string]interface{}); ok {
		if payload, ok := top["data"].(map[string]interface{}); ok {
			if rows, ok := payload["result"].([]interface{}); ok {
				return len(rows) > 0
			}
		}
	}
	return false
}
| 162 | + |
// hasTraceEntries returns true if the trace search result contains at least
// one trace, i.e. the payload has a non-empty "traces" array.
func hasTraceEntries(data interface{}) bool {
	if top, ok := data.(map[string]interface{}); ok {
		if traces, ok := top["traces"].([]interface{}); ok {
			return len(traces) > 0
		}
	}
	return false
}
| 172 | + |
| 173 | +// buildSummary produces a plain-text summary for the AI to reason over. |
| 174 | +func buildSummary(r IncidentReport) string { |
| 175 | + summary := fmt.Sprintf("Incident detected for service %q over the last %d hour(s).", r.Service, r.WindowHr) |
| 176 | + if r.ErrorLogs != nil { |
| 177 | + summary += " Error log entries found." |
| 178 | + } |
| 179 | + if r.ErrorTraces != nil { |
| 180 | + summary += " Error spans found in distributed traces." |
| 181 | + } |
| 182 | + if r.Metrics != nil { |
| 183 | + summary += " Error rate metrics retrieved for correlation." |
| 184 | + } |
| 185 | + return summary |
| 186 | +} |
0 commit comments