|
| 1 | +package tools |
| 2 | + |
| 3 | +import ( |
| 4 | + "context" |
| 5 | + "encoding/json" |
| 6 | + "fmt" |
| 7 | + "io" |
| 8 | + "strings" |
| 9 | + "time" |
| 10 | + |
| 11 | + "github.com/mark3labs/mcp-go/mcp" |
| 12 | + "github.com/mark3labs/mcp-go/server" |
| 13 | + "github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig" |
| 14 | +) |
| 15 | + |
| 16 | +type TroubleshootKubernetesListTop400500HttpErrorsInPods struct { |
| 17 | + SysdigClient sysdig.ExtendedClientWithResponsesInterface |
| 18 | +} |
| 19 | + |
| 20 | +func NewTroubleshootKubernetesListTop400500HttpErrorsInPods(sysdigClient sysdig.ExtendedClientWithResponsesInterface) *TroubleshootKubernetesListTop400500HttpErrorsInPods { |
| 21 | + return &TroubleshootKubernetesListTop400500HttpErrorsInPods{ |
| 22 | + SysdigClient: sysdigClient, |
| 23 | + } |
| 24 | +} |
| 25 | + |
| 26 | +func (t *TroubleshootKubernetesListTop400500HttpErrorsInPods) RegisterInServer(s *server.MCPServer) { |
| 27 | + tool := mcp.NewTool("troubleshoot_kubernetes_list_top_400_500_http_errors_in_pods", |
| 28 | + mcp.WithDescription("Lists the pods with the highest rate of HTTP 4xx and 5xx errors over a specified time interval, allowing filtering by cluster, namespace, workload type, and workload name."), |
| 29 | + mcp.WithString("interval", mcp.Description("Time interval for the query (e.g. '1h', '30m'). Default is '1h'.")), |
| 30 | + mcp.WithString("cluster_name", mcp.Description("The name of the cluster to filter by.")), |
| 31 | + mcp.WithString("namespace_name", mcp.Description("The name of the namespace to filter by.")), |
| 32 | + mcp.WithString("workload_type", mcp.Description("The type of the workload to filter by.")), |
| 33 | + mcp.WithString("workload_name", mcp.Description("The name of the workload to filter by.")), |
| 34 | + mcp.WithNumber("limit", |
| 35 | + mcp.Description("Maximum number of pods to return."), |
| 36 | + mcp.DefaultNumber(20), |
| 37 | + ), |
| 38 | + mcp.WithOutputSchema[map[string]any](), |
| 39 | + WithRequiredPermissions(), // FIXME(fede): Add the required permissions. It should be `promql.exec` but somehow the token does not have that permission even if you are able to execute queries. |
| 40 | + ) |
| 41 | + s.AddTool(tool, t.handle) |
| 42 | +} |
| 43 | + |
| 44 | +func (t *TroubleshootKubernetesListTop400500HttpErrorsInPods) handle(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) { |
| 45 | + interval := mcp.ParseString(request, "interval", "1h") |
| 46 | + clusterName := mcp.ParseString(request, "cluster_name", "") |
| 47 | + namespaceName := mcp.ParseString(request, "namespace_name", "") |
| 48 | + workloadType := mcp.ParseString(request, "workload_type", "") |
| 49 | + workloadName := mcp.ParseString(request, "workload_name", "") |
| 50 | + limit := mcp.ParseInt(request, "limit", 20) |
| 51 | + |
| 52 | + query, err := buildTopHttpErrorsQuery(interval, limit, clusterName, namespaceName, workloadType, workloadName) |
| 53 | + if err != nil { |
| 54 | + return mcp.NewToolResultErrorFromErr("failed to build query", err), nil |
| 55 | + } |
| 56 | + |
| 57 | + limitQuery := sysdig.LimitQuery(limit) |
| 58 | + params := &sysdig.GetQueryV1Params{ |
| 59 | + Query: query, |
| 60 | + Limit: &limitQuery, |
| 61 | + } |
| 62 | + |
| 63 | + httpResp, err := t.SysdigClient.GetQueryV1(ctx, params) |
| 64 | + if err != nil { |
| 65 | + return mcp.NewToolResultErrorFromErr("failed to execute query", err), nil |
| 66 | + } |
| 67 | + |
| 68 | + if httpResp.StatusCode != 200 { |
| 69 | + bodyBytes, _ := io.ReadAll(httpResp.Body) |
| 70 | + return mcp.NewToolResultErrorf("failed to execute query: status code %d, body: %s", httpResp.StatusCode, string(bodyBytes)), nil |
| 71 | + } |
| 72 | + |
| 73 | + var queryResponse sysdig.QueryResponseV1 |
| 74 | + if err := json.NewDecoder(httpResp.Body).Decode(&queryResponse); err != nil { |
| 75 | + return mcp.NewToolResultErrorFromErr("failed to decode response", err), nil |
| 76 | + } |
| 77 | + |
| 78 | + return mcp.NewToolResultJSON(queryResponse) |
| 79 | +} |
| 80 | + |
// buildTopHttpErrorsQuery returns the PromQL query that ranks pods by their
// HTTP error count over interval, divided by the interval length in seconds.
//
// interval must be accepted by time.ParseDuration and be greater than zero; it
// is used verbatim as the range of the selector. limit is the k of topk. Each
// non-empty filter argument adds a label matcher: clusterName is matched as a
// regular expression (=~), the other three are matched exactly (=).
//
// An error is returned when interval cannot be parsed or is not positive.
//
// Example result:
//
//	topk(20,sum(sum_over_time(sysdig_container_net_http_error_count{kube_cluster_name=~"demo-kube-gke"}[1h])) by (kube_cluster_name, kube_namespace_name, kube_workload_type, kube_workload_name, kube_pod_name)) / 3600.000000
func buildTopHttpErrorsQuery(interval string, limit int, clusterName, namespaceName, workloadType, workloadName string) (string, error) {
	duration, err := time.ParseDuration(interval)
	if err != nil {
		return "", fmt.Errorf("invalid interval format: %w", err)
	}
	// A zero interval would divide by zero and a negative one is not a valid
	// range, so reject both before building the query.
	if duration <= 0 {
		return "", fmt.Errorf("invalid interval %q: must be greater than zero", interval)
	}
	seconds := duration.Seconds()

	// Label values come from the caller, so they are written with %q: quotes
	// and backslashes are escaped and cannot terminate the string literal or
	// inject extra matchers. Plain values render exactly as "%s" in quotes did.
	var filters []string
	if clusterName != "" {
		filters = append(filters, fmt.Sprintf("kube_cluster_name=~%q", clusterName))
	}
	if namespaceName != "" {
		filters = append(filters, fmt.Sprintf("kube_namespace_name=%q", namespaceName))
	}
	if workloadType != "" {
		filters = append(filters, fmt.Sprintf("kube_workload_type=%q", workloadType))
	}
	if workloadName != "" {
		filters = append(filters, fmt.Sprintf("kube_workload_name=%q", workloadName))
	}

	// Joining an empty slice yields "", which leaves an empty selector "{}".
	return fmt.Sprintf("topk(%d,sum(sum_over_time(sysdig_container_net_http_error_count{%s}[%s])) by (kube_cluster_name, kube_namespace_name, kube_workload_type, kube_workload_name, kube_pod_name)) / %f",
		limit, strings.Join(filters, ","), interval, seconds), nil
}
0 commit comments