Skip to content

Commit 61af83a

Browse files
authored
feat: add troubleshoot_kubernetes_list_top_400_500_http_errors_in_pods (#38)
1 parent 1dbac10 commit 61af83a

File tree

5 files changed

+240
-0
lines changed

5 files changed

+240
-0
lines changed

AGENTS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ The handler filters tools dynamically based on `GetMyPermissions` from Sysdig Se
5656
| `kubernetes_list_cronjobs` | `tool_kubernetes_list_cronjobs.go` | Retrieves information from the cronjobs in the cluster. | `promql.exec` | "List all cronjobs in cluster 'prod' and namespace 'default'" |
5757
| `troubleshoot_kubernetes_list_top_unavailable_pods` | `tool_troubleshoot_kubernetes_list_top_unavailable_pods.go` | Shows the top N pods with the highest number of unavailable or unready replicas. | `promql.exec` | "Show the top 20 unavailable pods in cluster 'production'" |
5858
| `troubleshoot_kubernetes_list_top_restarted_pods` | `tool_troubleshoot_kubernetes_list_top_restarted_pods.go` | Lists the pods with the highest number of container restarts. | `promql.exec` | "Show the top 10 pods with the most container restarts in cluster 'production'" |
59+
| `troubleshoot_kubernetes_list_top_400_500_http_errors_in_pods` | `tool_troubleshoot_kubernetes_list_top_400_500_http_errors_in_pods.go` | Lists the pods with the highest rate of HTTP 4xx and 5xx errors over a specified time interval. | `promql.exec` | "Show the top 20 pods with the most HTTP errors in cluster 'production'" |
5960

6061
Every tool has a companion `_test.go` file that exercises request validation, permission metadata, and Sysdig client calls through mocks.
6162
Note that if you add more tools you need to also update this file to reflect that.

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,11 @@ The server dynamically filters the available tools based on the permissions asso
143143
- **Required Permission**: `promql.exec`
144144
- **Sample Prompt**: "Show the top 10 pods with the most container restarts in cluster 'production'"
145145

146+
- **`troubleshoot_kubernetes_list_top_400_500_http_errors_in_pods`**
147+
- **Description**: Lists the pods with the highest rate of HTTP 4xx and 5xx errors over a specified time interval, allowing filtering by cluster, namespace, workload type, and workload name.
148+
- **Required Permission**: `promql.exec`
149+
- **Sample Prompt**: "Show the top 20 pods with the most HTTP errors in cluster 'production'"
150+
146151
## Requirements
147152

148153
- [Go](https://go.dev/doc/install) 1.25 or higher (if running without Docker).

cmd/server/main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ func setupHandler(sysdigClient sysdig.ExtendedClientWithResponsesInterface) *mcp
102102
tools.NewKubernetesListPodContainers(sysdigClient),
103103
tools.NewTroubleshootKubernetesListTopUnavailablePods(sysdigClient),
104104
tools.NewTroubleshootKubernetesListTopRestartedPods(sysdigClient),
105+
tools.NewTroubleshootKubernetesListTop400500HttpErrorsInPods(sysdigClient),
105106
)
106107
return handler
107108
}
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
package tools
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"fmt"
7+
"io"
8+
"strings"
9+
"time"
10+
11+
"github.com/mark3labs/mcp-go/mcp"
12+
"github.com/mark3labs/mcp-go/server"
13+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig"
14+
)
15+
16+
type TroubleshootKubernetesListTop400500HttpErrorsInPods struct {
17+
SysdigClient sysdig.ExtendedClientWithResponsesInterface
18+
}
19+
20+
func NewTroubleshootKubernetesListTop400500HttpErrorsInPods(sysdigClient sysdig.ExtendedClientWithResponsesInterface) *TroubleshootKubernetesListTop400500HttpErrorsInPods {
21+
return &TroubleshootKubernetesListTop400500HttpErrorsInPods{
22+
SysdigClient: sysdigClient,
23+
}
24+
}
25+
26+
func (t *TroubleshootKubernetesListTop400500HttpErrorsInPods) RegisterInServer(s *server.MCPServer) {
27+
tool := mcp.NewTool("troubleshoot_kubernetes_list_top_400_500_http_errors_in_pods",
28+
mcp.WithDescription("Lists the pods with the highest rate of HTTP 4xx and 5xx errors over a specified time interval, allowing filtering by cluster, namespace, workload type, and workload name."),
29+
mcp.WithString("interval", mcp.Description("Time interval for the query (e.g. '1h', '30m'). Default is '1h'.")),
30+
mcp.WithString("cluster_name", mcp.Description("The name of the cluster to filter by.")),
31+
mcp.WithString("namespace_name", mcp.Description("The name of the namespace to filter by.")),
32+
mcp.WithString("workload_type", mcp.Description("The type of the workload to filter by.")),
33+
mcp.WithString("workload_name", mcp.Description("The name of the workload to filter by.")),
34+
mcp.WithNumber("limit",
35+
mcp.Description("Maximum number of pods to return."),
36+
mcp.DefaultNumber(20),
37+
),
38+
mcp.WithOutputSchema[map[string]any](),
39+
WithRequiredPermissions(), // FIXME(fede): Add the required permissions. It should be `promql.exec` but somehow the token does not have that permission even if you are able to execute queries.
40+
)
41+
s.AddTool(tool, t.handle)
42+
}
43+
44+
func (t *TroubleshootKubernetesListTop400500HttpErrorsInPods) handle(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) {
45+
interval := mcp.ParseString(request, "interval", "1h")
46+
clusterName := mcp.ParseString(request, "cluster_name", "")
47+
namespaceName := mcp.ParseString(request, "namespace_name", "")
48+
workloadType := mcp.ParseString(request, "workload_type", "")
49+
workloadName := mcp.ParseString(request, "workload_name", "")
50+
limit := mcp.ParseInt(request, "limit", 20)
51+
52+
query, err := buildTopHttpErrorsQuery(interval, limit, clusterName, namespaceName, workloadType, workloadName)
53+
if err != nil {
54+
return mcp.NewToolResultErrorFromErr("failed to build query", err), nil
55+
}
56+
57+
limitQuery := sysdig.LimitQuery(limit)
58+
params := &sysdig.GetQueryV1Params{
59+
Query: query,
60+
Limit: &limitQuery,
61+
}
62+
63+
httpResp, err := t.SysdigClient.GetQueryV1(ctx, params)
64+
if err != nil {
65+
return mcp.NewToolResultErrorFromErr("failed to execute query", err), nil
66+
}
67+
68+
if httpResp.StatusCode != 200 {
69+
bodyBytes, _ := io.ReadAll(httpResp.Body)
70+
return mcp.NewToolResultErrorf("failed to execute query: status code %d, body: %s", httpResp.StatusCode, string(bodyBytes)), nil
71+
}
72+
73+
var queryResponse sysdig.QueryResponseV1
74+
if err := json.NewDecoder(httpResp.Body).Decode(&queryResponse); err != nil {
75+
return mcp.NewToolResultErrorFromErr("failed to decode response", err), nil
76+
}
77+
78+
return mcp.NewToolResultJSON(queryResponse)
79+
}
80+
81+
// buildTopHttpErrorsQuery renders the PromQL query that ranks pods by their
// HTTP error rate over interval.
//
// interval must parse with time.ParseDuration and be strictly positive; it is
// used verbatim as the range selector and, converted to seconds, as the divisor
// that turns the summed error count into a per-second rate. limit is the k of
// topk. Each non-empty filter becomes a label matcher: clusterName is matched as
// a regular expression (=~), the other three by equality.
//
// Filter values are written with %q so that quotes, backslashes and control
// characters in tool arguments stay inside the string literal instead of
// altering the query.
//
// It returns an error when interval cannot be parsed or is not positive.
func buildTopHttpErrorsQuery(interval string, limit int, clusterName, namespaceName, workloadType, workloadName string) (string, error) {
	duration, err := time.ParseDuration(interval)
	if err != nil {
		return "", fmt.Errorf("invalid interval format: %w", err)
	}
	// A zero interval would divide by zero and a negative one is not a valid
	// range selector, so reject both before building the query.
	if duration <= 0 {
		return "", fmt.Errorf("invalid interval format: %q is not a positive duration", interval)
	}
	seconds := duration.Seconds()

	var filters []string
	if clusterName != "" {
		filters = append(filters, fmt.Sprintf("kube_cluster_name=~%q", clusterName))
	}
	if namespaceName != "" {
		filters = append(filters, fmt.Sprintf("kube_namespace_name=%q", namespaceName))
	}
	if workloadType != "" {
		filters = append(filters, fmt.Sprintf("kube_workload_type=%q", workloadType))
	}
	if workloadName != "" {
		filters = append(filters, fmt.Sprintf("kube_workload_name=%q", workloadName))
	}

	// Example output:
	// topk(20,sum(sum_over_time(sysdig_container_net_http_error_count{kube_cluster_name=~"demo-kube-gke"}[1h])) by (kube_cluster_name, kube_namespace_name, kube_workload_type, kube_workload_name, kube_pod_name)) / 3600.000000
	return fmt.Sprintf("topk(%d,sum(sum_over_time(sysdig_container_net_http_error_count{%s}[%s])) by (kube_cluster_name, kube_namespace_name, kube_workload_type, kube_workload_name, kube_pod_name)) / %f",
		limit, strings.Join(filters, ","), interval, seconds), nil
}
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
package tools_test
2+
3+
import (
4+
"bytes"
5+
"context"
6+
"io"
7+
"net/http"
8+
9+
"github.com/mark3labs/mcp-go/mcp"
10+
"github.com/mark3labs/mcp-go/server"
11+
. "github.com/onsi/ginkgo/v2"
12+
. "github.com/onsi/gomega"
13+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/mcp/tools"
14+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig"
15+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig/mocks"
16+
"go.uber.org/mock/gomock"
17+
)
18+
19+
var _ = Describe("TroubleshootKubernetesListTop400500HttpErrorsInPods Tool", func() {
20+
var (
21+
tool *tools.TroubleshootKubernetesListTop400500HttpErrorsInPods
22+
mockSysdig *mocks.MockExtendedClientWithResponsesInterface
23+
mcpServer *server.MCPServer
24+
ctrl *gomock.Controller
25+
ctx context.Context
26+
)
27+
28+
BeforeEach(func() {
29+
ctrl = gomock.NewController(GinkgoT())
30+
mockSysdig = mocks.NewMockExtendedClientWithResponsesInterface(ctrl)
31+
tool = tools.NewTroubleshootKubernetesListTop400500HttpErrorsInPods(mockSysdig)
32+
mcpServer = server.NewMCPServer("test", "test")
33+
tool.RegisterInServer(mcpServer)
34+
ctx = context.Background()
35+
})
36+
37+
It("should register successfully in the server", func() {
38+
Expect(mcpServer.GetTool("troubleshoot_kubernetes_list_top_400_500_http_errors_in_pods")).NotTo(BeNil())
39+
})
40+
41+
When("listing top http errors", func() {
42+
DescribeTable("it succeeds", func(ctx context.Context, toolName string, request mcp.CallToolRequest, expectedParamsRequested sysdig.GetQueryV1Params) {
43+
mockSysdig.EXPECT().GetQueryV1(gomock.Any(), &expectedParamsRequested).Return(&http.Response{
44+
StatusCode: http.StatusOK,
45+
Body: io.NopCloser(bytes.NewBufferString(`{"status":"success"}`)),
46+
}, nil)
47+
48+
serverTool := mcpServer.GetTool(toolName)
49+
result, err := serverTool.Handler(ctx, request)
50+
Expect(err).NotTo(HaveOccurred())
51+
52+
resultData, ok := result.Content[0].(mcp.TextContent)
53+
Expect(ok).To(BeTrue())
54+
Expect(resultData.Text).To(MatchJSON(`{"status":"success"}`))
55+
},
56+
Entry("default params",
57+
"troubleshoot_kubernetes_list_top_400_500_http_errors_in_pods",
58+
mcp.CallToolRequest{
59+
Params: mcp.CallToolParams{
60+
Name: "troubleshoot_kubernetes_list_top_400_500_http_errors_in_pods",
61+
Arguments: map[string]any{},
62+
},
63+
},
64+
sysdig.GetQueryV1Params{
65+
Query: `topk(20,sum(sum_over_time(sysdig_container_net_http_error_count{}[1h])) by (kube_cluster_name, kube_namespace_name, kube_workload_type, kube_workload_name, kube_pod_name)) / 3600.000000`,
66+
Limit: asPtr(sysdig.LimitQuery(20)),
67+
},
68+
),
69+
Entry("with custom params",
70+
"troubleshoot_kubernetes_list_top_400_500_http_errors_in_pods",
71+
mcp.CallToolRequest{
72+
Params: mcp.CallToolParams{
73+
Name: "troubleshoot_kubernetes_list_top_400_500_http_errors_in_pods",
74+
Arguments: map[string]any{
75+
"interval": "30m",
76+
"cluster_name": "prod-cluster",
77+
"namespace_name": "backend",
78+
"limit": 5,
79+
},
80+
},
81+
},
82+
sysdig.GetQueryV1Params{
83+
Query: `topk(5,sum(sum_over_time(sysdig_container_net_http_error_count{kube_cluster_name=~"prod-cluster",kube_namespace_name="backend"}[30m])) by (kube_cluster_name, kube_namespace_name, kube_workload_type, kube_workload_name, kube_pod_name)) / 1800.000000`,
84+
Limit: asPtr(sysdig.LimitQuery(5)),
85+
},
86+
),
87+
Entry("with all params",
88+
"troubleshoot_kubernetes_list_top_400_500_http_errors_in_pods",
89+
mcp.CallToolRequest{
90+
Params: mcp.CallToolParams{
91+
Name: "troubleshoot_kubernetes_list_top_400_500_http_errors_in_pods",
92+
Arguments: map[string]any{
93+
"interval": "2h",
94+
"cluster_name": "dev",
95+
"namespace_name": "default",
96+
"workload_type": "deployment",
97+
"workload_name": "api",
98+
"limit": 10,
99+
},
100+
},
101+
},
102+
sysdig.GetQueryV1Params{
103+
Query: `topk(10,sum(sum_over_time(sysdig_container_net_http_error_count{kube_cluster_name=~"dev",kube_namespace_name="default",kube_workload_type="deployment",kube_workload_name="api"}[2h])) by (kube_cluster_name, kube_namespace_name, kube_workload_type, kube_workload_name, kube_pod_name)) / 7200.000000`,
104+
Limit: asPtr(sysdig.LimitQuery(10)),
105+
},
106+
),
107+
)
108+
109+
It("returns error for invalid interval", func() {
110+
serverTool := mcpServer.GetTool("troubleshoot_kubernetes_list_top_400_500_http_errors_in_pods")
111+
request := mcp.CallToolRequest{
112+
Params: mcp.CallToolParams{
113+
Name: "troubleshoot_kubernetes_list_top_400_500_http_errors_in_pods",
114+
Arguments: map[string]any{"interval": "invalid"},
115+
},
116+
}
117+
result, err := serverTool.Handler(ctx, request)
118+
Expect(err).NotTo(HaveOccurred())
119+
Expect(result.IsError).To(BeTrue())
120+
Expect(result.Content[0].(mcp.TextContent).Text).To(ContainSubstring("invalid interval format"))
121+
})
122+
})
123+
})

0 commit comments

Comments
 (0)