Skip to content

Commit 1dbac10

Browse files
authored
feat: add troubleshoot_kubernetes_list_top_restarted_pods tool (#37)
1 parent 96c6de9 commit 1dbac10

File tree

5 files changed

+228
-0
lines changed

5 files changed

+228
-0
lines changed

AGENTS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ The handler filters tools dynamically based on `GetMyPermissions` from Sysdig Se
5555
| `kubernetes_list_pod_containers` | `tool_kubernetes_list_pod_containers.go` | Retrieves information from a particular pod and container. | `promql.exec` | "Show me info for pod 'my-pod' in cluster 'production-gke'" |
5656
| `kubernetes_list_cronjobs` | `tool_kubernetes_list_cronjobs.go` | Retrieves information from the cronjobs in the cluster. | `promql.exec` | "List all cronjobs in cluster 'prod' and namespace 'default'" |
5757
| `troubleshoot_kubernetes_list_top_unavailable_pods` | `tool_troubleshoot_kubernetes_list_top_unavailable_pods.go` | Shows the top N pods with the highest number of unavailable or unready replicas. | `promql.exec` | "Show the top 20 unavailable pods in cluster 'production'" |
58+
| `troubleshoot_kubernetes_list_top_restarted_pods` | `tool_troubleshoot_kubernetes_list_top_restarted_pods.go` | Lists the pods with the highest number of container restarts. | `promql.exec` | "Show the top 10 pods with the most container restarts in cluster 'production'" |
5859

5960
Every tool has a companion `_test.go` file that exercises request validation, permission metadata, and Sysdig client calls through mocks.
6061
Note that if you add more tools you need to also update this file to reflect that.

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,11 @@ The server dynamically filters the available tools based on the permissions asso
138138
- **Required Permission**: `promql.exec`
139139
- **Sample Prompt**: "Show the top 20 unavailable pods in cluster 'production'"
140140

141+
- **`troubleshoot_kubernetes_list_top_restarted_pods`**
142+
- **Description**: Lists the pods with the highest number of container restarts in the specified scope (cluster, namespace, workload, or individual pod). By default, it returns the top 10.
143+
- **Required Permission**: `promql.exec`
144+
- **Sample Prompt**: "Show the top 10 pods with the most container restarts in cluster 'production'"
145+
141146
## Requirements
142147

143148
- [Go](https://go.dev/doc/install) 1.25 or higher (if running without Docker).

cmd/server/main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ func setupHandler(sysdigClient sysdig.ExtendedClientWithResponsesInterface) *mcp
101101
tools.NewKubernetesListWorkloads(sysdigClient),
102102
tools.NewKubernetesListPodContainers(sysdigClient),
103103
tools.NewTroubleshootKubernetesListTopUnavailablePods(sysdigClient),
104+
tools.NewTroubleshootKubernetesListTopRestartedPods(sysdigClient),
104105
)
105106
return handler
106107
}
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
package tools
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"fmt"
7+
"io"
8+
"strings"
9+
10+
"github.com/mark3labs/mcp-go/mcp"
11+
"github.com/mark3labs/mcp-go/server"
12+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig"
13+
)
14+
15+
type TroubleshootKubernetesListTopRestartedPods struct {
16+
SysdigClient sysdig.ExtendedClientWithResponsesInterface
17+
}
18+
19+
func NewTroubleshootKubernetesListTopRestartedPods(sysdigClient sysdig.ExtendedClientWithResponsesInterface) *TroubleshootKubernetesListTopRestartedPods {
20+
return &TroubleshootKubernetesListTopRestartedPods{
21+
SysdigClient: sysdigClient,
22+
}
23+
}
24+
25+
func (t *TroubleshootKubernetesListTopRestartedPods) RegisterInServer(s *server.MCPServer) {
26+
tool := mcp.NewTool("troubleshoot_kubernetes_list_top_restarted_pods",
27+
mcp.WithDescription("Lists the pods with the highest number of container restarts in the specified scope (cluster, namespace, workload, or individual pod). By default, it returns the top 10."),
28+
mcp.WithString("cluster_name", mcp.Description("The name of the cluster to filter by.")),
29+
mcp.WithString("namespace_name", mcp.Description("The name of the namespace to filter by.")),
30+
mcp.WithString("workload_type", mcp.Description("The type of the workload to filter by.")),
31+
mcp.WithString("workload_name", mcp.Description("The name of the workload to filter by.")),
32+
mcp.WithString("pod_name", mcp.Description("The name of the pod to filter by.")),
33+
mcp.WithNumber("limit",
34+
mcp.Description("Maximum number of pods to return."),
35+
mcp.DefaultNumber(10),
36+
),
37+
mcp.WithOutputSchema[map[string]any](),
38+
WithRequiredPermissions(), // FIXME(fede): Add the required permissions. It should be `promql.exec` but somehow the token does not have that permission even if you are able to execute queries.
39+
)
40+
s.AddTool(tool, t.handle)
41+
}
42+
43+
func (t *TroubleshootKubernetesListTopRestartedPods) handle(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) {
44+
clusterName := mcp.ParseString(request, "cluster_name", "")
45+
namespaceName := mcp.ParseString(request, "namespace_name", "")
46+
workloadType := mcp.ParseString(request, "workload_type", "")
47+
workloadName := mcp.ParseString(request, "workload_name", "")
48+
podName := mcp.ParseString(request, "pod_name", "")
49+
limit := mcp.ParseInt(request, "limit", 10)
50+
51+
query := buildKubeTopRestartsQuery(clusterName, namespaceName, workloadType, workloadName, podName, limit)
52+
53+
params := &sysdig.GetQueryV1Params{
54+
Query: query,
55+
}
56+
57+
httpResp, err := t.SysdigClient.GetQueryV1(ctx, params)
58+
if err != nil {
59+
return mcp.NewToolResultErrorFromErr("failed to get pod list", err), nil
60+
}
61+
62+
if httpResp.StatusCode != 200 {
63+
bodyBytes, _ := io.ReadAll(httpResp.Body)
64+
return mcp.NewToolResultErrorf("failed to get pod list: status code %d, body: %s", httpResp.StatusCode, string(bodyBytes)), nil
65+
}
66+
67+
var queryResponse sysdig.QueryResponseV1
68+
if err := json.NewDecoder(httpResp.Body).Decode(&queryResponse); err != nil {
69+
return mcp.NewToolResultErrorFromErr("failed to decode response", err), nil
70+
}
71+
72+
return mcp.NewToolResultJSON(queryResponse)
73+
}
74+
75+
// buildKubeTopRestartsQuery returns the PromQL expression that selects the
// `limit` pods with the highest summed kube_pod_container_status_restarts_total,
// keeping only pods whose sum is greater than zero.
//
// Each non-empty argument adds an equality matcher on its label
// (kube_cluster_name, kube_namespace_name, kube_workload_type,
// kube_workload_name, kube_pod_name), always in that order. When every
// argument is empty no selector braces are emitted at all.
func buildKubeTopRestartsQuery(clusterName, namespaceName, workloadType, workloadName, podName string, limit int) string {
	matchers := []struct{ label, value string }{
		{"kube_cluster_name", clusterName},
		{"kube_namespace_name", namespaceName},
		{"kube_workload_type", workloadType},
		{"kube_workload_name", workloadName},
		{"kube_pod_name", podName},
	}

	filters := make([]string, 0, len(matchers))
	for _, m := range matchers {
		if m.value == "" {
			continue
		}
		// %q quotes the value and escapes embedded quotes and backslashes, so
		// a caller-supplied value cannot terminate the string literal and
		// alter the query. For plain values the output is identical to "%s"
		// wrapped in double quotes.
		filters = append(filters, fmt.Sprintf("%s=%q", m.label, m.value))
	}

	selector := ""
	if len(filters) > 0 {
		selector = "{" + strings.Join(filters, ",") + "}"
	}

	return fmt.Sprintf("topk(%d, sum by(pod, kube_cluster_name, kube_namespace_name) (kube_pod_container_status_restarts_total%s) > 0)", limit, selector)
}
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
package tools_test
2+
3+
import (
4+
"bytes"
5+
"context"
6+
"io"
7+
"net/http"
8+
9+
"github.com/mark3labs/mcp-go/mcp"
10+
"github.com/mark3labs/mcp-go/server"
11+
. "github.com/onsi/ginkgo/v2"
12+
. "github.com/onsi/gomega"
13+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/mcp/tools"
14+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig"
15+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig/mocks"
16+
"go.uber.org/mock/gomock"
17+
)
18+
19+
var _ = Describe("TroubleshootKubernetesListTopRestartedPods Tool", func() {
20+
var (
21+
tool *tools.TroubleshootKubernetesListTopRestartedPods
22+
mockSysdig *mocks.MockExtendedClientWithResponsesInterface
23+
mcpServer *server.MCPServer
24+
ctrl *gomock.Controller
25+
)
26+
27+
BeforeEach(func() {
28+
ctrl = gomock.NewController(GinkgoT())
29+
mockSysdig = mocks.NewMockExtendedClientWithResponsesInterface(ctrl)
30+
tool = tools.NewTroubleshootKubernetesListTopRestartedPods(mockSysdig)
31+
mcpServer = server.NewMCPServer("test", "test")
32+
tool.RegisterInServer(mcpServer)
33+
})
34+
35+
It("should register successfully in the server", func() {
36+
Expect(mcpServer.GetTool("troubleshoot_kubernetes_list_top_restarted_pods")).NotTo(BeNil())
37+
})
38+
39+
When("listing top restarted pods", func() {
40+
DescribeTable("it succeeds", func(ctx context.Context, toolName string, request mcp.CallToolRequest, expectedParamsRequested sysdig.GetQueryV1Params) {
41+
mockSysdig.EXPECT().GetQueryV1(gomock.Any(), &expectedParamsRequested).Return(&http.Response{
42+
StatusCode: http.StatusOK,
43+
Body: io.NopCloser(bytes.NewBufferString(`{"status":"success"}`)),
44+
}, nil)
45+
46+
serverTool := mcpServer.GetTool(toolName)
47+
result, err := serverTool.Handler(ctx, request)
48+
Expect(err).NotTo(HaveOccurred())
49+
50+
resultData, ok := result.Content[0].(mcp.TextContent)
51+
Expect(ok).To(BeTrue())
52+
Expect(resultData.Text).To(MatchJSON(`{"status":"success"}`))
53+
},
54+
Entry(nil,
55+
"troubleshoot_kubernetes_list_top_restarted_pods",
56+
mcp.CallToolRequest{
57+
Params: mcp.CallToolParams{
58+
Name: "troubleshoot_kubernetes_list_top_restarted_pods",
59+
Arguments: map[string]any{},
60+
},
61+
},
62+
sysdig.GetQueryV1Params{
63+
Query: `topk(10, sum by(pod, kube_cluster_name, kube_namespace_name) (kube_pod_container_status_restarts_total) > 0)`,
64+
},
65+
),
66+
Entry(nil,
67+
"troubleshoot_kubernetes_list_top_restarted_pods",
68+
mcp.CallToolRequest{
69+
Params: mcp.CallToolParams{
70+
Name: "troubleshoot_kubernetes_list_top_restarted_pods",
71+
Arguments: map[string]any{"limit": "20"},
72+
},
73+
},
74+
sysdig.GetQueryV1Params{
75+
Query: `topk(20, sum by(pod, kube_cluster_name, kube_namespace_name) (kube_pod_container_status_restarts_total) > 0)`,
76+
},
77+
),
78+
Entry(nil,
79+
"troubleshoot_kubernetes_list_top_restarted_pods",
80+
mcp.CallToolRequest{
81+
Params: mcp.CallToolParams{
82+
Name: "troubleshoot_kubernetes_list_top_restarted_pods",
83+
Arguments: map[string]any{"cluster_name": "my_cluster"},
84+
},
85+
},
86+
sysdig.GetQueryV1Params{
87+
Query: `topk(10, sum by(pod, kube_cluster_name, kube_namespace_name) (kube_pod_container_status_restarts_total{kube_cluster_name="my_cluster"}) > 0)`,
88+
},
89+
),
90+
Entry(nil,
91+
"troubleshoot_kubernetes_list_top_restarted_pods",
92+
mcp.CallToolRequest{
93+
Params: mcp.CallToolParams{
94+
Name: "troubleshoot_kubernetes_list_top_restarted_pods",
95+
Arguments: map[string]any{"namespace_name": "my_namespace"},
96+
},
97+
},
98+
sysdig.GetQueryV1Params{
99+
Query: `topk(10, sum by(pod, kube_cluster_name, kube_namespace_name) (kube_pod_container_status_restarts_total{kube_namespace_name="my_namespace"}) > 0)`,
100+
},
101+
),
102+
Entry(nil,
103+
"troubleshoot_kubernetes_list_top_restarted_pods",
104+
mcp.CallToolRequest{
105+
Params: mcp.CallToolParams{
106+
Name: "troubleshoot_kubernetes_list_top_restarted_pods",
107+
Arguments: map[string]any{
108+
"cluster_name": "my_cluster",
109+
"namespace_name": "my_namespace",
110+
"workload_type": "deployment",
111+
"workload_name": "my_workload",
112+
"pod_name": "my_pod",
113+
},
114+
},
115+
},
116+
sysdig.GetQueryV1Params{
117+
Query: `topk(10, sum by(pod, kube_cluster_name, kube_namespace_name) (kube_pod_container_status_restarts_total{kube_cluster_name="my_cluster",kube_namespace_name="my_namespace",kube_workload_type="deployment",kube_workload_name="my_workload",kube_pod_name="my_pod"}) > 0)`,
118+
},
119+
),
120+
)
121+
})
122+
})

0 commit comments

Comments
 (0)