Skip to content

Commit 96c6de9

Browse files
authored
feat: add troubleshoot_kubernetes_list_top_unavailable_pods tool (#36)
1 parent 91dfba0 commit 96c6de9

File tree

5 files changed

+280
-0
lines changed

5 files changed

+280
-0
lines changed

AGENTS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ The handler filters tools dynamically based on `GetMyPermissions` from Sysdig Se
5454
| `kubernetes_list_workloads` | `tool_kubernetes_list_workloads.go` | Lists Kubernetes workload information. | `promql.exec` | "List all desired workloads in the cluster 'production-gke' and namespace 'default'" |
5555
| `kubernetes_list_pod_containers` | `tool_kubernetes_list_pod_containers.go` | Retrieves information from a particular pod and container. | `promql.exec` | "Show me info for pod 'my-pod' in cluster 'production-gke'" |
5656
| `kubernetes_list_cronjobs` | `tool_kubernetes_list_cronjobs.go` | Retrieves information from the cronjobs in the cluster. | `promql.exec` | "List all cronjobs in cluster 'prod' and namespace 'default'" |
57+
| `troubleshoot_kubernetes_list_top_unavailable_pods` | `tool_troubleshoot_kubernetes_list_top_unavailable_pods.go` | Shows the top N pods with the highest number of unavailable or unready replicas. | `promql.exec` | "Show the top 20 unavailable pods in cluster 'production'" |
5758

5859
Every tool has a companion `_test.go` file that exercises request validation, permission metadata, and Sysdig client calls through mocks.
5960
Note that if you add more tools you need to also update this file to reflect that.

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,11 @@ The server dynamically filters the available tools based on the permissions asso
133133
- **Required Permission**: `promql.exec`
134134
- **Sample Prompt**: "List all cronjobs in cluster 'prod' and namespace 'default'"
135135

136+
- **`troubleshoot_kubernetes_list_top_unavailable_pods`**
137+
- **Description**: Shows the top N pods with the highest number of unavailable or unready replicas in a Kubernetes cluster, ordered from highest to lowest.
138+
- **Required Permission**: `promql.exec`
139+
- **Sample Prompt**: "Show the top 20 unavailable pods in cluster 'production'"
140+
136141
## Requirements
137142

138143
- [Go](https://go.dev/doc/install) 1.25 or higher (if running without Docker).

cmd/server/main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ func setupHandler(sysdigClient sysdig.ExtendedClientWithResponsesInterface) *mcp
100100
tools.NewKubernetesListCronjobs(sysdigClient),
101101
tools.NewKubernetesListWorkloads(sysdigClient),
102102
tools.NewKubernetesListPodContainers(sysdigClient),
103+
tools.NewTroubleshootKubernetesListTopUnavailablePods(sysdigClient),
103104
)
104105
return handler
105106
}
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
package tools
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"fmt"
7+
"io"
8+
"strings"
9+
10+
"github.com/mark3labs/mcp-go/mcp"
11+
"github.com/mark3labs/mcp-go/server"
12+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig"
13+
)
14+
15+
type TroubleshootKubernetesListTopUnavailablePods struct {
16+
SysdigClient sysdig.ExtendedClientWithResponsesInterface
17+
}
18+
19+
func NewTroubleshootKubernetesListTopUnavailablePods(sysdigClient sysdig.ExtendedClientWithResponsesInterface) *TroubleshootKubernetesListTopUnavailablePods {
20+
return &TroubleshootKubernetesListTopUnavailablePods{
21+
SysdigClient: sysdigClient,
22+
}
23+
}
24+
25+
func (t *TroubleshootKubernetesListTopUnavailablePods) RegisterInServer(s *server.MCPServer) {
26+
tool := mcp.NewTool("troubleshoot_kubernetes_list_top_unavailable_pods",
27+
mcp.WithDescription("Shows the top N pods with the highest number of unavailable or unready replicas in a Kubernetes cluster, ordered from highest to lowest."),
28+
mcp.WithString("cluster_name", mcp.Description("The name of the cluster to filter by.")),
29+
mcp.WithString("namespace_name", mcp.Description("The name of the namespace to filter by.")),
30+
mcp.WithString("workload_type", mcp.Description("The type of the workload to filter by.")),
31+
mcp.WithString("workload_name", mcp.Description("The name of the workload to filter by.")),
32+
mcp.WithNumber("limit",
33+
mcp.Description("Maximum number of pods to return."),
34+
mcp.DefaultNumber(20),
35+
),
36+
mcp.WithOutputSchema[map[string]any](),
37+
WithRequiredPermissions(), // FIXME(fede): Add the required permissions. It should be `promql.exec` but somehow the token does not have that permission even if you are able to execute queries.
38+
)
39+
s.AddTool(tool, t.handle)
40+
}
41+
42+
func (t *TroubleshootKubernetesListTopUnavailablePods) handle(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) {
43+
clusterName := mcp.ParseString(request, "cluster_name", "")
44+
namespaceName := mcp.ParseString(request, "namespace_name", "")
45+
workloadType := mcp.ParseString(request, "workload_type", "")
46+
workloadName := mcp.ParseString(request, "workload_name", "")
47+
limit := mcp.ParseInt(request, "limit", 20)
48+
49+
query := buildTopUnavailablePodsQuery(limit, clusterName, namespaceName, workloadType, workloadName)
50+
51+
params := &sysdig.GetQueryV1Params{
52+
Query: query,
53+
}
54+
55+
httpResp, err := t.SysdigClient.GetQueryV1(ctx, params)
56+
if err != nil {
57+
return mcp.NewToolResultErrorFromErr("failed to get top unavailable pods", err), nil
58+
}
59+
60+
if httpResp.StatusCode != 200 {
61+
bodyBytes, _ := io.ReadAll(httpResp.Body)
62+
return mcp.NewToolResultErrorf("failed to get top unavailable pods: status code %d, body: %s", httpResp.StatusCode, string(bodyBytes)), nil
63+
}
64+
65+
var queryResponse sysdig.QueryResponseV1
66+
if err := json.NewDecoder(httpResp.Body).Decode(&queryResponse); err != nil {
67+
return mcp.NewToolResultErrorFromErr("failed to decode response", err), nil
68+
}
69+
70+
return mcp.NewToolResultJSON(queryResponse)
71+
}
72+
73+
// buildTopUnavailablePodsQuery renders the PromQL query that ranks workloads by
// the difference between their desired and ready replica counts.
//
// limit is passed as the k argument of topk. Each of clusterName,
// namespaceName, workloadType and workloadName adds an equality matcher on the
// corresponding kube_* label when non-empty; empty values add no matcher.
//
// Label values are written with %q so that quotes and backslashes in
// caller-supplied values are escaped instead of terminating the PromQL string
// literal. For values without such characters the output is unchanged.
//
// The companion test compares the query text exactly, so the line layout of the
// template must stay in sync with it.
func buildTopUnavailablePodsQuery(limit int, clusterName, namespaceName, workloadType, workloadName string) string {
	labelFilters := []struct {
		label string
		value string
	}{
		{"kube_cluster_name", clusterName},
		{"kube_namespace_name", namespaceName},
		{"kube_workload_type", workloadType},
		{"kube_workload_name", workloadName},
	}

	var baseFilters []string
	for _, f := range labelFilters {
		if f.value == "" {
			continue
		}
		baseFilters = append(baseFilters, fmt.Sprintf("%s=%q", f.label, f.value))
	}

	// Filters for kube_workload_status_desired and kube_daemonset_status_number_ready.
	commonFiltersStr := strings.Join(baseFilters, ",")

	// Filters for kube_workload_status_ready: daemonsets are excluded there
	// because their ready count is taken from kube_daemonset_status_number_ready.
	readyFilters := append([]string{`kube_workload_type!="daemonset"`}, baseFilters...)
	readyFiltersStr := strings.Join(readyFilters, ",")

	return fmt.Sprintf(`topk (
%d,
(
sum by (kube_cluster_name, kube_namespace_name, kube_workload_name) (
kube_workload_status_desired{%s}
)
)
-
(
sum by (kube_cluster_name, kube_namespace_name, kube_workload_name) (
kube_workload_status_ready{%s}
or
kube_daemonset_status_number_ready{%s}
)
or
vector(0)
)
>
0 or vector(0)
)`, limit, commonFiltersStr, readyFiltersStr, commonFiltersStr)
}
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
package tools_test
2+
3+
import (
4+
"bytes"
5+
"context"
6+
"io"
7+
"net/http"
8+
9+
"github.com/mark3labs/mcp-go/mcp"
10+
"github.com/mark3labs/mcp-go/server"
11+
. "github.com/onsi/ginkgo/v2"
12+
. "github.com/onsi/gomega"
13+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/mcp/tools"
14+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig"
15+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig/mocks"
16+
"go.uber.org/mock/gomock"
17+
)
18+
19+
var _ = Describe("TroubleshootKubernetesListTopUnavailablePods Tool", func() {
20+
var (
21+
tool *tools.TroubleshootKubernetesListTopUnavailablePods
22+
mockSysdig *mocks.MockExtendedClientWithResponsesInterface
23+
mcpServer *server.MCPServer
24+
ctrl *gomock.Controller
25+
)
26+
27+
BeforeEach(func() {
28+
ctrl = gomock.NewController(GinkgoT())
29+
mockSysdig = mocks.NewMockExtendedClientWithResponsesInterface(ctrl)
30+
tool = tools.NewTroubleshootKubernetesListTopUnavailablePods(mockSysdig)
31+
mcpServer = server.NewMCPServer("test", "test")
32+
tool.RegisterInServer(mcpServer)
33+
})
34+
35+
It("should register successfully in the server", func() {
36+
Expect(mcpServer.GetTool("troubleshoot_kubernetes_list_top_unavailable_pods")).NotTo(BeNil())
37+
})
38+
39+
When("querying top unavailable pods", func() {
40+
DescribeTable("it succeeds", func(ctx context.Context, toolName string, request mcp.CallToolRequest, expectedParamsRequested sysdig.GetQueryV1Params) {
41+
mockSysdig.EXPECT().GetQueryV1(gomock.Any(), &expectedParamsRequested).Return(&http.Response{
42+
StatusCode: http.StatusOK,
43+
Body: io.NopCloser(bytes.NewBufferString(`{"status":"success"}`)),
44+
}, nil)
45+
46+
serverTool := mcpServer.GetTool(toolName)
47+
result, err := serverTool.Handler(ctx, request)
48+
Expect(err).NotTo(HaveOccurred())
49+
50+
resultData, ok := result.Content[0].(mcp.TextContent)
51+
Expect(ok).To(BeTrue())
52+
Expect(resultData.Text).To(MatchJSON(`{"status":"success"}`))
53+
},
54+
Entry("default params",
55+
"troubleshoot_kubernetes_list_top_unavailable_pods",
56+
mcp.CallToolRequest{
57+
Params: mcp.CallToolParams{
58+
Name: "troubleshoot_kubernetes_list_top_unavailable_pods",
59+
Arguments: map[string]any{},
60+
},
61+
},
62+
sysdig.GetQueryV1Params{
63+
Query: `topk (
64+
20,
65+
(
66+
sum by (kube_cluster_name, kube_namespace_name, kube_workload_name) (
67+
kube_workload_status_desired{}
68+
)
69+
)
70+
-
71+
(
72+
sum by (kube_cluster_name, kube_namespace_name, kube_workload_name) (
73+
kube_workload_status_ready{kube_workload_type!="daemonset"}
74+
or
75+
kube_daemonset_status_number_ready{}
76+
)
77+
or
78+
vector(0)
79+
)
80+
>
81+
0 or vector(0)
82+
)`,
83+
},
84+
),
85+
Entry("with specific limit and cluster",
86+
"troubleshoot_kubernetes_list_top_unavailable_pods",
87+
mcp.CallToolRequest{
88+
Params: mcp.CallToolParams{
89+
Name: "troubleshoot_kubernetes_list_top_unavailable_pods",
90+
Arguments: map[string]any{
91+
"limit": 5,
92+
"cluster_name": "my-cluster",
93+
},
94+
},
95+
},
96+
sysdig.GetQueryV1Params{
97+
Query: `topk (
98+
5,
99+
(
100+
sum by (kube_cluster_name, kube_namespace_name, kube_workload_name) (
101+
kube_workload_status_desired{kube_cluster_name="my-cluster"}
102+
)
103+
)
104+
-
105+
(
106+
sum by (kube_cluster_name, kube_namespace_name, kube_workload_name) (
107+
kube_workload_status_ready{kube_workload_type!="daemonset",kube_cluster_name="my-cluster"}
108+
or
109+
kube_daemonset_status_number_ready{kube_cluster_name="my-cluster"}
110+
)
111+
or
112+
vector(0)
113+
)
114+
>
115+
0 or vector(0)
116+
)`,
117+
},
118+
),
119+
Entry("with all filters",
120+
"troubleshoot_kubernetes_list_top_unavailable_pods",
121+
mcp.CallToolRequest{
122+
Params: mcp.CallToolParams{
123+
Name: "troubleshoot_kubernetes_list_top_unavailable_pods",
124+
Arguments: map[string]any{
125+
"limit": 10,
126+
"cluster_name": "my-cluster",
127+
"namespace_name": "my-ns",
128+
"workload_type": "deployment",
129+
"workload_name": "my-app",
130+
},
131+
},
132+
},
133+
sysdig.GetQueryV1Params{
134+
Query: `topk (
135+
10,
136+
(
137+
sum by (kube_cluster_name, kube_namespace_name, kube_workload_name) (
138+
kube_workload_status_desired{kube_cluster_name="my-cluster",kube_namespace_name="my-ns",kube_workload_type="deployment",kube_workload_name="my-app"}
139+
)
140+
)
141+
-
142+
(
143+
sum by (kube_cluster_name, kube_namespace_name, kube_workload_name) (
144+
kube_workload_status_ready{kube_workload_type!="daemonset",kube_cluster_name="my-cluster",kube_namespace_name="my-ns",kube_workload_type="deployment",kube_workload_name="my-app"}
145+
or
146+
kube_daemonset_status_number_ready{kube_cluster_name="my-cluster",kube_namespace_name="my-ns",kube_workload_type="deployment",kube_workload_name="my-app"}
147+
)
148+
or
149+
vector(0)
150+
)
151+
>
152+
0 or vector(0)
153+
)`,
154+
},
155+
),
156+
)
157+
})
158+
})

0 commit comments

Comments
 (0)