Skip to content

Commit aba00f8

Browse files
authored
feat: add troubleshoot_kubernetes_list_top_network_errors_in_pods (#39)
1 parent 61af83a commit aba00f8

File tree

5 files changed

+259
-0
lines changed

5 files changed

+259
-0
lines changed

AGENTS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ The handler filters tools dynamically based on `GetMyPermissions` from Sysdig Se
5757
| `troubleshoot_kubernetes_list_top_unavailable_pods` | `tool_troubleshoot_kubernetes_list_top_unavailable_pods.go` | Shows the top N pods with the highest number of unavailable or unready replicas. | `promql.exec` | "Show the top 20 unavailable pods in cluster 'production'" |
5858
| `troubleshoot_kubernetes_list_top_restarted_pods` | `tool_troubleshoot_kubernetes_list_top_restarted_pods.go` | Lists the pods with the highest number of container restarts. | `promql.exec` | "Show the top 10 pods with the most container restarts in cluster 'production'" |
5959
| `troubleshoot_kubernetes_list_top_400_500_http_errors_in_pods` | `tool_troubleshoot_kubernetes_list_top_400_500_http_errors_in_pods.go` | Lists the pods with the highest rate of HTTP 4xx and 5xx errors over a specified time interval. | `promql.exec` | "Show the top 20 pods with the most HTTP errors in cluster 'production'" |
60+
| `troubleshoot_kubernetes_list_top_network_errors_in_pods` | `tool_troubleshoot_kubernetes_list_top_network_errors_in_pods.go` | Shows the top network errors by pod over a given interval. | `promql.exec` | "Show the top 10 pods with the most network errors in cluster 'production'" |
6061

6162
Every tool has a companion `_test.go` file that exercises request validation, permission metadata, and Sysdig client calls through mocks.
6263
Note: whenever you add a new tool, you must also update this file so that the table above stays accurate.

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,11 @@ The server dynamically filters the available tools based on the permissions asso
148148
- **Required Permission**: `promql.exec`
149149
- **Sample Prompt**: "Show the top 20 pods with the most HTTP errors in cluster 'production'"
150150

151+
- **`troubleshoot_kubernetes_list_top_network_errors_in_pods`**
152+
- **Description**: Shows the top network errors by pod over a given interval, aggregated by cluster, namespace, workload type, and workload name. The result is an average rate of network errors per second.
153+
- **Required Permission**: `promql.exec`
154+
- **Sample Prompt**: "Show the top 10 pods with the most network errors in cluster 'production'"
155+
151156
## Requirements
152157

153158
- [Go](https://go.dev/doc/install) 1.25 or higher (if running without Docker).

cmd/server/main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ func setupHandler(sysdigClient sysdig.ExtendedClientWithResponsesInterface) *mcp
103103
tools.NewTroubleshootKubernetesListTopUnavailablePods(sysdigClient),
104104
tools.NewTroubleshootKubernetesListTopRestartedPods(sysdigClient),
105105
tools.NewTroubleshootKubernetesListTop400500HttpErrorsInPods(sysdigClient),
106+
tools.NewTroubleshootKubernetesListTopNetworkErrorsInPods(sysdigClient),
106107
)
107108
return handler
108109
}
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
package tools
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"fmt"
7+
"io"
8+
"strings"
9+
"time"
10+
11+
"github.com/mark3labs/mcp-go/mcp"
12+
"github.com/mark3labs/mcp-go/server"
13+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig"
14+
)
15+
16+
type TroubleshootKubernetesListTopNetworkErrorsInPods struct {
17+
SysdigClient sysdig.ExtendedClientWithResponsesInterface
18+
}
19+
20+
func NewTroubleshootKubernetesListTopNetworkErrorsInPods(sysdigClient sysdig.ExtendedClientWithResponsesInterface) *TroubleshootKubernetesListTopNetworkErrorsInPods {
21+
return &TroubleshootKubernetesListTopNetworkErrorsInPods{
22+
SysdigClient: sysdigClient,
23+
}
24+
}
25+
26+
func (t *TroubleshootKubernetesListTopNetworkErrorsInPods) RegisterInServer(s *server.MCPServer) {
27+
tool := mcp.NewTool("troubleshoot_kubernetes_list_top_network_errors_in_pods",
28+
mcp.WithDescription("Shows the top network errors by pod over a given interval, aggregated by cluster, namespace, workload type, and workload name. The result is an average rate of network errors per second."),
29+
mcp.WithString("interval", mcp.Description("Time interval for the query (e.g. '1h', '30m'). Default is '1h'.")),
30+
mcp.WithString("cluster_name", mcp.Description("The name of the cluster to filter by.")),
31+
mcp.WithString("namespace_name", mcp.Description("The name of the namespace to filter by.")),
32+
mcp.WithString("workload_type", mcp.Description("The type of the workload to filter by.")),
33+
mcp.WithString("workload_name", mcp.Description("The name of the workload to filter by.")),
34+
mcp.WithNumber("limit",
35+
mcp.Description("Maximum number of pods to return."),
36+
mcp.DefaultNumber(20),
37+
),
38+
mcp.WithOutputSchema[map[string]any](),
39+
WithRequiredPermissions(), // FIXME(fede): Add the required permissions. It should be `promql.exec` but somehow the token does not have that permission even if you are able to execute queries.
40+
)
41+
s.AddTool(tool, t.handle)
42+
}
43+
44+
func (t *TroubleshootKubernetesListTopNetworkErrorsInPods) handle(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) {
45+
interval := mcp.ParseString(request, "interval", "1h")
46+
clusterName := mcp.ParseString(request, "cluster_name", "")
47+
namespaceName := mcp.ParseString(request, "namespace_name", "")
48+
workloadType := mcp.ParseString(request, "workload_type", "")
49+
workloadName := mcp.ParseString(request, "workload_name", "")
50+
limit := mcp.ParseInt(request, "limit", 20)
51+
52+
query, err := buildTopNetworkErrorsQuery(interval, limit, clusterName, namespaceName, workloadType, workloadName)
53+
if err != nil {
54+
return mcp.NewToolResultErrorFromErr("failed to build query", err), nil
55+
}
56+
57+
limitQuery := sysdig.LimitQuery(limit)
58+
params := &sysdig.GetQueryV1Params{
59+
Query: query,
60+
Limit: &limitQuery,
61+
}
62+
63+
httpResp, err := t.SysdigClient.GetQueryV1(ctx, params)
64+
if err != nil {
65+
return mcp.NewToolResultErrorFromErr("failed to execute query", err), nil
66+
}
67+
68+
if httpResp.StatusCode != 200 {
69+
bodyBytes, _ := io.ReadAll(httpResp.Body)
70+
return mcp.NewToolResultErrorf("failed to execute query: status code %d, body: %s", httpResp.StatusCode, string(bodyBytes)), nil
71+
}
72+
73+
var queryResponse sysdig.QueryResponseV1
74+
if err := json.NewDecoder(httpResp.Body).Decode(&queryResponse); err != nil {
75+
return mcp.NewToolResultErrorFromErr("failed to decode response", err), nil
76+
}
77+
78+
return mcp.NewToolResultJSON(queryResponse)
79+
}
80+
81+
// buildTopNetworkErrorsQuery renders the PromQL query used by the tool.
//
// The query sums sysdig_container_net_error_count over interval, groups the
// result by cluster, namespace, workload type, workload name and pod, keeps
// the top limit series and divides by the interval length in seconds so the
// values are an average per-second rate.
//
// interval must be a strictly positive duration accepted by
// time.ParseDuration (e.g. "30m", "1h"); it is also inserted verbatim as the
// range selector. Empty filter arguments are omitted from the selector.
// clusterName is matched as a regular expression (=~); the other filters are
// exact matches.
//
// It returns an error whose message starts with "invalid interval format"
// when interval cannot be parsed or is not positive.
func buildTopNetworkErrorsQuery(interval string, limit int, clusterName, namespaceName, workloadType, workloadName string) (string, error) {
	duration, err := time.ParseDuration(interval)
	if err != nil {
		return "", fmt.Errorf("invalid interval format: %w", err)
	}
	// A zero interval would divide by zero below and a negative one is not a
	// valid range selector, so reject both before building the query.
	if duration <= 0 {
		return "", fmt.Errorf("invalid interval format: interval must be positive, got %q", interval)
	}
	seconds := duration.Seconds()

	matchers := []struct {
		label, op, value string
	}{
		{"kube_cluster_name", "=~", clusterName},
		{"kube_namespace_name", "=", namespaceName},
		{"kube_workload_type", "=", workloadType},
		{"kube_workload_name", "=", workloadName},
	}

	var filters []string
	for _, m := range matchers {
		if m.value == "" {
			continue
		}
		// %q quotes the value with Go escaping rules, which PromQL string
		// literals share, so quotes or backslashes in user input cannot break
		// out of the label matcher.
		filters = append(filters, fmt.Sprintf("%s%s%q", m.label, m.op, m.value))
	}

	return fmt.Sprintf("topk(%d,sum(sum_over_time(sysdig_container_net_error_count{%s}[%s])) by (kube_cluster_name, kube_namespace_name, kube_workload_type, kube_workload_name, kube_pod_name)) / %f",
		limit, strings.Join(filters, ","), interval, seconds), nil
}
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
package tools_test
2+
3+
import (
4+
"bytes"
5+
"context"
6+
"io"
7+
"net/http"
8+
9+
"github.com/mark3labs/mcp-go/mcp"
10+
"github.com/mark3labs/mcp-go/server"
11+
. "github.com/onsi/ginkgo/v2"
12+
. "github.com/onsi/gomega"
13+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/mcp/tools"
14+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig"
15+
"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig/mocks"
16+
"go.uber.org/mock/gomock"
17+
)
18+
19+
var _ = Describe("TroubleshootKubernetesListTopNetworkErrorsInPods Tool", func() {
20+
var (
21+
tool *tools.TroubleshootKubernetesListTopNetworkErrorsInPods
22+
mockSysdig *mocks.MockExtendedClientWithResponsesInterface
23+
mcpServer *server.MCPServer
24+
ctrl *gomock.Controller
25+
ctx context.Context
26+
)
27+
28+
BeforeEach(func() {
29+
ctrl = gomock.NewController(GinkgoT())
30+
mockSysdig = mocks.NewMockExtendedClientWithResponsesInterface(ctrl)
31+
tool = tools.NewTroubleshootKubernetesListTopNetworkErrorsInPods(mockSysdig)
32+
mcpServer = server.NewMCPServer("test", "test")
33+
tool.RegisterInServer(mcpServer)
34+
ctx = context.Background()
35+
})
36+
37+
It("should register successfully in the server", func() {
38+
Expect(mcpServer.GetTool("troubleshoot_kubernetes_list_top_network_errors_in_pods")).NotTo(BeNil())
39+
})
40+
41+
When("listing top network errors", func() {
42+
DescribeTable("it succeeds", func(ctx context.Context, toolName string, request mcp.CallToolRequest, expectedParamsRequested sysdig.GetQueryV1Params) {
43+
mockSysdig.EXPECT().GetQueryV1(gomock.Any(), &expectedParamsRequested).Return(&http.Response{
44+
StatusCode: http.StatusOK,
45+
Body: io.NopCloser(bytes.NewBufferString(`{"status":"success"}`)),
46+
}, nil)
47+
48+
serverTool := mcpServer.GetTool(toolName)
49+
result, err := serverTool.Handler(ctx, request)
50+
Expect(err).NotTo(HaveOccurred())
51+
52+
resultData, ok := result.Content[0].(mcp.TextContent)
53+
Expect(ok).To(BeTrue())
54+
Expect(resultData.Text).To(MatchJSON(`{"status":"success"}`))
55+
},
56+
Entry("default params",
57+
"troubleshoot_kubernetes_list_top_network_errors_in_pods",
58+
mcp.CallToolRequest{
59+
Params: mcp.CallToolParams{
60+
Name: "troubleshoot_kubernetes_list_top_network_errors_in_pods",
61+
Arguments: map[string]any{},
62+
},
63+
},
64+
sysdig.GetQueryV1Params{
65+
Query: `topk(20,sum(sum_over_time(sysdig_container_net_error_count{}[1h])) by (kube_cluster_name, kube_namespace_name, kube_workload_type, kube_workload_name, kube_pod_name)) / 3600.000000`,
66+
Limit: asPtr(sysdig.LimitQuery(20)),
67+
},
68+
),
69+
Entry("with custom params",
70+
"troubleshoot_kubernetes_list_top_network_errors_in_pods",
71+
mcp.CallToolRequest{
72+
Params: mcp.CallToolParams{
73+
Name: "troubleshoot_kubernetes_list_top_network_errors_in_pods",
74+
Arguments: map[string]any{
75+
"interval": "30m",
76+
"cluster_name": "prod-cluster",
77+
"namespace_name": "backend",
78+
"limit": 5,
79+
},
80+
},
81+
},
82+
sysdig.GetQueryV1Params{
83+
Query: `topk(5,sum(sum_over_time(sysdig_container_net_error_count{kube_cluster_name=~"prod-cluster",kube_namespace_name="backend"}[30m])) by (kube_cluster_name, kube_namespace_name, kube_workload_type, kube_workload_name, kube_pod_name)) / 1800.000000`,
84+
Limit: asPtr(sysdig.LimitQuery(5)),
85+
},
86+
),
87+
Entry("with all params",
88+
"troubleshoot_kubernetes_list_top_network_errors_in_pods",
89+
mcp.CallToolRequest{
90+
Params: mcp.CallToolParams{
91+
Name: "troubleshoot_kubernetes_list_top_network_errors_in_pods",
92+
Arguments: map[string]any{
93+
"interval": "2h",
94+
"cluster_name": "dev",
95+
"namespace_name": "default",
96+
"workload_type": "deployment",
97+
"workload_name": "api",
98+
"limit": 10,
99+
},
100+
},
101+
},
102+
sysdig.GetQueryV1Params{
103+
Query: `topk(10,sum(sum_over_time(sysdig_container_net_error_count{kube_cluster_name=~"dev",kube_namespace_name="default",kube_workload_type="deployment",kube_workload_name="api"}[2h])) by (kube_cluster_name, kube_namespace_name, kube_workload_type, kube_workload_name, kube_pod_name)) / 7200.000000`,
104+
Limit: asPtr(sysdig.LimitQuery(10)),
105+
},
106+
),
107+
)
108+
109+
It("returns error for invalid interval", func() {
110+
serverTool := mcpServer.GetTool("troubleshoot_kubernetes_list_top_network_errors_in_pods")
111+
request := mcp.CallToolRequest{
112+
Params: mcp.CallToolParams{
113+
Name: "troubleshoot_kubernetes_list_top_network_errors_in_pods",
114+
Arguments: map[string]any{"interval": "invalid"},
115+
},
116+
}
117+
result, err := serverTool.Handler(ctx, request)
118+
Expect(err).NotTo(HaveOccurred())
119+
Expect(result.IsError).To(BeTrue())
120+
Expect(result.Content[0].(mcp.TextContent).Text).To(ContainSubstring("invalid interval format"))
121+
})
122+
123+
It("should return an error when sysdig returns an error", func() {
124+
mockSysdig.EXPECT().GetQueryV1(gomock.Any(), gomock.Any()).Return(&http.Response{
125+
StatusCode: http.StatusInternalServerError,
126+
Body: io.NopCloser(bytes.NewBufferString(`{"errors":[{"message":"test error"}]}`)),
127+
}, nil)
128+
129+
request := mcp.CallToolRequest{
130+
Params: mcp.CallToolParams{
131+
Name: "troubleshoot_kubernetes_list_top_network_errors_in_pods",
132+
Arguments: map[string]any{},
133+
},
134+
}
135+
136+
serverTool := mcpServer.GetTool("troubleshoot_kubernetes_list_top_network_errors_in_pods")
137+
res, err := serverTool.Handler(ctx, request)
138+
Expect(err).ToNot(HaveOccurred())
139+
Expect(res.IsError).To(BeTrue())
140+
Expect(res.Content[0].(mcp.TextContent).Text).To(ContainSubstring("failed to execute query: status code 500"))
141+
})
142+
})
143+
})

0 commit comments

Comments
 (0)