feat: add troubleshoot_kubernetes_list_top_network_errors_in_pods (#39)

tembleking · web-flow · commit aba00f82de55 · 2025-11-24T17:11:42.000+01:00
diff --git a/AGENTS.md b/AGENTS.md
@@ -57,6 +57,7 @@ The handler filters tools dynamically based on `GetMyPermissions` from Sysdig Se
 | `troubleshoot_kubernetes_list_top_unavailable_pods` | `tool_troubleshoot_kubernetes_list_top_unavailable_pods.go` | Shows the top N pods with the highest number of unavailable or unready replicas. | `promql.exec` | "Show the top 20 unavailable pods in cluster 'production'" |
 | `troubleshoot_kubernetes_list_top_restarted_pods` | `tool_troubleshoot_kubernetes_list_top_restarted_pods.go` | Lists the pods with the highest number of container restarts. | `promql.exec` | "Show the top 10 pods with the most container restarts in cluster 'production'" |
 | `troubleshoot_kubernetes_list_top_400_500_http_errors_in_pods` | `tool_troubleshoot_kubernetes_list_top_400_500_http_errors_in_pods.go` | Lists the pods with the highest rate of HTTP 4xx and 5xx errors over a specified time interval. | `promql.exec` | "Show the top 20 pods with the most HTTP errors in cluster 'production'" |
+| `troubleshoot_kubernetes_list_top_network_errors_in_pods` | `tool_troubleshoot_kubernetes_list_top_network_errors_in_pods.go` | Shows the top network errors by pod over a given interval. | `promql.exec` | "Show the top 10 pods with the most network errors in cluster 'production'" |
 
 Every tool has a companion `_test.go` file that exercises request validation, permission metadata, and Sysdig client calls through mocks.
 Note that if you add more tools you need to also update this file to reflect that.
diff --git a/README.md b/README.md
@@ -148,6 +148,11 @@ The server dynamically filters the available tools based on the permissions asso
   - **Required Permission**: `promql.exec`
   - **Sample Prompt**: "Show the top 20 pods with the most HTTP errors in cluster 'production'"
 
+- **`troubleshoot_kubernetes_list_top_network_errors_in_pods`**
+  - **Description**: Shows the top network errors by pod over a given interval, aggregated by cluster, namespace, workload type, and workload name. The result is an average rate of network errors per second.
+  - **Required Permission**: `promql.exec`
+  - **Sample Prompt**: "Show the top 10 pods with the most network errors in cluster 'production'"
+
 ## Requirements
 
 - [Go](https://go.dev/doc/install) 1.25 or higher (if running without Docker).
diff --git a/cmd/server/main.go b/cmd/server/main.go
@@ -103,6 +103,7 @@ func setupHandler(sysdigClient sysdig.ExtendedClientWithResponsesInterface) *mcp
 		tools.NewTroubleshootKubernetesListTopUnavailablePods(sysdigClient),
 		tools.NewTroubleshootKubernetesListTopRestartedPods(sysdigClient),
 		tools.NewTroubleshootKubernetesListTop400500HttpErrorsInPods(sysdigClient),
+		tools.NewTroubleshootKubernetesListTopNetworkErrorsInPods(sysdigClient),
 	)
 	return handler
 }
diff --git a/internal/infra/mcp/tools/tool_troubleshoot_kubernetes_list_top_network_errors_in_pods.go b/internal/infra/mcp/tools/tool_troubleshoot_kubernetes_list_top_network_errors_in_pods.go
@@ -0,0 +1,109 @@
+package tools
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"strings"
+	"time"
+
+	"github.com/mark3labs/mcp-go/mcp"
+	"github.com/mark3labs/mcp-go/server"
+	"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig"
+)
+
+// TroubleshootKubernetesListTopNetworkErrorsInPods is the MCP tool registered as
+// "troubleshoot_kubernetes_list_top_network_errors_in_pods". It builds a PromQL
+// query over sysdig_container_net_error_count and executes it through SysdigClient.
+type TroubleshootKubernetesListTopNetworkErrorsInPods struct {
+	// SysdigClient is the client whose GetQueryV1 method runs the query.
+	SysdigClient sysdig.ExtendedClientWithResponsesInterface
+}
+
+// NewTroubleshootKubernetesListTopNetworkErrorsInPods returns a tool that runs
+// its queries through sysdigClient.
+func NewTroubleshootKubernetesListTopNetworkErrorsInPods(sysdigClient sysdig.ExtendedClientWithResponsesInterface) *TroubleshootKubernetesListTopNetworkErrorsInPods {
+	tool := new(TroubleshootKubernetesListTopNetworkErrorsInPods)
+	tool.SysdigClient = sysdigClient
+	return tool
+}
+
+// RegisterInServer declares the tool (description, optional string arguments,
+// a numeric limit defaulting to 20, and an output schema) and adds it to s with
+// t.handle as its handler.
+func (t *TroubleshootKubernetesListTopNetworkErrorsInPods) RegisterInServer(s *server.MCPServer) {
+	const toolName = "troubleshoot_kubernetes_list_top_network_errors_in_pods"
+
+	description := mcp.WithDescription("Shows the top network errors by pod over a given interval, aggregated by cluster, namespace, workload type, and workload name. The result is an average rate of network errors per second.")
+
+	// Every argument is optional; handle supplies the defaults.
+	intervalArg := mcp.WithString("interval", mcp.Description("Time interval for the query (e.g. '1h', '30m'). Default is '1h'."))
+	clusterArg := mcp.WithString("cluster_name", mcp.Description("The name of the cluster to filter by."))
+	namespaceArg := mcp.WithString("namespace_name", mcp.Description("The name of the namespace to filter by."))
+	workloadTypeArg := mcp.WithString("workload_type", mcp.Description("The type of the workload to filter by."))
+	workloadNameArg := mcp.WithString("workload_name", mcp.Description("The name of the workload to filter by."))
+	limitArg := mcp.WithNumber("limit",
+		mcp.Description("Maximum number of pods to return."),
+		mcp.DefaultNumber(20),
+	)
+	outputSchema := mcp.WithOutputSchema[map[string]any]()
+
+	// FIXME(fede): Add the required permissions. It should be `promql.exec` but somehow the token does not have that permission even if you are able to execute queries.
+	permissions := WithRequiredPermissions()
+
+	definition := mcp.NewTool(toolName,
+		description,
+		intervalArg,
+		clusterArg,
+		namespaceArg,
+		workloadTypeArg,
+		workloadNameArg,
+		limitArg,
+		outputSchema,
+		permissions,
+	)
+	s.AddTool(definition, t.handle)
+}
+
+// handle serves one call of the tool.
+//
+// It reads the optional arguments from request (interval defaults to "1h",
+// limit to 20, every filter to the empty string), builds the PromQL query,
+// executes it through SysdigClient.GetQueryV1 and returns the decoded response
+// as a JSON tool result.
+//
+// An invalid interval, a client error, a non-200 status or an undecodable body
+// is reported as a tool error result with a nil Go error, so the failure
+// message reaches the MCP caller.
+func (t *TroubleshootKubernetesListTopNetworkErrorsInPods) handle(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) {
+	interval := mcp.ParseString(request, "interval", "1h")
+	clusterName := mcp.ParseString(request, "cluster_name", "")
+	namespaceName := mcp.ParseString(request, "namespace_name", "")
+	workloadType := mcp.ParseString(request, "workload_type", "")
+	workloadName := mcp.ParseString(request, "workload_name", "")
+	limit := mcp.ParseInt(request, "limit", 20)
+
+	query, err := buildTopNetworkErrorsQuery(interval, limit, clusterName, namespaceName, workloadType, workloadName)
+	if err != nil {
+		return mcp.NewToolResultErrorFromErr("failed to build query", err), nil
+	}
+
+	limitQuery := sysdig.LimitQuery(limit)
+	params := &sysdig.GetQueryV1Params{
+		Query: query,
+		Limit: &limitQuery,
+	}
+
+	httpResp, err := t.SysdigClient.GetQueryV1(ctx, params)
+	if err != nil {
+		return mcp.NewToolResultErrorFromErr("failed to execute query", err), nil
+	}
+	// Close the body on every return path below; an unclosed response body
+	// keeps its underlying connection from being released.
+	defer func() { _ = httpResp.Body.Close() }()
+
+	if httpResp.StatusCode != 200 {
+		// Best effort: the body is only used to enrich the error message, so a
+		// read failure is deliberately ignored.
+		bodyBytes, _ := io.ReadAll(httpResp.Body)
+		return mcp.NewToolResultErrorf("failed to execute query: status code %d, body: %s", httpResp.StatusCode, string(bodyBytes)), nil
+	}
+
+	var queryResponse sysdig.QueryResponseV1
+	if err := json.NewDecoder(httpResp.Body).Decode(&queryResponse); err != nil {
+		return mcp.NewToolResultErrorFromErr("failed to decode response", err), nil
+	}
+
+	return mcp.NewToolResultJSON(queryResponse)
+}
+
+// buildTopNetworkErrorsQuery returns the PromQL query that ranks pods by their
+// average network error rate per second over interval.
+//
+// interval must parse with time.ParseDuration and be greater than zero; it is
+// used verbatim as the range selector and, converted to seconds, as the divisor
+// that turns the summed error count into a per-second rate. limit is the k of
+// topk. Each non-empty filter adds a label matcher: clusterName is matched as a
+// regular expression (=~), the other three by equality.
+//
+// It returns an error when interval cannot be parsed or is not positive.
+func buildTopNetworkErrorsQuery(interval string, limit int, clusterName, namespaceName, workloadType, workloadName string) (string, error) {
+	duration, err := time.ParseDuration(interval)
+	if err != nil {
+		return "", fmt.Errorf("invalid interval format: %w", err)
+	}
+	// A zero or negative window is not a usable range selector and would make
+	// the final division divide by zero or by a negative number.
+	if duration <= 0 {
+		return "", fmt.Errorf("invalid interval %q: must be greater than zero", interval)
+	}
+
+	// %q emits a double-quoted, Go-escaped string literal, so a quote or
+	// backslash inside a caller-supplied value cannot terminate the literal and
+	// inject additional matchers. Values without special characters render
+	// exactly as they did with "%s" wrapped in quotes.
+	var filters []string
+	if clusterName != "" {
+		filters = append(filters, fmt.Sprintf("kube_cluster_name=~%q", clusterName))
+	}
+	if namespaceName != "" {
+		filters = append(filters, fmt.Sprintf("kube_namespace_name=%q", namespaceName))
+	}
+	if workloadType != "" {
+		filters = append(filters, fmt.Sprintf("kube_workload_type=%q", workloadType))
+	}
+	if workloadName != "" {
+		filters = append(filters, fmt.Sprintf("kube_workload_name=%q", workloadName))
+	}
+
+	// Joining an empty list yields "", which leaves the selector as "{}".
+	return fmt.Sprintf("topk(%d,sum(sum_over_time(sysdig_container_net_error_count{%s}[%s])) by (kube_cluster_name, kube_namespace_name, kube_workload_type, kube_workload_name, kube_pod_name)) / %f",
+		limit, strings.Join(filters, ","), interval, duration.Seconds()), nil
+}
diff --git a/internal/infra/mcp/tools/tool_troubleshoot_kubernetes_list_top_network_errors_in_pods_test.go b/internal/infra/mcp/tools/tool_troubleshoot_kubernetes_list_top_network_errors_in_pods_test.go
@@ -0,0 +1,143 @@
+package tools_test
+
+import (
+	"bytes"
+	"context"
+	"io"
+	"net/http"
+
+	"github.com/mark3labs/mcp-go/mcp"
+	"github.com/mark3labs/mcp-go/server"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/mcp/tools"
+	"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig"
+	"github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig/mocks"
+	"go.uber.org/mock/gomock"
+)
+
+// Specs for the troubleshoot_kubernetes_list_top_network_errors_in_pods tool:
+// registration in the MCP server, the query sent to the Sysdig client for
+// several argument sets, and error reporting.
+var _ = Describe("TroubleshootKubernetesListTopNetworkErrorsInPods Tool", func() {
+	var (
+		tool       *tools.TroubleshootKubernetesListTopNetworkErrorsInPods
+		mockSysdig *mocks.MockExtendedClientWithResponsesInterface
+		mcpServer  *server.MCPServer
+		ctrl       *gomock.Controller
+		ctx        context.Context
+	)
+
+	// Each spec gets a fresh mock client and a fresh server with the tool registered.
+	BeforeEach(func() {
+		ctrl = gomock.NewController(GinkgoT())
+		mockSysdig = mocks.NewMockExtendedClientWithResponsesInterface(ctrl)
+		tool = tools.NewTroubleshootKubernetesListTopNetworkErrorsInPods(mockSysdig)
+		mcpServer = server.NewMCPServer("test", "test")
+		tool.RegisterInServer(mcpServer)
+		ctx = context.Background()
+	})
+
+	It("should register successfully in the server", func() {
+		Expect(mcpServer.GetTool("troubleshoot_kubernetes_list_top_network_errors_in_pods")).NotTo(BeNil())
+	})
+
+	When("listing top network errors", func() {
+		// Each entry expects GetQueryV1 to be called with the given params and
+		// checks that the mocked response body comes back as the tool's text content.
+		DescribeTable("it succeeds", func(ctx context.Context, toolName string, request mcp.CallToolRequest, expectedParamsRequested sysdig.GetQueryV1Params) {
+			mockSysdig.EXPECT().GetQueryV1(gomock.Any(), &expectedParamsRequested).Return(&http.Response{
+				StatusCode: http.StatusOK,
+				Body:       io.NopCloser(bytes.NewBufferString(`{"status":"success"}`)),
+			}, nil)
+
+			serverTool := mcpServer.GetTool(toolName)
+			result, err := serverTool.Handler(ctx, request)
+			Expect(err).NotTo(HaveOccurred())
+
+			resultData, ok := result.Content[0].(mcp.TextContent)
+			Expect(ok).To(BeTrue())
+			Expect(resultData.Text).To(MatchJSON(`{"status":"success"}`))
+		},
+			// No arguments: interval "1h" (divisor 3600), limit 20, empty selector.
+			Entry("default params",
+				"troubleshoot_kubernetes_list_top_network_errors_in_pods",
+				mcp.CallToolRequest{
+					Params: mcp.CallToolParams{
+						Name:      "troubleshoot_kubernetes_list_top_network_errors_in_pods",
+						Arguments: map[string]any{},
+					},
+				},
+				sysdig.GetQueryV1Params{
+					Query: `topk(20,sum(sum_over_time(sysdig_container_net_error_count{}[1h])) by (kube_cluster_name, kube_namespace_name, kube_workload_type, kube_workload_name, kube_pod_name)) / 3600.000000`,
+					Limit: asPtr(sysdig.LimitQuery(20)),
+				},
+			),
+			// Subset of filters: only cluster and namespace matchers are emitted.
+			Entry("with custom params",
+				"troubleshoot_kubernetes_list_top_network_errors_in_pods",
+				mcp.CallToolRequest{
+					Params: mcp.CallToolParams{
+						Name: "troubleshoot_kubernetes_list_top_network_errors_in_pods",
+						Arguments: map[string]any{
+							"interval":       "30m",
+							"cluster_name":   "prod-cluster",
+							"namespace_name": "backend",
+							"limit":          5,
+						},
+					},
+				},
+				sysdig.GetQueryV1Params{
+					Query: `topk(5,sum(sum_over_time(sysdig_container_net_error_count{kube_cluster_name=~"prod-cluster",kube_namespace_name="backend"}[30m])) by (kube_cluster_name, kube_namespace_name, kube_workload_type, kube_workload_name, kube_pod_name)) / 1800.000000`,
+					Limit: asPtr(sysdig.LimitQuery(5)),
+				},
+			),
+			// Every filter set: all four matchers appear, in a fixed order.
+			Entry("with all params",
+				"troubleshoot_kubernetes_list_top_network_errors_in_pods",
+				mcp.CallToolRequest{
+					Params: mcp.CallToolParams{
+						Name: "troubleshoot_kubernetes_list_top_network_errors_in_pods",
+						Arguments: map[string]any{
+							"interval":       "2h",
+							"cluster_name":   "dev",
+							"namespace_name": "default",
+							"workload_type":  "deployment",
+							"workload_name":  "api",
+							"limit":          10,
+						},
+					},
+				},
+				sysdig.GetQueryV1Params{
+					Query: `topk(10,sum(sum_over_time(sysdig_container_net_error_count{kube_cluster_name=~"dev",kube_namespace_name="default",kube_workload_type="deployment",kube_workload_name="api"}[2h])) by (kube_cluster_name, kube_namespace_name, kube_workload_type, kube_workload_name, kube_pod_name)) / 7200.000000`,
+					Limit: asPtr(sysdig.LimitQuery(10)),
+				},
+			),
+		)
+
+		// No GetQueryV1 expectation is set here: the handler must fail before
+		// reaching the client.
+		It("returns error for invalid interval", func() {
+			serverTool := mcpServer.GetTool("troubleshoot_kubernetes_list_top_network_errors_in_pods")
+			request := mcp.CallToolRequest{
+				Params: mcp.CallToolParams{
+					Name:      "troubleshoot_kubernetes_list_top_network_errors_in_pods",
+					Arguments: map[string]any{"interval": "invalid"},
+				},
+			}
+			result, err := serverTool.Handler(ctx, request)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(result.IsError).To(BeTrue())
+			Expect(result.Content[0].(mcp.TextContent).Text).To(ContainSubstring("invalid interval format"))
+		})
+
+		// A non-200 response is surfaced as a tool error result, not as a Go error.
+		It("should return an error when sysdig returns an error", func() {
+			mockSysdig.EXPECT().GetQueryV1(gomock.Any(), gomock.Any()).Return(&http.Response{
+				StatusCode: http.StatusInternalServerError,
+				Body:       io.NopCloser(bytes.NewBufferString(`{"errors":[{"message":"test error"}]}`)),
+			}, nil)
+
+			request := mcp.CallToolRequest{
+				Params: mcp.CallToolParams{
+					Name:      "troubleshoot_kubernetes_list_top_network_errors_in_pods",
+					Arguments: map[string]any{},
+				},
+			}
+
+			serverTool := mcpServer.GetTool("troubleshoot_kubernetes_list_top_network_errors_in_pods")
+			res, err := serverTool.Handler(ctx, request)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(res.IsError).To(BeTrue())
+			Expect(res.Content[0].(mcp.TextContent).Text).To(ContainSubstring("failed to execute query: status code 500"))
+		})
+	})
+})

Original file line number	Diff line number	Diff line change
`@@ -103,6 +103,7 @@ func setupHandler(sysdigClient sysdig.ExtendedClientWithResponsesInterface) *mcp`
`103`	`103`	`tools.NewTroubleshootKubernetesListTopUnavailablePods(sysdigClient),`
`104`	`104`	`tools.NewTroubleshootKubernetesListTopRestartedPods(sysdigClient),`
`105`	`105`	`tools.NewTroubleshootKubernetesListTop400500HttpErrorsInPods(sysdigClient),`
	`106`	`+ tools.NewTroubleshootKubernetesListTopNetworkErrorsInPods(sysdigClient),`
`106`	`107`	`)`
`107`	`108`	`return handler`
`108`	`109`	`}`