diff --git a/README.md b/README.md
index a81daea1..7a28da12 100644
--- a/README.md
+++ b/README.md
@@ -208,11 +208,12 @@ The following sets of tools are available (all on by default):
-| Toolset | Description                                                                          |
-|---------|--------------------------------------------------------------------------------------|
-| config  | View and manage the current local Kubernetes configuration (kubeconfig)              |
-| core    | Most common tools for Kubernetes management (Pods, Generic Resources, Events, etc.)  |
-| helm    | Tools for managing Helm charts and releases                                          |
+| Toolset        | Description                                                                          |
+|----------------|--------------------------------------------------------------------------------------|
+| config         | View and manage the current local Kubernetes configuration (kubeconfig)              |
+| core           | Most common tools for Kubernetes management (Pods, Generic Resources, Events, etc.)  |
+| helm           | Tools for managing Helm charts and releases                                          |
+| openshift-core | Core OpenShift-specific tools (Node debugging, etc.)                                 |
@@ -336,6 +337,19 @@ In case multi-cluster support is enabled (default) and you have access to multip
+<details>
+
+<summary>openshift-core</summary>
+
+- **nodes_debug_exec** - Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node-level resources. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.
+  - `command` (`array`) **(required)** - Command to execute on the node. All standard debugging utilities from the UBI9 toolbox are available. The host filesystem is mounted at /host - use 'chroot /host <command>' to access node-level resources, or run commands directly in the toolbox environment. Provide each argument as a separate array item (e.g. ['chroot', '/host', 'systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).
+  - `image` (`string`) - Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.
+  - `namespace` (`string`) - Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').
+  - `node` (`string`) **(required)** - Name of the node to debug (e.g. worker-0).
+  - `timeout_seconds` (`integer`) - Maximum time to wait for the command to complete before timing out (optional, defaults to 60 seconds).
+
+</details>
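For orientation, a minimal sketch of how a caller might invoke the tool documented above. The `callTool` helper is hypothetical (any MCP client issuing a `tools/call` request passes the same argument shape), and the node/namespace values are only examples:

```go
package main

// callTool is a hypothetical stand-in for whatever MCP client is used to
// issue a tools/call request; it is not part of this change.
func callTool(name string, args map[string]any) { /* send tools/call */ }

func main() {
	// Inspect kubelet on the node by chrooting into the host filesystem.
	callTool("nodes_debug_exec", map[string]any{
		"node":    "worker-0",
		"command": []any{"chroot", "/host", "systemctl", "status", "kubelet"},
	})

	// Run directly in the toolbox environment, with the optional overrides set.
	callTool("nodes_debug_exec", map[string]any{
		"node":            "worker-0",
		"namespace":       "debug",
		"image":           "registry.access.redhat.com/ubi9/toolbox:latest",
		"timeout_seconds": 120,
		"command":         []any{"journalctl", "-u", "kubelet", "--since", "1 hour ago"},
	})
}
```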
+ diff --git a/internal/tools/update-readme/main.go b/internal/tools/update-readme/main.go index cdf695fc..b425a1d3 100644 --- a/internal/tools/update-readme/main.go +++ b/internal/tools/update-readme/main.go @@ -15,6 +15,7 @@ import ( _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/config" _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/core" _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/helm" + _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/openshift" ) type OpenShift struct{} diff --git a/pkg/config/config_default_overrides.go b/pkg/config/config_default_overrides.go index 70d065bc..05c3b4ac 100644 --- a/pkg/config/config_default_overrides.go +++ b/pkg/config/config_default_overrides.go @@ -3,6 +3,7 @@ package config func defaultOverrides() StaticConfig { return StaticConfig{ // IMPORTANT: this file is used to override default config values in downstream builds. - // This is intentionally left blank. + // OpenShift-specific defaults: add openshift-core toolset + Toolsets: []string{"core", "config", "helm", "openshift-core"}, } } diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index afdde191..caa4f95d 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -167,8 +167,8 @@ func (s *ConfigSuite) TestReadConfigValidPreservesDefaultsForMissingFields() { s.Equalf("table", config.ListOutput, "Expected ListOutput to be table, got %s", config.ListOutput) }) s.Run("toolsets defaulted correctly", func() { - s.Require().Lenf(config.Toolsets, 3, "Expected 3 toolsets, got %d", len(config.Toolsets)) - for _, toolset := range []string{"core", "config", "helm"} { + s.Require().Lenf(config.Toolsets, 4, "Expected 4 toolsets, got %d", len(config.Toolsets)) + for _, toolset := range []string{"core", "config", "helm", "openshift-core"} { s.Containsf(config.Toolsets, toolset, "Expected toolsets to contain %s", toolset) } }) diff --git a/pkg/kubernetes-mcp-server/cmd/root_test.go b/pkg/kubernetes-mcp-server/cmd/root_test.go index 22521667..e5342cd5 100644 --- a/pkg/kubernetes-mcp-server/cmd/root_test.go +++ b/pkg/kubernetes-mcp-server/cmd/root_test.go @@ -137,7 +137,7 @@ func TestToolsets(t *testing.T) { rootCmd := NewMCPServer(ioStreams) rootCmd.SetArgs([]string{"--help"}) o, err := captureOutput(rootCmd.Execute) // --help doesn't use logger/klog, cobra prints directly to stdout - if !strings.Contains(o, "Comma-separated list of MCP toolsets to use (available toolsets: config, core, helm).") { + if !strings.Contains(o, "Comma-separated list of MCP toolsets to use (available toolsets: config, core, helm, openshift-core).") { t.Fatalf("Expected all available toolsets, got %s %v", o, err) } }) @@ -145,7 +145,7 @@ func TestToolsets(t *testing.T) { ioStreams, out := testStream() rootCmd := NewMCPServer(ioStreams) rootCmd.SetArgs([]string{"--version", "--port=1337", "--log-level=1"}) - if err := rootCmd.Execute(); !strings.Contains(out.String(), "- Toolsets: core, config, helm") { + if err := rootCmd.Execute(); !strings.Contains(out.String(), "- Toolsets: core, config, helm, openshift-core") { t.Fatalf("Expected toolsets 'full', got %s %v", out, err) } }) diff --git a/pkg/mcp/nodes_test.go b/pkg/mcp/nodes_test.go index 62ac55e9..e7626a18 100644 --- a/pkg/mcp/nodes_test.go +++ b/pkg/mcp/nodes_test.go @@ -1,14 +1,21 @@ package mcp import ( + "encoding/json" + "io" "net/http" "strconv" + "strings" "testing" "github.com/BurntSushi/toml" "github.com/containers/kubernetes-mcp-server/internal/test" "github.com/mark3labs/mcp-go/mcp" 
"github.com/stretchr/testify/suite" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/serializer" ) type NodesSuite struct { @@ -334,3 +341,249 @@ func (s *NodesSuite) TestNodesStatsSummaryDenied() { func TestNodes(t *testing.T) { suite.Run(t, new(NodesSuite)) } + +// Tests below are for the nodes_debug_exec tool (OpenShift-specific) + +type NodesDebugExecSuite struct { + BaseMcpSuite + mockServer *test.MockServer +} + +func (s *NodesDebugExecSuite) SetupTest() { + s.BaseMcpSuite.SetupTest() + s.mockServer = test.NewMockServer() + s.Cfg.KubeConfig = s.mockServer.KubeconfigFile(s.T()) +} + +func (s *NodesDebugExecSuite) TearDownTest() { + s.BaseMcpSuite.TearDownTest() + if s.mockServer != nil { + s.mockServer.Close() + } +} + +func (s *NodesDebugExecSuite) TestNodesDebugExecTool() { + s.Run("nodes_debug_exec with successful execution", func() { + + var ( + createdPod v1.Pod + deleteCalled bool + ) + const namespace = "debug" + const logOutput = "filesystem repaired" + + scheme := runtime.NewScheme() + _ = v1.AddToScheme(scheme) + codec := serializer.NewCodecFactory(scheme).UniversalDeserializer() + + s.mockServer.Handle(http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { + switch { + case req.URL.Path == "/api": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"kind":"APIVersions","versions":["v1"],"serverAddressByClientCIDRs":[{"clientCIDR":"0.0.0.0/0"}]}`)) + case req.URL.Path == "/apis": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"kind":"APIGroupList","apiVersion":"v1","groups":[]}`)) + case req.URL.Path == "/api/v1": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"kind":"APIResourceList","apiVersion":"v1","resources":[{"name":"pods","singularName":"","namespaced":true,"kind":"Pod","verbs":["get","list","watch","create","update","patch","delete"]}]}`)) + case req.Method == http.MethodPatch && strings.HasPrefix(req.URL.Path, "/api/v1/namespaces/"+namespace+"/pods/"): + // Handle server-side apply (PATCH with fieldManager query param) + body, err := io.ReadAll(req.Body) + if err != nil { + s.T().Fatalf("failed to read apply body: %v", err) + } + created := &v1.Pod{} + if _, _, err = codec.Decode(body, nil, created); err != nil { + s.T().Fatalf("failed to decode apply body: %v", err) + } + createdPod = *created + // Keep the name from the request URL if it was provided + pathParts := strings.Split(req.URL.Path, "/") + if len(pathParts) > 0 { + createdPod.Name = pathParts[len(pathParts)-1] + } + createdPod.Namespace = namespace + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(&createdPod) + case req.Method == http.MethodPost && req.URL.Path == "/api/v1/namespaces/"+namespace+"/pods": + body, err := io.ReadAll(req.Body) + if err != nil { + s.T().Fatalf("failed to read create body: %v", err) + } + created := &v1.Pod{} + if _, _, err = codec.Decode(body, nil, created); err != nil { + s.T().Fatalf("failed to decode create body: %v", err) + } + createdPod = *created + createdPod.ObjectMeta = metav1.ObjectMeta{ + Namespace: namespace, + Name: createdPod.GenerateName + "abc", + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(&createdPod) + case req.Method == http.MethodGet && createdPod.Name != "" && req.URL.Path == "/api/v1/namespaces/"+namespace+"/pods/"+createdPod.Name: + podStatus := createdPod.DeepCopy() + 
podStatus.Status = v1.PodStatus{ + Phase: v1.PodSucceeded, + ContainerStatuses: []v1.ContainerStatus{{ + Name: "debug", + State: v1.ContainerState{Terminated: &v1.ContainerStateTerminated{ + ExitCode: 0, + }}, + }}, + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(podStatus) + case req.Method == http.MethodDelete && createdPod.Name != "" && req.URL.Path == "/api/v1/namespaces/"+namespace+"/pods/"+createdPod.Name: + deleteCalled = true + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(&metav1.Status{Status: "Success"}) + case req.Method == http.MethodGet && createdPod.Name != "" && req.URL.Path == "/api/v1/namespaces/"+namespace+"/pods/"+createdPod.Name+"/log": + w.Header().Set("Content-Type", "text/plain") + _, _ = w.Write([]byte(logOutput)) + } + })) + + s.InitMcpClient() + toolResult, err := s.CallTool("nodes_debug_exec", map[string]interface{}{ + "node": "worker-0", + "namespace": namespace, + "command": []interface{}{"uname", "-a"}, + }) + + s.Run("call succeeds", func() { + s.Nilf(err, "call tool should not error: %v", err) + s.Falsef(toolResult.IsError, "tool should not return error: %v", toolResult.Content) + s.NotEmpty(toolResult.Content, "expected output content") + text := toolResult.Content[0].(mcp.TextContent).Text + s.Equalf(logOutput, text, "unexpected tool output %q", text) + }) + + s.Run("debug pod shaped correctly", func() { + s.Require().NotNil(createdPod.Spec.Containers, "expected containers in debug pod") + s.Require().Len(createdPod.Spec.Containers, 1, "expected single container in debug pod") + container := createdPod.Spec.Containers[0] + expectedCommand := []string{"uname", "-a"} + s.Truef(equalStringSlices(container.Command, expectedCommand), + "unexpected debug command: %v", container.Command) + s.Require().NotNil(container.SecurityContext, "expected security context") + s.Require().NotNil(container.SecurityContext.Privileged, "expected privileged field") + s.Truef(*container.SecurityContext.Privileged, "expected privileged container") + s.Require().NotEmpty(createdPod.Spec.Volumes, "expected volumes on debug pod") + s.Require().NotNil(createdPod.Spec.Volumes[0].HostPath, "expected hostPath volume") + s.Truef(deleteCalled, "expected debug pod to be deleted") + }) + }) +} + +func equalStringSlices(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} + +func (s *NodesDebugExecSuite) TestNodesDebugExecToolNonZeroExit() { + s.Run("nodes_debug_exec with non-zero exit code", func() { + const namespace = "default" + const errorMessage = "failed" + + scheme := runtime.NewScheme() + _ = v1.AddToScheme(scheme) + codec := serializer.NewCodecFactory(scheme).UniversalDeserializer() + + s.mockServer.Handle(http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { + switch { + case req.URL.Path == "/api": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"kind":"APIVersions","versions":["v1"],"serverAddressByClientCIDRs":[{"clientCIDR":"0.0.0.0/0"}]}`)) + case req.URL.Path == "/apis": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"kind":"APIGroupList","apiVersion":"v1","groups":[]}`)) + case req.URL.Path == "/api/v1": + w.Header().Set("Content-Type", "application/json") + _, _ = 
w.Write([]byte(`{"kind":"APIResourceList","apiVersion":"v1","resources":[{"name":"pods","singularName":"","namespaced":true,"kind":"Pod","verbs":["get","list","watch","create","update","patch","delete"]}]}`)) + case req.Method == http.MethodPatch && strings.HasPrefix(req.URL.Path, "/api/v1/namespaces/"+namespace+"/pods/"): + // Handle server-side apply (PATCH with fieldManager query param) + body, err := io.ReadAll(req.Body) + if err != nil { + s.T().Fatalf("failed to read apply body: %v", err) + } + pod := &v1.Pod{} + if _, _, err = codec.Decode(body, nil, pod); err != nil { + s.T().Fatalf("failed to decode apply body: %v", err) + } + // Keep the name from the request URL if it was provided + pathParts := strings.Split(req.URL.Path, "/") + if len(pathParts) > 0 { + pod.Name = pathParts[len(pathParts)-1] + } + pod.Namespace = namespace + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(pod) + case req.Method == http.MethodPost && req.URL.Path == "/api/v1/namespaces/"+namespace+"/pods": + body, err := io.ReadAll(req.Body) + if err != nil { + s.T().Fatalf("failed to read create body: %v", err) + } + pod := &v1.Pod{} + if _, _, err = codec.Decode(body, nil, pod); err != nil { + s.T().Fatalf("failed to decode create body: %v", err) + } + pod.ObjectMeta = metav1.ObjectMeta{Name: pod.GenerateName + "xyz", Namespace: namespace} + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(pod) + case strings.HasPrefix(req.URL.Path, "/api/v1/namespaces/"+namespace+"/pods/") && strings.HasSuffix(req.URL.Path, "/log"): + w.Header().Set("Content-Type", "text/plain") + _, _ = w.Write([]byte(errorMessage)) + case req.Method == http.MethodGet && strings.HasPrefix(req.URL.Path, "/api/v1/namespaces/"+namespace+"/pods/"): + pathParts := strings.Split(req.URL.Path, "/") + podName := pathParts[len(pathParts)-1] + pod := &v1.Pod{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Pod", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + Namespace: namespace, + }, + } + pod.Status = v1.PodStatus{ + Phase: v1.PodSucceeded, + ContainerStatuses: []v1.ContainerStatus{{ + Name: "debug", + State: v1.ContainerState{Terminated: &v1.ContainerStateTerminated{ + ExitCode: 2, + Reason: "Error", + }}, + }}, + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(pod) + } + })) + + s.InitMcpClient() + toolResult, err := s.CallTool("nodes_debug_exec", map[string]interface{}{ + "node": "infra-1", + "command": []interface{}{"journalctl"}, + }) + + s.Nilf(err, "call tool should not error: %v", err) + s.Truef(toolResult.IsError, "expected tool to return error") + text := toolResult.Content[0].(mcp.TextContent).Text + s.Containsf(text, "command exited with code 2", "expected exit code message, got %q", text) + s.Containsf(text, "Error", "expected error reason included, got %q", text) + }) +} + +func TestNodesDebugExec(t *testing.T) { + suite.Run(t, new(NodesDebugExecSuite)) +} diff --git a/pkg/mcp/openshift_modules.go b/pkg/mcp/openshift_modules.go new file mode 100644 index 00000000..84d982f4 --- /dev/null +++ b/pkg/mcp/openshift_modules.go @@ -0,0 +1,3 @@ +package mcp + +import _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/openshift" diff --git a/pkg/mcp/testdata/toolsets-full-tools-multicluster-enum.json b/pkg/mcp/testdata/toolsets-full-tools-multicluster-enum.json index 1551b4c2..474bb1d0 100644 --- a/pkg/mcp/testdata/toolsets-full-tools-multicluster-enum.json +++ 
b/pkg/mcp/testdata/toolsets-full-tools-multicluster-enum.json @@ -195,6 +195,50 @@ }, "name": "namespaces_list" }, + { + "annotations": { + "title": "Nodes: Debug Exec", + "readOnlyHint": false, + "destructiveHint": true, + "idempotentHint": false, + "openWorldHint": true + }, + "description": "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node-level resources. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.", + "inputSchema": { + "type": "object", + "properties": { + "node": { + "description": "Name of the node to debug (e.g. worker-0).", + "type": "string" + }, + "command": { + "description": "Command to execute on the node. All standard debugging utilities from the UBI9 toolbox are available. The host filesystem is mounted at /host - use 'chroot /host ' to access node-level resources, or run commands directly in the toolbox environment. Provide each argument as a separate array item (e.g. ['chroot', '/host', 'systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).", + "items": { + "type": "string" + }, + "type": "array" + }, + "namespace": { + "description": "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').", + "type": "string" + }, + "image": { + "description": "Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.", + "type": "string" + }, + "timeout_seconds": { + "description": "Maximum time to wait for the command to complete before timing out (optional, defaults to 60 seconds).", + "minimum": 1, + "type": "integer" + } + }, + "required": [ + "node", + "command" + ] + }, + "name": "nodes_debug_exec" + }, { "annotations": { "title": "Node: Log", @@ -220,7 +264,7 @@ "type": "string" }, "query": { - "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/\u003clog-file-name\u003e\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")", + "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")", "type": "string" }, "tailLines": { diff --git a/pkg/mcp/testdata/toolsets-full-tools-multicluster.json b/pkg/mcp/testdata/toolsets-full-tools-multicluster.json index 6e85e401..13a956f7 100644 --- a/pkg/mcp/testdata/toolsets-full-tools-multicluster.json +++ b/pkg/mcp/testdata/toolsets-full-tools-multicluster.json @@ -175,6 +175,50 @@ }, "name": "namespaces_list" }, + { + "annotations": { + "title": "Nodes: Debug Exec", + "readOnlyHint": false, + "destructiveHint": true, + "idempotentHint": false, + "openWorldHint": true + }, + "description": "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. 
The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node-level resources. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.", + "inputSchema": { + "type": "object", + "properties": { + "node": { + "description": "Name of the node to debug (e.g. worker-0).", + "type": "string" + }, + "command": { + "description": "Command to execute on the node. All standard debugging utilities from the UBI9 toolbox are available. The host filesystem is mounted at /host - use 'chroot /host ' to access node-level resources, or run commands directly in the toolbox environment. Provide each argument as a separate array item (e.g. ['chroot', '/host', 'systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).", + "items": { + "type": "string" + }, + "type": "array" + }, + "namespace": { + "description": "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').", + "type": "string" + }, + "image": { + "description": "Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.", + "type": "string" + }, + "timeout_seconds": { + "description": "Maximum time to wait for the command to complete before timing out (optional, defaults to 60 seconds).", + "minimum": 1, + "type": "integer" + } + }, + "required": [ + "node", + "command" + ] + }, + "name": "nodes_debug_exec" + }, { "annotations": { "title": "Node: Log", @@ -196,7 +240,7 @@ "type": "string" }, "query": { - "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/\u003clog-file-name\u003e\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")", + "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")", "type": "string" }, "tailLines": { diff --git a/pkg/mcp/testdata/toolsets-full-tools-openshift.json b/pkg/mcp/testdata/toolsets-full-tools-openshift.json index fb24138e..1ac08ca7 100644 --- a/pkg/mcp/testdata/toolsets-full-tools-openshift.json +++ b/pkg/mcp/testdata/toolsets-full-tools-openshift.json @@ -139,6 +139,50 @@ }, "name": "namespaces_list" }, + { + "annotations": { + "title": "Nodes: Debug Exec", + "readOnlyHint": false, + "destructiveHint": true, + "idempotentHint": false, + "openWorldHint": true + }, + "description": "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node-level resources. 
Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.", + "inputSchema": { + "type": "object", + "properties": { + "node": { + "description": "Name of the node to debug (e.g. worker-0).", + "type": "string" + }, + "command": { + "description": "Command to execute on the node. All standard debugging utilities from the UBI9 toolbox are available. The host filesystem is mounted at /host - use 'chroot /host ' to access node-level resources, or run commands directly in the toolbox environment. Provide each argument as a separate array item (e.g. ['chroot', '/host', 'systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).", + "items": { + "type": "string" + }, + "type": "array" + }, + "namespace": { + "description": "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').", + "type": "string" + }, + "image": { + "description": "Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.", + "type": "string" + }, + "timeout_seconds": { + "description": "Maximum time to wait for the command to complete before timing out (optional, defaults to 60 seconds).", + "minimum": 1, + "type": "integer" + } + }, + "required": [ + "node", + "command" + ] + }, + "name": "nodes_debug_exec" + }, { "annotations": { "title": "Node: Log", @@ -156,7 +200,7 @@ "type": "string" }, "query": { - "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/\u003clog-file-name\u003e\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")", + "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")", "type": "string" }, "tailLines": { diff --git a/pkg/mcp/testdata/toolsets-full-tools.json b/pkg/mcp/testdata/toolsets-full-tools.json index 5a4b5112..961fbb55 100644 --- a/pkg/mcp/testdata/toolsets-full-tools.json +++ b/pkg/mcp/testdata/toolsets-full-tools.json @@ -139,6 +139,50 @@ }, "name": "namespaces_list" }, + { + "annotations": { + "title": "Nodes: Debug Exec", + "readOnlyHint": false, + "destructiveHint": true, + "idempotentHint": false, + "openWorldHint": true + }, + "description": "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node-level resources. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.", + "inputSchema": { + "type": "object", + "properties": { + "node": { + "description": "Name of the node to debug (e.g. worker-0).", + "type": "string" + }, + "command": { + "description": "Command to execute on the node. All standard debugging utilities from the UBI9 toolbox are available. 
The host filesystem is mounted at /host - use 'chroot /host ' to access node-level resources, or run commands directly in the toolbox environment. Provide each argument as a separate array item (e.g. ['chroot', '/host', 'systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).", + "items": { + "type": "string" + }, + "type": "array" + }, + "namespace": { + "description": "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').", + "type": "string" + }, + "image": { + "description": "Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.", + "type": "string" + }, + "timeout_seconds": { + "description": "Maximum time to wait for the command to complete before timing out (optional, defaults to 60 seconds).", + "minimum": 1, + "type": "integer" + } + }, + "required": [ + "node", + "command" + ] + }, + "name": "nodes_debug_exec" + }, { "annotations": { "title": "Node: Log", @@ -156,7 +200,7 @@ "type": "string" }, "query": { - "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/\u003clog-file-name\u003e\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")", + "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")", "type": "string" }, "tailLines": { diff --git a/pkg/mcp/testdata/toolsets-openshift-core-tools.json b/pkg/mcp/testdata/toolsets-openshift-core-tools.json new file mode 100644 index 00000000..65f3203d --- /dev/null +++ b/pkg/mcp/testdata/toolsets-openshift-core-tools.json @@ -0,0 +1,46 @@ +[ + { + "annotations": { + "title": "Nodes: Debug Exec", + "readOnlyHint": false, + "destructiveHint": true, + "idempotentHint": false, + "openWorldHint": true + }, + "description": "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). Commands execute in a chroot of the host filesystem, providing full access to node-level diagnostics. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.", + "inputSchema": { + "type": "object", + "properties": { + "node": { + "description": "Name of the node to debug (e.g. worker-0).", + "type": "string" + }, + "command": { + "description": "Command to execute on the node via chroot. All standard debugging utilities are available including systemctl, journalctl, ss, ip, ping, traceroute, nmap, ps, top, lsof, strace, find, tar, rsync, gdb, and more. Provide each argument as a separate array item (e.g. 
['systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).", + "items": { + "type": "string" + }, + "type": "array" + }, + "namespace": { + "description": "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').", + "type": "string" + }, + "image": { + "description": "Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.", + "type": "string" + }, + "timeout_seconds": { + "description": "Maximum time to wait for the command to complete before timing out (optional, defaults to 60 seconds).", + "minimum": 1, + "type": "integer" + } + }, + "required": [ + "node", + "command" + ] + }, + "name": "nodes_debug_exec" + } +] diff --git a/pkg/ocp/nodes_debug.go b/pkg/ocp/nodes_debug.go new file mode 100644 index 00000000..cb079a9a --- /dev/null +++ b/pkg/ocp/nodes_debug.go @@ -0,0 +1,326 @@ +package ocp + +import ( + "context" + "errors" + "fmt" + "strings" + "time" + + "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" + "github.com/containers/kubernetes-mcp-server/pkg/version" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + utilrand "k8s.io/apimachinery/pkg/util/rand" + "k8s.io/utils/ptr" +) + +const ( + // DefaultNodeDebugImage is the UBI9 toolbox image that provides comprehensive debugging and troubleshooting utilities. + // This image includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), + // process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), debugging tools (gdb), + // and many other utilities commonly needed for node-level debugging and diagnostics. + DefaultNodeDebugImage = "registry.access.redhat.com/ubi9/toolbox:latest" + // NodeDebugContainerName is the name used for the debug container, matching 'oc debug node' defaults. + NodeDebugContainerName = "debug" + // DefaultNodeDebugTimeout is the maximum time to wait for the debug pod to finish executing. + DefaultNodeDebugTimeout = 1 * time.Minute +) + +// NodesDebugExec mimics `oc debug node/ -- ` by creating a privileged pod on the target +// node, running the provided command, collecting its output, and removing the pod afterwards. +// The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node resources. +// +// When namespace is empty, the configured namespace (or "default" if none) is used. When image is empty the +// default debug image is used. Timeout controls how long we wait for the pod to complete. +func NodesDebugExec( + ctx context.Context, + k OpenshiftClient, + namespace string, + nodeName string, + image string, + command []string, + timeout time.Duration, +) (string, error) { + if nodeName == "" { + return "", errors.New("node name is required") + } + if len(command) == 0 { + return "", errors.New("command is required") + } + + ns := k.NamespaceOrDefault(namespace) + if ns == "" { + ns = "default" + } + debugImage := image + if debugImage == "" { + debugImage = DefaultNodeDebugImage + } + if timeout <= 0 { + timeout = DefaultNodeDebugTimeout + } + + // Create the debug pod + created, err := createDebugPod(ctx, k, nodeName, ns, debugImage, command) + if err != nil { + return "", err + } + + // Ensure the pod is deleted regardless of completion state. 
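+	// A fresh background context with its own 30-second timeout is used so cleanup
+	// still runs even when the caller's context has already been cancelled or timed out.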
+ defer func() { + deleteCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + pods, err := k.AccessControlClientset().Pods(ns) + if err == nil { + _ = pods.Delete(deleteCtx, created.Name, metav1.DeleteOptions{}) + } + }() + + // Poll for debug pod completion + terminated, lastPod, waitMsg, err := pollForCompletion(ctx, k, ns, created.Name, timeout) + if err != nil { + return "", err + } + + // Retrieve the logs + logs, err := retrieveLogs(ctx, k, ns, created.Name) + if err != nil { + return "", err + } + + // Process the results + return processResults(terminated, lastPod, waitMsg, logs) +} + +// createDebugPod creates a privileged pod on the target node to run debug commands. +func createDebugPod( + ctx context.Context, + k OpenshiftClient, + nodeName string, + namespace string, + image string, + command []string, +) (*corev1.Pod, error) { + sanitizedNode := sanitizeForName(nodeName) + hostPathType := corev1.HostPathDirectory + + // Generate a unique name + suffix := utilrand.String(5) + maxNodeLen := 63 - len("node-debug-") - 1 - len(suffix) + if maxNodeLen < 1 { + maxNodeLen = 1 + } + if len(sanitizedNode) > maxNodeLen { + sanitizedNode = sanitizedNode[:maxNodeLen] + } + podName := fmt.Sprintf("node-debug-%s-%s", sanitizedNode, suffix) + + debugPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + Namespace: namespace, + Labels: map[string]string{ + kubernetes.AppKubernetesManagedBy: version.BinaryName, + kubernetes.AppKubernetesComponent: "node-debug", + kubernetes.AppKubernetesName: fmt.Sprintf("node-debug-%s", sanitizedNode), + }, + }, + Spec: corev1.PodSpec{ + AutomountServiceAccountToken: ptr.To(false), + HostNetwork: true, + HostPID: true, + HostIPC: true, + NodeName: nodeName, + RestartPolicy: corev1.RestartPolicyNever, + SecurityContext: &corev1.PodSecurityContext{ + RunAsUser: ptr.To[int64](0), + }, + Tolerations: []corev1.Toleration{ + {Operator: corev1.TolerationOpExists}, + {Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoSchedule}, + {Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoExecute}, + }, + Volumes: []corev1.Volume{ + { + Name: "host-root", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/", + Type: &hostPathType, + }, + }, + }, + }, + Containers: []corev1.Container{ + { + Name: NodeDebugContainerName, + Image: image, + ImagePullPolicy: corev1.PullIfNotPresent, + Command: command, + SecurityContext: &corev1.SecurityContext{ + Privileged: ptr.To(true), + RunAsUser: ptr.To[int64](0), + }, + VolumeMounts: []corev1.VolumeMount{ + {Name: "host-root", MountPath: "/host"}, + }, + }, + }, + }, + } + + // Create the pod using AccessControlClientset + pods, err := k.AccessControlClientset().Pods(namespace) + if err != nil { + return nil, fmt.Errorf("failed to get pods interface: %w", err) + } + + created, err := pods.Create(ctx, debugPod, metav1.CreateOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to create debug pod: %w", err) + } + + return created, nil +} + +// pollForCompletion polls the debug pod until it completes or times out. 
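+// It checks the pod every 2 seconds and returns the terminated container state, the last
+// observed pod, and any waiting-state message. Image pull failures (ErrImagePull,
+// ImagePullBackOff) abort the wait immediately rather than running out the timeout.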
+func pollForCompletion( + ctx context.Context, + k OpenshiftClient, + namespace string, + podName string, + timeout time.Duration, +) (*corev1.ContainerStateTerminated, *corev1.Pod, string, error) { + pollCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + var ( + lastPod *corev1.Pod + terminated *corev1.ContainerStateTerminated + waitMsg string + ) + + for { + // Get pod status using AccessControlClientset + pods, getErr := k.AccessControlClientset().Pods(namespace) + if getErr != nil { + return nil, nil, "", fmt.Errorf("failed to get pods interface: %w", getErr) + } + + current, err := pods.Get(pollCtx, podName, metav1.GetOptions{}) + if err != nil { + return nil, nil, "", fmt.Errorf("failed to get debug pod status: %w", err) + } + lastPod = current + + if status := containerStatusByName(current.Status.ContainerStatuses, NodeDebugContainerName); status != nil { + if status.State.Waiting != nil { + waitMsg = fmt.Sprintf("container waiting: %s", status.State.Waiting.Reason) + // Image pull issues should fail fast. + if status.State.Waiting.Reason == "ErrImagePull" || status.State.Waiting.Reason == "ImagePullBackOff" { + return nil, nil, "", fmt.Errorf("debug container failed to start (%s): %s", status.State.Waiting.Reason, status.State.Waiting.Message) + } + } + if status.State.Terminated != nil { + terminated = status.State.Terminated + break + } + } + + if current.Status.Phase == corev1.PodFailed { + break + } + + // Wait for the next tick interval before checking pod status again, or timeout if context is done. + select { + case <-pollCtx.Done(): + return nil, nil, "", fmt.Errorf("timed out waiting for debug pod %s to complete: %w", podName, pollCtx.Err()) + case <-ticker.C: + } + } + + return terminated, lastPod, waitMsg, nil +} + +// retrieveLogs retrieves the logs from the debug pod. +func retrieveLogs(ctx context.Context, k OpenshiftClient, namespace, podName string) (string, error) { + logCtx, logCancel := context.WithTimeout(ctx, 30*time.Second) + defer logCancel() + logs, logErr := k.PodsLog(logCtx, namespace, podName, NodeDebugContainerName, false, 0) + if logErr != nil { + return "", fmt.Errorf("failed to retrieve debug pod logs: %w", logErr) + } + return strings.TrimSpace(logs), nil +} + +// processResults processes the debug pod completion status and returns the appropriate result. 
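+// A zero exit code yields the trimmed logs as the result. Any other outcome (non-zero exit,
+// failed pod, waiting container, or no terminal state) is converted into an error that embeds
+// the captured output, so callers never receive a partial result alongside an error.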
+func processResults(terminated *corev1.ContainerStateTerminated, lastPod *corev1.Pod, waitMsg, logs string) (string, error) { + if terminated != nil { + if terminated.ExitCode != 0 { + errMsg := fmt.Sprintf("command exited with code %d", terminated.ExitCode) + if terminated.Reason != "" { + errMsg = fmt.Sprintf("%s (%s)", errMsg, terminated.Reason) + } + if terminated.Message != "" { + errMsg = fmt.Sprintf("%s: %s", errMsg, terminated.Message) + } + if logs != "" { + errMsg = fmt.Sprintf("%s\nOutput:\n%s", errMsg, logs) + } + return "", errors.New(errMsg) + } + return logs, nil + } + + if lastPod != nil && lastPod.Status.Reason != "" { + if logs != "" { + return "", fmt.Errorf("debug pod failed: %s\nOutput:\n%s", lastPod.Status.Reason, logs) + } + return "", fmt.Errorf("debug pod failed: %s", lastPod.Status.Reason) + } + if waitMsg != "" { + if logs != "" { + return "", fmt.Errorf("debug container did not complete: %s\nOutput:\n%s", waitMsg, logs) + } + return "", fmt.Errorf("debug container did not complete: %s", waitMsg) + } + if logs != "" { + return "", fmt.Errorf("debug container did not reach a terminal state\nOutput:\n%s", logs) + } + return "", errors.New("debug container did not reach a terminal state") +} + +func sanitizeForName(name string) string { + lower := strings.ToLower(name) + var b strings.Builder + b.Grow(len(lower)) + for _, r := range lower { + if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '-' { + b.WriteRune(r) + continue + } + b.WriteRune('-') + } + sanitized := strings.Trim(b.String(), "-") + if sanitized == "" { + sanitized = "node" + } + if len(sanitized) > 40 { + sanitized = sanitized[:40] + } + return sanitized +} + +func containerStatusByName(statuses []corev1.ContainerStatus, name string) *corev1.ContainerStatus { + for idx := range statuses { + if statuses[idx].Name == name { + return &statuses[idx] + } + } + return nil +} diff --git a/pkg/ocp/nodes_debug_test.go b/pkg/ocp/nodes_debug_test.go new file mode 100644 index 00000000..1c86c92f --- /dev/null +++ b/pkg/ocp/nodes_debug_test.go @@ -0,0 +1,429 @@ +package ocp + +import ( + "context" + "fmt" + "strings" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" +) + +func TestNodesDebugExecCreatesPrivilegedPod(t *testing.T) { + env := NewNodeDebugTestEnv(t) + env.Pods.Logs = "kernel 6.8" + + out, err := NodesDebugExec(context.Background(), env.Kubernetes, "", "worker-0", "", []string{"uname", "-a"}, 2*time.Minute) + if err != nil { + t.Fatalf("NodesDebugExec returned error: %v", err) + } + if out != "kernel 6.8" { + t.Fatalf("unexpected command output: %q", out) + } + + created := env.Pods.Created + if created == nil { + t.Fatalf("expected debug pod to be created") + } + if created.Namespace != "default" { + t.Fatalf("expected default namespace fallback, got %q", created.Namespace) + } + if created.Spec.NodeName != "worker-0" { + t.Fatalf("expected pod to target node worker-0, got %q", created.Spec.NodeName) + } + if !env.Pods.Deleted { + t.Fatalf("expected debug pod to be deleted after execution") + } + + if len(created.Spec.Containers) != 1 { + t.Fatalf("expected single container in debug pod") + } + container := created.Spec.Containers[0] + if container.Image != DefaultNodeDebugImage { + t.Fatalf("expected default image %q, got %q", DefaultNodeDebugImage, container.Image) + } + expectedCommand := []string{"uname", "-a"} + if len(container.Command) != len(expectedCommand) { + t.Fatalf("unexpected command length, got %v", container.Command) + } + for i, part := range expectedCommand { + if 
container.Command[i] != part { + t.Fatalf("command[%d] = %q, expected %q", i, container.Command[i], part) + } + } + if container.SecurityContext == nil || container.SecurityContext.Privileged == nil || !*container.SecurityContext.Privileged { + t.Fatalf("expected container to run privileged") + } + if len(container.VolumeMounts) != 1 || container.VolumeMounts[0].MountPath != "/host" { + t.Fatalf("expected container to mount host root at /host") + } + + if created.Spec.SecurityContext == nil || created.Spec.SecurityContext.RunAsUser == nil || *created.Spec.SecurityContext.RunAsUser != 0 { + t.Fatalf("expected pod security context to run as root") + } + + if len(created.Spec.Volumes) != 1 || created.Spec.Volumes[0].HostPath == nil { + t.Fatalf("expected host root volume to be configured") + } +} + +func TestNodesDebugExecReturnsErrorForNonZeroExit(t *testing.T) { + env := NewNodeDebugTestEnv(t) + env.Pods.ExitCode = 5 + env.Pods.TerminatedReason = "Error" + env.Pods.TerminatedMessage = "some failure" + env.Pods.Logs = "bad things happened" + + out, err := NodesDebugExec(context.Background(), env.Kubernetes, "debug-ns", "infra-node", "registry.example/custom:latest", []string{"journalctl", "-xe"}, time.Minute) + if err == nil { + t.Fatalf("expected error for non-zero exit code") + } + // Logs should be included in the error message + if !strings.Contains(err.Error(), "bad things happened") { + t.Fatalf("expected error to contain logs, got: %v", err) + } + if !strings.Contains(err.Error(), "command exited with code 5") { + t.Fatalf("expected error to contain exit code, got: %v", err) + } + if out != "" { + t.Fatalf("expected empty output on error, got %q", out) + } + + created := env.Pods.Created + if created == nil { + t.Fatalf("expected pod to be created") + } + if created.Namespace != "debug-ns" { + t.Fatalf("expected provided namespace to be used, got %q", created.Namespace) + } + if containerImage := created.Spec.Containers[0].Image; containerImage != "registry.example/custom:latest" { + t.Fatalf("expected custom image to be used, got %q", containerImage) + } +} + +func TestCreateDebugPod(t *testing.T) { + env := NewNodeDebugTestEnv(t) + + created, err := createDebugPod(context.Background(), env.Kubernetes, "worker-1", "test-ns", "custom:v1", []string{"ls", "-la"}) + if err != nil { + t.Fatalf("createDebugPod failed: %v", err) + } + if created == nil { + t.Fatalf("expected pod to be created") + } + if created.Namespace != "test-ns" { + t.Fatalf("expected namespace test-ns, got %q", created.Namespace) + } + if created.Spec.NodeName != "worker-1" { + t.Fatalf("expected node worker-1, got %q", created.Spec.NodeName) + } + if !strings.HasPrefix(created.Name, "node-debug-worker-1-") { + t.Fatalf("unexpected pod name: %q", created.Name) + } + if len(created.Name) > 63 { + t.Fatalf("pod name exceeds DNS label length: %d characters", len(created.Name)) + } + if len(created.Spec.Containers) != 1 { + t.Fatalf("expected 1 container, got %d", len(created.Spec.Containers)) + } + container := created.Spec.Containers[0] + if container.Image != "custom:v1" { + t.Fatalf("expected image custom:v1, got %q", container.Image) + } + expectedCmd := []string{"ls", "-la"} + if len(container.Command) != len(expectedCmd) { + t.Fatalf("expected %d command parts, got %d", len(expectedCmd), len(container.Command)) + } + for i, part := range expectedCmd { + if container.Command[i] != part { + t.Fatalf("command[%d] = %q, expected %q", i, container.Command[i], part) + } + } + if container.SecurityContext == nil || 
!*container.SecurityContext.Privileged { + t.Fatalf("expected privileged container") + } +} + +func TestPollForCompletion(t *testing.T) { + tests := []struct { + name string + exitCode int32 + terminatedReason string + waitingReason string + waitingMessage string + expectError bool + expectTerminated bool + errorContains []string + expectedExitCode int32 + expectedReason string + }{ + { + name: "successful completion", + exitCode: 0, + expectTerminated: true, + expectedExitCode: 0, + }, + { + name: "non-zero exit code", + exitCode: 42, + terminatedReason: "Error", + expectTerminated: true, + expectedExitCode: 42, + expectedReason: "Error", + }, + { + name: "image pull error", + waitingReason: "ErrImagePull", + waitingMessage: "image not found", + expectError: true, + errorContains: []string{"ErrImagePull", "image not found"}, + }, + { + name: "image pull backoff", + waitingReason: "ImagePullBackOff", + waitingMessage: "back-off pulling image", + expectError: true, + errorContains: []string{"ImagePullBackOff", "back-off pulling image"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + env := NewNodeDebugTestEnv(t) + env.Pods.ExitCode = tt.exitCode + env.Pods.TerminatedReason = tt.terminatedReason + env.Pods.WaitingReason = tt.waitingReason + env.Pods.WaitingMessage = tt.waitingMessage + + created, _ := createDebugPod(context.Background(), env.Kubernetes, "node-1", "default", DefaultNodeDebugImage, []string{"echo", "test"}) + + terminated, lastPod, waitMsg, err := pollForCompletion(context.Background(), env.Kubernetes, "default", created.Name, time.Minute) + + if tt.expectError { + if err == nil { + t.Fatalf("expected error but got none") + } + for _, substr := range tt.errorContains { + if !strings.Contains(err.Error(), substr) { + t.Fatalf("expected error to contain %q, got: %v", substr, err) + } + } + return + } + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if tt.expectTerminated { + if terminated == nil { + t.Fatalf("expected terminated state") + } + if terminated.ExitCode != tt.expectedExitCode { + t.Fatalf("expected exit code %d, got %d", tt.expectedExitCode, terminated.ExitCode) + } + if tt.expectedReason != "" && terminated.Reason != tt.expectedReason { + t.Fatalf("expected reason %q, got %q", tt.expectedReason, terminated.Reason) + } + if lastPod == nil { + t.Fatalf("expected lastPod to be set") + } + } + + if tt.waitingReason == "" && waitMsg != "" { + t.Fatalf("expected no wait message, got %q", waitMsg) + } + }) + } +} + +func TestRetrieveLogs(t *testing.T) { + env := NewNodeDebugTestEnv(t) + env.Pods.Logs = " some output with whitespace \n" + + created, _ := createDebugPod(context.Background(), env.Kubernetes, "node-1", "default", DefaultNodeDebugImage, []string{"echo", "test"}) + + logs, err := retrieveLogs(context.Background(), env.Kubernetes, "default", created.Name) + if err != nil { + t.Fatalf("retrieveLogs failed: %v", err) + } + if logs != "some output with whitespace" { + t.Fatalf("expected trimmed logs, got %q", logs) + } +} + +func TestProcessResults(t *testing.T) { + tests := []struct { + name string + terminated *corev1.ContainerStateTerminated + pod *corev1.Pod + waitMsg string + logs string + expectError bool + errorContains []string + expectLogs bool + expectedResult string + }{ + { + name: "successful completion", + terminated: &corev1.ContainerStateTerminated{ + ExitCode: 0, + }, + logs: "success output", + expectError: false, + expectLogs: true, + expectedResult: "success output", + }, + { + name: "non-zero 
exit code with logs", + terminated: &corev1.ContainerStateTerminated{ + ExitCode: 127, + Reason: "CommandNotFound", + Message: "command not found", + }, + logs: "error logs", + expectError: true, + errorContains: []string{"127", "CommandNotFound", "command not found", "error logs", "Output:"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "non-zero exit code without reason or message but with logs", + terminated: &corev1.ContainerStateTerminated{ + ExitCode: 1, + }, + logs: "failed output", + expectError: true, + errorContains: []string{"command exited with code 1", "failed output", "Output:"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "non-zero exit code without logs", + terminated: &corev1.ContainerStateTerminated{ + ExitCode: 1, + }, + logs: "", + expectError: true, + errorContains: []string{"command exited with code 1"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "pod failed with logs", + pod: &corev1.Pod{ + Status: corev1.PodStatus{ + Reason: "Evicted", + }, + }, + logs: "pod evicted logs", + expectError: true, + errorContains: []string{"Evicted", "pod evicted logs", "Output:"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "pod failed without logs", + pod: &corev1.Pod{ + Status: corev1.PodStatus{ + Reason: "Evicted", + }, + }, + logs: "", + expectError: true, + errorContains: []string{"Evicted"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "container waiting with logs", + waitMsg: "container waiting: ImagePullBackOff", + logs: "waiting logs", + expectError: true, + errorContains: []string{"did not complete", "waiting logs", "Output:"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "container waiting without logs", + waitMsg: "container waiting: ImagePullBackOff", + logs: "", + expectError: true, + errorContains: []string{"did not complete"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "no terminal state with logs", + logs: "incomplete logs", + expectError: true, + errorContains: []string{"did not reach a terminal state", "incomplete logs", "Output:"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "no terminal state without logs", + logs: "", + expectError: true, + errorContains: []string{"did not reach a terminal state"}, + expectLogs: false, + expectedResult: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := processResults(tt.terminated, tt.pod, tt.waitMsg, tt.logs) + + if tt.expectError { + if err == nil { + t.Fatalf("expected error but got none") + } + for _, substr := range tt.errorContains { + if !strings.Contains(err.Error(), substr) { + t.Fatalf("expected error to contain %q, got: %v", substr, err) + } + } + // Verify logs are NOT in the result when there's an error + if result != "" { + t.Fatalf("expected empty result on error, got %q", result) + } + } else { + if err != nil { + t.Fatalf("expected no error, got: %v", err) + } + // Verify logs ARE in the result when successful + if result != tt.expectedResult { + t.Fatalf("expected result %q, got %q", tt.expectedResult, result) + } + } + }) + } +} + +func TestSanitizeForName(t *testing.T) { + tests := []struct { + input string + expected string + }{ + {"worker-0", "worker-0"}, + {"WORKER-0", "worker-0"}, + {"worker.0", "worker-0"}, + {"worker_0", "worker-0"}, + {"ip-10-0-1-42.ec2.internal", "ip-10-0-1-42-ec2-internal"}, + {"", "node"}, + {"---", "node"}, + {strings.Repeat("a", 50), strings.Repeat("a", 40)}, + {"Worker-Node_123.domain", 
"worker-node-123-domain"}, + } + + for _, tt := range tests { + t.Run(fmt.Sprintf("sanitize(%q)", tt.input), func(t *testing.T) { + result := sanitizeForName(tt.input) + if result != tt.expected { + t.Fatalf("expected %q, got %q", tt.expected, result) + } + }) + } +} diff --git a/pkg/ocp/ocp_client.go b/pkg/ocp/ocp_client.go new file mode 100644 index 00000000..5c2766b4 --- /dev/null +++ b/pkg/ocp/ocp_client.go @@ -0,0 +1,44 @@ +package ocp + +import ( + "context" + + "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" + + corev1client "k8s.io/client-go/kubernetes/typed/core/v1" +) + +// ClientsetInterface defines the interface for access-controlled clientset operations. +// This allows code to work with kubernetes.AccessControlClientset through an interface, +// making it easier to test and decouple from the concrete implementation. +type ClientsetInterface interface { + Pods(namespace string) (corev1client.PodInterface, error) +} + +// OpenshiftClient defines a minimal interface for kubernetes operations commonly needed +// by OCP toolsets. This allows for easier testing and decoupling from the concrete +// kubernetes.Kubernetes type. +type OpenshiftClient interface { + NamespaceOrDefault(namespace string) string + AccessControlClientset() ClientsetInterface + PodsLog(ctx context.Context, namespace, name, container string, previous bool, tail int64) (string, error) +} + +// OpenshiftClientAdapter adapts kubernetes.Kubernetes to implement OpenshiftClient. +// This allows production code to use the concrete *kubernetes.Kubernetes type +// while tests can use a mock implementation. +type OpenshiftClientAdapter struct { + *kubernetes.Kubernetes +} + +// NewOpenshiftClient creates a new adapter that wraps kubernetes.Kubernetes +// to implement the OpenshiftClient interface. +func NewOpenshiftClient(k *kubernetes.Kubernetes) *OpenshiftClientAdapter { + return &OpenshiftClientAdapter{Kubernetes: k} +} + +// AccessControlClientset returns the access control clientset as an interface. +// This satisfies the OpenshiftClient interface. +func (c *OpenshiftClientAdapter) AccessControlClientset() ClientsetInterface { + return c.Kubernetes.AccessControlClientset() +} diff --git a/pkg/ocp/testhelpers.go b/pkg/ocp/testhelpers.go new file mode 100644 index 00000000..e35d1d9f --- /dev/null +++ b/pkg/ocp/testhelpers.go @@ -0,0 +1,163 @@ +package ocp + +import ( + "context" + "fmt" + "io" + "net/http" + "net/url" + "strings" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + schemek8s "k8s.io/client-go/kubernetes/scheme" + corev1client "k8s.io/client-go/kubernetes/typed/core/v1" + restclient "k8s.io/client-go/rest" +) + +// NodeDebugTestEnv bundles a test Kubernetes client with a controllable pods client for tests. +type NodeDebugTestEnv struct { + Kubernetes *FakeKubernetesClient + Pods *FakePodInterface +} + +// NewNodeDebugTestEnv constructs a testing harness for exercising NodesDebugExec. 
+func NewNodeDebugTestEnv(t *testing.T) *NodeDebugTestEnv { + t.Helper() + + podsClient := &FakePodInterface{} + fakeK8s := &FakeKubernetesClient{ + pods: podsClient, + namespace: "default", + } + + return &NodeDebugTestEnv{ + Kubernetes: fakeK8s, + Pods: podsClient, + } +} + +// FakeKubernetesClient implements the OpenshiftClient interface for testing +type FakeKubernetesClient struct { + pods *FakePodInterface + namespace string +} + +// AccessControlClientset returns a fake clientset for testing +func (f *FakeKubernetesClient) AccessControlClientset() ClientsetInterface { + return &FakeAccessControlClientset{pods: f.pods} +} + +func (f *FakeKubernetesClient) NamespaceOrDefault(namespace string) string { + if namespace == "" { + return f.namespace + } + return namespace +} + +func (f *FakeKubernetesClient) PodsLog(ctx context.Context, namespace, name, container string, previous bool, tail int64) (string, error) { + req := f.pods.GetLogs(name, &corev1.PodLogOptions{Container: container, Previous: previous}) + res := req.Do(ctx) + if res.Error() != nil { + return "", res.Error() + } + rawData, err := res.Raw() + if err != nil { + return "", err + } + return string(rawData), nil +} + +// FakeAccessControlClientset mimics kubernetes.AccessControlClientset for testing +type FakeAccessControlClientset struct { + pods *FakePodInterface +} + +func (f *FakeAccessControlClientset) Pods(namespace string) (corev1client.PodInterface, error) { + return f.pods, nil +} + +// FakePodInterface implements corev1client.PodInterface with deterministic behaviour for tests. +type FakePodInterface struct { + corev1client.PodInterface + Created *corev1.Pod + Deleted bool + ExitCode int32 + TerminatedReason string + TerminatedMessage string + WaitingReason string + WaitingMessage string + Logs string +} + +func (f *FakePodInterface) Create(ctx context.Context, pod *corev1.Pod, opts metav1.CreateOptions) (*corev1.Pod, error) { + copy := pod.DeepCopy() + if copy.Name == "" && copy.GenerateName != "" { + copy.Name = copy.GenerateName + "test" + } + f.Created = copy + return copy.DeepCopy(), nil +} + +func (f *FakePodInterface) Get(ctx context.Context, name string, opts metav1.GetOptions) (*corev1.Pod, error) { + if f.Created == nil { + return nil, fmt.Errorf("pod not created yet") + } + pod := f.Created.DeepCopy() + + // If waiting state is set, return that instead of terminated + if f.WaitingReason != "" { + waiting := &corev1.ContainerStateWaiting{Reason: f.WaitingReason} + if f.WaitingMessage != "" { + waiting.Message = f.WaitingMessage + } + pod.Status.ContainerStatuses = []corev1.ContainerStatus{{ + Name: NodeDebugContainerName, + State: corev1.ContainerState{Waiting: waiting}, + }} + pod.Status.Phase = corev1.PodPending + return pod, nil + } + + // Otherwise return terminated state + terminated := &corev1.ContainerStateTerminated{ExitCode: f.ExitCode} + if f.TerminatedReason != "" { + terminated.Reason = f.TerminatedReason + } + if f.TerminatedMessage != "" { + terminated.Message = f.TerminatedMessage + } + pod.Status.ContainerStatuses = []corev1.ContainerStatus{{ + Name: NodeDebugContainerName, + State: corev1.ContainerState{Terminated: terminated}, + }} + pod.Status.Phase = corev1.PodSucceeded + return pod, nil +} + +func (f *FakePodInterface) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error { + f.Deleted = true + return nil +} + +func (f *FakePodInterface) GetLogs(name string, opts *corev1.PodLogOptions) *restclient.Request { + body := io.NopCloser(strings.NewReader(f.Logs)) + 
+	client := &http.Client{Transport: roundTripperFunc(func(*http.Request) (*http.Response, error) {
+		return &http.Response{StatusCode: http.StatusOK, Body: body}, nil
+	})}
+	content := restclient.ClientContentConfig{
+		ContentType:  runtime.ContentTypeJSON,
+		GroupVersion: schema.GroupVersion{Version: "v1"},
+		Negotiator:   runtime.NewClientNegotiator(schemek8s.Codecs.WithoutConversion(), schema.GroupVersion{Version: "v1"}),
+	}
+	return restclient.NewRequestWithClient(&url.URL{Scheme: "https", Host: "localhost"}, "", content, client).Verb("GET")
+}
+
+type roundTripperFunc func(*http.Request) (*http.Response, error)
+
+func (f roundTripperFunc) RoundTrip(req *http.Request) (*http.Response, error) {
+	return f(req)
+}
diff --git a/pkg/toolsets/openshift/nodes.go b/pkg/toolsets/openshift/nodes.go
new file mode 100644
index 00000000..481cd53b
--- /dev/null
+++ b/pkg/toolsets/openshift/nodes.go
@@ -0,0 +1,126 @@
+package openshift
+
+import (
+	"errors"
+	"fmt"
+	"time"
+
+	"github.com/google/jsonschema-go/jsonschema"
+	"k8s.io/utils/ptr"
+
+	"github.com/containers/kubernetes-mcp-server/pkg/api"
+	"github.com/containers/kubernetes-mcp-server/pkg/ocp"
+)
+
+func initNodes() []api.ServerTool {
+	return []api.ServerTool{
+		{
+			Tool: api.Tool{
+				Name:        "nodes_debug_exec",
+				Description: "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node-level resources. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.",
+				InputSchema: &jsonschema.Schema{
+					Type: "object",
+					Properties: map[string]*jsonschema.Schema{
+						"node": {
+							Type:        "string",
+							Description: "Name of the node to debug (e.g. worker-0).",
+						},
+						"command": {
+							Type:        "array",
+							Description: "Command to execute on the node. All standard debugging utilities from the UBI9 toolbox are available. The host filesystem is mounted at /host - use 'chroot /host ' to access node-level resources, or run commands directly in the toolbox environment. Provide each argument as a separate array item (e.g. ['chroot', '/host', 'systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).",
+							Items:       &jsonschema.Schema{Type: "string"},
+						},
+						"namespace": {
+							Type:        "string",
+							Description: "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').",
+						},
+						"image": {
+							Type:        "string",
+							Description: "Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.",
+						},
+						"timeout_seconds": {
+							Type:        "integer",
+							Description: "Maximum time to wait for the command to complete before timing out (optional, defaults to 60 seconds).",
+							Minimum:     ptr.To(float64(1)),
+						},
+					},
+					Required: []string{"node", "command"},
+				},
+				Annotations: api.ToolAnnotations{
+					Title:           "Nodes: Debug Exec",
+					ReadOnlyHint:    ptr.To(false),
+					DestructiveHint: ptr.To(true),
+					IdempotentHint:  ptr.To(false),
+					OpenWorldHint:   ptr.To(true),
+				},
+			},
+			Handler: nodesDebugExec,
+		},
+	}
+}
+
+func nodesDebugExec(params api.ToolHandlerParams) (*api.ToolCallResult, error) {
+	nodeArg := params.GetArguments()["node"]
+	nodeName, ok := nodeArg.(string)
+	if nodeArg == nil || !ok || nodeName == "" {
+		return api.NewToolCallResult("", errors.New("missing required argument: node")), nil
+	}
+
+	commandArg := params.GetArguments()["command"]
+	command, err := toStringSlice(commandArg)
+	if err != nil {
+		return api.NewToolCallResult("", fmt.Errorf("invalid command argument: %w", err)), nil
+	}
+
+	namespace := ""
+	if nsArg, ok := params.GetArguments()["namespace"].(string); ok {
+		namespace = nsArg
+	}
+
+	image := ""
+	if imageArg, ok := params.GetArguments()["image"].(string); ok {
+		image = imageArg
+	}
+
+	var timeout time.Duration
+	if timeoutRaw, exists := params.GetArguments()["timeout_seconds"]; exists && timeoutRaw != nil {
+		switch v := timeoutRaw.(type) {
+		case float64:
+			timeout = time.Duration(int64(v)) * time.Second
+		case int:
+			timeout = time.Duration(v) * time.Second
+		case int64:
+			timeout = time.Duration(v) * time.Second
+		default:
+			return api.NewToolCallResult("", errors.New("timeout_seconds must be a numeric value")), nil
+		}
+	}
+
+	output, execErr := ocp.NodesDebugExec(params.Context, ocp.NewOpenshiftClient(params.Kubernetes), namespace, nodeName, image, command, timeout)
+	if output == "" && execErr == nil {
+		output = fmt.Sprintf("Command executed successfully on node %s but produced no output.", nodeName)
+	}
+	return api.NewToolCallResult(output, execErr), nil
+}
+
+func toStringSlice(arg any) ([]string, error) {
+	if arg == nil {
+		return nil, errors.New("command is required")
+	}
+	raw, ok := arg.([]interface{})
+	if !ok {
+		return nil, errors.New("command must be an array of strings")
+	}
+	if len(raw) == 0 {
+		return nil, errors.New("command array cannot be empty")
+	}
+	command := make([]string, 0, len(raw))
+	for _, item := range raw {
+		str, ok := item.(string)
+		if !ok {
+			return nil, errors.New("command items must be strings")
+		}
+		command = append(command, str)
+	}
+	return command, nil
+}
diff --git a/pkg/toolsets/openshift/nodes_test.go b/pkg/toolsets/openshift/nodes_test.go
new file mode 100644
index 00000000..e3cddf2c
--- /dev/null
+++ b/pkg/toolsets/openshift/nodes_test.go
@@ -0,0 +1,95 @@
+package openshift
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/containers/kubernetes-mcp-server/pkg/api"
+	"github.com/containers/kubernetes-mcp-server/pkg/ocp"
+)
+
+type staticRequest struct {
+	args map[string]any
+}
+
+func (s staticRequest) GetArguments() map[string]any {
+	return s.args
+}
+
+func TestNodesDebugExecHandlerValidatesInput(t *testing.T) {
+	t.Run("missing node", func(t *testing.T) {
+		params := api.ToolHandlerParams{
+			Context:         context.Background(),
+			ToolCallRequest: staticRequest{args: map[string]any{}},
+		}
+		result, err := nodesDebugExec(params)
+		if err != nil {
+			t.Fatalf("handler returned error: %v", err)
+		}
+		if result.Error == nil || result.Error.Error() != "missing required argument: node" {
+			t.Fatalf("unexpected error: %v", result.Error)
+		}
+	})
+
+	t.Run("invalid command type", func(t *testing.T) {
+		params := api.ToolHandlerParams{
+			Context: context.Background(),
+			ToolCallRequest: staticRequest{args: map[string]any{
+				"node":    "worker-0",
+				"command": "ls -la",
+			}},
+		}
+		result, err := nodesDebugExec(params)
+		if err != nil {
+			t.Fatalf("handler returned error: %v", err)
+		}
+		if result.Error == nil || result.Error.Error() != "invalid command argument: command must be an array of strings" {
+			t.Fatalf("unexpected error: %v", result.Error)
+		}
+	})
+}
+
+func TestNodesDebugExecHandlerExecutesCommand(t *testing.T) {
+	env := ocp.NewNodeDebugTestEnv(t)
+	env.Pods.Logs = "done"
+
+	// Call NodesDebugExec directly instead of going through the handler.
+	// This avoids the need to mock the full kubernetes.Kubernetes type.
+	output, err := ocp.NodesDebugExec(
+		context.Background(),
+		env.Kubernetes,
+		"debug",
+		"infra-node",
+		"registry.local/debug:latest",
+		[]string{"systemctl", "status", "kubelet"},
+		15*time.Second,
+	)
+
+	if err != nil {
+		t.Fatalf("NodesDebugExec returned error: %v", err)
+	}
+	if output != "done" {
+		t.Fatalf("unexpected output: %q", output)
+	}
+
+	created := env.Pods.Created
+	if created == nil {
+		t.Fatalf("expected pod creation")
+	}
+	if created.Namespace != "debug" {
+		t.Fatalf("expected namespace override, got %q", created.Namespace)
+	}
+	if created.Spec.Containers[0].Image != "registry.local/debug:latest" {
+		t.Fatalf("expected custom image, got %q", created.Spec.Containers[0].Image)
+	}
+	expectedCommand := []string{"systemctl", "status", "kubelet"}
+	if len(created.Spec.Containers[0].Command) != len(expectedCommand) {
+		t.Fatalf("unexpected command length: %v", created.Spec.Containers[0].Command)
+	}
+	for i, part := range expectedCommand {
+		if created.Spec.Containers[0].Command[i] != part {
+			t.Fatalf("command[%d]=%q expected %q", i, created.Spec.Containers[0].Command[i], part)
+		}
+	}
+}
diff --git a/pkg/toolsets/openshift/toolset.go b/pkg/toolsets/openshift/toolset.go
new file mode 100644
index 00000000..d11e4b66
--- /dev/null
+++ b/pkg/toolsets/openshift/toolset.go
@@ -0,0 +1,31 @@
+package openshift
+
+import (
+	"slices"
+
+	"github.com/containers/kubernetes-mcp-server/pkg/api"
+	internalk8s "github.com/containers/kubernetes-mcp-server/pkg/kubernetes"
+	"github.com/containers/kubernetes-mcp-server/pkg/toolsets"
+)
+
+type Toolset struct{}
+
+var _ api.Toolset = (*Toolset)(nil)
+
+func (t *Toolset) GetName() string {
+	return "openshift-core"
+}
+
+func (t *Toolset) GetDescription() string {
+	return "Core OpenShift-specific tools (Node debugging, etc.)"
+}
+
+func (t *Toolset) GetTools(o internalk8s.Openshift) []api.ServerTool {
+	return slices.Concat(
+		initNodes(),
+	)
+}
+
+func init() {
+	toolsets.Register(&Toolset{})
+}