diff --git a/README.md b/README.md index a81daea1..71ffaa9e 100644 --- a/README.md +++ b/README.md @@ -208,11 +208,12 @@ The following sets of tools are available (all on by default): -| Toolset | Description | -|---------|-------------------------------------------------------------------------------------| -| config | View and manage the current local Kubernetes configuration (kubeconfig) | -| core | Most common tools for Kubernetes management (Pods, Generic Resources, Events, etc.) | -| helm | Tools for managing Helm charts and releases | +| Toolset | Description | +|----------------|-------------------------------------------------------------------------------------| +| config | View and manage the current local Kubernetes configuration (kubeconfig) | +| core | Most common tools for Kubernetes management (Pods, Generic Resources, Events, etc.) | +| helm | Tools for managing Helm charts and releases | +| openshift-core | Core OpenShift-specific tools (Node debugging, etc.) | @@ -249,6 +250,9 @@ In case multi-cluster support is enabled (default) and you have access to multip - `query` (`string`) **(required)** - query specifies services(s) or files from which to return logs (required). Example: "kubelet" to fetch kubelet logs, "/<log-file-name>" to fetch a specific log file from the node (e.g., "/var/log/kubelet.log" or "/var/log/kube-proxy.log") - `tailLines` (`integer`) - Number of lines to retrieve from the end of the logs (Optional, 0 means all logs) +- **nodes_stats_summary** - Get detailed resource usage statistics from a Kubernetes node via the kubelet's Summary API. Provides comprehensive metrics including CPU, memory, filesystem, and network usage at the node, pod, and container levels. On systems with cgroup v2 and kernel 4.20+, also includes PSI (Pressure Stall Information) metrics that show resource pressure for CPU, memory, and I/O. See https://kubernetes.io/docs/reference/instrumentation/understand-psi-metrics/ for details on PSI metrics + - `name` (`string`) **(required)** - Name of the node to get stats from + - **pods_list** - List all the Kubernetes pods in the current cluster from all namespaces - `labelSelector` (`string`) - Optional Kubernetes label selector (e.g. 'app=myapp,env=prod' or 'app in (myapp,yourapp)'), use this option when you want to filter the pods by label @@ -336,6 +340,19 @@ In case multi-cluster support is enabled (default) and you have access to multip +

+ +openshift-core + +- **nodes_debug_exec** - Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node-level resources. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs. + - `command` (`array`) **(required)** - Command to execute on the node. All standard debugging utilities from the UBI9 toolbox are available. The host filesystem is mounted at /host - use 'chroot /host <command>' to access node-level resources, or run commands directly in the toolbox environment. Provide each argument as a separate array item (e.g. ['chroot', '/host', 'systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']). + - `image` (`string`) - Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities. + - `namespace` (`string`) - Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default'). + - `node` (`string`) **(required)** - Name of the node to debug (e.g. worker-0). + - `timeout_seconds` (`integer`) - Maximum time to wait for the command to complete before timing out (optional, defaults to 60 seconds). + +

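For orientation before the implementation below, here is a minimal sketch of invoking the new tool from a Go MCP client. It mirrors the argument shape the `nodes_test.go` suite in this diff passes to `CallTool`; the `client.NewStdioMCPClient` call and the binary name are assumptions about the mark3labs/mcp-go client API and local setup, not part of this change.

```go
package main

import (
	"context"
	"fmt"

	"github.com/mark3labs/mcp-go/client"
	"github.com/mark3labs/mcp-go/mcp"
)

func main() {
	// Spawn the MCP server over stdio (binary name/path is illustrative).
	c, err := client.NewStdioMCPClient("kubernetes-mcp-server", nil)
	if err != nil {
		panic(err)
	}
	defer c.Close()

	ctx := context.Background()
	if _, err := c.Initialize(ctx, mcp.InitializeRequest{}); err != nil {
		panic(err)
	}

	// Same argument shape the test suite passes to CallTool.
	req := mcp.CallToolRequest{}
	req.Params.Name = "nodes_debug_exec"
	req.Params.Arguments = map[string]any{
		"node":    "worker-0",
		"command": []any{"chroot", "/host", "systemctl", "status", "kubelet"},
	}

	res, err := c.CallTool(ctx, req)
	if err != nil {
		panic(err)
	}
	// Tool output (truncated to the most recent 100 lines) comes back as text content.
	fmt.Println(res.Content[0].(mcp.TextContent).Text)
}
```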
+ diff --git a/internal/tools/update-readme/main.go b/internal/tools/update-readme/main.go index cdf695fc..b425a1d3 100644 --- a/internal/tools/update-readme/main.go +++ b/internal/tools/update-readme/main.go @@ -15,6 +15,7 @@ import ( _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/config" _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/core" _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/helm" + _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/openshift" ) type OpenShift struct{} diff --git a/pkg/config/config_default_overrides.go b/pkg/config/config_default_overrides.go index 70d065bc..05c3b4ac 100644 --- a/pkg/config/config_default_overrides.go +++ b/pkg/config/config_default_overrides.go @@ -3,6 +3,7 @@ package config func defaultOverrides() StaticConfig { return StaticConfig{ // IMPORTANT: this file is used to override default config values in downstream builds. - // This is intentionally left blank. + // OpenShift-specific defaults: add openshift-core toolset + Toolsets: []string{"core", "config", "helm", "openshift-core"}, } } diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index afdde191..caa4f95d 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -167,8 +167,8 @@ func (s *ConfigSuite) TestReadConfigValidPreservesDefaultsForMissingFields() { s.Equalf("table", config.ListOutput, "Expected ListOutput to be table, got %s", config.ListOutput) }) s.Run("toolsets defaulted correctly", func() { - s.Require().Lenf(config.Toolsets, 3, "Expected 3 toolsets, got %d", len(config.Toolsets)) - for _, toolset := range []string{"core", "config", "helm"} { + s.Require().Lenf(config.Toolsets, 4, "Expected 4 toolsets, got %d", len(config.Toolsets)) + for _, toolset := range []string{"core", "config", "helm", "openshift-core"} { s.Containsf(config.Toolsets, toolset, "Expected toolsets to contain %s", toolset) } }) diff --git a/pkg/kubernetes-mcp-server/cmd/root_test.go b/pkg/kubernetes-mcp-server/cmd/root_test.go index 22521667..e5342cd5 100644 --- a/pkg/kubernetes-mcp-server/cmd/root_test.go +++ b/pkg/kubernetes-mcp-server/cmd/root_test.go @@ -137,7 +137,7 @@ func TestToolsets(t *testing.T) { rootCmd := NewMCPServer(ioStreams) rootCmd.SetArgs([]string{"--help"}) o, err := captureOutput(rootCmd.Execute) // --help doesn't use logger/klog, cobra prints directly to stdout - if !strings.Contains(o, "Comma-separated list of MCP toolsets to use (available toolsets: config, core, helm).") { + if !strings.Contains(o, "Comma-separated list of MCP toolsets to use (available toolsets: config, core, helm, openshift-core).") { t.Fatalf("Expected all available toolsets, got %s %v", o, err) } }) @@ -145,7 +145,7 @@ func TestToolsets(t *testing.T) { ioStreams, out := testStream() rootCmd := NewMCPServer(ioStreams) rootCmd.SetArgs([]string{"--version", "--port=1337", "--log-level=1"}) - if err := rootCmd.Execute(); !strings.Contains(out.String(), "- Toolsets: core, config, helm") { + if err := rootCmd.Execute(); !strings.Contains(out.String(), "- Toolsets: core, config, helm, openshift-core") { t.Fatalf("Expected toolsets 'full', got %s %v", out, err) } }) diff --git a/pkg/kubernetes/resources.go b/pkg/kubernetes/resources.go index 1f559e12..d816ed3d 100644 --- a/pkg/kubernetes/resources.go +++ b/pkg/kubernetes/resources.go @@ -3,10 +3,11 @@ package kubernetes import ( "context" "fmt" - "k8s.io/apimachinery/pkg/runtime" "regexp" "strings" + "k8s.io/apimachinery/pkg/runtime" + 
"github.com/containers/kubernetes-mcp-server/pkg/version" authv1 "k8s.io/api/authorization/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -36,7 +37,7 @@ func (k *Kubernetes) ResourcesList(ctx context.Context, gvk *schema.GroupVersion // Check if operation is allowed for all namespaces (applicable for namespaced resources) isNamespaced, _ := k.isNamespaced(gvk) - if isNamespaced && !k.canIUse(ctx, gvr, namespace, "list") && namespace == "" { + if isNamespaced && !k.CanIUse(ctx, gvr, namespace, "list") && namespace == "" { namespace = k.manager.configuredNamespace() } if options.AsTable { @@ -187,7 +188,7 @@ func (k *Kubernetes) supportsGroupVersion(groupVersion string) bool { return true } -func (k *Kubernetes) canIUse(ctx context.Context, gvr *schema.GroupVersionResource, namespace, verb string) bool { +func (k *Kubernetes) CanIUse(ctx context.Context, gvr *schema.GroupVersionResource, namespace, verb string) bool { accessReviews, err := k.manager.accessControlClientSet.SelfSubjectAccessReviews() if err != nil { return false diff --git a/pkg/mcp/nodes_test.go b/pkg/mcp/nodes_test.go index 62ac55e9..e7626a18 100644 --- a/pkg/mcp/nodes_test.go +++ b/pkg/mcp/nodes_test.go @@ -1,14 +1,21 @@ package mcp import ( + "encoding/json" + "io" "net/http" "strconv" + "strings" "testing" "github.com/BurntSushi/toml" "github.com/containers/kubernetes-mcp-server/internal/test" "github.com/mark3labs/mcp-go/mcp" "github.com/stretchr/testify/suite" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/serializer" ) type NodesSuite struct { @@ -334,3 +341,249 @@ func (s *NodesSuite) TestNodesStatsSummaryDenied() { func TestNodes(t *testing.T) { suite.Run(t, new(NodesSuite)) } + +// Tests below are for the nodes_debug_exec tool (OpenShift-specific) + +type NodesDebugExecSuite struct { + BaseMcpSuite + mockServer *test.MockServer +} + +func (s *NodesDebugExecSuite) SetupTest() { + s.BaseMcpSuite.SetupTest() + s.mockServer = test.NewMockServer() + s.Cfg.KubeConfig = s.mockServer.KubeconfigFile(s.T()) +} + +func (s *NodesDebugExecSuite) TearDownTest() { + s.BaseMcpSuite.TearDownTest() + if s.mockServer != nil { + s.mockServer.Close() + } +} + +func (s *NodesDebugExecSuite) TestNodesDebugExecTool() { + s.Run("nodes_debug_exec with successful execution", func() { + + var ( + createdPod v1.Pod + deleteCalled bool + ) + const namespace = "debug" + const logOutput = "filesystem repaired" + + scheme := runtime.NewScheme() + _ = v1.AddToScheme(scheme) + codec := serializer.NewCodecFactory(scheme).UniversalDeserializer() + + s.mockServer.Handle(http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { + switch { + case req.URL.Path == "/api": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"kind":"APIVersions","versions":["v1"],"serverAddressByClientCIDRs":[{"clientCIDR":"0.0.0.0/0"}]}`)) + case req.URL.Path == "/apis": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"kind":"APIGroupList","apiVersion":"v1","groups":[]}`)) + case req.URL.Path == "/api/v1": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"kind":"APIResourceList","apiVersion":"v1","resources":[{"name":"pods","singularName":"","namespaced":true,"kind":"Pod","verbs":["get","list","watch","create","update","patch","delete"]}]}`)) + case req.Method == http.MethodPatch && strings.HasPrefix(req.URL.Path, "/api/v1/namespaces/"+namespace+"/pods/"): + // Handle 
server-side apply (PATCH with fieldManager query param) + body, err := io.ReadAll(req.Body) + if err != nil { + s.T().Fatalf("failed to read apply body: %v", err) + } + created := &v1.Pod{} + if _, _, err = codec.Decode(body, nil, created); err != nil { + s.T().Fatalf("failed to decode apply body: %v", err) + } + createdPod = *created + // Keep the name from the request URL if it was provided + pathParts := strings.Split(req.URL.Path, "/") + if len(pathParts) > 0 { + createdPod.Name = pathParts[len(pathParts)-1] + } + createdPod.Namespace = namespace + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(&createdPod) + case req.Method == http.MethodPost && req.URL.Path == "/api/v1/namespaces/"+namespace+"/pods": + body, err := io.ReadAll(req.Body) + if err != nil { + s.T().Fatalf("failed to read create body: %v", err) + } + created := &v1.Pod{} + if _, _, err = codec.Decode(body, nil, created); err != nil { + s.T().Fatalf("failed to decode create body: %v", err) + } + createdPod = *created + createdPod.ObjectMeta = metav1.ObjectMeta{ + Namespace: namespace, + Name: createdPod.GenerateName + "abc", + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(&createdPod) + case req.Method == http.MethodGet && createdPod.Name != "" && req.URL.Path == "/api/v1/namespaces/"+namespace+"/pods/"+createdPod.Name: + podStatus := createdPod.DeepCopy() + podStatus.Status = v1.PodStatus{ + Phase: v1.PodSucceeded, + ContainerStatuses: []v1.ContainerStatus{{ + Name: "debug", + State: v1.ContainerState{Terminated: &v1.ContainerStateTerminated{ + ExitCode: 0, + }}, + }}, + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(podStatus) + case req.Method == http.MethodDelete && createdPod.Name != "" && req.URL.Path == "/api/v1/namespaces/"+namespace+"/pods/"+createdPod.Name: + deleteCalled = true + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(&metav1.Status{Status: "Success"}) + case req.Method == http.MethodGet && createdPod.Name != "" && req.URL.Path == "/api/v1/namespaces/"+namespace+"/pods/"+createdPod.Name+"/log": + w.Header().Set("Content-Type", "text/plain") + _, _ = w.Write([]byte(logOutput)) + } + })) + + s.InitMcpClient() + toolResult, err := s.CallTool("nodes_debug_exec", map[string]interface{}{ + "node": "worker-0", + "namespace": namespace, + "command": []interface{}{"uname", "-a"}, + }) + + s.Run("call succeeds", func() { + s.Nilf(err, "call tool should not error: %v", err) + s.Falsef(toolResult.IsError, "tool should not return error: %v", toolResult.Content) + s.NotEmpty(toolResult.Content, "expected output content") + text := toolResult.Content[0].(mcp.TextContent).Text + s.Equalf(logOutput, text, "unexpected tool output %q", text) + }) + + s.Run("debug pod shaped correctly", func() { + s.Require().NotNil(createdPod.Spec.Containers, "expected containers in debug pod") + s.Require().Len(createdPod.Spec.Containers, 1, "expected single container in debug pod") + container := createdPod.Spec.Containers[0] + expectedCommand := []string{"uname", "-a"} + s.Truef(equalStringSlices(container.Command, expectedCommand), + "unexpected debug command: %v", container.Command) + s.Require().NotNil(container.SecurityContext, "expected security context") + s.Require().NotNil(container.SecurityContext.Privileged, "expected privileged field") + s.Truef(*container.SecurityContext.Privileged, "expected privileged container") + s.Require().NotEmpty(createdPod.Spec.Volumes, "expected 
volumes on debug pod") + s.Require().NotNil(createdPod.Spec.Volumes[0].HostPath, "expected hostPath volume") + s.Truef(deleteCalled, "expected debug pod to be deleted") + }) + }) +} + +func equalStringSlices(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} + +func (s *NodesDebugExecSuite) TestNodesDebugExecToolNonZeroExit() { + s.Run("nodes_debug_exec with non-zero exit code", func() { + const namespace = "default" + const errorMessage = "failed" + + scheme := runtime.NewScheme() + _ = v1.AddToScheme(scheme) + codec := serializer.NewCodecFactory(scheme).UniversalDeserializer() + + s.mockServer.Handle(http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { + switch { + case req.URL.Path == "/api": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"kind":"APIVersions","versions":["v1"],"serverAddressByClientCIDRs":[{"clientCIDR":"0.0.0.0/0"}]}`)) + case req.URL.Path == "/apis": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"kind":"APIGroupList","apiVersion":"v1","groups":[]}`)) + case req.URL.Path == "/api/v1": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"kind":"APIResourceList","apiVersion":"v1","resources":[{"name":"pods","singularName":"","namespaced":true,"kind":"Pod","verbs":["get","list","watch","create","update","patch","delete"]}]}`)) + case req.Method == http.MethodPatch && strings.HasPrefix(req.URL.Path, "/api/v1/namespaces/"+namespace+"/pods/"): + // Handle server-side apply (PATCH with fieldManager query param) + body, err := io.ReadAll(req.Body) + if err != nil { + s.T().Fatalf("failed to read apply body: %v", err) + } + pod := &v1.Pod{} + if _, _, err = codec.Decode(body, nil, pod); err != nil { + s.T().Fatalf("failed to decode apply body: %v", err) + } + // Keep the name from the request URL if it was provided + pathParts := strings.Split(req.URL.Path, "/") + if len(pathParts) > 0 { + pod.Name = pathParts[len(pathParts)-1] + } + pod.Namespace = namespace + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(pod) + case req.Method == http.MethodPost && req.URL.Path == "/api/v1/namespaces/"+namespace+"/pods": + body, err := io.ReadAll(req.Body) + if err != nil { + s.T().Fatalf("failed to read create body: %v", err) + } + pod := &v1.Pod{} + if _, _, err = codec.Decode(body, nil, pod); err != nil { + s.T().Fatalf("failed to decode create body: %v", err) + } + pod.ObjectMeta = metav1.ObjectMeta{Name: pod.GenerateName + "xyz", Namespace: namespace} + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(pod) + case strings.HasPrefix(req.URL.Path, "/api/v1/namespaces/"+namespace+"/pods/") && strings.HasSuffix(req.URL.Path, "/log"): + w.Header().Set("Content-Type", "text/plain") + _, _ = w.Write([]byte(errorMessage)) + case req.Method == http.MethodGet && strings.HasPrefix(req.URL.Path, "/api/v1/namespaces/"+namespace+"/pods/"): + pathParts := strings.Split(req.URL.Path, "/") + podName := pathParts[len(pathParts)-1] + pod := &v1.Pod{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Pod", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + Namespace: namespace, + }, + } + pod.Status = v1.PodStatus{ + Phase: v1.PodSucceeded, + ContainerStatuses: []v1.ContainerStatus{{ + Name: "debug", + State: v1.ContainerState{Terminated: &v1.ContainerStateTerminated{ + ExitCode: 2, + Reason: "Error", + }}, + }}, + } + 
w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(pod) + } + })) + + s.InitMcpClient() + toolResult, err := s.CallTool("nodes_debug_exec", map[string]interface{}{ + "node": "infra-1", + "command": []interface{}{"journalctl"}, + }) + + s.Nilf(err, "call tool should not error: %v", err) + s.Truef(toolResult.IsError, "expected tool to return error") + text := toolResult.Content[0].(mcp.TextContent).Text + s.Containsf(text, "command exited with code 2", "expected exit code message, got %q", text) + s.Containsf(text, "Error", "expected error reason included, got %q", text) + }) +} + +func TestNodesDebugExec(t *testing.T) { + suite.Run(t, new(NodesDebugExecSuite)) +} diff --git a/pkg/mcp/openshift_modules.go b/pkg/mcp/openshift_modules.go new file mode 100644 index 00000000..84d982f4 --- /dev/null +++ b/pkg/mcp/openshift_modules.go @@ -0,0 +1,3 @@ +package mcp + +import _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/openshift" diff --git a/pkg/mcp/testdata/toolsets-full-tools-multicluster-enum.json b/pkg/mcp/testdata/toolsets-full-tools-multicluster-enum.json index 1551b4c2..474bb1d0 100644 --- a/pkg/mcp/testdata/toolsets-full-tools-multicluster-enum.json +++ b/pkg/mcp/testdata/toolsets-full-tools-multicluster-enum.json @@ -195,6 +195,50 @@ }, "name": "namespaces_list" }, + { + "annotations": { + "title": "Nodes: Debug Exec", + "readOnlyHint": false, + "destructiveHint": true, + "idempotentHint": false, + "openWorldHint": true + }, + "description": "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node-level resources. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.", + "inputSchema": { + "type": "object", + "properties": { + "node": { + "description": "Name of the node to debug (e.g. worker-0).", + "type": "string" + }, + "command": { + "description": "Command to execute on the node. All standard debugging utilities from the UBI9 toolbox are available. The host filesystem is mounted at /host - use 'chroot /host ' to access node-level resources, or run commands directly in the toolbox environment. Provide each argument as a separate array item (e.g. ['chroot', '/host', 'systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).", + "items": { + "type": "string" + }, + "type": "array" + }, + "namespace": { + "description": "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').", + "type": "string" + }, + "image": { + "description": "Container image to use for the debug pod (optional). 
Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.", + "type": "string" + }, + "timeout_seconds": { + "description": "Maximum time to wait for the command to complete before timing out (optional, defaults to 60 seconds).", + "minimum": 1, + "type": "integer" + } + }, + "required": [ + "node", + "command" + ] + }, + "name": "nodes_debug_exec" + }, { "annotations": { "title": "Node: Log", @@ -220,7 +264,7 @@ "type": "string" }, "query": { - "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/\u003clog-file-name\u003e\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")", + "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/<log-file-name>\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")", "type": "string" }, "tailLines": { diff --git a/pkg/mcp/testdata/toolsets-full-tools-multicluster.json b/pkg/mcp/testdata/toolsets-full-tools-multicluster.json index 6e85e401..13a956f7 100644 --- a/pkg/mcp/testdata/toolsets-full-tools-multicluster.json +++ b/pkg/mcp/testdata/toolsets-full-tools-multicluster.json @@ -175,6 +175,50 @@ }, "name": "namespaces_list" }, + { + "annotations": { + "title": "Nodes: Debug Exec", + "readOnlyHint": false, + "destructiveHint": true, + "idempotentHint": false, + "openWorldHint": true + }, + "description": "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node-level resources. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.", + "inputSchema": { + "type": "object", + "properties": { + "node": { + "description": "Name of the node to debug (e.g. worker-0).", + "type": "string" + }, + "command": { + "description": "Command to execute on the node. All standard debugging utilities from the UBI9 toolbox are available. The host filesystem is mounted at /host - use 'chroot /host <command>' to access node-level resources, or run commands directly in the toolbox environment. Provide each argument as a separate array item (e.g. ['chroot', '/host', 'systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).", + "items": { + "type": "string" + }, + "type": "array" + }, + "namespace": { + "description": "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').", + "type": "string" + }, + "image": { + "description": "Container image to use for the debug pod (optional). 
Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.", + "type": "string" + }, + "timeout_seconds": { + "description": "Maximum time to wait for the command to complete before timing out (optional, defaults to 60 seconds).", + "minimum": 1, + "type": "integer" + } + }, + "required": [ + "node", + "command" + ] + }, + "name": "nodes_debug_exec" + }, { "annotations": { "title": "Node: Log", @@ -196,7 +240,7 @@ "type": "string" }, "query": { - "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/\u003clog-file-name\u003e\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")", + "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/<log-file-name>\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")", "type": "string" }, "tailLines": { diff --git a/pkg/mcp/testdata/toolsets-full-tools-openshift.json b/pkg/mcp/testdata/toolsets-full-tools-openshift.json index fb24138e..1ac08ca7 100644 --- a/pkg/mcp/testdata/toolsets-full-tools-openshift.json +++ b/pkg/mcp/testdata/toolsets-full-tools-openshift.json @@ -139,6 +139,50 @@ }, "name": "namespaces_list" }, + { + "annotations": { + "title": "Nodes: Debug Exec", + "readOnlyHint": false, + "destructiveHint": true, + "idempotentHint": false, + "openWorldHint": true + }, + "description": "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node-level resources. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.", + "inputSchema": { + "type": "object", + "properties": { + "node": { + "description": "Name of the node to debug (e.g. worker-0).", + "type": "string" + }, + "command": { + "description": "Command to execute on the node. All standard debugging utilities from the UBI9 toolbox are available. The host filesystem is mounted at /host - use 'chroot /host <command>' to access node-level resources, or run commands directly in the toolbox environment. Provide each argument as a separate array item (e.g. ['chroot', '/host', 'systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).", + "items": { + "type": "string" + }, + "type": "array" + }, + "namespace": { + "description": "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').", + "type": "string" + }, + "image": { + "description": "Container image to use for the debug pod (optional). 
Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.", + "type": "string" + }, + "timeout_seconds": { + "description": "Maximum time to wait for the command to complete before timing out (optional, defaults to 60 seconds).", + "minimum": 1, + "type": "integer" + } + }, + "required": [ + "node", + "command" + ] + }, + "name": "nodes_debug_exec" + }, { "annotations": { "title": "Node: Log", @@ -156,7 +200,7 @@ "type": "string" }, "query": { - "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/\u003clog-file-name\u003e\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")", + "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/<log-file-name>\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")", "type": "string" }, "tailLines": { diff --git a/pkg/mcp/testdata/toolsets-full-tools.json b/pkg/mcp/testdata/toolsets-full-tools.json index 5a4b5112..961fbb55 100644 --- a/pkg/mcp/testdata/toolsets-full-tools.json +++ b/pkg/mcp/testdata/toolsets-full-tools.json @@ -139,6 +139,50 @@ }, "name": "namespaces_list" }, + { + "annotations": { + "title": "Nodes: Debug Exec", + "readOnlyHint": false, + "destructiveHint": true, + "idempotentHint": false, + "openWorldHint": true + }, + "description": "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node-level resources. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.", + "inputSchema": { + "type": "object", + "properties": { + "node": { + "description": "Name of the node to debug (e.g. worker-0).", + "type": "string" + }, + "command": { + "description": "Command to execute on the node. All standard debugging utilities from the UBI9 toolbox are available. The host filesystem is mounted at /host - use 'chroot /host <command>' to access node-level resources, or run commands directly in the toolbox environment. Provide each argument as a separate array item (e.g. ['chroot', '/host', 'systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).", + "items": { + "type": "string" + }, + "type": "array" + }, + "namespace": { + "description": "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').", + "type": "string" + }, + "image": { + "description": "Container image to use for the debug pod (optional). 
Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.", + "type": "string" + }, + "timeout_seconds": { + "description": "Maximum time to wait for the command to complete before timing out (optional, defaults to 60 seconds).", + "minimum": 1, + "type": "integer" + } + }, + "required": [ + "node", + "command" + ] + }, + "name": "nodes_debug_exec" + }, { "annotations": { "title": "Node: Log", @@ -156,7 +200,7 @@ "type": "string" }, "query": { - "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/\u003clog-file-name\u003e\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")", + "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/<log-file-name>\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")", "type": "string" }, "tailLines": { diff --git a/pkg/mcp/testdata/toolsets-openshift-core-tools.json b/pkg/mcp/testdata/toolsets-openshift-core-tools.json new file mode 100644 index 00000000..65f3203d --- /dev/null +++ b/pkg/mcp/testdata/toolsets-openshift-core-tools.json @@ -0,0 +1,46 @@ +[ + { + "annotations": { + "title": "Nodes: Debug Exec", + "readOnlyHint": false, + "destructiveHint": true, + "idempotentHint": false, + "openWorldHint": true + }, + "description": "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). Commands execute in a chroot of the host filesystem, providing full access to node-level diagnostics. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.", + "inputSchema": { + "type": "object", + "properties": { + "node": { + "description": "Name of the node to debug (e.g. worker-0).", + "type": "string" + }, + "command": { + "description": "Command to execute on the node via chroot. All standard debugging utilities are available including systemctl, journalctl, ss, ip, ping, traceroute, nmap, ps, top, lsof, strace, find, tar, rsync, gdb, and more. Provide each argument as a separate array item (e.g. ['systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).", + "items": { + "type": "string" + }, + "type": "array" + }, + "namespace": { + "description": "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').", + "type": "string" + }, + "image": { + "description": "Container image to use for the debug pod (optional). 
Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.", + "type": "string" + }, + "timeout_seconds": { + "description": "Maximum time to wait for the command to complete before timing out (optional, defaults to 60 seconds).", + "minimum": 1, + "type": "integer" + } + }, + "required": [ + "node", + "command" + ] + }, + "name": "nodes_debug_exec" + } +] diff --git a/pkg/ocp/mustgather/mustgather_plan.go b/pkg/ocp/mustgather/mustgather_plan.go new file mode 100644 index 00000000..107f4604 --- /dev/null +++ b/pkg/ocp/mustgather/mustgather_plan.go @@ -0,0 +1,506 @@ +package mustgather + +import ( + "fmt" + "path" + "strings" + "time" + + "github.com/containers/kubernetes-mcp-server/pkg/api" + internalk8s "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/rand" + "sigs.k8s.io/yaml" +) + +const ( + defaultGatherSourceDir = "/must-gather/" + defaultMustGatherImage = "registry.redhat.io/openshift4/ose-must-gather:latest" + defaultGatherCmd = "/usr/bin/gather" + mgAnnotation = "operators.openshift.io/must-gather-image" + maxConcurrentGathers = 8 +) + +func PlanMustGather(params api.ToolHandlerParams) (*api.ToolCallResult, error) { + args := params.GetArguments() + + var nodeName, sourceDir, namespace, gatherCmd, timeout, since string + var hostNetwork, keepResources, allImages, withStorage bool + var images []string + var nodeSelector map[string]string + + if args["node_name"] != nil { + nodeName = args["node_name"].(string) + } + + if args["node_selector"] != nil { + nodeSelector = parseNodeSelector(args["node_selector"].(string)) + } + + if args["host_network"] != nil { + hostNetwork = args["host_network"].(bool) + } + + if args["with_storage"] != nil { + withStorage = args["with_storage"].(bool) + } + + sourceDir = defaultGatherSourceDir + if args["source_dir"] != nil { + sourceDir = path.Clean(args["source_dir"].(string)) + } + + namespace = fmt.Sprintf("openshift-must-gather-%s", rand.String(6)) + if args["namespace"] != nil { + namespace = args["namespace"].(string) + } + + if args["keep_resources"] != nil { + keepResources = args["keep_resources"].(bool) + } + + gatherCmd = defaultGatherCmd + if args["gather_command"] != nil { + gatherCmd = args["gather_command"].(string) + } + + if args["all_component_images"] != nil { + allImages = args["all_component_images"].(bool) + } + + if args["images"] != nil { + if imagesArg, ok := args["images"].([]interface{}); ok { + for _, img := range imagesArg { + if imgStr, ok := img.(string); ok { + images = append(images, imgStr) + } + } + } + } + + if allImages { + componentImages, err := getComponentImages(params) + if err != nil { + return api.NewToolCallResult("", + fmt.Errorf("failed to get operator images: %v", err), + ), nil + } + + images = append(images, componentImages...) 
+ } + + if len(images) > maxConcurrentGathers { + return api.NewToolCallResult("", + fmt.Errorf("at most %d gather images are supported", maxConcurrentGathers), + ), nil + } + + if args["timeout"] != nil { + timeout = args["timeout"].(string) + + _, err := time.ParseDuration(timeout) + if err != nil { + return api.NewToolCallResult("", fmt.Errorf("timeout duration is not valid")), nil + } + + gatherCmd = fmt.Sprintf("/usr/bin/timeout %s %s", timeout, gatherCmd) + } + + if args["since"] != nil { + since = args["since"].(string) + + _, err := time.ParseDuration(since) + if err != nil { + return api.NewToolCallResult("", fmt.Errorf("since duration is not valid")), nil + } + } + + envVars := []corev1.EnvVar{} + if since != "" { + envVars = append(envVars, corev1.EnvVar{ + Name: "MUST_GATHER_SINCE", + Value: since, + }) + } + + // template container for gather; + // if multiple images are added, multiple containers will be spun up in the same pod + gatherContainerTemplate := corev1.Container{ + Name: "gather", + Image: defaultMustGatherImage, + ImagePullPolicy: corev1.PullIfNotPresent, + Command: []string{gatherCmd}, + Env: envVars, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "must-gather-output", + MountPath: sourceDir, + }, + }, + } + + var gatherContainers = []corev1.Container{ + *gatherContainerTemplate.DeepCopy(), + } + + if len(images) > 0 { + gatherContainers = make([]corev1.Container, len(images)) + } + + for i, image := range images { + gatherContainers[i] = *gatherContainerTemplate.DeepCopy() + + // if more than one gather container is added, + // suffix each container name with a numeric id + if len(images) > 1 { + gatherContainers[i].Name = fmt.Sprintf("gather-%d", i+1) + } + gatherContainers[i].Image = image + } + + serviceAccountName := "must-gather-collector" + pvcName := fmt.Sprintf("must-gather-pvc-%s", rand.String(6)) + + // Configure volume based on storage type + var volumes []corev1.Volume + if withStorage { + volumes = []corev1.Volume{ + { + Name: "must-gather-output", + VolumeSource: corev1.VolumeSource{ + PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ + ClaimName: pvcName, + }, + }, + }, + } + } else { + volumes = []corev1.Volume{ + { + Name: "must-gather-output", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, + } + } + + pod := &corev1.Pod{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Pod", + }, + ObjectMeta: metav1.ObjectMeta{ + // Avoiding generateName as resources_create_or_update fails without an explicit name. 
+ Name: fmt.Sprintf("must-gather-%s", rand.String(6)), + Namespace: namespace, + }, + Spec: corev1.PodSpec{ + ServiceAccountName: serviceAccountName, + NodeName: nodeName, + PriorityClassName: "system-cluster-critical", + RestartPolicy: corev1.RestartPolicyNever, + Volumes: volumes, + Containers: append(gatherContainers, corev1.Container{ + Name: "wait", + Image: "registry.redhat.io/ubi9/ubi-minimal", + ImagePullPolicy: corev1.PullIfNotPresent, + Command: []string{"/bin/bash", "-c", "sleep infinity"}, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "must-gather-output", + MountPath: "/must-gather", + }, + }, + }), + HostNetwork: hostNetwork, + NodeSelector: nodeSelector, + Tolerations: []corev1.Toleration{ + { + Operator: "Exists", + }, + }, + }, + } + + namespaceExists := false + + _, err := params.ResourcesGet(params, &schema.GroupVersionKind{ + Group: "", + Version: "v1", + Kind: "Namespace", + }, "", namespace) + if err == nil { + namespaceExists = true + } + + var namespaceObj *corev1.Namespace + if !namespaceExists { + namespaceObj = &corev1.Namespace{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Namespace", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: namespace, + }, + } + } + + serviceAccount := &corev1.ServiceAccount{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "ServiceAccount", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: serviceAccountName, + Namespace: namespace, + }, + } + + clusterRoleBindingName := fmt.Sprintf("%s-must-gather-collector", namespace) + clusterRoleBinding := &rbacv1.ClusterRoleBinding{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "rbac.authorization.k8s.io/v1", + Kind: "ClusterRoleBinding", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: clusterRoleBindingName, + }, + RoleRef: rbacv1.RoleRef{ + APIGroup: "rbac.authorization.k8s.io", + Kind: "ClusterRole", + Name: "cluster-admin", + }, + Subjects: []rbacv1.Subject{ + { + Kind: "ServiceAccount", + Name: serviceAccountName, + Namespace: namespace, + }, + }, + } + + // Create PVC if persistent storage is requested + var pvc *corev1.PersistentVolumeClaim + if withStorage { + pvc = &corev1.PersistentVolumeClaim{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "PersistentVolumeClaim", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: pvcName, + Namespace: namespace, + }, + Spec: corev1.PersistentVolumeClaimSpec{ + AccessModes: []corev1.PersistentVolumeAccessMode{ + corev1.ReadWriteOnce, + }, + Resources: corev1.VolumeResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceStorage: resource.MustParse("10Gi"), + }, + }, + }, + } + } + + allowChecks := map[string]struct { + schema.GroupVersionResource + name string + verb string + }{ + "create_namespace": { + GroupVersionResource: schema.GroupVersionResource{Version: "v1", Resource: "namespaces"}, + verb: "create", + }, + "create_serviceaccount": { + GroupVersionResource: schema.GroupVersionResource{Version: "v1", Resource: "serviceaccounts"}, + verb: "create", + }, + "create_clusterrolebinding": { + GroupVersionResource: schema.GroupVersionResource{Group: "rbac.authorization.k8s.io", Version: "v1", Resource: "clusterrolebindings"}, + verb: "create", + }, + "create_pod": { + GroupVersionResource: schema.GroupVersionResource{Version: "v1", Resource: "pods"}, + verb: "create", + }, + "create_persistentvolumeclaim": { + GroupVersionResource: schema.GroupVersionResource{Version: "v1", Resource: "persistentvolumeclaims"}, + verb: "create", + }, + "use_scc_hostnetwork": { + GroupVersionResource: schema.GroupVersionResource{Group: 
"security.openshift.io", Version: "v1", Resource: "securitycontextconstraints"}, + name: "hostnetwork-v2", + verb: "use", + }, + } + isAllowed := make(map[string]bool) + + for k, check := range allowChecks { + isAllowed[k] = params.CanIUse(params, &check.GroupVersionResource, "", check.verb) + } + + var result strings.Builder + result.WriteString("The generated plan contains YAML manifests for must-gather pods and required resources (namespace, serviceaccount, clusterrolebinding") + if withStorage { + result.WriteString(", persistentvolumeclaim). " + + "The data will be stored on a persistent volume (10Gi) and preserved even after pod deletion. ") + } else { + result.WriteString("). ") + } + result.WriteString("Suggest how the user can apply the manifest and copy results locally (`oc cp` / `kubectl cp`). \n\n") + + result.WriteString("Ask the user if they want to apply the plan \n" + + "- use the resource_create_or_update tool to apply the manifest \n" + + "- alternatively, advise the user to execute `oc apply` / `kubectl apply` instead. \n\n", + ) + + if !keepResources { + result.WriteString("Once the must-gather collection is completed, the user may wish to cleanup the created resources. \n" + + "- use the resources_delete tool to delete the namespace and the clusterrolebinding \n" + + "- or, execute cleanup using `kubectl delete`. \n\n") + } + + if !namespaceExists && isAllowed["create_namespace"] { + namespaceYaml, err := yaml.Marshal(namespaceObj) + if err != nil { + return nil, fmt.Errorf("failed to marshal namespace to yaml: %w", err) + } + + result.WriteString("```yaml\n") + result.Write(namespaceYaml) + result.WriteString("```\n\n") + } + + if !namespaceExists && !isAllowed["create_namespace"] { + result.WriteString("WARNING: The resources_create_or_update call does not have permission to create namespace(s).\n") + } + + // yaml(s) are dumped into individual code blocks of ``` ``` + // because resources_create_or_update tool call fails when content has more than one more resource, + // some models are smart to detect an error and retry with one resource a time though. 
+ + serviceAccountYaml, err := yaml.Marshal(serviceAccount) + if err != nil { + return nil, fmt.Errorf("failed to marshal service account to yaml: %w", err) + } + result.WriteString("```yaml\n") + result.Write(serviceAccountYaml) + result.WriteString("```\n\n") + + if !isAllowed["create_serviceaccount"] { + result.WriteString("WARNING: The resources_create_or_update call does not have permission to create serviceaccount(s).\n") + } + + // Output PVC YAML if persistent storage is requested + if withStorage { + pvcYaml, err := yaml.Marshal(pvc) + if err != nil { + return nil, fmt.Errorf("failed to marshal PVC to yaml: %w", err) + } + + result.WriteString("```yaml\n") + result.Write(pvcYaml) + result.WriteString("```\n\n") + + if !isAllowed["create_persistentvolumeclaim"] { + result.WriteString("WARNING: The resources_create_or_update call does not have permission to create persistentvolumeclaim(s).\n") + } + } + + clusterRoleBindingYaml, err := yaml.Marshal(clusterRoleBinding) + if err != nil { + return nil, fmt.Errorf("failed to marshal cluster role binding to yaml: %w", err) + } + + result.WriteString("```yaml\n") + result.Write(clusterRoleBindingYaml) + result.WriteString("```\n\n") + + if !isAllowed["create_clusterrolebinding"] { + result.WriteString("WARNING: The resources_create_or_update call does not have permission to create clusterrolebinding(s).\n") + } + + podYaml, err := yaml.Marshal(pod) + if err != nil { + return nil, fmt.Errorf("failed to marshal pod to yaml: %w", err) + } + + result.WriteString("```yaml\n") + result.Write(podYaml) + result.WriteString("```\n") + + if !isAllowed["create_pod"] { + result.WriteString("WARNING: The resources_create_or_update call does not have permission to create pod(s).\n") + } + + if hostNetwork && !isAllowed["use_scc_hostnetwork"] { + result.WriteString("WARNING: The resources_create_or_update call does not have permission to create pod(s) with hostNetwork: true.\n") + } + + return api.NewToolCallResult(result.String(), nil), nil +} + +func getComponentImages(params api.ToolHandlerParams) ([]string, error) { + var images []string + appendImageFromAnnotation := func(obj runtime.Object) error { + unstruct, err := runtime.DefaultUnstructuredConverter.ToUnstructured(obj) + if err != nil { + return err + } + + u := unstructured.Unstructured{Object: unstruct} + annotations := u.GetAnnotations() + if annotations[mgAnnotation] != "" { + images = append(images, annotations[mgAnnotation]) + } + + return nil + } + + clusterOperatorsList, err := params.ResourcesList(params, &schema.GroupVersionKind{ + Group: "config.openshift.io", + Version: "v1", + Kind: "ClusterOperator", + }, "", internalk8s.ResourceListOptions{}) + if err != nil { + return nil, err + } + + if err := clusterOperatorsList.EachListItem(appendImageFromAnnotation); err != nil { + return images, err + } + + csvList, err := params.ResourcesList(params, &schema.GroupVersionKind{ + Group: "operators.coreos.com", + Version: "v1alpha1", + Kind: "ClusterServiceVersion", + }, "", internalk8s.ResourceListOptions{}) + if err != nil { + return images, err + } + + err = csvList.EachListItem(appendImageFromAnnotation) + return images, err +} + +func parseNodeSelector(selector string) map[string]string { + result := make(map[string]string) + pairs := strings.Split(selector, ",") + for _, pair := range pairs { + kv := strings.SplitN(strings.TrimSpace(pair), "=", 2) + if len(kv) == 2 && strings.TrimSpace(kv[0]) != "" { + result[strings.TrimSpace(kv[0])] = strings.TrimSpace(kv[1]) + } + } + return result 
+} diff --git a/pkg/ocp/nodes/nodes_debug.go b/pkg/ocp/nodes/nodes_debug.go new file mode 100644 index 00000000..f8b7e412 --- /dev/null +++ b/pkg/ocp/nodes/nodes_debug.go @@ -0,0 +1,326 @@ +package nodes + +import ( + "context" + "errors" + "fmt" + "strings" + "time" + + "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" + "github.com/containers/kubernetes-mcp-server/pkg/version" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + utilrand "k8s.io/apimachinery/pkg/util/rand" + "k8s.io/utils/ptr" +) + +const ( + // DefaultNodeDebugImage is the UBI9 toolbox image that provides comprehensive debugging and troubleshooting utilities. + // This image includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), + // process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), debugging tools (gdb), + // and many other utilities commonly needed for node-level debugging and diagnostics. + DefaultNodeDebugImage = "registry.access.redhat.com/ubi9/toolbox:latest" + // NodeDebugContainerName is the name used for the debug container, matching 'oc debug node' defaults. + NodeDebugContainerName = "debug" + // DefaultNodeDebugTimeout is the maximum time to wait for the debug pod to finish executing. + DefaultNodeDebugTimeout = 1 * time.Minute +) + +// NodesDebugExec mimics `oc debug node/<node> -- <command>` by creating a privileged pod on the target +// node, running the provided command, collecting its output, and removing the pod afterwards. +// The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node resources. +// +// When namespace is empty, the configured namespace (or "default" if none) is used. When image is empty, the +// default debug image is used. Timeout controls how long we wait for the pod to complete. +func NodesDebugExec( + ctx context.Context, + k OpenshiftClient, + namespace string, + nodeName string, + image string, + command []string, + timeout time.Duration, +) (string, error) { + if nodeName == "" { + return "", errors.New("node name is required") + } + if len(command) == 0 { + return "", errors.New("command is required") + } + + ns := k.NamespaceOrDefault(namespace) + if ns == "" { + ns = "default" + } + debugImage := image + if debugImage == "" { + debugImage = DefaultNodeDebugImage + } + if timeout <= 0 { + timeout = DefaultNodeDebugTimeout + } + + // Create the debug pod + created, err := createDebugPod(ctx, k, nodeName, ns, debugImage, command) + if err != nil { + return "", err + } + + // Ensure the pod is deleted regardless of completion state. + defer func() { + deleteCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + pods, err := k.AccessControlClientset().Pods(ns) + if err == nil { + _ = pods.Delete(deleteCtx, created.Name, metav1.DeleteOptions{}) + } + }() + + // Poll for debug pod completion + terminated, lastPod, waitMsg, err := pollForCompletion(ctx, k, ns, created.Name, timeout) + if err != nil { + return "", err + } + + // Retrieve the logs + logs, err := retrieveLogs(ctx, k, ns, created.Name) + if err != nil { + return "", err + } + + // Process the results + return processResults(terminated, lastPod, waitMsg, logs) +} + +// createDebugPod creates a privileged pod on the target node to run debug commands. 
+func createDebugPod( + ctx context.Context, + k OpenshiftClient, + nodeName string, + namespace string, + image string, + command []string, +) (*corev1.Pod, error) { + sanitizedNode := sanitizeForName(nodeName) + hostPathType := corev1.HostPathDirectory + + // Generate a unique name + suffix := utilrand.String(5) + maxNodeLen := 63 - len("node-debug-") - 1 - len(suffix) + if maxNodeLen < 1 { + maxNodeLen = 1 + } + if len(sanitizedNode) > maxNodeLen { + sanitizedNode = sanitizedNode[:maxNodeLen] + } + podName := fmt.Sprintf("node-debug-%s-%s", sanitizedNode, suffix) + + debugPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + Namespace: namespace, + Labels: map[string]string{ + kubernetes.AppKubernetesManagedBy: version.BinaryName, + kubernetes.AppKubernetesComponent: "node-debug", + kubernetes.AppKubernetesName: fmt.Sprintf("node-debug-%s", sanitizedNode), + }, + }, + Spec: corev1.PodSpec{ + AutomountServiceAccountToken: ptr.To(false), + HostNetwork: true, + HostPID: true, + HostIPC: true, + NodeName: nodeName, + RestartPolicy: corev1.RestartPolicyNever, + SecurityContext: &corev1.PodSecurityContext{ + RunAsUser: ptr.To[int64](0), + }, + Tolerations: []corev1.Toleration{ + {Operator: corev1.TolerationOpExists}, + {Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoSchedule}, + {Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoExecute}, + }, + Volumes: []corev1.Volume{ + { + Name: "host-root", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/", + Type: &hostPathType, + }, + }, + }, + }, + Containers: []corev1.Container{ + { + Name: NodeDebugContainerName, + Image: image, + ImagePullPolicy: corev1.PullIfNotPresent, + Command: command, + SecurityContext: &corev1.SecurityContext{ + Privileged: ptr.To(true), + RunAsUser: ptr.To[int64](0), + }, + VolumeMounts: []corev1.VolumeMount{ + {Name: "host-root", MountPath: "/host"}, + }, + }, + }, + }, + } + + // Create the pod using AccessControlClientset + pods, err := k.AccessControlClientset().Pods(namespace) + if err != nil { + return nil, fmt.Errorf("failed to get pods interface: %w", err) + } + + created, err := pods.Create(ctx, debugPod, metav1.CreateOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to create debug pod: %w", err) + } + + return created, nil +} + +// pollForCompletion polls the debug pod until it completes or times out. +func pollForCompletion( + ctx context.Context, + k OpenshiftClient, + namespace string, + podName string, + timeout time.Duration, +) (*corev1.ContainerStateTerminated, *corev1.Pod, string, error) { + pollCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + var ( + lastPod *corev1.Pod + terminated *corev1.ContainerStateTerminated + waitMsg string + ) + + for { + // Get pod status using AccessControlClientset + pods, getErr := k.AccessControlClientset().Pods(namespace) + if getErr != nil { + return nil, nil, "", fmt.Errorf("failed to get pods interface: %w", getErr) + } + + current, err := pods.Get(pollCtx, podName, metav1.GetOptions{}) + if err != nil { + return nil, nil, "", fmt.Errorf("failed to get debug pod status: %w", err) + } + lastPod = current + + if status := containerStatusByName(current.Status.ContainerStatuses, NodeDebugContainerName); status != nil { + if status.State.Waiting != nil { + waitMsg = fmt.Sprintf("container waiting: %s", status.State.Waiting.Reason) + // Image pull issues should fail fast. 
+ if status.State.Waiting.Reason == "ErrImagePull" || status.State.Waiting.Reason == "ImagePullBackOff" { + return nil, nil, "", fmt.Errorf("debug container failed to start (%s): %s", status.State.Waiting.Reason, status.State.Waiting.Message) + } + } + if status.State.Terminated != nil { + terminated = status.State.Terminated + break + } + } + + if current.Status.Phase == corev1.PodFailed { + break + } + + // Wait for the next tick interval before checking pod status again, or timeout if context is done. + select { + case <-pollCtx.Done(): + return nil, nil, "", fmt.Errorf("timed out waiting for debug pod %s to complete: %w", podName, pollCtx.Err()) + case <-ticker.C: + } + } + + return terminated, lastPod, waitMsg, nil +} + +// retrieveLogs retrieves the logs from the debug pod. +func retrieveLogs(ctx context.Context, k OpenshiftClient, namespace, podName string) (string, error) { + logCtx, logCancel := context.WithTimeout(ctx, 30*time.Second) + defer logCancel() + logs, logErr := k.PodsLog(logCtx, namespace, podName, NodeDebugContainerName, false, 0) + if logErr != nil { + return "", fmt.Errorf("failed to retrieve debug pod logs: %w", logErr) + } + return strings.TrimSpace(logs), nil +} + +// processResults processes the debug pod completion status and returns the appropriate result. +func processResults(terminated *corev1.ContainerStateTerminated, lastPod *corev1.Pod, waitMsg, logs string) (string, error) { + if terminated != nil { + if terminated.ExitCode != 0 { + errMsg := fmt.Sprintf("command exited with code %d", terminated.ExitCode) + if terminated.Reason != "" { + errMsg = fmt.Sprintf("%s (%s)", errMsg, terminated.Reason) + } + if terminated.Message != "" { + errMsg = fmt.Sprintf("%s: %s", errMsg, terminated.Message) + } + if logs != "" { + errMsg = fmt.Sprintf("%s\nOutput:\n%s", errMsg, logs) + } + return "", errors.New(errMsg) + } + return logs, nil + } + + if lastPod != nil && lastPod.Status.Reason != "" { + if logs != "" { + return "", fmt.Errorf("debug pod failed: %s\nOutput:\n%s", lastPod.Status.Reason, logs) + } + return "", fmt.Errorf("debug pod failed: %s", lastPod.Status.Reason) + } + if waitMsg != "" { + if logs != "" { + return "", fmt.Errorf("debug container did not complete: %s\nOutput:\n%s", waitMsg, logs) + } + return "", fmt.Errorf("debug container did not complete: %s", waitMsg) + } + if logs != "" { + return "", fmt.Errorf("debug container did not reach a terminal state\nOutput:\n%s", logs) + } + return "", errors.New("debug container did not reach a terminal state") +} + +func sanitizeForName(name string) string { + lower := strings.ToLower(name) + var b strings.Builder + b.Grow(len(lower)) + for _, r := range lower { + if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '-' { + b.WriteRune(r) + continue + } + b.WriteRune('-') + } + sanitized := strings.Trim(b.String(), "-") + if sanitized == "" { + sanitized = "node" + } + if len(sanitized) > 40 { + sanitized = sanitized[:40] + } + return sanitized +} + +func containerStatusByName(statuses []corev1.ContainerStatus, name string) *corev1.ContainerStatus { + for idx := range statuses { + if statuses[idx].Name == name { + return &statuses[idx] + } + } + return nil +} diff --git a/pkg/ocp/nodes/nodes_debug_test.go b/pkg/ocp/nodes/nodes_debug_test.go new file mode 100644 index 00000000..8fb415f5 --- /dev/null +++ b/pkg/ocp/nodes/nodes_debug_test.go @@ -0,0 +1,429 @@ +package nodes + +import ( + "context" + "fmt" + "strings" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" +) + +func 
TestNodesDebugExecCreatesPrivilegedPod(t *testing.T) { + env := NewNodeDebugTestEnv(t) + env.Pods.Logs = "kernel 6.8" + + out, err := NodesDebugExec(context.Background(), env.Kubernetes, "", "worker-0", "", []string{"uname", "-a"}, 2*time.Minute) + if err != nil { + t.Fatalf("NodesDebugExec returned error: %v", err) + } + if out != "kernel 6.8" { + t.Fatalf("unexpected command output: %q", out) + } + + created := env.Pods.Created + if created == nil { + t.Fatalf("expected debug pod to be created") + } + if created.Namespace != "default" { + t.Fatalf("expected default namespace fallback, got %q", created.Namespace) + } + if created.Spec.NodeName != "worker-0" { + t.Fatalf("expected pod to target node worker-0, got %q", created.Spec.NodeName) + } + if !env.Pods.Deleted { + t.Fatalf("expected debug pod to be deleted after execution") + } + + if len(created.Spec.Containers) != 1 { + t.Fatalf("expected single container in debug pod") + } + container := created.Spec.Containers[0] + if container.Image != DefaultNodeDebugImage { + t.Fatalf("expected default image %q, got %q", DefaultNodeDebugImage, container.Image) + } + expectedCommand := []string{"uname", "-a"} + if len(container.Command) != len(expectedCommand) { + t.Fatalf("unexpected command length, got %v", container.Command) + } + for i, part := range expectedCommand { + if container.Command[i] != part { + t.Fatalf("command[%d] = %q, expected %q", i, container.Command[i], part) + } + } + if container.SecurityContext == nil || container.SecurityContext.Privileged == nil || !*container.SecurityContext.Privileged { + t.Fatalf("expected container to run privileged") + } + if len(container.VolumeMounts) != 1 || container.VolumeMounts[0].MountPath != "/host" { + t.Fatalf("expected container to mount host root at /host") + } + + if created.Spec.SecurityContext == nil || created.Spec.SecurityContext.RunAsUser == nil || *created.Spec.SecurityContext.RunAsUser != 0 { + t.Fatalf("expected pod security context to run as root") + } + + if len(created.Spec.Volumes) != 1 || created.Spec.Volumes[0].HostPath == nil { + t.Fatalf("expected host root volume to be configured") + } +} + +func TestNodesDebugExecReturnsErrorForNonZeroExit(t *testing.T) { + env := NewNodeDebugTestEnv(t) + env.Pods.ExitCode = 5 + env.Pods.TerminatedReason = "Error" + env.Pods.TerminatedMessage = "some failure" + env.Pods.Logs = "bad things happened" + + out, err := NodesDebugExec(context.Background(), env.Kubernetes, "debug-ns", "infra-node", "registry.example/custom:latest", []string{"journalctl", "-xe"}, time.Minute) + if err == nil { + t.Fatalf("expected error for non-zero exit code") + } + // Logs should be included in the error message + if !strings.Contains(err.Error(), "bad things happened") { + t.Fatalf("expected error to contain logs, got: %v", err) + } + if !strings.Contains(err.Error(), "command exited with code 5") { + t.Fatalf("expected error to contain exit code, got: %v", err) + } + if out != "" { + t.Fatalf("expected empty output on error, got %q", out) + } + + created := env.Pods.Created + if created == nil { + t.Fatalf("expected pod to be created") + } + if created.Namespace != "debug-ns" { + t.Fatalf("expected provided namespace to be used, got %q", created.Namespace) + } + if containerImage := created.Spec.Containers[0].Image; containerImage != "registry.example/custom:latest" { + t.Fatalf("expected custom image to be used, got %q", containerImage) + } +} + +func TestCreateDebugPod(t *testing.T) { + env := NewNodeDebugTestEnv(t) + + created, err := 
createDebugPod(context.Background(), env.Kubernetes, "worker-1", "test-ns", "custom:v1", []string{"ls", "-la"}) + if err != nil { + t.Fatalf("createDebugPod failed: %v", err) + } + if created == nil { + t.Fatalf("expected pod to be created") + } + if created.Namespace != "test-ns" { + t.Fatalf("expected namespace test-ns, got %q", created.Namespace) + } + if created.Spec.NodeName != "worker-1" { + t.Fatalf("expected node worker-1, got %q", created.Spec.NodeName) + } + if !strings.HasPrefix(created.Name, "node-debug-worker-1-") { + t.Fatalf("unexpected pod name: %q", created.Name) + } + if len(created.Name) > 63 { + t.Fatalf("pod name exceeds DNS label length: %d characters", len(created.Name)) + } + if len(created.Spec.Containers) != 1 { + t.Fatalf("expected 1 container, got %d", len(created.Spec.Containers)) + } + container := created.Spec.Containers[0] + if container.Image != "custom:v1" { + t.Fatalf("expected image custom:v1, got %q", container.Image) + } + expectedCmd := []string{"ls", "-la"} + if len(container.Command) != len(expectedCmd) { + t.Fatalf("expected %d command parts, got %d", len(expectedCmd), len(container.Command)) + } + for i, part := range expectedCmd { + if container.Command[i] != part { + t.Fatalf("command[%d] = %q, expected %q", i, container.Command[i], part) + } + } + if container.SecurityContext == nil || !*container.SecurityContext.Privileged { + t.Fatalf("expected privileged container") + } +} + +func TestPollForCompletion(t *testing.T) { + tests := []struct { + name string + exitCode int32 + terminatedReason string + waitingReason string + waitingMessage string + expectError bool + expectTerminated bool + errorContains []string + expectedExitCode int32 + expectedReason string + }{ + { + name: "successful completion", + exitCode: 0, + expectTerminated: true, + expectedExitCode: 0, + }, + { + name: "non-zero exit code", + exitCode: 42, + terminatedReason: "Error", + expectTerminated: true, + expectedExitCode: 42, + expectedReason: "Error", + }, + { + name: "image pull error", + waitingReason: "ErrImagePull", + waitingMessage: "image not found", + expectError: true, + errorContains: []string{"ErrImagePull", "image not found"}, + }, + { + name: "image pull backoff", + waitingReason: "ImagePullBackOff", + waitingMessage: "back-off pulling image", + expectError: true, + errorContains: []string{"ImagePullBackOff", "back-off pulling image"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + env := NewNodeDebugTestEnv(t) + env.Pods.ExitCode = tt.exitCode + env.Pods.TerminatedReason = tt.terminatedReason + env.Pods.WaitingReason = tt.waitingReason + env.Pods.WaitingMessage = tt.waitingMessage + + created, _ := createDebugPod(context.Background(), env.Kubernetes, "node-1", "default", DefaultNodeDebugImage, []string{"echo", "test"}) + + terminated, lastPod, waitMsg, err := pollForCompletion(context.Background(), env.Kubernetes, "default", created.Name, time.Minute) + + if tt.expectError { + if err == nil { + t.Fatalf("expected error but got none") + } + for _, substr := range tt.errorContains { + if !strings.Contains(err.Error(), substr) { + t.Fatalf("expected error to contain %q, got: %v", substr, err) + } + } + return + } + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if tt.expectTerminated { + if terminated == nil { + t.Fatalf("expected terminated state") + } + if terminated.ExitCode != tt.expectedExitCode { + t.Fatalf("expected exit code %d, got %d", tt.expectedExitCode, terminated.ExitCode) + } + if tt.expectedReason 
!= "" && terminated.Reason != tt.expectedReason { + t.Fatalf("expected reason %q, got %q", tt.expectedReason, terminated.Reason) + } + if lastPod == nil { + t.Fatalf("expected lastPod to be set") + } + } + + if tt.waitingReason == "" && waitMsg != "" { + t.Fatalf("expected no wait message, got %q", waitMsg) + } + }) + } +} + +func TestRetrieveLogs(t *testing.T) { + env := NewNodeDebugTestEnv(t) + env.Pods.Logs = " some output with whitespace \n" + + created, _ := createDebugPod(context.Background(), env.Kubernetes, "node-1", "default", DefaultNodeDebugImage, []string{"echo", "test"}) + + logs, err := retrieveLogs(context.Background(), env.Kubernetes, "default", created.Name) + if err != nil { + t.Fatalf("retrieveLogs failed: %v", err) + } + if logs != "some output with whitespace" { + t.Fatalf("expected trimmed logs, got %q", logs) + } +} + +func TestProcessResults(t *testing.T) { + tests := []struct { + name string + terminated *corev1.ContainerStateTerminated + pod *corev1.Pod + waitMsg string + logs string + expectError bool + errorContains []string + expectLogs bool + expectedResult string + }{ + { + name: "successful completion", + terminated: &corev1.ContainerStateTerminated{ + ExitCode: 0, + }, + logs: "success output", + expectError: false, + expectLogs: true, + expectedResult: "success output", + }, + { + name: "non-zero exit code with logs", + terminated: &corev1.ContainerStateTerminated{ + ExitCode: 127, + Reason: "CommandNotFound", + Message: "command not found", + }, + logs: "error logs", + expectError: true, + errorContains: []string{"127", "CommandNotFound", "command not found", "error logs", "Output:"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "non-zero exit code without reason or message but with logs", + terminated: &corev1.ContainerStateTerminated{ + ExitCode: 1, + }, + logs: "failed output", + expectError: true, + errorContains: []string{"command exited with code 1", "failed output", "Output:"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "non-zero exit code without logs", + terminated: &corev1.ContainerStateTerminated{ + ExitCode: 1, + }, + logs: "", + expectError: true, + errorContains: []string{"command exited with code 1"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "pod failed with logs", + pod: &corev1.Pod{ + Status: corev1.PodStatus{ + Reason: "Evicted", + }, + }, + logs: "pod evicted logs", + expectError: true, + errorContains: []string{"Evicted", "pod evicted logs", "Output:"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "pod failed without logs", + pod: &corev1.Pod{ + Status: corev1.PodStatus{ + Reason: "Evicted", + }, + }, + logs: "", + expectError: true, + errorContains: []string{"Evicted"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "container waiting with logs", + waitMsg: "container waiting: ImagePullBackOff", + logs: "waiting logs", + expectError: true, + errorContains: []string{"did not complete", "waiting logs", "Output:"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "container waiting without logs", + waitMsg: "container waiting: ImagePullBackOff", + logs: "", + expectError: true, + errorContains: []string{"did not complete"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "no terminal state with logs", + logs: "incomplete logs", + expectError: true, + errorContains: []string{"did not reach a terminal state", "incomplete logs", "Output:"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "no terminal state without logs", + logs: 
"", + expectError: true, + errorContains: []string{"did not reach a terminal state"}, + expectLogs: false, + expectedResult: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := processResults(tt.terminated, tt.pod, tt.waitMsg, tt.logs) + + if tt.expectError { + if err == nil { + t.Fatalf("expected error but got none") + } + for _, substr := range tt.errorContains { + if !strings.Contains(err.Error(), substr) { + t.Fatalf("expected error to contain %q, got: %v", substr, err) + } + } + // Verify logs are NOT in the result when there's an error + if result != "" { + t.Fatalf("expected empty result on error, got %q", result) + } + } else { + if err != nil { + t.Fatalf("expected no error, got: %v", err) + } + // Verify logs ARE in the result when successful + if result != tt.expectedResult { + t.Fatalf("expected result %q, got %q", tt.expectedResult, result) + } + } + }) + } +} + +func TestSanitizeForName(t *testing.T) { + tests := []struct { + input string + expected string + }{ + {"worker-0", "worker-0"}, + {"WORKER-0", "worker-0"}, + {"worker.0", "worker-0"}, + {"worker_0", "worker-0"}, + {"ip-10-0-1-42.ec2.internal", "ip-10-0-1-42-ec2-internal"}, + {"", "node"}, + {"---", "node"}, + {strings.Repeat("a", 50), strings.Repeat("a", 40)}, + {"Worker-Node_123.domain", "worker-node-123-domain"}, + } + + for _, tt := range tests { + t.Run(fmt.Sprintf("sanitize(%q)", tt.input), func(t *testing.T) { + result := sanitizeForName(tt.input) + if result != tt.expected { + t.Fatalf("expected %q, got %q", tt.expected, result) + } + }) + } +} diff --git a/pkg/ocp/nodes/ocp_client.go b/pkg/ocp/nodes/ocp_client.go new file mode 100644 index 00000000..09644736 --- /dev/null +++ b/pkg/ocp/nodes/ocp_client.go @@ -0,0 +1,44 @@ +package nodes + +import ( + "context" + + "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" + + corev1client "k8s.io/client-go/kubernetes/typed/core/v1" +) + +// ClientsetInterface defines the interface for access-controlled clientset operations. +// This allows code to work with kubernetes.AccessControlClientset through an interface, +// making it easier to test and decouple from the concrete implementation. +type ClientsetInterface interface { + Pods(namespace string) (corev1client.PodInterface, error) +} + +// OpenshiftClient defines a minimal interface for kubernetes operations commonly needed +// by OCP toolsets. This allows for easier testing and decoupling from the concrete +// kubernetes.Kubernetes type. +type OpenshiftClient interface { + NamespaceOrDefault(namespace string) string + AccessControlClientset() ClientsetInterface + PodsLog(ctx context.Context, namespace, name, container string, previous bool, tail int64) (string, error) +} + +// OpenshiftClientAdapter adapts kubernetes.Kubernetes to implement OpenshiftClient. +// This allows production code to use the concrete *kubernetes.Kubernetes type +// while tests can use a mock implementation. +type OpenshiftClientAdapter struct { + *kubernetes.Kubernetes +} + +// NewOpenshiftClient creates a new adapter that wraps kubernetes.Kubernetes +// to implement the OpenshiftClient interface. +func NewOpenshiftClient(k *kubernetes.Kubernetes) *OpenshiftClientAdapter { + return &OpenshiftClientAdapter{Kubernetes: k} +} + +// AccessControlClientset returns the access control clientset as an interface. +// This satisfies the OpenshiftClient interface. 
+func (c *OpenshiftClientAdapter) AccessControlClientset() ClientsetInterface { + return c.Kubernetes.AccessControlClientset() +} diff --git a/pkg/ocp/nodes/testhelpers.go b/pkg/ocp/nodes/testhelpers.go new file mode 100644 index 00000000..7ab6a0fd --- /dev/null +++ b/pkg/ocp/nodes/testhelpers.go @@ -0,0 +1,163 @@ +package nodes + +import ( + "context" + "fmt" + "io" + "net/http" + "net/url" + "strings" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + schemek8s "k8s.io/client-go/kubernetes/scheme" + corev1client "k8s.io/client-go/kubernetes/typed/core/v1" + restclient "k8s.io/client-go/rest" +) + +// NodeDebugTestEnv bundles a test Kubernetes client with a controllable pods client for tests. +type NodeDebugTestEnv struct { + Kubernetes *FakeKubernetesClient + Pods *FakePodInterface +} + +// NewNodeDebugTestEnv constructs a testing harness for exercising NodesDebugExec. +func NewNodeDebugTestEnv(t *testing.T) *NodeDebugTestEnv { + t.Helper() + + podsClient := &FakePodInterface{} + fakeK8s := &FakeKubernetesClient{ + pods: podsClient, + namespace: "default", + } + + return &NodeDebugTestEnv{ + Kubernetes: fakeK8s, + Pods: podsClient, + } +} + +// FakeKubernetesClient implements the OpenshiftClient interface for testing +type FakeKubernetesClient struct { + pods *FakePodInterface + namespace string +} + +// AccessControlClientset returns a fake clientset for testing +func (f *FakeKubernetesClient) AccessControlClientset() ClientsetInterface { + return &FakeAccessControlClientset{pods: f.pods} +} + +func (f *FakeKubernetesClient) NamespaceOrDefault(namespace string) string { + if namespace == "" { + return f.namespace + } + return namespace +} + +func (f *FakeKubernetesClient) PodsLog(ctx context.Context, namespace, name, container string, previous bool, tail int64) (string, error) { + req := f.pods.GetLogs(name, &corev1.PodLogOptions{Container: container, Previous: previous}) + res := req.Do(ctx) + if res.Error() != nil { + return "", res.Error() + } + rawData, err := res.Raw() + if err != nil { + return "", err + } + return string(rawData), nil +} + +// FakeAccessControlClientset mimics kubernetes.AccessControlClientset for testing +type FakeAccessControlClientset struct { + pods *FakePodInterface +} + +func (f *FakeAccessControlClientset) Pods(namespace string) (corev1client.PodInterface, error) { + return f.pods, nil +} + +// FakePodInterface implements corev1client.PodInterface with deterministic behaviour for tests. 
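+// Embedding the interface satisfies corev1client.PodInterface at compile time
+// while only Create, Get, Delete, and GetLogs are overridden below; calling any
+// other pod method would panic on the nil embedded value, which is acceptable
+// in these tests.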
+type FakePodInterface struct {
+	corev1client.PodInterface
+	Created           *corev1.Pod
+	Deleted           bool
+	ExitCode          int32
+	TerminatedReason  string
+	TerminatedMessage string
+	WaitingReason     string
+	WaitingMessage    string
+	Logs              string
+}
+
+func (f *FakePodInterface) Create(ctx context.Context, pod *corev1.Pod, opts metav1.CreateOptions) (*corev1.Pod, error) {
+	podCopy := pod.DeepCopy() // named to avoid shadowing the builtin copy
+	if podCopy.Name == "" && podCopy.GenerateName != "" {
+		podCopy.Name = podCopy.GenerateName + "test"
+	}
+	f.Created = podCopy
+	return podCopy.DeepCopy(), nil
+}
+
+func (f *FakePodInterface) Get(ctx context.Context, name string, opts metav1.GetOptions) (*corev1.Pod, error) {
+	if f.Created == nil {
+		return nil, fmt.Errorf("pod not created yet")
+	}
+	pod := f.Created.DeepCopy()
+
+	// If waiting state is set, return that instead of terminated
+	if f.WaitingReason != "" {
+		waiting := &corev1.ContainerStateWaiting{Reason: f.WaitingReason}
+		if f.WaitingMessage != "" {
+			waiting.Message = f.WaitingMessage
+		}
+		pod.Status.ContainerStatuses = []corev1.ContainerStatus{{
+			Name:  NodeDebugContainerName,
+			State: corev1.ContainerState{Waiting: waiting},
+		}}
+		pod.Status.Phase = corev1.PodPending
+		return pod, nil
+	}
+
+	// Otherwise return terminated state
+	terminated := &corev1.ContainerStateTerminated{ExitCode: f.ExitCode}
+	if f.TerminatedReason != "" {
+		terminated.Reason = f.TerminatedReason
+	}
+	if f.TerminatedMessage != "" {
+		terminated.Message = f.TerminatedMessage
+	}
+	pod.Status.ContainerStatuses = []corev1.ContainerStatus{{
+		Name:  NodeDebugContainerName,
+		State: corev1.ContainerState{Terminated: terminated},
+	}}
+	pod.Status.Phase = corev1.PodSucceeded
+	return pod, nil
+}
+
+func (f *FakePodInterface) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error {
+	f.Deleted = true
+	return nil
+}
+
+func (f *FakePodInterface) GetLogs(name string, opts *corev1.PodLogOptions) *restclient.Request {
+	body := io.NopCloser(strings.NewReader(f.Logs))
+	client := &http.Client{Transport: roundTripperFunc(func(*http.Request) (*http.Response, error) {
+		return &http.Response{StatusCode: http.StatusOK, Body: body}, nil
+	})}
+	content := restclient.ClientContentConfig{
+		ContentType:  runtime.ContentTypeJSON,
+		GroupVersion: schema.GroupVersion{Version: "v1"},
+		Negotiator:   runtime.NewClientNegotiator(schemek8s.Codecs.WithoutConversion(), schema.GroupVersion{Version: "v1"}),
+	}
+	return restclient.NewRequestWithClient(&url.URL{Scheme: "https", Host: "localhost"}, "", content, client).Verb("GET")
+}
+
+type roundTripperFunc func(*http.Request) (*http.Response, error)
+
+func (f roundTripperFunc) RoundTrip(req *http.Request) (*http.Response, error) {
+	return f(req)
+}
diff --git a/pkg/toolsets/openshift/mustgather/mustgather.go b/pkg/toolsets/openshift/mustgather/mustgather.go
new file mode 100644
index 00000000..680d2ada
--- /dev/null
+++ b/pkg/toolsets/openshift/mustgather/mustgather.go
@@ -0,0 +1,85 @@
+package mustgather
+
+import (
+	"github.com/google/jsonschema-go/jsonschema"
+	"k8s.io/utils/ptr"
+
+	"github.com/containers/kubernetes-mcp-server/pkg/api"
+	"github.com/containers/kubernetes-mcp-server/pkg/ocp/mustgather"
+)
+
+func MustGatherTools() []api.ServerTool {
+	return []api.ServerTool{{
+		Tool: api.Tool{
+			Name:        "plan_mustgather",
+			Description: "Plan the collection of a must-gather archive from an OpenShift cluster. must-gather is a tool that collects cluster data for debugging and troubleshooting, such as logs and Kubernetes resources.",
+			InputSchema: &jsonschema.Schema{
+				Type: "object",
+				Properties: map[string]*jsonschema.Schema{
+					"node_name": {
+						Type:        "string",
+						Description: "Optional node to run the must-gather pod on. If not provided, a random control-plane node is selected automatically",
+					},
+					"node_selector": {
+						Type:        "string",
+						Description: "Optional node label selector. Only relevant when the specified command and image need to capture data on a set of cluster nodes simultaneously",
+					},
+					"host_network": {
+						Type:        "boolean",
+						Description: "Optionally run the must-gather pods in the host network of the node. Only relevant if a specific gather image needs to capture host-level data",
+					},
+					"gather_command": {
+						Type:        "string",
+						Description: "Optionally specify a custom gather command to run a specialized script, e.g. /usr/bin/gather_audit_logs",
+						Default:     api.ToRawMessage("/usr/bin/gather"),
+					},
+					"all_component_images": {
+						Type:        "boolean",
+						Description: "Optional. When enabled, runs separate must-gathers for every operator and component on the cluster that has an annotated must-gather image available",
+					},
+					"images": {
+						Type:        "array",
+						Description: "Optional list of images to use for gathering custom information about specific operators or cluster components. If not specified, OpenShift's default must-gather image is used",
+						Items: &jsonschema.Schema{
+							Type: "string",
+						},
+					},
+					"source_dir": {
+						Type:        "string",
+						Description: "Optional directory from which the pod copies the gathered data",
+						Default:     api.ToRawMessage("/must-gather"),
+					},
+					"timeout": {
+						Type:        "string",
+						Description: "Timeout for the gather process, e.g. 30s, 6m20s, or 2h10m30s",
+					},
+					"namespace": {
+						Type:        "string",
+						Description: "Optional existing privileged namespace where the must-gather pods should run. If not provided, a temporary namespace is created",
+					},
+					"keep_resources": {
+						Type:        "boolean",
+						Description: "Optional. When enabled, retains all temporary resources after the must-gather completes; otherwise the caller is advised to clean up the temporary resources that were created",
+					},
+					"since": {
+						Type:        "string",
+						Description: "Optional. Only collect logs newer than a relative duration like 5s, 2m5s, or 3h6m10s. If unspecified, all available logs are collected",
+					},
+					"with_storage": {
+						Type:        "boolean",
+						Description: "Optional. Persist the collected must-gather data to a storage volume",
+					},
+				},
+			},
+			Annotations: api.ToolAnnotations{
+				Title:           "MustGather: Plan",
+				ReadOnlyHint:    ptr.To(true),
+				DestructiveHint: ptr.To(false),
+				IdempotentHint:  ptr.To(false),
+				OpenWorldHint:   ptr.To(true),
+			},
+		},
+
+		Handler: mustgather.PlanMustGather,
+	}}
+}
diff --git a/pkg/toolsets/openshift/nodes/nodes.go b/pkg/toolsets/openshift/nodes/nodes.go
new file mode 100644
index 00000000..b4a23e02
--- /dev/null
+++ b/pkg/toolsets/openshift/nodes/nodes.go
@@ -0,0 +1,126 @@
+package nodes
+
+import (
+	"errors"
+	"fmt"
+	"time"
+
+	"github.com/google/jsonschema-go/jsonschema"
+	"k8s.io/utils/ptr"
+
+	"github.com/containers/kubernetes-mcp-server/pkg/api"
+	"github.com/containers/kubernetes-mcp-server/pkg/ocp/nodes"
+)
+
+func NodeTools() []api.ServerTool {
+	return []api.ServerTool{
+		{
+			Tool: api.Tool{
+				Name:        "nodes_debug_exec",
+				Description: "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. 
The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node-level resources. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.", + InputSchema: &jsonschema.Schema{ + Type: "object", + Properties: map[string]*jsonschema.Schema{ + "node": { + Type: "string", + Description: "Name of the node to debug (e.g. worker-0).", + }, + "command": { + Type: "array", + Description: "Command to execute on the node. All standard debugging utilities from the UBI9 toolbox are available. The host filesystem is mounted at /host - use 'chroot /host ' to access node-level resources, or run commands directly in the toolbox environment. Provide each argument as a separate array item (e.g. ['chroot', '/host', 'systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).", + Items: &jsonschema.Schema{Type: "string"}, + }, + "namespace": { + Type: "string", + Description: "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').", + }, + "image": { + Type: "string", + Description: "Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.", + }, + "timeout_seconds": { + Type: "integer", + Description: "Maximum time to wait for the command to complete before timing out (optional, defaults to 60 seconds).", + Minimum: ptr.To(float64(1)), + }, + }, + Required: []string{"node", "command"}, + }, + Annotations: api.ToolAnnotations{ + Title: "Nodes: Debug Exec", + ReadOnlyHint: ptr.To(false), + DestructiveHint: ptr.To(true), + IdempotentHint: ptr.To(false), + OpenWorldHint: ptr.To(true), + }, + }, + Handler: nodesDebugExec, + }, + } +} + +func nodesDebugExec(params api.ToolHandlerParams) (*api.ToolCallResult, error) { + nodeArg := params.GetArguments()["node"] + nodeName, ok := nodeArg.(string) + if nodeArg == nil || !ok || nodeName == "" { + return api.NewToolCallResult("", errors.New("missing required argument: node")), nil + } + + commandArg := params.GetArguments()["command"] + command, err := toStringSlice(commandArg) + if err != nil { + return api.NewToolCallResult("", fmt.Errorf("invalid command argument: %w", err)), nil + } + + namespace := "" + if nsArg, ok := params.GetArguments()["namespace"].(string); ok { + namespace = nsArg + } + + image := "" + if imageArg, ok := params.GetArguments()["image"].(string); ok { + image = imageArg + } + + var timeout time.Duration + if timeoutRaw, exists := params.GetArguments()["timeout_seconds"]; exists && timeoutRaw != nil { + switch v := timeoutRaw.(type) { + case float64: + timeout = time.Duration(int64(v)) * time.Second + case int: + timeout = time.Duration(v) * time.Second + case int64: + timeout = time.Duration(v) * time.Second + default: + return api.NewToolCallResult("", errors.New("timeout_seconds must be a numeric value")), nil + } + } + + output, execErr := nodes.NodesDebugExec(params.Context, nodes.NewOpenshiftClient(params.Kubernetes), namespace, nodeName, image, command, timeout) + if output == "" && execErr == nil { + output = fmt.Sprintf("Command executed successfully on node %s but produced no output.", nodeName) + } 
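+	// Execution failures are reported in-band on the tool result (mirroring
+	// the validation paths above), so the MCP client receives the error text
+	// rather than a protocol-level handler error.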
+ return api.NewToolCallResult(output, execErr), nil +} + +func toStringSlice(arg any) ([]string, error) { + if arg == nil { + return nil, errors.New("command is required") + } + raw, ok := arg.([]interface{}) + if !ok { + return nil, errors.New("command must be an array of strings") + } + if len(raw) == 0 { + return nil, errors.New("command array cannot be empty") + } + command := make([]string, 0, len(raw)) + for _, item := range raw { + str, ok := item.(string) + if !ok { + return nil, errors.New("command items must be strings") + } + command = append(command, str) + } + return command, nil +} diff --git a/pkg/toolsets/openshift/nodes/nodes_test.go b/pkg/toolsets/openshift/nodes/nodes_test.go new file mode 100644 index 00000000..45929f9a --- /dev/null +++ b/pkg/toolsets/openshift/nodes/nodes_test.go @@ -0,0 +1,95 @@ +package nodes + +import ( + "context" + "testing" + "time" + + "github.com/containers/kubernetes-mcp-server/pkg/api" + "github.com/containers/kubernetes-mcp-server/pkg/ocp/nodes" +) + +type staticRequest struct { + args map[string]any +} + +func (s staticRequest) GetArguments() map[string]any { + return s.args +} + +func TestNodesDebugExecHandlerValidatesInput(t *testing.T) { + t.Run("missing node", func(t *testing.T) { + params := api.ToolHandlerParams{ + Context: context.Background(), + ToolCallRequest: staticRequest{args: map[string]any{}}, + } + result, err := nodesDebugExec(params) + if err != nil { + t.Fatalf("handler returned error: %v", err) + } + if result.Error == nil || result.Error.Error() != "missing required argument: node" { + t.Fatalf("unexpected error: %v", result.Error) + } + }) + + t.Run("invalid command type", func(t *testing.T) { + params := api.ToolHandlerParams{ + Context: context.Background(), + ToolCallRequest: staticRequest{args: map[string]any{ + "node": "worker-0", + "command": "ls -la", + }}, + } + result, err := nodesDebugExec(params) + if err != nil { + t.Fatalf("handler returned error: %v", err) + } + if result.Error == nil || result.Error.Error() != "invalid command argument: command must be an array of strings" { + t.Fatalf("unexpected error: %v", result.Error) + } + }) +} + +func TestNodesDebugExecHandlerExecutesCommand(t *testing.T) { + env := nodes.NewNodeDebugTestEnv(t) + env.Pods.Logs = "done" + + // Call NodesDebugExec directly instead of going through the handler + // This avoids the need to mock the full kubernetes.Kubernetes type + output, err := nodes.NodesDebugExec( + context.Background(), + env.Kubernetes, + "debug", + "infra-node", + "registry.local/debug:latest", + []string{"systemctl", "status", "kubelet"}, + 15*time.Second, + ) + + if err != nil { + t.Fatalf("NodesDebugExec returned error: %v", err) + } + if output != "done" { + t.Fatalf("unexpected output: %q", output) + } + + created := env.Pods.Created + if created == nil { + t.Fatalf("expected pod creation") + } + if created.Namespace != "debug" { + t.Fatalf("expected namespace override, got %q", created.Namespace) + } + if created.Spec.Containers[0].Image != "registry.local/debug:latest" { + t.Fatalf("expected custom image, got %q", created.Spec.Containers[0].Image) + } + expectedCommand := []string{"systemctl", "status", "kubelet"} + if len(created.Spec.Containers[0].Command) != len(expectedCommand) { + t.Fatalf("unexpected command length: %v", created.Spec.Containers[0].Command) + } + for i, part := range expectedCommand { + if created.Spec.Containers[0].Command[i] != part { + t.Fatalf("command[%d]=%q expected %q", i, created.Spec.Containers[0].Command[i], part) + } + 
} +} diff --git a/pkg/toolsets/openshift/toolset.go b/pkg/toolsets/openshift/toolset.go new file mode 100644 index 00000000..999825d5 --- /dev/null +++ b/pkg/toolsets/openshift/toolset.go @@ -0,0 +1,34 @@ +package openshift + +import ( + "slices" + + "github.com/containers/kubernetes-mcp-server/pkg/api" + internalk8s "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" + "github.com/containers/kubernetes-mcp-server/pkg/toolsets" + "github.com/containers/kubernetes-mcp-server/pkg/toolsets/openshift/mustgather" + "github.com/containers/kubernetes-mcp-server/pkg/toolsets/openshift/nodes" +) + +type Toolset struct{} + +var _ api.Toolset = (*Toolset)(nil) + +func (t *Toolset) GetName() string { + return "openshift-core" +} + +func (t *Toolset) GetDescription() string { + return "Core OpenShift-specific tools (must-gather, Node debugging, etc.)" +} + +func (t *Toolset) GetTools(o internalk8s.Openshift) []api.ServerTool { + return slices.Concat( + mustgather.MustGatherTools(), + nodes.NodeTools(), + ) +} + +func init() { + toolsets.Register(&Toolset{}) +}