Skip to content

Commit fd35ad2

Browse files
Merge branch 'main' into feat/mcp-hub-hybrid-intelligence
2 parents 0d42ef1 + be3fde2 commit fd35ad2

37 files changed

+4228
-665
lines changed

cmd/mcp-pods/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ func main() {
5151
}
5252
}()
5353

54-
telemetry.Info("mcp-pods ready", "tools", []string{"inspect_pods", "describe_pod", "list_pod_events"})
54+
telemetry.Info("mcp-pods ready", "tools", []string{"inspect_pods", "describe_pod", "list_pod_events", "get_pod_logs", "delete_pod"})
5555

5656
sig := <-sigChan
5757
telemetry.Info("received signal, shutting down", "signal", sig.String())

cmd/mcp-telemetry/main.go

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ package main
33
import (
44
"context"
55
"os"
6+
"os/signal"
7+
"syscall"
68

79
"github.com/modelcontextprotocol/go-sdk/mcp"
810

@@ -53,15 +55,20 @@ func main() {
5355

5456
internalmcp.RegisterTelemetryTools(server, provider)
5557

56-
telemetry.Info("mcp-telemetry ready", "tools", []string{"query_metrics", "query_logs", "query_traces", "investigate_incident"})
58+
sigChan := make(chan os.Signal, 1)
59+
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
5760

58-
// Run the server on stdio transport. This is a blocking call.
59-
// We run it directly in the main thread for standalone operation.
60-
t := &mcp.StdioTransport{}
61-
if err := server.Run(ctx, t); err != nil {
62-
telemetry.Error("MCP server execution failed", "error", err)
63-
}
61+
go func() {
62+
t := &mcp.StdioTransport{}
63+
if err := server.Run(ctx, t); err != nil {
64+
telemetry.Error("MCP server execution failed", "error", err)
65+
}
66+
}()
67+
68+
telemetry.Info("mcp-telemetry ready", "tools", []string{"query_metrics", "query_logs", "query_traces", "investigate_incident"})
6469

65-
telemetry.Info("shutting down mcp-telemetry")
70+
sig := <-sigChan
71+
telemetry.Info("received signal, shutting down", "signal", sig.String())
72+
cancel()
6673
provider.Close()
6774
}

docs/decisions/018-domain-isolated-mcp-architecture.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,10 @@ Combining infrastructure tools (Kubernetes API) with telemetry tools (LGTM stack
1818

1919
Adopt a **Domain-Isolated MCP Architecture** by splitting agentic capabilities into specialized, standalone binaries:
2020

21-
- **Specialized Servers:** Implement a new dedicated service, **`mcp-pods`**, to handle all infrastructure-related investigations. Future servers (e.g., `mcp-domain`) will follow this pattern.
21+
- **Specialized Servers:** Implement a new dedicated service, **`mcp-pods`**, to handle all infrastructure-related operations and investigations. This includes diagnostic log retrieval (`get_pod_logs`) and basic remediation (`delete_pod`) alongside resource inspection.
2222
- **Shared Registry Pattern:** Use a modular registry in `internal/mcp/registry.go` to share protocol logic (JSON-RPC formatting, error handling) while allowing each binary to register only the toolsets it requires.
2323
- **Standalone Binary Pattern:** Deploy these servers as pure binaries communicating over `stdio`, avoiding host-tier service managers like systemd to improve portability and alignment with future containerized orchestration.
24+
- **Lifecycle Standardization:** Enforce a unified signal-handling pattern (`os/signal`) across all MCP binaries to ensure reliable telemetry flushing and resource cleanup (e.g., closing provider connections) during server termination.
2425

2526
## Consequences
2627

@@ -39,5 +40,6 @@ Adopt a **Domain-Isolated MCP Architecture** by splitting agentic capabilities i
3940

4041
- [x] **Modular Registry:** Refactored `internal/mcp/registry.go` to support `RegisterTelemetryTools` and `RegisterPodsTools` independently.
4142
- [x] **Service Isolation:** Verified `mcp-pods` binary successfully executes and connects to the K3s API using `client-go` with pluralized naming conventions.
42-
- [x] **Tool Fidelity:** Verified `inspect_pods`, `describe_pod`, and `list_pod_events` return high-signal, machine-readable data from the live cluster.
43+
- [x] **Operational Expansion:** Implemented and verified `get_pod_logs` and `delete_pod` for active troubleshooting and remediation.
44+
- [x] **Lifecycle Standardization:** Standardized all MCP servers (pods, telemetry) on signal-based graceful shutdown patterns to ensure reliable cleanup.
4345
- [x] **Systemic Stability:** Confirmed that `mcp-telemetry` remains fully functional and isolated from the infrastructure-level logic.

internal/mcp/providers/pods.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,3 +99,35 @@ func (p *PodsProvider) ListEvents(ctx context.Context, namespace, name string) (
9999

100100
return filtered, nil
101101
}
102+
103+
// GetPodLogs retrieves logs from the specified pod/container.
104+
func (p *PodsProvider) GetPodLogs(ctx context.Context, namespace, name, container string, tailLines int64, previous bool) (string, error) {
105+
opts := &corev1.PodLogOptions{
106+
Container: container,
107+
Previous: previous,
108+
}
109+
if tailLines > 0 {
110+
opts.TailLines = &tailLines
111+
}
112+
113+
req := p.clientset.CoreV1().Pods(namespace).GetLogs(name, opts)
114+
logs, err := req.DoRaw(ctx)
115+
if err != nil {
116+
return "", fmt.Errorf("failed to get logs for pod %s/%s: %w", name, namespace, err)
117+
}
118+
119+
return string(logs), nil
120+
}
121+
122+
// DeletePod deletes the specified pod.
123+
func (p *PodsProvider) DeletePod(ctx context.Context, namespace, name string, gracePeriod *int64) error {
124+
opts := metav1.DeleteOptions{
125+
GracePeriodSeconds: gracePeriod,
126+
}
127+
err := p.clientset.CoreV1().Pods(namespace).Delete(ctx, name, opts)
128+
if err != nil {
129+
return fmt.Errorf("failed to delete pod %s/%s: %w", name, namespace, err)
130+
}
131+
132+
return nil
133+
}

internal/mcp/providers/pods_test.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,4 +161,42 @@ func TestPodsProvider(t *testing.T) {
161161
})
162162
}
163163
})
164+
165+
t.Run("DeletePod", func(t *testing.T) {
166+
fakePod := &corev1.Pod{
167+
ObjectMeta: metav1.ObjectMeta{
168+
Name: "delete-me",
169+
Namespace: "default",
170+
},
171+
}
172+
clientset := fake.NewSimpleClientset(fakePod)
173+
provider := &PodsProvider{clientset: clientset}
174+
175+
err := provider.DeletePod(context.Background(), "default", "delete-me", nil)
176+
if err != nil {
177+
t.Errorf("DeletePod() unexpected error: %v", err)
178+
}
179+
180+
err = provider.DeletePod(context.Background(), "default", "ghost-pod", nil)
181+
if err == nil {
182+
t.Error("DeletePod() expected error for non-existent pod, got nil")
183+
}
184+
})
185+
186+
t.Run("GetPodLogs", func(t *testing.T) {
187+
fakePod := &corev1.Pod{
188+
ObjectMeta: metav1.ObjectMeta{
189+
Name: "test-pod",
190+
Namespace: "default",
191+
},
192+
}
193+
clientset := fake.NewSimpleClientset(fakePod)
194+
provider := &PodsProvider{clientset: clientset}
195+
196+
// Note: fake clientset's GetLogs doesn't return actual logs, but we can verify it doesn't error
197+
_, err := provider.GetPodLogs(context.Background(), "default", "test-pod", "", 10, false)
198+
if err != nil {
199+
t.Errorf("GetPodLogs() unexpected error: %v", err)
200+
}
201+
})
164202
}

internal/mcp/registry.go

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,17 @@ func RegisterPodsTools(server *mcp.Server, provider *providers.PodsProvider) {
113113
Description: "List all lifecycle events associated with a specific pod",
114114
}, handleListPodEvents(provider))
115115

116-
telemetry.Info("registered pods tools", "count", 3)
116+
mcp.AddTool(server, &mcp.Tool{
117+
Name: "get_pod_logs",
118+
Description: "Retrieve logs from a specific pod/container",
119+
}, handleGetPodLogs(provider))
120+
121+
mcp.AddTool(server, &mcp.Tool{
122+
Name: "delete_pod",
123+
Description: "Delete a specific pod (useful for restarting stuck pods)",
124+
}, handleDeletePod(provider))
125+
126+
telemetry.Info("registered pods tools", "count", 5)
117127
}
118128

119129
func handleInspectPods(provider *providers.PodsProvider) mcp.ToolHandlerFor[tools.PodsInput, any] {
@@ -188,6 +198,9 @@ func RegisterHubTools(server *mcp.Server, provider *providers.HubProvider) {
188198
func handleInspectPlatform(provider *providers.HubProvider) mcp.ToolHandlerFor[tools.HubInput, any] {
189199
handler := tools.NewInspectPlatformHandler(provider.InspectPlatform)
190200
return func(ctx context.Context, _ *mcp.CallToolRequest, input tools.HubInput) (*mcp.CallToolResult, any, error) {
201+
func handleGetPodLogs(provider *providers.PodsProvider) mcp.ToolHandlerFor[tools.PodLogsInput, any] {
202+
handler := tools.NewGetPodLogsHandler(provider.GetPodLogs)
203+
return func(ctx context.Context, _ *mcp.CallToolRequest, input tools.PodLogsInput) (*mcp.CallToolResult, any, error) {
191204
result, err := handler.Execute(ctx, input)
192205
if err != nil {
193206
return nil, nil, err
@@ -223,13 +236,18 @@ func handleListHostServices(provider *providers.HubProvider) mcp.ToolHandlerFor[
223236
text, _ := json.Marshal(result)
224237
return &mcp.CallToolResult{
225238
Content: []mcp.Content{&mcp.TextContent{Text: string(text)}},
239+
return &mcp.CallToolResult{
240+
Content: []mcp.Content{&mcp.TextContent{Text: result.(string)}},
226241
}, nil, nil
227242
}
228243
}
229244

230245
func handleQueryServiceLogs(provider *providers.HubProvider) mcp.ToolHandlerFor[tools.HubInput, any] {
231246
handler := tools.NewQueryServiceLogsHandler(provider.QueryServiceLogs)
232247
return func(ctx context.Context, _ *mcp.CallToolRequest, input tools.HubInput) (*mcp.CallToolResult, any, error) {
248+
func handleDeletePod(provider *providers.PodsProvider) mcp.ToolHandlerFor[tools.DeletePodInput, any] {
249+
handler := tools.NewDeletePodHandler(provider.DeletePod)
250+
return func(ctx context.Context, _ *mcp.CallToolRequest, input tools.DeletePodInput) (*mcp.CallToolResult, any, error) {
233251
result, err := handler.Execute(ctx, input)
234252
if err != nil {
235253
return nil, nil, err

internal/mcp/tools/pods.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,3 +75,49 @@ func NewListPodEventsHandler(listEventsFn func(ctx context.Context, namespace, n
7575
func (h *ListPodEventsHandler) Execute(ctx context.Context, input PodsInput) (interface{}, error) {
7676
return h.listEventsFn(ctx, input.Namespace, input.Name)
7777
}
78+
79+
// PodLogsInput is the input for getting pod logs.
type PodLogsInput struct {
	// Namespace of the target pod.
	Namespace string `json:"namespace"`
	// Name of the target pod.
	Name string `json:"name"`
	// Container is forwarded unchanged to the log function; omitted from JSON when empty.
	Container string `json:"container,omitempty"`
	// TailLines is forwarded unchanged to the log function; omitted from JSON when zero.
	TailLines int64 `json:"tail_lines,omitempty"`
	// Previous is forwarded unchanged to the log function; omitted from JSON when false.
	Previous bool `json:"previous,omitempty"`
}

// GetPodLogsHandler handles retrieving logs for a pod by delegating to an
// injected log-fetching function.
type GetPodLogsHandler struct {
	getLogsFn func(ctx context.Context, namespace, name, container string, tailLines int64, previous bool) (string, error)
}

// NewGetPodLogsHandler returns a GetPodLogsHandler whose Execute method calls
// getLogsFn.
func NewGetPodLogsHandler(getLogsFn func(ctx context.Context, namespace, name, container string, tailLines int64, previous bool) (string, error)) *GetPodLogsHandler {
	return &GetPodLogsHandler{getLogsFn: getLogsFn}
}

// Execute passes every field of input, unmodified, to the injected function
// and returns that function's string result (as interface{}) and error as-is.
// No validation of input is performed here.
func (h *GetPodLogsHandler) Execute(ctx context.Context, input PodLogsInput) (interface{}, error) {
	return h.getLogsFn(ctx, input.Namespace, input.Name, input.Container, input.TailLines, input.Previous)
}
100+
101+
// DeletePodInput is the input for deleting a pod.
type DeletePodInput struct {
	// Namespace of the pod to delete.
	Namespace string `json:"namespace"`
	// Name of the pod to delete.
	Name string `json:"name"`
	// GraceSeconds is forwarded unchanged as the gracePeriod argument of the
	// delete function; it stays nil when the field is absent from the input.
	GraceSeconds *int64 `json:"grace_seconds,omitempty"`
}

// DeletePodHandler handles deleting a pod by delegating to an injected delete
// function.
type DeletePodHandler struct {
	deleteFn func(ctx context.Context, namespace, name string, gracePeriod *int64) error
}

// NewDeletePodHandler returns a DeletePodHandler whose Execute method calls
// deleteFn.
func NewDeletePodHandler(deleteFn func(ctx context.Context, namespace, name string, gracePeriod *int64) error) *DeletePodHandler {
	return &DeletePodHandler{deleteFn: deleteFn}
}

// Execute calls the injected delete function with the namespace, name and
// grace period taken from input.
//
// If the delete function fails, Execute returns a nil result and that error.
// Otherwise it returns a map[string]string with "status" set to "deleted" and
// the "pod" and "namespace" keys echoing the input.
func (h *DeletePodHandler) Execute(ctx context.Context, input DeletePodInput) (interface{}, error) {
	if err := h.deleteFn(ctx, input.Namespace, input.Name, input.GraceSeconds); err != nil {
		return nil, err
	}
	return map[string]string{"status": "deleted", "pod": input.Name, "namespace": input.Namespace}, nil
}

internal/mcp/tools/pods_test.go

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,3 +135,73 @@ func TestListPodEventsHandler_Execute(t *testing.T) {
135135
})
136136
}
137137
}
138+
139+
func TestGetPodLogsHandler_Execute(t *testing.T) {
140+
tests := []struct {
141+
name string
142+
getLogsFn func(ctx context.Context, namespace, name, container string, tailLines int64, previous bool) (string, error)
143+
wantErr bool
144+
}{
145+
{
146+
name: "successful logs get",
147+
getLogsFn: func(ctx context.Context, namespace, name, container string, tailLines int64, previous bool) (string, error) {
148+
return "logs", nil
149+
},
150+
wantErr: false,
151+
},
152+
{
153+
name: "api error",
154+
getLogsFn: func(ctx context.Context, namespace, name, container string, tailLines int64, previous bool) (string, error) {
155+
return "", errors.New("api error")
156+
},
157+
wantErr: true,
158+
},
159+
}
160+
161+
for _, tt := range tests {
162+
t.Run(tt.name, func(t *testing.T) {
163+
h := NewGetPodLogsHandler(tt.getLogsFn)
164+
got, err := h.Execute(context.Background(), PodLogsInput{Namespace: "default", Name: "test-pod"})
165+
if (err != nil) != tt.wantErr {
166+
t.Errorf("Execute() error = %v, wantErr %v", err, tt.wantErr)
167+
return
168+
}
169+
if !tt.wantErr && got.(string) != "logs" {
170+
t.Errorf("Execute() got = %v, want %v", got, "logs")
171+
}
172+
})
173+
}
174+
}
175+
176+
func TestDeletePodHandler_Execute(t *testing.T) {
177+
tests := []struct {
178+
name string
179+
deleteFn func(ctx context.Context, namespace, name string, gracePeriod *int64) error
180+
wantErr bool
181+
}{
182+
{
183+
name: "successful delete",
184+
deleteFn: func(ctx context.Context, namespace, name string, gracePeriod *int64) error {
185+
return nil
186+
},
187+
wantErr: false,
188+
},
189+
{
190+
name: "api error",
191+
deleteFn: func(ctx context.Context, namespace, name string, gracePeriod *int64) error {
192+
return errors.New("api error")
193+
},
194+
wantErr: true,
195+
},
196+
}
197+
198+
for _, tt := range tests {
199+
t.Run(tt.name, func(t *testing.T) {
200+
h := NewDeletePodHandler(tt.deleteFn)
201+
_, err := h.Execute(context.Background(), DeletePodInput{Namespace: "default", Name: "test-pod"})
202+
if (err != nil) != tt.wantErr {
203+
t.Errorf("Execute() error = %v, wantErr %v", err, tt.wantErr)
204+
}
205+
})
206+
}
207+
}

0 commit comments

Comments
 (0)