Skip to content

Commit 1364ece

Browse files
committed
fix(containerd): prevent silent network failures from leaving containers unreachable (#202)
* fix(containerd): prevent silent network failures from leaving containers unreachable

Container network setup failures were silently swallowed at multiple points in the call chain, leaving containers in a "running but unreachable" ghost state. This patch closes every silent-failure path:

- setupCNINetwork: return error when CNI yields no usable IP
- Manager.Start: roll back container when IP is empty instead of returning success
- ensureContainerAndTask: extract setupNetworkOrFail with 1 retry, propagate error to callers
- ReconcileContainers: stop reporting "healthy" when network setup fails
- recoverContainerIP: retry up to 2 times with backoff for transient CNI failures (IPAM lock contention, etc.)
- gRPC Pool: evict connections stuck in Connecting state for >30s

* fix(containerd): clean stale cni0 bridge on startup to prevent MAC error

After a Docker container restart, the cni0 bridge interface can linger with a zeroed MAC (00:00:00:00:00:00) and DOWN state. The CNI bridge plugin then fails with "could not set bridge's mac: invalid argument", making all MCP containers unreachable.

Two-layer fix:
- Entrypoint: delete cni0 and flush IPAM state before starting containerd
- Go: detect bridge MAC errors in setupCNINetwork and auto-delete cni0 before retrying, as defense-in-depth for runtime recovery

* fix(containerd): use exec.CommandContext to satisfy noctx linter
1 parent 7f3f8ce commit 1364ece

File tree

6 files changed

+113
-61
lines changed

6 files changed

+113
-61
lines changed

devenv/server-entrypoint.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
#!/bin/sh
22
set -e
33

4+
# Clean up stale CNI state from previous runs. After a container restart the
5+
# cni0 bridge may linger with a zeroed MAC (00:00:00:00:00:00), causing the
6+
# bridge plugin to fail with "could not set bridge's mac: invalid argument".
7+
ip link delete cni0 2>/dev/null || true
8+
rm -rf /var/lib/cni/networks/* /var/lib/cni/results/* 2>/dev/null || true
9+
410
# Ensure IP forwarding and subnet MASQUERADE for CNI.
511
sysctl -w net.ipv4.ip_forward=1 2>/dev/null || true
612
iptables -t nat -C POSTROUTING -s 10.88.0.0/16 ! -o cni0 -j MASQUERADE 2>/dev/null || \

docker/server-entrypoint.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@ set -e
33

44
MCP_IMAGE="${MCP_IMAGE:-docker.io/library/memoh-mcp:latest}"
55

6+
# ---- Clean up stale CNI state from previous runs ----
7+
# After a container restart the cni0 bridge may linger with a zeroed MAC
8+
# (00:00:00:00:00:00), causing "could not set bridge's mac: invalid argument".
9+
ip link delete cni0 2>/dev/null || true
10+
rm -rf /var/lib/cni/networks/* /var/lib/cni/results/* 2>/dev/null || true
11+
612
# ---- Ensure IP forwarding and subnet MASQUERADE for CNI ----
713
sysctl -w net.ipv4.ip_forward=1 2>/dev/null || true
814
iptables -t nat -C POSTROUTING -s 10.88.0.0/16 ! -o cni0 -j MASQUERADE 2>/dev/null || \

internal/containerd/network.go

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"fmt"
66
"os"
7+
"os/exec"
78
"path/filepath"
89
"strconv"
910
"strings"
@@ -51,19 +52,26 @@ func setupCNINetwork(ctx context.Context, task client.Task, containerID string,
5152
}
5253
result, err := cni.Setup(ctx, containerID, netnsPath)
5354
if err != nil {
54-
if !isDuplicateAllocationError(err) && !isVethExistsError(err) {
55+
retryable := isDuplicateAllocationError(err) || isVethExistsError(err) || isBridgeMACError(err)
56+
if !retryable {
5557
return "", err
5658
}
57-
// Stale IPAM allocation or veth exists (e.g. after container restart with persisted
58-
// /var/lib/cni). Remove may fail if the previous iptables/veth state
59-
// is already gone; ignore the error so the retry Setup still runs.
59+
if isBridgeMACError(err) {
60+
// Stale bridge with zeroed MAC after container restart; delete it so
61+
// the plugin can recreate a healthy one.
62+
_ = exec.CommandContext(ctx, "ip", "link", "delete", "cni0").Run()
63+
}
6064
_ = cni.Remove(ctx, containerID, netnsPath)
6165
result, err = cni.Setup(ctx, containerID, netnsPath)
6266
if err != nil {
6367
return "", err
6468
}
6569
}
66-
return extractIP(result), nil
70+
ip := extractIP(result)
71+
if ip == "" {
72+
return "", fmt.Errorf("cni setup returned no usable IP for %s", containerID)
73+
}
74+
return ip, nil
6775
}
6876

6977
func extractIP(result *gocni.Result) string {
@@ -139,3 +147,13 @@ func isVethExistsError(err error) bool {
139147
}
140148
return strings.Contains(err.Error(), "already exists")
141149
}
150+
151+
// isBridgeMACError returns true if the CNI bridge plugin failed because the
152+
// stale cni0 bridge has a zeroed MAC address (common after container restart).
153+
func isBridgeMACError(err error) bool {
154+
if err == nil {
155+
return false
156+
}
157+
msg := err.Error()
158+
return strings.Contains(msg, "set bridge") && strings.Contains(msg, "mac")
159+
}

internal/handlers/containerd.go

Lines changed: 37 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -247,15 +247,8 @@ func (h *ContainerdHandler) ensureContainerAndTask(ctx context.Context, containe
247247
}
248248
if len(tasks) > 0 {
249249
if tasks[0].Status == ctr.TaskStatusRunning {
250-
if netResult, netErr := h.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{
251-
ContainerID: containerID,
252-
CNIBinDir: h.cfg.CNIBinaryDir,
253-
CNIConfDir: h.cfg.CNIConfigDir,
254-
}); netErr != nil {
255-
h.logger.Warn("network re-setup failed for running task",
256-
slog.String("container_id", containerID), slog.Any("error", netErr))
257-
} else if netResult.IP != "" && h.manager != nil {
258-
h.manager.SetContainerIP(botID, netResult.IP)
250+
if err := h.setupNetworkOrFail(ctx, containerID, botID); err != nil {
251+
return err
259252
}
260253
return nil
261254
}
@@ -270,17 +263,37 @@ func (h *ContainerdHandler) ensureContainerAndTask(ctx context.Context, containe
270263
if err := h.service.StartContainer(ctx, containerID, nil); err != nil {
271264
return err
272265
}
273-
if netResult, netErr := h.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{
274-
ContainerID: containerID,
275-
CNIBinDir: h.cfg.CNIBinaryDir,
276-
CNIConfDir: h.cfg.CNIConfigDir,
277-
}); netErr != nil {
278-
h.logger.Warn("network setup failed, task kept running",
279-
slog.String("container_id", containerID), slog.Any("error", netErr))
280-
} else if netResult.IP != "" && h.manager != nil {
281-
h.manager.SetContainerIP(botID, netResult.IP)
266+
return h.setupNetworkOrFail(ctx, containerID, botID)
267+
}
268+
269+
// setupNetworkOrFail attempts CNI network setup with one retry. Returns an error
270+
// if no usable IP is obtained — callers must not silently ignore this.
271+
func (h *ContainerdHandler) setupNetworkOrFail(ctx context.Context, containerID, botID string) error {
272+
var lastErr error
273+
for attempt := 0; attempt < 2; attempt++ {
274+
netResult, err := h.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{
275+
ContainerID: containerID,
276+
CNIBinDir: h.cfg.CNIBinaryDir,
277+
CNIConfDir: h.cfg.CNIConfigDir,
278+
})
279+
if err != nil {
280+
lastErr = err
281+
h.logger.Warn("network setup attempt failed",
282+
slog.String("container_id", containerID),
283+
slog.Int("attempt", attempt+1),
284+
slog.Any("error", err))
285+
continue
286+
}
287+
if netResult.IP == "" {
288+
lastErr = fmt.Errorf("network setup returned no IP for %s", containerID)
289+
continue
290+
}
291+
if h.manager != nil {
292+
h.manager.SetContainerIP(botID, netResult.IP)
293+
}
294+
return nil
282295
}
283-
return nil
296+
return fmt.Errorf("network setup failed for container %s: %w", containerID, lastErr)
284297
}
285298

286299
// botContainerID resolves container_id for a bot from the database.
@@ -967,20 +980,15 @@ func (h *ContainerdHandler) ReconcileContainers(ctx context.Context) {
967980
slog.String("bot_id", botID), slog.Any("error", dbErr))
968981
}
969982
}
970-
if netResult, netErr := h.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{
971-
ContainerID: containerID,
972-
CNIBinDir: h.cfg.CNIBinaryDir,
973-
CNIConfDir: h.cfg.CNIConfigDir,
974-
}); netErr != nil {
975-
h.logger.Warn("reconcile: network re-setup failed for running task",
983+
if netErr := h.setupNetworkOrFail(ctx, containerID, botID); netErr != nil {
984+
h.logger.Error("reconcile: network setup failed for running task, container unreachable",
976985
slog.String("bot_id", botID),
977986
slog.String("container_id", containerID),
978987
slog.Any("error", netErr))
979-
} else if netResult.IP != "" && h.manager != nil {
980-
h.manager.SetContainerIP(botID, netResult.IP)
988+
} else {
989+
h.logger.Info("reconcile: container healthy",
990+
slog.String("bot_id", botID), slog.String("container_id", containerID))
981991
}
982-
h.logger.Info("reconcile: container healthy",
983-
slog.String("bot_id", botID), slog.String("container_id", containerID))
984992
continue
985993
}
986994

internal/mcp/manager.go

Lines changed: 26 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -116,34 +116,39 @@ func (m *Manager) SetContainerIP(botID, ip string) {
116116
}
117117

118118
// recoverContainerIP attempts to restore the container IP by re-running CNI setup.
119-
// CNI plugins are idempotent - calling Setup again returns the existing IP allocation.
119+
// CNI plugins are idempotent — calling Setup again returns the existing IP allocation.
120+
// Retries up to 2 times to tolerate transient CNI failures (IPAM lock contention, etc.).
120121
func (m *Manager) recoverContainerIP(botID string) (string, error) {
121122
ctx := context.Background()
122123
containerID := m.containerID(botID)
123124

124-
// First check if container exists and get basic info
125125
info, err := m.service.GetContainer(ctx, containerID)
126126
if err != nil {
127127
return "", err
128128
}
129129

130-
// Check if IP is stored in labels (if we ever add label persistence)
131130
if ip, ok := info.Labels["mcp.container_ip"]; ok {
132131
return ip, nil
133132
}
134133

135-
// Container exists but IP not cached - need to re-setup network to get IP
136-
// This happens after server restart when in-memory cache is lost
137-
netResult, err := m.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{
138-
ContainerID: containerID,
139-
CNIBinDir: m.cfg.CNIBinaryDir,
140-
CNIConfDir: m.cfg.CNIConfigDir,
141-
})
142-
if err != nil {
143-
return "", fmt.Errorf("network setup for IP recovery: %w", err)
134+
const maxAttempts = 2
135+
var lastErr error
136+
for i := 0; i < maxAttempts; i++ {
137+
netResult, err := m.service.SetupNetwork(ctx, ctr.NetworkSetupRequest{
138+
ContainerID: containerID,
139+
CNIBinDir: m.cfg.CNIBinaryDir,
140+
CNIConfDir: m.cfg.CNIConfigDir,
141+
})
142+
if err != nil {
143+
lastErr = err
144+
m.logger.Warn("IP recovery attempt failed",
145+
slog.String("bot_id", botID), slog.Int("attempt", i+1), slog.Any("error", err))
146+
time.Sleep(time.Duration(i+1) * 500 * time.Millisecond)
147+
continue
148+
}
149+
return netResult.IP, nil
144150
}
145-
146-
return netResult.IP, nil
151+
return "", fmt.Errorf("network setup for IP recovery after %d attempts: %w", maxAttempts, lastErr)
147152
}
148153

149154
// MCPClient returns a gRPC client for the given bot's container.
@@ -295,12 +300,14 @@ func (m *Manager) Start(ctx context.Context, botID string) error {
295300
}
296301
return err
297302
}
298-
if netResult.IP != "" {
299-
m.mu.Lock()
300-
m.containerIPs[botID] = netResult.IP
301-
m.mu.Unlock()
302-
m.logger.Info("container network ready", slog.String("bot_id", botID), slog.String("ip", netResult.IP))
303+
if netResult.IP == "" {
304+
if stopErr := m.service.StopContainer(ctx, m.containerID(botID), &ctr.StopTaskOptions{Force: true}); stopErr != nil {
305+
m.logger.Warn("cleanup: stop task failed", slog.String("container_id", m.containerID(botID)), slog.Any("error", stopErr))
306+
}
307+
return fmt.Errorf("network setup returned no IP for bot %s", botID)
303308
}
309+
m.SetContainerIP(botID, netResult.IP)
310+
m.logger.Info("container network ready", slog.String("bot_id", botID), slog.String("ip", netResult.IP))
304311
return nil
305312
}
306313

internal/mcp/mcpclient/client.go

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"fmt"
1212
"io"
1313
"sync"
14+
"time"
1415

1516
"google.golang.org/grpc"
1617
"google.golang.org/grpc/connectivity"
@@ -20,11 +21,14 @@ import (
2021
pb "github.com/memohai/memoh/internal/mcp/mcpcontainer"
2122
)
2223

24+
const connectingTimeout = 30 * time.Second
25+
2326
// Client wraps a gRPC connection to a single MCP container.
2427
type Client struct {
25-
conn *grpc.ClientConn
26-
svc pb.ContainerServiceClient
27-
target string
28+
conn *grpc.ClientConn
29+
svc pb.ContainerServiceClient
30+
target string
31+
createdAt time.Time
2832
}
2933

3034
// NewClientFromConn wraps an existing gRPC connection into a Client.
@@ -47,9 +51,10 @@ func Dial(_ context.Context, ip string) (*Client, error) {
4751
return nil, fmt.Errorf("grpc dial %s: %w", target, err)
4852
}
4953
return &Client{
50-
conn: conn,
51-
svc: pb.NewContainerServiceClient(conn),
52-
target: target,
54+
conn: conn,
55+
svc: pb.NewContainerServiceClient(conn),
56+
target: target,
57+
createdAt: time.Now(),
5358
}, nil
5459
}
5560

@@ -308,12 +313,14 @@ func (p *Pool) MCPClient(ctx context.Context, botID string) (*Client, error) {
308313
}
309314

310315
// Get returns a cached client or dials a new one.
311-
// Stale connections (Shutdown / TransientFailure) are evicted automatically.
316+
// Stale connections (Shutdown / TransientFailure / stuck Connecting) are evicted automatically.
312317
func (p *Pool) Get(ctx context.Context, botID string) (*Client, error) {
313318
p.mu.RLock()
314319
if c, ok := p.clients[botID]; ok {
315320
state := c.conn.GetState()
316-
if state != connectivity.Shutdown && state != connectivity.TransientFailure {
321+
stale := state == connectivity.Shutdown || state == connectivity.TransientFailure ||
322+
(state == connectivity.Connecting && time.Since(c.createdAt) > connectingTimeout)
323+
if !stale {
317324
p.mu.RUnlock()
318325
return c, nil
319326
}

0 commit comments

Comments
 (0)