Skip to content

Commit 7c19af0

Browse files
cmainasurunc-bot[bot]
authored andcommitted
fix(ipc): Handle unsuccessful setup for monitor environment
In case there was an error during the monitor execution environment setup (e.g. vmm not found, rootfs issue) then the urunc start command was keep waiting for a message from reexec that everything was ok. This was leading to the urunc start process hanging there forever and of course the container was staying like that. THis commit fixes this issue and moreover it slightly refactors the whole IPC between the urunc instances and reexec. PR: #357 Signed-off-by: Charalampos Mainas <cmainas@nubificus.co.uk> Reviewed-by: Anastassios Nanos <ananos@nubificus.co.uk> Approved-by: Anastassios Nanos <ananos@nubificus.co.uk>
1 parent 6bff47d commit 7c19af0

File tree

6 files changed

+214
-88
lines changed

6 files changed

+214
-88
lines changed

cmd/urunc/create.go

Lines changed: 65 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ import (
2323
"net"
2424
"os"
2525
"os/exec"
26-
"path/filepath"
2726
"strconv"
2827

2928
"github.com/creack/pty"
@@ -268,16 +267,14 @@ func createUnikontainer(cmd *cli.Command, uruncCfg *unikontainers.UruncConfig) (
268267
}
269268
metrics.Capture(m.TS07)
270269

271-
// send ACK to reexec process
272-
err = unikontainer.SendAckReexec()
270+
_, err = initSockParent.Write([]byte(unikontainers.AckReexec))
273271
if err != nil {
274272
err = fmt.Errorf("failed to send ACK to reexec process: %w", err)
275273
return err
276-
277274
}
275+
278276
metrics.Capture(m.TS08)
279277

280-
err = nil
281278
return err
282279
}
283280

@@ -367,22 +364,21 @@ func reexecUnikontainer(cmd *cli.Command) error {
367364
return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE: %w", err)
368365
}
369366
initPipe := os.NewFile(uintptr(initFd), "initpipe")
370-
err = initPipe.Close()
371-
if err != nil {
372-
return fmt.Errorf("close init pipe: %w", err)
373-
}
374-
375-
// We have already made sure in main.go that root is not nil
376-
rootDir := cmd.String("root")
377-
baseDir := filepath.Join(rootDir, containerID)
378-
379367
metrics.Capture(m.TS05)
380368

381-
// wait AckReexec message on urunc.sock from parent process
382-
socketPath := unikontainers.GetUruncSockAddr(baseDir)
383-
err = unikontainers.ListenAndAwaitMsg(socketPath, unikontainers.AckReexec)
369+
// wait AckReexec message on init socket from parent process
370+
buf := make([]byte, len(unikontainers.AckReexec))
371+
n, err := initPipe.Read(buf)
384372
if err != nil {
385-
return err
373+
return fmt.Errorf("failed to read from init socket: %w", err)
374+
}
375+
msg := string(buf[0:n])
376+
if msg != string(unikontainers.AckReexec) {
377+
return fmt.Errorf("received unexpected message: %s", msg)
378+
}
379+
err = initPipe.Close()
380+
if err != nil {
381+
return fmt.Errorf("close init pipe: %w", err)
386382
}
387383
metrics.Capture(m.TS09)
388384

@@ -406,14 +402,63 @@ func reexecUnikontainer(cmd *cli.Command) error {
406402
metrics.Capture(m.TS10)
407403

408404
// wait StartExecve message on urunc.sock from urunc start process
409-
err = unikontainers.ListenAndAwaitMsg(socketPath, unikontainers.StartExecve)
405+
err = unikontainer.CreateListener(unikontainers.FromReexec)
410406
if err != nil {
407+
return fmt.Errorf("error creating listener on reexec socket: %w", err)
408+
}
409+
awaitErr := unikontainer.AwaitMsg(unikontainers.StartExecve)
410+
// Before checking for any errors, make sure to clean up the socket
411+
cleanErr := unikontainer.DestroyListener(unikontainers.FromReexec)
412+
if cleanErr != nil {
413+
logrus.WithError(cleanErr).Error("failed to destroy listener on reexec socket")
414+
cleanErr = fmt.Errorf("error destroying listener on reexec socket: %w", cleanErr)
415+
}
416+
// NOTE: Currently, we do not fail in case the socket cleanup fails
417+
// because urunc start has told reexec to start preparing the execution
418+
// environment for the monitor. Therefore, if the socket cleanup affects the
419+
// preparation the error will get manifested later. However, if environment
420+
// setup goes well and the socket was not cleaned up correctly,
421+
// we execve the monitor and we rely on Go's close-on-exec feature in all file
422+
// descriptors. THerefore, we might want to rethink this in future and not rely
423+
// on Go, but this requires quite a a lot of changes.
424+
if awaitErr != nil {
425+
awaitErr = fmt.Errorf("error waiting START message: %w", awaitErr)
426+
err = errors.Join(awaitErr, cleanErr)
411427
return err
412428
}
413429
metrics.Capture(m.TS14)
414430

431+
err = unikontainer.CreateConn(unikontainers.FromReexec)
432+
if err != nil {
433+
return fmt.Errorf("error creating connection to urunc socket: %w", err)
434+
}
435+
// NOTE: More changes are required for a proper handling of this cleanup.
436+
// Currently, the cleanup will happen if something fails in the container
437+
// setup and unikontainers.Exec returns with an error. In that case, we also
438+
// ignore any errors, since it is a simple socket cleanup and the process exits.
439+
// If unikontainers.Exec succeeds though, then we will never execute this
440+
// cleanup, since we execve to the monitor process. In that case, we rely
441+
// once more in Go's close on exec handling of all file descriptors.
442+
// In the future, we might want to revisit this and rely less in Go.
443+
defer func() {
444+
tmpErr := unikontainer.DestroyConn(unikontainers.FromReexec)
445+
if tmpErr != nil {
446+
logrus.WithError(tmpErr).Error("failed to destroy connection on reexec socket")
447+
}
448+
}()
449+
415450
// execve
416451
// we need to pass metrics to Exec() function, as the unikontainer
417452
// struct does not have the part of urunc config that configures metrics
418-
return unikontainer.Exec(metrics)
453+
var sockErr error
454+
err = unikontainer.Exec(metrics)
455+
if err != nil {
456+
logrus.WithError(err).Error("Setting up execution environment for monitor")
457+
sockErr = unikontainer.SendMessage(unikontainers.StartErr)
458+
if sockErr != nil {
459+
logrus.WithError(sockErr).Error("failed to send error message to urunc socket")
460+
}
461+
}
462+
463+
return errors.Join(err, sockErr)
419464
}

cmd/urunc/start.go

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ package main
1616

1717
import (
1818
"context"
19+
"errors"
20+
"fmt"
1921
"os"
2022

2123
"github.com/sirupsen/logrus"
@@ -56,27 +58,56 @@ func startUnikontainer(cmd *cli.Command) error {
5658
}
5759
metrics.Capture(m.TS12)
5860

59-
sockAddr := unikontainers.GetStartSockAddr(unikontainer.BaseDir)
60-
listener, cleaner, err := unikontainers.CreateListener(sockAddr, true)
61+
err = unikontainer.CreateListener(!unikontainers.FromReexec)
6162
if err != nil {
6263
return err
6364
}
64-
defer cleaner()
65+
// NOTE: We ignore any errors from the DestroyListener here, because
66+
// the reexec process has already started the monitor execution and hence
67+
// returning an error would confuse the shim. Furthermore, this process
68+
// exits. However, we might want to revisit this in the future and
69+
// handle it better.
70+
defer func() {
71+
tmpErr := unikontainer.DestroyListener(!unikontainers.FromReexec)
72+
if tmpErr != nil {
73+
logrus.WithError(tmpErr).Error("failed to destroy listener on reexec socket")
74+
}
75+
}()
6576

66-
err = unikontainer.SendStartExecve()
77+
// Send message to reexec to start the monitor
78+
err = unikontainer.CreateConn(!unikontainers.FromReexec)
79+
if err != nil {
80+
err = fmt.Errorf("failed to create connection with reexec socket: %w", err)
81+
return err
82+
}
83+
sendErr := unikontainer.SendMessage(unikontainers.StartExecve)
84+
if sendErr != nil {
85+
logrus.WithError(sendErr).Error("failed to send START message to reexec")
86+
sendErr = fmt.Errorf("error sending START message: %w", sendErr)
87+
}
88+
// Regardless of the SendMessage status, make sure to clean up the socket,
89+
// since it is not required anymore
90+
cleanErr := unikontainer.DestroyConn(!unikontainers.FromReexec)
91+
if cleanErr != nil {
92+
logrus.WithError(cleanErr).Error("failed to destroy connection to reexec socket")
93+
cleanErr = fmt.Errorf("error destroying connection to reexec socket: %w", cleanErr)
94+
}
95+
err = errors.Join(sendErr, cleanErr)
6796
if err != nil {
6897
return err
6998
}
7099
metrics.Capture(m.TS13)
71100

72101
// wait ContainerStarted message on start.sock from reexec process
73-
err = unikontainers.AwaitMessage(listener, unikontainers.ContainerStarted)
102+
err = unikontainer.AwaitMsg(unikontainers.StartSuccess)
74103
if err != nil {
104+
err = fmt.Errorf("failed to get message from successful start from reexec: %w", err)
75105
return err
76106
}
77107

78108
err = unikontainer.SetRunningState()
79109
if err != nil {
110+
err = fmt.Errorf("failed to set the state as running for container: %w", err)
80111
return err
81112
}
82113

pkg/unikontainers/ipc.go

Lines changed: 19 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ import (
2121
"net"
2222
"os"
2323
"path/filepath"
24-
"syscall"
2524
"time"
2625

2726
"github.com/sirupsen/logrus"
@@ -30,14 +29,18 @@ import (
3029
type IPCMessage string
3130

3231
const (
33-
startSock = "start.sock"
34-
uruncSock = "urunc.sock"
35-
ReexecStarted IPCMessage = "BOOTED"
36-
AckReexec IPCMessage = "ACK"
37-
StartExecve IPCMessage = "START"
38-
ContainerStarted IPCMessage = "CNTR_STARTED"
39-
maxRetries = 50
40-
waitTime = 5 * time.Millisecond
32+
// Socket for messages towards reexec. The reexec process listens in this socket
33+
reexecSock = "reexec.sock"
34+
// Socket for messages from reexec. The reexec process writes in this socket
35+
uruncSock = "urunc.sock"
36+
ReexecStarted IPCMessage = "RX_START"
37+
AckReexec IPCMessage = "UC_ACK"
38+
StartExecve IPCMessage = "UC_START"
39+
StartSuccess IPCMessage = "RX_SUCCESS"
40+
StartErr IPCMessage = "RX_ERROR"
41+
maxRetries = 50
42+
waitTime = 5 * time.Millisecond
43+
FromReexec = true
4144
)
4245

4346
func getSockAddr(dir string, name string) string {
@@ -48,8 +51,8 @@ func getUruncSockAddr(containerDir string) string {
4851
return getSockAddr(containerDir, uruncSock)
4952
}
5053

51-
func getStartSockAddr(baseDir string) string {
52-
return getSockAddr(baseDir, startSock)
54+
func getReexecSockAddr(baseDir string) string {
55+
return getSockAddr(baseDir, reexecSock)
5356
}
5457

5558
func ensureValidSockAddr(sockAddr string) error {
@@ -125,32 +128,21 @@ func sendIPCMessageWithRetry(socketAddress string, message IPCMessage, mustBeVal
125128
return err
126129
}
127130

128-
// CreateListener sets up a listener for new connection to socketAddress
129-
func CreateListener(socketAddress string, mustBeValid bool) (*net.UnixListener, func(), error) {
131+
// createListener sets up a listener for new connection to socketAddress
132+
func createListener(socketAddress string, mustBeValid bool) (*net.UnixListener, error) {
130133
if mustBeValid {
131134
err := ensureValidSockAddr(socketAddress)
132135
if err != nil {
133-
return nil, nil, err
136+
return nil, err
134137
}
135138
}
136139

137140
listener, err := net.ListenUnix("unix", &net.UnixAddr{Name: socketAddress, Net: "unix"})
138141
if err != nil {
139-
return nil, nil, err
142+
return nil, err
140143
}
141144

142-
cleanup := func() {
143-
derr := listener.Close()
144-
if derr != nil {
145-
logrus.WithError(derr).Error("failed to close listener")
146-
}
147-
derr = syscall.Unlink(socketAddress)
148-
if derr != nil {
149-
logrus.WithError(derr).Errorf("failed to unlink %s", socketAddress)
150-
}
151-
}
152-
153-
return listener, cleanup, nil
145+
return listener, nil
154146
}
155147

156148
// awaitMessage opens a new connection to socketAddress

pkg/unikontainers/ipc_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -141,11 +141,11 @@ func TestSendIPCMessageWithRetry(t *testing.T) {
141141
func TestCreateListener(t *testing.T) {
142142
socketAddress := "/tmp/test_create_listener.sock"
143143

144-
listener, cleaner, err := CreateListener(socketAddress, true)
144+
listener, err := createListener(socketAddress, true)
145145
if err != nil {
146146
t.Fatalf("Failed to create listener: %v", err)
147147
}
148-
defer cleaner()
148+
defer listener.Close()
149149

150150
assert.NotNil(t, listener, "Expected listener to be created")
151151
}
@@ -154,11 +154,11 @@ func TestAwaitMessage(t *testing.T) {
154154
socketAddress := "/tmp/test_await_message.sock"
155155
expectedMessage := ReexecStarted
156156

157-
listener, cleaner, err := CreateListener(socketAddress, true)
157+
listener, err := createListener(socketAddress, true)
158158
if err != nil {
159159
t.Fatalf("Failed to create listener: %v", err)
160160
}
161-
defer cleaner()
161+
defer listener.Close()
162162

163163
go func() {
164164
conn, err := net.Dial("unix", socketAddress)

pkg/unikontainers/rootfs.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ func (rs *rootfsSelector) tryContainerRootfs() (types.RootfsParams, bool) {
172172
return result, true
173173
}
174174

175-
uniklog.Error("can not use the container rootfs as block, or through shared-fs")
175+
uniklog.Error("can not use the container rootfs as the sandbox's guest rootfs through block or shared-fs")
176176
return types.RootfsParams{}, false
177177
}
178178

@@ -225,7 +225,7 @@ func chooseRootfs(bundle string, cntrRootfs string, annot map[string]string,
225225
}
226226

227227
if selector.shouldMountContainerRootfs() {
228-
return types.RootfsParams{}, fmt.Errorf("can not mount container's rootfs as block or through shared-fs to guest")
228+
return types.RootfsParams{}, fmt.Errorf("can not use the container rootfs as the sandbox's guest rootfs through block or shared-fs")
229229
}
230230

231231
uniklog.Info("no rootfs configured for guest")

0 commit comments

Comments
 (0)