Skip to content

Commit 2a02425

Browse files
committed
fix(ipc): Handle unsuccessful setup for monitor environment
In case there was an error during the monitor execution environment setup (e.g. vmm not found, rootfs issue), the urunc start command kept waiting for a message from reexec that everything was ok. This led to the urunc start process hanging there forever and, of course, the container staying in that state. This commit fixes this issue and moreover it slightly refactors the whole IPC between the urunc instances and reexec. Signed-off-by: Charalampos Mainas <cmainas@nubificus.co.uk>
1 parent 6bff47d commit 2a02425

File tree

6 files changed

+248
-73
lines changed

6 files changed

+248
-73
lines changed

cmd/urunc/create.go

Lines changed: 69 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -268,16 +268,27 @@ func createUnikontainer(cmd *cli.Command, uruncCfg *unikontainers.UruncConfig) (
268268
}
269269
metrics.Capture(m.TS07)
270270

271+
err = unikontainer.CreateConn(!unikontainers.FromReexec)
272+
if err != nil {
273+
err = fmt.Errorf("failed to open a new connection to reexec socket: %w", err)
274+
return err
275+
}
276+
271277
// send ACK to reexec process
272-
err = unikontainer.SendAckReexec()
278+
err = unikontainer.SendMessage(unikontainers.AckReexec)
273279
if err != nil {
274280
err = fmt.Errorf("failed to send ACK to reexec process: %w", err)
275-
return err
281+
}
276282

283+
cleanErr := unikontainer.DestroyConn(!unikontainers.FromReexec)
284+
if cleanErr != nil {
285+
// Log the error but do not return an error, since reexec has started
286+
// the container setup process and any error return here will confuse
287+
// the shim and the reexec process will not receive any other commands.
288+
logrus.WithError(cleanErr).Error("failed to destroy connection to reexec socket")
277289
}
278290
metrics.Capture(m.TS08)
279291

280-
err = nil
281292
return err
282293
}
283294

@@ -379,7 +390,10 @@ func reexecUnikontainer(cmd *cli.Command) error {
379390
metrics.Capture(m.TS05)
380391

381392
// wait AckReexec message on urunc.sock from parent process
382-
socketPath := unikontainers.GetUruncSockAddr(baseDir)
393+
// we use different functions here for the sockets, because we have not
394+
// created the respective unikontainers instance yet. We need to get the Ack
395+
// from urunc create first, since urunc create needs to write the pid of reexec
396+
socketPath := unikontainers.GetReexecSockAddr(baseDir)
383397
err = unikontainers.ListenAndAwaitMsg(socketPath, unikontainers.AckReexec)
384398
if err != nil {
385399
return err
@@ -406,14 +420,63 @@ func reexecUnikontainer(cmd *cli.Command) error {
406420
metrics.Capture(m.TS10)
407421

408422
// wait StartExecve message on urunc.sock from urunc start process
409-
err = unikontainers.ListenAndAwaitMsg(socketPath, unikontainers.StartExecve)
423+
err = unikontainer.CreateListener(unikontainers.FromReexec)
410424
if err != nil {
425+
return fmt.Errorf("error creating listener on reexec socket: %w", err)
426+
}
427+
awaitErr := unikontainer.AwaitMsg(unikontainers.StartExecve)
428+
// Before checking for any errors, make sure to clean up the socket
429+
cleanErr := unikontainer.DestroyListener(unikontainers.FromReexec)
430+
if cleanErr != nil {
431+
logrus.WithError(cleanErr).Error("failed to destroy listener on reexec socket")
432+
cleanErr = fmt.Errorf("error destroying listener on reexec socket: %w", cleanErr)
433+
}
434+
// NOTE: Currently, we do not fail in case the socket cleanup fails
435+
// because urunc start has told reexec to start preparing the execution
436+
// environment for the monitor. Therefore, if the socket cleanup affects the
437+
// preparation the error will get manifested later. However, if environment
438+
// setup goes well and the socket was not cleaned up correctly,
439+
// we execve the monitor and we rely on Go's close-on-exec feature in all file
440+
// descriptors. Therefore, we might want to rethink this in the future and not rely
441+
// on Go, but this requires quite a lot of changes.
442+
if awaitErr != nil {
443+
awaitErr = fmt.Errorf("error waiting START message: %w", awaitErr)
444+
err = errors.Join(awaitErr, cleanErr)
411445
return err
412446
}
413447
metrics.Capture(m.TS14)
414448

449+
err = unikontainer.CreateConn(unikontainers.FromReexec)
450+
if err != nil {
451+
return fmt.Errorf("error creating connection to urunc socket: %w", err)
452+
}
453+
// NOTE: More changes are required for a proper handling of this cleanup.
454+
// Currently, the cleanup will happen if something fails in the container
455+
// setup and unikontainers.Exec returns with an error. In that case, we also
456+
// ignore any errors, since it is a simple socket cleanup and the process exits.
457+
// If unikontainers.Exec succeeds though, then we will never execute this
458+
// cleanup, since we execve to the monitor process. In that case, we rely once more
459+
// on Go's close-on-exec handling of all file descriptors. In the future, we might
460+
// want to revisit this and rely less on Go.
461+
defer func() {
462+
tmpErr := unikontainer.DestroyConn(unikontainers.FromReexec)
463+
if tmpErr != nil {
464+
logrus.WithError(tmpErr).Error("failed to destroy connection on urunc socket")
465+
}
466+
}()
467+
415468
// execve
416469
// we need to pass metrics to Exec() function, as the unikontainer
417470
// struct does not have the part of urunc config that configures metrics
418-
return unikontainer.Exec(metrics)
471+
var sockErr error
472+
err = unikontainer.Exec(metrics)
473+
if err != nil {
474+
logrus.WithError(err).Error("Setting up execution environment for monitor")
475+
sockErr = unikontainer.SendMessage(unikontainers.StartErr)
476+
if sockErr != nil {
477+
logrus.WithError(sockErr).Error("failed to send error message to urunc socket")
478+
}
479+
}
480+
481+
return errors.Join(err, sockErr)
419482
}

cmd/urunc/start.go

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ package main
1616

1717
import (
1818
"context"
19+
"errors"
20+
"fmt"
1921
"os"
2022

2123
"github.com/sirupsen/logrus"
@@ -56,27 +58,56 @@ func startUnikontainer(cmd *cli.Command) error {
5658
}
5759
metrics.Capture(m.TS12)
5860

59-
sockAddr := unikontainers.GetStartSockAddr(unikontainer.BaseDir)
60-
listener, cleaner, err := unikontainers.CreateListener(sockAddr, true)
61+
err = unikontainer.CreateListener(!unikontainers.FromReexec)
6162
if err != nil {
6263
return err
6364
}
64-
defer cleaner()
65+
// NOTE: We ignore any errors from the DestroyListener here, because
66+
// the reexec process has already started the monitor execution and hence
67+
// returning an error would confuse the shim. Furthermore, this process
68+
// exits. However, we might want to revisit this in the future and
69+
// handle it better.
70+
defer func() {
71+
tmpErr := unikontainer.DestroyListener(!unikontainers.FromReexec)
72+
if tmpErr != nil {
73+
logrus.WithError(tmpErr).Error("failed to destroy listener on urunc socket")
74+
}
75+
}()
6576

66-
err = unikontainer.SendStartExecve()
77+
// Send message to reexec to start the monitor
78+
err = unikontainer.CreateConn(!unikontainers.FromReexec)
79+
if err != nil {
80+
err = fmt.Errorf("failed to create connection with reexec socket: %w", err)
81+
return err
82+
}
83+
sendErr := unikontainer.SendMessage(unikontainers.StartExecve)
84+
if sendErr != nil {
85+
logrus.WithError(sendErr).Error("failed to send START message to reexec")
86+
sendErr = fmt.Errorf("error sending START message: %w", sendErr)
87+
}
88+
// Regardless of the SendMessage status, make sure to clean up the socket,
89+
// since it is not required anymore
90+
cleanErr := unikontainer.DestroyConn(!unikontainers.FromReexec)
91+
if cleanErr != nil {
92+
logrus.WithError(cleanErr).Error("failed to destroy connection to reexec socket")
93+
cleanErr = fmt.Errorf("error destroying connection to reexec socket: %w", cleanErr)
94+
}
95+
err = errors.Join(sendErr, cleanErr)
6796
if err != nil {
6897
return err
6998
}
7099
metrics.Capture(m.TS13)
71100

72101
// wait ContainerStarted message on start.sock from reexec process
73-
err = unikontainers.AwaitMessage(listener, unikontainers.ContainerStarted)
102+
err = unikontainer.AwaitMsg(unikontainers.StartSuccess)
74103
if err != nil {
104+
err = fmt.Errorf("failed to get message from successful start from reexec: %w", err)
75105
return err
76106
}
77107

78108
err = unikontainer.SetRunningState()
79109
if err != nil {
110+
err = fmt.Errorf("failed to set the state as running for container: %w", err)
80111
return err
81112
}
82113

pkg/unikontainers/ipc.go

Lines changed: 49 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,18 @@ import (
3030
type IPCMessage string
3131

3232
const (
33-
startSock = "start.sock"
34-
uruncSock = "urunc.sock"
35-
ReexecStarted IPCMessage = "BOOTED"
36-
AckReexec IPCMessage = "ACK"
37-
StartExecve IPCMessage = "START"
38-
ContainerStarted IPCMessage = "CNTR_STARTED"
39-
maxRetries = 50
40-
waitTime = 5 * time.Millisecond
33+
// Socket for messages towards reexec. The reexec process listens on this socket
34+
reexecSock = "reexec.sock"
35+
// Socket for messages from reexec. The reexec process writes in this socket
36+
uruncSock = "urunc.sock"
37+
ReexecStarted IPCMessage = "RX_START"
38+
AckReexec IPCMessage = "UC_ACK"
39+
StartExecve IPCMessage = "UC_START"
40+
StartSuccess IPCMessage = "RX_SUCCESS"
41+
StartErr IPCMessage = "RX_ERROR"
42+
maxRetries = 50
43+
waitTime = 5 * time.Millisecond
44+
FromReexec = true
4145
)
4246

4347
func getSockAddr(dir string, name string) string {
@@ -48,8 +52,12 @@ func getUruncSockAddr(containerDir string) string {
4852
return getSockAddr(containerDir, uruncSock)
4953
}
5054

51-
func getStartSockAddr(baseDir string) string {
52-
return getSockAddr(baseDir, startSock)
55+
func getReexecSockAddr(baseDir string) string {
56+
return getSockAddr(baseDir, reexecSock)
57+
}
58+
59+
func GetReexecSockAddr(baseDir string) string {
60+
return getReexecSockAddr(baseDir)
5361
}
5462

5563
func ensureValidSockAddr(sockAddr string) error {
@@ -125,32 +133,21 @@ func sendIPCMessageWithRetry(socketAddress string, message IPCMessage, mustBeVal
125133
return err
126134
}
127135

128-
// CreateListener sets up a listener for new connection to socketAddress
129-
func CreateListener(socketAddress string, mustBeValid bool) (*net.UnixListener, func(), error) {
136+
// createListener sets up a listener for new connection to socketAddress
137+
func createListener(socketAddress string, mustBeValid bool) (*net.UnixListener, error) {
130138
if mustBeValid {
131139
err := ensureValidSockAddr(socketAddress)
132140
if err != nil {
133-
return nil, nil, err
141+
return nil, err
134142
}
135143
}
136144

137145
listener, err := net.ListenUnix("unix", &net.UnixAddr{Name: socketAddress, Net: "unix"})
138146
if err != nil {
139-
return nil, nil, err
140-
}
141-
142-
cleanup := func() {
143-
derr := listener.Close()
144-
if derr != nil {
145-
logrus.WithError(derr).Error("failed to close listener")
146-
}
147-
derr = syscall.Unlink(socketAddress)
148-
if derr != nil {
149-
logrus.WithError(derr).Errorf("failed to unlink %s", socketAddress)
150-
}
147+
return nil, err
151148
}
152149

153-
return listener, cleanup, nil
150+
return listener, nil
154151
}
155152

156153
// awaitMessage opens a new connection to socketAddress
@@ -177,3 +174,29 @@ func AwaitMessage(listener *net.UnixListener, expectedMessage IPCMessage) error
177174
}
178175
return nil
179176
}
177+
178+
func ListenAndAwaitMsg(sockAddr string, msg IPCMessage) error {
179+
listener, err := createListener(sockAddr, true)
180+
if err != nil {
181+
err = fmt.Errorf("failed to create listener at %s: %w", sockAddr, err)
182+
return err
183+
}
184+
185+
err = AwaitMessage(listener, msg)
186+
if err != nil {
187+
err = fmt.Errorf("failed to get message %s: %w", msg, err)
188+
return err
189+
}
190+
191+
err = listener.Close()
192+
if err != nil {
193+
uniklog.WithError(err).Errorf("failed to close listener at %s", sockAddr)
194+
}
195+
196+
err = syscall.Unlink(sockAddr)
197+
if err != nil {
198+
uniklog.WithError(err).Errorf("failed to unlink socket at %s", sockAddr)
199+
}
200+
201+
return nil
202+
}

pkg/unikontainers/ipc_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -141,11 +141,11 @@ func TestSendIPCMessageWithRetry(t *testing.T) {
141141
func TestCreateListener(t *testing.T) {
142142
socketAddress := "/tmp/test_create_listener.sock"
143143

144-
listener, cleaner, err := CreateListener(socketAddress, true)
144+
listener, err := createListener(socketAddress, true)
145145
if err != nil {
146146
t.Fatalf("Failed to create listener: %v", err)
147147
}
148-
defer cleaner()
148+
defer listener.Close()
149149

150150
assert.NotNil(t, listener, "Expected listener to be created")
151151
}
@@ -154,11 +154,11 @@ func TestAwaitMessage(t *testing.T) {
154154
socketAddress := "/tmp/test_await_message.sock"
155155
expectedMessage := ReexecStarted
156156

157-
listener, cleaner, err := CreateListener(socketAddress, true)
157+
listener, err := createListener(socketAddress, true)
158158
if err != nil {
159159
t.Fatalf("Failed to create listener: %v", err)
160160
}
161-
defer cleaner()
161+
defer listener.Close()
162162

163163
go func() {
164164
conn, err := net.Dial("unix", socketAddress)

pkg/unikontainers/rootfs.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ func (rs *rootfsSelector) tryContainerRootfs() (types.RootfsParams, bool) {
172172
return result, true
173173
}
174174

175-
uniklog.Error("can not use the container rootfs as block, or through shared-fs")
175+
uniklog.Error("can not use the container rootfs as the sandbox's guest rootfs through block or shared-fs")
176176
return types.RootfsParams{}, false
177177
}
178178

@@ -225,7 +225,7 @@ func chooseRootfs(bundle string, cntrRootfs string, annot map[string]string,
225225
}
226226

227227
if selector.shouldMountContainerRootfs() {
228-
return types.RootfsParams{}, fmt.Errorf("can not mount container's rootfs as block or through shared-fs to guest")
228+
return types.RootfsParams{}, fmt.Errorf("can not use the container rootfs as the sandbox's guest rootfs through block or shared-fs")
229229
}
230230

231231
uniklog.Info("no rootfs configured for guest")

0 commit comments

Comments
 (0)