Skip to content

Commit 103853e

Browse files
committed
libcontainer: set cgroup config late
Due to the fact that the init is implemented in Go (which seemingly randomly spawns new processes and loves eating memory), most cgroup configurations are required to have an arbitrary minimum dictated by the init. This confuses users and makes configuration more annoying than it should. An example of this is pids.max, where Go spawns multiple processes that then cause init to violate the pids cgroup constraint before the container can even start. Solve this problem by setting the cgroup configurations as late as possible, to avoid hitting as many of the resources hogged by the Go init as possible. This has to be done before seccomp rules are applied, as the parent and child must synchronise in order for the parent to correctly set the configurations (and writes might be blocked by seccomp). Signed-off-by: Aleksa Sarai <[email protected]>
1 parent a954834 commit 103853e

File tree

6 files changed

+111
-17
lines changed

6 files changed

+111
-17
lines changed

libcontainer/factory_linux.go

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ package libcontainer
55
import (
66
"encoding/json"
77
"fmt"
8-
"io/ioutil"
98
"os"
109
"os/exec"
1110
"path/filepath"
@@ -226,21 +225,29 @@ func (l *LinuxFactory) StartInitialization() (err error) {
226225
// clear the current process's environment to clean any libcontainer
227226
// specific env vars.
228227
os.Clearenv()
228+
var i initer
229229
defer func() {
230230
// if we have an error during the initialization of the container's init then send it back to the
231231
// parent process in the form of an initError.
232232
if err != nil {
233-
// ensure that any data sent from the parent is consumed so it doesn't
234-
// receive ECONNRESET when the child writes to the pipe.
235-
ioutil.ReadAll(pipe)
233+
if _, ok := i.(*linuxStandardInit); ok {
234+
// Synchronisation only necessary for standard init.
235+
if err := json.NewEncoder(pipe).Encode(procError); err != nil {
236+
panic(err)
237+
}
238+
}
236239
if err := json.NewEncoder(pipe).Encode(newSystemError(err)); err != nil {
237240
panic(err)
238241
}
242+
} else {
243+
if err := json.NewEncoder(pipe).Encode(procStart); err != nil {
244+
panic(err)
245+
}
239246
}
240247
// ensure that this pipe is always closed
241248
pipe.Close()
242249
}()
243-
i, err := newContainerInit(it, pipe)
250+
i, err = newContainerInit(it, pipe)
244251
if err != nil {
245252
return err
246253
}

libcontainer/generic_error.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,15 @@ import (
99
"github.com/opencontainers/runc/libcontainer/stacktrace"
1010
)
1111

12+
type syncType uint8
13+
14+
const (
15+
procReady syncType = iota
16+
procError
17+
procStart
18+
procRun
19+
)
20+
1221
var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}}
1322
Code: {{.ECode}}
1423
{{if .Message }}

libcontainer/init_linux.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ package libcontainer
55
import (
66
"encoding/json"
77
"fmt"
8+
"io"
89
"io/ioutil"
910
"net"
1011
"os"
@@ -73,6 +74,7 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) {
7374
}, nil
7475
case initStandard:
7576
return &linuxStandardInit{
77+
pipe: pipe,
7678
parentPid: syscall.Getppid(),
7779
config: config,
7880
}, nil
@@ -140,6 +142,28 @@ func finalizeNamespace(config *initConfig) error {
140142
return nil
141143
}
142144

145+
// syncParentReady sends to the given pipe a JSON payload which indicates that
146+
// the init is ready to Exec the child process. It then waits for the parent to
147+
// indicate that it is cleared to Exec.
148+
func syncParentReady(pipe io.ReadWriter) error {
149+
// Tell parent.
150+
if err := json.NewEncoder(pipe).Encode(procReady); err != nil {
151+
return err
152+
}
153+
154+
// Wait for parent to give the all-clear.
155+
var procSync syncType
156+
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
157+
if err == io.EOF {
158+
return fmt.Errorf("parent closed synchronisation channel")
159+
}
160+
if procSync != procRun {
161+
return fmt.Errorf("invalid synchronisation flag from parent")
162+
}
163+
}
164+
return nil
165+
}
166+
143167
// joinExistingNamespaces gets all the namespace paths specified for the container and
144168
// does a setns on the namespace fd so that the current process joins the namespace.
145169
func joinExistingNamespaces(namespaces []configs.Namespace) error {

libcontainer/integration/exec_test.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -587,6 +587,11 @@ func testPids(t *testing.T, systemd bool) {
587587
if err == nil {
588588
t.Fatalf("expected fork() to fail with restrictive pids limit")
589589
}
590+
591+
// Minimal restrictions are not really supported, due to quirks in using Go
592+
// due to the fact that it spawns random processes. While we do our best with
593+
// late setting cgroup values, it's just too unreliable with very small pids.max.
594+
// As such, we don't test that case. YMMV.
590595
}
591596

592597
func TestRunWithKernelMemory(t *testing.T) {

libcontainer/process_linux.go

Lines changed: 53 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ package libcontainer
55
import (
66
"encoding/json"
77
"errors"
8+
"fmt"
89
"io"
910
"os"
1011
"os/exec"
@@ -86,6 +87,7 @@ func (p *setnsProcess) start() (err error) {
8687
if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil {
8788
return newSystemError(err)
8889
}
90+
8991
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
9092
return newSystemError(err)
9193
}
@@ -95,6 +97,7 @@ func (p *setnsProcess) start() (err error) {
9597
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
9698
return newSystemError(err)
9799
}
100+
// Must be done after Shutdown so the child will exit and we can wait for it.
98101
if ierr != nil {
99102
p.wait()
100103
return newSystemError(ierr)
@@ -198,15 +201,11 @@ func (p *initProcess) start() (err error) {
198201
return newSystemError(err)
199202
}
200203
p.setExternalDescriptors(fds)
201-
202204
// Do this before syncing with child so that no children
203205
// can escape the cgroup
204206
if err := p.manager.Apply(p.pid()); err != nil {
205207
return newSystemError(err)
206208
}
207-
if err := p.manager.Set(p.config.Config); err != nil {
208-
return newSystemError(err)
209-
}
210209
defer func() {
211210
if err != nil {
212211
// TODO: should not be the responsibility to call here
@@ -232,13 +231,58 @@ func (p *initProcess) start() (err error) {
232231
if err := p.sendConfig(); err != nil {
233232
return newSystemError(err)
234233
}
235-
// wait for the child process to fully complete and receive an error message
236-
// if one was encoutered
237-
var ierr *genericError
238-
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
234+
235+
var (
236+
procSync syncType
237+
sentRun bool
238+
ierr *genericError
239+
)
240+
241+
loop:
242+
for {
243+
if err := json.NewDecoder(p.parentPipe).Decode(&procSync); err != nil {
244+
if err == io.EOF {
245+
break loop
246+
}
247+
return newSystemError(err)
248+
}
249+
250+
switch procSync {
251+
case procStart:
252+
break loop
253+
case procReady:
254+
if err := p.manager.Set(p.config.Config); err != nil {
255+
return newSystemError(err)
256+
}
257+
// Sync with child.
258+
if err := json.NewEncoder(p.parentPipe).Encode(procRun); err != nil {
259+
return newSystemError(err)
260+
}
261+
sentRun = true
262+
case procError:
263+
// wait for the child process to fully complete and receive an error message
264+
// if one was encoutered
265+
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
266+
return newSystemError(err)
267+
}
268+
if ierr != nil {
269+
break loop
270+
}
271+
// Programmer error.
272+
panic("No error following JSON procError payload.")
273+
default:
274+
return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child"))
275+
}
276+
}
277+
if !sentRun {
278+
return newSystemError(fmt.Errorf("could not synchronise with container process"))
279+
}
280+
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
239281
return newSystemError(err)
240282
}
283+
// Must be done after Shutdown so the child will exit and we can wait for it.
241284
if ierr != nil {
285+
p.wait()
242286
return newSystemError(ierr)
243287
}
244288
return nil
@@ -276,8 +320,7 @@ func (p *initProcess) sendConfig() error {
276320
if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil {
277321
return err
278322
}
279-
// shutdown writes for the parent side of the pipe
280-
return syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR)
323+
return nil
281324
}
282325

283326
func (p *initProcess) createNetworkInterfaces() error {

libcontainer/standard_init_linux.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
package libcontainer
44

55
import (
6+
"io"
67
"os"
78
"syscall"
89

@@ -14,6 +15,7 @@ import (
1415
)
1516

1617
type linuxStandardInit struct {
18+
pipe io.ReadWriter
1719
parentPid int
1820
config *initConfig
1921
}
@@ -50,7 +52,6 @@ func (l *linuxStandardInit) Init() error {
5052
if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
5153
return err
5254
}
53-
5455
label.Init()
5556
// InitializeMountNamespace() can be executed only for a new mount namespace
5657
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
@@ -75,7 +76,6 @@ func (l *linuxStandardInit) Init() error {
7576
return err
7677
}
7778
}
78-
7979
for _, path := range l.config.Config.ReadonlyPaths {
8080
if err := remountReadonly(path); err != nil {
8181
return err
@@ -90,6 +90,12 @@ func (l *linuxStandardInit) Init() error {
9090
if err != nil {
9191
return err
9292
}
93+
// Tell our parent that we're ready to Execv. This must be done before the
94+
// Seccomp rules have been applied, because we need to be able to read and
95+
// write to a socket.
96+
if err := syncParentReady(l.pipe); err != nil {
97+
return err
98+
}
9399
if l.config.Config.Seccomp != nil {
94100
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
95101
return err

0 commit comments

Comments
 (0)