@@ -30,28 +30,28 @@ func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
3030 var newperms uint32
3131
3232 if l .config .Config .Namespaces .Contains (configs .NEWUSER ) {
33- // with user ns we need 'other' search permissions
33+ // With user ns we need 'other' search permissions.
3434 newperms = 0x8
3535 } else {
36- // without user ns we need 'UID' search permissions
36+ // Without user ns we need 'UID' search permissions.
3737 newperms = 0x80000
3838 }
3939
40- // create a unique per session container name that we can
41- // join in setns; however , other containers can also join it
40+ // Create a unique per session container name that we can join in setns;
41+ // however, other containers can also join it.
4242 return fmt .Sprintf ("_ses.%s" , l .config .ContainerId ), 0xffffffff , newperms
4343}
4444
4545func (l * linuxStandardInit ) Init () error {
4646 if ! l .config .Config .NoNewKeyring {
4747 ringname , keepperms , newperms := l .getSessionRingParams ()
4848
49- // do not inherit the parent's session keyring
49+ // Do not inherit the parent's session keyring.
5050 sessKeyId , err := keys .JoinSessionKeyring (ringname )
5151 if err != nil {
5252 return err
5353 }
54- // make session keyring searcheable
54+ // Make session keyring searchable.
5555 if err := keys .ModKeyringPerm (sessKeyId , keepperms , newperms ); err != nil {
5656 return err
5757 }
@@ -150,39 +150,47 @@ func (l *linuxStandardInit) Init() error {
150150 if err := pdeath .Restore (); err != nil {
151151 return err
152152 }
153- // compare the parent from the initial start of the init process and make sure that it did not change.
154- // if the parent changes that means it died and we were reparented to something else so we should
155- // just kill ourself and not cause problems for someone else.
153+ // Compare the parent from the initial start of the init process and make
154+ // sure that it did not change. If the parent changes, that means it died
155+ // and we were reparented to something else so we should just kill ourself
156+ // and not cause problems for someone else.
156157 if unix .Getppid () != l .parentPid {
157158 return unix .Kill (unix .Getpid (), unix .SIGKILL )
158159 }
159- // check for the arg before waiting to make sure it exists and it is returned
160- // as a create time error.
160+ // Check for the arg before waiting to make sure it exists and it is
161+ // returned as a create time error.
161162 name , err := exec .LookPath (l .config .Args [0 ])
162163 if err != nil {
163164 return err
164165 }
165- // close the pipe to signal that we have completed our init.
166+ // Close the pipe to signal that we have completed our init.
166167 l .pipe .Close ()
167168 // Wait for the FIFO to be opened on the other side before exec-ing the
168169 // user process. We open it through /proc/self/fd/$fd, because the fd that
169170 // was given to us was an O_PATH fd to the fifo itself. Linux allows us to
170171 // re-open an O_PATH fd through /proc.
171172 fd , err := unix .Open (fmt .Sprintf ("/proc/self/fd/%d" , l .fifoFd ), unix .O_WRONLY | unix .O_CLOEXEC , 0 )
172173 if err != nil {
173- return newSystemErrorWithCause (err , "openat exec fifo" )
174+ return newSystemErrorWithCause (err , "open exec fifo" )
174175 }
175176 if _ , err := unix .Write (fd , []byte ("0" )); err != nil {
176177 return newSystemErrorWithCause (err , "write 0 exec fifo" )
177178 }
179+ // Close the O_PATH fifofd fd before exec because the kernel resets
180+ // dumpable in the wrong order. This has been fixed in newer kernels, but
181+ // we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
182+ // N.B. the core issue itself (passing dirfds to the host filesystem) has
183+ // since been resolved.
184+ // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
185+ unix .Close (l .fifoFd )
186+ // Set seccomp as close to execve as possible, so as few syscalls take
187+ // place afterward (reducing the number of syscalls that users need to
188+ // enable in their seccomp profiles).
178189 if l .config .Config .Seccomp != nil && l .config .NoNewPrivileges {
179190 if err := seccomp .InitSeccomp (l .config .Config .Seccomp ); err != nil {
180191 return newSystemErrorWithCause (err , "init seccomp" )
181192 }
182193 }
183- // close the statedir fd before exec because the kernel resets dumpable in the wrong order
184- // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
185- unix .Close (l .fifoFd )
186194 if err := syscall .Exec (name , l .config .Args [0 :], os .Environ ()); err != nil {
187195 return newSystemErrorWithCause (err , "exec user process" )
188196 }
0 commit comments