|
| 1 | +/* |
| 2 | + * Copyright (c) 2023 SUSE LLC |
| 3 | + * Copyright (c) 2023 Aleksa Sarai <[email protected]> |
| 4 | + * |
| 5 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | + * you may not use this file except in compliance with the License. |
| 7 | + * You may obtain a copy of the License at |
| 8 | + * |
| 9 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | + * |
| 11 | + * Unless required by applicable law or agreed to in writing, software |
| 12 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | + * See the License for the specific language governing permissions and |
| 15 | + * limitations under the License. |
| 16 | + */ |
| 17 | + |
| 18 | +package main |
| 19 | + |
| 20 | +import ( |
| 21 | + "errors" |
| 22 | + "fmt" |
| 23 | + "io" |
| 24 | + "os" |
| 25 | + "os/signal" |
| 26 | + "runtime" |
| 27 | + "strings" |
| 28 | + "time" |
| 29 | + |
| 30 | + "github.com/opencontainers/runc/libcontainer/dmz" |
| 31 | + |
| 32 | + "github.com/sirupsen/logrus" |
| 33 | + "github.com/urfave/cli" |
| 34 | + "golang.org/x/sys/unix" |
| 35 | +) |
| 36 | + |
// Build metadata; both values are empty strings unless set at build time.
var (
	// version is populated by the Makefile, which reads it from the VERSION
	// file of the source code.
	version = ""

	// gitCommit is populated by the Makefile with the hash of the commit the
	// binary was built from.
	gitCommit = ""
)
| 44 | + |
const (
	// usage is the long-form help text; main assigns it to app.Usage.
	usage = `Open Container Initiative contrib/cmd/memfd-bind

In order to protect against certain container attacks, every runc invocation
that involves creating or joining a container will cause runc to make a copy of
the runc binary in memory (usually to a memfd). While "runc init" is very
short-lived, this extra memory usage can cause problems for containers with
very small memory limits (or containers that have many "runc exec" invocations
applied to them at the same time).

memfd-bind is a tool to create a persistent memfd-sealed-copy of the runc binary,
which will cause runc to not make its own copy. This means you can get the
benefits of using a sealed memfd as runc's binary (even in a container breakout
attack to get write access to the runc binary, neither the underlying binary
nor the memfd copy can be changed).

To use memfd-bind, just specify the path to the runc binary that you want the
sealed memfd copy to be bind-mounted on top of:

	$ sudo memfd-bind /usr/bin/runc

Note that (due to kernel restrictions on bind-mounts), this program must remain
running on the host in order for the binary to be readable (it is recommended
you use a systemd unit to keep this process around).

If this program dies, there will be a leftover mountpoint that always returns
-EINVAL when attempting to access it. You need to use memfd-bind --cleanup on the
path in order to unmount the path (regular umount(8) will not work):

	$ sudo memfd-bind --cleanup /usr/bin/runc

Note that (due to restrictions on /proc/$pid/fd/$fd magic-link resolution),
only privileged users (specifically, those that have ptrace privileges over the
memfd-bind daemon) can access the memfd bind-mount. This means that using this
tool to harden your /usr/bin/runc binary would result in unprivileged users
being unable to execute the binary. If this is an issue, you could make all
privileged process use a different copy of runc (by making a copy in somewhere
like /usr/sbin/runc) and only using memfd-bind for the version used by
privileged users.
`
)
| 86 | + |
| 87 | +func cleanup(path string) error { |
| 88 | + file, err := os.OpenFile(path, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0) |
| 89 | + if err != nil { |
| 90 | + return fmt.Errorf("cleanup: failed to open runc binary path: %w", err) |
| 91 | + } |
| 92 | + defer file.Close() |
| 93 | + fdPath := fmt.Sprintf("/proc/self/fd/%d", file.Fd()) |
| 94 | + |
| 95 | + // Keep umounting until we hit a umount error. |
| 96 | + for unix.Unmount(fdPath, unix.MNT_DETACH) == nil { |
| 97 | + // loop... |
| 98 | + logrus.Debugf("memfd-bind: path %q unmount succeeded...", path) |
| 99 | + } |
| 100 | + logrus.Infof("memfd-bind: path %q has been cleared of all old bind-mounts", path) |
| 101 | + return nil |
| 102 | +} |
| 103 | + |
| 104 | +// memfdClone is a memfd-only implementation of dmz.CloneBinary. |
| 105 | +func memfdClone(path string) (*os.File, error) { |
| 106 | + binFile, err := os.Open(path) |
| 107 | + if err != nil { |
| 108 | + return nil, fmt.Errorf("failed to open runc binary path: %w", err) |
| 109 | + } |
| 110 | + defer binFile.Close() |
| 111 | + stat, err := binFile.Stat() |
| 112 | + if err != nil { |
| 113 | + return nil, fmt.Errorf("checking %s size: %w", path, err) |
| 114 | + } |
| 115 | + size := stat.Size() |
| 116 | + memfd, sealFn, err := dmz.Memfd("/proc/self/exe") |
| 117 | + if err != nil { |
| 118 | + return nil, fmt.Errorf("creating memfd failed: %w", err) |
| 119 | + } |
| 120 | + copied, err := io.Copy(memfd, binFile) |
| 121 | + if err != nil { |
| 122 | + return nil, fmt.Errorf("copy binary: %w", err) |
| 123 | + } else if copied != size { |
| 124 | + return nil, fmt.Errorf("copied binary size mismatch: %d != %d", copied, size) |
| 125 | + } |
| 126 | + if err := sealFn(&memfd); err != nil { |
| 127 | + return nil, fmt.Errorf("could not seal fd: %w", err) |
| 128 | + } |
| 129 | + if !dmz.IsCloned(memfd) { |
| 130 | + return nil, fmt.Errorf("cloned memfd is not properly sealed") |
| 131 | + } |
| 132 | + return memfd, nil |
| 133 | +} |
| 134 | + |
| 135 | +func mount(path string) error { |
| 136 | + memfdFile, err := memfdClone(path) |
| 137 | + if err != nil { |
| 138 | + return fmt.Errorf("memfd clone: %w", err) |
| 139 | + } |
| 140 | + defer memfdFile.Close() |
| 141 | + memfdPath := fmt.Sprintf("/proc/self/fd/%d", memfdFile.Fd()) |
| 142 | + |
| 143 | + // We have to open an O_NOFOLLOW|O_PATH to the memfd magic-link because we |
| 144 | + // cannot bind-mount the memfd itself (it's in the internal kernel mount |
| 145 | + // namespace and cross-mount-namespace bind-mounts are not allowed). This |
| 146 | + // also requires that this program stay alive continuously for the |
| 147 | + // magic-link to stay alive... |
| 148 | + memfdLink, err := os.OpenFile(memfdPath, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0) |
| 149 | + if err != nil { |
| 150 | + return fmt.Errorf("mount: failed to /proc/self/fd magic-link for memfd: %w", err) |
| 151 | + } |
| 152 | + defer memfdLink.Close() |
| 153 | + memfdLinkFdPath := fmt.Sprintf("/proc/self/fd/%d", memfdLink.Fd()) |
| 154 | + |
| 155 | + exeFile, err := os.OpenFile(path, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0) |
| 156 | + if err != nil { |
| 157 | + return fmt.Errorf("mount: failed to open target runc binary path: %w", err) |
| 158 | + } |
| 159 | + defer exeFile.Close() |
| 160 | + exeFdPath := fmt.Sprintf("/proc/self/fd/%d", exeFile.Fd()) |
| 161 | + |
| 162 | + err = unix.Mount(memfdLinkFdPath, exeFdPath, "", unix.MS_BIND, "") |
| 163 | + if err != nil { |
| 164 | + return fmt.Errorf("mount: failed to mount memfd on top of runc binary path target: %w", err) |
| 165 | + } |
| 166 | + |
| 167 | + // If there is a signal we want to do cleanup. |
| 168 | + sigCh := make(chan os.Signal, 1) |
| 169 | + signal.Notify(sigCh, os.Interrupt, unix.SIGTERM, unix.SIGINT) |
| 170 | + go func() { |
| 171 | + <-sigCh |
| 172 | + logrus.Infof("memfd-bind: exit signal caught! cleaning up the bind-mount on %q...", path) |
| 173 | + _ = cleanup(path) |
| 174 | + os.Exit(0) |
| 175 | + }() |
| 176 | + |
| 177 | + // Clean up things we don't need... |
| 178 | + _ = exeFile.Close() |
| 179 | + _ = memfdLink.Close() |
| 180 | + |
| 181 | + // We now have to stay alive to keep the magic-link alive... |
| 182 | + logrus.Infof("memfd-bind: bind-mount of memfd over %q created -- looping forever!", path) |
| 183 | + for { |
| 184 | + // loop forever... |
| 185 | + time.Sleep(time.Duration(1<<63 - 1)) |
| 186 | + // make sure the memfd isn't gc'd |
| 187 | + runtime.KeepAlive(memfdFile) |
| 188 | + } |
| 189 | +} |
| 190 | + |
| 191 | +func main() { |
| 192 | + app := cli.NewApp() |
| 193 | + app.Name = "memfd-bind" |
| 194 | + app.Usage = usage |
| 195 | + |
| 196 | + // Set version to be the same as runC. |
| 197 | + var v []string |
| 198 | + if version != "" { |
| 199 | + v = append(v, version) |
| 200 | + } |
| 201 | + if gitCommit != "" { |
| 202 | + v = append(v, "commit: "+gitCommit) |
| 203 | + } |
| 204 | + app.Version = strings.Join(v, "\n") |
| 205 | + |
| 206 | + // Set the flags. |
| 207 | + app.Flags = []cli.Flag{ |
| 208 | + cli.BoolFlag{ |
| 209 | + Name: "cleanup", |
| 210 | + Usage: "Do not create a new memfd-sealed file, only clean up an existing one at <path>.", |
| 211 | + }, |
| 212 | + cli.BoolFlag{ |
| 213 | + Name: "debug", |
| 214 | + Usage: "Enable debug logging.", |
| 215 | + }, |
| 216 | + } |
| 217 | + |
| 218 | + app.Action = func(ctx *cli.Context) error { |
| 219 | + args := ctx.Args() |
| 220 | + if len(args) != 1 { |
| 221 | + return errors.New("need to specify a single path to the runc binary") |
| 222 | + } |
| 223 | + path := ctx.Args()[0] |
| 224 | + |
| 225 | + if ctx.Bool("debug") { |
| 226 | + logrus.SetLevel(logrus.DebugLevel) |
| 227 | + } |
| 228 | + |
| 229 | + err := cleanup(path) |
| 230 | + // We only care about cleanup errors when doing --cleanup. |
| 231 | + if ctx.Bool("cleanup") { |
| 232 | + return err |
| 233 | + } |
| 234 | + return mount(path) |
| 235 | + } |
| 236 | + if err := app.Run(os.Args); err != nil { |
| 237 | + fmt.Fprintf(os.Stderr, "memfd-bind: %v\n", err) |
| 238 | + os.Exit(1) |
| 239 | + } |
| 240 | +} |
0 commit comments