|
| 1 | +#!/bin/bash |
| 2 | +# Add bind mounts in a user namespace and change to that space. |
| 3 | +# Requires being able to run unshare -rm and the ability to do fuse mounts |
| 4 | +# (kernel >= 4.18) and requires fuse-overlayfs. |
| 5 | +# Written by Dave Dykstra November 2024, based heavily on cvmfsexec. |
| 6 | + |
| 7 | +#set -x |
| 8 | +#PS4='c$$+ ' |
| 9 | + |
VERSION=4.42

# Write the command synopsis to stderr, then terminate the script with
# exit status 1.  Takes no arguments and never returns to the caller.
usage()
{
    {
        echo "Usage: bindexec [-v] [src:dest ...] -- [command]"
        echo " Bind mount each src to dest in new user mount namespace and run command"
        echo " -v: print current version and exit"
    } >&2
    exit 1
}
| 19 | + |
# /usr/sbin is appended to the search path because pivot_root is needed
PATH="${PATH}:/usr/sbin"

# Scratch directory, plus the two fifos kept inside it: the start fifo
# carries the pid of the final shell to the helper that writes its uid/gid
# maps, and the wait fifo carries the helper's "ready" reply back.
TMPD="/dev/shm/bindexec"
STARTFIFO="${TMPD}/start"
WAITFIFO="${TMPD}/wait"
| 26 | + |
# Older bashes (for example the 3.2.x on macOS Big Sur) do not understand
# the {NAME}<&N redirection syntax.  They print an error message but do not
# return an error code, so probe for the feature by looking for any output
# from a trial redirection; that makes it possible to die gracefully.
if [[ -n "$({TESTX}<&0 2>&1)" ]]; then
    echo "Cannot assign file descriptors to variables, bash version too old" >&2
    exit 1
fi

# remember the directory we were started from
ORIGPWD="$PWD"

# duplicate stdin onto a fresh fd so it can be handed to the final command
exec {STDINCOPYFD}<&0
| 41 | + |
# Count the parsed options by hand instead of relying on OPTIND, because
# OPTIND can't distinguish between "--" being there or missing.
NOPTS=0
while getopts "v" OPTION; do
    NOPTS=$((NOPTS + 1))
    if [[ "$OPTION" == "v" ]]; then
        # -v: report the version and stop
        echo "$VERSION"
        exit
    elif [[ "$OPTION" == "?" ]]; then
        # unrecognized option
        usage
    fi
done
shift "$NOPTS"
| 55 | + |
# Collect the src:dest bind specifications that come before the "--".
# Every accepted specification is appended to the space-separated BINDS
# string and shifted off, so that after this section the positional
# parameters hold only the command to run (possibly nothing).
BINDS=""
# Start from a known value: when there are no arguments at all the loop
# body never runs, and an ARG inherited from the environment could
# otherwise satisfy the "--" check below.
ARG=""
for ARG; do
    if [[ "$ARG" == "--" ]]; then
        break
    fi
    if [[ "$ARG" != *:* ]]; then
        echo "bindexec: $ARG does not contain a colon" >&2
        usage
    fi
    # both the source and the destination have to be absolute paths
    if [[ "$ARG" != /* ]] || [[ "$ARG" != *:/* ]]; then
        echo "bindexec: source or destination in $ARG do not start with \"/\"" >&2
        usage
    fi
    BINDS="$BINDS $ARG"
    shift
done

# The loop only leaves ARG set to "--" when it stopped at the separator.
if [[ "$ARG" != "--" ]]; then
    echo "bindexec: no double-hyphen found" >&2
    usage
fi
# drop the "--" itself
shift
| 78 | + |
# numeric ids of the invoking user, written into the uid/gid maps of the
# final user namespace
ORIGUID=$(id -u)
ORIGGID=$(id -g)

# options handed to every unshare invocation
UNSHAREOPTS='--propagation unchanged'
| 83 | + |
# Build the namespaces and run the command.  Three shell scripts are nested
# below as HERE documents, each one fed on stdin to a bash started by unshare:
#   1. (!EOF-1!) first "fake root" namespace: mounts a private /dev/shm,
#      prepares the requested bind mounts and overlays them on / with
#      fuse-overlayfs
#   2. (!EOF-2!) second "fake root" namespace: pivots the root into the overlay
#   3. (!EOF-3!) user namespace with the original uid/gid, which finally
#      execs the command with the saved copy of stdin
# Note that within the HERE document, unprotected $ substitutions are
# done by the surrounding shell, and \$ is within the unshare shell
unshare -rm -pf $UNSHAREOPTS /bin/bash /dev/stdin "${@:-$SHELL}" <<!EOF-1!
    #set -x
    #PS4='c\$$+ '

    # mount a private /dev/shm
    mount -t tmpfs tmpfs /dev/shm
    mkdir $TMPD

    # now in the first "fake root" namespace
    mount -t proc proc /proc
    mkdir -p $TMPD/upper $TMPD/work $TMPD/overlay

    # put the bind mounts into the upper dir
    for BIND in $BINDS; do
        SRC="\${BIND%:*}"
        DST="\${BIND#*:}"
        if [ -d "\$SRC" ]; then
            mkdir -p "$TMPD/upper\$DST"
        elif [ -f "\$SRC" ]; then
            # a file needs its parent directory plus an empty file to mount on
            DSTDIR="\${DST%/*}"
            if [ "\$DST" != "\$DSTDIR" ]; then
                mkdir -p "$TMPD/upper\$DSTDIR"
            fi
            touch "$TMPD/upper\$DST"
        else
            echo "bindexec: \$SRC not found, skipping" >&2
            # really skip it, no mount point was created for this bind
            continue
        fi
        mount --rbind "\$SRC" "$TMPD/upper\$DST"
    done

    # Leave this bash running as PID 1, because most other
    # programs won't handle signals & child reaping correctly.
    # Note that all other processes in the namespaces will get
    # a SIGKILL when PID 1 exits.
    trap "" 1 2 3 15 # ignore all ordinary signals

    fuse-overlayfs -o allow_other,noacl,squash_to_root,lowerdir=/,upperdir=$TMPD/upper,workdir=$TMPD/work $TMPD/overlay 2> >(grep -v lazytime >&2)
    # Put original system dirs on top of the overlay
    mount -t proc proc $TMPD/overlay/proc
    mount --rbind /sys $TMPD/overlay/sys
    mount --rbind /dev $TMPD/overlay/dev

    # Add cvmfs on top if it is present
    if [ -d /cvmfs ]; then
        mkdir -p $TMPD/overlay/cvmfs
        mount --rbind /cvmfs $TMPD/overlay/cvmfs
    fi

    # Also bind on top nfs mounts because they don't work through fuse-overlayfs
    mount|while read -r FROM X TO X TYPE REST; do
        if [[ \$TYPE = nfs* ]]; then
            mkdir -p $TMPD/overlay\$TO
            # this sometimes fails with weird bind mount combinations
            # under apptainer so just save the output in a variable so
            # it can be seen with debugging enabled
            MSG="\$(mount --rbind \$TO $TMPD/overlay\$TO 2>&1)"
        fi
    done

    # Also bind /tmp and /var/tmp so files created there go into the system
    # directories instead of into a ram disk
    for D in /tmp /var/tmp; do
        mount --rbind \$D $TMPD/overlay\$D
    done

    # Start a second fake root namespace so we don't interfere with the
    # fuse-overlayfs mount space when we do the pivot_root.
    # Quoting the HERE document's delimiter makes this nested shell not
    # interpret $ substitutions, but the previous one still does so
    # need to use \$ when don't want first shell to expand.
    unshare -rm $UNSHAREOPTS /bin/bash /dev/stdin "\${@:-$SHELL}" <<'!EOF-2!'
    #set -x
    #PS4='c\$$+ '

    mkfifo $STARTFIFO $WAITFIFO

    (
    # This is a background process for setting up the child's uid map
    trap "" 1 2 3 15 # ignore ordinary signals
    read PID
    # set up uid/gid map
    echo "$ORIGGID 0 1" >/proc/"\$PID"/gid_map
    echo "$ORIGUID 0 1" >/proc/"\$PID"/uid_map
    echo "ready" >$WAITFIFO
    ) <$STARTFIFO &

    # Change to the new root.  Would use chroot but it doesn't work.
    mount --rbind $TMPD/overlay $TMPD/overlay # pivot_root requires this
    cd $TMPD/overlay
    mkdir -p .old-root
    pivot_root . .old-root
    umount -l .old-root 2>/dev/null
    rmdir .old-root
    cd /

    # Finally, start the user namespace with the original uid/gid
    # This HERE document is also quoted and so the shell does not expand
    exec unshare -U $UNSHAREOPTS /bin/bash /dev/stdin "\${@:-$SHELL}" <<'!EOF-3!'
    #set -x
    #PS4='c\$$+ '

    # now in the user namespace

    # quoted so that a starting directory containing spaces still works
    cd "$ORIGPWD"

    echo "\$$" >$STARTFIFO
    # wait for the uid/gid maps to be set up
    read X <$WAITFIFO

    exec "\$@" <&$STDINCOPYFD $STDINCOPYFD<&-
!EOF-3!

!EOF-2!

!EOF-1!
0 commit comments